Index: trunk/tools/wsor/newbie_warnings/track_hugglers_ng.py |
— | — | @@ -0,0 +1,192 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time |
| 3 | +import wmf |
| 4 | + |
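| | +#Serialize a value for tab-separated output; "\N" is the MySQL-style |
| | +#NULL marker, and unicode is encoded as UTF-8 before escaping. |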
| 5 | +def encode(v): |
| 6 | +    if v is None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
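| | +#Emit one tab-separated event record (event, user_id, user_name, time) |
| | +#on stdout and flush so downstream readers see it immediately. |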
| 14 | +def emit(event, p, time): |
| 15 | + print( |
| 16 | + "\t".join(encode(v) for v in [ |
| 17 | + event, |
| 18 | + p['user_id'], |
| 19 | + p['user_name'], |
| 20 | + time |
| 21 | + ]) |
| 22 | + ) |
| 23 | + sys.stdout.flush() |
| 24 | + |
| 25 | + |
| 26 | +def main(): |
| 27 | + parser = argparse.ArgumentParser( |
| 28 | +        description='Track Huggle warning posts to user talk pages and report when they appear to be read' |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-c', '--cnf', |
| 32 | + metavar="<path>", |
| 33 | + type=str, |
| 34 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 35 | + default=os.path.expanduser("~/.my.cnf") |
| 36 | + ) |
| 37 | + parser.add_argument( |
| 38 | + '-s', '--host', |
| 39 | + type=str, |
| 40 | + help='the database host to connect to (defaults to localhost)', |
| 41 | + default="localhost" |
| 42 | + ) |
| 43 | + parser.add_argument( |
| 44 | + '-d', '--db', |
| 45 | + type=str, |
| 46 | + help='the language db to run the query in (defaults to enwiki)', |
| 47 | + default="enwiki" |
| 48 | + ) |
| 49 | + parser.add_argument( |
| 50 | + '-o', '--out', |
| 51 | + type=lambda fn:open(fn, 'a+'), |
| 52 | +        help='a file to append output to (defaults to stdout)', |
| 53 | + default=sys.stdout |
| 54 | + ) |
| 55 | + args = parser.parse_args() |
| 56 | + |
| 57 | + LOGGING_STREAM = sys.stderr |
| 58 | + logging.basicConfig( |
| 59 | + level=logging.DEBUG, |
| 60 | + stream=LOGGING_STREAM, |
| 61 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 62 | + datefmt='%b-%d %H:%M:%S' |
| 63 | + ) |
| 64 | + |
| 65 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 66 | + db = Database( |
| 67 | + host=args.host, |
| 68 | + db=args.db, |
| 69 | + read_default_file=args.cnf |
| 70 | + ) |
| 71 | + |
| 72 | + try: |
| 73 | + oldPosts = {} |
| 74 | + lastTime = db.getTime() |
| 75 | + time.sleep(5) |
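| | +#Poll loop: every 5 seconds, look for new Huggle warnings, then emit |
| | +#"received" when a warned user gains a new-message flag and "read" when |
| | +#that flag clears or the user drops out of user_newtalk. |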
| 76 | + while True: |
| 77 | + logging.info("Tracking %s posts. Looking for new ones since %s." % (len(oldPosts), lastTime)) |
| 78 | + newUsers = set(db.getHugglePostsSince(lastTime)) |
| 79 | + currTime = db.getTime() |
| 80 | + currUsers = set() |
| 81 | + for p in db.getWaitingPosts(oldPosts.viewkeys() | newUsers): |
| 82 | + if p['user_name'] not in oldPosts: |
| 83 | + #Found a new posting |
| 84 | + LOGGING_STREAM.write(">") |
| 85 | + p['posting'] = currTime |
| 86 | + oldPosts[p['user_name']] = p |
| 87 | + emit("received", p, currTime) |
| 88 | + elif p['messages'] < oldPosts[p['user_name']]['messages']: |
| 89 | + #Looks like someone checked the message |
| 90 | + LOGGING_STREAM.write("<") |
| 91 | + emit("read", oldPosts[p['user_name']], currTime) |
| 92 | + del oldPosts[p['user_name']] |
| 93 | + else: |
| 94 | +#No change since the last poll |
| 95 | + pass |
| 96 | + |
| 97 | + currUsers.add(p['user_name']) |
| 98 | + |
| 99 | + for missing in oldPosts.viewkeys() - currUsers: |
| 100 | + LOGGING_STREAM.write("<") |
| 101 | + emit("read", oldPosts[missing], currTime) |
| 102 | + del oldPosts[missing] |
| 103 | + |
| 104 | + lastTime = currTime |
| 105 | + LOGGING_STREAM.write("\n") |
| 106 | + time.sleep(5) |
| 107 | + |
| 108 | + except KeyboardInterrupt: |
| 109 | + logging.info("Keyboard interrupt detected. Shutting down.") |
| 110 | + except Exception as e: |
| 111 | + logging.error(str(e)) |
| 112 | + |
| 113 | + print(repr(oldPosts)) |
| 114 | + print(lastTime) |
| 115 | + |
| 116 | + |
| 117 | + |
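| | +#Minimal quoting for user names interpolated into SQL; this only escapes |
| | +#double quotes, so parameterized queries would be a safer choice here. |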
| 118 | +def safe(val): |
| 119 | + return '"' + val.replace('"', '\\"') + '"' |
| 120 | + |
| 121 | +class Database: |
| 122 | + |
| 123 | + def __init__(self, *args, **kwargs): |
| 124 | + self.args = args |
| 125 | + self.kwargs = kwargs |
| 126 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + def getTime(self): |
| 131 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 132 | + cursor.execute( |
| 133 | + """ |
| 134 | + SELECT rc_timestamp AS time |
| 135 | + FROM recentchanges |
| 136 | + ORDER BY rc_timestamp DESC |
| 137 | + LIMIT 1 |
| 138 | + """ |
| 139 | + ) |
| 140 | + self.usersConn.commit() |
| 141 | + for row in cursor: |
| 142 | + return row['time'] |
| 143 | + |
| 144 | + |
| 145 | + def getHugglePostsSince(self, timestamp): |
| 146 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 147 | + cursor.execute(""" |
| 148 | + SELECT DISTINCT p.page_title AS title |
| 149 | + FROM revision r |
| 150 | + INNER JOIN page p |
| 151 | + ON r.rev_page = p.page_id |
| 152 | + WHERE p.page_namespace = 3 |
| 153 | + AND r.rev_timestamp >= %(timestamp)s |
| 154 | + AND r.rev_comment LIKE %(like)s |
| 155 | + """, |
| 156 | + { |
| 157 | + "timestamp": timestamp, |
| 158 | +"like": "%WP:HG%" |
| 160 | + } |
| 161 | + ) |
| 162 | + return (p['title'].replace("_", " ") for p in cursor) |
| 163 | + |
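| | +#The UNION combines registered users (matched by user_name) with |
| | +#anonymous users (matched by user_ip) holding new-talk flags. |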
| 164 | + def getWaitingPosts(self, users): |
| 165 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 166 | + userString = ",".join(safe(u) for u in users) |
| 167 | + if len(userString) != 0: |
| 168 | + cursor.execute(""" |
| 169 | + SELECT |
| 170 | + u.user_id, |
| 171 | + u.user_name, |
| 172 | + count(*) as messages, |
| 173 | +u.user_touched as last_touched |
| 174 | + FROM user_newtalk nt |
| 175 | + LEFT JOIN user u |
| 176 | + ON u.user_id = nt.user_id |
| 177 | + WHERE u.user_name IN (""" + userString + """) |
| 178 | + GROUP BY u.user_id, u.user_name |
| 179 | + UNION |
| 180 | + SELECT |
| 181 | + NULL as user_id, |
| 182 | + nt.user_ip as user_name, |
| 183 | + count(*) as messages, |
| 184 | +NULL as last_touched |
| 185 | + FROM user_newtalk nt |
| 186 | + WHERE nt.user_ip IN (""" + userString + """) |
| 187 | + GROUP BY nt.user_ip, NULL |
| 188 | + """ |
| 189 | + ) |
| 190 | + for post in cursor: |
| 191 | + yield post |
| 192 | + |
| 193 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/newbie_warnings/track_hugglings.py |
— | — | @@ -0,0 +1,184 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types, time |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | +    if v is None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def emit(event, p, time): |
| 15 | + print( |
| 16 | + "\t".join(encode(v) for v in [ |
| 17 | + event, |
| 18 | + p['user_id'], |
| 19 | + p['user_name'], |
| 20 | + time |
| 21 | + ]) |
| 22 | + ) |
| 23 | + sys.stdout.flush() |
| 24 | + |
| 25 | + |
| 26 | +def main(): |
| 27 | + parser = argparse.ArgumentParser( |
| 28 | +        description='Track Huggle warning posts to user talk pages and report when they appear to be read' |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-c', '--cnf', |
| 32 | + metavar="<path>", |
| 33 | + type=str, |
| 34 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 35 | + default=os.path.expanduser("~/.my.cnf") |
| 36 | + ) |
| 37 | + parser.add_argument( |
| 38 | + '-s', '--host', |
| 39 | + type=str, |
| 40 | + help='the database host to connect to (defaults to localhost)', |
| 41 | + default="localhost" |
| 42 | + ) |
| 43 | + parser.add_argument( |
| 44 | + '-d', '--db', |
| 45 | + type=str, |
| 46 | + help='the language db to run the query in (defaults to enwiki)', |
| 47 | + default="enwiki" |
| 48 | + ) |
| 49 | + parser.add_argument( |
| 50 | + '-o', '--out', |
| 51 | + type=lambda fn:open(fn, 'a+'), |
| 52 | +        help='a file to append output to (defaults to stdout)', |
| 53 | + default=sys.stdout |
| 54 | + ) |
| 55 | + args = parser.parse_args() |
| 56 | + |
| 57 | + LOGGING_STREAM = sys.stderr |
| 58 | + logging.basicConfig( |
| 59 | + level=logging.DEBUG, |
| 60 | + stream=LOGGING_STREAM, |
| 61 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 62 | + datefmt='%b-%d %H:%M:%S' |
| 63 | + ) |
| 64 | + |
| 65 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 66 | + db = Database( |
| 67 | + host=args.host, |
| 68 | + db=args.db, |
| 69 | + read_default_file=args.cnf |
| 70 | + ) |
| 71 | + |
| 72 | + try: |
| 73 | + oldPosts = {} |
| 74 | + lastTime = db.getTime() |
| 75 | + time.sleep(5) |
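| | +#Same polling loop as track_hugglers_ng.py: emit "received" for new |
| | +#warning posts and "read" once the new-message flag clears. |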
| 76 | + while True: |
| 77 | + logging.info("Tracking %s posts. Looking for new ones since %s." % (len(oldPosts), lastTime)) |
| 78 | + newUsers = set(db.getHugglePostsSince(lastTime)) |
| 79 | + currTime = db.getTime() |
| 80 | + currUsers = set() |
| 81 | + for p in db.getWaitingPosts(oldPosts.viewkeys() | newUsers): |
| 82 | + if p['user_name'] not in oldPosts: |
| 83 | + #Found a new posting |
| 84 | + LOGGING_STREAM.write(">") |
| 85 | + p['posting'] = currTime |
| 86 | + oldPosts[p['user_name']] = p |
| 87 | + emit("received", p, currTime) |
| 88 | + elif p['messages'] < oldPosts[p['user_name']]['messages']: |
| 89 | + #Looks like someone checked the message |
| 90 | + LOGGING_STREAM.write("<") |
| 91 | + emit("read", oldPosts[p['user_name']], currTime) |
| 92 | + del oldPosts[p['user_name']] |
| 93 | + else: |
| 94 | +#No change since the last poll |
| 95 | + pass |
| 96 | + |
| 97 | + currUsers.add(p['user_name']) |
| 98 | + |
| 99 | + for missing in oldPosts.viewkeys() - currUsers: |
| 100 | + LOGGING_STREAM.write("<") |
| 101 | + emit("read", oldPosts[missing], currTime) |
| 102 | + del oldPosts[missing] |
| 103 | + |
| 104 | + lastTime = currTime |
| 105 | + LOGGING_STREAM.write("\n") |
| 106 | + time.sleep(5) |
| 107 | + |
| 108 | + except KeyboardInterrupt: |
| 109 | + logging.info("Keyboard interrupt detected. Shutting down.") |
| 110 | + except Exception as e: |
| 111 | + logging.error(str(e)) |
| 112 | + |
| 113 | + print(repr(oldPosts)) |
| 114 | + print(lastTime) |
| 115 | + |
| 116 | + |
| 117 | + |
| 118 | +def safe(val): |
| 119 | + return '"' + val.replace('"', '\\"') + '"' |
| 120 | + |
| 121 | +class Database: |
| 122 | + |
| 123 | + def __init__(self, *args, **kwargs): |
| 124 | + self.args = args |
| 125 | + self.kwargs = kwargs |
| 126 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + def getTime(self): |
| 131 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 132 | + cursor.execute( |
| 133 | + """ |
| 134 | + SELECT rc_timestamp AS time |
| 135 | + FROM recentchanges |
| 136 | + ORDER BY rc_timestamp DESC |
| 137 | + LIMIT 1 |
| 138 | + """ |
| 139 | + ) |
| 140 | + self.usersConn.commit() |
| 141 | + for row in cursor: |
| 142 | + return row['time'] |
| 143 | + |
| 144 | + |
| 145 | + def getHugglePostsSince(self, timestamp): |
| 146 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 147 | + cursor.execute(""" |
| 148 | +SELECT DISTINCT p.page_title AS title |
| | +FROM revision r |
| | +INNER JOIN page p |
| | +ON r.rev_page = p.page_id |
| | +WHERE p.page_namespace = 3 |
| | +AND r.rev_timestamp >= %(timestamp)s |
| | +AND r.rev_comment LIKE %(like)s |
| 149 | + """, |
| 150 | + { |
| 151 | + "timestamp": timestamp, |
| 152 | +"like": "%WP:HG%" |
| 154 | + } |
| 155 | + ) |
| 156 | + return (p['title'].replace("_", " ") for p in cursor) |
| 157 | + |
| 158 | + def getWaitingPosts(self, users): |
| 159 | + cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
| 160 | + userString = ",".join(safe(u) for u in users) |
| 161 | + if len(userString) != 0: |
| 162 | + cursor.execute(""" |
| 163 | + SELECT |
| 164 | + u.user_id, |
| 165 | + u.user_name, |
| 166 | + count(*) as messages |
| 167 | + FROM user_newtalk nt |
| 168 | + LEFT JOIN user u |
| 169 | + ON u.user_id = nt.user_id |
| 170 | + WHERE u.user_name IN (""" + userString + """) |
| 171 | + GROUP BY u.user_id, u.user_name |
| 172 | + UNION |
| 173 | + SELECT |
| 174 | + NULL as user_id, |
| 175 | + nt.user_ip as user_name, |
| 176 | + count(*) as messages |
| 177 | + FROM user_newtalk nt |
| 178 | + WHERE nt.user_ip IN (""" + userString + """) |
| 179 | + GROUP BY nt.user_ip, NULL |
| 180 | + """ |
| 181 | + ) |
| 182 | + for post in cursor: |
| 183 | + yield post |
| 184 | + |
| 185 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/newbie_warnings/track_messages.py |
— | — | @@ -91,7 +91,7 @@ |
92 | 92 | cursor = self.usersConn.cursor(MySQLdb.cursors.DictCursor) |
93 | 93 | cursor.execute( |
94 | 94 | """ |
95 | | - SELECT rc_timestamp AS time |
| 95 | + SELECT SQL_NO_CACHE rc_timestamp AS time |
96 | 96 | FROM recentchanges |
97 | 97 | ORDER BY rc_timestamp DESC |
98 | 98 | LIMIT 1 |
Index: trunk/tools/wsor/newbie_warnings/queries.sql |
— | — | @@ -181,3 +181,18 @@ |
182 | 182 | FROM user_newtalk nt |
183 | 183 | WHERE nt.user_ip IN ("EpochFail") |
184 | 184 | GROUP BY nt.user_ip, NULL; |
| 185 | + |
| 186 | + |
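| | +-- User talk pages (namespace 3) that transclude the Huggle warning |
| | +-- tracking templates Z49-Z56 from the Template namespace (10). |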
| 187 | +SELECT |
| 188 | + p.page_id as user_talk_id, |
| 189 | + p.page_title as user_talk_page, |
| 190 | +  REPLACE(p.page_title, "_", " ") as user_name, |
| 191 | + tl.tl_title as template |
| 192 | +FROM enwiki.templatelinks tl |
| 193 | +INNER JOIN enwiki.page p |
| 194 | + ON page_id = tl_from |
| 195 | +WHERE tl_title IN ('Z49','Z50','Z51','Z52','Z53','Z54','Z55','Z56') |
| 196 | +AND tl_namespace = 10 |
| 197 | +AND page_namespace = 3; |
| 198 | + |
| 199 | + |
Index: trunk/tools/wsor/newbie_warnings/track_hugglers.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | time |
21 | 21 | ]) |
22 | 22 | ) |
| 23 | + sys.stdout.flush() |
23 | 24 | |
24 | 25 | |
25 | 26 | def main(): |
— | — | @@ -149,10 +150,7 @@ |
150 | 151 | ON r.rev_page = p.page_id |
151 | 152 | WHERE p.page_namespace = 3 |
152 | 153 | AND r.rev_timestamp >= %(timestamp)s |
153 | | - AND ( |
154 | | - r.rev_comment LIKE %(like)s OR |
155 | | - r.rev_comment LIKE %(clue)s |
156 | | - ) |
| 154 | + AND r.rev_comment LIKE %(like)s |
157 | 155 | """, |
158 | 156 | { |
159 | 157 | "timestamp": timestamp, |
— | — | @@ -167,7 +165,7 @@ |
168 | 166 | userString = ",".join(safe(u) for u in users) |
169 | 167 | if len(userString) != 0: |
170 | 168 | cursor.execute(""" |
171 | | - SELECT |
| 169 | + SELECT |
172 | 170 | u.user_id, |
173 | 171 | u.user_name, |
174 | 172 | count(*) as messages |
Index: trunk/tools/wsor/first_session/get_first_n_sessions.py |
— | — | @@ -0,0 +1,247 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | +    if v is None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def main(): |
| 15 | + parser = argparse.ArgumentParser( |
| 16 | + description='Gathers editor data for first and last session' |
| 17 | + ) |
| 18 | + parser.add_argument( |
| 19 | + 'n', |
| 20 | + type=int, |
| 21 | +    help='the minimum number of edits that editors must have performed to be included' |
| 22 | + ) |
| 23 | + parser.add_argument( |
| 24 | + 'session', |
| 25 | + type=int, |
| 26 | + help='maximum time between session edits (in seconds)' |
| 27 | + ) |
| 28 | + parser.add_argument( |
| 29 | + '-c', '--cnf', |
| 30 | + metavar="<path>", |
| 31 | + type=str, |
| 32 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 33 | + default=os.path.expanduser("~/.my.cnf") |
| 34 | + ) |
| 35 | + parser.add_argument( |
| 36 | + '-s', '--host', |
| 37 | + type=str, |
| 38 | + help='the database host to connect to (defaults to localhost)', |
| 39 | + default="localhost" |
| 40 | + ) |
| 41 | + parser.add_argument( |
| 42 | + '-d', '--db', |
| 43 | + type=str, |
| 44 | + help='the language db to run the query in (defaults to enwiki)', |
| 45 | + default="enwiki" |
| 46 | + ) |
| 47 | + parser.add_argument( |
| 48 | + '-o', '--out', |
| 49 | + type=lambda fn:open(fn, 'w'), |
| 50 | + help='an output file to write to (defaults to stdout)', |
| 51 | + default=sys.stdout |
| 52 | + ) |
| 53 | + args = parser.parse_args() |
| 54 | + |
| 55 | + LOGGING_STREAM = sys.stderr |
| 56 | + logging.basicConfig( |
| 57 | + level=logging.DEBUG, |
| 58 | + stream=LOGGING_STREAM, |
| 59 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 60 | + datefmt='%b-%d %H:%M:%S' |
| 61 | + ) |
| 62 | + |
| 63 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 64 | + db = Database( |
| 65 | + host=args.host, |
| 66 | + db=args.db, |
| 67 | + read_default_file=args.cnf |
| 68 | + ) |
| 69 | + headers = [ |
| 70 | + 'user_id', |
| 71 | + 'user_name', |
| 72 | + 'first_edit', |
| 73 | + 'last_edit', |
| 74 | + 'edit_count' |
| 75 | + ] |
| 76 | + for i in range(0, args.n): |
| 77 | + headers.append("es_%s_start" % i) |
| 78 | + headers.append("es_%s_end" % i) |
| 79 | + headers.append("es_%s_edits" % i) |
| 80 | + headers.append("es_%s_reverted" % i) |
| 81 | + headers.append("es_%s_vandalism" % i) |
| 82 | + headers.append("es_%s_deleted" % i) |
| 83 | + |
| 84 | + |
| 85 | +    args.out.write("\t".join(headers) + "\n") #write headers to the same stream as the data rows |
| 86 | + |
| 87 | + logging.info("Loading users:") |
| 88 | + |
| 89 | + users = [] |
| 90 | + for user in db.getSampledUsers(): |
| 91 | + users.append(user) |
| 92 | + LOGGING_STREAM.write(".") |
| 93 | + LOGGING_STREAM.write("\n") |
| 94 | + |
| 95 | + logging.info("Processing users:") |
| 96 | + for user in users: |
| 97 | + i = 0 |
| 98 | + for session in sessions(db.getEdits(user['user_id']), args.session): |
| 99 | + user['es_%s_start' % i] = session[0]['timestamp'] |
| 100 | + user['es_%s_end' % i] = session[-1]['timestamp'] |
| 101 | + user['es_%s_edits' % i] = len(session) |
| 102 | + user['es_%s_reverted' % i] = 0 |
| 103 | + user['es_%s_vandalism' % i] = 0 |
| 104 | + user['es_%s_deleted' % i] = 0 |
| 105 | + |
| 106 | + for edit in session: |
| 107 | + user['es_%s_reverted' % i] += edit['is_reverted'] |
| 108 | + user['es_%s_vandalism' % i] += edit['is_vandalism'] |
| 109 | + user['es_%s_deleted' % i] += edit['deleted'] |
| 110 | + |
| 111 | + i += 1 |
| 112 | + if i >= args.n: |
| 113 | + break |
| 114 | + |
| 115 | + |
| 116 | + args.out.write("\t".join(encode(user.get(h)) for h in headers) + "\n") |
| 117 | + LOGGING_STREAM.write(".") |
| 118 | + |
| 119 | + LOGGING_STREAM.write("\n") |
| 120 | + |
| 121 | + |
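| | +#Split a chronological edit stream into sessions: consecutive edits less |
| | +#than sessionThreshold seconds apart share a session. E.g. with a 3600s |
| | +#threshold, edits at t=0, 100 and 4000 form two sessions: [0, 100] and [4000]. |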
| 122 | +def sessions(edits, sessionThreshold=3600): |
| 123 | + sessionEdits = [] |
| 124 | + for edit in edits: |
| 125 | + edit['timestamp'] = wmf.wp2Timestamp(edit['rev_timestamp']) |
| 126 | + if len(sessionEdits) == 0: |
| 127 | + sessionEdits.append(edit) |
| 128 | + elif (edit['timestamp'] - sessionEdits[-1]['timestamp']) < sessionThreshold: |
| 129 | + sessionEdits.append(edit) |
| 130 | + else: |
| 131 | + yield sessionEdits |
| 132 | + sessionEdits = [edit] |
| 133 | + |
| 134 | + |
| 135 | + if len(sessionEdits) > 0: |
| 136 | + yield sessionEdits |
| 137 | + |
| 138 | + |
| 139 | + |
| 140 | + |
| 141 | +class Database: |
| 142 | + |
| 143 | + def __init__(self, *args, **kwargs): |
| 144 | + self.args = args |
| 145 | + self.kwargs = kwargs |
| 146 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 147 | + self.revsConn = MySQLdb.connect(*args, **kwargs) |
| 148 | + self.archConn = MySQLdb.connect(*args, **kwargs) |
| 149 | + |
| 150 | + def getSampledUsers(self): |
| 151 | + cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 152 | + cursor.execute( |
| 153 | + """ |
| 154 | + SELECT |
| 155 | + u.user_id, |
| 156 | + u.user_name, |
| 157 | + um.first_edit, |
| 158 | + um.last_edit, |
| 159 | + u.user_editcount as edit_count |
| 160 | + FROM halfak.user_session_sample us |
| 161 | + INNER JOIN user u |
| 162 | + ON u.user_id = us.user_id |
| 163 | + INNER JOIN halfak.user_meta_20110715 um |
| 164 | + ON u.user_id = um.user_id |
| 165 | + """ |
| 166 | + ) |
| 167 | + for row in cursor: |
| 168 | + yield row |
| 169 | + |
| 170 | + |
| 171 | + |
| 172 | + def getEdits(self, userId, chronologically=True): |
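| | +#Merge live edits (revision) and deleted edits (archive) into one |
| | +#timestamp-ordered stream using two server-side cursors. |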
| 173 | + userId = int(userId) |
| 174 | + revisionCursor = self.revsConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 175 | + archiveCursor = self.archConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 176 | + |
| 177 | + if chronologically: direction = "ASC" |
| 178 | + else: direction = "DESC" |
| 179 | + |
| 180 | + revisionCursor.execute( |
| 181 | + """ |
| 182 | + SELECT |
| 183 | + r.rev_id, |
| 184 | + r.rev_timestamp, |
| 185 | + rvtd.revision_id IS NOT NULL AS is_reverted, |
| 186 | + rvtd.is_vandalism IS NOT NULL AND rvtd.is_vandalism = TRUE AS is_vandalism, |
| 187 | + False AS deleted |
| 188 | + FROM revision r |
| 189 | + LEFT JOIN halfak.reverted_20110115 rvtd |
| 190 | + ON r.rev_id = rvtd.revision_id |
| 191 | + WHERE rev_user = %(user_id)s |
| 192 | + ORDER BY r.rev_timestamp """ + direction + """ |
| 193 | + """, |
| 194 | + { |
| 195 | + 'user_id': userId |
| 196 | + } |
| 197 | + ) |
| 198 | + archiveCursor.execute( |
| 199 | + """ |
| 200 | + SELECT |
| 201 | + ar_rev_id AS rev_id, |
| 202 | + ar_timestamp AS rev_timestamp, |
| 203 | + False AS is_reverted, |
| 204 | + False AS is_vandalism, |
| 205 | + True AS deleted |
| 206 | + FROM archive |
| 207 | + WHERE ar_user = %(user_id)s |
| 208 | + ORDER BY ar_timestamp """ + direction + """ |
| 209 | + """, |
| 210 | + { |
| 211 | + 'user_id': userId |
| 212 | + } |
| 213 | + ) |
| 214 | + if chronologically: |
| 215 | + order = lambda t1, t2:t1 < t2 |
| 216 | + else: |
| 217 | + order = lambda t1, t2:t1 > t2 |
| 218 | + |
| 219 | + revPointer = revisionCursor.fetchone() |
| 220 | + archPointer = archiveCursor.fetchone() |
| 221 | + while revPointer != None or archPointer != None: #still something to output |
| 222 | + if revPointer != None and archPointer != None: #both cursors still have something |
| 223 | + if order(revPointer['rev_timestamp'], archPointer['rev_timestamp']): |
| 224 | + yield revPointer |
| 225 | + revPointer = revisionCursor.fetchone() |
| 226 | + else: |
| 227 | + yield archPointer |
| 228 | + archPointer = archiveCursor.fetchone() |
| 229 | + elif revPointer != None: #only revisions left |
| 230 | + yield revPointer |
| 231 | + revPointer = revisionCursor.fetchone() |
| 232 | + elif archPointer != None: #only archives left |
| 233 | + yield archPointer |
| 234 | + archPointer = archiveCursor.fetchone() |
| 235 | + |
| 236 | + revisionCursor.close() |
| 237 | + archiveCursor.close() |
| 238 | + |
| 239 | + |
| 240 | + |
| 241 | +    def getFirstEdits(self, userId, maximum=10000): |
| | +        #'maximum' is accepted for symmetry but currently unused |
| 242 | +        return self.getEdits(userId, chronologically=True) |
| 243 | + |
| 244 | +    def getLastEdits(self, userId, maximum=10000): |
| 245 | +        return self.getEdits(userId, chronologically=False) |
| 246 | + |
| 247 | + |
| 248 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/first_session/R/.Rhistory |
— | — | @@ -0,0 +1,512 @@ |
| 2 | +pch=4, |
| 3 | +lty=4 |
| 4 | +), |
| 5 | +"32"=list( |
| 6 | +col="#00BBBB", |
| 7 | +pch=5, |
| 8 | +lty=5 |
| 9 | +), |
| 10 | +"64"=list( |
| 11 | +col="#BB00BB", |
| 12 | +pch=6, |
| 13 | +lty=6 |
| 14 | +) |
| 15 | +) |
| 16 | +xyplot( |
| 17 | +early_survival ~ year, |
| 18 | +data=limited_year_edits_props, |
| 19 | +groups=es_0_bucket, |
| 20 | +panel=function(x, y, subscripts, groups, ...){ |
| 21 | +f = limited_year_edits_props[subscripts,] |
| 22 | +for(group in groups){ |
| 23 | +group = as.character(group) |
| 24 | +subf = f[f$es_0_bucket == group,] |
| 25 | +p = subf$early_survival |
| 26 | +x = subf$year |
| 27 | +n = subf$n |
| 28 | +panel.xyplot( |
| 29 | +x, p, |
| 30 | +col=params[[group]]$col, |
| 31 | +pch=params[[group]]$pch, |
| 32 | +... |
| 33 | +) |
| 34 | +panel.lines( |
| 35 | +x, p, |
| 36 | +col=params[[group]]$col, |
| 37 | +lwd=2, |
| 38 | +... |
| 39 | +) |
| 40 | +se = sqrt(p*(1-p)/n) |
| 41 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 42 | +} |
| 43 | +}, |
| 44 | +ylim=c(0, 1), |
| 45 | +main="Early survival proportion for new editors grouped by edits in their first session", |
| 46 | +ylab="Proportion of surviving editors", |
| 47 | +xlab="Years", |
| 48 | +sub="early survival = editing more than 1 month after first session", |
| 49 | +auto.key=list( |
| 50 | +text=paste("~", names(params), "edits"), |
| 51 | +col=c( |
| 52 | +"#000000", |
| 53 | +"#FF0000", |
| 54 | +"#00FF00", |
| 55 | +"#0000FF", |
| 56 | +"#BBBB00", |
| 57 | +"#00BBBB", |
| 58 | +"#BB00BB" |
| 59 | +) |
| 60 | +) |
| 61 | +) |
| 62 | +dev.off() |
| 63 | +user_sessions$es_0_no_arch = 2^round(log((user_sessions$es_0_edits - user_sessions$es_0_deleted)+1, base=2)) |
| 64 | +no_arch_edits_props = with( |
| 65 | +summaryBy( |
| 66 | +early_survival ~ year + es_0_no_arch, |
| 67 | +data=user_sessions[ |
| 68 | +!is.na(user_sessions$year) & |
| 69 | +user_sessions$es_0_no_arch <= 256, |
| 70 | +], |
| 71 | +FUN=c(mean, length) |
| 72 | +), |
| 73 | +data.frame( |
| 74 | +year = year, |
| 75 | +es_0_no_arch = es_0_no_arch, |
| 76 | +early_survival = early_survival.mean, |
| 77 | +n = early_survival.length |
| 78 | +) |
| 79 | +) |
| 80 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 81 | +limited_year_edits_props = no_arch_edits_props[ |
| 82 | +no_arch_edits_props$n >= 10 & |
| 83 | +no_arch_edits_props$es_0_no_arch <= 16, |
| 84 | +] |
| 85 | +params = list( |
| 86 | +"0"=list( |
| 87 | +col="#AAAAAA", |
| 88 | +pch=0, |
| 89 | +lty=0 |
| 90 | +), |
| 91 | +"1"=list( |
| 92 | +col="#000000", |
| 93 | +pch=0, |
| 94 | +lty=0 |
| 95 | +), |
| 96 | +"2"=list( |
| 97 | +col="#FF0000", |
| 98 | +pch=1, |
| 99 | +lty=1 |
| 100 | +), |
| 101 | +"4"=list( |
| 102 | +col="#00FF00", |
| 103 | +pch=2, |
| 104 | +lty=2 |
| 105 | +), |
| 106 | +"8"=list( |
| 107 | +col="#0000FF", |
| 108 | +pch=3, |
| 109 | +lty=3 |
| 110 | +), |
| 111 | +"16"=list( |
| 112 | +col="#BBBB00", |
| 113 | +pch=4, |
| 114 | +lty=4 |
| 115 | +) |
| 116 | +) |
| 117 | +xyplot( |
| 118 | +early_survival ~ year, |
| 119 | +data=limited_year_edits_props, |
| 120 | +groups=es_0_no_arch, |
| 121 | +panel=function(x, y, subscripts, groups, ...){ |
| 122 | +f = limited_year_edits_props[subscripts,] |
| 123 | +for(group in groups){ |
| 124 | +group = as.character(group) |
| 125 | +subf = f[f$es_0_no_arch == group,] |
| 126 | +p = subf$early_survival |
| 127 | +x = subf$year |
| 128 | +n = subf$n |
| 129 | +panel.xyplot( |
| 130 | +x, p, |
| 131 | +col=params[[group]]$col, |
| 132 | +pch=params[[group]]$pch, |
| 133 | +... |
| 134 | +) |
| 135 | +panel.lines( |
| 136 | +x, p, |
| 137 | +col=params[[group]]$col, |
| 138 | +lwd=2, |
| 139 | +... |
| 140 | +) |
| 141 | +se = sqrt(p*(1-p)/n) |
| 142 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 143 | +} |
| 144 | +}, |
| 145 | +ylim=c(0, 1), |
| 146 | +main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 147 | +ylab="Proportion of surviving editors", |
| 148 | +xlab="Years", |
| 149 | +sub="early survival = editing more than 1 month after first session", |
| 150 | +auto.key=list( |
| 151 | +text=paste("~", names(params), "edits"), |
| 152 | +col=c( |
| 153 | +"#AAAAAA", |
| 154 | +"#000000", |
| 155 | +"#FF0000", |
| 156 | +"#00FF00", |
| 157 | +"#0000FF", |
| 158 | +"#BBBB00", |
| 159 | +"#00BBBB", |
| 160 | +"#BB00BB" |
| 161 | +) |
| 162 | +) |
| 163 | +) |
| 164 | +dev.off() |
| 165 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 166 | +limited_year_edits_props = no_arch_edits_props[ |
| 167 | +no_arch_edits_props$n >= 10 & |
| 168 | +no_arch_edits_props$es_0_no_arch <= 16, |
| 169 | +] |
| 170 | +params = list( |
| 171 | +"0"=list( |
| 172 | +col="#AAAAAA", |
| 173 | +pch=0, |
| 174 | +lty=0 |
| 175 | +), |
| 176 | +"1"=list( |
| 177 | +col="#000000", |
| 178 | +pch=0, |
| 179 | +lty=0 |
| 180 | +), |
| 181 | +"2"=list( |
| 182 | +col="#FF0000", |
| 183 | +pch=1, |
| 184 | +lty=1 |
| 185 | +), |
| 186 | +"4"=list( |
| 187 | +col="#00FF00", |
| 188 | +pch=2, |
| 189 | +lty=2 |
| 190 | +), |
| 191 | +"8"=list( |
| 192 | +col="#0000FF", |
| 193 | +pch=3, |
| 194 | +lty=3 |
| 195 | +), |
| 196 | +"16"=list( |
| 197 | +col="#BBBB00", |
| 198 | +pch=4, |
| 199 | +lty=4 |
| 200 | +) |
| 201 | +) |
| 202 | +xyplot( |
| 203 | +early_survival ~ year, |
| 204 | +data=limited_year_edits_props, |
| 205 | +groups=es_0_no_arch, |
| 206 | +panel=function(x, y, subscripts, groups, ...){ |
| 207 | +f = limited_year_edits_props[subscripts,] |
| 208 | +for(group in groups){ |
| 209 | +group = as.character(group) |
| 210 | +subf = f[f$es_0_no_arch == group,] |
| 211 | +p = subf$early_survival |
| 212 | +x = subf$year |
| 213 | +n = subf$n |
| 214 | +panel.xyplot( |
| 215 | +x, p, |
| 216 | +col=params[[group]]$col, |
| 217 | +pch=params[[group]]$pch, |
| 218 | +... |
| 219 | +) |
| 220 | +panel.lines( |
| 221 | +x, p, |
| 222 | +col=params[[group]]$col, |
| 223 | +lwd=2, |
| 224 | +... |
| 225 | +) |
| 226 | +se = sqrt(p*(1-p)/n) |
| 227 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 228 | +} |
| 229 | +}, |
| 230 | +ylim=c(0, 1), |
| 231 | +main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 232 | +ylab="Proportion of surviving editors", |
| 233 | +xlab="Years", |
| 234 | +sub="early survival = editing more than 1 month after first session", |
| 235 | +auto.key=list( |
| 236 | +text=paste("~", names(params), "edits"), |
| 237 | +col=c( |
| 238 | +"#AAAAAA", |
| 239 | +"#000000", |
| 240 | +"#FF0000", |
| 241 | +"#00FF00", |
| 242 | +"#0000FF", |
| 243 | +"#BBBB00", |
| 244 | +"#00BBBB", |
| 245 | +"#BB00BB" |
| 246 | +), |
| 247 | +points=F |
| 248 | +) |
| 249 | +) |
| 250 | +dev.off() |
| 251 | +png("plots/early_survival.by_year.es_lines.png", height=768, width=1024) |
| 252 | +limited_year_edits_props = year_edits_props[ |
| 253 | +year_edits_props$n >= 10 & |
| 254 | +year_edits_props$es_0_bucket <= 16, |
| 255 | +] |
| 256 | +params = list( |
| 257 | +"1"=list( |
| 258 | +col="#000000", |
| 259 | +pch=0, |
| 260 | +lty=0 |
| 261 | +), |
| 262 | +"2"=list( |
| 263 | +col="#FF0000", |
| 264 | +pch=1, |
| 265 | +lty=1 |
| 266 | +), |
| 267 | +"4"=list( |
| 268 | +col="#00FF00", |
| 269 | +pch=2, |
| 270 | +lty=2 |
| 271 | +), |
| 272 | +"8"=list( |
| 273 | +col="#0000FF", |
| 274 | +pch=3, |
| 275 | +lty=3 |
| 276 | +), |
| 277 | +"16"=list( |
| 278 | +col="#BBBB00", |
| 279 | +pch=4, |
| 280 | +lty=4 |
| 281 | +), |
| 282 | +"32"=list( |
| 283 | +col="#00BBBB", |
| 284 | +pch=5, |
| 285 | +lty=5 |
| 286 | +), |
| 287 | +"64"=list( |
| 288 | +col="#BB00BB", |
| 289 | +pch=6, |
| 290 | +lty=6 |
| 291 | +) |
| 292 | +) |
| 293 | +xyplot( |
| 294 | +early_survival ~ year, |
| 295 | +data=limited_year_edits_props, |
| 296 | +groups=es_0_bucket, |
| 297 | +panel=function(x, y, subscripts, groups, ...){ |
| 298 | +f = limited_year_edits_props[subscripts,] |
| 299 | +for(group in groups){ |
| 300 | +group = as.character(group) |
| 301 | +subf = f[f$es_0_bucket == group,] |
| 302 | +p = subf$early_survival |
| 303 | +x = subf$year |
| 304 | +n = subf$n |
| 305 | +panel.xyplot( |
| 306 | +x, p, |
| 307 | +col=params[[group]]$col, |
| 308 | +pch=params[[group]]$pch, |
| 309 | +... |
| 310 | +) |
| 311 | +panel.lines( |
| 312 | +x, p, |
| 313 | +col=params[[group]]$col, |
| 314 | +lwd=2, |
| 315 | +... |
| 316 | +) |
| 317 | +se = sqrt(p*(1-p)/n) |
| 318 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 319 | +} |
| 320 | +}, |
| 321 | +ylim=c(0, 1), |
| 322 | +main="Early survival proportion for new editors grouped by edits in their first session", |
| 323 | +ylab="Proportion of surviving editors", |
| 324 | +xlab="Years", |
| 325 | +sub="early survival = editing more than 1 month after first session", |
| 326 | +auto.key=list( |
| 327 | +text=paste("~", names(params), "edits"), |
| 328 | +col=c( |
| 329 | +"#000000", |
| 330 | +"#FF0000", |
| 331 | +"#00FF00", |
| 332 | +"#0000FF", |
| 333 | +"#BBBB00", |
| 334 | +"#00BBBB", |
| 335 | +"#BB00BB" |
| 336 | +), |
| 337 | +points=F |
| 338 | +) |
| 339 | +) |
| 340 | +dev.off() |
| 341 | +user_sessions$es_0_no_arch = 2^round(log(user_sessions$es_0_edits - user_sessions$es_0_deleted, base=2)) |
| 342 | +no_arch_edits_props = with( |
| 343 | +summaryBy( |
| 344 | +early_survival ~ year + es_0_no_arch, |
| 345 | +data=user_sessions[ |
| 346 | +!is.na(user_sessions$year) & |
| 347 | +user_sessions$es_0_no_arch <= 256, |
| 348 | +], |
| 349 | +FUN=c(mean, length) |
| 350 | +), |
| 351 | +data.frame( |
| 352 | +year = year, |
| 353 | +es_0_no_arch = es_0_no_arch, |
| 354 | +early_survival = early_survival.mean, |
| 355 | +n = early_survival.length |
| 356 | +) |
| 357 | +) |
| 358 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 359 | +limited_year_edits_props = no_arch_edits_props[ |
| 360 | +no_arch_edits_props$n >= 10 & |
| 361 | +no_arch_edits_props$es_0_no_arch <= 16, |
| 362 | +] |
| 363 | +params = list( |
| 364 | +"0"=list( |
| 365 | +col="#AAAAAA", |
| 366 | +pch=0, |
| 367 | +lty=0 |
| 368 | +), |
| 369 | +"1"=list( |
| 370 | +col="#000000", |
| 371 | +pch=0, |
| 372 | +lty=0 |
| 373 | +), |
| 374 | +"2"=list( |
| 375 | +col="#FF0000", |
| 376 | +pch=1, |
| 377 | +lty=1 |
| 378 | +), |
| 379 | +"4"=list( |
| 380 | +col="#00FF00", |
| 381 | +pch=2, |
| 382 | +lty=2 |
| 383 | +), |
| 384 | +"8"=list( |
| 385 | +col="#0000FF", |
| 386 | +pch=3, |
| 387 | +lty=3 |
| 388 | +), |
| 389 | +"16"=list( |
| 390 | +col="#BBBB00", |
| 391 | +pch=4, |
| 392 | +lty=4 |
| 393 | +) |
| 394 | +) |
| 395 | +xyplot( |
| 396 | +early_survival ~ year, |
| 397 | +data=limited_year_edits_props, |
| 398 | +groups=es_0_no_arch, |
| 399 | +panel=function(x, y, subscripts, groups, ...){ |
| 400 | +f = limited_year_edits_props[subscripts,] |
| 401 | +for(group in groups){ |
| 402 | +group = as.character(group) |
| 403 | +subf = f[f$es_0_no_arch == group,] |
| 404 | +p = subf$early_survival |
| 405 | +x = subf$year |
| 406 | +n = subf$n |
| 407 | +panel.xyplot( |
| 408 | +x, p, |
| 409 | +col=params[[group]]$col, |
| 410 | +pch=params[[group]]$pch, |
| 411 | +... |
| 412 | +) |
| 413 | +panel.lines( |
| 414 | +x, p, |
| 415 | +col=params[[group]]$col, |
| 416 | +lwd=2, |
| 417 | +... |
| 418 | +) |
| 419 | +se = sqrt(p*(1-p)/n) |
| 420 | +panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 421 | +} |
| 422 | +}, |
| 423 | +ylim=c(0, 1), |
| 424 | +main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 425 | +ylab="Proportion of surviving editors", |
| 426 | +xlab="Years", |
| 427 | +sub="early survival = editing more than 1 month after first session", |
| 428 | +auto.key=list( |
| 429 | +text=paste("~", names(params), "edits"), |
| 430 | +col=c( |
| 431 | +"#AAAAAA", |
| 432 | +"#000000", |
| 433 | +"#FF0000", |
| 434 | +"#00FF00", |
| 435 | +"#0000FF", |
| 436 | +"#BBBB00", |
| 437 | +"#00BBBB", |
| 438 | +"#BB00BB" |
| 439 | +), |
| 440 | +points=F |
| 441 | +) |
| 442 | +) |
| 443 | +dev.off() |
| 444 | +es_0_bucket = 10^floor(log(user_sessions$es_0_edits, base=10)) |
| 445 | +table(es_0_bucket) |
| 446 | +three_es_buckets = with( |
| 447 | +summaryBy( |
| 448 | +es_0_edits + |
| 449 | +es_1_edits + |
| 450 | +es_2_edits ~ |
| 451 | +year + es_0_bucket, |
| 452 | +data=user_sessions, |
| 453 | +FUN=c(mean, sd, length) |
| 454 | +), |
| 455 | +data.frame( |
| 456 | +year = year |
| 457 | +es_0_bucket = es_0_bucket, |
| 458 | +es_0_mean = es_0_edits.mean, |
| 459 | +es_0_sd = es_0_edits.sd, |
| 460 | +es_0_n = es_0_edits.length, |
| 461 | +es_1_mean = es_1_edits.mean, |
| 462 | +es_1_sd = es_1_edits.sd, |
| 463 | +es_1_n = es_1_edits.length, |
| 464 | +es_2_mean = es_2_edits.mean, |
| 465 | +es_2_sd = es_2_edits.sd, |
| 466 | +es_2_n = es_2_edits.length |
| 467 | +) |
| 468 | +) |
| | +three_es_buckets = with( |
| 469 | +summaryBy( |
| 470 | +es_0_edits + |
| 471 | +es_1_edits + |
| 472 | +es_2_edits ~ |
| 473 | +year + es_0_bucket, |
| 474 | +data=user_sessions, |
| 475 | +FUN=c(mean, sd, length) |
| 476 | +), |
| 477 | +data.frame( |
| 478 | +year = year, |
| 479 | +bucket = es_0_bucket, |
| 480 | +es_0_mean = es_0_edits.mean, |
| 481 | +es_0_sd = es_0_edits.sd, |
| 482 | +es_0_n = es_0_edits.length, |
| 483 | +es_1_mean = es_1_edits.mean, |
| 484 | +es_1_sd = es_1_edits.sd, |
| 485 | +es_1_n = es_1_edits.length, |
| 486 | +es_2_mean = es_2_edits.mean, |
| 487 | +es_2_sd = es_2_edits.sd, |
| 488 | +es_2_n = es_2_edits.length |
| 489 | +) |
| 490 | +three_es_buckets = with( |
| 491 | +summaryBy( |
| 492 | +es_0_edits + |
| 493 | +es_1_edits + |
| 494 | +es_2_edits ~ |
| 495 | +year + es_0_bucket, |
| 496 | +data=user_sessions, |
| 497 | +FUN=c(mean, sd, length) |
| 498 | +), |
| 499 | +data.frame( |
| 500 | +year = year, |
| 501 | +bucket = es_0_bucket, |
| 502 | +es_0_mean = es_0_edits.mean, |
| 503 | +es_0_sd = es_0_edits.sd, |
| 504 | +es_0_n = es_0_edits.length, |
| 505 | +es_1_mean = es_1_edits.mean, |
| 506 | +es_1_sd = es_1_edits.sd, |
| 507 | +es_1_n = es_1_edits.length, |
| 508 | +es_2_mean = es_2_edits.mean, |
| 509 | +es_2_sd = es_2_edits.sd, |
| 510 | +es_2_n = es_2_edits.length |
| 511 | +) |
| 512 | +) |
| 513 | +three_es_buckets |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.no_archive.png |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 514 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_first_session.png |
___________________________________________________________________ |
Added: svn:mime-type |
2 | 515 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.png |
___________________________________________________________________ |
Added: svn:mime-type |
3 | 516 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_lines.png |
___________________________________________________________________ |
Added: svn:mime-type |
4 | 517 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.png |
___________________________________________________________________ |
Added: svn:mime-type |
5 | 518 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.no_vandals.png |
___________________________________________________________________ |
Added: svn:mime-type |
6 | 519 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.png |
___________________________________________________________________ |
Added: svn:mime-type |
7 | 520 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.boxplot.png |
___________________________________________________________________ |
Added: svn:mime-type |
8 | 521 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_10.png |
___________________________________________________________________ |
Added: svn:mime-type |
9 | 522 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year.es_100.png |
___________________________________________________________________ |
Added: svn:mime-type |
10 | 523 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/early_survival.by_year_and_rejection.no_vandals.png |
___________________________________________________________________ |
Added: svn:mime-type |
11 | 524 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_sessions.by_year_and_es_0_bucket.png |
___________________________________________________________________ |
Added: svn:mime-type |
12 | 525 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/plots/edit_count_distribution.prop.png |
___________________________________________________________________ |
Added: svn:mime-type |
13 | 526 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/first_session_characteristics.R |
— | — | @@ -0,0 +1,132 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | +user_sessions = load_user_sessions() |
| 8 | +user_sessions$year = strftime(user_sessions$first_edit, format="%Y") |
| 9 | +user_sessions$early_survival = user_sessions$last_edit - user_sessions$es_0_end >= 30 |
| 10 | + |
| 11 | +user_sessions$es_0_bucket = 10^floor(log(user_sessions$es_0_edits, base=10)) |
| 12 | +user_sessions$es_1_edits = naReplace(user_sessions$es_1_edits, 0) |
| 13 | +user_sessions$es_2_edits = naReplace(user_sessions$es_2_edits, 0) |
| 14 | + |
| 15 | + |
| 16 | + |
| 17 | + |
| 18 | + |
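| | +# Mean, sd, and count of edits in each of the first three sessions, per |
| | +# year and first-session-size bucket, reshaped to long form for plotting. |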
| 19 | +three_es_buckets = with( |
| 20 | + summaryBy( |
| 21 | + es_0_edits + |
| 22 | + es_1_edits + |
| 23 | + es_2_edits ~ |
| 24 | + year + es_0_bucket, |
| 25 | + data=user_sessions, |
| 26 | + FUN=c(mean, sd, length) |
| 27 | + ), |
| 28 | + rbind( |
| 29 | + data.frame( |
| 30 | + year = year, |
| 31 | + bucket = es_0_bucket, |
| 32 | + es = 0, |
| 33 | + mean = es_0_edits.mean, |
| 34 | + sd = es_0_edits.sd, |
| 35 | + n = es_0_edits.length |
| 36 | + ), |
| 37 | + data.frame( |
| 38 | + year = year, |
| 39 | + bucket = es_0_bucket, |
| 40 | + es = 1, |
| 41 | + mean = es_1_edits.mean, |
| 42 | + sd = es_1_edits.sd, |
| 43 | + n = es_1_edits.length |
| 44 | + ), |
| 45 | + data.frame( |
| 46 | + year = year, |
| 47 | + bucket = es_0_bucket, |
| 48 | + es = 2, |
| 49 | + mean = es_2_edits.mean, |
| 50 | + sd = es_2_edits.sd, |
| 51 | + n = es_2_edits.length |
| 52 | + ) |
| 53 | + ) |
| 54 | +) |
| 55 | + |
| 56 | + |
| 57 | +png("plots/edit_sessions.by_year_and_es_0_bucket.png", height=768, width=1024) |
| 58 | +limited_three_es_buckets = three_es_buckets[ |
| 59 | + three_es_buckets$n >= 10 & |
| 60 | + three_es_buckets$bucket <= 16, |
| 61 | +] |
| 62 | +params = list( |
| 63 | + "1"=list( |
| 64 | + col="#000000", |
| 65 | + pch=0, |
| 66 | + lty=0 |
| 67 | + ), |
| 68 | + "2"=list( |
| 69 | + col="#FF0000", |
| 70 | + pch=1, |
| 71 | + lty=1 |
| 72 | + ), |
| 73 | + "4"=list( |
| 74 | + col="#00FF00", |
| 75 | + pch=2, |
| 76 | + lty=2 |
| 77 | + ), |
| 78 | + "8"=list( |
| 79 | + col="#0000FF", |
| 80 | + pch=3, |
| 81 | + lty=3 |
| 82 | + ), |
| 83 | + "16"=list( |
| 84 | + col="#BBBB00", |
| 85 | + pch=4, |
| 86 | + lty=4 |
| 87 | + ) |
| 88 | +) |
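| | +# One line per first-session bucket; arrows mark +/- 1 standard error of |
| | +# the mean (sd/sqrt(n)). |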
| 89 | +xyplot( |
| 90 | + mean ~ es | as.factor(year), |
| 91 | + data=limited_three_es_buckets, |
| 92 | + groups=bucket, |
| 93 | + panel=function(x, y, subscripts, groups, ...){ |
| 94 | + f = limited_three_es_buckets[subscripts,] |
| 95 | + for(group in groups){ |
| 96 | + group = as.character(group) |
| 97 | + subf = f[f$bucket == group,] |
| 98 | + y = subf$mean |
| 99 | + x = subf$es |
| 100 | + n = subf$n |
| 101 | + sd = subf$sd |
| 102 | + se = sd/sqrt(n) |
| 103 | + panel.xyplot( |
| 104 | + x, y, |
| 105 | + col=params[[group]]$col, |
| 106 | + pch=params[[group]]$pch, |
| 107 | + ... |
| 108 | + ) |
| 109 | + panel.lines( |
| 110 | + x, y, |
| 111 | + col=params[[group]]$col, |
| 112 | + lwd=2, |
| 113 | + ... |
| 114 | + ) |
| 115 | + panel.arrows(x, y+se, x, y-se, ends="both", col="#777777", angle=90, length=.01) |
| 116 | + } |
| 117 | + }, |
| 118 | + main="Session activity by editor first session group", |
| 119 | + ylab="Average session edits", |
| 120 | + xlab="Edit session", |
| 121 | + auto.key=list( |
| 122 | + text=paste("~", names(params), "edits"), |
| 123 | + col=c( |
| 124 | + "#000000", |
| 125 | + "#FF0000", |
| 126 | + "#00FF00", |
| 127 | + "#0000FF", |
| 128 | + "#BBBB00" |
| 129 | + ), |
| 130 | + points=F |
| 131 | + ) |
| 132 | +) |
| 133 | +dev.off() |
Index: trunk/tools/wsor/first_session/R/first_session_survival.R |
— | — | @@ -0,0 +1,457 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | +user_sessions = load_user_sessions() |
| 8 | +user_sessions$year = strftime(user_sessions$first_edit, format="%Y") |
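| | +# "Early survival": the last edit falls at least 30 days after the end of |
| | +# the first edit session (see plot subtitles). |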
| 9 | +user_sessions$early_survival = user_sessions$last_edit - user_sessions$es_0_end >= 30 |
| 10 | + |
| 11 | +year_props = with( |
| 12 | + summaryBy( |
| 13 | + early_survival ~ year, |
| 14 | + data=user_sessions[!is.na(user_sessions$year),], |
| 15 | + FUN=c(mean, length) |
| 16 | + ), |
| 17 | + data.frame( |
| 18 | + year = year, |
| 19 | + early_survival = early_survival.mean, |
| 20 | + n = early_survival.length |
| 21 | + ) |
| 22 | +) |
| 23 | + |
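| | +# Error bars below use the binomial standard error sqrt(p*(1-p)/n). |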
| 24 | +png("plots/early_survival.by_year.png", height=768, width=1024) |
| 25 | +xyplot( |
| 26 | + early_survival ~ year, |
| 27 | + data=year_props, |
| 28 | + panel=function(x, y, subscripts, ...){ |
| 29 | + f = year_props[subscripts,] |
| 30 | + panel.xyplot(x, y, ...) |
| 31 | + panel.lines(x, y, ...) |
| 32 | + x = f$year |
| 33 | + p = f$early_survival |
| 34 | + n = f$n |
| 35 | + se = sqrt(p*(1-p)/n) |
| 36 | + panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1) |
| 37 | + }, |
| 38 | + ylim=c(0, 1), |
| 39 | + main="Early survival proportion for new editors", |
| 40 | + ylab="Proportion of surviving editors", |
| 41 | + xlab="Year", |
| 42 | + sub="early survival = editing more than 1 month after first session" |
| 43 | +) |
| 44 | +dev.off() |
| 45 | + |
| 46 | +year_props.no_vandal = with( |
| 47 | + summaryBy( |
| 48 | + early_survival ~ year, |
| 49 | + data=user_sessions[ |
| 50 | + !is.na(user_sessions$year) & |
| 51 | + user_sessions$es_0_edits >= 2 & |
| 52 | + user_sessions$es_0_vandalism / user_sessions$es_0_edits <= .25, |
| 53 | + ], |
| 54 | + FUN=c(mean, length) |
| 55 | + ), |
| 56 | + data.frame( |
| 57 | + year = year, |
| 58 | + early_survival = early_survival.mean, |
| 59 | + n = early_survival.length |
| 60 | + ) |
| 61 | +) |
| 62 | + |
| 63 | +png("plots/early_survival.by_year.no_vandals.png", height=768, width=1024) |
| 64 | +xyplot( |
| 65 | + early_survival ~ year, |
| 66 | + data=year_props.no_vandal, |
| 67 | + panel=function(x, y, subscripts, ...){ |
| 68 | + f = year_props.no_vandal[subscripts,] |
| 69 | + panel.xyplot(x, y, ...) |
| 70 | + panel.lines(x, y, ...) |
| 71 | + x = f$year |
| 72 | + p = f$early_survival |
| 73 | + n = f$n |
| 74 | + se = sqrt(p*(1-p)/n) |
| 75 | + panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1) |
| 76 | + }, |
| 77 | + ylim=c(0, 1), |
| 78 | + main="Early survival proportion for new editors (no vandals)", |
| 79 | + ylab="Proportion of surviving editors", |
| 80 | + xlab="Year", |
| 81 | + sub="early survival = editing more than 1 month after first session" |
| 82 | +) |
| 83 | +dev.off() |
| 84 | + |
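| | +# Bucket first-session edit counts to the nearest power of two. |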
| 85 | +user_sessions$es_0_bucket = 2^round(log(user_sessions$es_0_edits, base=2)) |
| 86 | + |
| 87 | +year_edits_props = with( |
| 88 | + summaryBy( |
| 89 | + early_survival ~ year + es_0_bucket, |
| 90 | + data=user_sessions[ |
| 91 | + !is.na(user_sessions$year) & |
| 92 | + user_sessions$es_0_bucket <= 256, |
| 93 | + ], |
| 94 | + FUN=c(mean, length) |
| 95 | + ), |
| 96 | + data.frame( |
| 97 | + year = year, |
| 98 | + es_0_bucket = es_0_bucket, |
| 99 | + early_survival = early_survival.mean, |
| 100 | + n = early_survival.length |
| 101 | + ) |
| 102 | +) |
| 103 | + |
| 104 | +png("plots/early_survival.by_year_and_first_session.png", height=768, width=1024) |
| 105 | +xyplot( |
| 106 | + early_survival ~ es_0_bucket | as.factor(year), |
| 107 | + data=year_edits_props, |
| 108 | + panel=function(x, y, subscripts, ...){ |
| 109 | + f = year_edits_props[subscripts,] |
| 110 | + panel.xyplot(x, y, ...) |
| 111 | + x = log(f$es_0_bucket, base=2) |
| 112 | + p = f$early_survival |
| 113 | + n = f$n |
| 114 | + se = sqrt(p*(1-p)/n) |
| 115 | + panel.arrows(x, p+se, x, p-se, ends="both", angle=90, length=.1) |
| 116 | + panel.lines(-5:10, .2, col="#BBBBBB") |
| 117 | + panel.lines(-5:10, .4, col="#BBBBBB") |
| 118 | + panel.lines(-5:10, .6, col="#BBBBBB") |
| 119 | + panel.lines(-5:10, .8, col="#BBBBBB") |
| 120 | + }, |
| 121 | + ylim=c(0, 1), |
| 122 | + main="Early survival proportion for new editors by first session edits", |
| 123 | + ylab="Proportion of surviving editors", |
| 124 | + xlab="First session edits", |
| 125 | + sub="early survival = editing more than 1 month after first session", |
| 126 | + scales=list(x=list(log=2, at=2^(0:8))), |
| 127 | + xlim=c(.5, 300) |
| 128 | +) |
| 129 | +dev.off() |
| 130 | + |
| 131 | +png("plots/early_survival.by_year.es_lines.png", height=768, width=1024) |
| 132 | +limited_year_edits_props = year_edits_props[ |
| 133 | + year_edits_props$n >= 10 & |
| 134 | + year_edits_props$es_0_bucket <= 16, |
| 135 | +] |
| 136 | +params = list( |
| 137 | + "1"=list( |
| 138 | + col="#000000", |
| 139 | + pch=0, |
| 140 | + lty=0 |
| 141 | + ), |
| 142 | + "2"=list( |
| 143 | + col="#FF0000", |
| 144 | + pch=1, |
| 145 | + lty=1 |
| 146 | + ), |
| 147 | + "4"=list( |
| 148 | + col="#00FF00", |
| 149 | + pch=2, |
| 150 | + lty=2 |
| 151 | + ), |
| 152 | + "8"=list( |
| 153 | + col="#0000FF", |
| 154 | + pch=3, |
| 155 | + lty=3 |
| 156 | + ), |
| 157 | + "16"=list( |
| 158 | + col="#BBBB00", |
| 159 | + pch=4, |
| 160 | + lty=4 |
| 161 | + ), |
| 162 | + "32"=list( |
| 163 | + col="#00BBBB", |
| 164 | + pch=5, |
| 165 | + lty=5 |
| 166 | + ), |
| 167 | + "64"=list( |
| 168 | + col="#BB00BB", |
| 169 | + pch=6, |
| 170 | + lty=6 |
| 171 | + ) |
| 172 | +) |
| 173 | +xyplot( |
| 174 | + early_survival ~ year, |
| 175 | + data=limited_year_edits_props, |
| 176 | + groups=es_0_bucket, |
| 177 | + panel=function(x, y, subscripts, groups, ...){ |
| 178 | + f = limited_year_edits_props[subscripts,] |
| 179 | + for(group in groups){ |
| 180 | + group = as.character(group) |
| 181 | + subf = f[f$es_0_bucket == group,] |
| 182 | + p = subf$early_survival |
| 183 | + x = subf$year |
| 184 | + n = subf$n |
| 185 | + panel.xyplot( |
| 186 | + x, p, |
| 187 | + col=params[[group]]$col, |
| 188 | + pch=params[[group]]$pch, |
| 189 | + ... |
| 190 | + ) |
| 191 | + panel.lines( |
| 192 | + x, p, |
| 193 | + col=params[[group]]$col, |
| 194 | + lwd=2, |
| 195 | + ... |
| 196 | + ) |
| 197 | + se = sqrt(p*(1-p)/n) |
| 198 | + panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 199 | + } |
| 200 | + }, |
| 201 | + ylim=c(0, 1), |
| 202 | + main="Early survival proportion for new editors grouped by edits in their first session", |
| 203 | + ylab="Proportion of surviving editors", |
| 204 | + xlab="Years", |
| 205 | + sub="early survival = editing more than 1 month after first session", |
| 206 | + auto.key=list( |
| 207 | + text=paste("~", names(params), "edits"), |
| 208 | + col=c( |
| 209 | + "#000000", |
| 210 | + "#FF0000", |
| 211 | + "#00FF00", |
| 212 | + "#0000FF", |
| 213 | + "#BBBB00", |
| 214 | + "#00BBBB", |
| 215 | + "#BB00BB" |
| 216 | + ), |
| 217 | + points=F |
| 218 | + ) |
| 219 | +) |
| 220 | +dev.off() |
| 221 | + |
| 222 | + |
| 223 | +user_sessions$es_0_no_arch = 2^round(log(user_sessions$es_0_edits - user_sessions$es_0_deleted, base=2)) |
| 224 | + |
| 225 | +no_arch_edits_props = with( |
| 226 | + summaryBy( |
| 227 | + early_survival ~ year + es_0_no_arch, |
| 228 | + data=user_sessions[ |
| 229 | + !is.na(user_sessions$year) & |
| 230 | + user_sessions$es_0_no_arch <= 256, |
| 231 | + ], |
| 232 | + FUN=c(mean, length) |
| 233 | + ), |
| 234 | + data.frame( |
| 235 | + year = year, |
| 236 | + es_0_no_arch = es_0_no_arch, |
| 237 | + early_survival = early_survival.mean, |
| 238 | + n = early_survival.length |
| 239 | + ) |
| 240 | +) |
| 241 | + |
| 242 | + |
| 243 | +png("plots/early_survival.by_year.es_lines.no_archive.png", height=768, width=1024) |
| 244 | +limited_year_edits_props = no_arch_edits_props[ |
| 245 | + no_arch_edits_props$n >= 10 & |
| 246 | + no_arch_edits_props$es_0_no_arch <= 16, |
| 247 | +] |
| 248 | +params = list( |
| 249 | + "0"=list( |
| 250 | + col="#AAAAAA", |
| 251 | + pch=0, |
| 252 | + lty=0 |
| 253 | + ), |
| 254 | + "1"=list( |
| 255 | + col="#000000", |
| 256 | + pch=0, |
| 257 | + lty=0 |
| 258 | + ), |
| 259 | + "2"=list( |
| 260 | + col="#FF0000", |
| 261 | + pch=1, |
| 262 | + lty=1 |
| 263 | + ), |
| 264 | + "4"=list( |
| 265 | + col="#00FF00", |
| 266 | + pch=2, |
| 267 | + lty=2 |
| 268 | + ), |
| 269 | + "8"=list( |
| 270 | + col="#0000FF", |
| 271 | + pch=3, |
| 272 | + lty=3 |
| 273 | + ), |
| 274 | + "16"=list( |
| 275 | + col="#BBBB00", |
| 276 | + pch=4, |
| 277 | + lty=4 |
| 278 | + ) |
| 279 | +) |
| 280 | +xyplot( |
| 281 | + early_survival ~ year, |
| 282 | + data=limited_year_edits_props, |
| 283 | + groups=es_0_no_arch, |
| 284 | + panel=function(x, y, subscripts, groups, ...){ |
| 285 | + f = limited_year_edits_props[subscripts,] |
| 286 | + for(group in groups){ |
| 287 | + group = as.character(group) |
| 288 | + subf = f[f$es_0_no_arch == group,] |
| 289 | + p = subf$early_survival |
| 290 | + x = subf$year |
| 291 | + n = subf$n |
| 292 | + panel.xyplot( |
| 293 | + x, p, |
| 294 | + col=params[[group]]$col, |
| 295 | + pch=params[[group]]$pch, |
| 296 | + ... |
| 297 | + ) |
| 298 | + panel.lines( |
| 299 | + x, p, |
| 300 | + col=params[[group]]$col, |
| 301 | + lwd=2, |
| 302 | + ... |
| 303 | + ) |
| 304 | + se = sqrt(p*(1-p)/n) |
| 305 | + panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 306 | + } |
| 307 | + }, |
| 308 | + ylim=c(0, 1), |
| 309 | + main="Early survival proportion for new editors grouped by edits (not deleted) in their first session", |
| 310 | + ylab="Proportion of surviving editors", |
| 311 | + xlab="Years", |
| 312 | + sub="early survival = editing more than 1 month after first session", |
| 313 | + auto.key=list( |
| 314 | + text=paste("~", names(params), "edits"), |
| 315 | + col=c( |
| 316 | + "#AAAAAA", |
| 317 | + "#000000", |
| 318 | + "#FF0000", |
| 319 | + "#00FF00", |
| 320 | + "#0000FF", |
| 321 | + "#BBBB00", |
| 322 | + "#00BBBB", |
| 323 | + "#BB00BB" |
| 324 | + ), |
| 325 | + points=F |
| 326 | + ) |
| 327 | +) |
| 328 | +dev.off() |
| 329 | + |
| 330 | + |
| 331 | +user_sessions$years_since_2001 = as.numeric((user_sessions$first_edit - as.POSIXct("2001-01-01"))/365) |
| 332 | +user_sessions$initial_rejection = with( |
| 333 | + user_sessions, |
| 334 | + ( |
| 335 | + naReplace(es_0_deleted, 0) + naReplace(es_0_reverted, 0) + |
| 336 | + naReplace(es_1_deleted, 0) + naReplace(es_1_reverted, 0) + |
| 337 | + naReplace(es_2_deleted, 0) + naReplace(es_2_reverted, 0) |
| 338 | + )/( |
| 339 | + naReplace(es_0_edits, 0) + |
| 340 | + naReplace(es_1_edits, 0) + |
| 341 | + naReplace(es_2_edits, 0) |
| 342 | + ) |
| 343 | +) |
| 344 | +sc = scale |
| 345 | +summary(glm( |
| 346 | + early_survival ~ |
| 347 | + sc(es_0_edits) * |
| 348 | + sc(years_since_2001) * |
| 349 | + sc(initial_rejection), |
| 350 | + data=user_sessions[ |
| 351 | + user_sessions$es_0_edits > 3, |
| 352 | + ], |
| 353 | + family=binomial(link="logit") |
| 354 | +)) |
| 355 | + |
| 356 | + |
| 357 | +user_sessions$initial_rejection_group = round(user_sessions$initial_rejection/2, 1)*2 |
| 358 | + |
| 359 | +survival_by_year_and_rejection = with( |
| 360 | + summaryBy( |
| 361 | + early_survival ~ year + initial_rejection_group, |
| 362 | + data=user_sessions[ |
| 363 | + user_sessions$es_0_edits > 3 & |
| 364 | + user_sessions$es_0_vandalism == 0, |
| 365 | + ], |
| 366 | + FUN=c(mean, length) |
| 367 | + ), |
| 368 | + data.frame( |
| 369 | + year = year, |
| 370 | + rejection_group = initial_rejection_group, |
| 371 | + early_survival = early_survival.mean, |
| 372 | + n = early_survival.length |
| 373 | + ) |
| 374 | +) |
| 375 | + |
| 376 | +png("plots/early_survival.by_year_and_rejection.no_vandals.png", height=768, width=1024) |
| 377 | +limited_frame = survival_by_year_and_rejection[ |
| 378 | + survival_by_year_and_rejection$n >= 10, |
| 379 | +] |
| 380 | +params = list( |
| 381 | + "0"=list( |
| 382 | + col="#AAAAAA", |
| 383 | + pch=0, |
| 384 | + lty=0 |
| 385 | + ), |
| 386 | + "0.2"=list( |
| 387 | + col="#FF0000", |
| 388 | + pch=1, |
| 389 | + lty=1 |
| 390 | + ), |
| 391 | + "0.4"=list( |
| 392 | + col="#0000FF", |
| 393 | + pch=3, |
| 394 | + lty=3 |
| 395 | + ), |
| 396 | + "0.6"=list( |
| 397 | + col="#00BBBB", |
| 398 | + pch=5, |
| 399 | + lty=4 |
| 400 | + ), |
| 401 | + "0.8"=list( |
| 402 | + col="#BB0000", |
| 403 | + pch=7, |
| 404 | + lty=4 |
| 405 | + ), |
| 406 | + "1"=list( |
| 407 | + col="#00BB00", |
| 408 | + pch=9, |
| 409 | + lty=4 |
| 410 | + ) |
| 411 | +) |
| 412 | +xyplot( |
| 413 | + early_survival ~ year, |
| 414 | + data=limited_frame, |
| 415 | + groups=rejection_group, |
| 416 | + panel=function(x, y, subscripts, groups, ...){ |
| 417 | + f = limited_frame[subscripts,] |
| 418 | + for(group in groups){ |
| 419 | + group = as.character(group) |
| 420 | + subf = f[f$rejection_group == group,] |
| 421 | + p = subf$early_survival |
| 422 | + x = subf$year |
| 423 | + n = subf$n |
| 424 | + panel.xyplot( |
| 425 | + x, p, |
| 426 | + col=params[[group]]$col, |
| 427 | + pch=params[[group]]$pch, |
| 428 | + ... |
| 429 | + ) |
| 430 | + panel.lines( |
| 431 | + x, p, |
| 432 | + col=params[[group]]$col, |
| 433 | + lwd=2, |
| 434 | + ... |
| 435 | + ) |
| 436 | + se = sqrt(p*(1-p)/n) |
| 437 | + panel.arrows(x, p+se, x, p-se, ends="both", col="#777777", angle=90, length=.05) |
| 438 | + } |
| 439 | + }, |
| 440 | + ylim=c(0, 1), |
| 441 | + main="Early survival proportion for new editors grouped by early rejection proportion", |
| 442 | + ylab="Proportion of surviving editors", |
| 443 | + xlab="Years", |
| 444 | + sub="early survival = editing more than 1 month after first session\nrejection = proportion of revisions reverted or deleted in first edit sessions.", |
| 445 | + auto.key=list( |
| 446 | + text=paste("~", names(params), "rejection"),
| 447 | + col=c( |
| 448 | + "#AAAAAA", |
| 449 | + "#FF0000", |
| 450 | + "#0000FF", |
| 451 | + "#00BBBB", |
| 452 | + "#BB0000", |
| 453 | + "#00BB00" |
| 454 | + ), |
| 455 | + points=F |
| 456 | + ) |
| 457 | +) |
| 458 | +dev.off() |
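
The error bars drawn by panel.arrows() in the panel functions above come from the normal approximation to the binomial: for a survival proportion p estimated from n editors, se = sqrt(p*(1-p)/n). A minimal Python sketch of that arithmetic (the counts are made up for illustration):

    import math

    def proportion_se(successes, n):
        # Normal-approximation standard error of a proportion,
        # as used for the error bars in the lattice panels above
        p = successes / float(n)
        return p, math.sqrt(p * (1 - p) / n)

    p, se = proportion_se(120, 400)   # e.g. 120 of 400 sampled editors survive
    print("p = %.3f +/- %.3f" % (p, se))   # p = 0.300 +/- 0.023
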
Index: trunk/tools/wsor/first_session/R/loader/user_sessions.R |
— | — | @@ -0,0 +1,38 @@ |
| 2 | +source("util/env.R") |
| 3 | + |
| 4 | +load_user_sessions = function(verbose=T, reload=F){ |
| 5 | + filename = paste(DATA_DIR, "user_sessions.3.tsv", sep="/") |
| 6 | + if(!exists("USER_SESSIONS")){ |
| 7 | + USER_SESSIONS <<- NULL |
| 8 | + } |
| 9 | + if(is.null(USER_SESSIONS) || reload){
| 10 | + USER_SESSIONS <<- NULL |
| 11 | + } |
| 12 | + if(is.null(USER_SESSIONS)){ |
| 13 | + if(verbose){cat("Loading ", filename, "...")} |
| 14 | + USER_SESSIONS <<- read.table( |
| 15 | + filename, |
| 16 | + header=T, sep="\t", |
| 17 | + quote="", comment.char="", |
| 18 | + na.strings="\\N" |
| 19 | + ) |
| 20 | + USER_SESSIONS$first_edit = strptime( |
| 21 | + as.character(USER_SESSIONS$first_edit), |
| 22 | + "%Y%m%d%H%M%S" |
| 23 | + ) |
| 24 | + USER_SESSIONS$last_edit = strptime( |
| 25 | + as.character(USER_SESSIONS$last_edit), |
| 26 | + "%Y%m%d%H%M%S" |
| 27 | + ) |
| 28 | + USER_SESSIONS$es_0_start = as.POSIXct(USER_SESSIONS$es_0_start, origin="1970-01-01") |
| 29 | + USER_SESSIONS$es_1_start = as.POSIXct(USER_SESSIONS$es_1_start, origin="1970-01-01") |
| 30 | + USER_SESSIONS$es_2_start = as.POSIXct(USER_SESSIONS$es_2_start, origin="1970-01-01") |
| 31 | + USER_SESSIONS$es_0_end = as.POSIXct(USER_SESSIONS$es_0_end, origin="1970-01-01") |
| 32 | + USER_SESSIONS$es_1_end = as.POSIXct(USER_SESSIONS$es_1_end, origin="1970-01-01") |
| 33 | + USER_SESSIONS$es_2_end = as.POSIXct(USER_SESSIONS$es_2_end, origin="1970-01-01") |
| 34 | + if(verbose){cat("DONE!\n")} |
| 35 | + } |
| 36 | + USER_SESSIONS |
| 37 | +} |
| 38 | + |
| 39 | + |
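
The loader above caches the parsed table in a global (via <<-) so repeated source()s do not re-read the TSV. A rough Python equivalent of the same load-once pattern, assuming the same tab-separated file with \N marking missing values (as in the R na.strings):

    import csv

    _USER_SESSIONS = None   # module-level cache, like the R global USER_SESSIONS

    def load_user_sessions(filename="../data/user_sessions.3.tsv", reload=False):
        # Read the file once; later calls return the cached rows
        global _USER_SESSIONS
        if _USER_SESSIONS is None or reload:
            with open(filename) as f:
                rows = list(csv.DictReader(f, delimiter="\t"))
            for row in rows:
                for k, v in row.items():
                    if v == "\\N":
                        row[k] = None   # \N marks NULL, like na.strings in R
            _USER_SESSIONS = rows
        return _USER_SESSIONS
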
Index: trunk/tools/wsor/first_session/R/edit_distributions.R |
— | — | @@ -0,0 +1,64 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | +user_sessions = load_user_sessions() |
| 8 | +user_sessions$year = as.numeric(format(user_sessions$first_edit, "%Y")) #first_edit is POSIXlt after loading, so numeric division is not defined for it
| 9 | + |
| 10 | + |
| 11 | +year_edits = data.frame() |
| 12 | +for(year in unique(user_sessions$year)){ |
| 13 | + tab = data.frame( |
| 14 | + table( |
| 15 | + 10^round( |
| 16 | + log( |
| 17 | + user_sessions[user_sessions$year == year,]$edit_count, |
| 18 | + base=10 |
| 19 | + ) |
| 20 | + ) |
| 21 | + ) |
| 22 | + ) |
| 23 | + |
| 24 | + year_edits = rbind( |
| 25 | + year_edits, |
| 26 | + data.frame( |
| 27 | + year = year, |
| 28 | + edits = as.numeric(as.character(tab$Var1)), |
| 29 | + freq = tab$Freq, |
| 30 | + prop = tab$Freq/sum(tab$Freq) |
| 31 | + ) |
| 32 | + ) |
| 33 | +} |
| 34 | + |
| 35 | +png("plots/edit_count_distribution.png", height=768, width=1024) |
| 36 | +xyplot( |
| 37 | + freq ~ edits | as.factor(year), |
| 38 | + data = year_edits[year_edits$edits > 0,], |
| 39 | + type="o", |
| 40 | + scales=list( |
| 41 | + x=list(log=10, at=10^(0:6), labels=10^(0:6))#, |
| 42 | + #y=list(log=10) |
| 43 | + ), |
| 44 | + main="Editor edit count distributions by editor first edit year", |
| 45 | + xlab="Number of edits (log10 bucketed)", |
| 46 | + ylab="Number of editors", |
| 47 | + sub="based on a random sample of <= 10,000 editors from each year" |
| 48 | +) |
| 49 | +dev.off() |
| 50 | + |
| 51 | +png("plots/edit_count_distribution.prop.png", height=768, width=1024) |
| 52 | +xyplot( |
| 53 | + prop ~ edits | as.factor(year), |
| 54 | + data = year_edits[year_edits$edits > 0,], |
| 55 | + type="o", |
| 56 | + scales=list( |
| 57 | + x=list(log=10, at=10^(0:6), labels=10^(0:6))#, |
| 58 | + #y=list(log=10) |
| 59 | + ), |
| 60 | + main="Editor edit count distributions by editor first edit year", |
| 61 | + xlab="Number of edits (log10 bucketed)", |
| 62 | + ylab="Proportion of editors", |
| 63 | + sub="based on a random sample of <= 10,000 editors from each year" |
| 64 | +) |
| 65 | +dev.off() |
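
Both plots bucket edit counts to the nearest power of ten with 10^round(log(x, base=10)). The same bucketing in a small Python illustration:

    import math
    from collections import Counter

    def log10_bucket(count):
        # Snap a positive count to the nearest power of ten: 7 -> 10, 42 -> 100
        return 10 ** int(round(math.log10(count)))

    edit_counts = [1, 3, 7, 42, 430, 8200]
    print(Counter(log10_bucket(c) for c in edit_counts))
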
Index: trunk/tools/wsor/first_session/R/.RData |
Cannot display: file marked as a binary type. |
svn:mime-type = application/octet-stream |
Property changes on: trunk/tools/wsor/first_session/R/.RData |
___________________________________________________________________ |
Added: svn:mime-type |
1 | 66 | + application/octet-stream |
Index: trunk/tools/wsor/first_session/R/Rplots.pdf |
Cannot display: PDF content omitted ("R Graphics Output" written by R 2.13.1 on 2011-07-19; the PDF's page list is empty, so no plots are recoverable).
Index: trunk/tools/wsor/first_session/R/first_sessions.R |
— | — | @@ -0,0 +1,8 @@ |
| 2 | +source("loader/user_sessions.R") |
| 3 | + |
| 4 | +library(lattice) |
| 5 | +library(doBy) |
| 6 | + |
| 7 | + |
| 8 | + |
| 9 | + |
Index: trunk/tools/wsor/first_session/R/util/env.R |
— | — | @@ -0,0 +1,15 @@ |
| 2 | +DATA_DIR = "../data" |
| 3 | + |
| 4 | + |
| 5 | +naReplace = function(x, replacement){ |
| 6 | + sapply( |
| 7 | + x, |
| 8 | + function(v){ |
| 9 | + if(is.na(v)){ |
| 10 | + replacement |
| 11 | + }else{ |
| 12 | + v |
| 13 | + } |
| 14 | + } |
| 15 | + ) |
| 16 | +} |
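
naReplace substitutes a default for missing values element-wise. The same idea in Python, treating None as R's NA:

    def na_replace(values, replacement):
        # Element-wise NA replacement, like naReplace() above
        return [replacement if v is None else v for v in values]

    print(na_replace([1, None, 3], 0))   # [1, 0, 3]
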
Index: trunk/tools/wsor/first_session/foo |
— | — | @@ -0,0 +1,24 @@ |
| 2 | +user_id user_name first_edit last_edit edit_count es_0_start es_0_end es_0_edits es_0_reverted es_0_vandalism es_0_deleted es_1_start es_1_end es_1_edits es_1_reverted es_1_vandalism es_1_deleted es_2_start es_2_end es_2_edits es_2_reverted es_2_vandalism es_2_deleted |
| 3 | +1 Damian Yerrick 20010929004320 20110715131605 13196 1001724200 1001724224 2 0 0 0 1001735270 1001735270 1 0 0 0 1001778000 1001781732 4 1 0 1 |
| 4 | +2 AxelBoldt 20010726145009 20110715175901 34804 996159009 996159009 1 0 0 0 996164049 996164049 1 0 0 0 996175238 996177464 3 0 0 0 |
| 5 | +3 Tobias Hoevekamp 20010326202105 20040329205621 1903 985638065 985638065 1 1 0 0 985683223 985683379 2 0 0 0 985944995 985944995 1 0 0 0 |
| 6 | +4 Magnus Manske 20010728082538 20110714220907 20038 996308738 996308738 1 0 0 0 996429215 996429215 2 2 0 0 996439880 996439880 1 0 0 0 |
| 7 | +5 Hoevekam 20030709192137 20041227165610 3 1057778497 1057778497 1 0 0 0 1095446631 1095446631 1 0 0 0 1104166570 1104166570 1 0 0 0 |
| 8 | +6 Paul Drye 20010919131128 20080605202716 1135 1000905088 1000906103 5 3 0 0 1001350404 1001350920 2 0 0 0 1001359775 1001359775 1 0 0 0 |
| 9 | +7 Joao 20010826124114 20040606005523 266 998829674 998832110 6 0 0 0 998850967 998852514 3 0 0 0 998856582 998856582 1 0 0 0 |
| 10 | +8 TwoOneTwo 20010909202356 20110205015022 2135 1000067036 1000067036 1 0 0 0 1000157045 1000157045 1 0 0 0 1000326898 1000326898 1 0 0 0 |
| 11 | +9 Chenyu 20011118233022 20020124230110 166 1006126222 1006130510 5 0 0 0 1006138034 1006140164 7 3 0 2 1006147796 1006147828 3 0 0 1 |
| 12 | +10 Tbc 20010803144007 20020105091549 125 996849607 996849607 1 0 0 0 996875902 996875902 1 0 0 0 996967861 996967861 1 1 0 0 |
| 13 | +11 Kpjas 20010506173149 20110714103545 6302 989170309 989170309 1 0 0 0 990957006 990957006 1 0 0 0 991557466 991557466 1 0 0 0 |
| 14 | +12 Matthew Woodcraft 20011202215229 20110510203406 725 1007329949 1007336697 7 0 0 1 1007344959 1007344959 1 0 0 0 1007852954 1007852954 1 0 0 0 |
| 15 | +13 SteveSmith 20020124050249 20030520014515 82 1011848569 1011848569 1 0 0 0 1011879408 1011880799 5 0 0 0 1011892200 1011900596 10 1 0 0 |
| 16 | +14 RjLesch 20010727142501 20020708073228 872 996243901 996245695 4 1 0 0 996522629 996522629 1 0 0 0 996698056 996698374 2 0 0 0 |
| 17 | +15 Trelvis 20011213170430 20040906184834 673 1008263070 1008263442 2 1 0 0 1008279572 1008283623 6 3 0 0 1008287606 1008287606 1 0 0 0 |
| 18 | +16 General Wesc 20010805053252 20110403162228 1505 996989572 996989815 2 0 0 0 996994099 996998763 4 0 0 0 997040223 997044154 11 0 0 1 |
| 19 | +17 Peter Winnberg 20011110110118 20061124144257 464 1005390078 1005390078 1 0 0 0 1005586430 1005586639 2 1 0 0 1005596572 1005596572 1 0 0 0 |
| 20 | +18 MichaelTinkler 20010731150633 20020903033518 2468 996591993 996594597 2 0 0 0 996610359 996610359 1 0 0 0 996767279 996767279 1 0 0 0 |
| 21 | +19 Ignaciovicario 20020126191706 20020225154311 2 1012072626 1012072626 1 0 0 0 1014651791 1014651791 1 0 0 0 \N \N \N \N \N \N |
| 22 | +20 Pingos 20020113023235 20040311053834 17 1010889155 1010889303 4 2 0 1 1011038119 1011038119 1 0 0 0 1011044975 1011045028 3 0 0 3 |
| 23 | +21 Firepink 20020118155248 20020225155115 63 1011369168 1011371693 3 0 0 0 1011487743 1011487946 3 0 0 0 1011545356 1011546803 2 0 0 0 |
| 24 | +22 Luis Oliveira 20020124235009 20050130223105 27 1011916209 1011918167 4 0 0 0 1011965218 1011965218 1 0 0 0 1011969929 1011971575 3 0 0 0 |
| 25 | +23 Goran 20020225154311 20030221002037 11 1014651791 1014652275 4 2 0 0 1039118155 1039118752 4 0 0 0 1039135447 1039135447 1 0 0 0 |
Index: trunk/tools/wsor/first_session/data |
— | — | @@ -0,0 +1 @@ |
| 2 | +link /home/halfak/data/first_session |
\ No newline at end of file |
Property changes on: trunk/tools/wsor/first_session/data |
___________________________________________________________________ |
Added: svn:special |
1 | 3 | + * |
Index: trunk/tools/wsor/first_session/testing.sql |
— | — | @@ -0,0 +1,197 @@ |
| 2 | + |
| 3 | +CREATE TABLE halfak.user_session_sample |
| 4 | +SELECT |
| 5 | + user_id, |
| 6 | + YEAR(first_edit) AS year, |
| 7 | + MONTH(first_edit) >= 7 AS semester |
| 8 | +FROM halfak.user_meta_20110715 |
| 9 | +WHERE first_edit BETWEEN "20010000000000" AND "20019999999999" |
| 10 | +ORDER BY RAND() |
| 11 | +LIMIT 10000; |
| 12 | + |
| 13 | +INSERT INTO halfak.user_session_sample |
| 14 | +SELECT |
| 15 | + user_id, |
| 16 | + YEAR(first_edit) AS year, |
| 17 | + MONTH(first_edit) >= 7 AS semester |
| 18 | +FROM halfak.user_meta_20110715 |
| 19 | +WHERE first_edit BETWEEN "20020000000000" AND "20029999999999" |
| 20 | +ORDER BY RAND() |
| 21 | +LIMIT 10000; |
| 22 | + |
| 23 | +INSERT INTO halfak.user_session_sample |
| 24 | +SELECT |
| 25 | + user_id, |
| 26 | + YEAR(first_edit) AS year, |
| 27 | + MONTH(first_edit) >= 7 AS semester |
| 28 | +FROM halfak.user_meta_20110715 |
| 29 | +WHERE first_edit BETWEEN "20030000000000" AND "20039999999999" |
| 30 | +ORDER BY RAND() |
| 31 | +LIMIT 10000; |
| 32 | + |
| 33 | +INSERT INTO halfak.user_session_sample |
| 34 | +SELECT |
| 35 | + user_id, |
| 36 | + YEAR(first_edit) AS year, |
| 37 | + MONTH(first_edit) >= 7 AS semester |
| 38 | +FROM halfak.user_meta_20110715 |
| 39 | +WHERE first_edit BETWEEN "20040000000000" AND "20049999999999" |
| 40 | +ORDER BY RAND() |
| 41 | +LIMIT 10000; |
| 42 | + |
| 43 | +INSERT INTO halfak.user_session_sample |
| 44 | +SELECT |
| 45 | + user_id, |
| 46 | + YEAR(first_edit) AS year, |
| 47 | + MONTH(first_edit) >= 7 AS semester |
| 48 | +FROM halfak.user_meta_20110715 |
| 49 | +WHERE first_edit BETWEEN "20050000000000" AND "20059999999999" |
| 50 | +ORDER BY RAND() |
| 51 | +LIMIT 10000; |
| 52 | + |
| 53 | +INSERT INTO halfak.user_session_sample |
| 54 | +SELECT |
| 55 | + user_id, |
| 56 | + YEAR(first_edit) AS year, |
| 57 | + MONTH(first_edit) >= 7 AS semester |
| 58 | +FROM halfak.user_meta_20110715 |
| 59 | +WHERE first_edit BETWEEN "20060000000000" AND "20069999999999" |
| 60 | +ORDER BY RAND() |
| 61 | +LIMIT 10000; |
| 62 | + |
| 63 | +INSERT INTO halfak.user_session_sample |
| 64 | +SELECT |
| 65 | + user_id, |
| 66 | + YEAR(first_edit) AS year, |
| 67 | + MONTH(first_edit) >= 7 AS semester |
| 68 | +FROM halfak.user_meta_20110715 |
| 69 | +WHERE first_edit BETWEEN "20070000000000" AND "20079999999999" |
| 70 | +ORDER BY RAND() |
| 71 | +LIMIT 10000; |
| 72 | + |
| 73 | +INSERT INTO halfak.user_session_sample |
| 74 | +SELECT |
| 75 | + user_id, |
| 76 | + YEAR(first_edit) AS year, |
| 77 | + MONTH(first_edit) >= 7 AS semester |
| 78 | +FROM halfak.user_meta_20110715 |
| 79 | +WHERE first_edit BETWEEN "20080000000000" AND "20089999999999" |
| 80 | +ORDER BY RAND() |
| 81 | +LIMIT 10000; |
| 82 | + |
| 83 | +INSERT INTO halfak.user_session_sample |
| 84 | +SELECT |
| 85 | + user_id, |
| 86 | + YEAR(first_edit) AS year, |
| 87 | + MONTH(first_edit) >= 7 AS semester |
| 88 | +FROM halfak.user_meta_20110715 |
| 89 | +WHERE first_edit BETWEEN "20090000000000" AND "20099999999999" |
| 90 | +ORDER BY RAND() |
| 91 | +LIMIT 10000; |
| 92 | + |
| 93 | +INSERT INTO halfak.user_session_sample |
| 94 | +SELECT |
| 95 | + user_id, |
| 96 | + YEAR(first_edit) AS year, |
| 97 | + MONTH(first_edit) >= 7 AS semester |
| 98 | +FROM halfak.user_meta_20110715 |
| 99 | +WHERE first_edit BETWEEN "20100000000000" AND "20109999999999" |
| 100 | +ORDER BY RAND() |
| 101 | +LIMIT 10000; |
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | +USE enwiki; |
| 106 | +CREATE TABLE zexley.user_meta_firsts |
| 107 | +SELECT |
| 108 | + user_id, |
| 109 | + first_edit, |
| 110 | + last_edit, |
| 111 | + sum(rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 3 MONTH)) as `1q`,
| 112 | + sum(rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL 3 MONTH) AND DATE_ADD(u.first_edit, INTERVAL 6 MONTH)) as `2q`,
| 113 | + sum(rev_timestamp BETWEEN DATE_ADD(u.first_edit, INTERVAL 6 MONTH) AND DATE_ADD(u.first_edit, INTERVAL 9 MONTH)) as `3q`,
| 114 | + sum(rev_timestamp > DATE_ADD(u.first_edit, INTERVAL 9 MONTH)) as `4q`
| 115 | +FROM ( |
| 116 | +SELECT |
| 117 | + u.user_id AS user_id, |
| 118 | + u.first_edit AS first_edit, |
| 119 | + u.last_edit AS last_edit, |
| 120 | + r.rev_timestamp |
| 121 | +FROM halfak.user_meta_20110715 u |
| 122 | +LEFT JOIN revision r |
| 123 | + ON u.user_id = r.rev_user AND |
| 124 | + r.rev_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR) |
| 125 | +UNION |
| 126 | +SELECT |
| 127 | + u.user_id AS user_id, |
| 128 | + u.first_edit AS first_edit, |
| 129 | + u.last_edit AS last_edit, |
| 130 | + ar_timestamp AS rev_timestamp |
| 131 | +FROM halfak.user_meta_20110715 u |
| 132 | +LEFT JOIN archive a |
| 133 | + ON u.user_id = ar_user AND |
| 134 | + ar_timestamp BETWEEN u.first_edit AND DATE_ADD(u.first_edit, INTERVAL 1 YEAR) |
| 135 | +) AS r |
| 136 | +GROUP BY user_id;
| 137 | + |
| 138 | + |
| 139 | +CREATE TABLE halfak.rev_len_changed |
| 140 | +SELECT |
| 141 | + r.rev_id, |
| 142 | + r.rev_timestamp, |
| 143 | + YEAR(r.rev_timestamp) AS rev_year, |
| 144 | + MONTH(r.rev_timestamp) AS rev_month, |
| 145 | + r.rev_len, |
| 146 | + r.rev_user AS user_id, |
| 147 | + r.rev_user_text AS user_text, |
| 148 | + `change` AS len_change,
| 149 | + p.page_id AS page_id, |
| 150 | + p.page_namespace AS namespace |
| 151 | +FROM revision r |
| 152 | +INNER JOIN user u |
| 153 | + ON r.rev_user = u.user_id |
| 154 | +INNER JOIN halfak.user_meta_20110715 um |
| 155 | + ON um.user_id = r.rev_user |
| 156 | +INNER JOIN halfak.rev_len_change rlc |
| 157 | + ON r.rev_id = rlc.rev_id |
| 158 | +INNER JOIN page p |
| 159 | + ON p.page_id = r.rev_page; |
| 160 | + |
| 161 | +ALTER TABLE halfak.rev_len_changed |
| 162 | +ADD COLUMN rev_year INT UNSIGNED |
| 163 | +AFTER rev_timestamp; |
| 164 | + |
| 165 | +ALTER TABLE halfak.rev_len_changed |
| 166 | +ADD COLUMN rev_month INT UNSIGNED |
| 167 | +AFTER rev_timestamp; |
| 168 | + |
| 169 | +UPDATE halfak.rev_len_changed |
| 170 | +SET |
| 171 | + rev_year = YEAR(rev_timestamp), |
| 172 | + rev_month = MONTH(rev_timestamp); |
| 173 | + |
| 174 | + |
| 175 | +CREATE UNIQUE INDEX rev_idx ON halfak.rev_len_changed_final (rev_id); |
| 176 | +CREATE INDEX user_year_month_namespace ON halfak.rev_len_changed_final (user_id, rev_year, rev_month, namespace); |
| 177 | + |
| 178 | + |
| 179 | + |
| 180 | + |
| 181 | + |
| 182 | +SELECT |
| 183 | + user_id, |
| 184 | + rev_year, |
| 185 | + rev_month, |
| 186 | + namespace, |
| 187 | + first_edit, |
| 188 | + COUNT(*) as edits, |
| 189 | + SUM(IF(len_change > 0,len_change,0)) as len_added, |
| 190 | + SUM(IF(len_change < 0,len_change*-1,0)) as len_removed |
| 191 | +FROM halfak.rev_len_changed
| 192 | +WHERE user_id = 2356767
| 193 | +GROUP BY
| 194 | + user_id,
| 195 | + rev_year,
| 196 | + rev_month,
| 197 | + namespace,
| 198 | + first_edit;
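
The ten sampling statements at the top of this file differ only in the year range. A sketch of generating them with a loop instead of by hand, reusing the table and column names above:

    YEARS = range(2001, 2011)
    TEMPLATE = (
        '%(verb)s halfak.user_session_sample\n'
        'SELECT\n'
        '    user_id,\n'
        '    YEAR(first_edit) AS year,\n'
        '    MONTH(first_edit) >= 7 AS semester\n'
        'FROM halfak.user_meta_20110715\n'
        'WHERE first_edit BETWEEN "%(year)d0000000000" AND "%(year)d9999999999"\n'
        'ORDER BY RAND()\n'
        'LIMIT 10000;\n'
    )

    for year in YEARS:
        # The first statement creates the table; the rest append to it
        verb = 'CREATE TABLE' if year == YEARS[0] else 'INSERT INTO'
        print(TEMPLATE % {'verb': verb, 'year': year})
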
Index: trunk/tools/wsor/vandal_conversion/R/conversions.R |
— | — | @@ -1,4 +1,120 @@ |
2 | 2 | source("loader/load_editor_first_and_last.R") |
| 3 | +source("loader/load_editor_edit_count.R") |
3 | 4 | |
4 | 5 | editor_first_and_last = load_editor_first_and_last() |
| 6 | +efl = unique(editor_first_and_last) |
| 7 | +efl = efl[efl$last10_edits == 10,] |
5 | 8 | |
| 9 | +editor_edit_count = load_editor_edit_count() |
| 10 | +efl = merge( |
| 11 | + efl, |
| 12 | + editor_edit_count, |
| 13 | + by=c("user_id", "user_name") |
| 14 | +) |
| 15 | + |
| 16 | +library(lattice) |
| 17 | + |
| 18 | +#plot(table(efl$fes_edits)) |
| 19 | +#xyplot(table(efl$fes_edits)~as.numeric(names(table(efl$fes_edits))), scales=list(x=list(log=2), y=list(log=2))) |
| 20 | + |
| 21 | +png("plots/fes_discarded.hist.png", height=768, width=1024) |
| 22 | +efl$fes_discarded = efl$fes_reverted + efl$fes_deleted |
| 23 | +efl$fes_discarded_prop = efl$fes_discarded / efl$fes_edits |
| 24 | +plot( |
| 25 | + table(round(efl[efl$fes_edits >= 4,]$fes_discarded_prop, 1)), |
| 26 | + main="Histogram of the proportion of first session edits that were discarded", |
| 27 | + sub="for editors with at least 20 edits and 4 in first session. Discarded edits have been reverted or deleted", |
| 28 | + ylab="Frequency", |
| 29 | + xlab="Proportion of discarded edits" |
| 30 | +) |
| 31 | +dev.off() |
| 32 | + |
| 33 | +png("plots/fes_vandalism.hist.png", height=768, width=1024) |
| 34 | +efl$fes_vandalism_prop = efl$fes_vandalism / (efl$fes_edits - efl$fes_deleted) |
| 35 | +plot( |
| 36 | + table(round(efl[(efl$fes_edits - efl$fes_deleted) >= 1,]$fes_vandalism_prop, 1)), |
| 37 | + main="Histogram of the proportion of kept 1st session edits that were vandalism", |
| 38 | + sub="for editors with at least 20 edits and 1 kept edit in first session.",
| 39 | + ylab="Frequency", |
| 40 | + xlab="Proportion of vandalism edits" |
| 41 | +) |
| 42 | +dev.off() |
| 43 | + |
| 44 | +png("plots/fes_reverted.hist.png", height=768, width=1024) |
| 45 | +efl$fes_reverted_prop = efl$fes_reverted / (efl$fes_edits - efl$fes_deleted) |
| 46 | +plot( |
| 47 | + table(round(efl[(efl$fes_edits - efl$fes_deleted) >= 1,]$fes_reverted_prop, 1)), |
| 48 | + main="Histogram of the proportion of kept 1st session edits that were reverted", |
| 49 | + sub="for editors with at least 20 edits and 1 kept edit in first session.",
| 50 | + ylab="Frequency", |
| 51 | + xlab="Proportion of reverted edits" |
| 52 | +) |
| 53 | +dev.off() |
| 54 | + |
| 55 | + |
| 56 | +png("plots/last10_discarded.hist.png", height=768, width=1024) |
| 57 | +efl$last10_discarded = efl$last10_reverted + efl$last10_deleted |
| 58 | +efl$last10_discarded_prop = efl$last10_discarded / efl$last10_edits |
| 59 | +plot( |
| 60 | + table(round(efl$last10_discarded_prop, 1)), |
| 61 | + main="Histogram of the proportion of the last 10 edits that were discarded", |
| 62 | + sub="for editors with at least 20 edits. Discarded edits have been reverted or deleted", |
| 63 | + ylab="Frequency", |
| 64 | + xlab="Proportion of discarded edits" |
| 65 | +) |
| 66 | +dev.off() |
| 67 | + |
| 68 | + |
| 69 | +png("plots/future_edits.hist.png", height=768, width=1024) |
| 70 | +efl$future_edits = efl$edit_count - efl$fes_edits |
| 71 | +plot( |
| 72 | + table(10^round(log(efl$future_edits, base=10), 1)), |
| 73 | + main="Histogram of edits after first session for editors who made at least 20 edits",
| 74 | + xlab="Edits after first session (log10 bucketed, scaled)", |
| 75 | + ylab="Frequency", |
| 76 | + type="o", |
| 77 | + log="x" |
| 78 | +) |
| 79 | +dev.off() |
| 80 | + |
| 81 | + |
| 82 | +top_100 = efl[order(efl$edit_count, decreasing=T),][1:100,] |
| 83 | +png("plots/fes_discarded.hist.top_100.png", height=768, width=1024) |
| 84 | +top_100$fes_discarded = top_100$fes_reverted + top_100$fes_deleted |
| 85 | +top_100$fes_discarded_prop = top_100$fes_discarded / top_100$fes_edits |
| 86 | +plot( |
| 87 | + table(round(top_100$fes_discarded_prop, 1)), |
| 88 | + main="Histogram of the proportion of first session edits that were discarded",
| 89 | + sub="for the top 100 editors by edit count. Discarded edits have been reverted or deleted", |
| 90 | + ylab="Frequency", |
| 91 | + xlab="Proportion of discarded edits" |
| 92 | +) |
| 93 | +dev.off() |
| 94 | + |
| 95 | +png("plots/fes_reverted.hist.top_100.png", height=768, width=1024) |
| 96 | +top_100$fes_reverted_prop = top_100$fes_reverted / (top_100$fes_edits - top_100$fes_deleted) |
| 97 | +plot( |
| 98 | + table(round(top_100[top_100$fes_edits - top_100$fes_deleted >= 1,]$fes_reverted_prop, 1)), |
| 99 | + main="Histogram of the proportion of first session edits that were reverted",
| 100 | + sub="for the top 100 editors by edit count.", |
| 101 | + ylab="Frequency", |
| 102 | + xlab="Proportion of reverted edits" |
| 103 | +) |
| 104 | +dev.off() |
| 105 | + |
| 106 | +png("plots/fes_vandal.hist.top_100.png", height=768, width=1024) |
| 107 | +top_100$fes_vandalism_prop = top_100$fes_vandalism / (top_100$fes_edits - top_100$fes_deleted) |
| 108 | +plot( |
| 109 | + table(round(top_100[top_100$fes_edits - top_100$fes_deleted >= 1,]$fes_vandalism_prop, 1)), |
| 110 | + main="Histogram of the proportion of first session edits that were reverted for vandalism",
| 111 | + sub="for the top 100 editors by edit count.", |
| 112 | + ylab="Frequency", |
| 113 | + xlab="Proportion of edits reverted for vandalism" |
| 114 | +) |
| 115 | +dev.off() |
| 116 | + |
| 117 | + |
| 118 | +summary(top_100$fes_vandalism > 0) |
| 119 | +summary(top_100$fes_reverted > 0) |
| 120 | +summary(top_100$fes_discarded > 0) |
| 121 | + |
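
The discarded proportion used throughout this script is (reverted + deleted) / edits for a session. The same computation in Python, with a guard for sessions that have nothing to judge (the R code instead filters rows before plotting):

    def discarded_prop(reverted, deleted, edits):
        # Share of a session's edits later reverted or deleted;
        # None when the session has no edits at all
        if edits == 0:
            return None
        return (reverted + deleted) / float(edits)

    print(discarded_prop(3, 1, 10))   # 0.4
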
Index: trunk/tools/wsor/vandal_conversion/R/util/env.R |
— | — | @@ -1 +1 @@ |
2 | | -DATA_DIR = "/home/aaron/data/vandal_conversion" |
| 2 | +DATA_DIR = "../data" |
Index: trunk/tools/wsor/vandal_conversion/get_editor_editcount.py |
— | — | @@ -0,0 +1,109 @@ |
| 2 | +import sys, MySQLdb, MySQLdb.cursors, argparse, os, logging, types |
| 3 | +import wmf |
| 4 | + |
| 5 | +def encode(v): |
| 6 | + if v == None: return "\N" |
| 7 | + |
| 8 | + if type(v) == types.LongType: v = int(v) |
| 9 | + elif type(v) == types.UnicodeType: v = v.encode('utf-8') |
| 10 | + |
| 11 | + return str(v).encode("string-escape") |
| 12 | + |
| 13 | + |
| 14 | +def main(): |
| 15 | + parser = argparse.ArgumentParser( |
| 16 | + description='Gathers editor edit counts'
| 17 | + ) |
| 18 | + parser.add_argument( |
| 19 | + 'min_edits', |
| 20 | + type=int, |
| 21 | + help='the minimum number of edits that editors must have performed to be included'
| 22 | + ) |
| 23 | + parser.add_argument( |
| 24 | + '-c', '--cnf', |
| 25 | + metavar="<path>", |
| 26 | + type=str, |
| 27 | + help='the path to MySQL config info (defaults to ~/.my.cnf)', |
| 28 | + default=os.path.expanduser("~/.my.cnf") |
| 29 | + ) |
| 30 | + parser.add_argument( |
| 31 | + '-s', '--host', |
| 32 | + type=str, |
| 33 | + help='the database host to connect to (defaults to localhost)', |
| 34 | + default="localhost" |
| 35 | + ) |
| 36 | + parser.add_argument( |
| 37 | + '-d', '--db', |
| 38 | + type=str, |
| 39 | + help='the language db to run the query in (defaults to enwiki)', |
| 40 | + default="enwiki" |
| 41 | + ) |
| 42 | + args = parser.parse_args() |
| 43 | + |
| 44 | + LOGGING_STREAM = sys.stderr |
| 45 | + logging.basicConfig( |
| 46 | + level=logging.DEBUG, |
| 47 | + stream=LOGGING_STREAM, |
| 48 | + format='%(asctime)s %(levelname)-8s %(message)s', |
| 49 | + datefmt='%b-%d %H:%M:%S' |
| 50 | + ) |
| 51 | + |
| 52 | + logging.info("Connecting to %s:%s using %s." % (args.host, args.db, args.cnf)) |
| 53 | + db = Database( |
| 54 | + host=args.host, |
| 55 | + db=args.db, |
| 56 | + read_default_file=args.cnf |
| 57 | + ) |
| 58 | + headers = [ |
| 59 | + 'user_id', |
| 60 | + 'user_name', |
| 61 | + 'edit_count' |
| 62 | + ] |
| 63 | + print("\t".join(headers)) |
| 64 | + |
| 65 | + logging.info("Processing users:") |
| 66 | + |
| 67 | + for user in db.getUsers(minimumEdits=args.min_edits): |
| 68 | + print("\t".join(encode(user[h]) for h in headers)) |
| 69 | + LOGGING_STREAM.write(".") |
| 70 | + |
| 71 | + LOGGING_STREAM.write("\n") |
| 72 | + |
| 73 | + |
| 74 | +class Database: |
| 75 | + |
| 76 | + def __init__(self, *args, **kwargs): |
| 77 | + self.args = args |
| 78 | + self.kwargs = kwargs |
| 79 | + self.usersConn = MySQLdb.connect(*args, **kwargs) |
| 80 | + self.revsConn = MySQLdb.connect(*args, **kwargs) |
| 81 | + self.archConn = MySQLdb.connect(*args, **kwargs) |
| 82 | + |
| 83 | + def getUsers(self, minimumEdits=0): |
| 84 | + minimumEdits = int(minimumEdits) |
| 85 | + cursor = self.usersConn.cursor(MySQLdb.cursors.SSDictCursor) |
| 86 | + cursor.execute( |
| 87 | + """ |
| 88 | + SELECT |
| 89 | + u.user_id, |
| 90 | + u.user_name, |
| 91 | + u.user_editcount as edit_count |
| 92 | + FROM user u |
| 93 | + WHERE u.user_editcount >= %(minimum_edits)s |
| 94 | + """, |
| 95 | + { |
| 96 | + 'minimum_edits': minimumEdits |
| 97 | + } |
| 98 | + ) |
| 99 | + for row in cursor: |
| 100 | + yield row |
| 101 | + |
| 102 | + |
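| | + #NOTE: the two helpers below call a getEdits() method that is not
| | + #defined in this file; nothing in this script invokes them.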
| 103 | + def getFirstEdits(self, userId, maximum=10000): |
| 104 | + return self.getEdits(userId, maximum, chronologically=True) |
| 105 | + |
| 106 | + def getLastEdits(self, userId, maximum=10000): |
| 107 | + return self.getEdits(userId, maximum, chronologically=False) |
| 108 | + |
| 109 | + |
| 110 | +if __name__ == "__main__": main() |
Index: trunk/tools/wsor/diffs/example.py |
— | — | @@ -0,0 +1,101 @@ |
| 2 | +from StringIO import StringIO |
| 3 | +from diff_match_patch import diff_match_patch |
| 4 | +import re |
| 5 | + |
| 6 | +revs = [ |
| 7 | + {'rev_id': 1, 'content':'Foo derp 263254'}, |
| 8 | + {'rev_id': 2, 'content':'Foo derp 26354'} |
| 9 | +] |
| 10 | + |
| 11 | +def tokenize(content): |
| 12 | + return re.findall( |
| 13 | + r"[\w]+" + #Word |
| 14 | + r"|\[\[" + #Opening internal link |
| 15 | + r"|\]\]" + #Closing internal link |
| 16 | + r"|\{\{" + #Opening template |
| 17 | + r"|\}\}" + #Closing template |
| 18 | + r"|\{\{\{" + #Opening template var |
| 19 | + r"|\}\}\}" + #Closing template var |
| 20 | + r"|\n+" + #Line breaks |
| 21 | + r"| +" + #Spaces |
| 22 | + r"|&\w+;" + #HTML escape sequence |
| 23 | + r"|'''" + #Bold |
| 24 | + r"|''" + #Italics |
| 25 | + r"|=+" + #Header |
| 26 | + r"|\{\|" + #Opening table |
| 27 | + r"|\|\}" + #Closing table |
| 28 | + r"|\|\-" + #Table row |
| 29 | + r"|.", #Misc character |
| 30 | + content |
| 31 | + ) |
| 32 | + |
| 33 | +def hashTokens(tokens, hash2Token=None, token2Hash=None):
| | + #None defaults: mutable default arguments would persist between calls
| | + #and leak the token table from one diff into the next
| | + if hash2Token is None: hash2Token = []
| | + if token2Hash is None: token2Hash = {}
| 34 | + hashBuffer = StringIO()
| 35 | + for t in tokens: |
| 36 | + if t in token2Hash: |
| 37 | + hashBuffer.write(unichr(token2Hash[t]+1)) |
| 38 | + else: |
| 39 | + hashId = len(hash2Token) |
| 40 | + hash2Token.append(t) |
| 41 | + token2Hash[t] = hashId |
| 42 | + hashBuffer.write(unichr(hashId+1)) |
| 43 | + |
| 44 | + return (hashBuffer.getvalue(), hash2Token, token2Hash) |
| 45 | + |
| 46 | +def unhash(hashes, hash2Token, sep=''): |
| 47 | + return sep.join(hash2Token[ord(h)-1] for h in hashes) |
| 48 | + |
| 49 | +def simpleDiff(content1, content2, tokenize=tokenize, sep='', report=[-1,0,1]): |
| 50 | + hashes1, h2t, t2h = hashTokens(tokenize(content1)) |
| 51 | + hashes2, h2t, t2h = hashTokens(tokenize(content2), h2t, t2h) |
| 52 | + |
| 53 | + report = set(report) |
| 54 | + |
| 55 | + dmp = diff_match_patch() |
| 56 | + |
| 57 | + diffs = dmp.diff_main(hashes1, hashes2, checklines=False) |
| 58 | + |
| 59 | + position = 0 |
| 60 | + for (ar,hashes) in diffs: |
| 61 | + content = unhash(hashes,h2t,sep=sep) |
| 62 | + if ar in report: |
| 63 | + yield position, ar, content |
| 64 | + |
| 65 | + if ar != -1: position += len(content) |
| 66 | + |
| 67 | + |
| 68 | +def main(): |
| 69 | + |
| 70 | + lastRev = {'content':''} |
| 71 | + content = '' |
| 72 | + for rev in revs: |
| 73 | + buff = StringIO() |
| 74 | + oldPos = 0 |
| 75 | + lastPos = 0 |
| 76 | + for pos, ar, c in simpleDiff(lastRev['content'], rev['content'], report=[-1,1]): |
| 77 | + equal = content[oldPos:oldPos+pos-lastPos] |
| 78 | + buff.write(equal) |
| 79 | + lastPos += len(equal) |
| 80 | + oldPos += len(equal) |
| 81 | + |
| 82 | + if ar == 1: |
| 83 | + buff.write(c) |
| 84 | + lastPos += len(c) |
| 85 | + elif ar == -1: |
| 86 | + oldPos += len(c) |
| 87 | + |
| 88 | + |
| 89 | + print("%s, %s, %r" % (pos, ar, c)) |
| 90 | + |
| 91 | + buff.write(content[oldPos:]) |
| 92 | + |
| 93 | + |
| 94 | + content = buff.getvalue() |
| 95 | + print("Rev: id=%s\n\t%r\n\t%r" % (rev['rev_id'], rev['content'], content)) |
| 96 | + lastRev = rev |
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | + |
| 101 | +if __name__ == "__main__": main() |
| 102 | + |
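
The hashing step is what turns a character-diff library into a token diff: each distinct token is mapped to a single unicode character, so diff_match_patch compares tokens while believing it compares characters. The trick in miniature (Python 2, matching the script):

    vocab = {}
    def h(tokens):
        # One unicode character per distinct token (offset by 1 to avoid NUL)
        return u''.join(unichr(vocab.setdefault(t, len(vocab)) + 1) for t in tokens)

    a = h(['Foo', ' ', 'derp', ' ', '263254'])
    b = h(['Foo', ' ', 'derp', ' ', '26354'])
    print(repr(a), repr(b))   # identical except for the final character
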
Index: trunk/tools/wsor/diffs/revision_differ.py |
— | — | @@ -0,0 +1,150 @@ |
| 2 | +#!/usr/local/bin/pypy |
| 3 | + |
| 4 | +import logging,traceback |
| 5 | +import sys, re |
| 6 | +from StringIO import StringIO |
| 7 | + |
| 8 | +from diff_match_patch import diff_match_patch |
| 9 | + |
| 10 | +from xml_simulator import RecordingFileWrapper |
| 11 | +from wmf.dump.iterator import Iterator |
| 12 | +import wmf |
| 13 | + |
| 14 | +def tokenize(content): |
| 15 | + return re.findall( |
| 16 | + r"[\w]+" + #Word |
| 17 | + r"|\[\[" + #Opening internal link |
| 18 | + r"|\]\]" + #Closing internal link |
| 19 | + r"|\{\{" + #Opening template |
| 20 | + r"|\}\}" + #Closing template |
| 21 | + r"|\{\{\{" + #Opening template var |
| 22 | + r"|\}\}\}" + #Closing template var |
| 23 | + r"|\n+" + #Line breaks |
| 24 | + r"| +" + #Spaces |
| 25 | + r"|&\w+;" + #HTML escape sequence |
| 26 | + r"|'''" + #Bold |
| 27 | + r"|''" + #Italics |
| 28 | + r"|=+" + #Header |
| 29 | + r"|\{\|" + #Opening table |
| 30 | + r"|\|\}" + #Closing table |
| 31 | + r"|\|\-" + #Table row |
| 32 | + r"|.", #Misc character |
| 33 | + content |
| 34 | + ) |
| 35 | + |
| 36 | +def hashTokens(tokens, hash2Token=None, token2Hash=None):
| | + #None defaults: mutable default arguments would persist between calls
| | + #and leak the token table from one diff into the next
| | + if hash2Token is None: hash2Token = []
| | + if token2Hash is None: token2Hash = {}
| 37 | + hashBuffer = StringIO()
| 38 | + for t in tokens: |
| 39 | + if t in token2Hash: |
| 40 | + hashBuffer.write(unichr(token2Hash[t]+1)) |
| 41 | + else: |
| 42 | + hashId = len(hash2Token) |
| 43 | + hash2Token.append(t) |
| 44 | + token2Hash[t] = hashId |
| 45 | + hashBuffer.write(unichr(hashId+1)) |
| 46 | + |
| 47 | + return (hashBuffer.getvalue(), hash2Token, token2Hash) |
| 48 | + |
| 49 | +def unhash(hashes, hash2Token, sep=''): |
| 50 | + return sep.join(hash2Token[ord(h)-1] for h in hashes) |
| 51 | + |
| 52 | +def simpleDiff(content1, content2, tokenize=tokenize, sep='', report=[-1,0,1]): |
| 53 | + hashes1, h2t, t2h = hashTokens(tokenize(content1)) |
| 54 | + hashes2, h2t, t2h = hashTokens(tokenize(content2), h2t, t2h) |
| 55 | + |
| 56 | + report = set(report) |
| 57 | + |
| 58 | + dmp = diff_match_patch() |
| 59 | + |
| 60 | + diffs = dmp.diff_main(hashes1, hashes2, checklines=False) |
| 61 | + |
| 62 | + position = 0 |
| 63 | + for (ar,hashes) in diffs: |
| 64 | + content = unhash(hashes,h2t,sep=sep) |
| 65 | + if ar in report: |
| 66 | + yield position, ar, content |
| 67 | + |
| 68 | + if ar != -1: position += len(content) |
| 69 | + |
| 70 | + |
| 71 | +metaXML = """ |
| 72 | +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en"> |
| 73 | +<siteinfo> |
| 74 | +<sitename>Wikipedia</sitename> |
| 75 | +<base>http://en.wikipedia.org/wiki/Main_Page</base> |
| 76 | +<generator>MediaWiki 1.17wmf1</generator> |
| 77 | +<case>first-letter</case> |
| 78 | +<namespaces> |
| 79 | +<namespace key="-2" case="first-letter">Media</namespace> |
| 80 | +<namespace key="-1" case="first-letter">Special</namespace> |
| 81 | +<namespace key="0" case="first-letter" /> |
| 82 | +<namespace key="1" case="first-letter">Talk</namespace> |
| 83 | +<namespace key="2" case="first-letter">User</namespace> |
| 84 | +<namespace key="3" case="first-letter">User talk</namespace> |
| 85 | +<namespace key="4" case="first-letter">Wikipedia</namespace> |
| 86 | +<namespace key="5" case="first-letter">Wikipedia talk</namespace> |
| 87 | +<namespace key="6" case="first-letter">File</namespace> |
| 88 | +<namespace key="7" case="first-letter">File talk</namespace> |
| 89 | +<namespace key="8" case="first-letter">MediaWiki</namespace> |
| 90 | +<namespace key="9" case="first-letter">MediaWiki talk</namespace> |
| 91 | +<namespace key="10" case="first-letter">Template</namespace> |
| 92 | +<namespace key="11" case="first-letter">Template talk</namespace> |
| 93 | +<namespace key="12" case="first-letter">Help</namespace> |
| 94 | +<namespace key="13" case="first-letter">Help talk</namespace> |
| 95 | +<namespace key="14" case="first-letter">Category</namespace> |
| 96 | +<namespace key="15" case="first-letter">Category talk</namespace> |
| 97 | +<namespace key="100" case="first-letter">Portal</namespace> |
| 98 | +<namespace key="101" case="first-letter">Portal talk</namespace> |
| 99 | +<namespace key="108" case="first-letter">Book</namespace> |
| 100 | +<namespace key="109" case="first-letter">Book talk</namespace> |
| 101 | +</namespaces> |
| 102 | +</siteinfo> |
| 103 | +""" |
| 104 | +xmlSim = RecordingFileWrapper(sys.stdin, pre=metaXML, post='</mediawiki>') |
| 105 | + |
| 106 | +try: |
| 107 | + dump = Iterator(xmlSim) |
| 108 | +except Exception as e: |
| 109 | + sys.stderr.write(str(e) + xmlSim.getHistory()) |
| 110 | + sys.exit(1) |
| 111 | + |
| 112 | + |
| 113 | +for page in dump.readPages(): |
| 114 | + sys.stderr.write('Processing: %s - %s\n' % (page.getId(), page.getTitle().encode('UTF-8'))) |
| 115 | + try: |
| 116 | + lastRev = None |
| 117 | + for revision in page.readRevisions(): |
| 118 | + if lastRev == None: |
| 119 | + lastRev = revision |
| 120 | + else: |
| 121 | + namespace, title = wmf.normalizeTitle(page.getTitle(), namespaces=dump.namespaces) |
| 122 | + nsId = dump.namespaces[namespace] |
| 123 | + row = [ |
| 124 | + repr(revision.getId()), |
| 125 | + repr(page.getId()), |
| 126 | + repr(nsId), |
| 127 | + repr(title), |
| 128 | + repr(revision.getTimestamp()), |
| 129 | + repr(revision.getComment()), |
| 130 | + repr(revision.getMinor()), |
| 131 | + repr(revision.getContributor().getId()), |
| 132 | + repr(revision.getContributor().getUsername()) |
| 133 | + ] |
| 134 | + |
| 135 | + for d in simpleDiff(lastRev.getText(), revision.getText(), report=[-1,1]): |
| 136 | + row.append(":".join(repr(v) for v in d)) |
| 137 | + |
| 138 | + print("\t".join(row))
| | + lastRev = revision #advance so each diff compares consecutive revisions
| 139 | + |
| 140 | + except Exception as e: |
| 141 | + sys.stderr.write('%s' % e) |
| 142 | + #fh.write('%s' % e) |
| 143 | + #logging.error( |
| 144 | + # "Failed to process page %s:%s - %s" % ( |
| 145 | + # page.getId(), |
| 146 | + # page.getTitle(), |
| 147 | + # e |
| 148 | + # )) |
| 149 | + #logging.error(traceback.print_exc()) |
| 150 | +#fh.close() |
| 151 | +#sys.exit(0) |
Property changes on: trunk/tools/wsor/diffs/revision_differ.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 152 | + * |
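
Each output line from the differ is a fixed set of tab-separated metadata cells followed by one repr()'d position:op:content triple per diff operation. A hypothetical reader for that format (the column count and encoding are assumed from the row-building code above):

    import ast

    def parse_row(line):
        cells = line.rstrip('\n').split('\t')
        meta, diff_cells = cells[:9], cells[9:]
        triples = []
        for cell in diff_cells:
            pos, op, content = cell.split(':', 2)   # content may itself contain ':'
            triples.append((int(pos), int(op), ast.literal_eval(content)))
        return meta, triples
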
Index: trunk/tools/wsor/diffs/xml_simulator.py |
— | — | @@ -0,0 +1,80 @@ |
| 2 | +import sys |
| 3 | +from StringIO import StringIO |
| 4 | +from collections import deque |
| 5 | + |
| 6 | +class FileWrapper: |
| 7 | + |
| 8 | + def __init__(self, fp, pre='', post=''): |
| 9 | + self.fp = fp |
| 10 | + self.pre = StringIO(pre) |
| 11 | + self.post = StringIO(post) |
| 12 | + self.closed = False |
| 13 | + self.mode = "r" |
| 14 | + |
| 15 | + def read(self, bytes=sys.maxint): |
| 16 | + bytes = int(bytes) |
| 17 | + if self.closed: raise ValueError("I/O operation on closed file") |
| 18 | + |
| 19 | + preBytes = self.pre.read(bytes) |
| 20 | + if len(preBytes) < bytes: |
| 21 | + fpBytes = self.fp.read(bytes-len(preBytes)) |
| 22 | + else: |
| 23 | + fpBytes = '' |
| 24 | + |
| 25 | + if len(preBytes) + len(fpBytes) < bytes: |
| 26 | + postBytes = self.post.read(bytes-(len(preBytes) + len(fpBytes))) |
| 27 | + else: |
| 28 | + postBytes = '' |
| 29 | + |
| 30 | + return preBytes + fpBytes + postBytes |
| 31 | + |
| 32 | + def readline(self): |
| 33 | + if self.closed: raise ValueError("I/O operation on closed file") |
| 34 | + |
| 35 | + output = self.pre.readline() |
| 36 | + if len(output) == 0 or output[-1] != "\n": |
| 37 | + output += self.fp.readline() |
| 38 | + if len(output) == 0 or output[-1] != "\n": |
| 39 | + output += self.post.readline() |
| 40 | + |
| 41 | + return output |
| 42 | + |
| 43 | + def readlines(self): raise NotImplementedError() |
| 44 | + |
| 45 | + def __iter__(self): |
| 46 | + |
| 47 | + line = self.readline() |
| 48 | + while line != '': |
| 49 | + yield line |
| 50 | + line = self.readline() |
| 51 | + |
| 52 | + |
| 53 | + def seek(self): raise NotImplementedError() |
| 54 | + def write(self): raise NotImplementedError() |
| 55 | + def writelines(self): raise NotImplementedError() |
| 56 | + def tell(self): |
| 57 | + return self.pre.tell() + self.fp.tell() + self.post.tell() |
| 58 | + |
| 59 | + |
| 60 | + def close(self): |
| 61 | + self.closed = True |
| 62 | + self.fp.close() |
| 63 | + |
| 64 | +class RecordingFileWrapper(FileWrapper): |
| 65 | + |
| 66 | + def __init__(self, fp, pre='', post='', record=10000): |
| 67 | + self.history = deque(maxlen=record) |
| 68 | + FileWrapper.__init__(self, fp, pre=pre, post=post) |
| 69 | + |
| 70 | + def read(self, bytes=sys.maxint): |
| 71 | + outBytes = FileWrapper.read(self, bytes) |
| 72 | + self.history.extend(outBytes) |
| 73 | + return outBytes |
| 74 | + |
| 75 | + def readline(self): |
| 76 | + outBytes = FileWrapper.readline(self) |
| 77 | + self.history.extend(outBytes) |
| 78 | + return outBytes |
| 79 | + |
| 80 | + def getHistory(self): |
| 81 | + return ''.join(self.history) |
Property changes on: trunk/tools/wsor/diffs/xml_simulator.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 82 | + * |
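
FileWrapper splices a header and a footer around a stream so that downstream XML parsing sees one well-formed document, which is exactly how revision_differ.py frames stdin with <mediawiki> tags. A small usage sketch, assuming xml_simulator.py is importable:

    from StringIO import StringIO
    from xml_simulator import FileWrapper

    body = StringIO('<page><title>Foo</title></page>\n')
    wrapped = FileWrapper(body, pre='<mediawiki>\n', post='</mediawiki>\n')
    print(wrapped.read())   # header, body and footer read back as one stream
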
Index: trunk/tools/wsor/diffs/diff_match_patch.py |
— | — | @@ -0,0 +1,1949 @@ |
| 2 | +#!/usr/bin/env python |
| 3 | + |
| 4 | +"""Diff Match and Patch |
| 5 | + |
| 6 | +Copyright 2006 Google Inc. |
| 7 | +http://code.google.com/p/google-diff-match-patch/ |
| 8 | + |
| 9 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 10 | +you may not use this file except in compliance with the License. |
| 11 | +You may obtain a copy of the License at |
| 12 | + |
| 13 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 14 | + |
| 15 | +Unless required by applicable law or agreed to in writing, software |
| 16 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 17 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 18 | +See the License for the specific language governing permissions and |
| 19 | +limitations under the License. |
| 20 | +""" |
| 21 | + |
| 22 | +"""Functions for diff, match and patch. |
| 23 | + |
| 24 | +Computes the difference between two texts to create a patch. |
| 25 | +Applies the patch onto another text, allowing for errors. |
| 26 | +""" |
| 27 | + |
| 28 | +__author__ = 'fraser@google.com (Neil Fraser)' |
| 29 | + |
| 30 | +import math |
| 31 | +import time |
| 32 | +import urllib |
| 33 | +import re |
| 34 | +import sys |
| 35 | + |
| 36 | +class diff_match_patch: |
| 37 | + """Class containing the diff, match and patch methods. |
| 38 | + |
| 39 | + Also contains the behaviour settings. |
| 40 | + """ |
| 41 | + |
| 42 | + def __init__(self): |
| 43 | + """Inits a diff_match_patch object with default settings. |
| 44 | + Redefine these in your program to override the defaults. |
| 45 | + """ |
| 46 | + |
| 47 | + # Number of seconds to map a diff before giving up (0 for infinity). |
| 48 | + self.Diff_Timeout = 1.0 |
| 49 | + # Cost of an empty edit operation in terms of edit characters. |
| 50 | + self.Diff_EditCost = 4 |
| 51 | + # At what point is no match declared (0.0 = perfection, 1.0 = very loose). |
| 52 | + self.Match_Threshold = 0.5 |
| 53 | + # How far to search for a match (0 = exact location, 1000+ = broad match). |
| 54 | + # A match this many characters away from the expected location will add |
| 55 | + # 1.0 to the score (0.0 is a perfect match). |
| 56 | + self.Match_Distance = 1000 |
| 57 | + # When deleting a large block of text (over ~64 characters), how close does |
| 58 | + # the contents have to match the expected contents. (0.0 = perfection, |
| 59 | + # 1.0 = very loose). Note that Match_Threshold controls how closely the |
| 60 | + # end points of a delete need to match. |
| 61 | + self.Patch_DeleteThreshold = 0.5 |
| 62 | + # Chunk size for context length. |
| 63 | + self.Patch_Margin = 4 |
| 64 | + |
| 65 | + # The number of bits in an int. |
| 66 | + # Python has no maximum, thus to disable patch splitting set to 0. |
| 67 | + # However to avoid long patches in certain pathological cases, use 32. |
| 68 | + # Multiple short patches (using native ints) are much faster than long ones. |
| 69 | + self.Match_MaxBits = 32 |
| 70 | + |
| 71 | + # DIFF FUNCTIONS |
| 72 | + |
| 73 | + # The data structure representing a diff is an array of tuples: |
| 74 | + # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] |
| 75 | + # which means: delete "Hello", add "Goodbye" and keep " world." |
| 76 | + DIFF_DELETE = -1 |
| 77 | + DIFF_INSERT = 1 |
| 78 | + DIFF_EQUAL = 0 |
| 79 | + |
| 80 | + def diff_main(self, text1, text2, checklines=True, deadline=None): |
| 81 | + """Find the differences between two texts. Simplifies the problem by |
| 82 | + stripping any common prefix or suffix off the texts before diffing. |
| 83 | + |
| 84 | + Args: |
| 85 | + text1: Old string to be diffed. |
| 86 | + text2: New string to be diffed. |
| 87 | + checklines: Optional speedup flag. If present and false, then don't run |
| 88 | + a line-level diff first to identify the changed areas. |
| 89 | + Defaults to true, which does a faster, slightly less optimal diff. |
| 90 | + deadline: Optional time when the diff should be complete by. Used |
| 91 | + internally for recursive calls. Users should set Diff_Timeout instead.
| 92 | + |
| 93 | + Returns: |
| 94 | + Array of changes. |
| 95 | + """ |
| 96 | + # Set a deadline by which time the diff must be complete. |
| 97 | + if deadline == None: |
| 98 | + # Unlike in most languages, Python counts time in seconds. |
| 99 | + if self.Diff_Timeout <= 0: |
| 100 | + deadline = sys.maxint |
| 101 | + else: |
| 102 | + deadline = time.time() + self.Diff_Timeout |
| 103 | + |
| 104 | + # Check for null inputs. |
| 105 | + if text1 == None or text2 == None: |
| 106 | + raise ValueError("Null inputs. (diff_main)") |
| 107 | + |
| 108 | + # Check for equality (speedup). |
| 109 | + if text1 == text2: |
| 110 | + if text1: |
| 111 | + return [(self.DIFF_EQUAL, text1)] |
| 112 | + return [] |
| 113 | + |
| 114 | + # Trim off common prefix (speedup). |
| 115 | + commonlength = self.diff_commonPrefix(text1, text2) |
| 116 | + commonprefix = text1[:commonlength] |
| 117 | + text1 = text1[commonlength:] |
| 118 | + text2 = text2[commonlength:] |
| 119 | + |
| 120 | + # Trim off common suffix (speedup). |
| 121 | + commonlength = self.diff_commonSuffix(text1, text2) |
| 122 | + if commonlength == 0: |
| 123 | + commonsuffix = '' |
| 124 | + else: |
| 125 | + commonsuffix = text1[-commonlength:] |
| 126 | + text1 = text1[:-commonlength] |
| 127 | + text2 = text2[:-commonlength] |
| 128 | + |
| 129 | + # Compute the diff on the middle block. |
| 130 | + diffs = self.diff_compute(text1, text2, checklines, deadline) |
| 131 | + |
| 132 | + # Restore the prefix and suffix. |
| 133 | + if commonprefix: |
| 134 | + diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] |
| 135 | + if commonsuffix: |
| 136 | + diffs.append((self.DIFF_EQUAL, commonsuffix)) |
| 137 | + self.diff_cleanupMerge(diffs) |
| 138 | + return diffs |
| 139 | + |
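
A minimal sketch of the top-level entry point. The import assumes this file is importable as diff_match_patch; the tuple list in the comment is indicative, not guaranteed:

    # Diff two strings into a list of (operation, text) tuples.
    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    dmp.Diff_Timeout = 0  # optional: disable the time limit for a tiny input
    diffs = dmp.diff_main("The quick brown fox", "The slow brown fox")
    # Expect something like:
    #   [(0, 'The '), (-1, 'quick'), (1, 'slow'), (0, ' brown fox')]
    # where -1, 1 and 0 are DIFF_DELETE, DIFF_INSERT and DIFF_EQUAL.
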
| 140 | + def diff_compute(self, text1, text2, checklines, deadline): |
| 141 | + """Find the differences between two texts. Assumes that the texts do not |
| 142 | + have any common prefix or suffix. |
| 143 | + |
| 144 | + Args: |
| 145 | + text1: Old string to be diffed. |
| 146 | + text2: New string to be diffed. |
| 147 | + checklines: Speedup flag. If false, then don't run a line-level diff |
| 148 | + first to identify the changed areas. |
| 149 | + If true, then run a faster, slightly less optimal diff. |
| 150 | + deadline: Time when the diff should be complete by. |
| 151 | + |
| 152 | + Returns: |
| 153 | + Array of changes. |
| 154 | + """ |
| 155 | + if not text1: |
| 156 | + # Just add some text (speedup). |
| 157 | + return [(self.DIFF_INSERT, text2)] |
| 158 | + |
| 159 | + if not text2: |
| 160 | + # Just delete some text (speedup). |
| 161 | + return [(self.DIFF_DELETE, text1)] |
| 162 | + |
| 163 | + if len(text1) > len(text2): |
| 164 | + (longtext, shorttext) = (text1, text2) |
| 165 | + else: |
| 166 | + (shorttext, longtext) = (text1, text2) |
| 167 | + i = longtext.find(shorttext) |
| 168 | + if i != -1: |
| 169 | + # Shorter text is inside the longer text (speedup). |
| 170 | + diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext), |
| 171 | + (self.DIFF_INSERT, longtext[i + len(shorttext):])] |
| 172 | + # Swap insertions for deletions if diff is reversed. |
| 173 | + if len(text1) > len(text2): |
| 174 | + diffs[0] = (self.DIFF_DELETE, diffs[0][1]) |
| 175 | + diffs[2] = (self.DIFF_DELETE, diffs[2][1]) |
| 176 | + return diffs |
| 177 | + |
| 178 | + if len(shorttext) == 1: |
| 179 | + # Single character string. |
| 180 | + # After the previous speedup, the character can't be an equality. |
| 181 | + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] |
| 182 | + longtext = shorttext = None # Garbage collect. |
| 183 | + |
| 184 | + # Check to see if the problem can be split in two. |
| 185 | + hm = self.diff_halfMatch(text1, text2) |
| 186 | + if hm: |
| 187 | + # A half-match was found, sort out the return data. |
| 188 | + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm |
| 189 | + # Send both pairs off for separate processing. |
| 190 | + diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline) |
| 191 | + diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline) |
| 192 | + # Merge the results. |
| 193 | + return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b |
| 194 | + |
| 195 | + if checklines and len(text1) > 100 and len(text2) > 100: |
| 196 | + return self.diff_lineMode(text1, text2, deadline) |
| 197 | + |
| 198 | + return self.diff_bisect(text1, text2, deadline) |
| 199 | + |
| 200 | + def diff_lineMode(self, text1, text2, deadline): |
| 201 | + """Do a quick line-level diff on both strings, then rediff the parts for |
| 202 | + greater accuracy. |
| 203 | + This speedup can produce non-minimal diffs. |
| 204 | + |
| 205 | + Args: |
| 206 | + text1: Old string to be diffed. |
| 207 | + text2: New string to be diffed. |
| 208 | + deadline: Time when the diff should be complete by. |
| 209 | + |
| 210 | + Returns: |
| 211 | + Array of changes. |
| 212 | + """ |
| 213 | + |
| 214 | + # Scan the text on a line-by-line basis first. |
| 215 | + (text1, text2, linearray) = self.diff_linesToChars(text1, text2) |
| 216 | + |
| 217 | + diffs = self.diff_main(text1, text2, False, deadline) |
| 218 | + |
| 219 | + # Convert the diff back to original text. |
| 220 | + self.diff_charsToLines(diffs, linearray) |
| 221 | + # Eliminate freak matches (e.g. blank lines) |
| 222 | + self.diff_cleanupSemantic(diffs) |
| 223 | + |
| 224 | + # Rediff any replacement blocks, this time character-by-character. |
| 225 | + # Add a dummy entry at the end. |
| 226 | + diffs.append((self.DIFF_EQUAL, '')) |
| 227 | + pointer = 0 |
| 228 | + count_delete = 0 |
| 229 | + count_insert = 0 |
| 230 | + text_delete = '' |
| 231 | + text_insert = '' |
| 232 | + while pointer < len(diffs): |
| 233 | + if diffs[pointer][0] == self.DIFF_INSERT: |
| 234 | + count_insert += 1 |
| 235 | + text_insert += diffs[pointer][1] |
| 236 | + elif diffs[pointer][0] == self.DIFF_DELETE: |
| 237 | + count_delete += 1 |
| 238 | + text_delete += diffs[pointer][1] |
| 239 | + elif diffs[pointer][0] == self.DIFF_EQUAL: |
| 240 | + # Upon reaching an equality, check for prior redundancies. |
| 241 | + if count_delete >= 1 and count_insert >= 1: |
| 242 | + # Delete the offending records and add the merged ones. |
| 243 | + a = self.diff_main(text_delete, text_insert, False, deadline) |
| 244 | + diffs[pointer - count_delete - count_insert : pointer] = a |
| 245 | + pointer = pointer - count_delete - count_insert + len(a) |
| 246 | + count_insert = 0 |
| 247 | + count_delete = 0 |
| 248 | + text_delete = '' |
| 249 | + text_insert = '' |
| 250 | + |
| 251 | + pointer += 1 |
| 252 | + |
| 253 | + diffs.pop() # Remove the dummy entry at the end. |
| 254 | + |
| 255 | + return diffs |
| 256 | + |
| 257 | + def diff_bisect(self, text1, text2, deadline): |
| 258 | + """Find the 'middle snake' of a diff, split the problem in two |
| 259 | + and return the recursively constructed diff. |
| 260 | + See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. |
| 261 | + |
| 262 | + Args: |
| 263 | + text1: Old string to be diffed. |
| 264 | + text2: New string to be diffed. |
| 265 | + deadline: Time at which to bail if not yet complete. |
| 266 | + |
| 267 | + Returns: |
| 268 | + Array of diff tuples. |
| 269 | + """ |
| 270 | + |
| 271 | + # Cache the text lengths to prevent multiple calls. |
| 272 | + text1_length = len(text1) |
| 273 | + text2_length = len(text2) |
| 274 | + max_d = (text1_length + text2_length + 1) / 2 |
| 275 | + v_offset = max_d |
| 276 | + v_length = 2 * max_d |
| 277 | + v1 = [-1] * v_length |
| 278 | + v1[v_offset + 1] = 0 |
| 279 | + v2 = v1[:] |
| 280 | + delta = text1_length - text2_length |
| 281 | + # If the total number of characters is odd, then the front path will |
| 282 | + # collide with the reverse path. |
| 283 | + front = (delta % 2 != 0) |
| 284 | + # Offsets for start and end of k loop. |
| 285 | + # Prevents mapping of space beyond the grid. |
| 286 | + k1start = 0 |
| 287 | + k1end = 0 |
| 288 | + k2start = 0 |
| 289 | + k2end = 0 |
| 290 | + for d in xrange(max_d): |
| 291 | + # Bail out if deadline is reached. |
| 292 | + if time.time() > deadline: |
| 293 | + break |
| 294 | + |
| 295 | + # Walk the front path one step. |
| 296 | + for k1 in xrange(-d + k1start, d + 1 - k1end, 2): |
| 297 | + k1_offset = v_offset + k1 |
| 298 | + if (k1 == -d or k1 != d and |
| 299 | + v1[k1_offset - 1] < v1[k1_offset + 1]): |
| 300 | + x1 = v1[k1_offset + 1] |
| 301 | + else: |
| 302 | + x1 = v1[k1_offset - 1] + 1 |
| 303 | + y1 = x1 - k1 |
| 304 | + while (x1 < text1_length and y1 < text2_length and |
| 305 | + text1[x1] == text2[y1]): |
| 306 | + x1 += 1 |
| 307 | + y1 += 1 |
| 308 | + v1[k1_offset] = x1 |
| 309 | + if x1 > text1_length: |
| 310 | + # Ran off the right of the graph. |
| 311 | + k1end += 2 |
| 312 | + elif y1 > text2_length: |
| 313 | + # Ran off the bottom of the graph. |
| 314 | + k1start += 2 |
| 315 | + elif front: |
| 316 | + k2_offset = v_offset + delta - k1 |
| 317 | + if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1: |
| 318 | + # Mirror x2 onto top-left coordinate system. |
| 319 | + x2 = text1_length - v2[k2_offset] |
| 320 | + if x1 >= x2: |
| 321 | + # Overlap detected. |
| 322 | + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) |
| 323 | + |
| 324 | + # Walk the reverse path one step. |
| 325 | + for k2 in xrange(-d + k2start, d + 1 - k2end, 2): |
| 326 | + k2_offset = v_offset + k2 |
| 327 | + if (k2 == -d or k2 != d and |
| 328 | + v2[k2_offset - 1] < v2[k2_offset + 1]): |
| 329 | + x2 = v2[k2_offset + 1] |
| 330 | + else: |
| 331 | + x2 = v2[k2_offset - 1] + 1 |
| 332 | + y2 = x2 - k2 |
| 333 | + while (x2 < text1_length and y2 < text2_length and |
| 334 | + text1[-x2 - 1] == text2[-y2 - 1]): |
| 335 | + x2 += 1 |
| 336 | + y2 += 1 |
| 337 | + v2[k2_offset] = x2 |
| 338 | + if x2 > text1_length: |
| 339 | + # Ran off the left of the graph. |
| 340 | + k2end += 2 |
| 341 | + elif y2 > text2_length: |
| 342 | + # Ran off the top of the graph. |
| 343 | + k2start += 2 |
| 344 | + elif not front: |
| 345 | + k1_offset = v_offset + delta - k2 |
| 346 | + if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1: |
| 347 | + x1 = v1[k1_offset] |
| 348 | + y1 = v_offset + x1 - k1_offset |
| 349 | + # Mirror x2 onto top-left coordinate system. |
| 350 | + x2 = text1_length - x2 |
| 351 | + if x1 >= x2: |
| 352 | + # Overlap detected. |
| 353 | + return self.diff_bisectSplit(text1, text2, x1, y1, deadline) |
| 354 | + |
| 355 | + # Diff took too long and hit the deadline or |
| 356 | + # number of diffs equals number of characters, no commonality at all. |
| 357 | + return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] |
| 358 | + |
| 359 | + def diff_bisectSplit(self, text1, text2, x, y, deadline): |
| 360 | + """Given the location of the 'middle snake', split the diff in two parts |
| 361 | + and recurse. |
| 362 | + |
| 363 | + Args: |
| 364 | + text1: Old string to be diffed. |
| 365 | + text2: New string to be diffed. |
| 366 | + x: Index of split point in text1. |
| 367 | + y: Index of split point in text2. |
| 368 | + deadline: Time at which to bail if not yet complete. |
| 369 | + |
| 370 | + Returns: |
| 371 | + Array of diff tuples. |
| 372 | + """ |
| 373 | + text1a = text1[:x] |
| 374 | + text2a = text2[:y] |
| 375 | + text1b = text1[x:] |
| 376 | + text2b = text2[y:] |
| 377 | + |
| 378 | + # Compute both diffs serially. |
| 379 | + diffs = self.diff_main(text1a, text2a, False, deadline) |
| 380 | + diffsb = self.diff_main(text1b, text2b, False, deadline) |
| 381 | + |
| 382 | + return diffs + diffsb |
| 383 | + |
| 384 | + def diff_linesToChars(self, text1, text2): |
| 385 | + """Split two texts into an array of strings. Reduce the texts to a string |
| 386 | + of hashes where each Unicode character represents one line. |
| 387 | + |
| 388 | + Args: |
| 389 | + text1: First string. |
| 390 | + text2: Second string. |
| 391 | + |
| 392 | + Returns: |
| 393 | + Three element tuple, containing the encoded text1, the encoded text2 and |
| 394 | + the array of unique strings. The zeroth element of the array of unique |
| 395 | + strings is intentionally blank. |
| 396 | + """ |
| 397 | + lineArray = [] # e.g. lineArray[4] == "Hello\n" |
| 398 | + lineHash = {} # e.g. lineHash["Hello\n"] == 4 |
| 399 | + |
| 400 | + # "\x00" is a valid character, but various debuggers don't like it. |
| 401 | + # So we'll insert a junk entry to avoid generating a null character. |
| 402 | + lineArray.append('') |
| 403 | + |
| 404 | + def diff_linesToCharsMunge(text): |
| 405 | + """Split a text into an array of strings. Reduce the texts to a string |
| 406 | + of hashes where each Unicode character represents one line. |
| 407 | + Modifies linearray and linehash through being a closure. |
| 408 | + |
| 409 | + Args: |
| 410 | + text: String to encode. |
| 411 | + |
| 412 | + Returns: |
| 413 | + Encoded string. |
| 414 | + """ |
| 415 | + chars = [] |
| 416 | + # Walk the text, pulling out a substring for each line. |
| 417 | + # text.split('\n') would temporarily double our memory footprint.
| 418 | + # Modifying text would create many large strings to garbage collect. |
| 419 | + lineStart = 0 |
| 420 | + lineEnd = -1 |
| 421 | + while lineEnd < len(text) - 1: |
| 422 | + lineEnd = text.find('\n', lineStart) |
| 423 | + if lineEnd == -1: |
| 424 | + lineEnd = len(text) - 1 |
| 425 | + line = text[lineStart:lineEnd + 1] |
| 426 | + lineStart = lineEnd + 1 |
| 427 | + |
| 428 | + if line in lineHash: |
| 429 | + chars.append(unichr(lineHash[line])) |
| 430 | + else: |
| 431 | + lineArray.append(line) |
| 432 | + lineHash[line] = len(lineArray) - 1 |
| 433 | + chars.append(unichr(len(lineArray) - 1)) |
| 434 | + return "".join(chars) |
| 435 | + |
| 436 | + chars1 = diff_linesToCharsMunge(text1) |
| 437 | + chars2 = diff_linesToCharsMunge(text2) |
| 438 | + return (chars1, chars2, lineArray) |
| 439 | + |
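
A sketch of the encoding round trip that diff_lineMode builds on: each distinct line is mapped to a single Unicode character, the cheap character-level diff runs on those, and diff_charsToLines (defined below) rehydrates the result:

    # Line-mode round trip.
    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    a = "alpha\nbeta\ngamma\n"
    b = "alpha\nbeta\ndelta\n"
    (chars1, chars2, lineArray) = dmp.diff_linesToChars(a, b)
    diffs = dmp.diff_main(chars1, chars2, False)  # one character == one line
    dmp.diff_charsToLines(diffs, lineArray)       # back to real lines of text
    # diffs now describes whole-line edits, e.g. 'gamma\n' -> 'delta\n'.
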
| 440 | + def diff_linesToWords(self, text1, text2): |
| 441 | + """ |
| 442 | + INSERT BY FABIAN |
| 443 | + Split two texts into an array of strings. Reduce the texts to a string |
| 444 | + of hashes where each Unicode character represents one word. |
| 445 | + |
| 446 | + Args: |
| 447 | + text1: First string. |
| 448 | + text2: Second string. |
| 449 | + |
| 450 | + Returns: |
| 451 | + Three element tuple, containing the encoded text1, the encoded text2 and |
| 452 | + the array of unique strings. The zeroth element of the array of unique |
| 453 | + strings is intentionally blank. |
| 454 | + """ |
| 455 | + lineArray = [] # e.g. lineArray[4] == "Hello "
| 456 | + lineHash = {} # e.g. lineHash["Hello "] == 4
| 457 | + |
| 458 | + # "\x00" is a valid character, but various debuggers don't like it. |
| 459 | + # So we'll insert a junk entry to avoid generating a null character. |
| 460 | + lineArray.append('') |
| 461 | + |
| 462 | + def diff_linesToCharsMunge(text): |
| 463 | + """Split a text into an array of strings. Reduce the texts to a string
| 464 | + of hashes where each Unicode character represents one word.
| 465 | + Modifies linearray and linehash through being a closure. |
| 466 | + |
| 467 | + Args: |
| 468 | + text: String to encode. |
| 469 | + |
| 470 | + Returns: |
| 471 | + Encoded string. |
| 472 | + """ |
| 473 | + chars = [] |
| 474 | + # Walk the text, pulling out a substring for each word.
| 475 | + # text.split(' ') would temporarily double our memory footprint.
| 476 | + # Modifying text would create many large strings to garbage collect. |
| 477 | + lineStart = 0 |
| 478 | + lineEnd = -1 |
| 479 | + while lineEnd < len(text) - 1: |
| 480 | + lineEnd = text.find(' ', lineStart) |
| 481 | + if lineEnd == -1: |
| 482 | + lineEnd = len(text) - 1 |
| 483 | + line = text[lineStart:lineEnd + 1] |
| 484 | + lineStart = lineEnd + 1 |
| 485 | + |
| 486 | + if line in lineHash: |
| 487 | + chars.append(unichr(lineHash[line])) |
| 488 | + else: |
| 489 | + lineArray.append(line) |
| 490 | + lineHash[line] = len(lineArray) - 1 |
| 491 | + chars.append(unichr(len(lineArray) - 1)) |
| 492 | + return "".join(chars) |
| 493 | + |
| 494 | + chars1 = diff_linesToCharsMunge(text1) |
| 495 | + chars2 = diff_linesToCharsMunge(text2) |
| 496 | + return (chars1, chars2, lineArray) |
| 497 | + |
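
The word-level variant above (the "INSERT BY FABIAN" addition) tokenizes on spaces rather than newlines but uses the identical encoding scheme, so the same rehydration helper applies. A hedged sketch:

    # Word-mode diff via the Fabian variant.
    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    (words1, words2, wordArray) = dmp.diff_linesToWords("the quick fox",
                                                        "the slow fox")
    diffs = dmp.diff_main(words1, words2, False)
    dmp.diff_charsToLines(diffs, wordArray)  # works on any token array
    # diffs now describes whole-word edits: 'quick ' deleted, 'slow ' inserted.
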
| 498 | + |
| 499 | + |
| 500 | + def diff_charsToLines(self, diffs, lineArray): |
| 501 | + """Rehydrate the text in a diff from a string of line hashes to real lines |
| 502 | + of text. |
| 503 | + |
| 504 | + Args: |
| 505 | + diffs: Array of diff tuples. |
| 506 | + lineArray: Array of unique strings. |
| 507 | + """ |
| 508 | + for x in xrange(len(diffs)): |
| 509 | + text = [] |
| 510 | + for char in diffs[x][1]: |
| 511 | + text.append(lineArray[ord(char)]) |
| 512 | + diffs[x] = (diffs[x][0], "".join(text)) |
| 513 | + |
| 514 | + def diff_commonPrefix(self, text1, text2): |
| 515 | + """Determine the common prefix of two strings. |
| 516 | + |
| 517 | + Args: |
| 518 | + text1: First string. |
| 519 | + text2: Second string. |
| 520 | + |
| 521 | + Returns: |
| 522 | + The number of characters common to the start of each string. |
| 523 | + """ |
| 524 | + # Quick check for common null cases. |
| 525 | + if not text1 or not text2 or text1[0] != text2[0]: |
| 526 | + return 0 |
| 527 | + # Binary search. |
| 528 | + # Performance analysis: http://neil.fraser.name/news/2007/10/09/ |
| 529 | + pointermin = 0 |
| 530 | + pointermax = min(len(text1), len(text2)) |
| 531 | + pointermid = pointermax |
| 532 | + pointerstart = 0 |
| 533 | + while pointermin < pointermid: |
| 534 | + if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: |
| 535 | + pointermin = pointermid |
| 536 | + pointerstart = pointermin |
| 537 | + else: |
| 538 | + pointermax = pointermid |
| 539 | + pointermid = int((pointermax - pointermin) / 2 + pointermin) |
| 540 | + return pointermid |
| 541 | + |
| 542 | + def diff_commonSuffix(self, text1, text2): |
| 543 | + """Determine the common suffix of two strings. |
| 544 | + |
| 545 | + Args: |
| 546 | + text1: First string. |
| 547 | + text2: Second string. |
| 548 | + |
| 549 | + Returns: |
| 550 | + The number of characters common to the end of each string. |
| 551 | + """ |
| 552 | + # Quick check for common null cases. |
| 553 | + if not text1 or not text2 or text1[-1] != text2[-1]: |
| 554 | + return 0 |
| 555 | + # Binary search. |
| 556 | + # Performance analysis: http://neil.fraser.name/news/2007/10/09/ |
| 557 | + pointermin = 0 |
| 558 | + pointermax = min(len(text1), len(text2)) |
| 559 | + pointermid = pointermax |
| 560 | + pointerend = 0 |
| 561 | + while pointermin < pointermid: |
| 562 | + if (text1[-pointermid:len(text1) - pointerend] == |
| 563 | + text2[-pointermid:len(text2) - pointerend]): |
| 564 | + pointermin = pointermid |
| 565 | + pointerend = pointermin |
| 566 | + else: |
| 567 | + pointermax = pointermid |
| 568 | + pointermid = int((pointermax - pointermin) / 2 + pointermin) |
| 569 | + return pointermid |
| 570 | + |
| 571 | + def diff_commonOverlap(self, text1, text2): |
| 572 | + """Determine if the suffix of one string is the prefix of another. |
| 573 | + |
| 574 | + Args: |
| 575 | + text1 First string. |
| 576 | + text2 Second string. |
| 577 | + |
| 578 | + Returns: |
| 579 | + The number of characters common to the end of the first |
| 580 | + string and the start of the second string. |
| 581 | + """ |
| 582 | + # Cache the text lengths to prevent multiple calls. |
| 583 | + text1_length = len(text1) |
| 584 | + text2_length = len(text2) |
| 585 | + # Eliminate the null case. |
| 586 | + if text1_length == 0 or text2_length == 0: |
| 587 | + return 0 |
| 588 | + # Truncate the longer string. |
| 589 | + if text1_length > text2_length: |
| 590 | + text1 = text1[-text2_length:] |
| 591 | + elif text1_length < text2_length: |
| 592 | + text2 = text2[:text1_length] |
| 593 | + text_length = min(text1_length, text2_length) |
| 594 | + # Quick check for the worst case. |
| 595 | + if text1 == text2: |
| 596 | + return text_length |
| 597 | + |
| 598 | + # Start by looking for a single character match |
| 599 | + # and increase length until no match is found. |
| 600 | + # Performance analysis: http://neil.fraser.name/news/2010/11/04/ |
| 601 | + best = 0 |
| 602 | + length = 1 |
| 603 | + while True: |
| 604 | + pattern = text1[-length:] |
| 605 | + found = text2.find(pattern) |
| 606 | + if found == -1: |
| 607 | + return best |
| 608 | + length += found |
| 609 | + if found == 0 or text1[-length:] == text2[:length]: |
| 610 | + best = length |
| 611 | + length += 1 |
| 612 | + |
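
For concreteness, the three common-substring helpers behave as follows (a quick self-check sketch; the values follow directly from the definitions above):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    assert dmp.diff_commonPrefix("1234abcdef", "1234xyz") == 4   # '1234'
    assert dmp.diff_commonSuffix("abcdef1234", "xyz1234") == 4   # '1234'
    assert dmp.diff_commonOverlap("123456xxx", "xxxabcd") == 3   # 'xxx'
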
| 613 | + def diff_halfMatch(self, text1, text2): |
| 614 | + """Do the two texts share a substring which is at least half the length of |
| 615 | + the longer text? |
| 616 | + This speedup can produce non-minimal diffs. |
| 617 | + |
| 618 | + Args: |
| 619 | + text1: First string. |
| 620 | + text2: Second string. |
| 621 | + |
| 622 | + Returns: |
| 623 | + Five element Array, containing the prefix of text1, the suffix of text1, |
| 624 | + the prefix of text2, the suffix of text2 and the common middle. Or None |
| 625 | + if there was no match. |
| 626 | + """ |
| 627 | + if self.Diff_Timeout <= 0: |
| 628 | + # Don't risk returning a non-optimal diff if we have unlimited time. |
| 629 | + return None |
| 630 | + if len(text1) > len(text2): |
| 631 | + (longtext, shorttext) = (text1, text2) |
| 632 | + else: |
| 633 | + (shorttext, longtext) = (text1, text2) |
| 634 | + if len(longtext) < 4 or len(shorttext) * 2 < len(longtext): |
| 635 | + return None # Pointless. |
| 636 | + |
| 637 | + def diff_halfMatchI(longtext, shorttext, i): |
| 638 | + """Does a substring of shorttext exist within longtext such that the |
| 639 | + substring is at least half the length of longtext? |
| 640 | + Closure, but does not reference any external variables. |
| 641 | + |
| 642 | + Args: |
| 643 | + longtext: Longer string. |
| 644 | + shorttext: Shorter string. |
| 645 | + i: Start index of quarter length substring within longtext. |
| 646 | + |
| 647 | + Returns: |
| 648 | + Five element Array, containing the prefix of longtext, the suffix of |
| 649 | + longtext, the prefix of shorttext, the suffix of shorttext and the |
| 650 | + common middle. Or None if there was no match. |
| 651 | + """ |
| 652 | + seed = longtext[i:i + len(longtext) / 4] |
| 653 | + best_common = '' |
| 654 | + j = shorttext.find(seed) |
| 655 | + while j != -1: |
| 656 | + prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) |
| 657 | + suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) |
| 658 | + if len(best_common) < suffixLength + prefixLength: |
| 659 | + best_common = (shorttext[j - suffixLength:j] + |
| 660 | + shorttext[j:j + prefixLength]) |
| 661 | + best_longtext_a = longtext[:i - suffixLength] |
| 662 | + best_longtext_b = longtext[i + prefixLength:] |
| 663 | + best_shorttext_a = shorttext[:j - suffixLength] |
| 664 | + best_shorttext_b = shorttext[j + prefixLength:] |
| 665 | + j = shorttext.find(seed, j + 1) |
| 666 | + |
| 667 | + if len(best_common) * 2 >= len(longtext): |
| 668 | + return (best_longtext_a, best_longtext_b, |
| 669 | + best_shorttext_a, best_shorttext_b, best_common) |
| 670 | + else: |
| 671 | + return None |
| 672 | + |
| 673 | + # First check if the second quarter is the seed for a half-match. |
| 674 | + hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4) |
| 675 | + # Check again based on the third quarter. |
| 676 | + hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2) |
| 677 | + if not hm1 and not hm2: |
| 678 | + return None |
| 679 | + elif not hm2: |
| 680 | + hm = hm1 |
| 681 | + elif not hm1: |
| 682 | + hm = hm2 |
| 683 | + else: |
| 684 | + # Both matched. Select the longest. |
| 685 | + if len(hm1[4]) > len(hm2[4]): |
| 686 | + hm = hm1 |
| 687 | + else: |
| 688 | + hm = hm2 |
| 689 | + |
| 690 | + # A half-match was found, sort out the return data. |
| 691 | + if len(text1) > len(text2): |
| 692 | + (text1_a, text1_b, text2_a, text2_b, mid_common) = hm |
| 693 | + else: |
| 694 | + (text2_a, text2_b, text1_a, text1_b, mid_common) = hm |
| 695 | + return (text1_a, text1_b, text2_a, text2_b, mid_common) |
| 696 | + |
| 697 | + def diff_cleanupSemantic(self, diffs): |
| 698 | + """Reduce the number of edits by eliminating semantically trivial |
| 699 | + equalities. |
| 700 | + |
| 701 | + Args: |
| 702 | + diffs: Array of diff tuples. |
| 703 | + """ |
| 704 | + changes = False |
| 705 | + equalities = [] # Stack of indices where equalities are found. |
| 706 | + lastequality = None # Always equal to equalities[-1][1] |
| 707 | + pointer = 0 # Index of current position. |
| 708 | + # Number of chars that changed prior to the equality. |
| 709 | + length_insertions1, length_deletions1 = 0, 0 |
| 710 | + # Number of chars that changed after the equality. |
| 711 | + length_insertions2, length_deletions2 = 0, 0 |
| 712 | + while pointer < len(diffs): |
| 713 | + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. |
| 714 | + equalities.append(pointer) |
| 715 | + length_insertions1, length_insertions2 = length_insertions2, 0 |
| 716 | + length_deletions1, length_deletions2 = length_deletions2, 0 |
| 717 | + lastequality = diffs[pointer][1] |
| 718 | + else: # An insertion or deletion. |
| 719 | + if diffs[pointer][0] == self.DIFF_INSERT: |
| 720 | + length_insertions2 += len(diffs[pointer][1]) |
| 721 | + else: |
| 722 | + length_deletions2 += len(diffs[pointer][1]) |
| 723 | + # Eliminate an equality that is smaller or equal to the edits on both |
| 724 | + # sides of it. |
| 725 | + if (lastequality != None and (len(lastequality) <= |
| 726 | + max(length_insertions1, length_deletions1)) and |
| 727 | + (len(lastequality) <= max(length_insertions2, length_deletions2))): |
| 728 | + # Duplicate record. |
| 729 | + diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) |
| 730 | + # Change second copy to insert. |
| 731 | + diffs[equalities[-1] + 1] = (self.DIFF_INSERT, |
| 732 | + diffs[equalities[-1] + 1][1]) |
| 733 | + # Throw away the equality we just deleted. |
| 734 | + equalities.pop() |
| 735 | + # Throw away the previous equality (it needs to be reevaluated). |
| 736 | + if len(equalities): |
| 737 | + equalities.pop() |
| 738 | + if len(equalities): |
| 739 | + pointer = equalities[-1] |
| 740 | + else: |
| 741 | + pointer = -1 |
| 742 | + # Reset the counters. |
| 743 | + length_insertions1, length_deletions1 = 0, 0 |
| 744 | + length_insertions2, length_deletions2 = 0, 0 |
| 745 | + lastequality = None |
| 746 | + changes = True |
| 747 | + pointer += 1 |
| 748 | + |
| 749 | + # Normalize the diff. |
| 750 | + if changes: |
| 751 | + self.diff_cleanupMerge(diffs) |
| 752 | + self.diff_cleanupSemanticLossless(diffs) |
| 753 | + |
| 754 | + # Find any overlaps between deletions and insertions. |
| 755 | + # e.g: <del>abcxxx</del><ins>xxxdef</ins> |
| 756 | + # -> <del>abc</del>xxx<ins>def</ins> |
| 757 | + # Only extract an overlap if it is as big as the edit ahead or behind it. |
| 758 | + pointer = 1 |
| 759 | + while pointer < len(diffs): |
| 760 | + if (diffs[pointer - 1][0] == self.DIFF_DELETE and |
| 761 | + diffs[pointer][0] == self.DIFF_INSERT): |
| 762 | + deletion = diffs[pointer - 1][1] |
| 763 | + insertion = diffs[pointer][1] |
| 764 | + overlap_length = self.diff_commonOverlap(deletion, insertion) |
| 765 | + if (overlap_length >= len(deletion) / 2.0 or |
| 766 | + overlap_length >= len(insertion) / 2.0): |
| 767 | + # Overlap found. Insert an equality and trim the surrounding edits. |
| 768 | + diffs.insert(pointer, (self.DIFF_EQUAL, insertion[:overlap_length])) |
| 769 | + diffs[pointer - 1] = (self.DIFF_DELETE, |
| 770 | + deletion[:len(deletion) - overlap_length]) |
| 771 | + diffs[pointer + 1] = (self.DIFF_INSERT, insertion[overlap_length:]) |
| 772 | + pointer += 1 |
| 773 | + pointer += 1 |
| 774 | + pointer += 1 |
| 775 | + |
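
The overlap extraction described in the comment above, shown end to end (a small sketch; the expected list follows the <del>abcxxx</del><ins>xxxdef</ins> example):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_DELETE, "abcxxx"), (dmp.DIFF_INSERT, "xxxdef")]
    dmp.diff_cleanupSemantic(diffs)
    assert diffs == [(dmp.DIFF_DELETE, "abc"),
                     (dmp.DIFF_EQUAL, "xxx"),
                     (dmp.DIFF_INSERT, "def")]
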
| 776 | + def diff_cleanupSemanticLossless(self, diffs): |
| 777 | + """Look for single edits surrounded on both sides by equalities |
| 778 | + which can be shifted sideways to align the edit to a word boundary. |
| 779 | + e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. |
| 780 | + |
| 781 | + Args: |
| 782 | + diffs: Array of diff tuples. |
| 783 | + """ |
| 784 | + |
| 785 | + def diff_cleanupSemanticScore(one, two): |
| 786 | + """Given two strings, compute a score representing whether the |
| 787 | + internal boundary falls on logical boundaries. |
| 788 | + Scores range from 5 (best) to 0 (worst). |
| 789 | + Closure, but does not reference any external variables. |
| 790 | + |
| 791 | + Args: |
| 792 | + one: First string. |
| 793 | + two: Second string. |
| 794 | + |
| 795 | + Returns: |
| 796 | + The score. |
| 797 | + """ |
| 798 | + if not one or not two: |
| 799 | + # Edges are the best. |
| 800 | + return 5 |
| 801 | + |
| 802 | + # Each port of this function behaves slightly differently due to |
| 803 | + # subtle differences in each language's definition of things like |
| 804 | + # 'whitespace'. Since this function's purpose is largely cosmetic, |
| 805 | + # the choice has been made to use each language's native features |
| 806 | + # rather than force total conformity. |
| 807 | + score = 0 |
| 808 | + # One point for non-alphanumeric. |
| 809 | + if not one[-1].isalnum() or not two[0].isalnum(): |
| 810 | + score += 1 |
| 811 | + # Two points for whitespace. |
| 812 | + if one[-1].isspace() or two[0].isspace(): |
| 813 | + score += 1 |
| 814 | + # Three points for line breaks. |
| 815 | + if (one[-1] == "\r" or one[-1] == "\n" or |
| 816 | + two[0] == "\r" or two[0] == "\n"): |
| 817 | + score += 1 |
| 818 | + # Four points for blank lines. |
| 819 | + if (re.search("\\n\\r?\\n$", one) or |
| 820 | + re.match("^\\r?\\n\\r?\\n", two)): |
| 821 | + score += 1 |
| 822 | + return score |
| 823 | + |
| 824 | + pointer = 1 |
| 825 | + # Intentionally ignore the first and last element (don't need checking). |
| 826 | + while pointer < len(diffs) - 1: |
| 827 | + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and |
| 828 | + diffs[pointer + 1][0] == self.DIFF_EQUAL): |
| 829 | + # This is a single edit surrounded by equalities. |
| 830 | + equality1 = diffs[pointer - 1][1] |
| 831 | + edit = diffs[pointer][1] |
| 832 | + equality2 = diffs[pointer + 1][1] |
| 833 | + |
| 834 | + # First, shift the edit as far left as possible. |
| 835 | + commonOffset = self.diff_commonSuffix(equality1, edit) |
| 836 | + if commonOffset: |
| 837 | + commonString = edit[-commonOffset:] |
| 838 | + equality1 = equality1[:-commonOffset] |
| 839 | + edit = commonString + edit[:-commonOffset] |
| 840 | + equality2 = commonString + equality2 |
| 841 | + |
| 842 | + # Second, step character by character right, looking for the best fit. |
| 843 | + bestEquality1 = equality1 |
| 844 | + bestEdit = edit |
| 845 | + bestEquality2 = equality2 |
| 846 | + bestScore = (diff_cleanupSemanticScore(equality1, edit) + |
| 847 | + diff_cleanupSemanticScore(edit, equality2)) |
| 848 | + while edit and equality2 and edit[0] == equality2[0]: |
| 849 | + equality1 += edit[0] |
| 850 | + edit = edit[1:] + equality2[0] |
| 851 | + equality2 = equality2[1:] |
| 852 | + score = (diff_cleanupSemanticScore(equality1, edit) + |
| 853 | + diff_cleanupSemanticScore(edit, equality2)) |
| 854 | + # The >= encourages trailing rather than leading whitespace on edits. |
| 855 | + if score >= bestScore: |
| 856 | + bestScore = score |
| 857 | + bestEquality1 = equality1 |
| 858 | + bestEdit = edit |
| 859 | + bestEquality2 = equality2 |
| 860 | + |
| 861 | + if diffs[pointer - 1][1] != bestEquality1: |
| 862 | + # We have an improvement, save it back to the diff. |
| 863 | + if bestEquality1: |
| 864 | + diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) |
| 865 | + else: |
| 866 | + del diffs[pointer - 1] |
| 867 | + pointer -= 1 |
| 868 | + diffs[pointer] = (diffs[pointer][0], bestEdit) |
| 869 | + if bestEquality2: |
| 870 | + diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) |
| 871 | + else: |
| 872 | + del diffs[pointer + 1] |
| 873 | + pointer -= 1 |
| 874 | + pointer += 1 |
| 875 | + |
| 876 | + def diff_cleanupEfficiency(self, diffs): |
| 877 | + """Reduce the number of edits by eliminating operationally trivial |
| 878 | + equalities. |
| 879 | + |
| 880 | + Args: |
| 881 | + diffs: Array of diff tuples. |
| 882 | + """ |
| 883 | + changes = False |
| 884 | + equalities = [] # Stack of indices where equalities are found. |
| 885 | + lastequality = '' # Always equal to equalities[-1][1] |
| 886 | + pointer = 0 # Index of current position. |
| 887 | + pre_ins = False # Is there an insertion operation before the last equality. |
| 888 | + pre_del = False # Is there a deletion operation before the last equality. |
| 889 | + post_ins = False # Is there an insertion operation after the last equality. |
| 890 | + post_del = False # Is there a deletion operation after the last equality. |
| 891 | + while pointer < len(diffs): |
| 892 | + if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. |
| 893 | + if (len(diffs[pointer][1]) < self.Diff_EditCost and |
| 894 | + (post_ins or post_del)): |
| 895 | + # Candidate found. |
| 896 | + equalities.append(pointer) |
| 897 | + pre_ins = post_ins |
| 898 | + pre_del = post_del |
| 899 | + lastequality = diffs[pointer][1] |
| 900 | + else: |
| 901 | + # Not a candidate, and can never become one. |
| 902 | + equalities = [] |
| 903 | + lastequality = '' |
| 904 | + |
| 905 | + post_ins = post_del = False |
| 906 | + else: # An insertion or deletion. |
| 907 | + if diffs[pointer][0] == self.DIFF_DELETE: |
| 908 | + post_del = True |
| 909 | + else: |
| 910 | + post_ins = True |
| 911 | + |
| 912 | + # Five types to be split: |
| 913 | + # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del> |
| 914 | + # <ins>A</ins>X<ins>C</ins><del>D</del> |
| 915 | + # <ins>A</ins><del>B</del>X<ins>C</ins> |
| 916 | + # <ins>A</ins>X<ins>C</ins><del>D</del>
| 917 | + # <ins>A</ins><del>B</del>X<del>C</del> |
| 918 | + |
| 919 | + if lastequality and ((pre_ins and pre_del and post_ins and post_del) or |
| 920 | + ((len(lastequality) < self.Diff_EditCost / 2) and |
| 921 | + (pre_ins + pre_del + post_ins + post_del) == 3)): |
| 922 | + # Duplicate record. |
| 923 | + diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) |
| 924 | + # Change second copy to insert. |
| 925 | + diffs[equalities[-1] + 1] = (self.DIFF_INSERT, |
| 926 | + diffs[equalities[-1] + 1][1]) |
| 927 | + equalities.pop() # Throw away the equality we just deleted. |
| 928 | + lastequality = '' |
| 929 | + if pre_ins and pre_del: |
| 930 | + # No changes made which could affect previous entry, keep going. |
| 931 | + post_ins = post_del = True |
| 932 | + equalities = [] |
| 933 | + else: |
| 934 | + if len(equalities): |
| 935 | + equalities.pop() # Throw away the previous equality. |
| 936 | + if len(equalities): |
| 937 | + pointer = equalities[-1] |
| 938 | + else: |
| 939 | + pointer = -1 |
| 940 | + post_ins = post_del = False |
| 941 | + changes = True |
| 942 | + pointer += 1 |
| 943 | + |
| 944 | + if changes: |
| 945 | + self.diff_cleanupMerge(diffs) |
| 946 | + |
| 947 | + def diff_cleanupMerge(self, diffs): |
| 948 | + """Reorder and merge like edit sections. Merge equalities. |
| 949 | + Any edit section can move as long as it doesn't cross an equality. |
| 950 | + |
| 951 | + Args: |
| 952 | + diffs: Array of diff tuples. |
| 953 | + """ |
| 954 | + diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end. |
| 955 | + pointer = 0 |
| 956 | + count_delete = 0 |
| 957 | + count_insert = 0 |
| 958 | + text_delete = '' |
| 959 | + text_insert = '' |
| 960 | + while pointer < len(diffs): |
| 961 | + if diffs[pointer][0] == self.DIFF_INSERT: |
| 962 | + count_insert += 1 |
| 963 | + text_insert += diffs[pointer][1] |
| 964 | + pointer += 1 |
| 965 | + elif diffs[pointer][0] == self.DIFF_DELETE: |
| 966 | + count_delete += 1 |
| 967 | + text_delete += diffs[pointer][1] |
| 968 | + pointer += 1 |
| 969 | + elif diffs[pointer][0] == self.DIFF_EQUAL: |
| 970 | + # Upon reaching an equality, check for prior redundancies. |
| 971 | + if count_delete + count_insert > 1: |
| 972 | + if count_delete != 0 and count_insert != 0: |
| 973 | + # Factor out any common prefixes.
| 974 | + commonlength = self.diff_commonPrefix(text_insert, text_delete) |
| 975 | + if commonlength != 0: |
| 976 | + x = pointer - count_delete - count_insert - 1 |
| 977 | + if x >= 0 and diffs[x][0] == self.DIFF_EQUAL: |
| 978 | + diffs[x] = (diffs[x][0], diffs[x][1] + |
| 979 | + text_insert[:commonlength]) |
| 980 | + else: |
| 981 | + diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength])) |
| 982 | + pointer += 1 |
| 983 | + text_insert = text_insert[commonlength:] |
| 984 | + text_delete = text_delete[commonlength:] |
| 985 | + # Factor out any common suffixes.
| 986 | + commonlength = self.diff_commonSuffix(text_insert, text_delete) |
| 987 | + if commonlength != 0: |
| 988 | + diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] + |
| 989 | + diffs[pointer][1]) |
| 990 | + text_insert = text_insert[:-commonlength] |
| 991 | + text_delete = text_delete[:-commonlength] |
| 992 | + # Delete the offending records and add the merged ones. |
| 993 | + if count_delete == 0: |
| 994 | + diffs[pointer - count_insert : pointer] = [ |
| 995 | + (self.DIFF_INSERT, text_insert)] |
| 996 | + elif count_insert == 0: |
| 997 | + diffs[pointer - count_delete : pointer] = [ |
| 998 | + (self.DIFF_DELETE, text_delete)] |
| 999 | + else: |
| 1000 | + diffs[pointer - count_delete - count_insert : pointer] = [ |
| 1001 | + (self.DIFF_DELETE, text_delete), |
| 1002 | + (self.DIFF_INSERT, text_insert)] |
| 1003 | + pointer = pointer - count_delete - count_insert + 1 |
| 1004 | + if count_delete != 0: |
| 1005 | + pointer += 1 |
| 1006 | + if count_insert != 0: |
| 1007 | + pointer += 1 |
| 1008 | + elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL: |
| 1009 | + # Merge this equality with the previous one. |
| 1010 | + diffs[pointer - 1] = (diffs[pointer - 1][0], |
| 1011 | + diffs[pointer - 1][1] + diffs[pointer][1]) |
| 1012 | + del diffs[pointer] |
| 1013 | + else: |
| 1014 | + pointer += 1 |
| 1015 | + |
| 1016 | + count_insert = 0 |
| 1017 | + count_delete = 0 |
| 1018 | + text_delete = '' |
| 1019 | + text_insert = '' |
| 1020 | + |
| 1021 | + if diffs[-1][1] == '': |
| 1022 | + diffs.pop() # Remove the dummy entry at the end. |
| 1023 | + |
| 1024 | + # Second pass: look for single edits surrounded on both sides by equalities |
| 1025 | + # which can be shifted sideways to eliminate an equality. |
| 1026 | + # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC |
| 1027 | + changes = False |
| 1028 | + pointer = 1 |
| 1029 | + # Intentionally ignore the first and last element (don't need checking). |
| 1030 | + while pointer < len(diffs) - 1: |
| 1031 | + if (diffs[pointer - 1][0] == self.DIFF_EQUAL and |
| 1032 | + diffs[pointer + 1][0] == self.DIFF_EQUAL): |
| 1033 | + # This is a single edit surrounded by equalities. |
| 1034 | + if diffs[pointer][1].endswith(diffs[pointer - 1][1]): |
| 1035 | + # Shift the edit over the previous equality. |
| 1036 | + diffs[pointer] = (diffs[pointer][0], |
| 1037 | + diffs[pointer - 1][1] + |
| 1038 | + diffs[pointer][1][:-len(diffs[pointer - 1][1])]) |
| 1039 | + diffs[pointer + 1] = (diffs[pointer + 1][0], |
| 1040 | + diffs[pointer - 1][1] + diffs[pointer + 1][1]) |
| 1041 | + del diffs[pointer - 1] |
| 1042 | + changes = True |
| 1043 | + elif diffs[pointer][1].startswith(diffs[pointer + 1][1]): |
| 1044 | + # Shift the edit over the next equality. |
| 1045 | + diffs[pointer - 1] = (diffs[pointer - 1][0], |
| 1046 | + diffs[pointer - 1][1] + diffs[pointer + 1][1]) |
| 1047 | + diffs[pointer] = (diffs[pointer][0], |
| 1048 | + diffs[pointer][1][len(diffs[pointer + 1][1]):] + |
| 1049 | + diffs[pointer + 1][1]) |
| 1050 | + del diffs[pointer + 1] |
| 1051 | + changes = True |
| 1052 | + pointer += 1 |
| 1053 | + |
| 1054 | + # If shifts were made, the diff needs reordering and another shift sweep. |
| 1055 | + if changes: |
| 1056 | + self.diff_cleanupMerge(diffs) |
| 1057 | + |
| 1058 | + def diff_xIndex(self, diffs, loc): |
| 1059 | + """loc is a location in text1, compute and return the equivalent location |
| 1060 | + in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 |
| 1061 | + |
| 1062 | + Args: |
| 1063 | + diffs: Array of diff tuples. |
| 1064 | + loc: Location within text1. |
| 1065 | + |
| 1066 | + Returns: |
| 1067 | + Location within text2. |
| 1068 | + """ |
| 1069 | + chars1 = 0 |
| 1070 | + chars2 = 0 |
| 1071 | + last_chars1 = 0 |
| 1072 | + last_chars2 = 0 |
| 1073 | + for x in xrange(len(diffs)): |
| 1074 | + (op, text) = diffs[x] |
| 1075 | + if op != self.DIFF_INSERT: # Equality or deletion. |
| 1076 | + chars1 += len(text) |
| 1077 | + if op != self.DIFF_DELETE: # Equality or insertion. |
| 1078 | + chars2 += len(text) |
| 1079 | + if chars1 > loc: # Overshot the location. |
| 1080 | + break |
| 1081 | + last_chars1 = chars1 |
| 1082 | + last_chars2 = chars2 |
| 1083 | + |
| 1084 | + if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: |
| 1085 | + # The location was deleted. |
| 1086 | + return last_chars2 |
| 1087 | + # Add the remaining character length.
| 1088 | + return last_chars2 + (loc - last_chars1) |
| 1089 | + |
| 1090 | + def diff_prettyHtml(self, diffs): |
| 1091 | + """Convert a diff array into a pretty HTML report. |
| 1092 | + |
| 1093 | + Args: |
| 1094 | + diffs: Array of diff tuples. |
| 1095 | + |
| 1096 | + Returns: |
| 1097 | + HTML representation. |
| 1098 | + """ |
| 1099 | + html = [] |
| 1100 | + i = 0 |
| 1101 | + for (op, data) in diffs: |
| 1102 | + text = (data.replace("&", "&amp;").replace("<", "&lt;")
| 1103 | + .replace(">", "&gt;").replace("\n", "&para;<br>"))
| 1104 | + if op == self.DIFF_INSERT: |
| 1105 | + html.append("<ins style=\"background:#e6ffe6;\">%s</ins>" % text) |
| 1106 | + elif op == self.DIFF_DELETE: |
| 1107 | + html.append("<del style=\"background:#ffe6e6;\">%s</del>" % text) |
| 1108 | + elif op == self.DIFF_EQUAL: |
| 1109 | + html.append("<span>%s</span>" % text) |
| 1110 | + if op != self.DIFF_DELETE: |
| 1111 | + i += len(data) |
| 1112 | + return "".join(html) |
| 1113 | + |
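
A sketch of the HTML rendering, showing the entity escaping of markup characters and the pilcrow-plus-<br> treatment of newlines:

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_EQUAL, "a\n"), (dmp.DIFF_DELETE, "<B>b</B>"),
             (dmp.DIFF_INSERT, "c&d")]
    html = dmp.diff_prettyHtml(diffs)
    # Expect:
    #   <span>a&para;<br></span>
    #   <del style="background:#ffe6e6;">&lt;B&gt;b&lt;/B&gt;</del>
    #   <ins style="background:#e6ffe6;">c&amp;d</ins>
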
| 1114 | + def diff_text1(self, diffs): |
| 1115 | + """Compute and return the source text (all equalities and deletions). |
| 1116 | + |
| 1117 | + Args: |
| 1118 | + diffs: Array of diff tuples. |
| 1119 | + |
| 1120 | + Returns: |
| 1121 | + Source text. |
| 1122 | + """ |
| 1123 | + text = [] |
| 1124 | + for (op, data) in diffs: |
| 1125 | + if op != self.DIFF_INSERT: |
| 1126 | + text.append(data) |
| 1127 | + return "".join(text) |
| 1128 | + |
| 1129 | + def diff_text2(self, diffs): |
| 1130 | + """Compute and return the destination text (all equalities and insertions). |
| 1131 | + |
| 1132 | + Args: |
| 1133 | + diffs: Array of diff tuples. |
| 1134 | + |
| 1135 | + Returns: |
| 1136 | + Destination text. |
| 1137 | + """ |
| 1138 | + text = [] |
| 1139 | + for (op, data) in diffs: |
| 1140 | + if op != self.DIFF_DELETE: |
| 1141 | + text.append(data) |
| 1142 | + return "".join(text) |
| 1143 | + |
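
By construction, diff_text1 and diff_text2 losslessly reconstruct the two sides of the diff:

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = dmp.diff_main("jumps over the lazy", "jumped over a lazy", False)
    assert dmp.diff_text1(diffs) == "jumps over the lazy"   # source side
    assert dmp.diff_text2(diffs) == "jumped over a lazy"    # destination side
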
| 1144 | + def diff_levenshtein(self, diffs): |
| 1145 | + """Compute the Levenshtein distance; the number of inserted, deleted or |
| 1146 | + substituted characters. |
| 1147 | + |
| 1148 | + Args: |
| 1149 | + diffs: Array of diff tuples. |
| 1150 | + |
| 1151 | + Returns: |
| 1152 | + Number of changes. |
| 1153 | + """ |
| 1154 | + levenshtein = 0 |
| 1155 | + insertions = 0 |
| 1156 | + deletions = 0 |
| 1157 | + for (op, data) in diffs: |
| 1158 | + if op == self.DIFF_INSERT: |
| 1159 | + insertions += len(data) |
| 1160 | + elif op == self.DIFF_DELETE: |
| 1161 | + deletions += len(data) |
| 1162 | + elif op == self.DIFF_EQUAL: |
| 1163 | + # A deletion and an insertion is one substitution. |
| 1164 | + levenshtein += max(insertions, deletions) |
| 1165 | + insertions = 0 |
| 1166 | + deletions = 0 |
| 1167 | + levenshtein += max(insertions, deletions) |
| 1168 | + return levenshtein |
| 1169 | + |
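
A small check of the Levenshtein accounting: a paired deletion and insertion counts as one block of substitutions, so only the larger side contributes:

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    diffs = [(dmp.DIFF_DELETE, "abc"), (dmp.DIFF_INSERT, "1234"),
             (dmp.DIFF_EQUAL, "xyz")]
    assert dmp.diff_levenshtein(diffs) == 4  # max(3 deleted, 4 inserted)
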
| 1170 | + def diff_toDelta(self, diffs): |
| 1171 | + """Crush the diff into an encoded string which describes the operations |
| 1172 | + required to transform text1 into text2. |
| 1173 | + E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. |
| 1174 | + Operations are tab-separated. Inserted text is escaped using %xx notation. |
| 1175 | + |
| 1176 | + Args: |
| 1177 | + diffs: Array of diff tuples. |
| 1178 | + |
| 1179 | + Returns: |
| 1180 | + Delta text. |
| 1181 | + """ |
| 1182 | + text = [] |
| 1183 | + for (op, data) in diffs: |
| 1184 | + if op == self.DIFF_INSERT: |
| 1185 | + # High ascii will raise UnicodeDecodeError. Use Unicode instead. |
| 1186 | + data = data.encode("utf-8") |
| 1187 | + text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) |
| 1188 | + elif op == self.DIFF_DELETE: |
| 1189 | + text.append("-%d" % len(data)) |
| 1190 | + elif op == self.DIFF_EQUAL: |
| 1191 | + text.append("=%d" % len(data)) |
| 1192 | + return "\t".join(text) |
| 1193 | + |
| 1194 | + def diff_fromDelta(self, text1, delta): |
| 1195 | + """Given the original text1, and an encoded string which describes the |
| 1196 | + operations required to transform text1 into text2, compute the full diff. |
| 1197 | + |
| 1198 | + Args: |
| 1199 | + text1: Source string for the diff. |
| 1200 | + delta: Delta text. |
| 1201 | + |
| 1202 | + Returns: |
| 1203 | + Array of diff tuples. |
| 1204 | + |
| 1205 | + Raises: |
| 1206 | + ValueError: If invalid input. |
| 1207 | + """ |
| 1208 | + if type(delta) == unicode: |
| 1209 | + # Deltas should be composed of a subset of ascii chars, Unicode not |
| 1210 | + # required. If this encode raises UnicodeEncodeError, delta is invalid. |
| 1211 | + delta = delta.encode("ascii") |
| 1212 | + diffs = [] |
| 1213 | + pointer = 0 # Cursor in text1 |
| 1214 | + tokens = delta.split("\t") |
| 1215 | + for token in tokens: |
| 1216 | + if token == "": |
| 1217 | + # Blank tokens are ok (from a trailing \t). |
| 1218 | + continue |
| 1219 | + # Each token begins with a one character parameter which specifies the |
| 1220 | + # operation of this token (delete, insert, equality). |
| 1221 | + param = token[1:] |
| 1222 | + if token[0] == "+": |
| 1223 | + param = urllib.unquote(param).decode("utf-8") |
| 1224 | + diffs.append((self.DIFF_INSERT, param)) |
| 1225 | + elif token[0] == "-" or token[0] == "=": |
| 1226 | + try: |
| 1227 | + n = int(param) |
| 1228 | + except ValueError: |
| 1229 | + raise ValueError("Invalid number in diff_fromDelta: " + param) |
| 1230 | + if n < 0: |
| 1231 | + raise ValueError("Negative number in diff_fromDelta: " + param) |
| 1232 | + text = text1[pointer : pointer + n] |
| 1233 | + pointer += n |
| 1234 | + if token[0] == "=": |
| 1235 | + diffs.append((self.DIFF_EQUAL, text)) |
| 1236 | + else: |
| 1237 | + diffs.append((self.DIFF_DELETE, text)) |
| 1238 | + else: |
| 1239 | + # Anything else is an error. |
| 1240 | + raise ValueError("Invalid diff operation in diff_fromDelta: " + |
| 1241 | + token[0]) |
| 1242 | + if pointer != len(text1): |
| 1243 | + raise ValueError( |
| 1244 | + "Delta length (%d) does not equal source text length (%d)." % |
| 1245 | + (pointer, len(text1))) |
| 1246 | + return diffs |
| 1247 | + |
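
A delta round trip, sticking to plain ASCII (for non-ASCII inserts the decoded side comes back as unicode, which still compares equal under Python 2):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    text1 = "jumps over the lazy"
    diffs = dmp.diff_main(text1, "jumped over a lazy", False)
    delta = dmp.diff_toDelta(diffs)  # tab-separated, in the "=3\t-2\t+ing" style
    assert dmp.diff_fromDelta(text1, delta) == diffs
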
| 1248 | + # MATCH FUNCTIONS |
| 1249 | + |
| 1250 | + def match_main(self, text, pattern, loc): |
| 1251 | + """Locate the best instance of 'pattern' in 'text' near 'loc'. |
| 1252 | + |
| 1253 | + Args: |
| 1254 | + text: The text to search. |
| 1255 | + pattern: The pattern to search for. |
| 1256 | + loc: The location to search around. |
| 1257 | + |
| 1258 | + Returns: |
| 1259 | + Best match index or -1. |
| 1260 | + """ |
| 1261 | + # Check for null inputs. |
| 1262 | + if text == None or pattern == None: |
| 1263 | + raise ValueError("Null inputs. (match_main)") |
| 1264 | + |
| 1265 | + loc = max(0, min(loc, len(text))) |
| 1266 | + if text == pattern: |
| 1267 | + # Shortcut (potentially not guaranteed by the algorithm) |
| 1268 | + return 0 |
| 1269 | + elif not text: |
| 1270 | + # Nothing to match. |
| 1271 | + return -1 |
| 1272 | + elif text[loc:loc + len(pattern)] == pattern: |
| 1273 | + # Perfect match at the perfect spot! (Includes case of null pattern) |
| 1274 | + return loc |
| 1275 | + else: |
| 1276 | + # Do a fuzzy compare. |
| 1277 | + match = self.match_bitap(text, pattern, loc) |
| 1278 | + return match |
| 1279 | + |
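
A fuzzy-match sketch. With the stock Match_Threshold and Match_Distance the errorful pattern is expected to land near the requested location (the index below is the value the library's own tests use for this input):

    from diff_match_patch import diff_match_patch  # assumed module name

    dmp = diff_match_patch()
    text = "I am the very model of a modern major general."
    loc = dmp.match_main(text, " that berry ", 5)
    assert loc == 4  # best fuzzy fit: the " the very " region near index 5
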
| 1280 | + def match_bitap(self, text, pattern, loc): |
| 1281 | + """Locate the best instance of 'pattern' in 'text' near 'loc' using the |
| 1282 | + Bitap algorithm. |
| 1283 | + |
| 1284 | + Args: |
| 1285 | + text: The text to search. |
| 1286 | + pattern: The pattern to search for. |
| 1287 | + loc: The location to search around. |
| 1288 | + |
| 1289 | + Returns: |
| 1290 | + Best match index or -1. |
| 1291 | + """ |
| 1292 | + # Python doesn't have a maxint limit, so ignore this check. |
| 1293 | + #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: |
| 1294 | + # raise ValueError("Pattern too long for this application.") |
| 1295 | + |
| 1296 | + # Initialise the alphabet. |
| 1297 | + s = self.match_alphabet(pattern) |
| 1298 | + |
| 1299 | + def match_bitapScore(e, x): |
| 1300 | + """Compute and return the score for a match with e errors and x location. |
| 1301 | + Accesses loc and pattern through being a closure. |
| 1302 | + |
| 1303 | + Args: |
| 1304 | + e: Number of errors in match. |
| 1305 | + x: Location of match. |
| 1306 | + |
| 1307 | + Returns: |
| 1308 | + Overall score for match (0.0 = good, 1.0 = bad). |
| 1309 | + """ |
| 1310 | + accuracy = float(e) / len(pattern) |
| 1311 | + proximity = abs(loc - x) |
| 1312 | + if not self.Match_Distance: |
| 1313 | + # Dodge divide by zero error. |
| 1314 | + return proximity and 1.0 or accuracy |
| 1315 | + return accuracy + (proximity / float(self.Match_Distance)) |
| 1316 | + |
| 1317 | + # Highest score beyond which we give up. |
| 1318 | + score_threshold = self.Match_Threshold |
| 1319 | + # Is there a nearby exact match? (speedup) |
| 1320 | + best_loc = text.find(pattern, loc) |
| 1321 | + if best_loc != -1: |
| 1322 | + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) |
| 1323 | + # What about in the other direction? (speedup) |
| 1324 | + best_loc = text.rfind(pattern, loc + len(pattern)) |
| 1325 | + if best_loc != -1: |
| 1326 | + score_threshold = min(match_bitapScore(0, best_loc), score_threshold) |
| 1327 | + |
| 1328 | + # Initialise the bit arrays. |
| 1329 | + matchmask = 1 << (len(pattern) - 1) |
| 1330 | + best_loc = -1 |
| 1331 | + |
| 1332 | + bin_max = len(pattern) + len(text) |
| 1333 | + # Empty initialization added to appease pychecker. |
| 1334 | + last_rd = None |
| 1335 | + for d in xrange(len(pattern)): |
| 1336 | + # Scan for the best match each iteration allows for one more error. |
| 1337 | + # Run a binary search to determine how far from 'loc' we can stray at |
| 1338 | + # this error level. |
| 1339 | + bin_min = 0 |
| 1340 | + bin_mid = bin_max |
| 1341 | + while bin_min < bin_mid: |
| 1342 | + if match_bitapScore(d, loc + bin_mid) <= score_threshold: |
| 1343 | + bin_min = bin_mid |
| 1344 | + else: |
| 1345 | + bin_max = bin_mid |
| 1346 | + bin_mid = (bin_max - bin_min) / 2 + bin_min |
| 1347 | + |
| 1348 | + # Use the result from this iteration as the maximum for the next. |
| 1349 | + bin_max = bin_mid |
| 1350 | + start = max(1, loc - bin_mid + 1) |
| 1351 | + finish = min(loc + bin_mid, len(text)) + len(pattern) |
| 1352 | + |
| 1353 | + rd = range(finish + 1) |
| 1354 | + rd.append((1 << d) - 1) |
| 1355 | + for j in xrange(finish, start - 1, -1): |
| 1356 | + if len(text) <= j - 1: |
| 1357 | + # Out of range. |
| 1358 | + charMatch = 0 |
| 1359 | + else: |
| 1360 | + charMatch = s.get(text[j - 1], 0) |
| 1361 | + if d == 0: # First pass: exact match. |
| 1362 | + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch |
| 1363 | + else: # Subsequent passes: fuzzy match. |
| 1364 | + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | ( |
| 1365 | + ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] |
| 1366 | + if rd[j] & matchmask: |
| 1367 | + score = match_bitapScore(d, j - 1) |
| 1368 | + # This match will almost certainly be better than any existing match. |
| 1369 | + # But check anyway. |
| 1370 | + if score <= score_threshold: |
| 1371 | + # Told you so. |
| 1372 | + score_threshold = score |
| 1373 | + best_loc = j - 1 |
| 1374 | + if best_loc > loc: |
| 1375 | + # When passing loc, don't exceed our current distance from loc. |
| 1376 | + start = max(1, 2 * loc - best_loc) |
| 1377 | + else: |
| 1378 | + # Already passed loc, downhill from here on in. |
| 1379 | + break |
| 1380 | + # No hope for a (better) match at greater error levels. |
| 1381 | + if match_bitapScore(d + 1, loc) > score_threshold: |
| 1382 | + break |
| 1383 | + last_rd = rd |
| 1384 | + return best_loc |
| 1385 | + |
| 1386 | + def match_alphabet(self, pattern): |
| 1387 | + """Initialise the alphabet for the Bitap algorithm. |
| 1388 | + |
| 1389 | + Args: |
| 1390 | + pattern: The text to encode. |
| 1391 | + |
| 1392 | + Returns: |
| 1393 | + Hash of character locations. |
| 1394 | + """ |
| 1395 | + s = {} |
| 1396 | + for char in pattern: |
| 1397 | + s[char] = 0 |
| 1398 | + for i in xrange(len(pattern)): |
| 1399 | + s[pattern[i]] |= 1 << (len(pattern) - i - 1) |
| 1400 | + return s |
| 1401 | + |
| 1402 | + # PATCH FUNCTIONS |
| 1403 | + |
| 1404 | + def patch_addContext(self, patch, text): |
| 1405 | + """Increase the context until it is unique, |
| 1406 | + but don't let the pattern expand beyond Match_MaxBits. |
| 1407 | + |
| 1408 | + Args: |
| 1409 | + patch: The patch to grow. |
| 1410 | + text: Source text. |
| 1411 | + """ |
| 1412 | + if len(text) == 0: |
| 1413 | + return |
| 1414 | + pattern = text[patch.start2 : patch.start2 + patch.length1] |
| 1415 | + padding = 0 |
| 1416 | + |
| 1417 | + # Look for the first and last matches of pattern in text. If two different |
| 1418 | + # matches are found, increase the pattern length. |
| 1419 | + while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == |
| 1420 | + 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - |
| 1421 | + self.Patch_Margin)): |
| 1422 | + padding += self.Patch_Margin |
| 1423 | + pattern = text[max(0, patch.start2 - padding) : |
| 1424 | + patch.start2 + patch.length1 + padding] |
| 1425 | + # Add one chunk for good luck. |
| 1426 | + padding += self.Patch_Margin |
| 1427 | + |
| 1428 | + # Add the prefix. |
| 1429 | + prefix = text[max(0, patch.start2 - padding) : patch.start2] |
| 1430 | + if prefix: |
| 1431 | + patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] |
| 1432 | + # Add the suffix. |
| 1433 | + suffix = text[patch.start2 + patch.length1 : |
| 1434 | + patch.start2 + patch.length1 + padding] |
| 1435 | + if suffix: |
| 1436 | + patch.diffs.append((self.DIFF_EQUAL, suffix)) |
| 1437 | + |
| 1438 | + # Roll back the start points. |
| 1439 | + patch.start1 -= len(prefix) |
| 1440 | + patch.start2 -= len(prefix) |
| 1441 | + # Extend lengths. |
| 1442 | + patch.length1 += len(prefix) + len(suffix) |
| 1443 | + patch.length2 += len(prefix) + len(suffix) |
| 1444 | + |
| 1445 | + def patch_make(self, a, b=None, c=None): |
| 1446 | + """Compute a list of patches to turn text1 into text2. |
| 1447 | + Use diffs if provided, otherwise compute it ourselves. |
| 1448 | + There are four ways to call this function, depending on what data is |
| 1449 | + available to the caller: |
| 1450 | + Method 1: |
| 1451 | + a = text1, b = text2 |
| 1452 | + Method 2: |
| 1453 | + a = diffs |
| 1454 | + Method 3 (optimal): |
| 1455 | + a = text1, b = diffs |
| 1456 | + Method 4 (deprecated, use method 3): |
| 1457 | + a = text1, b = text2, c = diffs |
| 1458 | + |
| 1459 | + Args: |
| 1460 | + a: text1 (methods 1,3,4) or Array of diff tuples for text1 to |
| 1461 | + text2 (method 2). |
| 1462 | + b: text2 (methods 1,4) or Array of diff tuples for text1 to |
| 1463 | + text2 (method 3) or undefined (method 2). |
| 1464 | + c: Array of diff tuples for text1 to text2 (method 4) or |
| 1465 | + undefined (methods 1,2,3). |
| 1466 | + |
| 1467 | + Returns: |
| 1468 | + Array of patch objects. |
| 1469 | + """ |
| 1470 | + text1 = None |
| 1471 | + diffs = None |
| 1472 | + # Note that texts may arrive as 'str' or 'unicode'. |
| 1473 | + if isinstance(a, basestring) and isinstance(b, basestring) and c is None: |
| 1474 | + # Method 1: text1, text2 |
| 1475 | + # Compute diffs from text1 and text2. |
| 1476 | + text1 = a |
| 1477 | + diffs = self.diff_main(text1, b, True) |
| 1478 | + if len(diffs) > 2: |
| 1479 | + self.diff_cleanupSemantic(diffs) |
| 1480 | + self.diff_cleanupEfficiency(diffs) |
| 1481 | + elif isinstance(a, list) and b is None and c is None: |
| 1482 | + # Method 2: diffs |
| 1483 | + # Compute text1 from diffs. |
| 1484 | + diffs = a |
| 1485 | + text1 = self.diff_text1(diffs) |
| 1486 | + elif isinstance(a, basestring) and isinstance(b, list) and c is None: |
| 1487 | + # Method 3: text1, diffs |
| 1488 | + text1 = a |
| 1489 | + diffs = b |
| 1490 | + elif (isinstance(a, basestring) and isinstance(b, basestring) and |
| 1491 | + isinstance(c, list)): |
| 1492 | + # Method 4: text1, text2, diffs |
| 1493 | + # text2 is not used. |
| 1494 | + text1 = a |
| 1495 | + diffs = c |
| 1496 | + else: |
| 1497 | + raise ValueError("Unknown call format to patch_make.") |
| 1498 | + |
| 1499 | + if not diffs: |
| 1500 | + return [] # Get rid of the None case. |
| 1501 | + patches = [] |
| 1502 | + patch = patch_obj() |
| 1503 | + char_count1 = 0 # Number of characters into the text1 string. |
| 1504 | + char_count2 = 0 # Number of characters into the text2 string. |
| 1505 | + prepatch_text = text1 # Recreate the patches to determine context info. |
| 1506 | + postpatch_text = text1 |
| 1507 | + for x in xrange(len(diffs)): |
| 1508 | + (diff_type, diff_text) = diffs[x] |
| 1509 | + if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: |
| 1510 | + # A new patch starts here. |
| 1511 | + patch.start1 = char_count1 |
| 1512 | + patch.start2 = char_count2 |
| 1513 | + if diff_type == self.DIFF_INSERT: |
| 1514 | + # Insertion |
| 1515 | + patch.diffs.append(diffs[x]) |
| 1516 | + patch.length2 += len(diff_text) |
| 1517 | + postpatch_text = (postpatch_text[:char_count2] + diff_text + |
| 1518 | + postpatch_text[char_count2:]) |
| 1519 | + elif diff_type == self.DIFF_DELETE: |
| 1520 | + # Deletion. |
| 1521 | + patch.length1 += len(diff_text) |
| 1522 | + patch.diffs.append(diffs[x]) |
| 1523 | + postpatch_text = (postpatch_text[:char_count2] + |
| 1524 | + postpatch_text[char_count2 + len(diff_text):]) |
| 1525 | + elif (diff_type == self.DIFF_EQUAL and |
| 1526 | + len(diff_text) <= 2 * self.Patch_Margin and |
| 1527 | + len(patch.diffs) != 0 and len(diffs) != x + 1): |
| 1528 | + # Small equality inside a patch. |
| 1529 | + patch.diffs.append(diffs[x]) |
| 1530 | + patch.length1 += len(diff_text) |
| 1531 | + patch.length2 += len(diff_text) |
| 1532 | + |
| 1533 | + if (diff_type == self.DIFF_EQUAL and |
| 1534 | + len(diff_text) >= 2 * self.Patch_Margin): |
| 1535 | + # Time for a new patch. |
| 1536 | + if len(patch.diffs) != 0: |
| 1537 | + self.patch_addContext(patch, prepatch_text) |
| 1538 | + patches.append(patch) |
| 1539 | + patch = patch_obj() |
| 1540 | + # Unlike Unidiff, our patch lists have a rolling context. |
| 1541 | + # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff |
| 1542 | + # Update prepatch text & pos to reflect the application of the |
| 1543 | + # just completed patch. |
| 1544 | + prepatch_text = postpatch_text |
| 1545 | + char_count1 = char_count2 |
| 1546 | + |
| 1547 | + # Update the current character count. |
| 1548 | + if diff_type != self.DIFF_INSERT: |
| 1549 | + char_count1 += len(diff_text) |
| 1550 | + if diff_type != self.DIFF_DELETE: |
| 1551 | + char_count2 += len(diff_text) |
| 1552 | + |
| 1553 | + # Pick up the leftover patch if not empty. |
| 1554 | + if len(patch.diffs) != 0: |
| 1555 | + self.patch_addContext(patch, prepatch_text) |
| 1556 | + patches.append(patch) |
| 1557 | + return patches |
| 1558 | + |
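A minimal usage sketch of the call formats above (not part of the committed file; the instance name dmp and the sample texts are illustrative):

    dmp = diff_match_patch()
    # Method 1: compute the diffs internally from two texts.
    patches = dmp.patch_make("The quick brown fox", "The slow brown fox")
    # Method 3 (optimal): reuse diffs that were already computed.
    diffs = dmp.diff_main("The quick brown fox", "The slow brown fox")
    patches = dmp.patch_make("The quick brown fox", diffs)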
| 1559 | + def patch_deepCopy(self, patches): |
| 1560 | + """Given an array of patches, return another array that is identical. |
| 1561 | + |
| 1562 | + Args: |
| 1563 | + patches: Array of patch objects. |
| 1564 | + |
| 1565 | + Returns: |
| 1566 | + Array of patch objects. |
| 1567 | + """ |
| 1568 | + patchesCopy = [] |
| 1569 | + for patch in patches: |
| 1570 | + patchCopy = patch_obj() |
| 1571 | + # No need to deep copy the tuples since they are immutable. |
| 1572 | + patchCopy.diffs = patch.diffs[:] |
| 1573 | + patchCopy.start1 = patch.start1 |
| 1574 | + patchCopy.start2 = patch.start2 |
| 1575 | + patchCopy.length1 = patch.length1 |
| 1576 | + patchCopy.length2 = patch.length2 |
| 1577 | + patchesCopy.append(patchCopy) |
| 1578 | + return patchesCopy |
| 1579 | + |
| 1580 | + def patch_apply(self, patches, text): |
| 1581 | + """Merge a set of patches onto the text. Return a patched text, as well |
| 1582 | + as a list of true/false values indicating which patches were applied. |
| 1583 | + |
| 1584 | + Args: |
| 1585 | + patches: Array of patch objects. |
| 1586 | + text: Old text. |
| 1587 | + |
| 1588 | + Returns: |
| 1589 | + Two element Array, containing the new text and an array of boolean values. |
| 1590 | + """ |
| 1591 | + if not patches: |
| 1592 | + return (text, []) |
| 1593 | + |
| 1594 | + # Deep copy the patches so that no changes are made to originals. |
| 1595 | + patches = self.patch_deepCopy(patches) |
| 1596 | + |
| 1597 | + nullPadding = self.patch_addPadding(patches) |
| 1598 | + text = nullPadding + text + nullPadding |
| 1599 | + self.patch_splitMax(patches) |
| 1600 | + |
| 1601 | + # delta keeps track of the offset between the expected and actual location |
| 1602 | + # of the previous patch. If there are patches expected at positions 10 and |
| 1603 | + # 20, but the first patch was found at 12, delta is 2 and the second patch |
| 1604 | + # has an effective expected position of 22. |
| 1605 | + delta = 0 |
| 1606 | + results = [] |
| 1607 | + for patch in patches: |
| 1608 | + expected_loc = patch.start2 + delta |
| 1609 | + text1 = self.diff_text1(patch.diffs) |
| 1610 | + end_loc = -1 |
| 1611 | + if len(text1) > self.Match_MaxBits: |
| 1612 | + # patch_splitMax will only provide an oversized pattern in the case of |
| 1613 | + # a monster delete. |
| 1614 | + start_loc = self.match_main(text, text1[:self.Match_MaxBits], |
| 1615 | + expected_loc) |
| 1616 | + if start_loc != -1: |
| 1617 | + end_loc = self.match_main(text, text1[-self.Match_MaxBits:], |
| 1618 | + expected_loc + len(text1) - self.Match_MaxBits) |
| 1619 | + if end_loc == -1 or start_loc >= end_loc: |
| 1620 | + # Can't find valid trailing context. Drop this patch. |
| 1621 | + start_loc = -1 |
| 1622 | + else: |
| 1623 | + start_loc = self.match_main(text, text1, expected_loc) |
| 1624 | + if start_loc == -1: |
| 1625 | + # No match found. :( |
| 1626 | + results.append(False) |
| 1627 | + # Subtract the delta for this failed patch from subsequent patches. |
| 1628 | + delta -= patch.length2 - patch.length1 |
| 1629 | + else: |
| 1630 | + # Found a match. :) |
| 1631 | + results.append(True) |
| 1632 | + delta = start_loc - expected_loc |
| 1633 | + if end_loc == -1: |
| 1634 | + text2 = text[start_loc : start_loc + len(text1)] |
| 1635 | + else: |
| 1636 | + text2 = text[start_loc : end_loc + self.Match_MaxBits] |
| 1637 | + if text1 == text2: |
| 1638 | + # Perfect match, just shove the replacement text in. |
| 1639 | + text = (text[:start_loc] + self.diff_text2(patch.diffs) + |
| 1640 | + text[start_loc + len(text1):]) |
| 1641 | + else: |
| 1642 | + # Imperfect match. |
| 1643 | + # Run a diff to get a framework of equivalent indices. |
| 1644 | + diffs = self.diff_main(text1, text2, False) |
| 1645 | + if (len(text1) > self.Match_MaxBits and |
| 1646 | + self.diff_levenshtein(diffs) / float(len(text1)) > |
| 1647 | + self.Patch_DeleteThreshold): |
| 1648 | + # The end points match, but the content is unacceptably bad. |
| 1649 | + results[-1] = False |
| 1650 | + else: |
| 1651 | + self.diff_cleanupSemanticLossless(diffs) |
| 1652 | + index1 = 0 |
| 1653 | + for (op, data) in patch.diffs: |
| 1654 | + if op != self.DIFF_EQUAL: |
| 1655 | + index2 = self.diff_xIndex(diffs, index1) |
| 1656 | + if op == self.DIFF_INSERT: # Insertion |
| 1657 | + text = text[:start_loc + index2] + data + text[start_loc + |
| 1658 | + index2:] |
| 1659 | + elif op == self.DIFF_DELETE: # Deletion |
| 1660 | + text = text[:start_loc + index2] + text[start_loc + |
| 1661 | + self.diff_xIndex(diffs, index1 + len(data)):] |
| 1662 | + if op != self.DIFF_DELETE: |
| 1663 | + index1 += len(data) |
| 1664 | + # Strip the padding off. |
| 1665 | + text = text[len(nullPadding):-len(nullPadding)] |
| 1666 | + return (text, results) |
| 1667 | + |
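Continuing that sketch, patch_apply returns the patched text plus one boolean per patch; the expected values shown in comments are illustrative:

    new_text, results = dmp.patch_apply(patches, "The quick brown fox")
    # new_text == "The slow brown fox", results == [True]
    # The fuzzy matcher tolerates some drift in the base text:
    new_text, results = dmp.patch_apply(patches, "The very quick brown fox")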
| 1668 | + def patch_addPadding(self, patches): |
| 1669 | + """Add some padding on text start and end so that edges can match |
| 1670 | + something. Intended to be called only from within patch_apply. |
| 1671 | + |
| 1672 | + Args: |
| 1673 | + patches: Array of patch objects. |
| 1674 | + |
| 1675 | + Returns: |
| 1676 | + The padding string added to each side. |
| 1677 | + """ |
| 1678 | + paddingLength = self.Patch_Margin |
| 1679 | + nullPadding = "" |
| 1680 | + for x in xrange(1, paddingLength + 1): |
| 1681 | + nullPadding += chr(x) |
| 1682 | + |
| 1683 | + # Bump all the patches forward. |
| 1684 | + for patch in patches: |
| 1685 | + patch.start1 += paddingLength |
| 1686 | + patch.start2 += paddingLength |
| 1687 | + |
| 1688 | + # Add some padding on start of first diff. |
| 1689 | + patch = patches[0] |
| 1690 | + diffs = patch.diffs |
| 1691 | + if not diffs or diffs[0][0] != self.DIFF_EQUAL: |
| 1692 | + # Add nullPadding equality. |
| 1693 | + diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) |
| 1694 | + patch.start1 -= paddingLength # Should be 0. |
| 1695 | + patch.start2 -= paddingLength # Should be 0. |
| 1696 | + patch.length1 += paddingLength |
| 1697 | + patch.length2 += paddingLength |
| 1698 | + elif paddingLength > len(diffs[0][1]): |
| 1699 | + # Grow first equality. |
| 1700 | + extraLength = paddingLength - len(diffs[0][1]) |
| 1701 | + newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] |
| 1702 | + diffs[0] = (diffs[0][0], newText) |
| 1703 | + patch.start1 -= extraLength |
| 1704 | + patch.start2 -= extraLength |
| 1705 | + patch.length1 += extraLength |
| 1706 | + patch.length2 += extraLength |
| 1707 | + |
| 1708 | + # Add some padding on end of last diff. |
| 1709 | + patch = patches[-1] |
| 1710 | + diffs = patch.diffs |
| 1711 | + if not diffs or diffs[-1][0] != self.DIFF_EQUAL: |
| 1712 | + # Add nullPadding equality. |
| 1713 | + diffs.append((self.DIFF_EQUAL, nullPadding)) |
| 1714 | + patch.length1 += paddingLength |
| 1715 | + patch.length2 += paddingLength |
| 1716 | + elif paddingLength > len(diffs[-1][1]): |
| 1717 | + # Grow last equality. |
| 1718 | + extraLength = paddingLength - len(diffs[-1][1]) |
| 1719 | + newText = diffs[-1][1] + nullPadding[:extraLength] |
| 1720 | + diffs[-1] = (diffs[-1][0], newText) |
| 1721 | + patch.length1 += extraLength |
| 1722 | + patch.length2 += extraLength |
| 1723 | + |
| 1724 | + return nullPadding |
| 1725 | + |
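A note on the padding itself: assuming the default Patch_Margin of 4 set earlier in the class, the padding string is four low control characters, chosen because they almost never occur in real text, so the edges have something unique to match against:

    # Equivalent to the loop above with paddingLength == 4 (assumed default).
    nullPadding = "".join(chr(x) for x in xrange(1, 5))  # '\x01\x02\x03\x04'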
| 1726 | + def patch_splitMax(self, patches): |
| 1727 | + """Look through the patches and break up any which are longer than the |
| 1728 | + maximum limit of the match algorithm. |
| 1729 | + Intended to be called only from within patch_apply. |
| 1730 | + |
| 1731 | + Args: |
| 1732 | + patches: Array of patch objects. |
| 1733 | + """ |
| 1734 | + patch_size = self.Match_MaxBits |
| 1735 | + if patch_size == 0: |
| 1736 | + # Python has the option of not splitting strings due to its ability |
| 1737 | + # to handle integers of arbitrary precision. |
| 1738 | + return |
| 1739 | + for x in xrange(len(patches)): |
| 1740 | + if patches[x].length1 > patch_size: |
| 1741 | + bigpatch = patches[x] |
| 1742 | + # Remove the big old patch. |
| 1743 | + del patches[x] |
| 1744 | + x -= 1 |
| 1745 | + start1 = bigpatch.start1 |
| 1746 | + start2 = bigpatch.start2 |
| 1747 | + precontext = '' |
| 1748 | + while len(bigpatch.diffs) != 0: |
| 1749 | + # Create one of several smaller patches. |
| 1750 | + patch = patch_obj() |
| 1751 | + empty = True |
| 1752 | + patch.start1 = start1 - len(precontext) |
| 1753 | + patch.start2 = start2 - len(precontext) |
| 1754 | + if precontext: |
| 1755 | + patch.length1 = patch.length2 = len(precontext) |
| 1756 | + patch.diffs.append((self.DIFF_EQUAL, precontext)) |
| 1757 | + |
| 1758 | + while (len(bigpatch.diffs) != 0 and |
| 1759 | + patch.length1 < patch_size - self.Patch_Margin): |
| 1760 | + (diff_type, diff_text) = bigpatch.diffs[0] |
| 1761 | + if diff_type == self.DIFF_INSERT: |
| 1762 | + # Insertions are harmless. |
| 1763 | + patch.length2 += len(diff_text) |
| 1764 | + start2 += len(diff_text) |
| 1765 | + patch.diffs.append(bigpatch.diffs.pop(0)) |
| 1766 | + empty = False |
| 1767 | + elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and |
| 1768 | + patch.diffs[0][0] == self.DIFF_EQUAL and |
| 1769 | + len(diff_text) > 2 * patch_size): |
| 1770 | + # This is a large deletion. Let it pass in one chunk. |
| 1771 | + patch.length1 += len(diff_text) |
| 1772 | + start1 += len(diff_text) |
| 1773 | + empty = False |
| 1774 | + patch.diffs.append((diff_type, diff_text)) |
| 1775 | + del bigpatch.diffs[0] |
| 1776 | + else: |
| 1777 | + # Deletion or equality. Only take as much as we can stomach. |
| 1778 | + diff_text = diff_text[:patch_size - patch.length1 - |
| 1779 | + self.Patch_Margin] |
| 1780 | + patch.length1 += len(diff_text) |
| 1781 | + start1 += len(diff_text) |
| 1782 | + if diff_type == self.DIFF_EQUAL: |
| 1783 | + patch.length2 += len(diff_text) |
| 1784 | + start2 += len(diff_text) |
| 1785 | + else: |
| 1786 | + empty = False |
| 1787 | + |
| 1788 | + patch.diffs.append((diff_type, diff_text)) |
| 1789 | + if diff_text == bigpatch.diffs[0][1]: |
| 1790 | + del bigpatch.diffs[0] |
| 1791 | + else: |
| 1792 | + bigpatch.diffs[0] = (bigpatch.diffs[0][0], |
| 1793 | + bigpatch.diffs[0][1][len(diff_text):]) |
| 1794 | + |
| 1795 | + # Compute the head context for the next patch. |
| 1796 | + precontext = self.diff_text2(patch.diffs) |
| 1797 | + precontext = precontext[-self.Patch_Margin:] |
| 1798 | + # Append the end context for this patch. |
| 1799 | + postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin] |
| 1800 | + if postcontext: |
| 1801 | + patch.length1 += len(postcontext) |
| 1802 | + patch.length2 += len(postcontext) |
| 1803 | + if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: |
| 1804 | + patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] + |
| 1805 | + postcontext) |
| 1806 | + else: |
| 1807 | + patch.diffs.append((self.DIFF_EQUAL, postcontext)) |
| 1808 | + |
| 1809 | + if not empty: |
| 1810 | + x += 1 |
| 1811 | + patches.insert(x, patch) |
| 1812 | + |
| 1813 | + def patch_toText(self, patches): |
| 1814 | + """Take a list of patches and return a textual representation. |
| 1815 | + |
| 1816 | + Args: |
| 1817 | + patches: Array of patch objects. |
| 1818 | + |
| 1819 | + Returns: |
| 1820 | + Text representation of patches. |
| 1821 | + """ |
| 1822 | + text = [] |
| 1823 | + for patch in patches: |
| 1824 | + text.append(str(patch)) |
| 1825 | + return "".join(text) |
| 1826 | + |
| 1827 | + def patch_fromText(self, textline): |
| 1828 | + """Parse a textual representation of patches and return a list of patch |
| 1829 | + objects. |
| 1830 | + |
| 1831 | + Args: |
| 1832 | + textline: Text representation of patches. |
| 1833 | + |
| 1834 | + Returns: |
| 1835 | + Array of patch objects. |
| 1836 | + |
| 1837 | + Raises: |
| 1838 | + ValueError: If invalid input. |
| 1839 | + """ |
| 1840 | + if type(textline) == unicode: |
| 1841 | + # Patches should be composed of a subset of ascii chars, Unicode not |
| 1842 | + # required. If this encode raises UnicodeEncodeError, patch is invalid. |
| 1843 | + textline = textline.encode("ascii") |
| 1844 | + patches = [] |
| 1845 | + if not textline: |
| 1846 | + return patches |
| 1847 | + text = textline.split('\n') |
| 1848 | + while len(text) != 0: |
| 1849 | + m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) |
| 1850 | + if not m: |
| 1851 | + raise ValueError("Invalid patch string: " + text[0]) |
| 1852 | + patch = patch_obj() |
| 1853 | + patches.append(patch) |
| 1854 | + patch.start1 = int(m.group(1)) |
| 1855 | + if m.group(2) == '': |
| 1856 | + patch.start1 -= 1 |
| 1857 | + patch.length1 = 1 |
| 1858 | + elif m.group(2) == '0': |
| 1859 | + patch.length1 = 0 |
| 1860 | + else: |
| 1861 | + patch.start1 -= 1 |
| 1862 | + patch.length1 = int(m.group(2)) |
| 1863 | + |
| 1864 | + patch.start2 = int(m.group(3)) |
| 1865 | + if m.group(4) == '': |
| 1866 | + patch.start2 -= 1 |
| 1867 | + patch.length2 = 1 |
| 1868 | + elif m.group(4) == '0': |
| 1869 | + patch.length2 = 0 |
| 1870 | + else: |
| 1871 | + patch.start2 -= 1 |
| 1872 | + patch.length2 = int(m.group(4)) |
| 1873 | + |
| 1874 | + del text[0] |
| 1875 | + |
| 1876 | + while len(text) != 0: |
| 1877 | + if text[0]: |
| 1878 | + sign = text[0][0] |
| 1879 | + else: |
| 1880 | + sign = '' |
| 1881 | + line = urllib.unquote(text[0][1:]) |
| 1882 | + line = line.decode("utf-8") |
| 1883 | + if sign == '+': |
| 1884 | + # Insertion. |
| 1885 | + patch.diffs.append((self.DIFF_INSERT, line)) |
| 1886 | + elif sign == '-': |
| 1887 | + # Deletion. |
| 1888 | + patch.diffs.append((self.DIFF_DELETE, line)) |
| 1889 | + elif sign == ' ': |
| 1890 | + # Minor equality. |
| 1891 | + patch.diffs.append((self.DIFF_EQUAL, line)) |
| 1892 | + elif sign == '@': |
| 1893 | + # Start of next patch. |
| 1894 | + break |
| 1895 | + elif sign == '': |
| 1896 | + # Blank line? Whatever. |
| 1897 | + pass |
| 1898 | + else: |
| 1899 | + # Unrecognized sign. |
| 1900 | + raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) |
| 1901 | + del text[0] |
| 1902 | + return patches |
| 1903 | + |
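The two serializers round-trip, which the sketch below relies on (illustrative; assumes patches came from patch_make as above):

    serialized = dmp.patch_toText(patches)
    restored = dmp.patch_fromText(serialized)
    assert dmp.patch_toText(restored) == serialized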
| 1904 | + |
| 1905 | +class patch_obj: |
| 1906 | + """Class representing one patch operation. |
| 1907 | + """ |
| 1908 | + |
| 1909 | + def __init__(self): |
| 1910 | + """Initializes with an empty list of diffs. |
| 1911 | + """ |
| 1912 | + self.diffs = [] |
| 1913 | + self.start1 = None |
| 1914 | + self.start2 = None |
| 1915 | + self.length1 = 0 |
| 1916 | + self.length2 = 0 |
| 1917 | + |
| 1918 | + def __str__(self): |
| 1919 | + """Emmulate GNU diff's format. |
| 1920 | + Header: @@ -382,8 +481,9 @@ |
| 1921 | + Indices are printed as 1-based, not 0-based. |
| 1922 | + |
| 1923 | + Returns: |
| 1924 | + The GNU diff string. |
| 1925 | + """ |
| 1926 | + if self.length1 == 0: |
| 1927 | + coords1 = str(self.start1) + ",0" |
| 1928 | + elif self.length1 == 1: |
| 1929 | + coords1 = str(self.start1 + 1) |
| 1930 | + else: |
| 1931 | + coords1 = str(self.start1 + 1) + "," + str(self.length1) |
| 1932 | + if self.length2 == 0: |
| 1933 | + coords2 = str(self.start2) + ",0" |
| 1934 | + elif self.length2 == 1: |
| 1935 | + coords2 = str(self.start2 + 1) |
| 1936 | + else: |
| 1937 | + coords2 = str(self.start2 + 1) + "," + str(self.length2) |
| 1938 | + text = ["@@ -", coords1, " +", coords2, " @@\n"] |
| 1939 | + # Escape the body of the patch with %xx notation. |
| 1940 | + for (op, data) in self.diffs: |
| 1941 | + if op == diff_match_patch.DIFF_INSERT: |
| 1942 | + text.append("+") |
| 1943 | + elif op == diff_match_patch.DIFF_DELETE: |
| 1944 | + text.append("-") |
| 1945 | + elif op == diff_match_patch.DIFF_EQUAL: |
| 1946 | + text.append(" ") |
| 1947 | + # High ascii will raise UnicodeDecodeError. Use Unicode instead. |
| 1948 | + data = data.encode("utf-8") |
| 1949 | + text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n") |
| 1950 | + return "".join(text) |
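To make the 1-based header concrete, a short sketch (not part of the committed file; values chosen to reproduce the docstring's example):

    p = patch_obj()
    p.start1, p.length1 = 381, 8
    p.start2, p.length2 = 480, 9
    print str(p).splitlines()[0]   # @@ -382,8 +481,9 @@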
Property changes on: trunk/tools/wsor/diffs/diff_match_patch.py |
___________________________________________________________________ |
Added: svn:executable |
1 | 1951 | + * |
Index: trunk/tools/wsor/diffs/page_sample.xml |
— | — | @@ -0,0 +1,37 @@ |
| 2 | + <page> |
| 3 | + <title>Bassist</title> |
| 4 | + <id>60001</id> |
| 5 | + <revision> |
| 6 | + <id>108204</id> |
| 7 | + <timestamp>2002-06-30T02:03:23Z</timestamp> |
| 8 | + <contributor> |
| 9 | + <ip>195.149.37.198</ip> |
| 10 | + </contributor> |
| 11 | + <minor /> |
| 12 | + <comment>stub</comment> |
| 13 | + <text xml:space="preserve">A <b>bassist</b> is somebody who plays a [[bass guitar]] or [[double bass]].</text> |
| 14 | + </revision> |
| 15 | + <revision> |
| 16 | + <id>208937</id> |
| 17 | + <timestamp>2002-06-30T16:00:41Z</timestamp> |
| 18 | + <contributor> |
| 19 | + <username>JeLuF</username> |
| 20 | + <id>733</id> |
| 21 | + </contributor> |
| 22 | + <comment>added list</comment> |
| 23 | + <text xml:space="preserve">A <b>bassist</b> is somebody who plays a [[bass guitar]] or [[double bass]]. |
| 24 | + |
| 25 | +Famous bassists include: |
| 26 | +* [[Ron Carter]] |
| 27 | +* [[Les Claypool]] from [[Primus]] |
| 28 | +* [[John Entwistle]] from [[The Who]] |
| 29 | +* [[Kelly Grouchet]] from [[Electric Light Orchestra]] |
| 30 | +* [[Glenn Hughes]] from [[Deep Purple]] |
| 31 | +* [[Lemmy Kilmister]] from [[Motorhead]] |
| 32 | +* Sir [[Paul McCartney]] from [[The Beatles]] |
| 33 | +* [[Charles Mingus]] |
| 34 | +* [[Jason Newsted]] from [[Metallica]] |
| 35 | +* [[Sting]] from [[The Police]] |
| 36 | +* [[Leon Wilkeson]] from [[Lynyrd Skynyrd]]</text> |
| 37 | + </revision> |
| 38 | + </page> |