Index: trunk/tools/editor_trends/etl/variables.py |
— | — | @@ -235,6 +235,25 @@ |
236 | 236 | return 999 |
237 | 237 | |
238 | 238 | |
| 239 | +def store_revert_information(hash, revision_id, contributor, reverts): |
| 240 | +    '''Record revision_id and contributor in reverts under hash['hash'], replacing any earlier values, and return reverts.''' |
| 241 | +    entry = reverts.setdefault(hash['hash'], {}) |
| 242 | +    entry['revision_id'] = revision_id |
| 243 | +    entry['contributor'] = contributor |
| 244 | +    return reverts |
| 245 | + |
| 246 | + |
| 247 | +def determine_past_revert(hash, revert, reverts): |
| 248 | +    '''Return the revision id and contributor stored in reverts for hash['hash'] when revert['revert'] == 1, else -1 for both.''' |
| 249 | +    hash = hash['hash'] |
| 250 | +    past_revert = {'reverted_revision_id': -1, 'reverted_contributor': -1} |
| 251 | +    if revert['revert'] == 1: |
| 252 | +        # NOTE(review): this reads whatever entry is currently stored for the hash; |
| 253 | +        # confirm the caller has not already overwritten it with the current revision. |
| 254 | +        past_revert['reverted_revision_id'] = reverts[hash]['revision_id'] |
| 255 | +        past_revert['reverted_contributor'] = reverts[hash]['contributor'] |
| 256 | +    return past_revert |
| 257 | + |
239 | 258 | def is_revision_reverted(hash_cur, hashes): |
240 | 259 | ''' |
241 | 260 | Determine whether an edit was reverted or not based on md5 hashes |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -43,7 +43,7 @@ |
44 | 44 | from classes import buffer
|
45 | 45 | from analyses.adhoc import bot_detector
|
46 | 46 |
|
47 | | -def parse_revision(revision, article, xml_namespace, cache, bots, md5hashes, size):
|
| 47 | +def parse_revision(revision, article, xml_namespace, cache, bots, md5hashes, size, reverts):
|
48 | 48 | '''
|
49 | 49 | This function has as input a single revision from a Wikipedia dump file,
|
50 | 50 | article id it belongs to, the xml_namespace of the Wikipedia dump file,
|
— | — | @@ -54,19 +54,19 @@ |
55 | 55 | if revision == None:
|
56 | 56 | #the entire revision is empty, weird.
|
57 | 57 | #dump(revision)
|
58 | | - return md5hashes, size
|
| 58 | + return md5hashes, size, reverts
|
59 | 59 |
|
60 | 60 | contributor = revision.find('%s%s' % (xml_namespace, 'contributor'))
|
61 | 61 | contributor = variables.parse_contributor(contributor, bots, xml_namespace)
|
62 | 62 | if not contributor:
|
63 | 63 | #editor is anonymous, ignore
|
64 | | - return md5hashes, size
|
| 64 | + return md5hashes, size, reverts
|
65 | 65 |
|
66 | 66 | revision_id = revision.find('%s%s' % (xml_namespace, 'id'))
|
67 | 67 | revision_id = variables.extract_revision_id(revision_id)
|
68 | 68 | if revision_id == None:
|
69 | 69 | #revision_id is missing, which is weird
|
70 | | - return md5hashes, size
|
| 70 | + return md5hashes, size, reverts
|
71 | 71 |
|
72 | 72 | article['revision_id'] = revision_id
|
73 | 73 | text = variables.extract_revision_text(revision, xml_namespace)
|
— | — | @@ -80,14 +80,17 @@ |
81 | 81 |
|
82 | 82 | hash = variables.create_md5hash(text)
|
83 | 83 | revert = variables.is_revision_reverted(hash['hash'], md5hashes)
|
| 84 | + reverts = variables.store_revert_information(hash, revision_id, contributor, reverts)
|
| 85 | + past_revert = variables.determine_past_revert(hash, revert, reverts)
|
84 | 86 | md5hashes.append(hash['hash'])
|
85 | 87 | size = variables.calculate_delta_article_size(size, text)
|
86 | 88 |
|
87 | 89 | article.update(hash)
|
88 | 90 | article.update(size)
|
89 | 91 | article.update(revert)
|
| 92 | + article.update(past_revert)
|
90 | 93 | cache.add(article)
|
91 | | - return md5hashes, size
|
| 94 | + return md5hashes, size, reverts
|
92 | 95 |
|
93 | 96 |
|
94 | 97 | def parse_xml(fh, rts, cache, process_id, file_id):
|
— | — | @@ -108,6 +111,7 @@ |
109 | 112 |
|
110 | 113 | article = {}
|
111 | 114 | size = {}
|
| 115 | + reverts = {}
|
112 | 116 | id = False
|
113 | 117 | ns = False
|
114 | 118 | parse = False
|
— | — | @@ -155,7 +159,7 @@ |
156 | 160 | if event is start:
|
157 | 161 | clear = False
|
158 | 162 | else:
|
159 | | - md5hashes, size = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size)
|
| 163 | + md5hashes, size, reverts = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size, reverts)
|
160 | 164 | cache.count_revisions += 1
|
161 | 165 | clear = True
|
162 | 166 | if clear:
|
— | — | @@ -182,6 +186,7 @@ |
183 | 187 | #Reset all variables for next article
|
184 | 188 | article = {}
|
185 | 189 | size = {}
|
| 190 | + reverts = {}
|
186 | 191 | md5hashes = deque()
|
187 | 192 | id = False
|
188 | 193 | parse = False
|
Index: trunk/tools/editor_trends/classes/buffer.py |
— | — | @@ -81,8 +81,9 @@ |
82 | 82 | self.filehandles = [file_utils.create_txt_filehandle(self.rts.txt, |
83 | 83 | file_id, 'a', 'utf-8') for file_id in xrange(self.rts.max_filehandles)] |
84 | 84 | self.keys = ['id', 'article_id', 'revision_id', 'username', 'namespace', |
85 | | - 'title', 'timestamp', 'hash', 'revert', 'bot', 'cur_size', |
86 | | - 'delta'] |
| 85 | + 'title', 'timestamp', 'hash', 'revert', |
| 86 | + 'reverted_contributor', 'reverted_revision_id', 'bot', |
| 87 | + 'cur_size', 'delta'] |
87 | 88 | self.fh_articles = file_utils.create_txt_filehandle(self.rts.txt, |
88 | 89 | 'articles_%s' % self.process_id, 'w', 'utf-8') |
89 | 90 | self.fh_comments = file_utils.create_txt_filehandle(self.rts.txt, |