r86815 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86814‎ | r86815 | r86816 >
Date:14:26, 24 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added past revert information as new variables
Modified paths:
  • /trunk/tools/editor_trends/classes/buffer.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/variables.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/variables.py
@@ -235,6 +235,25 @@
236236 return 999
237237
238238
 239+def store_revert_information(hash, revision_id, contributor, reverts):
 240+ hash = hash['hash']
 241+ reverts.setdefault(hash, {})
 242+ reverts[hash]['revision_id'] = revision_id
 243+ reverts[hash]['contributor'] = contributor
 244+ return reverts
 245+
 246+
 247+def determine_past_revert(hash, revert, reverts):
 248+ past_revert = {}
 249+ hash = hash['hash']
 250+ if revert['revert'] == 1:
 251+ past_revert['reverted_revision_id'] = reverts[hash]['revision_id']
 252+ past_revert['reverted_contributor'] = reverst[hash]['contributor']
 253+ else:
 254+ past_revert['reverted_revision_id'] = -1
 255+ past_revert['reverted_contributor'] = -1
 256+ return past_revert
 257+
239258 def is_revision_reverted(hash_cur, hashes):
240259 '''
241260 Determine whether an edit was reverted or not based on md5 hashes
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -43,7 +43,7 @@
4444 from classes import buffer
4545 from analyses.adhoc import bot_detector
4646
47 -def parse_revision(revision, article, xml_namespace, cache, bots, md5hashes, size):
 47+def parse_revision(revision, article, xml_namespace, cache, bots, md5hashes, size, reverts):
4848 '''
4949 This function has as input a single revision from a Wikipedia dump file,
5050 article id it belongs to, the xml_namespace of the Wikipedia dump file,
@@ -54,19 +54,19 @@
5555 if revision == None:
5656 #the entire revision is empty, weird.
5757 #dump(revision)
58 - return md5hashes, size
 58+ return md5hashes, size, reverts
5959
6060 contributor = revision.find('%s%s' % (xml_namespace, 'contributor'))
6161 contributor = variables.parse_contributor(contributor, bots, xml_namespace)
6262 if not contributor:
6363 #editor is anonymous, ignore
64 - return md5hashes, size
 64+ return md5hashes, size, reverts
6565
6666 revision_id = revision.find('%s%s' % (xml_namespace, 'id'))
6767 revision_id = variables.extract_revision_id(revision_id)
6868 if revision_id == None:
6969 #revision_id is missing, which is weird
70 - return md5hashes, size
 70+ return md5hashes, size, reverts
7171
7272 article['revision_id'] = revision_id
7373 text = variables.extract_revision_text(revision, xml_namespace)
@@ -80,14 +80,17 @@
8181
8282 hash = variables.create_md5hash(text)
8383 revert = variables.is_revision_reverted(hash['hash'], md5hashes)
 84+ reverts = variables.store_revert_information(hash, revision_id, contributor, reverts)
 85+ past_revert = variables.determine_past_revert(hash, revert, reverts)
8486 md5hashes.append(hash['hash'])
8587 size = variables.calculate_delta_article_size(size, text)
8688
8789 article.update(hash)
8890 article.update(size)
8991 article.update(revert)
 92+ article.update(past_revert)
9093 cache.add(article)
91 - return md5hashes, size
 94+ return md5hashes, size, reverts
9295
9396
9497 def parse_xml(fh, rts, cache, process_id, file_id):
@@ -108,6 +111,7 @@
109112
110113 article = {}
111114 size = {}
 115+ reverts = {}
112116 id = False
113117 ns = False
114118 parse = False
@@ -155,7 +159,7 @@
156160 if event is start:
157161 clear = False
158162 else:
159 - md5hashes, size = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size)
 163+ md5hashes, size, reverts = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size, reverts)
160164 cache.count_revisions += 1
161165 clear = True
162166 if clear:
@@ -182,6 +186,7 @@
183187 #Reset all variables for next article
184188 article = {}
185189 size = {}
 190+ reverts = {}
186191 md5hashes = deque()
187192 id = False
188193 parse = False
Index: trunk/tools/editor_trends/classes/buffer.py
@@ -81,8 +81,9 @@
8282 self.filehandles = [file_utils.create_txt_filehandle(self.rts.txt,
8383 file_id, 'a', 'utf-8') for file_id in xrange(self.rts.max_filehandles)]
8484 self.keys = ['id', 'article_id', 'revision_id', 'username', 'namespace',
85 - 'title', 'timestamp', 'hash', 'revert', 'bot', 'cur_size',
86 - 'delta']
 85+ 'title', 'timestamp', 'hash', 'revert',
 86+ 'reverted_contributor', 'reverted_revision_id', 'bot',
 87+ 'cur_size', 'delta']
8788 self.fh_articles = file_utils.create_txt_filehandle(self.rts.txt,
8889 'articles_%s' % self.process_id, 'w', 'utf-8')
8990 self.fh_comments = file_utils.create_txt_filehandle(self.rts.txt,