r86080 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r86079 | r86080 | r86081 >
Date: 20:06, 14 April 2011
Author: diederik
Status: deferred
Tags:
Comment:
Simplification of CSVFile class.
Modified paths:
  • /trunk/tools/editor_trends/classes/buffer.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/classes/buffer.py
@@ -59,6 +59,7 @@
6060 self.process_id = process_id
6161 self.count_articles = 0
6262 self.count_revisions = 0
 63+ self.n = 0
6364 self.filehandles = [file_utils.create_txt_filehandle(self.rts.txt,
6465 file_id, 'a', 'utf-8') for file_id in xrange(self.rts.max_filehandles)]
6566 self.keys = ['revision_id', 'article_id', 'id', 'username', 'namespace',
@@ -87,53 +88,34 @@
8889 hashes[file_id].append(editor)
8990 return hashes
9091
91 - def group_revisions_by_fileid(self):
92 - '''
93 - This function groups observation by editor id and then by file_id,
94 - this way we have to make fewer file opening calls and should reduce
95 - processing time.
96 - '''
97 - data = {}
98 - editors = {}
99 - #first, we group all revisions by editor
100 -
101 - for revision in self.revisions.values():
102 - row = []
103 - #strip away the keys and make sure that the values are always in the same sequence
104 - for key in self.keys:
105 - row.append(revision[key].decode('utf-8'))
106 - editor_id = row[0]
107 - data.setdefault(editor_id, [])
108 - data[editor_id].append(row)
109 - editors.setdefault(editor_id, self.get_hash(editor_id))
110 -
111 - #now, we are going to group all editors by file_id
112 - print editors
113 - file_ids = self.invert_dictionary(editors)
114 - print file_ids
115 - self.revisions = {}
116 - for file_id, editors in file_ids.iteritems():
117 - for editor in editors:
118 - self.revisions.setdefault(file_id, [])
119 - self.revisions[file_id].extend(data[editor])
120 - print file_id, data[editor]
121 -
12292 def add(self, revision):
123 - self.stringify(revision)
124 - id = revision['revision_id']
125 - self.revisions[id] = revision
126 - if len(self.revisions) > 10000:
 93+ revision = self.stringify(revision)
 94+ id = revision['id']
 95+ file_id = self.get_hash(id)
 96+ revision = self.simplify(revision)
 97+ self.revisions.setdefault(file_id, [])
 98+ self.revisions[file_id].append(revision)
 99+ if self.n > 10000:
127100 #print '%s: Emptying buffer %s - buffer size %s' % (datetime.datetime.now(), self.id, len(self.revisions))
128101 self.store()
 102+ else:
 103+ self.n += 1
129104
 105+ def simplify(self, revision):
 106+ row = []
 107+ for key in self.keys:
 108+ row.append(value.decode('utf-8'))
 109+ return row
130110
131111 def stringify(self, revision):
132112 for key, value in revision.iteritems():
 113+ value = revision[key]
133114 try:
134115 value = str(value)
135116 except UnicodeEncodeError:
136117 value = value.encode('utf-8')
137118 revision[key] = value
 119+ return revision
138120
139121
140122 def summary(self):
@@ -188,7 +170,7 @@
189171
190172 def write_revisions(self):
191173 #t0 = datetime.datetime.now()
192 - self.group_revisions_by_fileid()
 174+ #self.group_revisions_by_fileid()
193175 file_ids = self.revisions.keys()
194176 for file_id in file_ids:
195177 wait = True