r86012 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r86011 | r86012 | r86013 >
Date: 23:49, 13 April 2011
Author: diederik
Status: deferred
Tags:
Comment:
Added progress information.
Modified paths:
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/extracter.py
@@ -137,7 +137,7 @@
138138 elif event is end and elem.tag.endswith('title'):
139139 title = variables.parse_title(elem)
140140 current_namespace = variables.determine_namespace(title, namespaces, include_ns)
141 - if current_namespace != False:
 141+ if isinstance(current_namespace, int):
142142 parse = True
143143 count_articles += 1
144144 if count_articles % 10000 == 0:
@@ -160,6 +160,9 @@
161161 id = False
162162 parse = False
163163
 164+ else:
 165+ elem.clear()
 166+
164167 except SyntaxError, error:
165168 print 'Encountered invalid XML tag. Error message: %s' % error
166169 dump(elem)
@@ -175,6 +178,7 @@
176179 fh = file_utils.create_txt_filehandle(rts.txt, filename, 'w', 'utf-8')
177180 file_utils.write_dict_to_csv(counts, fh, keys)
178181 fh.close()
 182+ counts = {}
179183
180184
181185 def parse_xml(fh, rts, cache, process_id, file_id):
@@ -304,7 +308,7 @@
305309 files = file_utils.retrieve_file_list(rts.input_location)
306310
307311 if rts.kaggle:
308 - processors = 2
 312+ processors = 4
309313 elif len(files) > cpu_count():
310314 processors = cpu_count() - 1
311315 else: