Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -137,7 +137,7 @@ |
138 | 138 | elif event is end and elem.tag.endswith('title'):
|
139 | 139 | title = variables.parse_title(elem)
|
140 | 140 | current_namespace = variables.determine_namespace(title, namespaces, include_ns)
|
141 | | - if current_namespace != False:
|
| 141 | + if isinstance(current_namespace, int):
|
142 | 142 | parse = True
|
143 | 143 | count_articles += 1
|
144 | 144 | if count_articles % 10000 == 0:
|
— | — | @@ -160,6 +160,9 @@ |
161 | 161 | id = False
|
162 | 162 | parse = False
|
163 | 163 |
|
| 164 | + else:
|
| 165 | + elem.clear()
|
| 166 | +
|
164 | 167 | except SyntaxError, error:
|
165 | 168 | print 'Encountered invalid XML tag. Error message: %s' % error
|
166 | 169 | dump(elem)
|
— | — | @@ -175,6 +178,7 @@ |
176 | 179 | fh = file_utils.create_txt_filehandle(rts.txt, filename, 'w', 'utf-8')
|
177 | 180 | file_utils.write_dict_to_csv(counts, fh, keys)
|
178 | 181 | fh.close()
|
| 182 | + counts = {}
|
179 | 183 |
|
180 | 184 |
|
181 | 185 | def parse_xml(fh, rts, cache, process_id, file_id):
|
— | — | @@ -304,7 +308,7 @@ |
305 | 309 | files = file_utils.retrieve_file_list(rts.input_location)
|
306 | 310 |
|
307 | 311 | if rts.kaggle:
|
308 | | - processors = 2
|
| 312 | + processors = 4
|
309 | 313 | elif len(files) > cpu_count():
|
310 | 314 | processors = cpu_count() - 1
|
311 | 315 | else:
|