r79606 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r79605 | r79606 | r79607 >
Date:22:00, 4 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Greatly simplified the chunking code.
Modified paths:
  • /trunk/tools/editor_trends/etl/chunker.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/chunker.py
@@ -21,9 +21,7 @@
2222 import sys
2323 import codecs
2424 import re
25 -import json
2625 import os
27 -import random
2826
2927 import progressbar
3028
@@ -31,11 +29,7 @@
3230 sys.path.append('..')
3331 import configuration
3432 settings = configuration.Settings()
35 -
3633 from utils import utils
37 -import extract
38 -import wikitree.parser
39 -from bots import bots
4034
4135
4236 try:
@@ -75,28 +69,28 @@
7670 return element
7771
7872
79 -def load_namespace(language):
80 - file = '%s_ns.json' % language
81 - fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
82 - ns = json.load(fh)
83 - fh.close()
84 - ns = ns['query']['namespaces']
85 - return ns
 73+#def load_namespace(language):
 74+# file = '%s_ns.json' % language
 75+# fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
 76+# ns = json.load(fh)
 77+# fh.close()
 78+# ns = ns['query']['namespaces']
 79+# return ns
 80+#
 81+#
 82+#def build_namespaces_locale(namespaces, include=[0]):
 83+# '''
 84+# @include is a list of namespace keys that should not be ignored, the default
 85+# setting is to ignore all namespaces except the main namespace.
 86+# '''
 87+# ns = []
 88+# for namespace in namespaces:
 89+# if namespace not in include:
 90+# value = namespaces[namespace].get(u'*', None)
 91+# ns.append(value)
 92+# return ns
8693
8794
88 -def build_namespaces_locale(namespaces, include=[0]):
89 - '''
90 - @include is a list of namespace keys that should not be ignored, the default
91 - setting is to ignore all namespaces except the main namespace.
92 - '''
93 - ns = []
94 - for namespace in namespaces:
95 - if namespace not in include:
96 - value = namespaces[namespace].get(u'*', None)
97 - ns.append(value)
98 - return ns
99 -
100 -
10195 def parse_comments(xml, function):
10296 revisions = xml.findall('revision')
10397 for revision in revisions:
@@ -107,15 +101,15 @@
108102 return xml
109103
110104
111 -def is_article_main_namespace(elem, namespace):
112 - '''
113 - checks whether the article belongs to the main namespace
114 - '''
115 - title = elem.find('title').text
116 - for ns in namespace:
117 - if title.startswith(ns):
118 - return False
119 - return True
 105+#def is_article_main_namespace(elem, namespace):
 106+# '''
 107+# checks whether the article belongs to the main namespace
 108+# '''
 109+# title = elem.find('title').text
 110+# for ns in namespace:
 111+# if title.startswith(ns):
 112+# return False
 113+# return True
120114
121115
122116 def write_xml_file(element, fh, output, counter, format):
@@ -125,15 +119,18 @@
126120 size = len(xml_string)
127121 fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
128122 fh.write(xml_string)
129 - except MemoryError:
130 - print 'Add error capturing logic'
131 - except UnicodeEncodeError, error:
 123+ except Exception, error:
132124 print error
133 - n = random.randrange(0, 10000)
134 - f = '%s%s.bin' % ('element', n)
135 - new_file = False
136 - #if element != None:
137 - # utils.store_object(element, settings.binary_location, f)
 125+
 126+# MemoryError:
 127+# print 'Add error capturing logic'
 128+# except UnicodeEncodeError, error:
 129+# print error
 130+# n = random.randrange(0, 10000)
 131+# f = '%s%s.bin' % ('element', n)
 132+# new_file = False
 133+# #if element != None:
 134+# # utils.store_object(element, settings.binary_location, f)
138135 fh.write('\n')
139136 return fh, counter, new_file
140137
@@ -162,44 +159,44 @@
163160 return fh, counter, False
164161
165162
166 -def flatten_xml_elements(data, page, bots):
167 - headers = ['id', 'date', 'article', 'username']
168 - tags = {'contributor': {'id': extract.extract_contributor_id,
169 - 'bot': extract.determine_username_is_bot,
170 - 'username': extract.extract_username,
171 - },
172 - 'timestamp': {'date': xml.extract_text},
173 - }
174 - vars = {}
175 - flat = []
 163+#def flatten_xml_elements(data, page, bots):
 164+# headers = ['id', 'date', 'article', 'username']
 165+# tags = {'contributor': {'id': extract.extract_contributor_id,
 166+# 'bot': extract.determine_username_is_bot,
 167+# 'username': extract.extract_username,
 168+# },
 169+# 'timestamp': {'date': xml.extract_text},
 170+# }
 171+# vars = {}
 172+# flat = []
 173+#
 174+# for x, elems in enumerate(data):
 175+# vars[x] = {}
 176+# vars[x]['article'] = page
 177+# for tag in tags:
 178+# el = xml.retrieve_xml_node(elems, tag)
 179+# for function in tags[tag].keys():
 180+# f = tags[tag][function]
 181+# value = f(el, bots=bots)
 182+# if type(value) == type({}):
 183+# for kw in value:
 184+# vars[x][kw] = value[kw]
 185+# else:
 186+# vars[x][function] = value
 187+#
 188+# for x, var in enumerate(vars):
 189+# if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
 190+# continue
 191+# else:
 192+# f = []
 193+# for head in headers:
 194+# f.append(vars[x][head])
 195+# flat.append(f)
 196+#
 197+# return flat
176198
177 - for x, elems in enumerate(data):
178 - vars[x] = {}
179 - vars[x]['article'] = page
180 - for tag in tags:
181 - el = xml.retrieve_xml_node(elems, tag)
182 - for function in tags[tag].keys():
183 - f = tags[tag][function]
184 - value = f(el, bots=bots)
185 - if type(value) == type({}):
186 - for kw in value:
187 - vars[x][kw] = value[kw]
188 - else:
189 - vars[x][function] = value
190199
191 - for x, var in enumerate(vars):
192 - if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
193 - continue
194 - else:
195 - f = []
196 - for head in headers:
197 - f.append(vars[x][head])
198 - flat.append(f)
199 -
200 - return flat
201 -
202 -
203 -def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False):
 200+def split_file(location, file, project, language_code, zip=False):
204201 '''
205202 Reads xml file and splits it in N chunks
206203 @namespaces is a list indicating which namespaces should be included, default
@@ -207,11 +204,11 @@
208205 @zip indicates whether to compress the chunk or not
209206 '''
210207 input = os.path.join(location, file)
211 - if format == 'xml':
212 - output = os.path.join(location, 'chunks')
213 - else:
214 - output = os.path.join(location, 'txt')
215 - bot_ids = bots.retrieve_bots(language_code)
 208+ #if format == 'xml':
 209+ output = os.path.join(location, 'chunks')
 210+ #else:
 211+ # output = os.path.join(location, 'txt')
 212+ # bot_ids = bots.retrieve_bots(language_code)
216213 settings.verify_environment([output])
217214
218215 fh = None
@@ -230,25 +227,26 @@
231228 if event == 'end':
232229 if elem.tag == tag:
233230 elem = remove_namespace(elem, settings.xml_namespace)
234 - if is_article_main_namespace(elem, ns):
235 - page = elem.find('id').text
236 - elem = parse_comments(elem, remove_numeric_character_references)
 231+ #if is_article_main_namespace(elem, ns):
 232+ # page = elem.find('id').text
 233+ elem = parse_comments(elem, remove_numeric_character_references)
237234
238 - if format == 'xml':
239 - fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
240 - else:
241 - data = [el.getchildren() for el in elem if el.tag == 'revision']
242 - data = flatten_xml_elements(data, page, bot_ids)
243 - if data != None:
244 - size = 64 * len(data)
245 - fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
246 - utils.write_list_to_csv(data, fh, recursive=False, newline=True)
 235+ # if format == 'xml':
 236+ fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
 237+# else:
 238+# data = [el.getchildren() for el in elem if el.tag == 'revision']
 239+# data = flatten_xml_elements(data, page, bot_ids)
 240+# if data != None:
 241+# size = 64 * len(data)
 242+# fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
 243+# utils.write_list_to_csv(data, fh, recursive=False, newline=True)
247244
248 - if zip and new_file:
249 - file = str(counter - 1) + format
250 - utils.zip_archive(settings.path_ziptool, output, file)
251 - utils.delete_file(output, file)
 245+ if zip and new_file:
 246+ file = str(counter - 1) + format
 247+ utils.zip_archive(settings.path_ziptool, output, file)
 248+ utils.delete_file(output, file)
252249 root.clear() # when done parsing a section clear the tree to safe memory
 250+
253251 except SyntaxError:
254252 f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
255253 f.write(cElementTree.tostring(elem))

Status & tagging log