Index: trunk/tools/editor_trends/etl/chunker.py
@@ -21,9 +21,7 @@
 import sys
 import codecs
 import re
-import json
 import os
-import random
 
 import progressbar
 
@@ -31,11 +29,7 @@
 sys.path.append('..')
 import configuration
 settings = configuration.Settings()
-
 from utils import utils
-import extract
-import wikitree.parser
-from bots import bots
 
 
 try:
@@ -75,28 +69,28 @@
     return element
 
 
-def load_namespace(language):
-    file = '%s_ns.json' % language
-    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
-    ns = json.load(fh)
-    fh.close()
-    ns = ns['query']['namespaces']
-    return ns
+#def load_namespace(language):
+#    file = '%s_ns.json' % language
+#    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
+#    ns = json.load(fh)
+#    fh.close()
+#    ns = ns['query']['namespaces']
+#    return ns
+#
+#
+#def build_namespaces_locale(namespaces, include=[0]):
+#    '''
+#    @include is a list of namespace keys that should not be ignored, the default
+#    setting is to ignore all namespaces except the main namespace.
+#    '''
+#    ns = []
+#    for namespace in namespaces:
+#        if namespace not in include:
+#            value = namespaces[namespace].get(u'*', None)
+#            ns.append(value)
+#    return ns
 
 
-def build_namespaces_locale(namespaces, include=[0]):
-    '''
-    @include is a list of namespace keys that should not be ignored, the default
-    setting is to ignore all namespaces except the main namespace.
-    '''
-    ns = []
-    for namespace in namespaces:
-        if namespace not in include:
-            value = namespaces[namespace].get(u'*', None)
-            ns.append(value)
-    return ns
-
-
 def parse_comments(xml, function):
     revisions = xml.findall('revision')
     for revision in revisions:
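
A note on `build_namespaces_locale`, retired above: `include=[0]` is a mutable default argument, and since the namespace keys in the API JSON are strings ('0', '14', ...), `namespace not in include` compares a string against the integer 0 and is always true, so even the main namespace lands in the ignore list. Should the helper ever be revived, a minimal sketch that sidesteps both issues (assuming the same dict shape `load_namespace` returned):

    def build_namespaces_locale(namespaces, include=None):
        '''
        @include lists namespace keys to keep; by default only the main
        namespace ('0') is kept. API JSON keys are strings, so normalize.
        '''
        include = ['0'] if include is None else [str(k) for k in include]
        ns = []
        for key, namespace in namespaces.items():
            if key not in include:
                value = namespace.get(u'*')
                if value is not None:  # some entries carry no localized name
                    ns.append(value)
        return ns
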
@@ -107,15 +101,15 @@
     return xml
 
 
-def is_article_main_namespace(elem, namespace):
-    '''
-    checks whether the article belongs to the main namespace
-    '''
-    title = elem.find('title').text
-    for ns in namespace:
-        if title.startswith(ns):
-            return False
-    return True
+#def is_article_main_namespace(elem, namespace):
+#    '''
+#    checks whether the article belongs to the main namespace
+#    '''
+#    title = elem.find('title').text
+#    for ns in namespace:
+#        if title.startswith(ns):
+#            return False
+#    return True
 
 
 def write_xml_file(element, fh, output, counter, format):
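
Similarly for `is_article_main_namespace`: `build_namespaces_locale` could append None to the prefix list, which makes `title.startswith(ns)` raise a TypeError, and a bare prefix like u'Talk' also matches titles such as 'Talking'. A hedged sketch that guards both cases, assuming titles use the usual 'Namespace:Title' form:

    def is_article_main_namespace(elem, namespace):
        '''
        Checks whether the article belongs to the main namespace, i.e. its
        title starts with none of the localized namespace prefixes.
        '''
        title = elem.find('title').text or ''
        prefixes = tuple('%s:' % ns for ns in namespace if ns)
        return not title.startswith(prefixes)
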
@@ -125,15 +119,18 @@
         size = len(xml_string)
         fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
         fh.write(xml_string)
-    except MemoryError:
-        print 'Add error capturing logic'
-    except UnicodeEncodeError, error:
+    except Exception, error:
         print error
-        n = random.randrange(0, 10000)
-        f = '%s%s.bin' % ('element', n)
-        new_file = False
-        #if element != None:
-        #    utils.store_object(element, settings.binary_location, f)
+
+#    MemoryError:
+#        print 'Add error capturing logic'
+#    except UnicodeEncodeError, error:
+#        print error
+#        n = random.randrange(0, 10000)
+#        f = '%s%s.bin' % ('element', n)
+#        new_file = False
+#        #if element != None:
+#        #    utils.store_object(element, settings.binary_location, f)
     fh.write('\n')
     return fh, counter, new_file
 
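
The blanket `except Exception` introduced here keeps the splitter alive but also swallows conditions the old code told apart, and on the failure path `new_file` is only defined if `create_file_handle` succeeded. A sketch that keeps the narrower handlers without the dead code, assuming a skipped element is acceptable:

    def write_xml_file(element, fh, output, counter, format):
        new_file = False  # defined up front so the except paths can return it
        try:
            xml_string = cElementTree.tostring(element)
            fh, counter, new_file = create_file_handle(fh, output, counter,
                                                       len(xml_string), format)
            fh.write(xml_string)
        except MemoryError:
            print 'element too large to serialize, skipping it'
        except UnicodeEncodeError, error:
            print error  # element not representable in the target encoding
        if fh is not None:  # fh stays None if the very first handle failed
            fh.write('\n')
        return fh, counter, new_file

The bare `print` mirrors the module's existing style; the `logging` module would be the more durable choice.
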
@@ -162,44 +159,44 @@
     return fh, counter, False
 
 
-def flatten_xml_elements(data, page, bots):
-    headers = ['id', 'date', 'article', 'username']
-    tags = {'contributor': {'id': extract.extract_contributor_id,
-                            'bot': extract.determine_username_is_bot,
-                            'username': extract.extract_username,
-                            },
-            'timestamp': {'date': xml.extract_text},
-            }
-    vars = {}
-    flat = []
+#def flatten_xml_elements(data, page, bots):
+#    headers = ['id', 'date', 'article', 'username']
+#    tags = {'contributor': {'id': extract.extract_contributor_id,
+#                            'bot': extract.determine_username_is_bot,
+#                            'username': extract.extract_username,
+#                            },
+#            'timestamp': {'date': xml.extract_text},
+#            }
+#    vars = {}
+#    flat = []
+#
+#    for x, elems in enumerate(data):
+#        vars[x] = {}
+#        vars[x]['article'] = page
+#        for tag in tags:
+#            el = xml.retrieve_xml_node(elems, tag)
+#            for function in tags[tag].keys():
+#                f = tags[tag][function]
+#                value = f(el, bots=bots)
+#                if type(value) == type({}):
+#                    for kw in value:
+#                        vars[x][kw] = value[kw]
+#                else:
+#                    vars[x][function] = value
+#
+#    for x, var in enumerate(vars):
+#        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
+#            continue
+#        else:
+#            f = []
+#            for head in headers:
+#                f.append(vars[x][head])
+#            flat.append(f)
+#
+#    return flat
 
-    for x, elems in enumerate(data):
-        vars[x] = {}
-        vars[x]['article'] = page
-        for tag in tags:
-            el = xml.retrieve_xml_node(elems, tag)
-            for function in tags[tag].keys():
-                f = tags[tag][function]
-                value = f(el, bots=bots)
-                if type(value) == type({}):
-                    for kw in value:
-                        vars[x][kw] = value[kw]
-                else:
-                    vars[x][function] = value
 
-    for x, var in enumerate(vars):
-        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
-            continue
-        else:
-            f = []
-            for head in headers:
-                f.append(vars[x][head])
-            flat.append(f)
-
-    return flat
-
-
-def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False):
+def split_file(location, file, project, language_code, zip=False):
     '''
     Reads xml file and splits it in N chunks
     @namespaces is a list indicating which namespaces should be included, default
@@ -207,11 +204,11 @@
     @zip indicates whether to compress the chunk or not
     '''
     input = os.path.join(location, file)
-    if format == 'xml':
-        output = os.path.join(location, 'chunks')
-    else:
-        output = os.path.join(location, 'txt')
-    bot_ids = bots.retrieve_bots(language_code)
+    #if format == 'xml':
+    output = os.path.join(location, 'chunks')
+    #else:
+    #    output = os.path.join(location, 'txt')
+    #    bot_ids = bots.retrieve_bots(language_code)
     settings.verify_environment([output])
 
     fh = None
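
With `namespaces` and `format` gone from the signature, the docstring above is stale and the hard-coded chunks directory is undocumented. A possible header rewrite, keeping the file's @-style docstring convention:

    def split_file(location, file, project, language_code, zip=False):
        '''
        Reads an xml dump and splits it into chunks under <location>/chunks.
        @zip indicates whether to compress each finished chunk
        '''
        input = os.path.join(location, file)
        output = os.path.join(location, 'chunks')
        settings.verify_environment([output])
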
@@ -230,25 +227,26 @@
             if event == 'end':
                 if elem.tag == tag:
                     elem = remove_namespace(elem, settings.xml_namespace)
-                    if is_article_main_namespace(elem, ns):
-                        page = elem.find('id').text
-                        elem = parse_comments(elem, remove_numeric_character_references)
+                    #if is_article_main_namespace(elem, ns):
+                    #    page = elem.find('id').text
+                    elem = parse_comments(elem, remove_numeric_character_references)
 
-                        if format == 'xml':
-                            fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
-                        else:
-                            data = [el.getchildren() for el in elem if el.tag == 'revision']
-                            data = flatten_xml_elements(data, page, bot_ids)
-                            if data != None:
-                                size = 64 * len(data)
-                                fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
-                                utils.write_list_to_csv(data, fh, recursive=False, newline=True)
+                    # if format == 'xml':
+                    fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
+#                    else:
+#                        data = [el.getchildren() for el in elem if el.tag == 'revision']
+#                        data = flatten_xml_elements(data, page, bot_ids)
+#                        if data != None:
+#                            size = 64 * len(data)
+#                            fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
+#                            utils.write_list_to_csv(data, fh, recursive=False, newline=True)
 
-                        if zip and new_file:
-                            file = str(counter - 1) + format
-                            utils.zip_archive(settings.path_ziptool, output, file)
-                            utils.delete_file(output, file)
+                    if zip and new_file:
+                        file = str(counter - 1) + format
+                        utils.zip_archive(settings.path_ziptool, output, file)
+                        utils.delete_file(output, file)
                 root.clear() # when done parsing a section clear the tree to safe memory
+
     except SyntaxError:
         f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
         f.write(cElementTree.tostring(elem))
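
One thing to flag in the last hunk: `format` was removed from `split_file`'s signature but is still passed to `write_xml_file` and used to rebuild the chunk filename, so the bare name now resolves to the `format()` builtin on Python 2.6+ (and raises NameError on older interpreters). Pinning the extension locally restores the intent. The filename join below also assumes `create_file_handle` names chunks '<n>.<ext>', a guess since that helper is not shown; the `zip_finished_chunk` wrapper is hypothetical and only illustrates the fix:

    def zip_finished_chunk(output, counter, format='xml'):
        # 'format' is pinned here because the commit dropped it from
        # split_file's signature; the bare name would otherwise hit the
        # format() builtin rather than an extension string.
        file = '%s.%s' % (counter - 1, format)  # assumes '<n>.<ext>' naming
        utils.zip_archive(settings.path_ziptool, output, file)
        utils.delete_file(output, file)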