Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -1,149 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-27' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -import subprocess |
23 | | -import os |
24 | | -if '..' not in sys.path: |
25 | | - sys.path.append('..') |
26 | | - |
27 | | -from classes import settings |
28 | | -settings = settings.Settings() |
29 | | -from classes import exceptions |
30 | | -import file_utils |
31 | | -import timer |
32 | | -import log |
33 | | - |
class Compressor(object):
    '''
    Wrapper around the command line tools (7z, tar, bzip2/bunzip2, zip/unzip)
    that pack or unpack a single file.

    @location is the directory that contains the file
    @file is the name of the file; its extension selects the candidate tools
    @output is the archive that compress() writes; when it is omitted
    compress() falls back to <path>.<compression>
    '''

    # Per file extension the candidate tools, in the order in which they are
    # probed, each as [program used to archive, program used to extract].
    _tools = {'gz': [['tar', 'tar'], ['7z', '7z']],
              'bz2': [['bzip2', 'bunzip2'], ['7z', '7z']],
              '7z': [['7z', '7z']],
              'zip': [['zip', 'unzip'], ['7z', '7z']],
              'tar': [['tar', 'tar'], ['7z', '7z']],
              }
    # Extensions that support_extension() accepts.
    extensions = tuple(sorted(_tools))

    def __init__(self, location, file, output=None):
        self.extension = file_utils.determine_file_extension(file)
        self.file = file
        self.location = location
        self.path = os.path.join(self.location, self.file)
        # compress() reads the file that this instance was created for.
        self.input = self.path
        self.output = output
        self.name = None
        self.program = []
        # Stays None until init_compression_tool() finds an installed tool.
        self.program_installed = None
        self.compression = '7z'

    def __str__(self):
        # __str__ has to return a string, also before a tool was selected.
        return self.name or ''

    def support_extension(self, extension):
        '''
        Return True when @extension (without leading dot) is an extension
        for which candidate tools are known, else False.
        '''
        return extension in self.extensions

    def compress(self):
        '''
        Add the file to the archive self.output using 7z, the only tool for
        which a compress command is defined.

        Returns the exit code of the tool.
        Raises exceptions.CompressionNotSupportedError when no installed tool
        was found or the selected tool cannot compress.
        '''
        if not self.program:
            self.init_compression_tool(self.extension, 'compress')

        if self.program_installed is None:
            raise exceptions.CompressionNotSupportedError

        commands = self._compress_command()
        if commands is None:
            raise exceptions.CompressionNotSupportedError
        # An argument list has to be run without a shell: with shell=True
        # POSIX hands everything after the first item to the shell itself
        # and the tool would be started without any arguments.
        return subprocess.call(commands, shell=False)

    def extract(self):
        '''
        Unpack the file self.path with the selected tool.

        Returns the exit code of the tool.
        Raises exceptions.CompressionNotSupportedError when no installed tool
        was found and exceptions.FileNotFoundException when the file is
        missing.
        '''
        if not self.program:
            self.init_compression_tool(self.extension, 'extract')

        if self.program_installed is None:
            raise exceptions.CompressionNotSupportedError

        if not file_utils.check_file_exists(self.location, self.file):
            raise exceptions.FileNotFoundException(self.location, self.file)

        commands = self._extract_command()
        if commands is None:
            raise exceptions.CompressionNotSupportedError
        return subprocess.call(commands, shell=False)

    def init_compression_tool(self, extension, action):
        '''
        Collect the candidate programs for @extension and remember the name
        and path of an installed one.

        @action is 'extract' to select the extract programs, any other value
        selects the archive programs.
        An unknown extension yields no candidates, so compress() and
        extract() report it as CompressionNotSupportedError.
        '''
        self.program.extend(self._candidate_programs(extension, action))

        # There is no break on purpose: as before the last installed
        # candidate wins, which means 7z is used whenever it is available.
        # detect_installed_program is expected to return None for a program
        # that is not installed.
        for candidate in self.program:
            path = settings.detect_installed_program(candidate)
            if path is not None:
                self.name = candidate
                self.program_installed = path

    @classmethod
    def _candidate_programs(cls, extension, action):
        '''Return the program names to probe for @extension and @action.'''
        # Index 0 holds the archive program, index 1 the extract program.
        index = 1 if action == 'extract' else 0
        return [pair[index] for pair in cls._tools.get(extension, [])]

    def _compress_command(self):
        '''Return the argument list for compress() or None if unsupported.'''
        if self.name != '7z':
            return None
        output = self.output
        if output is None:
            output = '%s.%s' % (self.path, self.compression)
        return ['%s' % self.program_installed, 'a', '-scsUTF-8',
                '-t%s' % self.compression,
                '%s' % output,
                '%s' % self.input]

    def _extract_command(self):
        '''Return the argument list for extract() or None if unsupported.'''
        program = '%s' % self.program_installed
        path = '%s' % self.path
        # The keys are the program names that init_compression_tool() can
        # select. tar only gets the z flag for gzip compressed input.
        # NOTE(review): only 7z is told to unpack into self.location, the
        # other tools are started without a target directory - confirm that
        # this is intended.
        tar_flags = '-xzvf' if self.extension == 'gz' else '-xvf'
        commands = {'7z': [program, 'e', '-y', '-o%s' % self.location, path],
                    'bunzip2': [program, '-k', path],
                    'unzip': [program, '-o', path],
                    'tar': [program, tar_flags, path],
                    }
        return commands.get(self.name)
133 | | - |
def launch_zip_extractor(location, filename, properties):
    '''
    Unpack the archive @filename found in @location and report the start and
    the finish of the operation through log.to_db.

    @location is the directory that holds the archive
    @filename is the name of the archive
    @properties is handed to log.to_db unchanged
    Returns whatever Compressor.extract() returns.
    '''
    print('Unzipping zip file')
    stopwatch = timer.Timer()
    log.to_db(properties, 'dataset', 'unpack', stopwatch, event='start')
    retcode = Compressor(location, filename).extract()
    # elapsed() is called before the finish event is written.
    stopwatch.elapsed()
    log.to_db(properties, 'dataset', 'unpack', stopwatch, event='finish')
    return retcode
146 | | - |
147 | | - |
if __name__ == '__main__':
    # Manual smoke test against a developer machine. The raw string keeps the
    # backslashes of the Windows path literal; without it \U is read as the
    # start of a unicode escape wherever the literal is a unicode string.
    c = Compressor(r'C:\Users\diederik.vanliere\Documents', 'test.zip')
    c.extract()
Index: trunk/tools/editor_trends/utils/inventory.py |
— | — | @@ -65,7 +65,7 @@ |
66 | 66 | |
67 | 67 | |
68 | 68 | def store_available_dumps(self): |
69 | | - db = storage.Database(rts.storage, 'wikilytics', 'available_dumps') |
| 69 | + db = storage.init_database(rts.storage, 'wikilytics', 'available_dumps') |
70 | 70 | db.save({'project': self.project, 'dumps': self.data}) |
71 | 71 | |
72 | 72 | def run(self): |
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -26,17 +26,14 @@ |
27 | 27 | import bz2 |
28 | 28 | import gzip |
29 | 29 | import re |
30 | | -import htmlentitydefs |
31 | 30 | import time |
32 | 31 | import datetime |
33 | 32 | import cPickle |
34 | 33 | import codecs |
35 | 34 | import os |
36 | | -import ctypes |
37 | 35 | import sys |
38 | 36 | import subprocess |
39 | 37 | import shutil |
40 | | -import multiprocessing |
41 | 38 | |
42 | 39 | if '..' not in sys.path: |
43 | 40 | sys.path.append('..') |
— | — | @@ -45,20 +42,11 @@ |
46 | 43 | settings = settings.Settings() |
47 | 44 | |
48 | 45 | from classes import exceptions |
49 | | -import messages |
50 | | -import text_utils |
51 | 46 | |
52 | | -try: |
53 | | - import psyco |
54 | | - psyco.full() |
55 | | -except ImportError: |
56 | | - pass |
57 | | - |
58 | | - |
59 | | -#RE_ERROR_LOCATION = re.compile('\d+') |
60 | | -#RE_NUMERIC_CHARACTER = re.compile('&#?\w+;') |
61 | | - |
62 | 47 | def read_unicode_text(fh): |
| 48 | + ''' |
| 49 | + @fh should be a file object |
| 50 | + ''' |
63 | 51 | data = [] |
64 | 52 | try: |
65 | 53 | for line in fh: |
— | — | @@ -70,24 +58,6 @@ |
71 | 59 | return data |
72 | 60 | |
73 | 61 | |
def check_if_process_is_running(pid):
    '''
    Return True when a process with id @pid exists, else False.

    On Windows the process is probed by opening a handle to it, elsewhere by
    sending signal 0. Unexpected errors are printed and reported as False.
    '''
    import errno

    try:
        if settings.OS == 'Windows':
            PROCESS_TERMINATE = 1
            kernel32 = ctypes.windll.kernel32
            handle = kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
            if handle == 0:
                # Nothing was opened, so there is no handle to close.
                return False
            kernel32.CloseHandle(handle)
            return True

        try:
            # Signal 0 only performs the existence and permission checks.
            os.kill(pid, 0)
        except OSError as error:
            # EPERM: the process exists but belongs to somebody else.
            if error.errno == errno.EPERM:
                return True
            raise
        return True
    except Exception as error:
        print(error)
        return False
90 | | - |
91 | | - |
92 | 62 | def read_raw_data(fh): |
93 | 63 | ''' |
94 | 64 | @fh should be a file object |
— | — | @@ -113,6 +83,9 @@ |
114 | 84 | |
115 | 85 | |
116 | 86 | def create_directory(path): |
| 87 | + ''' |
| 88 | + @path is the absolute path |
| 89 | + ''' |
117 | 90 | try: |
118 | 91 | os.mkdir(path) |
119 | 92 | return True |
— | — | @@ -120,17 +93,11 @@ |
121 | 94 | return False |
122 | 95 | |
123 | 96 | |
def determine_file_extension(filename):
    '''
    Return the part of @filename after the last dot, without the dot.

    Only the final path component is inspected, so dots in directory names
    are ignored. A name without a dot has no extension and yields ''.
    '''
    name = os.path.basename(filename)
    pos = name.rfind('.')
    if pos == -1:
        return ''
    return name[pos + 1:]
127 | | - |
128 | | - |
def determine_file_mode(extension):
    '''
    Return the mode for writing a file with the given extension: 'w' for the
    known text extensions (given with their leading dot), 'wb' for anything
    else.
    '''
    text_extensions = ('.txt', '.csv', '.json', '.xml')
    return 'w' if extension in text_extensions else 'wb'
— | — | @@ -203,24 +170,25 @@ |
204 | 171 | |
205 | 172 | |
def create_txt_filehandle(location, filename, mode, encoding):
    '''
    Open a csv file in @location and return the filehandle.

    @filename is converted to a string; unless it already ends with '.csv'
    the final name is built by construct_filename(filename, '.csv').
    @mode is passed to codecs.open.
    @encoding is accepted but not used, the file is always opened as utf-8.
    '''
    name = str(filename)
    if name.endswith('.csv'):
        target = name
    else:
        target = construct_filename(name, '.csv')
    return codecs.open(os.path.join(location, target), mode, encoding='utf-8')
212 | 180 | |
213 | 181 | |
214 | 182 | def create_streaming_buffer(path): |
215 | | - extension = determine_file_extension(path) |
216 | | - if extension == 'gz': |
| 183 | + extension = os.path.splitext(path)[1] |
| 184 | + if extension == '.gz': |
217 | 185 | fh = gzip.GzipFile(path, 'rb') |
218 | | - elif extension == 'bz2': |
| 186 | + elif extension == '.bz2': |
219 | 187 | fh = bz2.BZ2File(path, 'rb') |
220 | | - elif extension == '7z': |
| 188 | + elif extension == '.7z': |
221 | 189 | #TODO: might be too linux specific |
222 | 190 | fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True, |
223 | 191 | stdout=subprocess.PIPE, bufsize=65535).stdout |
224 | | - elif extension == 'xml': |
| 192 | + elif extension == '.xml': |
225 | 193 | fh = create_txt_filehandle(path, None, 'r', 'utf-8') |
226 | 194 | else: |
227 | 195 | raise exceptions.CompressedFileNotSupported(extension) |
— | — | @@ -240,6 +208,9 @@ |
241 | 209 | |
242 | 210 | |
243 | 211 | def delete_file(location, filename, directory=False): |
| 212 | + ''' |
| 213 | + Delete a file or a directory |
| 214 | + ''' |
244 | 215 | res = True |
245 | 216 | if not directory: |
246 | 217 | if check_file_exists(location, filename): |
— | — | @@ -259,6 +230,7 @@ |
260 | 231 | |
261 | 232 | |
def determine_filesize(location, filename):
    '''Return the size in bytes of the file @filename stored in @location'''
    return os.stat(os.path.join(location, filename)).st_size
265 | 237 | |
— | — | @@ -276,6 +248,7 @@ |
277 | 249 | #sraise exceptions.NotYetImplementedError(set_modified_data) |
278 | 250 | |
279 | 251 | def get_modified_date(location, filename): |
| 252 | + '''determine the date the file was originally created''' |
280 | 253 | path = os.path.join(location, filename) |
281 | 254 | mod_date = os.stat(path).st_mtime |
282 | 255 | mod_date = datetime.datetime.fromtimestamp(mod_date) |
— | — | @@ -283,6 +256,7 @@ |
284 | 257 | |
285 | 258 | |
286 | 259 | def check_file_exists(location, filename): |
| 260 | + '''check if a file exists in particular location''' |
287 | 261 | if hasattr(filename, '__call__'): |
288 | 262 | filename = construct_filename(filename, '.bin') |
289 | 263 | if os.path.exists(os.path.join(location, filename)): |
— | — | @@ -292,6 +266,7 @@ |
293 | 267 | |
294 | 268 | |
295 | 269 | def which(program): |
| 270 | + '''determine the path where program can be found''' |
296 | 271 | def is_exe(fpath): |
297 | 272 | return os.path.exists(fpath) and os.access(fpath, os.X_OK) |
298 | 273 | |
— | — | @@ -308,7 +283,7 @@ |
309 | 284 | raise exceptions.FileNotFoundException(program) |
310 | 285 | |
311 | 286 | |
312 | | -def store_object(object, location, filename): |
| 287 | +def store_object(obj, location, filename): |
313 | 288 | ''' |
314 | 289 | Pickle object |
315 | 290 | ''' |
— | — | @@ -317,7 +292,7 @@ |
318 | 293 | if not filename.endswith('.bin'): |
319 | 294 | filename = filename + '.bin' |
320 | 295 | fh = create_binary_filehandle(location, filename, 'wb') |
321 | | - cPickle.dump(object, fh) |
| 296 | + cPickle.dump(obj, fh) |
322 | 297 | fh.close() |
323 | 298 | |
324 | 299 | |
— | — | @@ -364,9 +339,7 @@ |
365 | 340 | Determine the name of a file by stripping away all extensions. |
366 | 341 | ''' |
367 | 342 | while filename.find('.') > -1: |
368 | | - ext = determine_file_extension(filename) |
369 | | - ext = '.%s' % ext |
370 | | - filename = filename.replace(ext, '') |
| 343 | + filename = os.path.splitext(filename)[0] |
371 | 344 | return filename |
372 | 345 | |
373 | 346 | |
— | — | @@ -386,15 +359,15 @@ |
387 | 360 | mask = re.compile('[\w\d*]') |
388 | 361 | all_files = os.listdir(location) |
389 | 362 | files = [] |
390 | | - for file in all_files: |
391 | | - file = file.split('.') |
392 | | - if len(file) == 1: |
| 363 | + for filename in all_files: |
| 364 | + filename = filename.split('.') |
| 365 | + if len(filename) == 1: |
393 | 366 | continue |
394 | 367 | if extension: |
395 | | - if re.match(mask, file[0]) and file[-1].endswith(extension): |
396 | | - files.append('.'.join(file)) |
397 | | - elif re.match(mask, file[0]): |
398 | | - files.append('.'.join(file)) |
| 368 | + if re.match(mask, filename[0]) and filename[-1].endswith(extension): |
| 369 | + files.append('.'.join(filename)) |
| 370 | + elif re.match(mask, filename[0]): |
| 371 | + files.append('.'.join(filename)) |
399 | 372 | return files |
400 | 373 | |
401 | 374 | |
— | — | @@ -415,7 +388,7 @@ |
416 | 389 | chunks[x] = datalist[a:b] |
417 | 390 | a = (x + 1) * parts |
418 | 391 | if a >= len(datalist): |
419 | | - break |
| 392 | + break |
420 | 393 | return chunks |
421 | 394 | |
422 | 395 | |
Index: trunk/tools/editor_trends/utils/http_utils.py |
— | — | @@ -63,18 +63,22 @@ |
64 | 64 | them in a queue. |
65 | 65 | ''' |
66 | 66 | task_queue = multiprocessing.JoinableQueue() |
67 | | - ext = file_utils.determine_file_extension(filename) |
| 67 | + ext = os.path.splitext(filename)[1] |
68 | 68 | canonical_filename = file_utils.determine_canonical_name(filename) |
69 | | - for x in xrange(1, 100): |
70 | | - f = '%s%s.xml.%s' % (canonical_filename, x, ext) |
| 69 | + x = 1 |
| 70 | + while True: |
| 71 | + f = '%s%s.xml%s' % (canonical_filename, x, ext) |
71 | 72 | res = get_headers(domain, path, f) |
72 | | - if res == None or res.status != 200: |
73 | | - if x == 1: |
74 | | - task_queue.put(filename) |
| 73 | + if res == None or res.status != 200 and x == 1: |
| 74 | + task_queue.put(filename) |
75 | 75 | break |
76 | | - else: |
| 76 | + elif res.status == 200: |
77 | 77 | print 'Added chunk to download: %s' % f |
78 | 78 | task_queue.put(f) |
| 79 | + else: |
| 80 | + break |
| 81 | + x += 1 |
| 82 | + |
79 | 83 | if x == 1: |
80 | 84 | for x in xrange(1): |
81 | 85 | task_queue.put(None) |
Index: trunk/tools/editor_trends/utils/messages.py |
— | — | @@ -20,9 +20,9 @@ |
21 | 21 | |
22 | 22 | def show(func): |
23 | 23 | ''' |
24 | | - @func should be an qsize() belonging to a task queue. qsize() is not supported |
25 | | - on OSX hence this simple workaround to make sure that we can continue supporting |
26 | | - OSX. |
| 24 | + @func should be an qsize() belonging to a task queue. qsize() is not |
| 25 | + supported on OSX hence this simple workaround to make sure that we can |
| 26 | + continue supporting OSX. |
27 | 27 | ''' |
28 | 28 | try: |
29 | 29 | return func() |