r86674 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r86673 | r86674 | r86675 >
Date: 22:15, 21 April 2011
Author: diederik
Status: deferred
Tags:
Comment:
Implemented pylint suggestions and deleted the old file.
Modified paths:
  • /trunk/tools/editor_trends/utils/compression.py (deleted) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/http_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/inventory.py (modified) (history)
  • /trunk/tools/editor_trends/utils/messages.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/utils/compression.py
@@ -1,149 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-27'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -import subprocess
23 -import os
24 -if '..' not in sys.path:
25 - sys.path.append('..')
26 -
27 -from classes import settings
28 -settings = settings.Settings()
29 -from classes import exceptions
30 -import file_utils
31 -import timer
32 -import log
33 -
34 -class Compressor(object):
35 -
36 - def __init__(self, location, file, output=None):
37 - self.extension = file_utils.determine_file_extension(file)
38 - self.file = file
39 - self.location = location
40 - self.path = os.path.join(self.location, self.file)
41 - self.output = None
42 - self.name = None
43 - self.program = []
44 - self.compression = '7z'
45 -
46 - def __str__(self):
47 - return self.name
48 -
49 - def support_extension(self, extension):
50 - if extension in self.extensions:
51 - return True
52 - else:
53 - return False
54 -
55 - def compress(self):
56 - '''
57 - @path is the absolute path to the zip program
58 - @location is the directory where to store the compressed file
59 - @source is the name of the zipfile
60 - '''
61 - if self.program == []:
62 - self.init_compression_tool(self.extension, 'compress')
63 -
64 - if self.program_installed == None:
65 - raise exceptions.CompressionNotSupportedError
66 -
67 - args = {'7z': ['%s' % self.program_installed, 'a', '-scsUTF-8',
68 - '-t%s' % self.compression,
69 - '%s' % self.output,
70 - '%s' % self.input],
71 - }
72 -
73 - commands = args.get(self.name, None)
74 - if commands != None:
75 - p = subprocess.Popen(commands, shell=True).wait()
76 - else:
77 - raise exceptions.CompressionNotSupportedError
78 -
79 -
80 - def extract(self):
81 - '''
82 - @location is the directory where to store the compressed file
83 - @source is the name of the archive
84 - @extension is a helper variable to identify which tool to use.
85 - '''
86 - if self.program == []:
87 - self.init_compression_tool(self.extension, 'extract')
88 -
89 - if self.program_installed == None:
90 - raise exceptions.CompressionNotSupportedError
91 -
92 - #print self.location
93 - #print self.file
94 - if not file_utils.check_file_exists(self.location, self.file):
95 - raise exceptions.FileNotFoundException(self.location, self.file)
96 -
97 - args = {'7z': ['%s' % self.program_installed, 'e', '-y', '-o%s' % self.location, '%s' % self.path],
98 - 'bunzip2': ['%s' % self.program_installed, '-k', '%s' % self.path],
99 - 'zip': ['%s' % self.program_installed, '-o', '%s' % self.path],
100 - 'gz': ['%s' % self.program_installed, '-xzvf', '%s' % self.path],
101 - 'tar': ['%s' % self.program_installed, '-xvf', '%s' % self.path]
102 - }
103 - commands = args.get(self.name, None)
104 - #print commands
105 - if commands != None:
106 - p = subprocess.call(commands, shell=True)
107 - #p = subprocess.Popen(commands, shell=True).wait()
108 - else:
109 - raise exceptions.CompressionNotSupportedError
110 - return p
111 -
112 - def init_compression_tool(self, extension, action):
113 - compression = {'gz': [['tar', 'tar'], ['7z', '7z']],
114 - 'bz2': [['bzip2', 'bunzip2'], ['7z', '7z']],
115 - '7z': [['7z', '7z']],
116 - 'zip': [['zip', 'unzip'], ['7z', '7z']],
117 - 'tar': [['tar', 'tar'], ['7z', '7z']],
118 - }
119 -
120 - for ext in compression[extension]:
121 - archive, extract = ext[0], ext[1]
122 - if action == 'extract':
123 - self.program.append(extract)
124 - else:
125 - self.program.append(extract)
126 -
127 - for p in self.program:
128 - path = settings.detect_installed_program(p)
129 - if path != None:
130 - self.name = p
131 - self.program_installed = path
132 -
133 -
134 -def launch_zip_extractor(location, filename, properties):
135 - '''
136 -
137 - '''
138 - print 'Unzipping zip file'
139 - stopwatch = timer.Timer()
140 - log.to_db(properties, 'dataset', 'unpack', stopwatch, event='start')
141 - compressor = Compressor(location, filename)
142 - retcode = compressor.extract()
143 - stopwatch.elapsed()
144 - log.to_db(properties, 'dataset', 'unpack', stopwatch, event='finish')
145 - return retcode
146 -
147 -
148 -if __name__ == '__main__':
149 - c = Compressor('C:\Users\diederik.vanliere\Documents', 'test.zip')
150 - c.extract()
Index: trunk/tools/editor_trends/utils/inventory.py
@@ -65,7 +65,7 @@
6666
6767
6868 def store_available_dumps(self):
69 - db = storage.Database(rts.storage, 'wikilytics', 'available_dumps')
 69+ db = storage.init_database(rts.storage, 'wikilytics', 'available_dumps')
7070 db.save({'project': self.project, 'dumps': self.data})
7171
7272 def run(self):
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -26,17 +26,14 @@
2727 import bz2
2828 import gzip
2929 import re
30 -import htmlentitydefs
3130 import time
3231 import datetime
3332 import cPickle
3433 import codecs
3534 import os
36 -import ctypes
3735 import sys
3836 import subprocess
3937 import shutil
40 -import multiprocessing
4138
4239 if '..' not in sys.path:
4340 sys.path.append('..')
@@ -45,20 +42,11 @@
4643 settings = settings.Settings()
4744
4845 from classes import exceptions
49 -import messages
50 -import text_utils
5146
52 -try:
53 - import psyco
54 - psyco.full()
55 -except ImportError:
56 - pass
57 -
58 -
59 -#RE_ERROR_LOCATION = re.compile('\d+')
60 -#RE_NUMERIC_CHARACTER = re.compile('&#?\w+;')
61 -
6247 def read_unicode_text(fh):
 48+ '''
 49+ @fh should be a file object
 50+ '''
6351 data = []
6452 try:
6553 for line in fh:
@@ -70,24 +58,6 @@
7159 return data
7260
7361
74 -def check_if_process_is_running(pid):
75 - try:
76 - if settings.OS == 'Windows':
77 - PROCESS_TERMINATE = 1
78 - handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
79 - ctypes.windll.kernel32.CloseHandle(handle)
80 - if handle != 0:
81 - return True
82 - else:
83 - return False
84 - else:
85 - os.kill(pid, 0)
86 - return True
87 - except Exception, error:
88 - print error
89 - return False
90 -
91 -
9262 def read_raw_data(fh):
9363 '''
9464 @fh should be a file object
@@ -113,6 +83,9 @@
11484
11585
11686 def create_directory(path):
 87+ '''
 88+ @path is the absolute path
 89+ '''
11790 try:
11891 os.mkdir(path)
11992 return True
@@ -120,17 +93,11 @@
12194 return False
12295
12396
124 -def determine_file_extension(filename):
125 - pos = filename.rfind('.') + 1
126 - return filename[pos:]
127 -
128 -
12997 def determine_file_mode(extension):
13098 '''
131 - Checks if a given extension is an ASCII extension or not. The settings file
132 - provides known ASCII extensions.
 99+ Checks if a given extension is an ASCII extension or not.
133100 '''
134 - if extension in settings.ascii_extensions:
 101+ if extension in ['.txt', '.csv', '.json', '.xml']:
135102 return 'w'
136103 else:
137104 return 'wb'
@@ -203,24 +170,25 @@
204171
205172
206173 def create_txt_filehandle(location, filename, mode, encoding):
 174+ '''Create a filehandle for text file with utf-8 encoding'''
207175 filename = str(filename)
208176 if not filename.endswith('.csv'):
209177 filename = construct_filename(filename, '.csv')
210178 path = os.path.join(location, filename)
211 - return codecs.open(path, mode, encoding=encoding)
 179+ return codecs.open(path, mode, encoding='utf-8')
212180
213181
214182 def create_streaming_buffer(path):
215 - extension = determine_file_extension(path)
216 - if extension == 'gz':
 183+ extension = os.path.splitext(path)[1]
 184+ if extension == '.gz':
217185 fh = gzip.GzipFile(path, 'rb')
218 - elif extension == 'bz2':
 186+ elif extension == '.bz2':
219187 fh = bz2.BZ2File(path, 'rb')
220 - elif extension == '7z':
 188+ elif extension == '.7z':
221189 #TODO: might be too linux specific
222190 fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True,
223191 stdout=subprocess.PIPE, bufsize=65535).stdout
224 - elif extension == 'xml':
 192+ elif extension == '.xml':
225193 fh = create_txt_filehandle(path, None, 'r', 'utf-8')
226194 else:
227195 raise exceptions.CompressedFileNotSupported(extension)
@@ -240,6 +208,9 @@
241209
242210
243211 def delete_file(location, filename, directory=False):
 212+ '''
 213+ Delete a file or a directory
 214+ '''
244215 res = True
245216 if not directory:
246217 if check_file_exists(location, filename):
@@ -259,6 +230,7 @@
260231
261232
262233 def determine_filesize(location, filename):
 234+ '''Determine the file size of a local file'''
263235 path = os.path.join(location, filename)
264236 return os.path.getsize(path)
265237
@@ -276,6 +248,7 @@
277249 #sraise exceptions.NotYetImplementedError(set_modified_data)
278250
279251 def get_modified_date(location, filename):
 252+ '''determine the date the file was originally created'''
280253 path = os.path.join(location, filename)
281254 mod_date = os.stat(path).st_mtime
282255 mod_date = datetime.datetime.fromtimestamp(mod_date)
@@ -283,6 +256,7 @@
284257
285258
286259 def check_file_exists(location, filename):
 260+ '''check if a file exists in particular location'''
287261 if hasattr(filename, '__call__'):
288262 filename = construct_filename(filename, '.bin')
289263 if os.path.exists(os.path.join(location, filename)):
@@ -292,6 +266,7 @@
293267
294268
295269 def which(program):
 270+ '''determine the path where program can be found'''
296271 def is_exe(fpath):
297272 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
298273
@@ -308,7 +283,7 @@
309284 raise exceptions.FileNotFoundException(program)
310285
311286
312 -def store_object(object, location, filename):
 287+def store_object(obj, location, filename):
313288 '''
314289 Pickle object
315290 '''
@@ -317,7 +292,7 @@
318293 if not filename.endswith('.bin'):
319294 filename = filename + '.bin'
320295 fh = create_binary_filehandle(location, filename, 'wb')
321 - cPickle.dump(object, fh)
 296+ cPickle.dump(obj, fh)
322297 fh.close()
323298
324299
@@ -364,9 +339,7 @@
365340 Determine the name of a file by stripping away all extensions.
366341 '''
367342 while filename.find('.') > -1:
368 - ext = determine_file_extension(filename)
369 - ext = '.%s' % ext
370 - filename = filename.replace(ext, '')
 343+ filename = os.path.splitext(filename)[0]
371344 return filename
372345
373346
@@ -386,15 +359,15 @@
387360 mask = re.compile('[\w\d*]')
388361 all_files = os.listdir(location)
389362 files = []
390 - for file in all_files:
391 - file = file.split('.')
392 - if len(file) == 1:
 363+ for filename in all_files:
 364+ filename = filename.split('.')
 365+ if len(filename) == 1:
393366 continue
394367 if extension:
395 - if re.match(mask, file[0]) and file[-1].endswith(extension):
396 - files.append('.'.join(file))
397 - elif re.match(mask, file[0]):
398 - files.append('.'.join(file))
 368+ if re.match(mask, filename[0]) and filename[-1].endswith(extension):
 369+ files.append('.'.join(filename))
 370+ elif re.match(mask, filename[0]):
 371+ files.append('.'.join(filename))
399372 return files
400373
401374
@@ -415,7 +388,7 @@
416389 chunks[x] = datalist[a:b]
417390 a = (x + 1) * parts
418391 if a >= len(datalist):
419 - break
 392+ break
420393 return chunks
421394
422395
Index: trunk/tools/editor_trends/utils/http_utils.py
@@ -63,18 +63,22 @@
6464 them in a queue.
6565 '''
6666 task_queue = multiprocessing.JoinableQueue()
67 - ext = file_utils.determine_file_extension(filename)
 67+ ext = os.path.splitext(filename)[1]
6868 canonical_filename = file_utils.determine_canonical_name(filename)
69 - for x in xrange(1, 100):
70 - f = '%s%s.xml.%s' % (canonical_filename, x, ext)
 69+ x = 1
 70+ while True:
 71+ f = '%s%s.xml%s' % (canonical_filename, x, ext)
7172 res = get_headers(domain, path, f)
72 - if res == None or res.status != 200:
73 - if x == 1:
74 - task_queue.put(filename)
 73+ if res == None or res.status != 200 and x == 1:
 74+ task_queue.put(filename)
7575 break
76 - else:
 76+ elif res.status == 200:
7777 print 'Added chunk to download: %s' % f
7878 task_queue.put(f)
 79+ else:
 80+ break
 81+ x += 1
 82+
7983 if x == 1:
8084 for x in xrange(1):
8185 task_queue.put(None)
Index: trunk/tools/editor_trends/utils/messages.py
@@ -20,9 +20,9 @@
2121
2222 def show(func):
2323 '''
24 - @func should be an qsize() belonging to a task queue. qsize() is not supported
25 - on OSX hence this simple workaround to make sure that we can continue supporting
26 - OSX.
 24+ @func should be an qsize() belonging to a task queue. qsize() is not
 25+ supported on OSX hence this simple workaround to make sure that we can
 26+ continue supporting OSX.
2727 '''
2828 try:
2929 return func()