r80727 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80726‎ | r80727 | r80728 >
Date:23:12, 21 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Split utils.py in two separate files:
1) file_utils.py (all kinds of filesystem related helper functions)
2) text_utils.py (all kinds of string / text parsing related helper functions)
Modified paths:
  • /trunk/tools/editor_trends/utils/file_utils.py (added) (history)
  • /trunk/tools/editor_trends/utils/text_utils.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -0,0 +1,369 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+'''
 22+The file_utils module contains the filesystem related helper functions that
 23+are needed throughout. It provides functions to read / write data to text and
 24+binary files, to locate, inspect and delete files, and to split / merge lists.
 25+'''
 26+
 27+import re
 28+import htmlentitydefs
 29+import cPickle
 30+import codecs
 31+import os
 32+import ctypes
 33+import sys
 34+import shutil
 35+sys.path.append('..')
 36+
 37+
 38+import configuration
 39+settings = configuration.Settings()
 40+
 41+import exceptions
 42+import messages
 43+
 44+try:
 45+ import psyco
 46+ psyco.full()
 47+except ImportError:
 48+ pass
 49+
 50+
 51+#RE_ERROR_LOCATION = re.compile('\d+')
 52+#RE_NUMERIC_CHARACTER = re.compile('&#?\w+;')
 53+
def check_if_process_is_running(pid):
    '''
    Report whether a process with process id @pid currently exists.

    When settings.OS equals 'Windows' the process is opened through
    kernel32.OpenProcess and is considered running when a non-zero handle is
    returned. On every other platform signal 0 is sent with os.kill, which
    performs the existence / permission check without delivering a signal.

    @pid is the numeric process id
    @return: True when the process exists, False otherwise (including any
    unexpected error, which is printed)
    '''
    import errno  # local import: only needed by this function

    try:
        if settings.OS == 'Windows':
            PROCESS_TERMINATE = 1
            kernel32 = ctypes.windll.kernel32
            handle = kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
            if handle == 0:
                # Nothing was opened, so there is no handle to close.
                return False
            kernel32.CloseHandle(handle)
            return True

        try:
            os.kill(pid, 0)
        except OSError as error:
            # EPERM means the process exists but belongs to somebody else.
            if error.errno == errno.EPERM:
                return True
            raise
        return True
    except Exception as error:
        print(error)
        return False
 70+
 71+
def read_raw_data(fh):
    '''
    Yield the tab separated fields of every non-blank line.

    @fh should be a file object (any iterable of strings works)
    Each line is stripped of surrounding whitespace first; lines that are
    empty after stripping are skipped.
    '''
    for raw in fh:
        stripped = raw.strip()
        if stripped != '':
            yield stripped.split('\t')
 83+
 84+
 85+
 86+
 87+
 88+# read / write data related functions
def read_data_from_csv(location, filename, encoding):
    '''
    Yield the lines of a text file one by one.

    @location is the directory that contains the file
    @filename is the name of the file, it is handed to create_txt_filehandle
    together with @location
    @encoding is usually utf-8

    The file handle is closed when the generator is exhausted, closed early
    or garbage collected, and also when reading raises an error.
    '''
    fh = create_txt_filehandle(location, filename, 'r', encoding)
    try:
        for line in fh:
            yield line
    finally:
        fh.close()
 100+
 101+
def create_directory(path):
    '''
    Create the directory @path.

    @return: True when the directory was created, False when os.mkdir failed
    (for example because the directory already exists or the parent is
    missing).
    '''
    try:
        os.mkdir(path)
        return True
    except OSError:
        # os.mkdir reports failures as OSError, not IOError.
        return False
 108+
 109+
def determine_file_extension(filename):
    '''
    Return the part of @filename that follows the last dot.

    When @filename does not contain a dot the complete filename is returned.
    '''
    return filename.rpartition('.')[2]
 113+
 114+
def determine_file_mode(extension):
    '''
    Return the mode for writing a file with the given @extension: 'w' when
    the extension is listed in settings.ascii_extensions, 'wb' otherwise.
    '''
    if extension not in settings.ascii_extensions:
        return 'wb'
    return 'w'
 124+
 125+
def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
    '''
    Write the elements of @data to @fh, separated by tabs.

    @data is a list which can contain other lists that will be written as a
    single line to a textfile
    @fh is a handle to an open text file, only its write() method is used
    @recursive is set to True by the function itself on nested calls
    @newline determines whether a newline is written after the last element
    @format is passed on to write_dict_to_csv for elements that are dicts

    @return: True when the local recursive flag is truthy after the loop,
    otherwise None

    The calling function is responsible for:
    1) closing the filehandle

    NOTE(review): the nesting below was reconstructed from a listing without
    indentation; confirm that 'tab = True' belongs to the final else branch
    and that the two trailing checks sit after the for loop.
    '''
    # tab records whether a separator has to precede the next element.
    tab = False
    # NOTE(review): wrote_newline is assigned but never read in this function.
    wrote_newline = None
    if recursive:
        # From here on the flag is reused to hold the result of nested calls.
        recursive = False
    for x, d in enumerate(data):
        if tab:
            fh.write('\t')
        if type(d) == type([]):
            # Nested list: written by a nested call without a trailing
            # newline. @format is not forwarded, so the nested call uses its
            # default of 'wide'.
            recursive = write_list_to_csv(d, fh, recursive=True, newline=False)
            #when there is a list of lists but no other elements in the first list
            #then write a newline.
            # NOTE(review): d is data[x], so this comparison is always true
            # and a newline follows every nested list.
            if len(d) == len(data[x]):
                fh.write('\n')
        elif type(d) == type({}):
            # The result of write_dict_to_csv becomes the new tab flag.
            tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
        else:
            fh.write('%s' % d)
            tab = True

    if recursive:
        tab = False
        return True
    if newline:
        fh.write('\n')
 159+
 160+
def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
    '''
    Write the values of the dictionary @data to @fh as tab separated text.

    @data is a dictionary whose values are scalars, lists or dictionaries
    @fh is a handle to an open text file, only its write() method is used
    @keys is the sequence of keys to write, in that order
    @write_key determines whether each key is written in front of its value
    @format is either 'long' or 'wide':
      long: every value ends its own line; each element of a list value is
            written as 'key<TAB>element' on its own line
      wide: every value (and every element of a list value) is followed by a
            tab and a single newline is written after the last key

    Nested dictionaries are written by a recursive call with write_key=False
    and the same format.

    @return: None
    Raises AssertionError when @format is not 'long' or 'wide'.
    '''
    assert format == 'long' or format == 'wide'

    if format == 'long':
        for key in keys:
            value = data[key]
            if write_key:
                fh.write('%s\t' % key)
            if type(value) is list:
                for d in value:
                    fh.write('%s\t%s\n' % (key, d))
            elif type(value) is dict:
                write_dict_to_csv(value, fh, value.keys(), write_key=False, format=format)
            else:
                # One-element tuple so that tuple values are formatted too.
                fh.write('%s\n' % (value,))
    else:
        for key in keys:
            value = data[key]
            if write_key:
                fh.write('%s\t' % key)
            if type(value) is list:
                for d in value:
                    # The element has to be interpolated; writing the bare
                    # format string would emit a literal '%s'.
                    fh.write('%s\t' % (d,))
            elif type(value) is dict:
                write_dict_to_csv(value, fh, value.keys(), write_key=False, format=format)
            else:
                fh.write('%s\t' % (value,))
        fh.write('\n')
 190+
 191+ #if type(data[key]) == type([]):
 192+ # write_list_to_csv(data[key], fh, recursive=False, newline=True)
 193+
def create_txt_filehandle(location, name, mode, encoding):
    '''
    Open a text file through codecs.open and return the handle.

    @location is the directory of the file
    @name is passed through construct_filename with the extension '.csv'
    @mode and @encoding are handed to codecs.open unchanged
    '''
    target = os.path.join(location, construct_filename(name, '.csv'))
    return codecs.open(target, mode, encoding=encoding)
 198+
 199+
def create_binary_filehandle(location, filename, mode):
    '''
    Open the file @filename in directory @location with the builtin open()
    using @mode and return the handle.
    '''
    return open(os.path.join(location, filename), mode)
 203+
 204+
def construct_filename(name, extension):
    '''
    Turn @name into a filename.

    @name is either a callable or an object that is used as the filename
    as-is
    @extension is appended to the callable's name; it is ignored when @name
    is not callable

    @return: '<callable name><extension>' for a callable, otherwise @name
    unchanged
    '''
    if hasattr(name, '__call__'):
        # __name__ is available on every named callable, unlike func_name.
        return '%s%s' % (name.__name__, extension)
    else:
        return name
 210+
 211+
def delete_file(location, filename, directory=False):
    '''
    Delete a single file, or a complete directory tree.

    @location is the directory that contains the file; when @directory is
    True it is the directory tree that is removed
    @filename is the name of the file to delete (or a callable, which is
    converted with construct_filename and the extension '.bin'); it is
    ignored when @directory is True
    @directory selects between deleting a file (False) and a tree (True)

    Errors raised while deleting are printed, not propagated.
    '''
    if directory:
        try:
            shutil.rmtree(location)
        except Exception as error:
            print(error)
        return

    # Resolve a callable here as well, so that the path that is removed is
    # the same path that check_file_exists looked at.
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    if check_file_exists(location, filename):
        try:
            os.remove(os.path.join(location, filename))
        except OSError as error:
            # OSError also covers WindowsError and exists on every platform.
            print(error)
 225+
 226+
def determine_filesize(location, filename):
    '''
    Return the size as reported by os.path.getsize of the file @filename in
    directory @location.
    '''
    return os.path.getsize(os.path.join(location, filename))
 230+
 231+
def check_file_exists(location, filename):
    '''
    Return True when @filename exists in directory @location, else False.

    A callable @filename is first converted with construct_filename and the
    extension '.bin'.
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    return os.path.exists(os.path.join(location, filename))
 239+
 240+
def which(program):
    '''
    Locate the executable @program.

    @program is either a path (it contains a directory part), which is
    checked directly, or a bare name, which is looked up in every directory
    of the PATH environment variable (os.defpath when PATH is not set).

    @return: the path of the first match that is a regular file with execute
    permission
    Raises exceptions.FileNotFoundException when nothing matches.
    '''
    def is_exe(fpath):
        # isfile instead of exists: directories carry the execute bit too
        # but cannot be run.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if os.path.dirname(program):
        if is_exe(program):
            return program
    else:
        for path in os.environ.get('PATH', os.defpath).split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    raise exceptions.FileNotFoundException(program)
 256+
 257+
def store_object(object, location, filename):
    '''
    Pickle object

    @object is the object to pickle
    @location is the directory in which the file is written
    @filename is the name of the file or a callable (converted with
    construct_filename); '.bin' is appended when the name does not end with
    it

    The file handle is closed even when pickling fails.
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    if not filename.endswith('.bin'):
        filename = filename + '.bin'
    fh = create_binary_filehandle(location, filename, 'wb')
    try:
        cPickle.dump(object, fh)
    finally:
        fh.close()
 269+
 270+
def load_object(location, filename):
    '''
    Load pickled object

    @location is the directory that contains the file
    @filename is the name of the file or a callable (converted with
    construct_filename); '.bin' is appended when the name does not end with
    it

    @return: the unpickled object
    The file handle is closed even when unpickling fails.

    WARNING: unpickling can execute arbitrary code, only load files that
    were written by a trusted source.
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    if not filename.endswith('.bin'):
        filename = filename + '.bin'
    fh = create_binary_filehandle(location, filename, 'rb')
    try:
        return cPickle.load(fh)
    finally:
        fh.close()
 283+
 284+
 285+
def create_dict_from_csv_file(location, filename, encoding, keys=None):
    '''
    Constructs a dictionary from a tab separated txtfile
    The first column of the file should contain the main key for the
    dictionary.
    If a line has exactly one value, the key maps to a list holding that
    single value. Otherwise a @keys variable should be supplied and the key
    sequence should match the value sequence; the key then maps to a
    dictionary of @keys -> values.

    Lines that are empty after stripping are skipped.

    @return: the constructed dictionary
    Raises AssertionError when a line does not have exactly one value and
    @keys is None.
    '''
    d = {}
    for line in read_data_from_csv(location, filename, encoding):
        line = line.strip()
        if not line:
            # A blank line carries neither key nor values.
            continue
        fields = line.split('\t')
        key = fields[0]
        values = fields[1:]
        if len(values) == 1:
            d[key] = values
        else:
            assert keys is not None
            d[key] = dict(zip(keys, values))
    return d
 307+
 308+
def determine_canonical_name(filename):
    '''
    Determine the name of a file by stripping away all extensions, i.e.
    return everything in front of the first dot.

    Cutting at the first dot avoids removing text from the middle of the
    name when an extension also occurs as part of an earlier one
    (e.g. 'x.tartar.tar').
    '''
    return filename.split('.', 1)[0]
 318+
 319+
def retrieve_file_list(location, extension, mask=None):
    '''
    Retrieve a list of files from a specified location.
    @location: either an absolute or relative path
    @extension: only include entries whose last dot separated part ends with
    extension; None or '' accepts every extension
    @mask: only include files that start with mask (optional), this is
    interpreted as a regular expression and matched against the part of the
    name in front of the first dot.

    Entries without a dot in their name are always skipped.

    @return: a list of names matching the criteria, in os.listdir order
    '''
    if mask:
        mask = re.compile(mask)
    else:
        mask = re.compile(r'[\w\d*]')
    if extension is None:
        # str.endswith('') is always True, so '' means "any extension".
        extension = ''

    files = []
    for name in os.listdir(location):
        parts = name.split('.')
        if len(parts) == 1:
            continue
        if mask.match(parts[0]) and parts[-1].endswith(extension):
            files.append(name)
    return files
 343+
 344+
def merge_list(datalist):
    '''
    Concatenate the values of @datalist into one flat list.

    @datalist is iterated and every item is used to index @datalist again
    (as happens for a dictionary of lists); the indexed values are appended
    in iteration order.
    '''
    merged = []
    for key in datalist:
        merged.extend(datalist[key])
    return merged
 351+
 352+
def split_list(datalist, maxval):
    '''
    Split @datalist into at most @maxval consecutive chunks.

    @datalist is a list (or any other sliceable sequence)
    @maxval is the maximum number of chunks

    The chunk size is len(datalist) / maxval rounded up, so that no element
    is left out; as a consequence fewer than @maxval chunks can be returned.

    @return: a dictionary mapping the chunk number (0, 1, ...) to the chunk;
    an empty @datalist gives {0: <empty slice>}
    Raises ZeroDivisionError when @maxval is 0.
    '''
    chunks = {}
    size = len(datalist)
    # Ceiling division: rounding down would drop the trailing elements.
    parts = -(-size // maxval)
    a = 0
    for x in range(maxval):
        chunks[x] = datalist[a:a + parts]
        a += parts
        if a >= size:
            break
    return chunks
 364+
 365+
def debug():
    '''
    Placeholder that is run when the module is executed as a script; it
    does nothing.
    '''
    return None


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/utils/file_utils.py
___________________________________________________________________
Added: svn:eol-style
1371 + native
Added: svn:mime-type
2372 + text/plain
Index: trunk/tools/editor_trends/utils/text_utils.py
@@ -0,0 +1,53 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-21'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+import time
 23+import sys
 24+
 25+sys.path.append('..')
 26+import configuration
 27+settings = configuration.Settings()
 28+
 29+
def convert_timestamp_to_date(timestamp):
    '''
    Parse the first ten characters of @timestamp with settings.date_format
    and return the resulting datetime.datetime object.
    '''
    date_part = timestamp[:10]
    return datetime.datetime.strptime(date_part, settings.date_format)
 32+
 33+
def convert_timestamp_to_datetime_naive(timestamp):
    '''
    Parse the complete @timestamp with settings.timestamp_format and return
    the datetime.datetime object produced by strptime.
    '''
    parse = datetime.datetime.strptime
    return parse(timestamp, settings.timestamp_format)
 36+
 37+
def convert_timestamp_to_datetime_utc(timestamp):
    '''
    Convert @timestamp with convert_timestamp_to_datetime_naive and return
    the result.

    NOTE: despite the name no timezone information is attached to the
    result; attaching tzinfo is disabled, see the comment below.
    '''
    d = convert_timestamp_to_datetime_naive(timestamp)
    #return d.replace(tzinfo=tz) #enabling this line crashes pymongo
    return d
 43+
 44+
 45+
def invert_dict(dictionary):
    '''
    @dictionary is a simple dictionary containing simple values, ie. no lists,
    or other dictionaries
    output: a new dictionary where key and value are swapped. When a value
    occurs more than once, the key that is iterated last is kept.
    '''
    inverted = {}
    for key, value in dictionary.items():
        inverted[value] = key
    return inverted
 53+
 54+
Property changes on: trunk/tools/editor_trends/utils/text_utils.py
___________________________________________________________________
Added: svn:eol-style
155 + native

Status & tagging log