Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -0,0 +1,369 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +''' |
| 22 | +The utils module contains helper functions that will be needed throughout. |
| 23 | +It provides functions to read / write data to text and binary files, fix markup |
| 24 | +and track error messages. |
| 25 | +''' |
| 26 | + |
| 27 | +import re |
| 28 | +import htmlentitydefs |
| 29 | +import cPickle |
| 30 | +import codecs |
| 31 | +import os |
| 32 | +import ctypes |
| 33 | +import sys |
| 34 | +import shutil |
| 35 | +sys.path.append('..') |
| 36 | + |
| 37 | + |
| 38 | +import configuration |
| 39 | +settings = configuration.Settings() |
| 40 | + |
| 41 | +import exceptions |
| 42 | +import messages |
| 43 | + |
| 44 | +try: |
| 45 | + import psyco |
| 46 | + psyco.full() |
| 47 | +except ImportError: |
| 48 | + pass |
| 49 | + |
| 50 | + |
| 51 | +#RE_ERROR_LOCATION = re.compile('\d+') |
| 52 | +#RE_NUMERIC_CHARACTER = re.compile('&#?\w+;') |
| 53 | + |
def check_if_process_is_running(pid):
    '''
    Report whether a process with process id @pid currently exists.
    @pid is the numeric id of the process to probe

    Returns True when the process could be found and False otherwise.
    Unexpected errors are printed and reported as False.
    '''
    import errno

    try:
        if settings.OS == 'Windows':
            PROCESS_TERMINATE = 1
            kernel32 = ctypes.windll.kernel32
            handle = kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
            if handle == 0:
                # No handle was opened, so there is nothing to close.
                return False
            kernel32.CloseHandle(handle)
            return True
        else:
            try:
                # Signal 0 only performs the error checking, no signal is
                # actually delivered to the process.
                os.kill(pid, 0)
            except OSError as error:
                # EPERM: the process exists but belongs to someone else.
                if error.errno == errno.EPERM:
                    return True
                raise
            return True
    except Exception as error:
        print(error)
        return False
| 70 | + |
| 71 | + |
def read_raw_data(fh):
    '''
    Generator that yields the tab-separated fields of every non-empty line.
    @fh should be a file object (or any iterable of strings)

    Surrounding whitespace is stripped from each line before it is split;
    lines that are empty after stripping are skipped.
    '''
    for raw in fh:
        stripped = raw.strip()
        if stripped != '':
            yield stripped.split('\t')
| 83 | + |
| 84 | + |
| 85 | + |
| 86 | + |
| 87 | + |
| 88 | +# read / write data related functions |
def read_data_from_csv(location, filename, encoding):
    '''
    Generator that yields the lines of a text file one by one.
    @location is the directory that contains the file
    @filename is the name of the file, it is handed to create_txt_filehandle
    @encoding is usually utf-8

    The file is closed when the generator is exhausted, closed early or
    aborted by an exception.
    '''
    fh = create_txt_filehandle(location, filename, 'r', encoding)
    try:
        for line in fh:
            yield line
    finally:
        # Also runs when the consumer stops iterating before the last line.
        fh.close()
| 100 | + |
| 101 | + |
def create_directory(path):
    '''
    Create the directory @path.
    @path is the directory to create, its parent should already exist

    Returns True when the directory was created and False when os.mkdir
    failed (for example because the directory already exists).
    '''
    try:
        os.mkdir(path)
        return True
    except OSError:
        # os.mkdir signals failure with OSError, not IOError.
        return False
| 108 | + |
| 109 | + |
def determine_file_extension(filename):
    '''
    Return the part of @filename that follows the last dot.
    When @filename does not contain a dot the complete filename is returned.
    '''
    before, dot, after = filename.rpartition('.')
    return after
| 113 | + |
| 114 | + |
def determine_file_mode(extension):
    '''
    Pick the write mode that belongs to a file extension: 'w' when
    @extension is listed in settings.ascii_extensions, 'wb' otherwise.
    '''
    is_ascii = extension in settings.ascii_extensions
    return 'w' if is_ascii else 'wb'
| 124 | + |
| 125 | + |
def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
    '''
    Write the elements of a list, separated by tabs, to an open file.
    @data is a list which can contain other lists that will be written as a
    single line to a textfile
    @fh is a handle to an open text file
    @recursive is reset to False on entry; nested lists are written by
    calling this function again with recursive=True and newline=False
    @newline when true, a newline is written after the elements
    @format is handed to write_dict_to_csv for elements that are dictionaries

    Returns True when the most recent nested call returned a true value,
    otherwise the function falls off the end and returns None.

    The calling function is responsible for:
        1) closing the filehandle
    '''
    # tab records whether a separator has to precede the next element.
    tab = False
    # NOTE(review): wrote_newline is assigned but never read in this function.
    wrote_newline = None
    if recursive:
        recursive = False
    for x, d in enumerate(data):
        if tab:
            fh.write('\t')
        if type(d) == type([]):
            recursive = write_list_to_csv(d, fh, recursive=True, newline=False)
            #when there is a list of lists but no other elements in the first list
            #then write a newline.
            # NOTE(review): d is data[x], so this comparison always holds.
            if len(d) == len(data[x]):
                fh.write('\n')
        elif type(d) == type({}):
            # tab takes whatever write_dict_to_csv returns.
            tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
        else:
            fh.write('%s' % d)
            tab = True

    if recursive:
        tab = False
        return True
    if newline:
        fh.write('\n')
| 159 | + |
| 160 | + |
def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
    '''
    Write selected entries of a dictionary as tab-separated text.
    @data is the dictionary to write, its values can be plain values, lists
    or nested dictionaries
    @fh is a handle to an open text file
    @keys is the sequence of keys of @data to write, in that order
    @write_key when true, every key is written in front of its value
    @format is either 'long' (one value per line) or 'wide' (all values on
    a single line that is terminated with a newline)

    Nested dictionaries are written recursively without their keys.
    The calling function is responsible for closing the filehandle.
    '''
    assert format == 'long' or format == 'wide'

    if format == 'long':
        for key in keys:
            if write_key:
                fh.write('%s\t' % key)
            value = data[key]
            if type(value) is list:
                for d in value:
                    fh.write('%s\t%s\n' % (key, d))
            elif type(value) is dict:
                write_dict_to_csv(value, fh, value.keys(), write_key=False, format=format)
            else:
                fh.write('%s\n' % value)
    elif format == 'wide':
        for key in keys:
            if write_key:
                fh.write('%s\t' % key)
            value = data[key]
            if type(value) is list:
                for d in value:
                    # Write the element itself instead of a literal '%s'.
                    fh.write('%s\t' % (d,))
            elif type(value) is dict:
                write_dict_to_csv(value, fh, value.keys(), write_key=False, format=format)
            else:
                fh.write('%s\t' % value)
        fh.write('\n')
| 190 | + |
| 191 | + #if type(data[key]) == type([]): |
| 192 | + # write_list_to_csv(data[key], fh, recursive=False, newline=True) |
| 193 | + |
def create_txt_filehandle(location, name, mode, encoding):
    '''
    Open a text file with codecs.open and return the filehandle.
    @location is the directory of the file
    @name is the filename or a callable, it is resolved by construct_filename
    with '.csv' as extension
    @mode is the mode in which the file is opened
    @encoding is the encoding of the file

    The calling function is responsible for closing the filehandle.
    '''
    resolved = construct_filename(name, '.csv')
    return codecs.open(os.path.join(location, resolved), mode, encoding=encoding)
| 198 | + |
| 199 | + |
def create_binary_filehandle(location, filename, mode):
    '''
    Open the file @filename in directory @location with the builtin open
    and return the filehandle.
    @mode is handed to open unchanged

    The calling function is responsible for closing the filehandle.
    '''
    return open(os.path.join(location, filename), mode)
| 203 | + |
| 204 | + |
def construct_filename(name, extension):
    '''
    Derive a filename from a callable or hand back a given filename.
    @name is either a callable or a filename
    @extension is appended to the name of the callable, it is ignored when
    @name is not a callable

    Returns '<name of the callable><extension>' for a callable, otherwise
    @name unchanged.
    '''
    if hasattr(name, '__call__'):
        # __name__ is available on functions in Python 2 and 3 alike.
        return '%s%s' % (name.__name__, extension)
    else:
        return name
| 210 | + |
| 211 | + |
def delete_file(location, filename, directory=False):
    '''
    Delete a single file or a complete directory tree.
    @location is the directory that contains the file, or the directory to
    remove when @directory is true
    @filename is the name of the file to delete, it is ignored when
    @directory is true
    @directory when true, @location and everything below it is removed

    Errors while deleting are printed, they are not raised.
    '''
    if directory:
        try:
            shutil.rmtree(location)
        except Exception as error:
            print(error)
        return

    if not check_file_exists(location, filename):
        return
    try:
        path = os.path.join(location, filename)
        os.remove(path)
    except OSError as error:
        # OSError also covers WindowsError, which only exists on Windows.
        print(error)
| 225 | + |
| 226 | + |
def determine_filesize(location, filename):
    '''
    Return the value of os.path.getsize for the file @filename in
    directory @location.
    '''
    return os.path.getsize(os.path.join(location, filename))
| 230 | + |
| 231 | + |
def check_file_exists(location, filename):
    '''
    Check whether @filename exists in directory @location.
    @filename is either a filename or a callable; a callable is converted
    into a filename by construct_filename with '.bin' as extension

    Returns True when the path exists, False otherwise.
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    return os.path.exists(os.path.join(location, filename))
| 239 | + |
| 240 | + |
def which(program):
    '''
    Locate an executable.
    @program is either a path to an executable or the bare name of a
    program that is looked up in the directories of the PATH environment
    variable

    Returns the path of the first executable file that is found.
    Raises exceptions.FileNotFoundException when nothing is found.
    '''
    def is_exe(fpath):
        # isfile keeps directories (which carry the execute bit as well)
        # from being reported as programs.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        # An unset PATH is treated as an empty search path.
        for path in os.environ.get("PATH", "").split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    raise exceptions.FileNotFoundException(program)
| 256 | + |
| 257 | + |
def store_object(object, location, filename):
    '''
    Pickle object
    @object is the object to pickle
    @location is the directory in which the file is written
    @filename is either a filename or a callable; a callable is converted
    into a filename by construct_filename and '.bin' is appended when the
    filename does not already end with it
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    if not filename.endswith('.bin'):
        filename = filename + '.bin'
    fh = create_binary_filehandle(location, filename, 'wb')
    try:
        cPickle.dump(object, fh)
    finally:
        # Close the file even when the object cannot be pickled.
        fh.close()
| 269 | + |
| 270 | + |
def load_object(location, filename):
    '''
    Load pickled object
    @location is the directory that contains the file
    @filename is either a filename or a callable; a callable is converted
    into a filename by construct_filename and '.bin' is appended when the
    filename does not already end with it

    Returns the unpickled object.

    NOTE: unpickling can execute arbitrary code, only load files that were
    written by a trusted source.
    '''
    if hasattr(filename, '__call__'):
        filename = construct_filename(filename, '.bin')
    if not filename.endswith('.bin'):
        filename = filename + '.bin'
    fh = create_binary_filehandle(location, filename, 'rb')
    try:
        obj = cPickle.load(fh)
    finally:
        # Close the file even when the content cannot be unpickled.
        fh.close()
    return obj
| 283 | + |
| 284 | + |
| 285 | + |
def create_dict_from_csv_file(location, filename, encoding, keys=None):
    '''
    Build a dictionary from a tab-separated text file.
    The first column of every line is the main key of the dictionary.
    A line with exactly one further column stores that column as a list
    with a single element. For any other number of columns a @keys
    sequence has to be supplied; the columns are then stored as a
    dictionary in which the keys are paired with the values in sequence.
    '''
    result = {}
    for raw in read_data_from_csv(location, filename, encoding):
        fields = raw.strip().split('\t')
        main_key, values = fields[0], fields[1:]
        if len(values) == 1:
            result[main_key] = values
            continue
        assert keys != None
        result[main_key] = dict(zip(keys, values))
    return result
| 307 | + |
| 308 | + |
def determine_canonical_name(filename):
    '''
    Determine the name of a file by stripping away all extensions.
    @filename is the name of the file, for example 'enwiki.xml.bz2'

    Returns everything in front of the first dot; a filename without a
    dot is returned unchanged.
    '''
    # Cutting at the first dot removes every extension at once and cannot
    # touch text in front of it, unlike replacing each extension string.
    return filename.split('.')[0]
| 318 | + |
| 319 | + |
def retrieve_file_list(location, extension, mask=None):
    '''
    Retrieve a list of files from a specified location.
    @location: either an absolute or relative path
    @extension: only include files of which the part after the last dot
    ends with extension
    @mask: only include files of which the part in front of the first dot
    matches mask from its start (optional), this is interpreted as a
    regular expression.

    Names without a dot are never included.

    @return: a list of files matching the criteria
    '''
    pattern = re.compile(mask) if mask else re.compile(r'[\w\d*]')
    matches = []
    for entry in os.listdir(location):
        parts = entry.split('.')
        if len(parts) == 1:
            continue
        if not pattern.match(parts[0]):
            continue
        if parts[-1].endswith(extension):
            matches.append(entry)
    return matches
| 343 | + |
| 344 | + |
def merge_list(datalist):
    '''
    Flatten a container of sequences into a single list.
    @datalist is iterated and every item is used to index @datalist again,
    for example a dictionary of lists such as split_list returns

    Returns one list with the elements of all sequences.
    '''
    return [element for index in datalist for element in datalist[index]]
| 351 | + |
| 352 | + |
def split_list(datalist, maxval):
    '''
    Split a list in at most @maxval consecutive chunks.
    @datalist is the list to split
    @maxval is the maximum number of chunks

    Returns a dictionary that maps the chunk number (starting at 0) to a
    slice of @datalist. Every element of @datalist ends up in exactly one
    chunk; an empty @datalist results in {0: []}.
    '''
    size = len(datalist)
    # Round the chunk length up so that maxval chunks always cover the
    # complete list, and never use a chunk length of 0.
    parts = max(1, -(-size // maxval))
    chunks = {}
    for x in range(maxval):
        start = x * parts
        chunks[x] = datalist[start:start + parts]
        if start + parts >= size:
            break
    return chunks
| 364 | + |
| 365 | + |
def debug():
    '''
    Placeholder that is called when this module is run as a script, it
    currently does nothing.
    '''
    pass

# Only call debug when the module is executed directly.
if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/utils/file_utils.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 371 | + native |
Added: svn:mime-type |
2 | 372 | + text/plain |
Index: trunk/tools/editor_trends/utils/text_utils.py |
— | — | @@ -0,0 +1,53 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import datetime |
| 22 | +import time |
| 23 | +import sys |
| 24 | + |
| 25 | +sys.path.append('..') |
| 26 | +import configuration |
| 27 | +settings = configuration.Settings() |
| 28 | + |
| 29 | + |
def convert_timestamp_to_date(timestamp):
    '''
    Parse the first 10 characters of @timestamp with settings.date_format
    and return the resulting datetime object.
    '''
    date_part = timestamp[:10]
    return datetime.datetime.strptime(date_part, settings.date_format)
| 32 | + |
| 33 | + |
def convert_timestamp_to_datetime_naive(timestamp):
    '''
    Parse @timestamp with settings.timestamp_format and return the
    resulting datetime object, no timezone is attached.
    '''
    parse = datetime.datetime.strptime
    return parse(timestamp, settings.timestamp_format)
| 36 | + |
| 37 | + |
def convert_timestamp_to_datetime_utc(timestamp):
    '''
    Parse @timestamp with convert_timestamp_to_datetime_naive.

    Despite its name this function returns the naive datetime unchanged:
    no tzinfo is attached because doing so crashes pymongo.
    '''
    d = convert_timestamp_to_datetime_naive(timestamp)
    #return d.replace(tzinfo=tz) #enabling this line crashes pymongo
    return d
| 43 | + |
| 44 | + |
| 45 | + |
def invert_dict(dictionary):
    '''
    @dictionary is a simple dictionary containing simple values, ie. no
    lists, or other dictionaries
    output: a new dictionary in which every value of @dictionary is a key
    and every key its value.
    '''
    inverted = {}
    for key, value in dictionary.items():
        inverted[value] = key
    return inverted
| 53 | + |
| 54 | + |
Property changes on: trunk/tools/editor_trends/utils/text_utils.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 55 | + native |