r80727 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r80726‎ \| r80727 \| r80728 >
Date:	23:12, 21 January 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Split utils.py into two separate files: 1) file_utils.py (all kinds of filesystem-related helper functions) 2) text_utils.py (all kinds of string / text-parsing-related helper functions)
Modified paths:	/trunk/tools/editor_trends/utils/file_utils.py (added) (history) /trunk/tools/editor_trends/utils/text_utils.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/utils/file_utils.py
—	—	@@ -0,0 +1,369 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2010-10-21'
	19	+__version__ = '0.1'
	20	+
	21	+'''
	22	+The utils module contains helper functions that will be needed throughout.
	23	+It provides functions to read / write data to text and binary files, fix markup
	24	+and track error messages.
	25	+'''
	26	+
	27	+import re
	28	+import htmlentitydefs
	29	+import cPickle
	30	+import codecs
	31	+import os
	32	+import ctypes
	33	+import sys
	34	+import shutil
	35	+sys.path.append('..')
	36	+
	37	+
	38	+import configuration
	39	+settings = configuration.Settings()
	40	+
	41	+import exceptions
	42	+import messages
	43	+
	44	+try:
	45	+ import psyco
	46	+ psyco.full()
	47	+except ImportError:
	48	+ pass
	49	+
	50	+
	51	+#RE_ERROR_LOCATION = re.compile('\d+')
	52	+#RE_NUMERIC_CHARACTER = re.compile('&#?\w+;')
	53	+
	54	+def check_if_process_is_running(pid):
	55	+ try:
	56	+ if settings.OS == 'Windows':
	57	+ PROCESS_TERMINATE = 1
	58	+ handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
	59	+ ctypes.windll.kernel32.CloseHandle(handle)
	60	+ if handle != 0:
	61	+ return True
	62	+ else:
	63	+ return False
	64	+ else:
	65	+ os.kill(pid, 0)
	66	+ return True
	67	+ except Exception, error:
	68	+ print error
	69	+ return False
	70	+
	71	+
	72	+def read_raw_data(fh):
	73	+ '''
	74	+ @fh should be a file object
	75	+ '''
	76	+ for line in fh:
	77	+ line = line.strip()
	78	+ if line == '':
	79	+ continue
	80	+ else:
	81	+ line = line.split('\t')
	82	+ yield line
	83	+
	84	+
	85	+
	86	+
	87	+
	88	+# read / write data related functions
	89	+def read_data_from_csv(location, filename, encoding):
	90	+ '''
	91	+ @filename is the path (either absolute or relative) including the name of
	92	+ of the file
	93	+ @encoding is usually utf-8
	94	+ '''
	95	+ fh = create_txt_filehandle(location, filename, 'r', encoding)
	96	+ for line in fh:
	97	+ yield line
	98	+
	99	+ fh.close()
	100	+
	101	+
	102	+def create_directory(path):
	103	+ try:
	104	+ os.mkdir(path)
	105	+ return True
	106	+ except IOError:
	107	+ return False
	108	+
	109	+
	110	+def determine_file_extension(filename):
	111	+ pos = filename.rfind('.') + 1
	112	+ return filename[pos:]
	113	+
	114	+
	115	+def determine_file_mode(extension):
	116	+ '''
	117	+ Checks if a given extension is an ASCII extension or not. The settings file
	118	+ provides known ASCII extensions.
	119	+ '''
	120	+ if extension in settings.ascii_extensions:
	121	+ return 'w'
	122	+ else:
	123	+ return 'wb'
	124	+
	125	+
def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
    '''
    Write the elements of @data to @fh as tab-separated text.

    @data is a list which can contain other lists that will be written as a
    single line to a textfile
    @fh is a handle to an open text file; only its write() method is used
    @recursive is set to True by the function itself when it descends into a
    nested list
    @newline determines whether a newline is written after the last element
    @format is passed on to write_dict_to_csv for dictionary elements; it is
    not passed on to nested lists, which always use the default

    The calling function is responsible for:
    1) closing the filehandle

    NOTE(review): the nesting below was reconstructed from a listing that lost
    its indentation - confirm against the repository copy.
    '''
    # tab remembers whether a separator is needed before the next element
    tab = False
    # wrote_newline is assigned here but never read
    wrote_newline = None
    if recursive:
        recursive = False
    for x, d in enumerate(data):
        if tab:
            fh.write('\t')
        if type(d) == type([]):
            recursive = write_list_to_csv(d, fh, recursive=True, newline=False)
            #when there is a list of lists but no other elements in the first list
            #then write a newline.
            # d is data[x], so this comparison is always true and a newline
            # follows every nested list
            if len(d) == len(data[x]):
                fh.write('\n')
        elif type(d) == type({}):
            # write_dict_to_csv has no return statement, so tab becomes None
            tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
        else:
            fh.write('%s' % d)
            tab = True

    # NOTE(review): recursive can only be truthy here if the last nested call
    # returned True, which in turn requires this same branch - it looks
    # unreachable, so the function appears to always return None. Confirm.
    if recursive:
        tab = False
        return True
    if newline:
        fh.write('\n')
	159	+
	160	+
	161	+def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
	162	+ assert format == 'long' or format == 'wide'
	163	+
	164	+ if format == 'long':
	165	+ for key in keys:
	166	+ if write_key:
	167	+ fh.write('%s\t' % key)
	168	+ if type(data[key]) == type([]):
	169	+ for d in data[key]:
	170	+ fh.write('%s\t%s\n' % (key, d))
	171	+ elif type(data[key]) == type({}):
	172	+ write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
	173	+# elif getattr(data[key], '__iter__', False):
	174	+# for d in data[key]:
	175	+# fh.write('%s\t%s\t%s\n' % (key, d, data[key][d]))
	176	+ else:
	177	+ fh.write('%s\n' % (data[key]))
	178	+ elif format == 'wide':
	179	+ for key in keys:
	180	+ if write_key:
	181	+ fh.write('%s\t' % key)
	182	+ if type(data[key]) == type([]):
	183	+ for d in data[key]:
	184	+ fh.write('%s\t')
	185	+ elif type(data[key]) == type({}):
	186	+ write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
	187	+ else:
	188	+ fh.write('%s\t' % (data[key]))
	189	+ fh.write('\n')
	190	+
	191	+ #if type(data[key]) == type([]):
	192	+ # write_list_to_csv(data[key], fh, recursive=False, newline=True)
	193	+
	194	+def create_txt_filehandle(location, name, mode, encoding):
	195	+ filename = construct_filename(name, '.csv')
	196	+ path = os.path.join(location, filename)
	197	+ return codecs.open(path, mode, encoding=encoding)
	198	+
	199	+
	200	+def create_binary_filehandle(location, filename, mode):
	201	+ path = os.path.join(location, filename)
	202	+ return open(path, mode)
	203	+
	204	+
	205	+def construct_filename(name, extension):
	206	+ if hasattr(name, '__call__'):
	207	+ return '%s%s' % (name.func_name, extension)
	208	+ else:
	209	+ return name
	210	+
	211	+
	212	+def delete_file(location, filename, directory=False):
	213	+ if not directory:
	214	+ if check_file_exists(location, filename):
	215	+ try:
	216	+ path = os.path.join(location, filename)
	217	+ os.remove(path)
	218	+ except WindowsError, error:
	219	+ print error
	220	+ else:
	221	+ try:
	222	+ shutil.rmtree(location)
	223	+ except Exception, error:
	224	+ print error
	225	+
	226	+
	227	+def determine_filesize(location, filename):
	228	+ path = os.path.join(location, filename)
	229	+ return os.path.getsize(path)
	230	+
	231	+
	232	+def check_file_exists(location, filename):
	233	+ if hasattr(filename, '__call__'):
	234	+ filename = construct_filename(filename, '.bin')
	235	+ if os.path.exists(os.path.join(location, filename)):
	236	+ return True
	237	+ else:
	238	+ return False
	239	+
	240	+
	241	+def which(program):
	242	+ def is_exe(fpath):
	243	+ return os.path.exists(fpath) and os.access(fpath, os.X_OK)
	244	+
	245	+ fpath, fname = os.path.split(program)
	246	+ if fpath:
	247	+ if is_exe(program):
	248	+ return program
	249	+ else:
	250	+ for path in os.environ["PATH"].split(os.pathsep):
	251	+ exe_file = os.path.join(path, program)
	252	+ if is_exe(exe_file):
	253	+ return exe_file
	254	+
	255	+ raise exceptions.FileNotFoundException(program)
	256	+
	257	+
	258	+def store_object(object, location, filename):
	259	+ '''
	260	+ Pickle object
	261	+ '''
	262	+ if hasattr(filename, '__call__'):
	263	+ filename = construct_filename(filename, '.bin')
	264	+ if not filename.endswith('.bin'):
	265	+ filename = filename + '.bin'
	266	+ fh = create_binary_filehandle(location, filename, 'wb')
	267	+ cPickle.dump(object, fh)
	268	+ fh.close()
	269	+
	270	+
	271	+def load_object(location, filename):
	272	+ '''
	273	+ Load pickled object
	274	+ '''
	275	+ if hasattr(filename, '__call__'):
	276	+ filename = construct_filename(filename, '.bin')
	277	+ if not filename.endswith('.bin'):
	278	+ filename = filename + '.bin'
	279	+ fh = create_binary_filehandle(location, filename, 'rb')
	280	+ obj = cPickle.load(fh)
	281	+ fh.close()
	282	+ return obj
	283	+
	284	+
	285	+
	286	+def create_dict_from_csv_file(location, filename, encoding, keys=None):
	287	+ '''
	288	+ Constructs a dictionary from a txtfile
	289	+ The first column of the csv file should contain the main key for the dictionary.
	290	+ If there are more than one value in the values list, then a @keys variable should
	291	+ be supplied and the key sequence should match the value sequence.
	292	+ '''
	293	+ d = {}
	294	+ for line in read_data_from_csv(location, filename, encoding):
	295	+ line = line.strip()
	296	+ line = line.split('\t')
	297	+ key = line[0]
	298	+ values = line[1:]
	299	+ if len(values) == 1:
	300	+ d[key] = values
	301	+ else:
	302	+ assert keys != None
	303	+ d[key] = {}
	304	+ for k, v in zip(keys, values):
	305	+ d[key][k] = v
	306	+ return d
	307	+
	308	+
	309	+def determine_canonical_name(filename):
	310	+ '''
	311	+ Determine the name of a file by stripping away all extensions.
	312	+ '''
	313	+ while filename.find('.') > -1:
	314	+ ext = determine_file_extension(filename)
	315	+ ext = '.%s' % ext
	316	+ filename = filename.replace(ext, '')
	317	+ return filename
	318	+
	319	+
	320	+def retrieve_file_list(location, extension, mask=None):
	321	+ '''
	322	+ Retrieve a list of files from a specified location.
	323	+ @location: either an absolute or relative path
	324	+ @extension: only include files with extension (optional)
	325	+ @mask: only include files that start with mask (optional), this is
	326	+ interpreted as a regular expression.
	327	+
	328	+ @return: a list of files matching the criteria
	329	+ '''
	330	+ if mask:
	331	+ mask = re.compile(mask)
	332	+ else:
	333	+ mask = re.compile('[\w\d*]')
	334	+ all_files = os.listdir(location)
	335	+ files = []
	336	+ for file in all_files:
	337	+ file = file.split('.')
	338	+ if len(file) == 1:
	339	+ continue
	340	+ if re.match(mask, file[0]) and file[-1].endswith(extension):
	341	+ files.append('.'.join(file))
	342	+ return files
	343	+
	344	+
	345	+def merge_list(datalist):
	346	+ merged = []
	347	+ for d in datalist:
	348	+ for x in datalist[d]:
	349	+ merged.append(x)
	350	+ return merged
	351	+
	352	+
	353	+def split_list(datalist, maxval):
	354	+ chunks = {}
	355	+ a = 0
	356	+ parts = int(round(float(len(datalist)) / maxval, 0))
	357	+ for x in xrange(maxval):
	358	+ b = a + parts
	359	+ chunks[x] = datalist[a:b]
	360	+ a = (x + 1) * parts
	361	+ if a >= len(datalist):
	362	+ break
	363	+ return chunks
	364	+
	365	+
	366	+def debug():
	367	+ pass
	368	+
	369	+if __name__ == '__main__':
	370	+ debug()
Property changes on: trunk/tools/editor_trends/utils/file_utils.py
___________________________________________________________________
Added: svn:eol-style
1	371	+ native
Added: svn:mime-type
2	372	+ text/plain
Index: trunk/tools/editor_trends/utils/text_utils.py
—	—	@@ -0,0 +1,53 @@
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__author__email = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-21'
	19	+__version__ = '0.1'
	20	+
	21	+import datetime
	22	+import time
	23	+import sys
	24	+
	25	+sys.path.append('..')
	26	+import configuration
	27	+settings = configuration.Settings()
	28	+
	29	+
	30	+def convert_timestamp_to_date(timestamp):
	31	+ return datetime.datetime.strptime(timestamp[:10], settings.date_format)
	32	+
	33	+
	34	+def convert_timestamp_to_datetime_naive(timestamp):
	35	+ return datetime.datetime.strptime(timestamp, settings.timestamp_format)
	36	+
	37	+
	38	+def convert_timestamp_to_datetime_utc(timestamp):
	39	+ tz = datetime.tzinfo('utc')
	40	+ d = convert_timestamp_to_datetime_naive(timestamp)
	41	+ #return d.replace(tzinfo=tz) #enabling this line crashes pymongo
	42	+ return d
	43	+
	44	+
	45	+
	46	+def invert_dict(dictionary):
	47	+ '''
	48	+ @dictionary is a simple dictionary containing simple values, ie. no lists,
	49	+ or other dictionaries
	50	+ output: dictionary where key and value are swapped.
	51	+ '''
	52	+ return dict([[v, k] for k, v in dictionary.items()])
	53	+
	54	+
Property changes on: trunk/tools/editor_trends/utils/text_utils.py
___________________________________________________________________
Added: svn:eol-style
1	55	+ native

Status & tagging log

23:51, 21 January 2011 Reedy (talk | contribs) changed the status of r80727 [removed: new added: deferred]