r92095 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r92094‎ \| r92095 \| r92096 >
Date:	20:36, 13 July 2011
Author:	rfaulk
Status:	deferred
Tags:
Comment:	Some logic to extract data from tokenized files. Logic to handle vandal data, and to classify converted vandals.
Modified paths:	/trunk/tools/wsor/scripts/classes/WSORFileLoader.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/WSORFileLoader.py
—	—	@@ -0,0 +1,287 @@
	2	+"""
	3	+
	4	+ WSOR dataloader class to process file contents
	5	+
	6	+
	7	+ e.g. '/home/rfaulkner/trunk/projects/data/en.editor_first_and_last.20.tsv'
	8	+
	9	+"""
	10	+
	11	+
	12	+""" Meta """
	13	+__author__ = "Ryan Faulkner"
	14	+__revision__ = "$Rev$"
	15	+__date__ = "July 11th, 2011"
	16	+
	17	+
	18	+""" Import python base modules """
	19	+import sys, getopt, re, datetime, logging, settings
	20	+
	21	+""" Modify the classpath to include local projects """
	22	+sys.path.append(settings.__project_home__)
	23	+
	24	+""" Import Analytics modules """
	25	+import WSOR.scripts.classes.WSORSlaveDataLoader as WSORSDL
	26	+
	27	+""" Configure the logger """
	28	+LOGGING_STREAM = sys.stderr
	29	+logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
	30	+
	31	+
	32	+"""
	33	+
	34	+ DataLoader class to import file based data
	35	+
	36	+"""
	37	+class WSORFileLoader(object):
	38	+
	39	+ def __init__(self):
	40	+
	41	+ logging.info('Creating object %s' % str(type(self)))
	42	+ self.contents = dict()
	43	+
	44	+ return
	45	+
	46	+ """
	47	+ The base class simply spills the file contents
	48	+ """
	49	+ def process_file(self, filename):
	50	+ file = open(filename, 'r')
	51	+
	52	+ file_contents = ''
	53	+ line = file.readline()
	54	+ while (line != ''):
	55	+ file_contents = file_contents + line
	56	+ line = file.readline()
	57	+
	58	+ file.close()
	59	+
	60	+ return file_contents
	61	+
	62	+
	63	+"""
	64	+
	65	+ File reader for token separated value text files
	66	+
	67	+ Maintains a dictionary that stores lists for each token field.
	68	+
	69	+ This also inherits the database methods
	70	+
	71	+"""
	72	+class WSORTokenizedTextFileLoader(WSORFileLoader, WSORSDL.WSORSlaveDataLoader):
	73	+
	74	+ def __init__(self, token_separator):
	75	+
	76	+ """ Call the parent constructor"""
	77	+ WSORFileLoader.__init__(self)
	78	+
	79	+ self.token_separator = token_separator
	80	+
	81	+ return
	82	+
	83	+ """
	84	+ Get method for data object
	85	+ """
	86	+ def get_records(self):
	87	+ return self.contents
	88	+
	89	+ """
	90	+ Pre-processing and helper for process_file
	91	+ """
	92	+ def process_file_header(self, filename):
	93	+
	94	+ """ Open the file and read the first line """
	95	+ file = open(filename, 'r')
	96	+ line = file.readline()
	97	+ self.file_header_contents = line.split(self.token_separator)
	98	+
	99	+ for field in self.file_header_contents:
	100	+ self.contents[field] = list()
	101	+
	102	+ logging.info('File header processed: %s' % str(self.file_header_contents))
	103	+
	104	+ """ return the file object rather than assigning as a member since it's state may change """
	105	+ return file
	106	+
	107	+
	108	+ """
	109	+ Override to process a token separator text file
	110	+ """
	111	+ def process_file(self, filename):
	112	+
	113	+ """ Process the header - get an index of fields """
	114	+ file = self.process_file_header(filename)
	115	+ field_index = range(len(self.contents.keys()))
	116	+
	117	+ """ process the remainder of the file """
	118	+ logging.info('Processing rows...')
	119	+
	120	+ line = file.readline()
	121	+ while (line != ''):
	122	+
	123	+ try:
	124	+ line = file.readline()
	125	+ elems = line.split(self.token_separator)
	126	+
	127	+ if len(elems) < len(field_index):
	128	+ raise IndexError('Too few elements in record. Omitting row.')
	129	+
	130	+ for i in field_index:
	131	+ field = self.file_header_contents[i]
	132	+ elem = elems[i]
	133	+ self.contents[field].append(elem)
	134	+
	135	+ except Exception as inst:
	136	+
	137	+ logging.error('Error processing row:')
	138	+ #logging.error(type(inst)) # the exception instance
	139	+ #logging.error(inst.args) # arguments stored in .args
	140	+ logging.error(inst) # __str__ allows args to printed directly
	141	+
	142	+ pass
	143	+
	144	+ logging.info('Processing complete.')
	145	+ file.close()
	146	+
	147	+ return self.contents
	148	+
	149	+ """
	150	+ Output data to a token separated file
	151	+ """
	152	+ def write_dict_to_file(self, dict_obj, filename):
	153	+
	154	+ fields = dict_obj.keys()
	155	+ num_fields = len(fields)
	156	+ field_index = range(len(fields))
	157	+
	158	+ """ Although the number of entries for each field should be the same always take the key with the fewest elements """
	159	+ num_records = len(dict_obj[fields[0]])
	160	+ for field in fields:
	161	+ if len(dict_obj[field]) < num_records:
	162	+ num_records = len(dict_obj[field])
	163	+
	164	+ index = range(num_records)
	165	+
	166	+ file = open(filename, 'w')
	167	+
	168	+ try:
	169	+ """ Write the header """
	170	+ for fi in field_index:
	171	+ if fi == num_fields - 1:
	172	+ file.write(fields[fi] + '\n')
	173	+ else:
	174	+ file.write(fields[fi] + self.token_separator)
	175	+
	176	+ """ Write the data """
	177	+ for i in index:
	178	+ for fi in field_index:
	179	+ if fi == num_fields - 1 and i != num_records - 1:
	180	+ file.write(str(dict_obj[fields[fi]][i]) + '\n')
	181	+ elif i != num_records - 1:
	182	+ file.write(str(dict_obj[fields[fi]][i]) + self.token_separator)
	183	+ else:
	184	+ file.write(str(dict_obj[fields[fi]][i]))
	185	+
	186	+ except Exception as e:
	187	+
	188	+ logging.error(e)
	189	+
	190	+ finally:
	191	+
	192	+ file.close()
	193	+
	194	+
	195	+
	196	+"""
	197	+
	198	+ Custom loader that operates on rows in /home/rfaulkner/trunk/projects/data/en.editor_first_and_last.20.tsv
	199	+
	200	+ This file stores the first and last edit info of 500K users
	201	+
	202	+ ['fes_edits',
	203	+ 'user_id',
	204	+ 'last10_reverted',
	205	+ 'fes_vandalism',
	206	+ 'first_edit',
	207	+ 'last10_edits',
	208	+ 'fes_reverted',
	209	+ 'last_edit',
	210	+ 'fes_deleted',
	211	+ 'user_name',
	212	+ 'last10_vandalism',
	213	+ 'last10_deleted\n']
	214	+
	215	+"""
	216	+class WSOR_custom1_Loader(WSORTokenizedTextFileLoader):
	217	+
	218	+ def __init__(self):
	219	+
	220	+ self._filename_ = '/home/rfaulkner/trunk/projects/data/en.editor_first_and_last.20.tsv'
	221	+
	222	+ WSORTokenizedTextFileLoader.__init__(self, '\t')
	223	+ self.process_file(self._filename_)
	224	+
	225	+ def find_coverted_users(self, portion):
	226	+
	227	+ converted_users = dict()
	228	+ #converted_users['last10_vandalism'] = list()
	229	+ converted_users['user_id'] = list()
	230	+ converted_users['user_name'] = list()
	231	+
	232	+ index = range(len(self.contents[self.contents.keys()[0]]))
	233	+
	234	+ """
	235	+ for i in index:
	236	+ if int(self.contents['last10_vandalism'][i]) > portion:
	237	+
	238	+ converted_users['last10_vandalism'].append(self.contents['last10_vandalism'][i])
	239	+ converted_users['user_id'].append(self.contents['user_id'][i])
	240	+ converted_users['user_name'].append(self.contents['user_name'][i])
	241	+ """
	242	+
	243	+ """
	244	+ Get all users that have:
	245	+
	246	+ 1. A fisrt edit session of vandalism
	247	+ 2. No vandalism in their last 10 edits
	248	+ """
	249	+ for i in index:
	250	+ if int(self.contents['last10_vandalism'][i]) == 0 and int(self.contents['fes_vandalism'][i]) > 0:
	251	+
	252	+ #converted_users['last10_vandalism'].append(self.contents['last10_vandalism'][i])
	253	+ converted_users['user_id'].append(self.contents['user_id'][i])
	254	+ converted_users['user_name'].append(self.contents['user_name'][i])
	255	+
	256	+ return converted_users
	257	+
	258	+
	259	+ def get_user_revisions(self, user_id):
	260	+
	261	+ self.init_db()
	262	+
	263	+ sql_stmnt = 'select * from where rev_user = %s' % str(user_id)
	264	+
	265	+ logging.info('Running query on revision table...')
	266	+
	267	+ try:
	268	+ self._cur_.execute(sql_stmnt)
	269	+
	270	+ """ GET THE COLUMN NAMES FROM THE QUERY RESULTS """
	271	+ self._col_names_ = list()
	272	+ for i in self._cur_.description:
	273	+ self._col_names_.append(i[0])
	274	+
	275	+ self._results_ = self._cur_.fetchall()
	276	+
	277	+ except Exception as inst:
	278	+
	279	+ logging.error(type(inst)) # the exception instance
	280	+ logging.error(inst.args) # arguments stored in .args
	281	+ logging.error(inst) # __str__ allows args to printed directly
	282	+
	283	+
	284	+
	285	+
	286	+ self.close_db()
	287	+
	288	+
\ No newline at end of file
Property changes on: trunk/tools/wsor/scripts/classes/WSORFileLoader.py
___________________________________________________________________
Added: svn:eol-style
1	289	+ native

Status & tagging log

21:19, 13 July 2011 😂 (talk | contribs) changed the status of r92095 [removed: new added: deferred]