r92095 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r92094‎ | r92095 | r92096 >
Date:20:36, 13 July 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Some logic to extract data from tokenized files
Logic to handle vandal data, and to classify converted vandals
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORFileLoader.py (added) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/WSORFileLoader.py
@@ -0,0 +1,287 @@
 2+"""
 3+
 4+ WSOR dataloader class to process file contents
 5+
 6+
 7+ e.g. '/home/rfaulkner/trunk/projects/data/en.editor_first_and_last.20.tsv'
 8+
 9+"""
 10+
 11+
 12+""" Meta """
 13+__author__ = "Ryan Faulkner"
 14+__revision__ = "$Rev$"
 15+__date__ = "July 11th, 2011"
 16+
 17+
 18+""" Import python base modules """
 19+import sys, getopt, re, datetime, logging, settings
 20+
 21+""" Modify the classpath to include local projects """
 22+sys.path.append(settings.__project_home__)
 23+
 24+""" Import Analytics modules """
 25+import WSOR.scripts.classes.WSORSlaveDataLoader as WSORSDL
 26+
# Configure the root logger: DEBUG level and above, written to stderr with a
# "Mon-DD HH:MM:SS LEVEL message" layout.
LOGGING_STREAM = sys.stderr
logging.basicConfig(
    level=logging.DEBUG,
    stream=LOGGING_STREAM,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%b-%d %H:%M:%S'
)
 30+
 31+
 32+"""
 33+
 34+ DataLoader class to import file based data
 35+
 36+"""
class WSORFileLoader(object):
    """Base loader that reads a file and hands back its raw text.

    Attributes:
        contents: dict created empty by the constructor. This class never
            fills it; it is a container for data parsed by subclasses.
    """

    def __init__(self):
        """Log the creation of the object and initialise ``contents``."""
        logging.info('Creating object %s' % str(type(self)))
        self.contents = dict()

    def process_file(self, filename):
        """Return the entire contents of ``filename`` as a single string.

        Args:
            filename: path of the file to read (opened in text mode).

        Returns:
            The full text of the file, or '' when the file is empty.

        Raises:
            IOError / OSError: if the file cannot be opened or read.
        """
        # The context manager closes the handle even when reading fails, and
        # a single read() avoids re-copying the accumulated text per line.
        with open(filename, 'r') as handle:
            return handle.read()
 61+
 62+
 63+"""
 64+
 65+ File reader for token separated value text files
 66+
 67+ Maintains a dictionary that stores lists for each token field.
 68+
 69+ This also inherits the database methods
 70+
 71+"""
class WSORTokenizedTextFileLoader(WSORFileLoader, WSORSDL.WSORSlaveDataLoader):
    """Reader and writer for token separated value text files.

    The first line of a file is treated as the header. ``self.contents`` maps
    every header field to a list holding that column's values in row order.
    The class also inherits from WSORSlaveDataLoader, which is defined outside
    this file.

    NOTE(review): lines are split without stripping the line terminator, so
    the last header field and the last value of each row keep their trailing
    newline character. This is left untouched because existing callers may
    rely on those exact keys and values - confirm before normalising.
    """

    def __init__(self, token_separator):
        """Initialise the loader.

        Args:
            token_separator: string that delimits the fields on each line.
        """
        # Only the file-loader parent constructor is invoked here.
        # NOTE(review): WSORSlaveDataLoader.__init__ is never called - confirm
        # the database parent needs no construction-time setup.
        WSORFileLoader.__init__(self)
        self.token_separator = token_separator

    def get_records(self):
        """Return the dictionary of parsed columns (``self.contents``)."""
        return self.contents

    def process_file_header(self, filename):
        """Open ``filename``, parse its header line and prepare the columns.

        Stores the list of header fields in ``self.file_header_contents`` and
        creates an empty list in ``self.contents`` for each of them.

        Args:
            filename: path of the token separated file.

        Returns:
            The open file object, positioned just after the header line. It
            is returned rather than stored as a member because its state
            changes as the caller reads on; the caller must close it.
        """
        handle = open(filename, 'r')
        try:
            header_line = handle.readline()
            self.file_header_contents = header_line.split(self.token_separator)

            for field in self.file_header_contents:
                self.contents[field] = list()

            logging.info('File header processed: %s' % str(self.file_header_contents))
        except Exception:
            # Do not leak the handle when header parsing fails.
            handle.close()
            raise

        return handle

    def process_file(self, filename):
        """Parse a token separated file into ``self.contents``.

        Every data row is split on the token separator and its values are
        appended to the lists of the matching header fields. Rows with fewer
        elements than the header are logged and omitted; extra trailing
        elements are ignored.

        Args:
            filename: path of the token separated file.

        Returns:
            ``self.contents``: dict mapping header field to list of values.
        """
        handle = self.process_file_header(filename)
        header = self.file_header_contents
        num_fields = len(header)

        logging.info('Processing rows...')

        try:
            # Iterating the handle visits each remaining line exactly once.
            # The previous read-before-loop plus read-inside-loop pattern
            # dropped the first data row and treated the empty end-of-file
            # read as a (malformed) record.
            for line in handle:
                elems = line.split(self.token_separator)

                if len(elems) < num_fields:
                    logging.error('Error processing row:')
                    logging.error('Too few elements in record. Omitting row.')
                    continue

                # zip() stops at the header length, ignoring surplus elements.
                for field, elem in zip(header, elems):
                    self.contents[field].append(elem)
        finally:
            handle.close()

        logging.info('Processing complete.')

        return self.contents

    def write_dict_to_file(self, dict_obj, filename):
        """Write a dict of equally indexed columns to a token separated file.

        The header line holds the dict keys; each following line holds one
        record with its values joined by the token separator. No newline is
        written after the final record.

        Args:
            dict_obj: dict mapping field name (str) to a sequence of values.
            filename: path of the file to create or overwrite.

        Errors raised while writing are logged and not re-raised; a failure
        to open the file propagates to the caller.
        """
        # list() keeps this working where keys() returns a view, not a list.
        fields = list(dict_obj.keys())
        separator = self.token_separator

        # Although every field should hold the same number of entries, always
        # go by the shortest column so no index runs past the end.
        if fields:
            num_records = min(len(dict_obj[field]) for field in fields)
        else:
            num_records = 0

        with open(filename, 'w') as out:
            try:
                out.write(separator.join(fields) + '\n')

                # Join each record's values explicitly so the last record gets
                # its separators too (it used to be written as one
                # concatenated token).
                rows = [
                    separator.join(str(dict_obj[field][i]) for field in fields)
                    for i in range(num_records)
                ]
                out.write('\n'.join(rows))

            except Exception as e:
                logging.error(e)
 193+
 194+
 195+
 196+"""
 197+
 198+ Custom loader that operates on rows in /home/rfaulkner/trunk/projects/data/en.editor_first_and_last.20.tsv
 199+
 200+ This file stores the first and last edit info of 500K users
 201+
 202+ ['fes_edits',
 203+ 'user_id',
 204+ 'last10_reverted',
 205+ 'fes_vandalism',
 206+ 'first_edit',
 207+ 'last10_edits',
 208+ 'fes_reverted',
 209+ 'last_edit',
 210+ 'fes_deleted',
 211+ 'user_name',
 212+ 'last10_vandalism',
 213+ 'last10_deleted\n']
 214+
 215+"""
class WSOR_custom1_Loader(WSORTokenizedTextFileLoader):
    """Loader for the tab separated editor first/last edit data file.

    On construction the file is parsed immediately, so ``self.contents`` holds
    one list per header field as soon as the object exists.
    """

    DEFAULT_FILENAME = '/home/rfaulkner/trunk/projects/data/en.editor_first_and_last.20.tsv'

    def __init__(self, filename=None):
        """Parse the data file into ``self.contents``.

        Args:
            filename: optional path of the tab separated file; when omitted
                the original hard-coded location is used.
        """
        if filename is None:
            filename = self.DEFAULT_FILENAME
        self._filename_ = filename

        WSORTokenizedTextFileLoader.__init__(self, '\t')
        self.process_file(self._filename_)

    def find_coverted_users(self, portion):
        """Collect users whose vandalism stopped after their first session.

        A user is selected when both hold:
          1. the first edit session contains vandalism (fes_vandalism > 0)
          2. there is no vandalism in the last 10 edits (last10_vandalism == 0)

        Args:
            portion: currently unused; kept so existing callers keep working.

        Returns:
            dict with keys 'user_id' and 'user_name', each a list with one
            entry per selected user, in file order.

        Raises:
            KeyError: if a required column is missing from ``self.contents``.
            ValueError: if a vandalism count is not an integer string.
        """
        converted_users = {'user_id': list(), 'user_name': list()}

        # Walk the four columns in lockstep instead of indexing through the
        # length of an arbitrary first key.
        rows = zip(
            self.contents['last10_vandalism'],
            self.contents['fes_vandalism'],
            self.contents['user_id'],
            self.contents['user_name']
        )

        for last10_vandalism, fes_vandalism, user_id, user_name in rows:
            if int(last10_vandalism) == 0 and int(fes_vandalism) > 0:
                converted_users['user_id'].append(user_id)
                converted_users['user_name'].append(user_name)

        return converted_users

    def get_user_revisions(self, user_id):
        """Fetch all rows of the revision table that belong to ``user_id``.

        The column names of the result are stored in ``self._col_names_`` and
        the fetched rows in ``self._results_``. Query errors are logged and
        not re-raised.

        Args:
            user_id: numeric user id, as an int or an integer string.
        """
        # Coerce to int before building the statement so arbitrary text can
        # never end up inside the SQL string.
        try:
            user_id = int(user_id)
        except (TypeError, ValueError) as inst:
            logging.error('Invalid user id %r: %s' % (user_id, inst))
            return

        self.init_db()

        # The statement previously had no table name after "from" and could
        # never execute; the query targets the revision table.
        sql_stmnt = 'select * from revision where rev_user = %d' % user_id

        logging.info('Running query on revision table...')

        try:
            self._cur_.execute(sql_stmnt)

            # The first item of each cursor description entry is the column name.
            self._col_names_ = [column[0] for column in self._cur_.description]

            self._results_ = self._cur_.fetchall()

        except Exception as inst:

            logging.error(type(inst))     # the exception instance
            logging.error(inst.args)      # arguments stored in .args
            logging.error(inst)           # __str__ allows args to printed directly

        finally:
            # Release the connection whatever happened above.
            self.close_db()
 287+
 288+
\ No newline at end of file
Property changes on: trunk/tools/wsor/scripts/classes/WSORFileLoader.py
___________________________________________________________________
Added: svn:eol-style
1289 + native

Status & tagging log