r86286 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86285‎ | r86286 | r86287 >
Date:21:12, 17 April 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Added reporting and loading classes for handling confidence testing; these were imported from compute_confidence.py. Also added some documentation.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataReporting.py (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
@@ -119,7 +119,7 @@
120120
121121
122122 """ Load the SQL File & Format """
123 - filename = self._sql_path_+ query_name + '.sql'
 123+ filename = self._sql_path_ + query_name + '.sql'
124124 sql_stmnt = mh.read_sql(filename)
125125
126126 sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval])
@@ -205,4 +205,133 @@
206206 class BannerLPReportingLoader(DataLoader):
207207
208208 def run_query(self):
209 - return
\ No newline at end of file
 209+ return
 210+
 211+
 212+
 213+class HypothesisTestLoader(DataLoader):
 214+
 215+ """
 216+ Execute data acquisition for hypothesis tester
 217+
 218+ INPUT:
 219+ query_name -
 220+ metric_name -
 221+ campaign -
 222+ item_1 -
 223+ item_2 -
 224+ start_time -
 225+ end_time -
 226+ interval -
 227+ num_samples -
 228+
 229+ RETURN:
 230+ metrics_1 -
 231+ metrics_2 -
 232+ times_indices -
 233+
 234+ """
 235+ def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
 236+
 237+ """ retrieve time lists with timestamp format 1 (yyyyMMddhhmmss) """
 238+ ret = TP.get_time_lists(start_time, end_time, interval, num_samples, 1)
 239+ times = ret[0]
 240+ times_indices = ret[1]
 241+
 242+ self.init_db()
 243+
 244+ filename = self._sql_path_ + query_name + '.sql'
 245+ sql_stmnt = mh.read_sql(filename)
 246+
 247+ metric_index = QD.get_metric_index(query_name, metric_name)
 248+ metrics_1 = []
 249+ metrics_2 = []
 250+
 251+ for i in range(len(times) - 1):
 252+
 253+ # print '\nExecuting number ' + str(i) + ' batch of of data.'
 254+ t1 = times[i]
 255+ t2 = times[i+1]
 256+
 257+ formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
 258+ formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
 259+
 260+ try:
 261+ err_msg = formatted_sql_stmnt_1
 262+
 263+ self._cur_.execute(formatted_sql_stmnt_1)
 264+ results_1 = self._cur_.fetchone() # there should only be a single row
 265+
 266+ err_msg = formatted_sql_stmnt_2
 267+
 268+ self._cur_.execute(formatted_sql_stmnt_2)
 269+ results_2 = self._cur_.fetchone() # there should only be a single row
 270+
 271+ except Exception as inst:
 272+ print type(inst) # the exception instance
 273+ print inst.args # arguments stored in .args
 274+ print inst # __str__ allows args to printed directly
 275+
 276+ self._db_.rollback()
 277+ sys.exit("Database Interface Exception:\n" + err_msg)
 278+
 279+ metrics_1.append(results_1[metric_index])
 280+ metrics_2.append(results_2[metric_index])
 281+
 282+ #print metrics_1
 283+ #print metrics_2
 284+
 285+ self.close_db()
 286+
 287+ # return the metric values at each time
 288+ return [metrics_1, metrics_2, times_indices]
 289+
 290+
 291+
 292+"""
 293+
 294+ CLASS :: TTestLoaderHelp
 295+
 296+ Provides data access particular to the t-test
 297+
 298+ METHODS:
 299+ init_db -
 300+ close_db -
 301+"""
 302+class TTestLoaderHelp(DataLoader):
 303+
 304+ """
 305+ This method knows about faulkner.t_test. This is a lookup table for p-values
 306+ given the degrees of freedom and statistic t test
 307+
 308+ INPUT:
 309+ degrees_of_freedom -
 310+ t -
 311+
 312+ RETURN:
 313+ p -
 314+
 315+ """
 316+ def get_pValue(self, degrees_of_freedom, t):
 317+
 318+ self.init_db()
 319+
 320+ select_stmnt = 'select max(p) from t_test where degrees_of_freedom = ' + str(degrees_of_freedom) + ' and t >= ' + str(t)
 321+
 322+ try:
 323+ self._cur_.execute(select_stmnt)
 324+ results = self._cur_.fetchone()
 325+
 326+ if results[0] != None:
 327+ p = float(results[0])
 328+ else:
 329+ p = .0005
 330+ except:
 331+ self._db_.rollback()
 332+ self._db_.close()
 333+ sys.exit('Could not execute: ' + select_stmnt)
 334+
 335+ self._db_.close()
 336+
 337+ return p
 338+
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataReporting.py
@@ -240,17 +240,33 @@
241241
242242 """
243243
244 -CLASS :: ^TotalAmountsReporting^
 244+ CLASS :: ^TotalAmountsReporting^
 245+
 246+ This subclass handles reporting on total amounts for the fundraiser.
245247
246 -This subclass handles reporting on total amounts for the fundraiser.
247 -
248248 """
249249
250250 class TotalAmountsReporting(DataReporting):
251251
 252+ """
 253+ <description>
 254+
 255+ INPUT:
 256+
 257+ RETURN:
 258+
 259+ """
252260 def __init__(self):
253261 self.data = []
 262+
 263+ """
 264+ <description>
254265
 266+ INPUT:
 267+
 268+ RETURN:
 269+
 270+ """
255271 def run_query(self, start_time, end_time, query_name, descriptor):
256272
257273 self.init_db()
@@ -333,7 +349,14 @@
334350 pylab.savefig(fname+'.png', format='png')
335351
336352
337 -
 353+ """
 354+ <description>
 355+
 356+ INPUT:
 357+
 358+ RETURN:
 359+
 360+ """
338361 def run_hr(self, type):
339362
340363
@@ -384,7 +407,14 @@
385408 self.gen_plot(time_range, counts, labels, title, xlabel, ylabel, ranges, subplot_index, fname)
386409
387410
388 -
 411+ """
 412+ <description>
 413+
 414+ INPUT:
 415+
 416+ RETURN:
 417+
 418+ """
389419 def run_day(self,type):
390420
391421 # Current date & time
@@ -431,7 +461,14 @@
432462 fname = query_name + descriptor + '_' + type
433463 self.gen_plot(time_range, counts, labels, title, xlabel, ylabel, ranges, subplot_index, fname)
434464
 465+ """
 466+ <description>
435467
 468+ INPUT:
 469+
 470+ RETURN:
 471+
 472+ """
436473 def get_query_fields(self, labels, counts, type, start_time, end_time):
437474
438475 if type == 'BAN_EM':
@@ -467,15 +504,22 @@
468505
469506 """
470507
471 -CLASS :: ^BannerLPReporting^
 508+ CLASS :: ^BannerLPReporting^
 509+
 510+ This subclass handles reporting on banners and landing pages for the fundraiser.
472511
473 -This subclass handles reporting on banners and landing pages for the fundraiser.
474 -
475512 """
476513
477514 class BannerLPReporting(DataReporting):
478515
 516+ """
 517+ <description>
479518
 519+ INPUT:
 520+
 521+ RETURN:
 522+
 523+ """
480524 def __init__(self, *args):
481525
482526 if len(args) == 2:
@@ -484,7 +528,14 @@
485529 else:
486530 self.campaign = None
487531 self.start_time = None
 532+ """
 533+ <description>
488534
 535+ INPUT:
 536+
 537+ RETURN:
 538+
 539+ """
489540 def run_query(self,start_time, end_time, campaign, query_name, metric_name):
490541
491542 self.init_db()
@@ -572,7 +623,14 @@
573624 # return [metric_lists, time_norm, table_data]
574625 return [metric_lists, time_norm]
575626
 627+ """
 628+ <description>
576629
 630+ INPUT:
 631+
 632+ RETURN:
 633+
 634+ """
577635 def gen_plot(self,counts, times, title, xlabel, ylabel, ranges, subplot_index, fname):
578636 pylab.subplot(subplot_index)
579637 pylab.figure(num=None,figsize=[26,14])
@@ -597,7 +655,14 @@
598656
599657
600658 """
601 -
 659+
 660+ <description>
 661+
 662+ INPUT:
 663+
 664+ RETURN:
 665+
 666+
602667 type = 'LP' || 'BAN' || 'BAN-TEST' || 'LP-TEST'
603668
604669 """
@@ -692,7 +757,15 @@
693758
694759 return [metrics, times]
695760
 761+ """ !! MOVE INTO DATA LOADER!!
696762
 763+ <description>
 764+
 765+ INPUT:
 766+
 767+ RETURN:
 768+
 769+ """
697770 def get_latest_campaign(self):
698771
699772 query_name = 'report_latest_campaign'
@@ -725,12 +798,15 @@
726799
727800 return [campaign, timestamp]
728801
729 - """
730 -
 802+ """ !! SHOULD BE MOVED TO TIMEPROCESSOR !!
731803 Takes as input and converts it to a set of hours counting back from 0
 804+ <description>
732805
733 - time_lists - a dictionary of timestamp lists
 806+ INPUT:
 807+ time_lists - a dictionary of timestamp lists
734808 time_norm - a dictionary of normalized times
 809+
 810+ RETURN:
735811
736812 """
737813 def normalize_timestamps(self, time_lists):
@@ -774,6 +850,14 @@
775851
776852 class MinerReporting(DataReporting):
777853
 854+ """
 855+ <description>
 856+
 857+ INPUT:
 858+
 859+ RETURN:
 860+
 861+ """
778862 def run_query(self, start_time, end_time, query_name):
779863
780864 self.init_db()
@@ -816,8 +900,15 @@
817901 return [counts, time_norm]
818902
819903
820 - # Create histograms for hourly counts
821 -
 904+ """
 905+ Create histograms for hourly counts
 906+ <description>
 907+
 908+ INPUT:
 909+
 910+ RETURN:
 911+
 912+ """
822913 def gen_plot(self,counts, times, title, xlabel, ylabel, ranges, subplot_index, fname):
823914
824915 pylab.subplot(subplot_index)
@@ -835,7 +926,15 @@
836927
837928 pylab.title(title)
838929 pylab.savefig(fname, format='png')
 930+
 931+ """
 932+ <description>
839933
 934+ INPUT:
 935+
 936+ RETURN:
 937+
 938+ """
840939 def run(self, query_name):
841940
842941 # Current date & time
@@ -883,26 +982,33 @@
884983
885984 """
886985
887 -CLASS :: IntervalReporting
 986+ CLASS :: IntervalReporting
 987+
 988+ Performs queries that take timestamps, query, and an interval as arguments. Data for a single metric
 989+ is generated for each time interval in the time period defined by the start and end timestamps.
 990+
 991+ Types of queries supported:
 992+
 993+ report_banner_metrics_minutely
 994+ report_LP_metrics_minutely
888995
889 -Performs queries that take timestamps, query, and an interval as arguments. Data for a single metric
890 -is generated for each time interval in the time period defined by the start and end timestamps.
891 -
892 -Types of queries supported:
893 -
894 -report_banner_metrics_minutely
895 -report_LP_metrics_minutely
896 -
897996 """
898997
899998 class IntervalReporting(DataReporting):
900999
 1000+ """
 1001+ <description>
 1002+
 1003+ INPUT:
 1004+
 1005+ RETURN:
 1006+
9011007 """
902 - """
9031008 def __init__(self):
9041009 self._data_loader_ = DL.IntervalReportingLoader()
9051010
9061011 """
 1012+ <description>
9071013 """
9081014 def usage(self):
9091015
@@ -918,7 +1024,13 @@
9191025 return
9201026
9211027 """
922 - Execute reporting query and generate plots
 1028+ Execute reporting query and generate plots
 1029+ <description>
 1030+
 1031+ INPUT:
 1032+
 1033+ RETURN:
 1034+
9231035 """
9241036 def gen_plot(self, metrics, times, title, xlabel, ylabel, ranges, subplot_index, fname, labels):
9251037
@@ -970,7 +1082,13 @@
9711083
9721084
9731085 """
974 - Execute reporting query and generate plots
 1086+ Execute reporting query and generate plots
 1087+ <description>
 1088+
 1089+ INPUT:
 1090+
 1091+ RETURN:
 1092+
9751093 """
9761094 def run(self, start_time, end_time, interval, query_type, metric_name, campaign, labels):
9771095
@@ -1022,3 +1140,246 @@
10231141 self.gen_plot(counts, times, title, xlabel, ylabel, ranges, subplot_index, fname, labels)
10241142
10251143
 1144+"""
 1145+
 1146+ CLASS :: ConfidenceReporting
 1147+
 1148+ Reports confidence values on specified metrics
 1149+
 1150+ Types of queries supported:
 1151+
 1152+ report_banner_confidence
 1153+ report_LP_confidence
 1154+
 1155+"""
 1156+
 1157+class ConfidenceReporting(DataReporting):
 1158+
 1159+ _hypothesis_test_ = None
 1160+
 1161+
 1162+ """
 1163+
 1164+ Constructor for confidence reporting class
 1165+
 1166+ INPUT:
 1167+
 1168+ hypothesis_test - an instance reflecting the type of test being used
 1169+
 1170+
 1171+ """
 1172+ def __init__(self, hypothesis_test):
 1173+
 1174+ """ check to make sure this is in fact a hypothsis test """
 1175+ self._hypothesis_test_ = hypothesis_test
 1176+ self._data_loader_ = HypothesisTestLoader()
 1177+
 1178+ """
 1179+ Describes how to run a report !! MODIFY !!
 1180+ """
 1181+ def usage(self):
 1182+
 1183+ print 'Types of queries:'
 1184+ print ' (1) banner'
 1185+ print ' (2) LP'
 1186+ print ''
 1187+ print 'e.g.'
 1188+ print " run('20101230160400', '20101230165400', 2, 'banner', 'imp', '20101230JA091_US')"
 1189+ print " run('20101230160400', '20101230165400', 2, 'LP', 'views', '20101230JA091_US')"
 1190+ print ''
 1191+
 1192+ return
 1193+
 1194+
 1195+ """
 1196+ <description>
 1197+
 1198+ INPUT:
 1199+
 1200+ RETURN:
 1201+
 1202+ """
 1203+ def gen_plot(self,means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname):
 1204+
 1205+ file_format = 'png'
 1206+
 1207+ pylab.subplot(subplot_index)
 1208+ pylab.figure(num=None,figsize=[26,14])
 1209+
 1210+ e1 = pylab.errorbar(times_indices, means_1, yerr=std_devs_1, fmt='xb-')
 1211+ e2 = pylab.errorbar(times_indices, means_2, yerr=std_devs_2, fmt='dr-')
 1212+ # pylab.hist(counts, times)
 1213+
 1214+ """ Set the figure and font size """
 1215+ fig_width_pt = 246.0 # Get this from LaTeX using \showthe\columnwidth
 1216+ inches_per_pt = 1.0/72.27 # Convert pt to inch
 1217+ golden_mean = (math.sqrt(5)-1.0)/2.0 # Aesthetic ratio
 1218+ fig_width = fig_width_pt*inches_per_pt # width in inches
 1219+ fig_height = fig_width*golden_mean # height in inches
 1220+ fig_size = [fig_width,fig_height]
 1221+
 1222+ font_size = 20
 1223+
 1224+ params = { 'axes.labelsize': font_size,
 1225+ 'text.fontsize': font_size,
 1226+ 'xtick.labelsize': font_size,
 1227+ 'ytick.labelsize': font_size,
 1228+ 'legend.pad': 0.1, # empty space around the legend box
 1229+ 'legend.fontsize': font_size,
 1230+ 'font.size': font_size,
 1231+ 'text.usetex': False,
 1232+ 'figure.figsize': fig_size}
 1233+
 1234+ pylab.rcParams.update(params)
 1235+
 1236+ pylab.grid()
 1237+ pylab.ylim(ranges[2], ranges[3])
 1238+ pylab.xlim(ranges[0], ranges[1])
 1239+ pylab.legend([e1[0], e2[0]], labels,loc=2)
 1240+
 1241+ pylab.xlabel(xlabel)
 1242+ pylab.ylabel(ylabel)
 1243+
 1244+ pylab.title(title)
 1245+ pylab.savefig(fname + '.' + file_format, format=file_format)
 1246+
 1247+
 1248+ """
 1249+ Print in Tabular form the means and standard deviation of each group over each
 1250+ interval
 1251+
 1252+ INPUT:
 1253+
 1254+ RETURN:
 1255+
 1256+ """
 1257+ def print_metrics(self, filename, metric_name, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call):
 1258+
 1259+ filename += '.txt'
 1260+ file = open(filename, 'w')
 1261+
 1262+ """ Compute % increase and report """
 1263+ av_means_1 = sum(means_1) / len(means_1)
 1264+ av_means_2 = sum(means_2) / len(means_2)
 1265+ percent_increase = math.fabs(av_means_1 - av_means_2) / min(av_means_1,av_means_2) * 100.0
 1266+
 1267+ """ Compute the average standard deviations """
 1268+ av_std_dev_1 = 0
 1269+ av_std_dev_2 = 0
 1270+
 1271+ for i in range(len(std_devs_1)):
 1272+ av_std_dev_1 = av_std_dev_1 + math.pow(std_devs_1[i], 2)
 1273+ av_std_dev_2 = av_std_dev_2 + math.pow(std_devs_2[i], 2)
 1274+
 1275+ av_std_dev_1 = math.pow(av_std_dev_1, 0.5) / len(std_devs_1)
 1276+ av_std_dev_2 = math.pow(av_std_dev_2, 0.5) / len(std_devs_1)
 1277+
 1278+ """ Assign the winner """
 1279+ if av_means_1 > av_means_2:
 1280+ winner = labels[0]
 1281+ else:
 1282+ winner = labels[1]
 1283+
 1284+ win_str = "\nThe winner " + winner + " had a %.2f%s increase."
 1285+ win_str = win_str % (percent_increase, '%')
 1286+
 1287+ print '\nCOMMAND = ' + test_call
 1288+ file.write('\nCOMMAND = ' + test_call)
 1289+
 1290+
 1291+ print '\n\n' + metric_name
 1292+ print '\nitem 1 = ' + labels[0]
 1293+ print 'item 2 = ' + labels[1]
 1294+ print win_str
 1295+ print '\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
 1296+ file.write('\n\n' + metric_name)
 1297+ file.write('\nitem 1 = ' + labels[0] + '\n')
 1298+ file.write('\nitem 2 = ' + labels[1] + '\n')
 1299+ file.write(win_str)
 1300+ file.write('\n\ninterval\tmean1\t\tmean2\t\tstddev1\t\tstddev2\n\n')
 1301+
 1302+
 1303+ """ Print out the parameters for each interval """
 1304+
 1305+ for i in range(len(times_indices)):
 1306+ line_args = str(i) + '\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
 1307+ line_str = line_args % (means_1[i], means_2[i], std_devs_1[i], std_devs_2[i])
 1308+ print line_str
 1309+ file.write(line_str)
 1310+
 1311+ """ Print out the averaged parameters """
 1312+ line_args = '%.5f\t\t' + '%.5f\t\t' + '%.5f\t\t' + '%.5f\n'
 1313+ line_str = line_args % (av_means_1, av_means_2, av_std_dev_1, av_std_dev_2)
 1314+
 1315+ print '\n\nOverall Parameters -- the confidence test was run with these parameters:\n'
 1316+ print '\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n'
 1317+ print line_str
 1318+
 1319+ file.write('\n\nOverall Parameters:\n')
 1320+ file.write('\nmean1\t\tmean2\t\tstddev1\t\tstddev2\n')
 1321+ file.write(line_str)
 1322+
 1323+
 1324+ file.close()
 1325+
 1326+ """
 1327+ Executes the test reporting
 1328+
 1329+ INPUT:
 1330+
 1331+ RETURN:
 1332+
 1333+ """
 1334+ def run(self, test_name, query_name, metric_name, campaign, items, start_time, end_time, interval, num_samples):
 1335+
 1336+ """ TEMPORARY - map items and labels, this should be more generalized """
 1337+ counter = 1
 1338+ for key in items.keys():
 1339+ if counter == 1:
 1340+ item_1 = items[key]
 1341+ label_1 = key
 1342+ elif counter == 2:
 1343+ item_2 = items[key]
 1344+ label_2 = key
 1345+ counter += 1
 1346+
 1347+ """ Retrieve values from database """
 1348+ ret = _data_loader_.query_tables(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples)
 1349+ metrics_1 = ret[0]
 1350+ metrics_2 = ret[1]
 1351+ times_indices = ret[2]
 1352+
 1353+ """ run the confidence test """
 1354+ ret = _hypothesis_test_.confidence_test(metrics_1, metrics_2, num_samples)
 1355+ means_1 = ret[0]
 1356+ means_2 = ret[1]
 1357+ std_devs_1 = ret[2]
 1358+ std_devs_2 = ret[3]
 1359+ confidence = ret[4]
 1360+
 1361+ """ plot the results """
 1362+ xlabel = 'Hours'
 1363+ subplot_index = 111
 1364+ fname = './tests/' + campaign + '_conf_' + metric_name
 1365+
 1366+ title = confidence + '\n\n' + test_name + ' -- ' + TP.timestamp_convert_format(start_time,1,2) + ' - ' + TP.timestamp_convert_format(end_time,1,2)
 1367+
 1368+ max_mean = max(max(means_1),max(means_2))
 1369+ max_sd = max(max(std_devs_1),max(std_devs_2))
 1370+ max_y = float(max_mean) + float(max_sd)
 1371+ max_y = max_y + 0.1 * max_y
 1372+ max_x = max(times_indices) + min(times_indices)
 1373+ ranges = [0.0, max_x, 0, max_y]
 1374+
 1375+ ylabel = metric_name
 1376+ labels = [label_1, label_2]
 1377+
 1378+ self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
 1379+
 1380+ """ Print out results """
 1381+ test_call = "run_test('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \
 1382+ item_1 + "', '" + item_2 + "', '" + start_time + "', '" + end_time + "', " + str(interval) + ", " + str(num_samples) + ")"
 1383+ self.print_metrics(fname, title, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call)
 1384+
 1385+ return
 1386+
\ No newline at end of file

Status & tagging log