r86600 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r86599 | r86600 | r86601 >
Date:07:45, 21 April 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Imported confidence testing reporting, data, and testing functionality into DataReporting, DataLoader, TimestampProcessor, and HypothesisTest. These aspects of the hypothesis test reporting have been decoupled.
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataReporting.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/TimestampProcessor.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py
@@ -101,10 +101,17 @@
102102
103103 self._data_loader_._db_.rollback()
104104 sys.exit("Database Interface Exception:\n" + err_msg)
105 -
106 - metrics_1.append(results_1[metric_index])
107 - metrics_2.append(results_2[metric_index])
108105
 106+ """ If no results are returned in this set the sample value is 0.0 """
 107+ try:
 108+ metrics_1.append(results_1[metric_index])
 109+ except TypeError:
 110+ metrics_1.append(0.0)
 111+ try:
 112+ metrics_2.append(results_2[metric_index])
 113+ except TypeError:
 114+ metrics_2.append(0.0)
 115+
109116 #print metrics_1
110117 #print metrics_2
111118
@@ -323,17 +330,17 @@
324331 # Compute the mean and variance for each group across all trials
325332 for i in range(num_trials):
326333
327 - m1 = 0 # mean of group 1
328 - m2 = 0 # mean of group 2
329 - var1 = 0 # variance of group 1
330 - var2 = 0 # variance of group 2
 334+ m1 = 0.0 # mean of group 1
 335+ m2 = 0.0 # mean of group 2
 336+ var1 = 0.0 # variance of group 1
 337+ var2 = 0.0 # variance of group 2
331338
332339 for j in range(num_samples):
333340 index = i * num_samples + j
334341
335342 # Compute mean for each group
336 - m1 = m1 + metrics_1[index]
337 - m2 = m2 + metrics_2[index]
 343+ m1 = m1 + float(metrics_1[index])
 344+ m2 = m2 + float(metrics_2[index])
338345
339346 m1 = m1 / num_samples
340347 m2 = m2 / num_samples
@@ -342,8 +349,8 @@
343350 for j in range(num_samples):
344351 index = i + j
345352
346 - var1 = var1 + math.pow((metrics_1[i] - m1), 2)
347 - var2 = var2 + math.pow((metrics_2[i] - m2), 2)
 353+ var1 = var1 + math.pow((float(metrics_1[i]) - m1), 2)
 354+ var2 = var2 + math.pow((float(metrics_2[i]) - m2), 2)
348355
349356 means_1.append(float(m1))
350357 means_2.append(float(m2))
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
@@ -158,18 +158,20 @@
159159 """ If the first element is not the start time add it
160160 this will be the case if there is no data for the first interval
161161 NOTE: two datapoints are added at the beginning to define the first interval """
 162+ times[key_name].append(start_time_obj)
 163+ times[key_name].append(start_time_obj + interval_obj)
 164+
162165 if start_time_obj_str != row[time_index]:
163 - times[key_name].append(start_time_obj)
164166 metrics[key_name].append(0.0)
 167+ metrics[key_name].append(0.0)
165168
166 - times[key_name].append(start_time_obj + interval_obj)
167 - metrics[key_name].append(0.0)
168 - else:
169 - metrics[key_name].append(row[metric_index])
170169 times[key_name].append(time_obj)
171 -
172 - metrics[key_name].append(row[metric_index])
173170 times[key_name].append(time_obj + interval_obj)
 171+
 172+ metrics[key_name].append(row[metric_index])
 173+ metrics[key_name].append(row[metric_index])
 174+
 175+ final_time[key_name] = row[time_index]
174176
175177
176178 except Exception as inst:
@@ -181,7 +183,8 @@
182184 sys.exit(0)
183185
184186
185 - """ Ensure that the last time in the list is the endtime less the interval """
 187+ """ Ensure that the last time in the list is the endtime less the interval """
 188+
186189 for key in times.keys():
187190 if final_time[key] != end_time_obj_str:
188191 times[key].append(end_time_obj)
@@ -317,8 +320,16 @@
318321 self._db_.rollback()
319322 sys.exit("Database Interface Exception:\n" + err_msg)
320323
321 - metrics_1.append(results_1[metric_index])
322 - metrics_2.append(results_2[metric_index])
 324+ """ If no results are returned in this set the sample value is 0.0
 325+ !! MODIFY -- these results should not count as data points !! """
 326+ try:
 327+ metrics_1.append(results_1[metric_index])
 328+ except TypeError:
 329+ metrics_1.append(0.0)
 330+ try:
 331+ metrics_2.append(results_2[metric_index])
 332+ except TypeError:
 333+ metrics_2.append(0.0)
323334
324335 #print metrics_1
325336 #print metrics_2
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataReporting.py
@@ -30,6 +30,7 @@
3131 import miner_help as mh
3232 import TimestampProcessor as TP
3333 import DataLoader as DL
 34+import HypothesisTest as HT
3435
3536 matplotlib.use('Agg')
3637
@@ -51,8 +52,40 @@
5253 """
5354 class DataReporting(object):
5455
 56+ _font_size_ = 24
 57+ _fig_width_pt_ = 246.0 # Get this from LaTeX using \showthe\columnwidth
 58+ _inches_per_pt_ = 1.0/72.27 # Convert pt to inch
 59+ _use_labels_= False
 60+ _fig_file_format_ = 'png'
 61+ _plot_type_ = 'line'
 62+ _item_keys_ = list()
5563 _data_loader_ = None
5664
 65+
 66+ def __init__(self, **kwargs):
 67+
 68+ for key in kwargs:
 69+
 70+ if key == 'font_size':
 71+ self._font_size_ = kwargs[key]
 72+ elif key == 'fig_width_pt':
 73+ self._fig_width_pt_ = kwargs[key]
 74+ elif key == 'inches_per_pt':
 75+ self._inches_per_pt_ = kwargs[key]
 76+ elif key == 'use_labels':
 77+ self._use_labels_ = kwargs[key]
 78+ elif key == 'fig_file_format':
 79+ self._fig_file_format_ = kwargs[key]
 80+ elif key == 'plot_type':
 81+ self._plot_type_ = kwargs[key]
 82+ elif key == 'item_keys':
 83+ self._item_keys_ = kwargs[key]
 84+ elif key == 'data_loader': # Set custom data loaders
 85+ if kwargs[key] == 'campaign_interval':
 86+ self._data_loader_ = DL.CampaignIntervalReportingLoader()
 87+
 88+ print self._data_loader_.__str__
 89+
5790 """
5891
5992 Smooths a list of values
@@ -928,7 +961,7 @@
929962 pylab.savefig(fname, format='png')
930963
931964 """
932 - <description>
 965+ Entry point and definition for execution of miner reporting
933966
934967 INPUT:
935968
@@ -996,13 +1029,9 @@
9971030
9981031 class IntervalReporting(DataReporting):
9991032
1000 - _font_size_ = 24
1001 - _fig_width_pt_ = 246.0 # Get this from LaTeX using \showthe\columnwidth
1002 - _inches_per_pt_ = 1.0/72.27 # Convert pt to inch
1003 - _use_labels_= False
1004 - _fig_file_format_ = 'png'
1005 - _plot_type_ = 'line'
 1033+
10061034
 1035+
10071036 """
10081037 Constructor for IntervalReporting
10091038
@@ -1014,26 +1043,10 @@
10151044 def __init__(self, **kwargs):
10161045
10171046 self._data_loader_ = DL.IntervalReportingLoader()
1018 -
1019 - for key in kwargs:
1020 -
1021 - if key == 'font_size':
1022 - self._font_size_ = kwargs[key]
1023 - elif key == 'fig_width_pt':
1024 - self._fig_width_pt_ = kwargs[key]
1025 - elif key == 'inches_per_pt':
1026 - self._inches_per_pt_ = kwargs[key]
1027 - elif key == 'use_labels':
1028 - self._use_labels_ = kwargs[key]
1029 - elif key == 'fig_file_format':
1030 - self._fig_file_format_ = kwargs[key]
1031 - elif key == 'plot_type':
1032 - self._plot_type_ = kwargs[key]
1033 - elif key == 'data_loader': # Set custom data loaders
1034 - if kwargs[key] == 'campaign_interval':
1035 - self._data_loader_ = DL.CampaignIntervalReportingLoader()
 1047+ DataReporting.__init__(self, **kwargs)
10361048
1037 - print self._data_loader_.__str__
 1049+
 1050+
10381051
10391052 """
10401053 <description>
@@ -1052,8 +1065,28 @@
10531066 print ''
10541067
10551068 return
1056 -
 1069+
10571070 """
 1071+ Selecting a subset of the key items in a dictionary
 1072+
 1073+ INPUT:
 1074+ dict_lists - dictionary to be parsed
 1075+
 1076+ RETURN:
 1077+ new_dict_lists - new dictionary containing only keys in self._item_keys_
 1078+ """
 1079+ def select_metric_keys(self, dict_lists):
 1080+ new_dict_lists = dict()
 1081+
 1082+ dict_lists_keys = dict_lists.keys()
 1083+
 1084+ for key in self._item_keys_:
 1085+ if key in dict_lists_keys:
 1086+ new_dict_lists[key] = dict_lists[key]
 1087+
 1088+ return new_dict_lists
 1089+
 1090+ """
10581091 Execute reporting query and generate plots
10591092 <description>
10601093
@@ -1131,6 +1164,12 @@
11321165 counts = return_val[0]
11331166 times = return_val[1]
11341167
 1168+ """ Select only the specified item keys """
 1169+ print counts.keys()
 1170+ if len(self._item_keys_) > 0:
 1171+ counts = self.select_metric_keys(counts)
 1172+ times = self.select_metric_keys(times)
 1173+
11351174 """ Convert Times to Integers that indicate relative times AND normalize the intervals in case any are missing """
11361175 for key in times.keys():
11371176 times[key] = TP.normalize_timestamps(times[key], False, 2)
@@ -1142,7 +1181,7 @@
11431182
11441183 xlabel = 'MINUTES'
11451184 subplot_index = 111
1146 - fname = campaign + '_' + metric_name
 1185+ fname = campaign + '_' + query_type + '_' + metric_name
11471186
11481187 metric_full_name = QD.get_metric_full_name(metric_name)
11491188 title = campaign + ': ' + metric_full_name + ' -- ' + TP.timestamp_convert_format(start_time,1,2) + ' - ' + TP.timestamp_convert_format(end_time,1,2)
@@ -1200,25 +1239,26 @@
12011240
12021241
12031242 """
1204 - def __init__(self, hypothesis_test):
 1243+ def __init__(self, **kwargs):
12051244
1206 - """ check to make sure this is in fact a hypothsis test """
1207 - self._hypothesis_test_ = hypothesis_test
1208 - self._data_loader_ = HypothesisTestLoader()
1209 -
 1245+
 1246+ for key in kwargs:
 1247+
 1248+ if key == 'hyp_test': # Set the hypothesis test
 1249+ if kwargs[key] == 't_test':
 1250+ self._hypothesis_test_ = HT.TTest()
 1251+
 1252+ print self._hypothesis_test_.__str__
 1253+
 1254+ self._data_loader_ = DL.HypothesisTestLoader()
 1255+ DataReporting.__init__(self, **kwargs)
 1256+
12101257 """
12111258 Describes how to run a report !! MODIFY !!
12121259 """
12131260 def usage(self):
12141261
1215 - print 'Types of queries:'
1216 - print ' (1) banner'
1217 - print ' (2) LP'
12181262 print ''
1219 - print 'e.g.'
1220 - print " run('20101230160400', '20101230165400', 2, 'banner', 'imp', '20101230JA091_US')"
1221 - print " run('20101230160400', '20101230165400', 2, 'LP', 'views', '20101230JA091_US')"
1222 - print ''
12231263
12241264 return
12251265
@@ -1303,8 +1343,8 @@
13041344 av_std_dev_1 = av_std_dev_1 + math.pow(std_devs_1[i], 2)
13051345 av_std_dev_2 = av_std_dev_2 + math.pow(std_devs_2[i], 2)
13061346
1307 - av_std_dev_1 = math.pow(av_std_dev_1, 0.5) / len(std_devs_1)
1308 - av_std_dev_2 = math.pow(av_std_dev_2, 0.5) / len(std_devs_1)
 1347+ av_std_dev_1 = math.pow(av_std_dev_1 / len(std_devs_1), 0.5)
 1348+ av_std_dev_2 = math.pow(av_std_dev_2 / len(std_devs_1), 0.5)
13091349
13101350 """ Assign the winner """
13111351 if av_means_1 > av_means_2:
@@ -1312,7 +1352,7 @@
13131353 else:
13141354 winner = labels[1]
13151355
1316 - win_str = "\nThe winner " + winner + " had a %.2f%s increase."
 1356+ win_str = '\nThe winner "' + winner + '" had a %.2f%s increase.'
13171357 win_str = win_str % (percent_increase, '%')
13181358
13191359 print '\nCOMMAND = ' + test_call
@@ -1376,13 +1416,13 @@
13771417 counter += 1
13781418
13791419 """ Retrieve values from database """
1380 - ret = _data_loader_.query_tables(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples)
 1420+ ret = self._data_loader_.run_query(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples)
13811421 metrics_1 = ret[0]
13821422 metrics_2 = ret[1]
13831423 times_indices = ret[2]
13841424
13851425 """ run the confidence test """
1386 - ret = _hypothesis_test_.confidence_test(metrics_1, metrics_2, num_samples)
 1426+ ret = self._hypothesis_test_.confidence_test(metrics_1, metrics_2, num_samples)
13871427 means_1 = ret[0]
13881428 means_2 = ret[1]
13891429 std_devs_1 = ret[2]
@@ -1409,7 +1449,7 @@
14101450 self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname)
14111451
14121452 """ Print out results """
1413 - test_call = "run_test('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \
 1453+ test_call = "run('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \
14141454 item_1 + "', '" + item_2 + "', '" + start_time + "', '" + end_time + "', " + str(interval) + ", " + str(num_samples) + ")"
14151455 self.print_metrics(fname, title, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call)
14161456
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/TimestampProcessor.py
@@ -423,14 +423,14 @@
424424 time_indices - list of indices counting from zero marking the indices for reporting test interval parameters
425425
426426 """
427 -def get_time_lists(self, start_time, end_time, interval, num_samples, format):
 427+def get_time_lists(start_time, end_time, interval, num_samples, format):
428428
429429 """ range must be divisible by interval - convert to hours """
430430 range = float(interval * num_samples) / 60
431431
432432 """ Compose times """
433 - start_datetime = dt.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14]))
434 - end_datetime = dt.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14]))
 433+ start_datetime = datetime.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14]))
 434+ end_datetime = datetime.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14]))
435435
436436 """ current timestamp and hour index """
437437 curr_datetime = start_datetime
@@ -474,7 +474,7 @@
475475
476476
477477 """ increment the time by interval minutes """
478 - td = dt.timedelta(minutes=interval)
 478+ td = datetime.timedelta(minutes=interval)
479479 curr_datetime = curr_datetime + td
480480
481481 """ append the last items onto time lists """

Status & tagging log