Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/compute_confidence.py |
— | — | @@ -101,10 +101,17 @@ |
102 | 102 | |
103 | 103 | self._data_loader_._db_.rollback() |
104 | 104 | sys.exit("Database Interface Exception:\n" + err_msg) |
105 | | - |
106 | | - metrics_1.append(results_1[metric_index]) |
107 | | - metrics_2.append(results_2[metric_index]) |
108 | 105 | |
 | 106 | + """ If no results are returned in this set, the sample value is 0.0 """ |
| 107 | + try: |
| 108 | + metrics_1.append(results_1[metric_index]) |
| 109 | + except TypeError: |
| 110 | + metrics_1.append(0.0) |
| 111 | + try: |
| 112 | + metrics_2.append(results_2[metric_index]) |
| 113 | + except TypeError: |
| 114 | + metrics_2.append(0.0) |
| 115 | + |
109 | 116 | #print metrics_1 |
110 | 117 | #print metrics_2 |
111 | 118 | |
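
The change above substitutes 0.0 whenever a query returns no row for a sample: indexing a None result raises TypeError, which is caught and replaced by the default. A minimal standalone sketch of the same fallback, using hypothetical names rather than the committed code:

    def metric_or_default(result_row, metric_index, default=0.0):
        """ Return the requested metric from a query result row,
            or a default when the row is missing (None). """
        try:
            return result_row[metric_index]
        except TypeError:   # result_row is None -- no data for this sample
            return default

    # e.g. metrics_1.append(metric_or_default(results_1, metric_index))
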
— | — | @@ -323,17 +330,17 @@ |
324 | 331 | # Compute the mean and variance for each group across all trials |
325 | 332 | for i in range(num_trials): |
326 | 333 | |
327 | | - m1 = 0 # mean of group 1 |
328 | | - m2 = 0 # mean of group 2 |
329 | | - var1 = 0 # variance of group 1 |
330 | | - var2 = 0 # variance of group 2 |
| 334 | + m1 = 0.0 # mean of group 1 |
| 335 | + m2 = 0.0 # mean of group 2 |
| 336 | + var1 = 0.0 # variance of group 1 |
| 337 | + var2 = 0.0 # variance of group 2 |
331 | 338 | |
332 | 339 | for j in range(num_samples): |
333 | 340 | index = i * num_samples + j |
334 | 341 | |
335 | 342 | # Compute mean for each group |
336 | | - m1 = m1 + metrics_1[index] |
337 | | - m2 = m2 + metrics_2[index] |
| 343 | + m1 = m1 + float(metrics_1[index]) |
| 344 | + m2 = m2 + float(metrics_2[index]) |
338 | 345 | |
339 | 346 | m1 = m1 / num_samples |
340 | 347 | m2 = m2 / num_samples |
— | — | @@ -342,8 +349,8 @@ |
343 | 350 | for j in range(num_samples): |
344 | 351 | index = i + j |
345 | 352 | |
346 | | - var1 = var1 + math.pow((metrics_1[i] - m1), 2) |
347 | | - var2 = var2 + math.pow((metrics_2[i] - m2), 2) |
| 353 | + var1 = var1 + math.pow((float(metrics_1[i]) - m1), 2) |
| 354 | + var2 = var2 + math.pow((float(metrics_2[i]) - m2), 2) |
348 | 355 | |
349 | 356 | means_1.append(float(m1)) |
350 | 357 | means_2.append(float(m2)) |
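
The float() coercions above guard against integer metric values feeding the later divisions. Note that the inner deviation loop computes index = i + j but still reads metrics_1[i] and metrics_2[i]; this change leaves that indexing as it was. A hedged sketch of the per-trial statistics as they appear to be intended (the normalisation of the squared deviations is not visible in this hunk):

    import math

    def trial_stats(samples, num_samples, trial):
        """ Mean and summed squared deviation of one trial's samples. """
        chunk = [float(x) for x in samples[trial * num_samples:(trial + 1) * num_samples]]
        mean = sum(chunk) / num_samples
        sum_sq_dev = sum(math.pow(x - mean, 2) for x in chunk)
        return mean, sum_sq_dev
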
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py |
— | — | @@ -158,18 +158,20 @@ |
159 | 159 | """ If the first element is not the start time add it |
160 | 160 | this will be the case if there is no data for the first interval |
161 | 161 | NOTE: two datapoints are added at the beginning to define the first interval """ |
| 162 | + times[key_name].append(start_time_obj) |
| 163 | + times[key_name].append(start_time_obj + interval_obj) |
| 164 | + |
162 | 165 | if start_time_obj_str != row[time_index]: |
163 | | - times[key_name].append(start_time_obj) |
164 | 166 | metrics[key_name].append(0.0) |
| 167 | + metrics[key_name].append(0.0) |
165 | 168 | |
166 | | - times[key_name].append(start_time_obj + interval_obj) |
167 | | - metrics[key_name].append(0.0) |
168 | | - else: |
169 | | - metrics[key_name].append(row[metric_index]) |
170 | 169 | times[key_name].append(time_obj) |
171 | | - |
172 | | - metrics[key_name].append(row[metric_index]) |
173 | 170 | times[key_name].append(time_obj + interval_obj) |
| 171 | + |
| 172 | + metrics[key_name].append(row[metric_index]) |
| 173 | + metrics[key_name].append(row[metric_index]) |
| 174 | + |
| 175 | + final_time[key_name] = row[time_index] |
174 | 176 | |
175 | 177 | |
176 | 178 | except Exception as inst: |
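
The rewritten block now appends the first interval's two boundary timestamps unconditionally, zero-fills the metric when the first returned row does not start at start_time, and then records two timestamps and two copies of each row's value so every interval is bounded by a leading and a trailing datapoint (as the NOTE above describes for the first interval); final_time is tracked for the end-time check below. A rough, hypothetical helper capturing that per-interval step, not the loader's actual structure:

    def add_interval(times, metrics, interval_start, interval_obj, value):
        """ Append two datapoints so the interval has a leading
            and a trailing sample. """
        times.append(interval_start)
        times.append(interval_start + interval_obj)
        metrics.append(value)
        metrics.append(value)
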
— | — | @@ -181,7 +183,8 @@ |
182 | 184 | sys.exit(0) |
183 | 185 | |
184 | 186 | |
185 | | - """ Ensure that the last time in the list is the endtime less the interval """ |
| 187 | + """ Ensure that the last time in the list is the endtime less the interval """ |
| 188 | + |
186 | 189 | for key in times.keys(): |
187 | 190 | if final_time[key] != end_time_obj_str: |
188 | 191 | times[key].append(end_time_obj) |
— | — | @@ -317,8 +320,16 @@ |
318 | 321 | self._db_.rollback() |
319 | 322 | sys.exit("Database Interface Exception:\n" + err_msg) |
320 | 323 | |
321 | | - metrics_1.append(results_1[metric_index]) |
322 | | - metrics_2.append(results_2[metric_index]) |
 | 324 | + """ If no results are returned in this set, the sample value is 0.0 |
| 325 | + !! MODIFY -- these results should not count as data points !! """ |
| 326 | + try: |
| 327 | + metrics_1.append(results_1[metric_index]) |
| 328 | + except TypeError: |
| 329 | + metrics_1.append(0.0) |
| 330 | + try: |
| 331 | + metrics_2.append(results_2[metric_index]) |
| 332 | + except TypeError: |
| 333 | + metrics_2.append(0.0) |
323 | 334 | |
324 | 335 | #print metrics_1 |
325 | 336 | #print metrics_2 |
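
The !! MODIFY !! note above flags that zero-filled samples arguably should not count as data points at all. One hedged alternative, an assumption about the desired behaviour rather than what this change does, is to drop the sample from both groups whenever either query returned no row:

    def append_paired_samples(metrics_1, metrics_2, results_1, results_2, metric_index):
        """ Append a sample to both groups only when both queries returned a row. """
        if results_1 is None or results_2 is None:
            return False    # skip the sample entirely
        metrics_1.append(results_1[metric_index])
        metrics_2.append(results_2[metric_index])
        return True
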
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataReporting.py |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | import miner_help as mh |
32 | 32 | import TimestampProcessor as TP |
33 | 33 | import DataLoader as DL |
| 34 | +import HypothesisTest as HT |
34 | 35 | |
35 | 36 | matplotlib.use('Agg') |
36 | 37 | |
— | — | @@ -51,8 +52,40 @@ |
52 | 53 | """ |
53 | 54 | class DataReporting(object): |
54 | 55 | |
| 56 | + _font_size_ = 24 |
| 57 | + _fig_width_pt_ = 246.0 # Get this from LaTeX using \showthe\columnwidth |
| 58 | + _inches_per_pt_ = 1.0/72.27 # Convert pt to inch |
| 59 | + _use_labels_= False |
| 60 | + _fig_file_format_ = 'png' |
| 61 | + _plot_type_ = 'line' |
| 62 | + _item_keys_ = list() |
55 | 63 | _data_loader_ = None |
56 | 64 | |
| 65 | + |
| 66 | + def __init__(self, **kwargs): |
| 67 | + |
| 68 | + for key in kwargs: |
| 69 | + |
| 70 | + if key == 'font_size': |
| 71 | + self._font_size_ = kwargs[key] |
| 72 | + elif key == 'fig_width_pt': |
| 73 | + self._fig_width_pt_ = kwargs[key] |
| 74 | + elif key == 'inches_per_pt': |
| 75 | + self._inches_per_pt_ = kwargs[key] |
| 76 | + elif key == 'use_labels': |
| 77 | + self._use_labels_ = kwargs[key] |
| 78 | + elif key == 'fig_file_format': |
| 79 | + self._fig_file_format_ = kwargs[key] |
| 80 | + elif key == 'plot_type': |
| 81 | + self._plot_type_ = kwargs[key] |
| 82 | + elif key == 'item_keys': |
| 83 | + self._item_keys_ = kwargs[key] |
| 84 | + elif key == 'data_loader': # Set custom data loaders |
| 85 | + if kwargs[key] == 'campaign_interval': |
| 86 | + self._data_loader_ = DL.CampaignIntervalReportingLoader() |
| 87 | + |
 | 88 | + print self._data_loader_.__str__() |
| 89 | + |
57 | 90 | """ |
58 | 91 | |
59 | 92 | Smooths a list of values |
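
The constructor added here replaces the per-subclass copies removed further down: each recognised keyword overrides one of the class-level plotting attributes, and data_loader selects a loader. For reference, a compact sketch of the same dispatch (an illustration only, not the committed code; the data_loader key would still need its own branch):

    def apply_plot_kwargs(obj, kwargs):
        """ Map recognised keyword arguments onto the _<name>_ attributes. """
        simple_attrs = ('font_size', 'fig_width_pt', 'inches_per_pt', 'use_labels',
                        'fig_file_format', 'plot_type', 'item_keys')
        for key, value in kwargs.items():
            if key in simple_attrs:
                setattr(obj, '_' + key + '_', value)
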
— | — | @@ -928,7 +961,7 @@ |
929 | 962 | pylab.savefig(fname, format='png') |
930 | 963 | |
931 | 964 | """ |
932 | | - <description> |
 | 965 | + Entry point for the execution of miner reporting |
933 | 966 | |
934 | 967 | INPUT: |
935 | 968 | |
— | — | @@ -996,13 +1029,9 @@ |
997 | 1030 | |
998 | 1031 | class IntervalReporting(DataReporting): |
999 | 1032 | |
1000 | | - _font_size_ = 24 |
1001 | | - _fig_width_pt_ = 246.0 # Get this from LaTeX using \showthe\columnwidth |
1002 | | - _inches_per_pt_ = 1.0/72.27 # Convert pt to inch |
1003 | | - _use_labels_= False |
1004 | | - _fig_file_format_ = 'png' |
1005 | | - _plot_type_ = 'line' |
| 1033 | + |
1006 | 1034 | |
| 1035 | + |
1007 | 1036 | """ |
1008 | 1037 | Constructor for IntervalReporting |
1009 | 1038 | |
— | — | @@ -1014,26 +1043,10 @@ |
1015 | 1044 | def __init__(self, **kwargs): |
1016 | 1045 | |
1017 | 1046 | self._data_loader_ = DL.IntervalReportingLoader() |
1018 | | - |
1019 | | - for key in kwargs: |
1020 | | - |
1021 | | - if key == 'font_size': |
1022 | | - self._font_size_ = kwargs[key] |
1023 | | - elif key == 'fig_width_pt': |
1024 | | - self._fig_width_pt_ = kwargs[key] |
1025 | | - elif key == 'inches_per_pt': |
1026 | | - self._inches_per_pt_ = kwargs[key] |
1027 | | - elif key == 'use_labels': |
1028 | | - self._use_labels_ = kwargs[key] |
1029 | | - elif key == 'fig_file_format': |
1030 | | - self._fig_file_format_ = kwargs[key] |
1031 | | - elif key == 'plot_type': |
1032 | | - self._plot_type_ = kwargs[key] |
1033 | | - elif key == 'data_loader': # Set custom data loaders |
1034 | | - if kwargs[key] == 'campaign_interval': |
1035 | | - self._data_loader_ = DL.CampaignIntervalReportingLoader() |
| 1047 | + DataReporting.__init__(self, **kwargs) |
1036 | 1048 | |
1037 | | - print self._data_loader_.__str__ |
| 1049 | + |
| 1050 | + |
1038 | 1051 | |
1039 | 1052 | """ |
1040 | 1053 | <description> |
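
With the duplicated keyword handling removed, IntervalReporting only sets its default loader and hands the remaining keyword arguments to DataReporting.__init__. A usage sketch; the keyword values below are illustrative, not taken from this change:

    import DataReporting as DR

    # Illustrative values only
    reporter = DR.IntervalReporting(font_size=18,
                                    plot_type='bar',
                                    item_keys=['banner_A', 'banner_B'],
                                    data_loader='campaign_interval')
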
— | — | @@ -1052,8 +1065,28 @@ |
1053 | 1066 | print '' |
1054 | 1067 | |
1055 | 1068 | return |
1056 | | - |
| 1069 | + |
1057 | 1070 | """ |
 | 1071 | + Selects a subset of the key items in a dictionary |
| 1072 | + |
| 1073 | + INPUT: |
| 1074 | + dict_lists - dictionary to be parsed |
| 1075 | + |
| 1076 | + RETURN: |
| 1077 | + new_dict_lists - new dictionary containing only keys in self._item_keys_ |
| 1078 | + """ |
| 1079 | + def select_metric_keys(self, dict_lists): |
| 1080 | + new_dict_lists = dict() |
| 1081 | + |
| 1082 | + dict_lists_keys = dict_lists.keys() |
| 1083 | + |
| 1084 | + for key in self._item_keys_: |
| 1085 | + if key in dict_lists_keys: |
| 1086 | + new_dict_lists[key] = dict_lists[key] |
| 1087 | + |
| 1088 | + return new_dict_lists |
| 1089 | + |
| 1090 | + """ |
1058 | 1091 | Execute reporting query and generate plots |
1059 | 1092 | <description> |
1060 | 1093 | |
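
select_metric_keys above is a plain dictionary filter driven by self._item_keys_. A hedged one-method equivalent with the same behaviour, for comparison:

    def select_metric_keys(self, dict_lists):
        """ Keep only the entries whose keys appear in self._item_keys_ """
        return dict((k, v) for k, v in dict_lists.items() if k in self._item_keys_)
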
— | — | @@ -1131,6 +1164,12 @@ |
1132 | 1165 | counts = return_val[0] |
1133 | 1166 | times = return_val[1] |
1134 | 1167 | |
| 1168 | + """ Select only the specified item keys """ |
| 1169 | + print counts.keys() |
| 1170 | + if len(self._item_keys_) > 0: |
| 1171 | + counts = self.select_metric_keys(counts) |
| 1172 | + times = self.select_metric_keys(times) |
| 1173 | + |
1135 | 1174 | """ Convert Times to Integers that indicate relative times AND normalize the intervals in case any are missing """ |
1136 | 1175 | for key in times.keys(): |
1137 | 1176 | times[key] = TP.normalize_timestamps(times[key], False, 2) |
— | — | @@ -1142,7 +1181,7 @@ |
1143 | 1182 | |
1144 | 1183 | xlabel = 'MINUTES' |
1145 | 1184 | subplot_index = 111 |
1146 | | - fname = campaign + '_' + metric_name |
| 1185 | + fname = campaign + '_' + query_type + '_' + metric_name |
1147 | 1186 | |
1148 | 1187 | metric_full_name = QD.get_metric_full_name(metric_name) |
1149 | 1188 | title = campaign + ': ' + metric_full_name + ' -- ' + TP.timestamp_convert_format(start_time,1,2) + ' - ' + TP.timestamp_convert_format(end_time,1,2) |
— | — | @@ -1200,25 +1239,26 @@ |
1201 | 1240 | |
1202 | 1241 | |
1203 | 1242 | """ |
1204 | | - def __init__(self, hypothesis_test): |
| 1243 | + def __init__(self, **kwargs): |
1205 | 1244 | |
1206 | | - """ check to make sure this is in fact a hypothsis test """ |
1207 | | - self._hypothesis_test_ = hypothesis_test |
1208 | | - self._data_loader_ = HypothesisTestLoader() |
1209 | | - |
| 1245 | + |
| 1246 | + for key in kwargs: |
| 1247 | + |
| 1248 | + if key == 'hyp_test': # Set the hypothesis test |
| 1249 | + if kwargs[key] == 't_test': |
| 1250 | + self._hypothesis_test_ = HT.TTest() |
| 1251 | + |
 | 1252 | + print self._hypothesis_test_.__str__() |
| 1253 | + |
| 1254 | + self._data_loader_ = DL.HypothesisTestLoader() |
| 1255 | + DataReporting.__init__(self, **kwargs) |
| 1256 | + |
1210 | 1257 | """ |
1211 | 1258 | Describes how to run a report !! MODIFY !! |
1212 | 1259 | """ |
1213 | 1260 | def usage(self): |
1214 | 1261 | |
1215 | | - print 'Types of queries:' |
1216 | | - print ' (1) banner' |
1217 | | - print ' (2) LP' |
1218 | 1262 | print '' |
1219 | | - print 'e.g.' |
1220 | | - print " run('20101230160400', '20101230165400', 2, 'banner', 'imp', '20101230JA091_US')" |
1221 | | - print " run('20101230160400', '20101230165400', 2, 'LP', 'views', '20101230JA091_US')" |
1222 | | - print '' |
1223 | 1263 | |
1224 | 1264 | return |
1225 | 1265 | |
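
The constructor in this hunk now builds the hypothesis test from a hyp_test keyword (t_test selects HT.TTest()), creates a DL.HypothesisTestLoader(), and delegates the remaining keywords to DataReporting. A usage sketch; the class name ConfidenceReporting is an assumption, since the class name is not visible in this hunk:

    import DataReporting as DR

    # 'ConfidenceReporting' is an assumed name for the class patched above
    reporter = DR.ConfidenceReporting(hyp_test='t_test', use_labels=True)
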
— | — | @@ -1303,8 +1343,8 @@ |
1304 | 1344 | av_std_dev_1 = av_std_dev_1 + math.pow(std_devs_1[i], 2) |
1305 | 1345 | av_std_dev_2 = av_std_dev_2 + math.pow(std_devs_2[i], 2) |
1306 | 1346 | |
1307 | | - av_std_dev_1 = math.pow(av_std_dev_1, 0.5) / len(std_devs_1) |
1308 | | - av_std_dev_2 = math.pow(av_std_dev_2, 0.5) / len(std_devs_1) |
| 1347 | + av_std_dev_1 = math.pow(av_std_dev_1 / len(std_devs_1), 0.5) |
| 1348 | + av_std_dev_2 = math.pow(av_std_dev_2 / len(std_devs_1), 0.5) |
1309 | 1349 | |
1310 | 1350 | """ Assign the winner """ |
1311 | 1351 | if av_means_1 > av_means_2: |
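
The corrected lines take the root of the mean of the squared standard deviations (an RMS average) instead of dividing the root of their sum by the count, which shrinks toward zero as more samples are added. A small numeric check of the two forms:

    import math

    std_devs = [3.0, 4.0]
    old_av = math.pow(sum(s ** 2 for s in std_devs), 0.5) / len(std_devs)   # 2.5
    new_av = math.pow(sum(s ** 2 for s in std_devs) / len(std_devs), 0.5)   # ~3.54

Note that both corrected lines still divide by len(std_devs_1); whether the second should use len(std_devs_2) is not clear from this hunk.
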
— | — | @@ -1312,7 +1352,7 @@ |
1313 | 1353 | else: |
1314 | 1354 | winner = labels[1] |
1315 | 1355 | |
1316 | | - win_str = "\nThe winner " + winner + " had a %.2f%s increase." |
| 1356 | + win_str = '\nThe winner "' + winner + '" had a %.2f%s increase.' |
1317 | 1357 | win_str = win_str % (percent_increase, '%') |
1318 | 1358 | |
1319 | 1359 | print '\nCOMMAND = ' + test_call |
— | — | @@ -1376,13 +1416,13 @@ |
1377 | 1417 | counter += 1 |
1378 | 1418 | |
1379 | 1419 | """ Retrieve values from database """ |
1380 | | - ret = _data_loader_.query_tables(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples) |
| 1420 | + ret = self._data_loader_.run_query(query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples) |
1381 | 1421 | metrics_1 = ret[0] |
1382 | 1422 | metrics_2 = ret[1] |
1383 | 1423 | times_indices = ret[2] |
1384 | 1424 | |
1385 | 1425 | """ run the confidence test """ |
1386 | | - ret = _hypothesis_test_.confidence_test(metrics_1, metrics_2, num_samples) |
| 1426 | + ret = self._hypothesis_test_.confidence_test(metrics_1, metrics_2, num_samples) |
1387 | 1427 | means_1 = ret[0] |
1388 | 1428 | means_2 = ret[1] |
1389 | 1429 | std_devs_1 = ret[2] |
— | — | @@ -1409,7 +1449,7 @@ |
1410 | 1450 | self.gen_plot(means_1, means_2, std_devs_1, std_devs_2, times_indices, title, xlabel, ylabel, ranges, subplot_index, labels, fname) |
1411 | 1451 | |
1412 | 1452 | """ Print out results """ |
1413 | | - test_call = "run_test('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \ |
| 1453 | + test_call = "run('" + test_name + "', '" + query_name + "', '" + metric_name + "', '" + campaign + "', '" + \ |
1414 | 1454 | item_1 + "', '" + item_2 + "', '" + start_time + "', '" + end_time + "', " + str(interval) + ", " + str(num_samples) + ")" |
1415 | 1455 | self.print_metrics(fname, title, means_1, means_2, std_devs_1, std_devs_2, times_indices, labels, test_call) |
1416 | 1456 | |
Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/TimestampProcessor.py |
— | — | @@ -423,14 +423,14 @@ |
424 | 424 | time_indices - list of indices counting from zero marking the indices for reporting test interval parameters |
425 | 425 | |
426 | 426 | """ |
427 | | -def get_time_lists(self, start_time, end_time, interval, num_samples, format): |
| 427 | +def get_time_lists(start_time, end_time, interval, num_samples, format): |
428 | 428 | |
429 | 429 | """ range must be divisible by interval - convert to hours """ |
430 | 430 | range = float(interval * num_samples) / 60 |
431 | 431 | |
432 | 432 | """ Compose times """ |
433 | | - start_datetime = dt.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14])) |
434 | | - end_datetime = dt.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14])) |
| 433 | + start_datetime = datetime.datetime(int(start_time[0:4]), int(start_time[4:6]), int(start_time[6:8]), int(start_time[8:10]), int(start_time[10:12]), int(start_time[12:14])) |
| 434 | + end_datetime = datetime.datetime(int(end_time[0:4]), int(end_time[4:6]), int(end_time[6:8]), int(end_time[8:10]), int(end_time[10:12]), int(end_time[12:14])) |
435 | 435 | |
436 | 436 | """ current timestamp and hour index """ |
437 | 437 | curr_datetime = start_datetime |
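
The fix above drops the stray self parameter from the module-level function and spells out datetime.datetime, presumably matching how the module is actually imported. For reference, the fourteen-character 'YYYYMMDDHHMMSS' timestamps could also be parsed in one call; a hedged equivalent of the slicing shown above:

    import datetime

    def parse_timestamp(ts):
        """ Parse a 'YYYYMMDDHHMMSS' string into a datetime object. """
        return datetime.datetime.strptime(ts, '%Y%m%d%H%M%S')

    # e.g. parse_timestamp('20101230160400')
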
— | — | @@ -474,7 +474,7 @@ |
475 | 475 | |
476 | 476 | |
477 | 477 | """ increment the time by interval minutes """ |
478 | | - td = dt.timedelta(minutes=interval) |
| 478 | + td = datetime.timedelta(minutes=interval) |
479 | 479 | curr_datetime = curr_datetime + td |
480 | 480 | |
481 | 481 | """ append the last items onto time lists """ |