Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py |
— | — | @@ -33,15 +33,19 @@ |
34 | 34 | |
35 | 35 | BASE CLASS :: DataLoader |
36 | 36 | |
37 | | - This is the base class for data handling functionality of metrics. |
| 37 | + This is the base class for data handling functionality of metrics. Inherited classes will be associated with a set of SQL files that loosely define a set of related query types. |
38 | 38 | |
| 39 | + Handles database connections. Stores state values based on query results and also metadata for the queries themselves which can be resolved to full statements by interacting with QueryData. |
| 40 | + |
| 41 | + Data is stored according to a key name on which sets are separated. Subclasses can further specify how the data is split and related. |
| 42 | + |
39 | 43 | METHODS: |
40 | 44 | init_db |
41 | 45 | close_db |
42 | | - compose_key |
43 | | - include_keys |
44 | | - exclude_keys |
45 | | - get_sql_filename_for_query |
| 46 | + compose_key - build a new key in the results based on existing one |
| 47 | + include_keys - include only certain keys based on key name |
| 48 | + exclude_keys - explicitly remove certain types of keys based on key name |
| 49 | + get_sql_filename_for_query - resolve a sql filename based on the simpler query type |
46 | 50 | |
47 | 51 | |
48 | 52 | """ |
— | — | @@ -172,11 +176,8 @@ |
173 | 177 | """ |
174 | 178 | Return a specific query name given a query type |
175 | 179 | |
176 | | - INPUT: |
177 | | - query_type - |
178 | | - |
179 | 180 | RETURN: |
180 | | - query_name - |
| 181 | + query_name - name of the sql file |
181 | 182 | |
182 | 183 | """ |
183 | 184 | def get_sql_filename_for_query(self): |
— | — | @@ -184,13 +185,17 @@ |
185 | 186 | try: |
186 | 187 | return self._query_names_[self._query_type_] |
187 | 188 | except KeyError: |
188 | | - print >> sys.stderr, 'Could not find a query for type: ' + query_type |
| 189 | + print >> sys.stderr, 'Could not find a query for type: ' + self._query_type_ |
189 | 190 | sys.exit(2) |
190 | 191 | |
191 | 192 | |
192 | 193 | |
193 | | - |
| 194 | +""" |
| 195 | + |
| 196 | + This Loader inherits the functionality of DataLoader and handles SQL queries that group data by time intervals. These are generally preferable for most |
| 197 | + of the time dependent data analysis and also provides functionality that enables the raw results to be combined over all keys |
194 | 198 | |
| 199 | +""" |
195 | 200 | class IntervalReportingLoader(DataLoader): |
196 | 201 | |
197 | 202 | _summary_data_ = None |
— | — | @@ -201,11 +206,15 @@ |
202 | 207 | self._query_names_['campaign'] = 'report_campaign_metrics_minutely' |
203 | 208 | self._query_names_['campaign_total'] = 'report_campaign_metrics_minutely_total' |
204 | 209 | |
| 210 | + self._query_names_[FDH._QTYPE_BANNER_ + FDH._QTYPE_TIME_] = 'report_banner_metrics_minutely_all' |
| 211 | + self._query_names_[FDH._QTYPE_LP_ + FDH._QTYPE_TIME_] = 'report_lp_metrics_minutely_all' |
| 212 | + self._query_names_[FDH._QTYPE_CAMPAIGN_ + FDH._QTYPE_TIME_] = 'report_campaign_metrics_minutely_all' |
| 213 | + |
205 | 214 | self._query_type_ = query_type |
206 | 215 | |
207 | 216 | """ hardcode the data handler for now """ |
208 | 217 | self._data_handler_ = FDH |
209 | | - |
| 218 | + |
210 | 219 | """ |
211 | 220 | Executes the query which generates interval metrics and sets _results_ and _col_names_ |
212 | 221 | |
— | — | @@ -256,7 +265,7 @@ |
257 | 266 | |
258 | 267 | metric_index = QD.get_metric_index(query_name, metric_name) |
259 | 268 | time_index = QD.get_time_index(query_name) |
260 | | - |
| 269 | + #print sql_stmnt |
261 | 270 | """ Compose the data for each separate donor pipeline artifact """ |
262 | 271 | try: |
263 | 272 | # err_msg = sql_stmnt |
— | — | @@ -337,14 +346,9 @@ |
338 | 347 | |
339 | 348 | |
340 | 349 | """ |
341 | | - Post process raw data from query. Combines data rows according to column type definitions. This must be run *after* |
| 350 | + Post process raw data from query. Combines data rows according to column type definitions. This must be run *after* run_query. |
342 | 351 | |
343 | | - INPUT: |
344 | | - data - a list of rows |
345 | | - data_handler - the data handler module |
346 | | - query_name - |
347 | | - RETURN: |
348 | | - the dictionary of combined rows (note that there must be a key column) |
| 352 | + This allows aggregates of query data to be performed after the actual processing of the query. |
349 | 353 | """ |
350 | 354 | def combine_rows(self): |
351 | 355 | |
— | — | @@ -380,7 +384,13 @@ |
381 | 385 | """ Change null values to 0 """ |
382 | 386 | if field == None or field == 'NULL': |
383 | 387 | field = 0.0 |
384 | | - |
| 388 | + |
| 389 | + """ |
| 390 | + COMBINE THE DATA FOR EACH KEY |
| 391 | + |
| 392 | + Based on the column type compile an aggregate (e.g. sum, average) |
| 393 | + |
| 394 | + """ |
385 | 395 | if col_type == self._data_handler_._COLTYPE_RATE_: |
386 | 396 | |
387 | 397 | try: |
— | — | @@ -395,11 +405,14 @@ |
396 | 406 | except KeyError as e: |
397 | 407 | data_dict[key][self._col_names_[i]] = float(field) |
398 | 408 | |
399 | | - """ !! MODIFY --- this could cause issues in the case of missing data """ |
| 409 | + """ !! MODIFY / FIXME --- this could cause issues in the case of missing data """ |
400 | 410 | num_rows = len(self._results_) / len(data_dict.keys()) |
401 | 411 | |
402 | | - """ POST PROCESSING |
403 | | - Normalize rate columns """ |
| 412 | + """ |
| 413 | + POST PROCESSING |
| 414 | + |
| 415 | + Normalize rate columns |
| 416 | + """ |
404 | 417 | for i in range(len(col_types)): |
405 | 418 | if col_types[i] == self._data_handler_._COLTYPE_RATE_: |
406 | 419 | for key in data_dict.keys(): |
— | — | @@ -411,6 +424,10 @@ |
412 | 425 | |
413 | 426 | """ |
414 | 427 | |
| 428 | + This class inherits the IntervalLoader functionality but utilizes the campaign DataLoader instead. The results generated also incorporate campaign totals -- |
| 429 | + the result fully specifies each campaign item (campaign - banner - landing page) and the number of views resulting. |
| 430 | + |
| 431 | + This differs from the interval reporter in that it compiles results for views and donations only for an entire donation process or pipeline. |
415 | 432 | |
416 | 433 | """ |
417 | 434 | class CampaignIntervalReportingLoader(IntervalReportingLoader): |
— | — | @@ -453,181 +470,40 @@ |
454 | 471 | |
455 | 472 | return [metrics, times] |
456 | 473 | |
457 | | - |
458 | | -""" |
459 | | - |
460 | | - CLASS :: BannerLPReportingLoader |
461 | 474 | |
462 | | - This dataloader handles reporting on banners and landing pages. |
463 | | - |
464 | | - METHODS: |
465 | | - run_query |
466 | | - |
467 | | - |
468 | 475 | """ |
469 | | -class BannerLPReportingLoader(DataLoader): |
470 | | - |
471 | | - def __init__(self, query_type): |
472 | | - self._query_names_['LP'] = 'report_LP_metrics' |
473 | | - self._query_names_['BAN'] = 'report_banner_metrics' |
474 | | - self._query_names_['BAN-TEST'] = 'report_banner_metrics' |
475 | | - self._query_names_['LP-TEST'] = 'report_LP_metrics' |
476 | | - |
477 | | - self._query_type_ = query_type |
478 | | - |
479 | | - """ |
480 | | - <description> |
481 | | - |
482 | | - INPUT: |
483 | | - |
484 | | - RETURN: |
485 | | - |
486 | | - """ |
487 | | - def run_query(self,start_time, end_time, campaign, metric_name): |
488 | | - |
489 | | - self.init_db() |
490 | | - |
491 | | - metric_lists = Hlp.AutoVivification() |
492 | | - time_lists = Hlp.AutoVivification() |
493 | | - # table_data = [] # store the results in a table for reporting |
494 | | - |
495 | | - # Load the SQL File & Format |
496 | | - filename = self._sql_path_ + query_name + '.sql' |
497 | | - sql_stmnt = Hlp.read_sql(filename) |
498 | | - |
499 | | - sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign]) |
500 | | - |
501 | | - key_index = QD.get_key_index(query_name) |
502 | | - time_index = QD.get_time_index(query_name) |
503 | | - metric_index = QD.get_metric_index(query_name, metric_name) |
504 | | - |
505 | | - """ Composes the data for each banner or LP """ |
506 | | - try: |
507 | | - err_msg = sql_stmnt |
508 | | - self.cur.execute(sql_stmnt) |
509 | | - |
510 | | - results = self.cur.fetchall() |
511 | | - |
512 | | - for row in results: |
513 | | - |
514 | | - key_name = row[key_index] |
515 | | - |
516 | | - try: |
517 | | - metric_lists[key_name].append(row[metric_index]) |
518 | | - time_lists[key_name].append(row[time_index]) |
519 | | - except: |
520 | | - metric_lists[key_name] = list() |
521 | | - time_lists[key_name] = list() |
522 | | - |
523 | | - metric_lists[key_name].append(row[metric_index]) |
524 | | - time_lists[key_name].append(row[time_index]) |
525 | | - |
526 | | - except: |
527 | | - self.db.rollback() |
528 | | - sys.exit("Database Interface Exception:\n" + err_msg) |
529 | | - |
530 | | - """ Convert Times to Integers """ |
531 | | - max_i = 0 |
532 | | - |
533 | | - for key in time_lists.keys(): |
534 | | - for date_str in time_lists[key]: |
535 | | - day_int = int(date_str[8:10]) |
536 | | - hr_int = int(date_str[11:13]) |
537 | | - date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13]) |
538 | | - if date_int > max_i: |
539 | | - max_i = date_int |
540 | | - max_day = day_int |
541 | | - max_hr = hr_int |
542 | | - |
543 | | - |
544 | | - # Normalize dates |
545 | | - time_norm = Hlp.AutoVivification() |
546 | | - for key in time_lists.keys(): |
547 | | - for date_str in time_lists[key]: |
548 | | - day = int(date_str[8:10]) |
549 | | - hr = int(date_str[11:13]) |
550 | | - # date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13]) |
551 | | - elem = (day - max_day) * 24 + (hr - max_hr) |
552 | | - try: |
553 | | - time_norm[key].append(elem) |
554 | | - except: |
555 | | - time_norm[key] = list() |
556 | | - time_norm[key].append(elem) |
557 | | - |
558 | | - |
559 | | - self.close_db() |
560 | | - |
561 | | - return [metric_lists, time_norm] |
562 | | - |
563 | | - |
564 | | - """ |
565 | | - |
566 | | - <description> |
567 | | - |
568 | | - INPUT: |
569 | | - |
570 | | - RETURN: |
571 | | - |
572 | | - """ |
573 | | - def get_latest_campaign(self): |
574 | | - |
575 | | - query_name = 'report_latest_campaign' |
576 | | - self.init_db() |
577 | | - |
578 | | - """ Look at campaigns over the past 24 hours - TS format=1, TS resolution=1 """ |
579 | | - now = datetime.datetime.now() |
580 | | - hours_back = 72 |
581 | | - times = self.gen_date_strings(now, hours_back,1,1) |
582 | | - |
583 | | - sql_stmnt = Hlp.read_sql('./sql/report_latest_campaign.sql') |
584 | | - sql_stmnt = QD.format_query(query_name, sql_stmnt, [times[0]]) |
585 | | - |
586 | | - campaign_index = QD.get_campaign_index(query_name) |
587 | | - time_index = QD.get_time_index(query_name) |
588 | | - |
589 | | - try: |
590 | | - err_msg = sql_stmnt |
591 | | - self.cur.execute(sql_stmnt) |
592 | | - |
593 | | - row = self.cur.fetchone() |
594 | | - except: |
595 | | - self.db.rollback() |
596 | | - sys.exit("Database Interface Exception:\n" + err_msg) |
597 | | - |
598 | | - campaign = row[campaign_index] |
599 | | - timestamp = row[time_index] |
600 | | - |
601 | | - self.close_db() |
602 | | - |
603 | | - return [campaign, timestamp] |
604 | | - |
605 | | - |
606 | | - |
| 476 | + This class is concerned with preparing the data for a hypothesis test and is consumed by classes which perform this analysis in HypothesisTest.py |
| 477 | +""" |
607 | 478 | class HypothesisTestLoader(DataLoader): |
608 | 479 | |
609 | 480 | """ |
610 | | - Execute data acquisition for hypothesis tester |
| 481 | + Execute data acquisition for hypothesis tester. The idea is that data sampled identically over time for item 1 and item 2 will be generated on which analysis may be carried out. |
611 | 482 | |
| 483 | + !! MODIFY/ FIXME -- the sampling is awkward, sampling interval and test interval should be specified explicitly !! |
| 484 | + |
612 | 485 | INPUT: |
613 | | - query_name - |
614 | | - metric_name - |
| 486 | + query_name - non-formatted sql filename !!MODIFY / FIXME -- this should be a type instead !! |
| 487 | + metric_name - metric to be extracted from the data |
615 | 488 | campaign - |
616 | | - item_1 - |
617 | | - item_2 - |
| 489 | + item_1 - artifact or key name |
| 490 | + item_2 - artifact or key name |
618 | 491 | start_time - |
619 | 492 | end_time - |
620 | | - interval - |
621 | | - num_samples - |
| 493 | + interval - test interval; this groups a set of samples together to perform a paired t-test |
| 494 | + num_samples - samples per test interval (sampling interval = interval / num_samples) |
622 | 495 | |
623 | 496 | RETURN: |
624 | | - metrics_1 - |
625 | | - metrics_2 - |
626 | | - times_indices - |
| 497 | + metrics_1 - the metrics for item 1 |
| 498 | + metrics_2 - the metrics for item 2 |
| 499 | + times_indices - the sampling intervals |
627 | 500 | |
628 | 501 | """ |
629 | 502 | def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples): |
630 | 503 | |
631 | | - """ retrieve time lists with timestamp format 1 (yyyyMMddhhmmss) """ |
| 504 | + """ |
| 505 | + Retrieve time lists with timestamp format 1 (yyyyMMddhhmmss) |
| 506 | + This breaks the metrics into evenly sampled intervals |
| 507 | + """ |
632 | 508 | ret = TP.get_time_lists(start_time, end_time, interval, num_samples, 1) |
633 | 509 | times = ret[0] |
634 | 510 | times_indices = ret[1] |
— | — | @@ -641,6 +517,12 @@ |
642 | 518 | metrics_1 = [] |
643 | 519 | metrics_2 = [] |
644 | 520 | |
| 521 | + """ |
| 522 | + EXECUTE THE QUERIES FOR EACH INTERVAL |
| 523 | + |
| 524 | + Generates metrics for each artifact, sampled in the same way |
| 525 | + |
| 526 | + """ |
645 | 527 | for i in range(len(times) - 1): |
646 | 528 | |
647 | 529 | # print '\nExecuting number ' + str(i) + ' batch of of data.' |
— | — | @@ -693,12 +575,10 @@ |
694 | 576 | |
695 | 577 | CLASS :: CampaignReportingLoader |
696 | 578 | |
697 | | - This dataloader handles reporting on utm_campaigns. |
698 | | - |
699 | | - METHODS: |
700 | | - run_query |
701 | | - |
702 | | - |
| 579 | + This inherits from the DataLoader base class and handles reporting on utm_campaigns. This reporter handles returning lists of banners and lps from a given campaign and also handles |
| 580 | + reporting donation totals across all campaigns. While this interface is concerned with campaign reporting it does not associate results with time intervals within the start and |
| 581 | + end times |
| 582 | + |
703 | 583 | """ |
704 | 584 | class CampaignReportingLoader(DataLoader): |
705 | 585 | |
— | — | @@ -711,7 +591,7 @@ |
712 | 592 | self._query_type_ = query_type |
713 | 593 | |
714 | 594 | """ |
715 | | - !! MODIFY -- use python reflection !! ... maybe |
| 595 | + !! MODIFY / FIXME -- use python reflection !! ... maybe |
716 | 596 | |
717 | 597 | This method is retrieving campaign names |
718 | 598 | |
— | — | @@ -737,7 +617,7 @@ |
738 | 618 | |
739 | 619 | Handle queries from "report_campaign_totals" |
740 | 620 | |
741 | | - Gets metric totals for campaigns |
| 621 | + Gets metric totals across campaigns |
742 | 622 | |
743 | 623 | """ |
744 | 624 | def query_totals(self, params): |
— | — | @@ -790,7 +670,7 @@ |
791 | 671 | |
792 | 672 | Handle queries from "report_campaign_banners" and "report_campaign_lps" |
793 | 673 | |
794 | | - Gets a list of banners and landing pages running on the campaign in a time frame |
| 674 | + Gets a list of banners and landing pages running for a given campaign over a given time frame |
795 | 675 | |
796 | 676 | """ |
797 | 677 | def query_artifacts(self, params): |
— | — | @@ -840,11 +720,14 @@ |
841 | 721 | |
842 | 722 | CLASS :: TableLoader |
843 | 723 | |
844 | | - Provides data access particular to the t-test |
| 724 | + Base class for providing MySQL table access. Inherits DataLoader. |
845 | 725 | |
846 | 726 | METHODS: |
847 | | - init_db - |
848 | | - close_db - |
| 727 | + record_exists - return a boolean value reflecting whether a record exists in the table |
| 728 | + insert_row - try to insert a new record into the table |
| 729 | + delete_row - try to delete a record from the table |
| 730 | + update - try to modify a record in the table |
| 731 | + |
849 | 732 | """ |
850 | 733 | class TableLoader(DataLoader): |
851 | 734 | |
— | — | @@ -867,6 +750,8 @@ |
868 | 751 | |
869 | 752 | Provides data access particular to the t-test |
870 | 753 | |
| 754 | + storage3.pmtpa.wmnet.faulkner.ttest: |
| 755 | + |
871 | 756 | +--------------------+--------------+------+-----+---------+-------+ |
872 | 757 | | Field | Type | Null | Key | Default | Extra | |
873 | 758 | +--------------------+--------------+------+-----+---------+-------+ |
— | — | @@ -884,11 +769,11 @@ |
885 | 770 | given the degrees of freedom and statistic t test |
886 | 771 | |
887 | 772 | INPUT: |
888 | | - degrees_of_freedom - |
889 | | - t - |
| 773 | + degrees_of_freedom - computed degrees of freedom of a dataset modeled on a student's t distribution |
| 774 | + t - test statistic; random variable whose value is to be measured |
890 | 775 | |
891 | 776 | RETURN: |
892 | | - p - |
| 777 | + p - the highest p value based on the input |
893 | 778 | |
894 | 779 | """ |
895 | 780 | def get_pValue(self, degrees_of_freedom, t): |
— | — | @@ -1099,23 +984,31 @@ |
1100 | 985 | """ |
1101 | 986 | def get_test_field(self, row, key): |
1102 | 987 | |
1103 | | - if key == 'test_name': |
1104 | | - return row[0] |
1105 | | - elif key == 'test_type': |
1106 | | - return row[1] |
1107 | | - elif key == 'utm_campaign': |
1108 | | - return row[2] |
1109 | | - elif key == 'start_time': |
1110 | | - return row[3].__str__() |
1111 | | - elif key == 'end_time': |
1112 | | - return row[4].__str__() |
1113 | | - elif key == 'winner': |
1114 | | - return row[5] |
1115 | | - elif key == 'is_conclusive': |
1116 | | - return row[6] |
1117 | | - elif key == 'html_report': |
1118 | | - return row[7] |
| 988 | + try: |
| 989 | + if key == 'test_name': |
| 990 | + return row[0] |
| 991 | + elif key == 'test_type': |
| 992 | + return row[1] |
| 993 | + elif key == 'utm_campaign': |
| 994 | + return row[2] |
| 995 | + elif key == 'start_time': |
| 996 | + return row[3].__str__() |
| 997 | + elif key == 'end_time': |
| 998 | + return row[4].__str__() |
| 999 | + elif key == 'winner': |
| 1000 | + return row[5] |
| 1001 | + elif key == 'is_conclusive': |
| 1002 | + return row[6] |
| 1003 | + elif key == 'html_report': |
| 1004 | + return row[7] |
1119 | 1005 | |
| 1006 | + except Exception as inst: |
| 1007 | + |
| 1008 | + print type(inst) # the exception instance |
| 1009 | + print inst.args # arguments stored in .args |
| 1010 | + print inst # __str__ allows args to printed directly |
| 1011 | + |
| 1012 | + return '' |
1120 | 1013 | |
1121 | 1014 | |
1122 | 1015 | def delete_row(self): |