r90486 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r90485‎ | r90486 | r90487 >
Date:21:45, 20 June 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Added documentation
removed BannerLPReportingLoader class as it is obsolete
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
@@ -33,15 +33,19 @@
3434
3535 BASE CLASS :: DataLoader
3636
37 - This is the base class for data handling functionality of metrics.
 37+ This is the base class for data handling functionality of metrics. Inherited classes will be associated with a set of SQL files that loosely define a set of related query types.
3838
 39+ Handles database connections. Stores state values based on query results and also meta data for the queries themselves which can be resolved to full statements by interacting with QueryData.
 40+
 41+ Data is stored according to a key name on which sets are separated. Subclasses can further specify how the data is split and related.
 42+
3943 METHODS:
4044 init_db
4145 close_db
42 - compose_key
43 - include_keys
44 - exclude_keys
45 - get_sql_filename_for_query
 46+ compose_key - build a new key in the results based on existing one
 47+ include_keys - include only certain keys based on key name
 48+ exclude_keys - explicitly remove certain types of keys based on key name
 49+ get_sql_filename_for_query - resolve a sql filename based on the simpler query type
4650
4751
4852 """
@@ -172,11 +176,8 @@
173177 """
174178 Return a specific query name given a query type
175179
176 - INPUT:
177 - query_type -
178 -
179180 RETURN:
180 - query_name -
 181+ query_name - name of the sql file
181182
182183 """
183184 def get_sql_filename_for_query(self):
@@ -184,13 +185,17 @@
185186 try:
186187 return self._query_names_[self._query_type_]
187188 except KeyError:
188 - print >> sys.stderr, 'Could not find a query for type: ' + query_type
 189+ print >> sys.stderr, 'Could not find a query for type: ' + self._query_type_
189190 sys.exit(2)
190191
191192
192193
193 -
 194+"""
 195+
 196+ This Loader inherits the functionality of DataLoader and handles SQL queries that group data by time intervals. These are generally preferable for most
 197+ of the time dependent data analysis and also provides functionality that enables the raw results to be combined over all keys
194198
 199+"""
195200 class IntervalReportingLoader(DataLoader):
196201
197202 _summary_data_ = None
@@ -201,11 +206,15 @@
202207 self._query_names_['campaign'] = 'report_campaign_metrics_minutely'
203208 self._query_names_['campaign_total'] = 'report_campaign_metrics_minutely_total'
204209
 210+ self._query_names_[FDH._QTYPE_BANNER_ + FDH._QTYPE_TIME_] = 'report_banner_metrics_minutely_all'
 211+ self._query_names_[FDH._QTYPE_LP_ + FDH._QTYPE_TIME_] = 'report_lp_metrics_minutely_all'
 212+ self._query_names_[FDH._QTYPE_CAMPAIGN_ + FDH._QTYPE_TIME_] = 'report_campaign_metrics_minutely_all'
 213+
205214 self._query_type_ = query_type
206215
207216 """ hardcode the data handler for now """
208217 self._data_handler_ = FDH
209 -
 218+
210219 """
211220 Executes the query which generates interval metrics and sets _results_ and _col_names_
212221
@@ -256,7 +265,7 @@
257266
258267 metric_index = QD.get_metric_index(query_name, metric_name)
259268 time_index = QD.get_time_index(query_name)
260 -
 269+ #print sql_stmnt
261270 """ Compose the data for each separate donor pipeline artifact """
262271 try:
263272 # err_msg = sql_stmnt
@@ -337,14 +346,9 @@
338347
339348
340349 """
341 - Post process raw data from query. Combines data rows according to column type definitions. This must be run *after*
 350+ Post process raw data from query. Combines data rows according to column type definitions. This must be run *after* run_query.
342351
343 - INPUT:
344 - data - a list of rows
345 - data_handler - the data handler module
346 - query_name -
347 - RETURN:
348 - the dictionary of combined rows (note that there must be a key column)
 352+ This allows aggregates of query data to be performed after the actual processing of the query.
349353 """
350354 def combine_rows(self):
351355
@@ -380,7 +384,13 @@
381385 """ Change null values to 0 """
382386 if field == None or field == 'NULL':
383387 field = 0.0
384 -
 388+
 389+ """
 390+ COMBINE THE DATA FOR EACH KEY
 391+
 392+ Based on the column type compile an aggregate (e.g. sum, average)
 393+
 394+ """
385395 if col_type == self._data_handler_._COLTYPE_RATE_:
386396
387397 try:
@@ -395,11 +405,14 @@
396406 except KeyError as e:
397407 data_dict[key][self._col_names_[i]] = float(field)
398408
399 - """ !! MODIFY --- this could cause issues in the case of missing data """
 409+ """ !! MODIFY / FIXME --- this could cause issues in the case of missing data """
400410 num_rows = len(self._results_) / len(data_dict.keys())
401411
402 - """ POST PROCESSING
403 - Normalize rate columns """
 412+ """
 413+ POST PROCESSING
 414+
 415+ Normalize rate columns
 416+ """
404417 for i in range(len(col_types)):
405418 if col_types[i] == self._data_handler_._COLTYPE_RATE_:
406419 for key in data_dict.keys():
@@ -411,6 +424,10 @@
412425
413426 """
414427
 428+ This class inherits the IntervalReportingLoader functionality but utilizes the campaign DataLoader instead. Also the results generated incorporate campaign totals --
 429+ the result fully specifies each campaign item (campaign - banner - landing page) and the number of views resulting.
 430+
 431+ This differs from the interval reporter in that it compiles results for views and donations only for an entire donation process or pipeline.
415432
416433 """
417434 class CampaignIntervalReportingLoader(IntervalReportingLoader):
@@ -453,181 +470,40 @@
454471
455472 return [metrics, times]
456473
457 -
458 -"""
459 -
460 - CLASS :: BannerLPReportingLoader
461474
462 - This dataloader handles reporting on banners and landing pages.
463 -
464 - METHODS:
465 - run_query
466 -
467 -
468475 """
469 -class BannerLPReportingLoader(DataLoader):
470 -
471 - def __init__(self, query_type):
472 - self._query_names_['LP'] = 'report_LP_metrics'
473 - self._query_names_['BAN'] = 'report_banner_metrics'
474 - self._query_names_['BAN-TEST'] = 'report_banner_metrics'
475 - self._query_names_['LP-TEST'] = 'report_LP_metrics'
476 -
477 - self._query_type_ = query_type
478 -
479 - """
480 - <description>
481 -
482 - INPUT:
483 -
484 - RETURN:
485 -
486 - """
487 - def run_query(self,start_time, end_time, campaign, metric_name):
488 -
489 - self.init_db()
490 -
491 - metric_lists = Hlp.AutoVivification()
492 - time_lists = Hlp.AutoVivification()
493 - # table_data = [] # store the results in a table for reporting
494 -
495 - # Load the SQL File & Format
496 - filename = self._sql_path_ + query_name + '.sql'
497 - sql_stmnt = Hlp.read_sql(filename)
498 -
499 - sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign])
500 -
501 - key_index = QD.get_key_index(query_name)
502 - time_index = QD.get_time_index(query_name)
503 - metric_index = QD.get_metric_index(query_name, metric_name)
504 -
505 - """ Composes the data for each banner or LP """
506 - try:
507 - err_msg = sql_stmnt
508 - self.cur.execute(sql_stmnt)
509 -
510 - results = self.cur.fetchall()
511 -
512 - for row in results:
513 -
514 - key_name = row[key_index]
515 -
516 - try:
517 - metric_lists[key_name].append(row[metric_index])
518 - time_lists[key_name].append(row[time_index])
519 - except:
520 - metric_lists[key_name] = list()
521 - time_lists[key_name] = list()
522 -
523 - metric_lists[key_name].append(row[metric_index])
524 - time_lists[key_name].append(row[time_index])
525 -
526 - except:
527 - self.db.rollback()
528 - sys.exit("Database Interface Exception:\n" + err_msg)
529 -
530 - """ Convert Times to Integers """
531 - max_i = 0
532 -
533 - for key in time_lists.keys():
534 - for date_str in time_lists[key]:
535 - day_int = int(date_str[8:10])
536 - hr_int = int(date_str[11:13])
537 - date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
538 - if date_int > max_i:
539 - max_i = date_int
540 - max_day = day_int
541 - max_hr = hr_int
542 -
543 -
544 - # Normalize dates
545 - time_norm = Hlp.AutoVivification()
546 - for key in time_lists.keys():
547 - for date_str in time_lists[key]:
548 - day = int(date_str[8:10])
549 - hr = int(date_str[11:13])
550 - # date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
551 - elem = (day - max_day) * 24 + (hr - max_hr)
552 - try:
553 - time_norm[key].append(elem)
554 - except:
555 - time_norm[key] = list()
556 - time_norm[key].append(elem)
557 -
558 -
559 - self.close_db()
560 -
561 - return [metric_lists, time_norm]
562 -
563 -
564 - """
565 -
566 - <description>
567 -
568 - INPUT:
569 -
570 - RETURN:
571 -
572 - """
573 - def get_latest_campaign(self):
574 -
575 - query_name = 'report_latest_campaign'
576 - self.init_db()
577 -
578 - """ Look at campaigns over the past 24 hours - TS format=1, TS resolution=1 """
579 - now = datetime.datetime.now()
580 - hours_back = 72
581 - times = self.gen_date_strings(now, hours_back,1,1)
582 -
583 - sql_stmnt = Hlp.read_sql('./sql/report_latest_campaign.sql')
584 - sql_stmnt = QD.format_query(query_name, sql_stmnt, [times[0]])
585 -
586 - campaign_index = QD.get_campaign_index(query_name)
587 - time_index = QD.get_time_index(query_name)
588 -
589 - try:
590 - err_msg = sql_stmnt
591 - self.cur.execute(sql_stmnt)
592 -
593 - row = self.cur.fetchone()
594 - except:
595 - self.db.rollback()
596 - sys.exit("Database Interface Exception:\n" + err_msg)
597 -
598 - campaign = row[campaign_index]
599 - timestamp = row[time_index]
600 -
601 - self.close_db()
602 -
603 - return [campaign, timestamp]
604 -
605 -
606 -
 476+ This class is concerned with preparing the data for a hypothesis test and is consumed by classes which perform this analysis in HypothesisTest.py
 477+"""
607478 class HypothesisTestLoader(DataLoader):
608479
609480 """
610 - Execute data acquisition for hypothesis tester
 481+ Execute data acquisition for hypothesis tester. The idea is that data sampled identically over time for item 1 and item 2 will be generated on which analysis may be carried out.
611482
 483+ !! MODIFY/ FIXME -- the sampling is awkward, sampling interval and test interval should be specified explicitly !!
 484+
612485 INPUT:
613 - query_name -
614 - metric_name -
 486+ query_name - non-formatted sql filename !!MODIFY / FIXME -- this should be a type instead !!
 487+ metric_name - metric to be extracted from the data
615488 campaign -
616 - item_1 -
617 - item_2 -
 489+ item_1 - artifact or key name
 490+ item_2 - artifact or key name
618491 start_time -
619492 end_time -
620 - interval -
621 - num_samples -
 493+ interval - test interval; this groups a set of samples together to perform a paired t-test
 494+ num_samples - samples per test interval (sampling interval = interval / num_samples)
622495
623496 RETURN:
624 - metrics_1 -
625 - metrics_2 -
626 - times_indices -
 497+ metrics_1 - the metrics for item 1
 498+ metrics_2 - the metrics for item 2
 499+ times_indices - the sampling intervals
627500
628501 """
629502 def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
630503
631 - """ retrieve time lists with timestamp format 1 (yyyyMMddhhmmss) """
 504+ """
 505+ Retrieve time lists with timestamp format 1 (yyyyMMddhhmmss)
 506+ This breaks the metrics into evenly sampled intervals
 507+ """
632508 ret = TP.get_time_lists(start_time, end_time, interval, num_samples, 1)
633509 times = ret[0]
634510 times_indices = ret[1]
@@ -641,6 +517,12 @@
642518 metrics_1 = []
643519 metrics_2 = []
644520
 521+ """
 522+ EXECUTE THE QUERIES FOR EACH INTERVAL
 523+
 524+ Generates metrics for each artifact, sampled in the same way
 525+
 526+ """
645527 for i in range(len(times) - 1):
646528
647529 # print '\nExecuting number ' + str(i) + ' batch of of data.'
@@ -693,12 +575,10 @@
694576
695577 CLASS :: CampaignReportingLoader
696578
697 - This dataloader handles reporting on utm_campaigns.
698 -
699 - METHODS:
700 - run_query
701 -
702 -
 579+ This inherits from the DataLoader base class and handles reporting on utm_campaigns. This reporter handles returning lists of banners and lps from a given campaign and also handles
 580+ reporting donation totals across all campaigns. While this interface is concerned with campaign reporting it does not associate results with time intervals within the start and
 581+ end times
 582+
703583 """
704584 class CampaignReportingLoader(DataLoader):
705585
@@ -711,7 +591,7 @@
712592 self._query_type_ = query_type
713593
714594 """
715 - !! MODIFY -- use python reflection !! ... maybe
 595+ !! MODIFY / FIXME -- use python reflection !! ... maybe
716596
717597 This method is retrieving campaign names
718598
@@ -737,7 +617,7 @@
738618
739619 Handle queries from "report_campaign_totals"
740620
741 - Gets metric totals for campaigns
 621+ Gets metric totals across campaigns
742622
743623 """
744624 def query_totals(self, params):
@@ -790,7 +670,7 @@
791671
792672 Handle queries from "report_campaign_banners" and "report_campaign_lps"
793673
794 - Gets a list of banners and landing pages running on the campaign in a time frame
 674+ Gets a list of banners and landing pages running for a given campaign over a given time frame
795675
796676 """
797677 def query_artifacts(self, params):
@@ -840,11 +720,14 @@
841721
842722 CLASS :: TableLoader
843723
844 - Provides data access particular to the t-test
 724+ Base class for providing MySQL table access. Inherits DataLoader.
845725
846726 METHODS:
847 - init_db -
848 - close_db -
 727+ record_exists - return a boolean value reflecting whether a record exists in the table
 728+ insert_row - try to insert a new record into the table
 729+ delete_row - try to delete a record from the table
 730+ update - try to modify a record in the table
 731+
849732 """
850733 class TableLoader(DataLoader):
851734
@@ -867,6 +750,8 @@
868751
869752 Provides data access particular to the t-test
870753
 754+ storage3.pmtpa.wmnet.faulkner.ttest:
 755+
871756 +--------------------+--------------+------+-----+---------+-------+
872757 | Field | Type | Null | Key | Default | Extra |
873758 +--------------------+--------------+------+-----+---------+-------+
@@ -884,11 +769,11 @@
885770 given the degrees of freedom and statistic t test
886771
887772 INPUT:
888 - degrees_of_freedom -
889 - t -
 773+ degrees_of_freedom - computed degrees of freedom of a dataset modeled on a student's t distribution
 774+ t - test statistic; random variable whose value is to be measured
890775
891776 RETURN:
892 - p -
 777+ p - the highest p value based on the input
893778
894779 """
895780 def get_pValue(self, degrees_of_freedom, t):
@@ -1099,23 +984,31 @@
1100985 """
1101986 def get_test_field(self, row, key):
1102987
1103 - if key == 'test_name':
1104 - return row[0]
1105 - elif key == 'test_type':
1106 - return row[1]
1107 - elif key == 'utm_campaign':
1108 - return row[2]
1109 - elif key == 'start_time':
1110 - return row[3].__str__()
1111 - elif key == 'end_time':
1112 - return row[4].__str__()
1113 - elif key == 'winner':
1114 - return row[5]
1115 - elif key == 'is_conclusive':
1116 - return row[6]
1117 - elif key == 'html_report':
1118 - return row[7]
 988+ try:
 989+ if key == 'test_name':
 990+ return row[0]
 991+ elif key == 'test_type':
 992+ return row[1]
 993+ elif key == 'utm_campaign':
 994+ return row[2]
 995+ elif key == 'start_time':
 996+ return row[3].__str__()
 997+ elif key == 'end_time':
 998+ return row[4].__str__()
 999+ elif key == 'winner':
 1000+ return row[5]
 1001+ elif key == 'is_conclusive':
 1002+ return row[6]
 1003+ elif key == 'html_report':
 1004+ return row[7]
11191005
 1006+ except Exception as inst:
 1007+
 1008+ print type(inst) # the exception instance
 1009+ print inst.args # arguments stored in .args
 1010+ print inst # __str__ allows args to printed directly
 1011+
 1012+ return ''
11201013
11211014
11221015 def delete_row(self):

Status & tagging log