r90999 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r90998 | r90999 | r91000 >
Date: 21:11, 28 June 2011
Author: rfaulk
Status: deferred
Tags:
Comment:
Added flag _was_run_ to enable reuse of old data and avoid rerunning queries for existing data
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py (modified)

Diff

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
@@ -15,14 +15,10 @@
1616 __date__ = "April 8th, 2011"
1717
1818
19 -import sys
20 -# sys.path.append('../')
 19+""" Import python base modules """
 20+import sys, MySQLdb, math, datetime, re
2121
22 -import MySQLdb
23 -import math
24 -import datetime
25 -import re # regular expression matching
26 -
 22+""" Import Analytics modules """
2723 import Fundraiser_Tools.settings as projSet
2824 import Fundraiser_Tools.classes.QueryData as QD
2925 import Fundraiser_Tools.classes.TimestampProcessor as TP
@@ -39,6 +35,10 @@
4036
4137 Data is stored according to a key name on which sets are separated. Subclasses can further specify how the data is split and related.
4238
 39+ MEMBERS:
 40+ @var _db_: instance of MySQLdb connection
 41+ @var _cur_: cursor object for a MySQL connection
 42+
4343 METHODS:
4444 init_db
4545 close_db
@@ -62,14 +62,23 @@
6363 _query_type_ = '' # Stores the query type (dependent on the data handler definition)
6464 _results_ = None
6565 _col_names_ = None
 66+ _was_run_ = False
6667
67 -
 68+ """
 69+ Constructor
 70+
 71+ Ensure that the query is run for each new object
 72+ """
 73+ def __init__(self):
 74+
 75+ self._was_run_ = False
 76+
 77+
6878 def init_db(self):
6979
7080 """ Establish connection """
71 - #db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner')
 81+ # self._db_ = MySQLdb.connect(host=projSet.__db_server__, user=projSet.__user__, db=projSet.__db__, port=projSet.__db_port__, passwd=projSet.__pass__)
7282 self._db_ = MySQLdb.connect(host=projSet.__db_server__, user=projSet.__user__, db=projSet.__db__, port=projSet.__db_port__)
73 - #self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
7483
7584 """ Create cursor """
7685 self._cur_ = self._db_.cursor()
@@ -201,8 +210,10 @@
202211 _summary_data_ = None
203212
204213 def __init__(self, query_type):
205 - self._query_names_['banner'] = 'report_banner_metrics_minutely'
206 - self._query_names_['LP'] = 'report_LP_metrics_minutely'
 214+
 215+ self._query_names_[FDH._QTYPE_BANNER_] = 'report_banner_metrics_minutely'
 216+ self._query_names_[FDH._QTYPE_LP_] = 'report_LP_metrics_minutely'
 217+ self._query_names_[FDH._QTYPE_BANNER_LP_] = 'report_bannerLP_metrics_minutely'
207218 self._query_names_['campaign'] = 'report_campaign_metrics_minutely'
208219 self._query_names_['campaign_total'] = 'report_campaign_metrics_minutely_total'
209220
@@ -211,9 +222,13 @@
212223 self._query_names_[FDH._QTYPE_CAMPAIGN_ + FDH._QTYPE_TIME_] = 'report_campaign_metrics_minutely_all'
213224
214225 self._query_type_ = query_type
215 -
 226+
216227 """ hardcode the data handler for now """
217228 self._data_handler_ = FDH
 229+
 230+ """ Call constructor of parent """
 231+ DataLoader.__init__(self)
 232+
218233
219234 """
220235 Executes the query which generates interval metrics and sets _results_ and _col_names_
@@ -233,16 +248,15 @@
234249 _results_ - list containing the rows generated by the query
235250 """
236251 def run_query(self, start_time, end_time, interval, metric_name, campaign):
237 -
 252+
238253 self.init_db()
239254
240255 query_name = self.get_sql_filename_for_query()
241256 print >> sys.stdout, 'Using query: ' + query_name
242257
243258 metrics = Hlp.AutoVivification()
244 - times = Hlp.AutoVivification()
245 - self._col_names_ = list()
246 -
 259+ times = Hlp.AutoVivification()
 260+
247261 """ Compose datetime objects to represent the first and last intervals """
248262 start_time_obj = TP.timestamp_to_obj(start_time, 1)
249263 start_time_obj = start_time_obj.replace(minute=int(math.floor(start_time_obj.minute / interval) * interval))
@@ -252,29 +266,37 @@
253267 # end_time_obj = end_time_obj + datetime.timedelta(seconds=-1)
254268 end_time_obj = end_time_obj.replace(minute=int(math.floor(end_time_obj.minute / interval) * interval))
255269 end_time_obj_str = TP.timestamp_from_obj(end_time_obj, 1, 3)
256 -
257 -
258 - """ Load the SQL File & Format """
259 - filename = self._sql_path_+ query_name + '.sql'
260 - sql_stmnt = Hlp.read_sql(filename)
261 -
262 - sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval])
263 -
 270+
 271+ """ QUERY PREP - ONLY EXECUTED IF THE QUERY HAS NOT BEEN RUN ALREADY """
 272+ if not(self._was_run_):
 273+ """ Load the SQL File & Format """
 274+ filename = self._sql_path_+ query_name + '.sql'
 275+ sql_stmnt = Hlp.read_sql(filename)
 276+
 277+ sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval])
 278+
264279 """ Get Indexes into Query """
265280 key_index = QD.get_key_index(query_name)
266 -
267281 metric_index = QD.get_metric_index(query_name, metric_name)
268282 time_index = QD.get_time_index(query_name)
269 - #print sql_stmnt
 283+
270284 """ Compose the data for each separate donor pipeline artifact """
271285 try:
272 - # err_msg = sql_stmnt
273 - self._cur_.execute(sql_stmnt)
 286+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 287+ if not(self._was_run_):
 288+ print >> sys.stdout, 'Running query ...'
 289+
 290+ self._cur_.execute(sql_stmnt)
 291+
 292+ """ GET THE COLUMN NAMES FROM THE QUERY RESULTS """
 293+ self._col_names_ = list()
 294+ for i in self._cur_.description:
 295+ self._col_names_.append(i[0])
 296+
 297+ self._results_ = self._cur_.fetchall()
 298+
 299+ self._was_run_ = True
274300
275 - for i in self._cur_.description:
276 - self._col_names_.append(i[0])
277 -
278 - self._results_ = self._cur_.fetchall()
279301 final_time = dict() # stores the last timestamp seen
280302 interval_obj = datetime.timedelta(minutes=interval) # timedelta object used to shift times by _interval_ minutes
281303
@@ -317,7 +339,7 @@
318340 print inst # __str__ allows args to printed directly
319341
320342 self._db_.rollback()
321 - sys.exit(0)
 343+ # sys.exit(0)
322344
323345
324346 """ Ensure that the last time in the list is the endtime less the interval """
@@ -391,6 +413,7 @@
392414 Based on the column type compile an aggregate (e.g. sum, average)
393415
394416 """
 417+
395418 if col_type == self._data_handler_._COLTYPE_RATE_:
396419
397420 try:
@@ -430,8 +453,16 @@
431454 This differs from the interval reporter in that it compiles results for views and donations only for an entire donation process or pipeline.
432455
433456 """
434 -class CampaignIntervalReportingLoader(IntervalReportingLoader):
 457+class CampaignIntervalReportingLoader(DataLoader):
 458+
 459+
 460+ def __init__(self):
 461+ self._query_type_ = 'campaign'
435462
 463+ self._irl_artifacts_ = IntervalReportingLoader('campaign')
 464+ self._irl_totals_ = IntervalReportingLoader('campaign_total')
 465+
 466+
436467 """
437468 <DESCRIPTION>
438469
@@ -451,23 +482,20 @@
452483 def run_query(self, start_time, end_time, interval, metric_name, campaign):
453484
454485 """ Execute the standard interval reporting query """
455 - ir = IntervalReportingLoader('campaign')
456 - data = ir.run_query(start_time, end_time, interval, metric_name, campaign)
 486+ data = self._irl_artifacts_.run_query(start_time, end_time, interval, metric_name, campaign)
457487 metrics = data[0]
458488 times = data[1]
459 -
 489+
460490 """ Get the totals for campaign views and donations """
461 - ir = IntervalReportingLoader('campaign_total')
462 - data = ir.run_query(start_time, end_time, interval, metric_name, campaign)
 491+ data = self._irl_totals_.run_query(start_time, end_time, interval, metric_name, campaign)
463492 metrics_total = data[0]
464493 times_total = data[1]
465 -
 494+
466495 """ Combine the results for the campaign totals with (banner, landing page, campaign) """
467496 for key in metrics_total.keys():
468497 metrics[key] = metrics_total[key]
469498 times[key] = times_total[key]
470 -
471 -
 499+
472500 return [metrics, times]
473501
474502
@@ -477,6 +505,21 @@
478506 class HypothesisTestLoader(DataLoader):
479507
480508 """
 509+ Constructor
 510+
 511+ _results_ will be a list storing the data for each artifact
 512+ """
 513+ def __init__(self):
 514+
 515+ self._results_ = list()
 516+ self._results_.append(list())
 517+ self._results_.append(list())
 518+
 519+ """ Call constructor of parent """
 520+ DataLoader.__init__(self)
 521+
 522+
 523+ """
481524 Execute data acquisition for hypothesis tester. The idea is that data sampled identically over time for item 1 and item 2 will be generated on which analysis may be carried out.
482525
483526 !! MODIFY/ FIXME -- the sampling is awkward, sampling interval and test interval should be specified explicitly !!
@@ -500,6 +543,8 @@
501544 """
502545 def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
503546
 547+ print >> sys.stdout, 'Using query: ' + query_name
 548+
504549 """
505550 Retrieve time lists with timestamp format 1 (yyyyMMddhhmmss)
506551 This breaks the metrics into evenly sampled intervals
@@ -508,11 +553,14 @@
509554 times = ret[0]
510555 times_indices = ret[1]
511556
512 - self.init_db()
513 -
514 - filename = self._sql_path_ + query_name + '.sql'
515 - sql_stmnt = Hlp.read_sql(filename)
516 -
 557+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 558+ if not(self._was_run_):
 559+
 560+ self.init_db()
 561+
 562+ filename = self._sql_path_ + query_name + '.sql'
 563+ sql_stmnt = Hlp.read_sql(filename)
 564+
517565 metric_index = QD.get_metric_index(query_name, metric_name)
518566 metrics_1 = []
519567 metrics_2 = []
@@ -529,43 +577,56 @@
530578 t1 = times[i]
531579 t2 = times[i+1]
532580
533 - formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
534 - formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
535 -
 581+ if not(self._was_run_):
 582+ formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
 583+ formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
 584+ # print formatted_sql_stmnt_1
536585 try:
537 - err_msg = formatted_sql_stmnt_1
538586
539 - self._cur_.execute(formatted_sql_stmnt_1)
540 - results_1 = self._cur_.fetchone() # there should only be a single row
 587+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 588+ if not(self._was_run_):
541589
542 - err_msg = formatted_sql_stmnt_2
 590+ print >> sys.stdout, 'Running confidence queries ...'
 591+ err_msg = formatted_sql_stmnt_1
 592+
 593+ self._cur_.execute(formatted_sql_stmnt_1)
 594+ results_1 = self._cur_.fetchone() # there should only be a single row
 595+ self._results_[0].append(results_1)
 596+
 597+ err_msg = formatted_sql_stmnt_2
 598+
 599+ self._cur_.execute(formatted_sql_stmnt_2)
 600+ results_2 = self._cur_.fetchone() # there should only be a single row
 601+ self._results_[1].append(results_2)
543602
544 - self._cur_.execute(formatted_sql_stmnt_2)
545 - results_2 = self._cur_.fetchone() # there should only be a single row
546 -
547603 except Exception as inst:
548604 print type(inst) # the exception instance
549605 print inst.args # arguments stored in .args
550606 print inst # __str__ allows args to printed directly
551607
552608 self._db_.rollback()
553 - sys.exit("Database Interface Exception:\n" + err_msg)
 609+ #sys.exit("Database Interface Exception:\n" + err_msg)
554610
555611 """ If no results are returned in this set the sample value is 0.0
556612 !! MODIFY -- these results should not count as data points !! """
557613 try:
558 - metrics_1.append(results_1[metric_index])
 614+ metrics_1.append(self._results_[0][i][metric_index])
559615 except TypeError:
560616 metrics_1.append(0.0)
561617 try:
562 - metrics_2.append(results_2[metric_index])
 618+ metrics_2.append(self._results_[1][i][metric_index])
563619 except TypeError:
564620 metrics_2.append(0.0)
565621
 622+ #print self._results_
566623 #print metrics_1
567624 #print metrics_2
568625
569 - self.close_db()
 626+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 627+ if not(self._was_run_):
 628+
 629+ self.close_db()
 630+ self._was_run_ = True
570631
571632 # return the metric values at each time
572633 return [metrics_1, metrics_2, times_indices]
@@ -590,6 +651,9 @@
591652
592653 self._query_type_ = query_type
593654
 655+ """ Call constructor of parent """
 656+ DataLoader.__init__(self)
 657+
594658 """
595659 !! MODIFY / FIXME -- use python reflection !! ... maybe
596660
@@ -763,7 +827,7 @@
764828
765829 """
766830 class TTestLoaderHelp(TableLoader):
767 -
 831+
768832 """
769833 This method knows about faulkner.t_test. This is a lookup table for p-values
770834 given the degrees of freedom and statistic t test
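
The same guard motivates the CampaignIntervalReportingLoader refactor above: rather than constructing a fresh IntervalReportingLoader on each call, the class now holds two loader instances ('campaign' and 'campaign_total') as members, so their _was_run_ state survives across calls. A hypothetical usage sketch follows; the timestamps (yyyyMMddhhmmss, as the docstrings describe), metric name, and campaign are placeholder values.

loader = CampaignIntervalReportingLoader()

# First call executes both the 'campaign' and 'campaign_total' queries
metrics, times = loader.run_query('20110628000000', '20110628120000',
                                  15, 'impressions', 'test_campaign')

# Second call returns the rows fetched above without re-executing any SQL,
# since each member loader's _was_run_ flag is now True
metrics2, times2 = loader.run_query('20110628000000', '20110628120000',
                                    15, 'impressions', 'test_campaign')

Because the guard checks only _was_run_ and never the call arguments, a query with different parameters needs a fresh loader object; reusing an old one silently returns the earlier data.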
