Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py |
— | — | @@ -15,14 +15,10 @@ |
16 | 16 | __date__ = "April 8th, 2011" |
17 | 17 | |
18 | 18 | |
19 | | -import sys |
20 | | -# sys.path.append('../') |
| 19 | +""" Import python base modules """ |
| 20 | +import sys, MySQLdb, math, datetime, re |
21 | 21 | |
22 | | -import MySQLdb |
23 | | -import math |
24 | | -import datetime |
25 | | -import re # regular expression matching |
26 | | - |
| 22 | +""" Import Analytics modules """ |
27 | 23 | import Fundraiser_Tools.settings as projSet |
28 | 24 | import Fundraiser_Tools.classes.QueryData as QD |
29 | 25 | import Fundraiser_Tools.classes.TimestampProcessor as TP |
— | — | @@ -39,6 +35,10 @@ |
40 | 36 | |
41 | 37 | Data is stored according to a key name by which data sets are separated. Subclasses can further specify how the data is split and related. |
42 | 38 | |
| 39 | + MEMBERS: |
| 40 | + @var _db_: instance of MySQLdb connection |
| 41 | + @var _cur_: cursor object for a MySQL connection |
| 42 | + |
43 | 43 | METHODS: |
44 | 44 | init_db |
45 | 45 | close_db |
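The new MEMBERS block documents the connection state that init_db/close_db manage. For reference, a minimal sketch of that lifecycle, with placeholder connection settings standing in for the projSet values:

    import MySQLdb

    class MinimalLoader(object):
        """ Sketch of the DataLoader connection lifecycle; settings are placeholders """
        _db_ = None
        _cur_ = None

        def init_db(self):
            """ Establish the connection and create a cursor, as in DataLoader.init_db """
            self._db_ = MySQLdb.connect(host='localhost', user='reader', db='faulkner', port=3306)
            self._cur_ = self._db_.cursor()

        def close_db(self):
            """ Release the cursor before closing the connection """
            self._cur_.close()
            self._db_.close()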
— | — | @@ -62,14 +62,23 @@ |
63 | 63 | _query_type_ = '' # Stores the query type (dependent on the data handler definition) |
64 | 64 | _results_ = None |
65 | 65 | _col_names_ = None |
| 66 | + _was_run_ = False |
66 | 67 | |
67 | | - |
| 68 | + """ |
| 69 | + Constructor |
| 70 | + |
| 71 | + Ensure that the query is run for each new object |
| 72 | + """ |
| 73 | + def __init__(self): |
| 74 | + |
| 75 | + self._was_run_ = False |
| 76 | + |
| 77 | + |
68 | 78 | def init_db(self): |
69 | 79 | |
70 | 80 | """ Establish connection """ |
71 | | - #db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner') |
| 81 | + # self._db_ = MySQLdb.connect(host=projSet.__db_server__, user=projSet.__user__, db=projSet.__db__, port=projSet.__db_port__, passwd=projSet.__pass__) |
72 | 82 | self._db_ = MySQLdb.connect(host=projSet.__db_server__, user=projSet.__user__, db=projSet.__db__, port=projSet.__db_port__) |
73 | | - #self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner') |
74 | 83 | |
75 | 84 | """ Create cursor """ |
76 | 85 | self._cur_ = self._db_.cursor() |
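The `_was_run_` flag introduced above is a simple run-once guard: the first run_query call does the work, and later calls on the same object reuse what was cached on the instance. The pattern in isolation, as a sketch:

    class RunOnceLoader(object):
        """ Sketch of the _was_run_ caching pattern used by the loaders """
        def __init__(self):
            self._was_run_ = False
            self._results_ = None

        def run_query(self):
            if not self._was_run_:
                # expensive work happens only on the first call
                self._results_ = [1, 2, 3]  # stand-in for self._cur_.fetchall()
                self._was_run_ = True
            # subsequent calls fall through to the cached results
            return self._results_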
— | — | @@ -201,8 +210,10 @@ |
202 | 211 | _summary_data_ = None |
203 | 212 | |
204 | 213 | def __init__(self, query_type): |
205 | | - self._query_names_['banner'] = 'report_banner_metrics_minutely' |
206 | | - self._query_names_['LP'] = 'report_LP_metrics_minutely' |
| 214 | + |
| 215 | + self._query_names_[FDH._QTYPE_BANNER_] = 'report_banner_metrics_minutely' |
| 216 | + self._query_names_[FDH._QTYPE_LP_] = 'report_LP_metrics_minutely' |
| 217 | + self._query_names_[FDH._QTYPE_BANNER_LP_] = 'report_bannerLP_metrics_minutely' |
207 | 218 | self._query_names_['campaign'] = 'report_campaign_metrics_minutely' |
208 | 219 | self._query_names_['campaign_total'] = 'report_campaign_metrics_minutely_total' |
209 | 220 | |
— | — | @@ -211,9 +222,13 @@ |
212 | 223 | self._query_names_[FDH._QTYPE_CAMPAIGN_ + FDH._QTYPE_TIME_] = 'report_campaign_metrics_minutely_all' |
213 | 224 | |
214 | 225 | self._query_type_ = query_type |
215 | | - |
| 226 | + |
216 | 227 | """ hardcode the data handler for now """ |
217 | 228 | self._data_handler_ = FDH |
| 229 | + |
| 230 | + """ Call constructor of parent """ |
| 231 | + DataLoader.__init__(self) |
| 232 | + |
218 | 233 | |
219 | 234 | """ |
220 | 235 | Executes the query which generates interval metrics and sets _results_ and _col_names_ |
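Keying `_query_names_` on the FDH `_QTYPE_*` constants instead of bare strings keeps the query-type vocabulary in one place, the data handler. A sketch of the dispatch, assuming the constants are plain strings:

    # Assumed constant values; the real ones live in FundraiserDataHandler (FDH)
    _QTYPE_BANNER_ = 'banner'
    _QTYPE_LP_ = 'LP'
    _QTYPE_BANNER_LP_ = 'banner_LP'

    _query_names_ = {
        _QTYPE_BANNER_: 'report_banner_metrics_minutely',
        _QTYPE_LP_: 'report_LP_metrics_minutely',
        _QTYPE_BANNER_LP_: 'report_bannerLP_metrics_minutely',
    }

    def get_sql_filename_for_query(query_type):
        """ Resolve the SQL file name from the query type, as the loader does """
        return _query_names_[query_type]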
— | — | @@ -233,16 +248,15 @@ |
234 | 249 | _results_ - list containing the rows generated by the query |
235 | 250 | """ |
236 | 251 | def run_query(self, start_time, end_time, interval, metric_name, campaign): |
237 | | - |
| 252 | + |
238 | 253 | self.init_db() |
239 | 254 | |
240 | 255 | query_name = self.get_sql_filename_for_query() |
241 | 256 | print >> sys.stdout, 'Using query: ' + query_name |
242 | 257 | |
243 | 258 | metrics = Hlp.AutoVivification() |
244 | | - times = Hlp.AutoVivification() |
245 | | - self._col_names_ = list() |
246 | | - |
| 259 | + times = Hlp.AutoVivification() |
| 260 | + |
247 | 261 | """ Compose datetime objects to represent the first and last intervals """ |
248 | 262 | start_time_obj = TP.timestamp_to_obj(start_time, 1) |
249 | 263 | start_time_obj = start_time_obj.replace(minute=int(math.floor(start_time_obj.minute / interval) * interval)) |
— | — | @@ -252,29 +266,37 @@ |
253 | 267 | # end_time_obj = end_time_obj + datetime.timedelta(seconds=-1) |
254 | 268 | end_time_obj = end_time_obj.replace(minute=int(math.floor(end_time_obj.minute / interval) * interval)) |
255 | 269 | end_time_obj_str = TP.timestamp_from_obj(end_time_obj, 1, 3) |
256 | | - |
257 | | - |
258 | | - """ Load the SQL File & Format """ |
259 | | - filename = self._sql_path_+ query_name + '.sql' |
260 | | - sql_stmnt = Hlp.read_sql(filename) |
261 | | - |
262 | | - sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval]) |
263 | | - |
| 270 | + |
| 271 | + """ QUERY PREP - ONLY EXECUTED IF THE QUERY HAS NOT BEEN RUN ALREADY """ |
| 272 | + if not(self._was_run_): |
| 273 | + """ Load the SQL File & Format """ |
| 274 | + filename = self._sql_path_ + query_name + '.sql' |
| 275 | + sql_stmnt = Hlp.read_sql(filename) |
| 276 | + |
| 277 | + sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval]) |
| 278 | + |
264 | 279 | """ Get Indexes into Query """ |
265 | 280 | key_index = QD.get_key_index(query_name) |
266 | | - |
267 | 281 | metric_index = QD.get_metric_index(query_name, metric_name) |
268 | 282 | time_index = QD.get_time_index(query_name) |
269 | | - #print sql_stmnt |
| 283 | + |
270 | 284 | """ Compose the data for each separate donor pipeline artifact """ |
271 | 285 | try: |
272 | | - # err_msg = sql_stmnt |
273 | | - self._cur_.execute(sql_stmnt) |
| 286 | + """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """ |
| 287 | + if not(self._was_run_): |
| 288 | + print >> sys.stdout, 'Running query ...' |
| 289 | + |
| 290 | + self._cur_.execute(sql_stmnt) |
| 291 | + |
| 292 | + """ GET THE COLUMN NAMES FROM THE QUERY RESULTS """ |
| 293 | + self._col_names_ = list() |
| 294 | + for i in self._cur_.description: |
| 295 | + self._col_names_.append(i[0]) |
| 296 | + |
| 297 | + self._results_ = self._cur_.fetchall() |
| 298 | + |
| 299 | + self._was_run_ = True |
274 | 300 | |
275 | | - for i in self._cur_.description: |
276 | | - self._col_names_.append(i[0]) |
277 | | - |
278 | | - self._results_ = self._cur_.fetchall() |
279 | 301 | final_time = dict() # stores the last timestamp seen |
280 | 302 | interval_obj = datetime.timedelta(minutes=interval) # timedelta object used to shift times by _interval_ minutes |
281 | 303 | |
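The added block derives `_col_names_` from `cursor.description` after executing the statement; per the Python DB-API, each entry of `description` is a 7-item sequence whose first element is the column name. In sketch form:

    def column_names(cursor):
        """ Extract result-set column names from a DB-API cursor """
        return [d[0] for d in cursor.description]

    # e.g. after cur.execute('SELECT ts, views, donations FROM ...'):
    #   column_names(cur) -> ['ts', 'views', 'donations']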
— | — | @@ -317,7 +339,7 @@ |
318 | 340 | print inst # __str__ allows args to be printed directly |
319 | 341 | |
320 | 342 | self._db_.rollback() |
321 | | - sys.exit(0) |
| 343 | + # sys.exit(0) |
322 | 344 | |
323 | 345 | |
324 | 346 | """ Ensure that the last time in the list is the endtime less the interval """ |
— | — | @@ -391,6 +413,7 @@ |
392 | 414 | Based on the column type compile an aggregate (e.g. sum, average) |
393 | 415 | |
394 | 416 | """ |
| 417 | + |
395 | 418 | if col_type == self._data_handler_._COLTYPE_RATE_: |
396 | 419 | |
397 | 420 | try: |
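As the surrounding code suggests, the aggregate depends on the column type: rate columns cannot simply be summed the way counts can. A hedged sketch of the distinction (the plain mean below is an assumption; the real code may weight rates differently):

    def aggregate(values, col_type):
        """ Sum count/amount columns; average rate columns (assumed behaviour) """
        if col_type == 'rate':  # stand-in for FDH._COLTYPE_RATE_
            return sum(values) / float(len(values))
        return sum(values)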
— | — | @@ -430,8 +453,16 @@ |
431 | 454 | This differs from the interval reporter in that it compiles results for views and donations only for an entire donation process or pipeline. |
432 | 455 | |
433 | 456 | """ |
434 | | -class CampaignIntervalReportingLoader(IntervalReportingLoader): |
| 457 | +class CampaignIntervalReportingLoader(DataLoader): |
| 458 | + |
| 459 | + |
| 460 | + def __init__(self): |
| 461 | + self._query_type_ = 'campaign' |
435 | 462 | |
| 463 | + self._irl_artifacts_ = IntervalReportingLoader('campaign') |
| 464 | + self._irl_totals_ = IntervalReportingLoader('campaign_total') |
| 465 | + |
| 466 | + |
436 | 467 | """ |
437 | 468 | <DESCRIPTION> |
438 | 469 | |
— | — | @@ -451,23 +482,20 @@ |
452 | 483 | def run_query(self, start_time, end_time, interval, metric_name, campaign): |
453 | 484 | |
454 | 485 | """ Execute the standard interval reporting query """ |
455 | | - ir = IntervalReportingLoader('campaign') |
456 | | - data = ir.run_query(start_time, end_time, interval, metric_name, campaign) |
| 486 | + data = self._irl_artifacts_.run_query(start_time, end_time, interval, metric_name, campaign) |
457 | 487 | metrics = data[0] |
458 | 488 | times = data[1] |
459 | | - |
| 489 | + |
460 | 490 | """ Get the totals for campaign views and donations """ |
461 | | - ir = IntervalReportingLoader('campaign_total') |
462 | | - data = ir.run_query(start_time, end_time, interval, metric_name, campaign) |
| 491 | + data = self._irl_totals_.run_query(start_time, end_time, interval, metric_name, campaign) |
463 | 492 | metrics_total = data[0] |
464 | 493 | times_total = data[1] |
465 | | - |
| 494 | + |
466 | 495 | """ Combine the results for the campaign totals with (banner, landing page, campaign) """ |
467 | 496 | for key in metrics_total.keys(): |
468 | 497 | metrics[key] = metrics_total[key] |
469 | 498 | times[key] = times_total[key] |
470 | | - |
471 | | - |
| 499 | + |
472 | 500 | return [metrics, times] |
473 | 501 | |
474 | 502 | |
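Holding the two IntervalReportingLoader instances as members, rather than constructing them inside run_query as before, lets their `_was_run_` caches survive across calls. A hypothetical usage sketch (the campaign and metric names are illustrative):

    loader = CampaignIntervalReportingLoader()
    # timestamps use format 1 (yyyyMMddhhmmss), as elsewhere in this module
    metrics, times = loader.run_query('20110408000000', '20110409000000', 15,
                                      'don_per_imp', 'sample_campaign')
    # a second call now reuses the cached results of both member loaders
    metrics, times = loader.run_query('20110408000000', '20110409000000', 15,
                                      'don_per_imp', 'sample_campaign')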
— | — | @@ -477,6 +505,21 @@ |
478 | 506 | class HypothesisTestLoader(DataLoader): |
479 | 507 | |
480 | 508 | """ |
| 509 | + Constructor |
| 510 | + |
| 511 | + _results_ will be a list storing the data for each artifact |
| 512 | + """ |
| 513 | + def __init__(self): |
| 514 | + |
| 515 | + self._results_ = list() |
| 516 | + self._results_.append(list()) |
| 517 | + self._results_.append(list()) |
| 518 | + |
| 519 | + """ Call constructor of parent """ |
| 520 | + DataLoader.__init__(self) |
| 521 | + |
| 522 | + |
| 523 | + """ |
481 | 524 | Execute data acquisition for the hypothesis tester. The idea is to generate data sampled identically over time for item 1 and item 2, on which analysis may then be carried out. |
482 | 525 | |
483 | 526 | !! MODIFY/ FIXME -- the sampling is awkward, sampling interval and test interval should be specified explicitly !! |
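The constructor seeds `_results_` with one empty list per artifact, so run_query can append one row per sample interval for each item. Schematically:

    _results_ = [[], []]  # [rows sampled for item_1, rows sampled for item_2]
    # per sample interval i:
    #   _results_[0].append(row_for_item_1)
    #   _results_[1].append(row_for_item_2)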
— | — | @@ -500,6 +543,8 @@ |
501 | 544 | """ |
502 | 545 | def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples): |
503 | 546 | |
| 547 | + print >> sys.stdout, 'Using query: ' + query_name |
| 548 | + |
504 | 549 | """ |
505 | 550 | Retrieve time lists with timestamp format 1 (yyyyMMddhhmmss) |
506 | 551 | This breaks the metrics into evenly sampled intervals |
— | — | @@ -508,11 +553,14 @@ |
509 | 554 | times = ret[0] |
510 | 555 | times_indices = ret[1] |
511 | 556 | |
512 | | - self.init_db() |
513 | | - |
514 | | - filename = self._sql_path_ + query_name + '.sql' |
515 | | - sql_stmnt = Hlp.read_sql(filename) |
516 | | - |
| 557 | + """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """ |
| 558 | + if not(self._was_run_): |
| 559 | + |
| 560 | + self.init_db() |
| 561 | + |
| 562 | + filename = self._sql_path_ + query_name + '.sql' |
| 563 | + sql_stmnt = Hlp.read_sql(filename) |
| 564 | + |
517 | 565 | metric_index = QD.get_metric_index(query_name, metric_name) |
518 | 566 | metrics_1 = [] |
519 | 567 | metrics_2 = [] |
— | — | @@ -529,43 +577,56 @@ |
530 | 578 | t1 = times[i] |
531 | 579 | t2 = times[i+1] |
532 | 580 | |
533 | | - formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign]) |
534 | | - formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign]) |
535 | | - |
| 581 | + if not(self._was_run_): |
| 582 | + formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign]) |
| 583 | + formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign]) |
| 584 | + # print formatted_sql_stmnt_1 |
536 | 585 | try: |
537 | | - err_msg = formatted_sql_stmnt_1 |
538 | 586 | |
539 | | - self._cur_.execute(formatted_sql_stmnt_1) |
540 | | - results_1 = self._cur_.fetchone() # there should only be a single row |
| 587 | + """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """ |
| 588 | + if not(self._was_run_): |
541 | 589 | |
542 | | - err_msg = formatted_sql_stmnt_2 |
| 590 | + print >> sys.stdout, 'Running confidence queries ...' |
| 591 | + err_msg = formatted_sql_stmnt_1 |
| 592 | + |
| 593 | + self._cur_.execute(formatted_sql_stmnt_1) |
| 594 | + results_1 = self._cur_.fetchone() # there should only be a single row |
| 595 | + self._results_[0].append(results_1) |
| 596 | + |
| 597 | + err_msg = formatted_sql_stmnt_2 |
| 598 | + |
| 599 | + self._cur_.execute(formatted_sql_stmnt_2) |
| 600 | + results_2 = self._cur_.fetchone() # there should only be a single row |
| 601 | + self._results_[1].append(results_2) |
543 | 602 | |
544 | | - self._cur_.execute(formatted_sql_stmnt_2) |
545 | | - results_2 = self._cur_.fetchone() # there should only be a single row |
546 | | - |
547 | 603 | except Exception as inst: |
548 | 604 | print type(inst) # the exception instance |
549 | 605 | print inst.args # arguments stored in .args |
550 | 606 | print inst # __str__ allows args to be printed directly |
551 | 607 | |
552 | 608 | self._db_.rollback() |
553 | | - sys.exit("Database Interface Exception:\n" + err_msg) |
| 609 | + #sys.exit("Database Interface Exception:\n" + err_msg) |
554 | 610 | |
555 | 611 | """ If no results are returned in this set the sample value is 0.0 |
556 | 612 | !! MODIFY -- these results should not count as data points !! """ |
557 | 613 | try: |
558 | | - metrics_1.append(results_1[metric_index]) |
| 614 | + metrics_1.append(self._results_[0][i][metric_index]) |
559 | 615 | except TypeError: |
560 | 616 | metrics_1.append(0.0) |
561 | 617 | try: |
562 | | - metrics_2.append(results_2[metric_index]) |
| 618 | + metrics_2.append(self._results_[1][i][metric_index]) |
563 | 619 | except TypeError: |
564 | 620 | metrics_2.append(0.0) |
565 | 621 | |
| 622 | + #print self._results_ |
566 | 623 | #print metrics_1 |
567 | 624 | #print metrics_2 |
568 | 625 | |
569 | | - self.close_db() |
| 626 | + """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """ |
| 627 | + if not(self._was_run_): |
| 628 | + |
| 629 | + self.close_db() |
| 630 | + self._was_run_ = True |
570 | 631 | |
571 | 632 | # return the metric values at each time |
572 | 633 | return [metrics_1, metrics_2, times_indices] |
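The return value pairs the two metric series sample-for-sample, which is the shape a two-sample hypothesis test needs. A hypothetical usage sketch (the query, metric, campaign, and item names are illustrative):

    htl = HypothesisTestLoader()
    metrics_1, metrics_2, times = htl.run_query(
        'report_banner_confidence', 'don_per_imp', 'sample_campaign',
        'banner_A', 'banner_B', '20110408000000', '20110409000000', 60, 24)

    # paired samples can now feed a test statistic, e.g. the mean difference
    n = len(metrics_1)
    mean_diff = sum(a - b for a, b in zip(metrics_1, metrics_2)) / float(n)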
— | — | @@ -590,6 +651,9 @@ |
591 | 652 | |
592 | 653 | self._query_type_ = query_type |
593 | 654 | |
| 655 | + """ Call constructor of parent """ |
| 656 | + DataLoader.__init__(self) |
| 657 | + |
594 | 658 | """ |
595 | 659 | !! MODIFY / FIXME -- use python reflection !! ... maybe |
596 | 660 | |
— | — | @@ -763,7 +827,7 @@ |
764 | 828 | |
765 | 829 | """ |
766 | 830 | class TTestLoaderHelp(TableLoader): |
767 | | - |
| 831 | + |
768 | 832 | """ |
769 | 833 | This class knows about faulkner.t_test, a lookup table for p-values |
770 | 834 | given the degrees of freedom and the t-test statistic
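A p-value lookup keyed on degrees of freedom and the t statistic typically selects the largest tabulated t not exceeding the observed value. A sketch of such a query, assuming a faulkner.t_test table with columns (degrees_of_freedom, t, p):

    def lookup_p_value(cur, df, t_stat):
        """ Hypothetical lookup against t_test(degrees_of_freedom, t, p) """
        sql = ("SELECT p FROM t_test "
               "WHERE degrees_of_freedom = %s AND t <= %s "
               "ORDER BY t DESC LIMIT 1")
        cur.execute(sql, (df, t_stat))
        row = cur.fetchone()
        return row[0] if row is not None else None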