r90999 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r90998 | r90999 | r91000 >
Date: 21:11, 28 June 2011
Author: rfaulk
Status: deferred
Tags:
Comment:
Added flag _was_run_ to enable reuse of old data and avoid rerunning queries for existing data
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py (modified)

Diff

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
@@ -15,14 +15,10 @@
1616 __date__ = "April 8th, 2011"
1717
1818
19 -import sys
20 -# sys.path.append('../')
 19+""" Import python base modules """
 20+import sys, MySQLdb, math, datetime, re
2121
22 -import MySQLdb
23 -import math
24 -import datetime
25 -import re # regular expression matching
26 -
 22+""" Import Analytics modules """
2723 import Fundraiser_Tools.settings as projSet
2824 import Fundraiser_Tools.classes.QueryData as QD
2925 import Fundraiser_Tools.classes.TimestampProcessor as TP
@@ -39,6 +35,10 @@
4036
4137 Data is stored according to a key name on which sets are separated. Subclasses can further specify how the data is split and related.
4238
 39+ MEMBERS:
 40+ @var _db_: instance of MySQLdb connection
 41+ @var _cur_: cursor object for a MySQL connection
 42+
4343 METHODS:
4444 init_db
4545 close_db
@@ -62,14 +62,23 @@
6363 _query_type_ = '' # Stores the query type (dependent on the data handler definition)
6464 _results_ = None
6565 _col_names_ = None
 66+ _was_run_ = False
6667
67 -
 68+ """
 69+ Constructor
 70+
 71+ Ensure that the query is run for each new object
 72+ """
 73+ def __init__(self):
 74+
 75+ self._was_run_ = False
 76+
 77+
6878 def init_db(self):
6979
7080 """ Establish connection """
71 - #db = MySQLdb.connect(host='db10.pmtpa.wmnet', user='rfaulk', db='faulkner')
 81+ # self._db_ = MySQLdb.connect(host=projSet.__db_server__, user=projSet.__user__, db=projSet.__db__, port=projSet.__db_port__, passwd=projSet.__pass__)
7282 self._db_ = MySQLdb.connect(host=projSet.__db_server__, user=projSet.__user__, db=projSet.__db__, port=projSet.__db_port__)
73 - #self.db = MySQLdb.connect(host='storage3.pmtpa.wmnet', user='rfaulk', db='faulkner')
7483
7584 """ Create cursor """
7685 self._cur_ = self._db_.cursor()
@@ -201,8 +210,10 @@
202211 _summary_data_ = None
203212
204213 def __init__(self, query_type):
205 - self._query_names_['banner'] = 'report_banner_metrics_minutely'
206 - self._query_names_['LP'] = 'report_LP_metrics_minutely'
 214+
 215+ self._query_names_[FDH._QTYPE_BANNER_] = 'report_banner_metrics_minutely'
 216+ self._query_names_[FDH._QTYPE_LP_] = 'report_LP_metrics_minutely'
 217+ self._query_names_[FDH._QTYPE_BANNER_LP_] = 'report_bannerLP_metrics_minutely'
207218 self._query_names_['campaign'] = 'report_campaign_metrics_minutely'
208219 self._query_names_['campaign_total'] = 'report_campaign_metrics_minutely_total'
209220
@@ -211,9 +222,13 @@
212223 self._query_names_[FDH._QTYPE_CAMPAIGN_ + FDH._QTYPE_TIME_] = 'report_campaign_metrics_minutely_all'
213224
214225 self._query_type_ = query_type
215 -
 226+
216227 """ hardcode the data handler for now """
217228 self._data_handler_ = FDH
 229+
 230+ """ Call constructor of parent """
 231+ DataLoader.__init__(self)
 232+
218233
219234 """
220235 Executes the query which generates interval metrics and sets _results_ and _col_names_
@@ -233,16 +248,15 @@
234249 _results_ - list containing the rows generated by the query
235250 """
236251 def run_query(self, start_time, end_time, interval, metric_name, campaign):
237 -
 252+
238253 self.init_db()
239254
240255 query_name = self.get_sql_filename_for_query()
241256 print >> sys.stdout, 'Using query: ' + query_name
242257
243258 metrics = Hlp.AutoVivification()
244 - times = Hlp.AutoVivification()
245 - self._col_names_ = list()
246 -
 259+ times = Hlp.AutoVivification()
 260+
247261 """ Compose datetime objects to represent the first and last intervals """
248262 start_time_obj = TP.timestamp_to_obj(start_time, 1)
249263 start_time_obj = start_time_obj.replace(minute=int(math.floor(start_time_obj.minute / interval) * interval))
@@ -252,29 +266,37 @@
253267 # end_time_obj = end_time_obj + datetime.timedelta(seconds=-1)
254268 end_time_obj = end_time_obj.replace(minute=int(math.floor(end_time_obj.minute / interval) * interval))
255269 end_time_obj_str = TP.timestamp_from_obj(end_time_obj, 1, 3)
256 -
257 -
258 - """ Load the SQL File & Format """
259 - filename = self._sql_path_+ query_name + '.sql'
260 - sql_stmnt = Hlp.read_sql(filename)
261 -
262 - sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval])
263 -
 270+
 271+ """ QUERY PREP - ONLY EXECUTED IF THE QUERY HAS NOT BEEN RUN ALREADY """
 272+ if not(self._was_run_):
 273+ """ Load the SQL File & Format """
 274+ filename = self._sql_path_+ query_name + '.sql'
 275+ sql_stmnt = Hlp.read_sql(filename)
 276+
 277+ sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign, interval])
 278+
264279 """ Get Indexes into Query """
265280 key_index = QD.get_key_index(query_name)
266 -
267281 metric_index = QD.get_metric_index(query_name, metric_name)
268282 time_index = QD.get_time_index(query_name)
269 - #print sql_stmnt
 283+
270284 """ Compose the data for each separate donor pipeline artifact """
271285 try:
272 - # err_msg = sql_stmnt
273 - self._cur_.execute(sql_stmnt)
 286+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 287+ if not(self._was_run_):
 288+ print >> sys.stdout, 'Running query ...'
 289+
 290+ self._cur_.execute(sql_stmnt)
 291+
 292+ """ GET THE COLUMN NAMES FROM THE QUERY RESULTS """
 293+ self._col_names_ = list()
 294+ for i in self._cur_.description:
 295+ self._col_names_.append(i[0])
 296+
 297+ self._results_ = self._cur_.fetchall()
 298+
 299+ self._was_run_ = True
274300
275 - for i in self._cur_.description:
276 - self._col_names_.append(i[0])
277 -
278 - self._results_ = self._cur_.fetchall()
279301 final_time = dict() # stores the last timestamp seen
280302 interval_obj = datetime.timedelta(minutes=interval) # timedelta object used to shift times by _interval_ minutes
281303
@@ -317,7 +339,7 @@
318340 print inst # __str__ allows args to printed directly
319341
320342 self._db_.rollback()
321 - sys.exit(0)
 343+ # sys.exit(0)
322344
323345
324346 """ Ensure that the last time in the list is the endtime less the interval """
@@ -391,6 +413,7 @@
392414 Based on the column type compile an aggregate (e.g. sum, average)
393415
394416 """
 417+
395418 if col_type == self._data_handler_._COLTYPE_RATE_:
396419
397420 try:
@@ -430,8 +453,16 @@
431454 This differs from the interval reporter in that it compiles results for views and donations only for an entire donation process or pipeline.
432455
433456 """
434 -class CampaignIntervalReportingLoader(IntervalReportingLoader):
 457+class CampaignIntervalReportingLoader(DataLoader):
 458+
 459+
 460+ def __init__(self):
 461+ self._query_type_ = 'campaign'
435462
 463+ self._irl_artifacts_ = IntervalReportingLoader('campaign')
 464+ self._irl_totals_ = IntervalReportingLoader('campaign_total')
 465+
 466+
436467 """
437468 <DESCRIPTION>
438469
@@ -451,23 +482,20 @@
452483 def run_query(self, start_time, end_time, interval, metric_name, campaign):
453484
454485 """ Execute the standard interval reporting query """
455 - ir = IntervalReportingLoader('campaign')
456 - data = ir.run_query(start_time, end_time, interval, metric_name, campaign)
 486+ data = self._irl_artifacts_.run_query(start_time, end_time, interval, metric_name, campaign)
457487 metrics = data[0]
458488 times = data[1]
459 -
 489+
460490 """ Get the totals for campaign views and donations """
461 - ir = IntervalReportingLoader('campaign_total')
462 - data = ir.run_query(start_time, end_time, interval, metric_name, campaign)
 491+ data = self._irl_totals_.run_query(start_time, end_time, interval, metric_name, campaign)
463492 metrics_total = data[0]
464493 times_total = data[1]
465 -
 494+
466495 """ Combine the results for the campaign totals with (banner, landing page, campaign) """
467496 for key in metrics_total.keys():
468497 metrics[key] = metrics_total[key]
469498 times[key] = times_total[key]
470 -
471 -
 499+
472500 return [metrics, times]
473501
474502
@@ -477,6 +505,21 @@
478506 class HypothesisTestLoader(DataLoader):
479507
480508 """
 509+ Constructor
 510+
 511+ _results_ will be a list storing the data for each artifact
 512+ """
 513+ def __init__(self):
 514+
 515+ self._results_ = list()
 516+ self._results_.append(list())
 517+ self._results_.append(list())
 518+
 519+ """ Call constructor of parent """
 520+ DataLoader.__init__(self)
 521+
 522+
 523+ """
481524 Execute data acquisition for hypothesis tester. The idea is that data sampled identically over time for item 1 and item 2 will be generated on which analysis may be carried out.
482525
483526 !! MODIFY/ FIXME -- the sampling is awkward, sampling interval and test interval should be specified explicitly !!
@@ -500,6 +543,8 @@
501544 """
502545 def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
503546
 547+ print >> sys.stdout, 'Using query: ' + query_name
 548+
504549 """
505550 Retrieve time lists with timestamp format 1 (yyyyMMddhhmmss)
506551 This breaks the metrics into evenly sampled intervals
@@ -508,11 +553,14 @@
509554 times = ret[0]
510555 times_indices = ret[1]
511556
512 - self.init_db()
513 -
514 - filename = self._sql_path_ + query_name + '.sql'
515 - sql_stmnt = Hlp.read_sql(filename)
516 -
 557+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 558+ if not(self._was_run_):
 559+
 560+ self.init_db()
 561+
 562+ filename = self._sql_path_ + query_name + '.sql'
 563+ sql_stmnt = Hlp.read_sql(filename)
 564+
517565 metric_index = QD.get_metric_index(query_name, metric_name)
518566 metrics_1 = []
519567 metrics_2 = []
@@ -529,43 +577,56 @@
530578 t1 = times[i]
531579 t2 = times[i+1]
532580
533 - formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
534 - formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
535 -
 581+ if not(self._was_run_):
 582+ formatted_sql_stmnt_1 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_1, campaign])
 583+ formatted_sql_stmnt_2 = QD.format_query(query_name, sql_stmnt, [t1, t2, item_2, campaign])
 584+ # print formatted_sql_stmnt_1
536585 try:
537 - err_msg = formatted_sql_stmnt_1
538586
539 - self._cur_.execute(formatted_sql_stmnt_1)
540 - results_1 = self._cur_.fetchone() # there should only be a single row
 587+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 588+ if not(self._was_run_):
541589
542 - err_msg = formatted_sql_stmnt_2
 590+ print >> sys.stdout, 'Running confidence queries ...'
 591+ err_msg = formatted_sql_stmnt_1
 592+
 593+ self._cur_.execute(formatted_sql_stmnt_1)
 594+ results_1 = self._cur_.fetchone() # there should only be a single row
 595+ self._results_[0].append(results_1)
 596+
 597+ err_msg = formatted_sql_stmnt_2
 598+
 599+ self._cur_.execute(formatted_sql_stmnt_2)
 600+ results_2 = self._cur_.fetchone() # there should only be a single row
 601+ self._results_[1].append(results_2)
543602
544 - self._cur_.execute(formatted_sql_stmnt_2)
545 - results_2 = self._cur_.fetchone() # there should only be a single row
546 -
547603 except Exception as inst:
548604 print type(inst) # the exception instance
549605 print inst.args # arguments stored in .args
550606 print inst # __str__ allows args to printed directly
551607
552608 self._db_.rollback()
553 - sys.exit("Database Interface Exception:\n" + err_msg)
 609+ #sys.exit("Database Interface Exception:\n" + err_msg)
554610
555611 """ If no results are returned in this set the sample value is 0.0
556612 !! MODIFY -- these results should not count as data points !! """
557613 try:
558 - metrics_1.append(results_1[metric_index])
 614+ metrics_1.append(self._results_[0][i][metric_index])
559615 except TypeError:
560616 metrics_1.append(0.0)
561617 try:
562 - metrics_2.append(results_2[metric_index])
 618+ metrics_2.append(self._results_[1][i][metric_index])
563619 except TypeError:
564620 metrics_2.append(0.0)
565621
 622+ #print self._results_
566623 #print metrics_1
567624 #print metrics_2
568625
569 - self.close_db()
 626+ """ ONLY EXECUTE THE QUERY IF IT HASN'T BEEN BEFORE """
 627+ if not(self._was_run_):
 628+
 629+ self.close_db()
 630+ self._was_run_ = True
570631
571632 # return the metric values at each time
572633 return [metrics_1, metrics_2, times_indices]
@@ -590,6 +651,9 @@
591652
592653 self._query_type_ = query_type
593654
 655+ """ Call constructor of parent """
 656+ DataLoader.__init__(self)
 657+
594658 """
595659 !! MODIFY / FIXME -- use python reflection !! ... maybe
596660
@@ -763,7 +827,7 @@
764828
765829 """
766830 class TTestLoaderHelp(TableLoader):
767 -
 831+
768832 """
769833 This method knows about faulkner.t_test. This is a lookup table for p-values
770834 given the degrees of freedom and statistic t test
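
The same guard motivates the CampaignIntervalReportingLoader refactor above: rather than constructing a fresh IntervalReportingLoader on each call, the class now holds two loader instances ('campaign' and 'campaign_total') as members, so their _was_run_ state survives across calls. A hypothetical usage sketch follows; the timestamps (yyyyMMddhhmmss, as the docstrings describe), metric name, and campaign are placeholder values.

loader = CampaignIntervalReportingLoader()

# First call executes both the 'campaign' and 'campaign_total' queries
metrics, times = loader.run_query('20110628000000', '20110628120000',
                                  15, 'impressions', 'test_campaign')

# Second call returns the rows fetched above without re-executing any SQL,
# since each member loader's _was_run_ flag is now True
metrics2, times2 = loader.run_query('20110628000000', '20110628120000',
                                    15, 'impressions', 'test_campaign')

Because the guard checks only _was_run_ and never the call arguments, a query with different parameters needs a fresh loader object; reusing an old one silently returns the earlier data.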
