r90486 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r90485‎ | r90486 | r90487 >
Date:21:45, 20 June 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Added documentation
removed BannerLPReportingLoader class as it is obsolete
Modified paths:
  • /trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py (modified) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/classes/DataLoader.py
@@ -33,15 +33,19 @@
3434
3535 BASE CLASS :: DataLoader
3636
37 - This is the base class for data handling functionality of metrics.
 37+ This is the base class for data handling functionality of metrics. Inherited classes will be associated with a set of SQL files that loosely define a set of related query types.
3838
 39+ Handles database connections. Stores state values based on query results and also meta data for the queries themselves which can be resolved to full statements by interacting with QueryData.
 40+
 41+ Data is stored according to a key name on which sets are separated. Subclasses can further specify how the data is split and related.
 42+
3943 METHODS:
4044 init_db
4145 close_db
42 - compose_key
43 - include_keys
44 - exclude_keys
45 - get_sql_filename_for_query
 46+ compose_key - build a new key in the results based on existing one
 47+ include_keys - include only certain keys based on key name
 48+ exclude_keys - explicitly remove certain types of keys based on key name
 49+ get_sql_filename_for_query - resolve a sql filename based on the simpler query type
4650
4751
4852 """
@@ -172,11 +176,8 @@
173177 """
174178 Return a specific query name given a query type
175179
176 - INPUT:
177 - query_type -
178 -
179180 RETURN:
180 - query_name -
 181+ query_name - name of the sql file
181182
182183 """
183184 def get_sql_filename_for_query(self):
@@ -184,13 +185,17 @@
185186 try:
186187 return self._query_names_[self._query_type_]
187188 except KeyError:
188 - print >> sys.stderr, 'Could not find a query for type: ' + query_type
 189+ print >> sys.stderr, 'Could not find a query for type: ' + self._query_type_
189190 sys.exit(2)
190191
191192
192193
193 -
 194+"""
 195+
 196+ This Loader inherits the functionality of DataLoader and handles SQL queries that group data by time intervals. These are generally preferable for most
 197+ of the time dependent data analysis and also provides functionality that enables the raw results to be combined over all keys
194198
 199+"""
195200 class IntervalReportingLoader(DataLoader):
196201
197202 _summary_data_ = None
@@ -201,11 +206,15 @@
202207 self._query_names_['campaign'] = 'report_campaign_metrics_minutely'
203208 self._query_names_['campaign_total'] = 'report_campaign_metrics_minutely_total'
204209
 210+ self._query_names_[FDH._QTYPE_BANNER_ + FDH._QTYPE_TIME_] = 'report_banner_metrics_minutely_all'
 211+ self._query_names_[FDH._QTYPE_LP_ + FDH._QTYPE_TIME_] = 'report_lp_metrics_minutely_all'
 212+ self._query_names_[FDH._QTYPE_CAMPAIGN_ + FDH._QTYPE_TIME_] = 'report_campaign_metrics_minutely_all'
 213+
205214 self._query_type_ = query_type
206215
207216 """ hardcode the data handler for now """
208217 self._data_handler_ = FDH
209 -
 218+
210219 """
211220 Executes the query which generates interval metrics and sets _results_ and _col_names_
212221
@@ -256,7 +265,7 @@
257266
258267 metric_index = QD.get_metric_index(query_name, metric_name)
259268 time_index = QD.get_time_index(query_name)
260 -
 269+ #print sql_stmnt
261270 """ Compose the data for each separate donor pipeline artifact """
262271 try:
263272 # err_msg = sql_stmnt
@@ -337,14 +346,9 @@
338347
339348
340349 """
341 - Post process raw data from query. Combines data rows according to column type definitions. This must be run *after*
 350+ Post process raw data from query. Combines data rows according to column type definitions. This must be run *after* run_query.
342351
343 - INPUT:
344 - data - a list of rows
345 - data_handler - the data handler module
346 - query_name -
347 - RETURN:
348 - the dictionary of combined rows (note that there must be a key column)
 352+ This allows aggregates of query data to be performed after the actual processing of the query.
349353 """
350354 def combine_rows(self):
351355
@@ -380,7 +384,13 @@
381385 """ Change null values to 0 """
382386 if field == None or field == 'NULL':
383387 field = 0.0
384 -
 388+
 389+ """
 390+ COMBINE THE DATA FOR EACH KEY
 391+
 392+ Based on the column type compile an aggregate (e.g. sum, average)
 393+
 394+ """
385395 if col_type == self._data_handler_._COLTYPE_RATE_:
386396
387397 try:
@@ -395,11 +405,14 @@
396406 except KeyError as e:
397407 data_dict[key][self._col_names_[i]] = float(field)
398408
399 - """ !! MODIFY --- this could cause issues in the case of missing data """
 409+ """ !! MODIFY / FIXME --- this could cause issues in the case of missing data """
400410 num_rows = len(self._results_) / len(data_dict.keys())
401411
402 - """ POST PROCESSING
403 - Normalize rate columns """
 412+ """
 413+ POST PROCESSING
 414+
 415+ Normalize rate columns
 416+ """
404417 for i in range(len(col_types)):
405418 if col_types[i] == self._data_handler_._COLTYPE_RATE_:
406419 for key in data_dict.keys():
@@ -411,6 +424,10 @@
412425
413426 """
414427
 428+ This class inherits the IntervalReportingLoader functionality but utilizes the campaign DataLoader instead. Also the results generated incorporate campaign totals --
 429+ the result fully specifies each campaign item (campaign - banner - landing page) and the number of views resulting.
 430+
 431+ This differs from the interval reporter in that it compiles results for views and donations only for an entire donation process or pipeline.
415432
416433 """
417434 class CampaignIntervalReportingLoader(IntervalReportingLoader):
@@ -453,181 +470,40 @@
454471
455472 return [metrics, times]
456473
457 -
458 -"""
459 -
460 - CLASS :: BannerLPReportingLoader
461474
462 - This dataloader handles reporting on banners and landing pages.
463 -
464 - METHODS:
465 - run_query
466 -
467 -
468475 """
469 -class BannerLPReportingLoader(DataLoader):
470 -
471 - def __init__(self, query_type):
472 - self._query_names_['LP'] = 'report_LP_metrics'
473 - self._query_names_['BAN'] = 'report_banner_metrics'
474 - self._query_names_['BAN-TEST'] = 'report_banner_metrics'
475 - self._query_names_['LP-TEST'] = 'report_LP_metrics'
476 -
477 - self._query_type_ = query_type
478 -
479 - """
480 - <description>
481 -
482 - INPUT:
483 -
484 - RETURN:
485 -
486 - """
487 - def run_query(self,start_time, end_time, campaign, metric_name):
488 -
489 - self.init_db()
490 -
491 - metric_lists = Hlp.AutoVivification()
492 - time_lists = Hlp.AutoVivification()
493 - # table_data = [] # store the results in a table for reporting
494 -
495 - # Load the SQL File & Format
496 - filename = self._sql_path_ + query_name + '.sql'
497 - sql_stmnt = Hlp.read_sql(filename)
498 -
499 - sql_stmnt = QD.format_query(query_name, sql_stmnt, [start_time, end_time, campaign])
500 -
501 - key_index = QD.get_key_index(query_name)
502 - time_index = QD.get_time_index(query_name)
503 - metric_index = QD.get_metric_index(query_name, metric_name)
504 -
505 - """ Composes the data for each banner or LP """
506 - try:
507 - err_msg = sql_stmnt
508 - self.cur.execute(sql_stmnt)
509 -
510 - results = self.cur.fetchall()
511 -
512 - for row in results:
513 -
514 - key_name = row[key_index]
515 -
516 - try:
517 - metric_lists[key_name].append(row[metric_index])
518 - time_lists[key_name].append(row[time_index])
519 - except:
520 - metric_lists[key_name] = list()
521 - time_lists[key_name] = list()
522 -
523 - metric_lists[key_name].append(row[metric_index])
524 - time_lists[key_name].append(row[time_index])
525 -
526 - except:
527 - self.db.rollback()
528 - sys.exit("Database Interface Exception:\n" + err_msg)
529 -
530 - """ Convert Times to Integers """
531 - max_i = 0
532 -
533 - for key in time_lists.keys():
534 - for date_str in time_lists[key]:
535 - day_int = int(date_str[8:10])
536 - hr_int = int(date_str[11:13])
537 - date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
538 - if date_int > max_i:
539 - max_i = date_int
540 - max_day = day_int
541 - max_hr = hr_int
542 -
543 -
544 - # Normalize dates
545 - time_norm = Hlp.AutoVivification()
546 - for key in time_lists.keys():
547 - for date_str in time_lists[key]:
548 - day = int(date_str[8:10])
549 - hr = int(date_str[11:13])
550 - # date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
551 - elem = (day - max_day) * 24 + (hr - max_hr)
552 - try:
553 - time_norm[key].append(elem)
554 - except:
555 - time_norm[key] = list()
556 - time_norm[key].append(elem)
557 -
558 -
559 - self.close_db()
560 -
561 - return [metric_lists, time_norm]
562 -
563 -
564 - """
565 -
566 - <description>
567 -
568 - INPUT:
569 -
570 - RETURN:
571 -
572 - """
573 - def get_latest_campaign(self):
574 -
575 - query_name = 'report_latest_campaign'
576 - self.init_db()
577 -
578 - """ Look at campaigns over the past 24 hours - TS format=1, TS resolution=1 """
579 - now = datetime.datetime.now()
580 - hours_back = 72
581 - times = self.gen_date_strings(now, hours_back,1,1)
582 -
583 - sql_stmnt = Hlp.read_sql('./sql/report_latest_campaign.sql')
584 - sql_stmnt = QD.format_query(query_name, sql_stmnt, [times[0]])
585 -
586 - campaign_index = QD.get_campaign_index(query_name)
587 - time_index = QD.get_time_index(query_name)
588 -
589 - try:
590 - err_msg = sql_stmnt
591 - self.cur.execute(sql_stmnt)
592 -
593 - row = self.cur.fetchone()
594 - except:
595 - self.db.rollback()
596 - sys.exit("Database Interface Exception:\n" + err_msg)
597 -
598 - campaign = row[campaign_index]
599 - timestamp = row[time_index]
600 -
601 - self.close_db()
602 -
603 - return [campaign, timestamp]
604 -
605 -
606 -
 476+ This class is concerned with preparing the data for a hypothesis test and is consumed by classes which perform this analysis in HypothesisTest.py
 477+"""
607478 class HypothesisTestLoader(DataLoader):
608479
609480 """
610 - Execute data acquisition for hypothesis tester
 481+ Execute data acquisition for hypothesis tester. The idea is that data sampled identically over time for item 1 and item 2 will be generated on which analysis may be carried out.
611482
 483+ !! MODIFY/ FIXME -- the sampling is awkward, sampling interval and test interval should be specified explicitly !!
 484+
612485 INPUT:
613 - query_name -
614 - metric_name -
 486+ query_name - non-formatted sql filename !!MODIFY / FIXME -- this should be a type instead !!
 487+ metric_name - metric to be extracted from the data
615488 campaign -
616 - item_1 -
617 - item_2 -
 489+ item_1 - artifact or key name
 490+ item_2 - artifact or key name
618491 start_time -
619492 end_time -
620 - interval -
621 - num_samples -
 493+ interval - test interval; this groups a set of samples together to perform a paired t-test
 494+ num_samples - samples per test interval (sampling interval = interval / num_samples)
622495
623496 RETURN:
624 - metrics_1 -
625 - metrics_2 -
626 - times_indices -
 497+ metrics_1 - the metrics for item 1
 498+ metrics_2 - the metrics for item 2
 499+ times_indices - the sampling intervals
627500
628501 """
629502 def run_query(self, query_name, metric_name, campaign, item_1, item_2, start_time, end_time, interval, num_samples):
630503
631 - """ retrieve time lists with timestamp format 1 (yyyyMMddhhmmss) """
 504+ """
 505+ Retrieve time lists with timestamp format 1 (yyyyMMddhhmmss)
 506+ This breaks the metrics into evenly sampled intervals
 507+ """
632508 ret = TP.get_time_lists(start_time, end_time, interval, num_samples, 1)
633509 times = ret[0]
634510 times_indices = ret[1]
@@ -641,6 +517,12 @@
642518 metrics_1 = []
643519 metrics_2 = []
644520
 521+ """
 522+ EXECUTE THE QUERIES FOR EACH INTERVAL
 523+
 524+ Generates metrics for each artifact, sampled in the same way
 525+
 526+ """
645527 for i in range(len(times) - 1):
646528
647529 # print '\nExecuting number ' + str(i) + ' batch of of data.'
@@ -693,12 +575,10 @@
694576
695577 CLASS :: CampaignReportingLoader
696578
697 - This dataloader handles reporting on utm_campaigns.
698 -
699 - METHODS:
700 - run_query
701 -
702 -
 579+ This inherits from the DataLoader base class and handles reporting on utm_campaigns. This reporter handles returning lists of banners and lps from a given campaign and also handles
 580+ reporting donation totals across all campaigns. While this interface is concerned with campaign reporting it does not associate results with time intervals within the start and
 581+ end times
 582+
703583 """
704584 class CampaignReportingLoader(DataLoader):
705585
@@ -711,7 +591,7 @@
712592 self._query_type_ = query_type
713593
714594 """
715 - !! MODIFY -- use python reflection !! ... maybe
 595+ !! MODIFY / FIXME -- use python reflection !! ... maybe
716596
717597 This method is retrieving campaign names
718598
@@ -737,7 +617,7 @@
738618
739619 Handle queries from "report_campaign_totals"
740620
741 - Gets metric totals for campaigns
 621+ Gets metric totals across campaigns
742622
743623 """
744624 def query_totals(self, params):
@@ -790,7 +670,7 @@
791671
792672 Handle queries from "report_campaign_banners" and "report_campaign_lps"
793673
794 - Gets a list of banners and landing pages running on the campaign in a time frame
 674+ Gets a list of banners and landing pages running for a given campaign over a given time frame
795675
796676 """
797677 def query_artifacts(self, params):
@@ -840,11 +720,14 @@
841721
842722 CLASS :: TableLoader
843723
844 - Provides data access particular to the t-test
 724+ Base class for providing MySQL table access. Inherits DataLoader.
845725
846726 METHODS:
847 - init_db -
848 - close_db -
 727+ record_exists - return a boolean value reflecting whether a record exists in the table
 728+ insert_row - try to insert a new record into the table
 729+ delete_row - try to delete a record from the table
 730+ update - try to modify a record in the table
 731+
849732 """
850733 class TableLoader(DataLoader):
851734
@@ -867,6 +750,8 @@
868751
869752 Provides data access particular to the t-test
870753
 754+ storage3.pmtpa.wmnet.faulkner.ttest:
 755+
871756 +--------------------+--------------+------+-----+---------+-------+
872757 | Field | Type | Null | Key | Default | Extra |
873758 +--------------------+--------------+------+-----+---------+-------+
@@ -884,11 +769,11 @@
885770 given the degrees of freedom and statistic t test
886771
887772 INPUT:
888 - degrees_of_freedom -
889 - t -
 773+ degrees_of_freedom - computed degrees of freedom of a dataset modeled on a student's t distribution
 774+ t - test statistic; random variable whose value is to be measured
890775
891776 RETURN:
892 - p -
 777+ p - the highest p value based on the input
893778
894779 """
895780 def get_pValue(self, degrees_of_freedom, t):
@@ -1099,23 +984,31 @@
1100985 """
1101986 def get_test_field(self, row, key):
1102987
1103 - if key == 'test_name':
1104 - return row[0]
1105 - elif key == 'test_type':
1106 - return row[1]
1107 - elif key == 'utm_campaign':
1108 - return row[2]
1109 - elif key == 'start_time':
1110 - return row[3].__str__()
1111 - elif key == 'end_time':
1112 - return row[4].__str__()
1113 - elif key == 'winner':
1114 - return row[5]
1115 - elif key == 'is_conclusive':
1116 - return row[6]
1117 - elif key == 'html_report':
1118 - return row[7]
 988+ try:
 989+ if key == 'test_name':
 990+ return row[0]
 991+ elif key == 'test_type':
 992+ return row[1]
 993+ elif key == 'utm_campaign':
 994+ return row[2]
 995+ elif key == 'start_time':
 996+ return row[3].__str__()
 997+ elif key == 'end_time':
 998+ return row[4].__str__()
 999+ elif key == 'winner':
 1000+ return row[5]
 1001+ elif key == 'is_conclusive':
 1002+ return row[6]
 1003+ elif key == 'html_report':
 1004+ return row[7]
11191005
 1006+ except Exception as inst:
 1007+
 1008+ print type(inst) # the exception instance
 1009+ print inst.args # arguments stored in .args
 1010+ print inst # __str__ allows args to printed directly
 1011+
 1012+ return ''
11201013
11211014
11221015 def delete_row(self):

Status & tagging log