r79202 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r79201‎ | r79202 | r79203 >
Date:21:07, 29 December 2010
Author:rfaulk
Status:deferred
Tags:
Comment:
Added reporting on mining impressions and views over last 24hrs.
Modified paths:
  • /trunk/fundraiser-statistics/bash/plot_build_hr.sh (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/query_store.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/run_plots.py (modified) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_impressions_by_hour.sql (added) (history)
  • /trunk/fundraiser-statistics/fundraiser-scripts/sql/report_lp_views_by_hour.sql (added) (history)
  • /trunk/fundraiser-statistics/reporting/banner_test.sql (modified) (history)
  • /trunk/fundraiser-statistics/reporting/compare_test.php (modified) (history)
  • /trunk/fundraiser-statistics/reporting/fundraiser_analytics.html (modified) (history)
  • /trunk/fundraiser-statistics/reporting/reporting_mining.html (added) (history)

Diff [purge]

Index: trunk/fundraiser-statistics/fundraiser-scripts/fundraiser_reporting.py
@@ -57,8 +57,69 @@
5858 def close_db(self):
5959 self.cur.close()
6060 self.db.close()
 61+
 62+
 63+ """
 64+
 65+ Takes as input and converts it to a set of hours counting back from 0
6166
 67+ time_lists - a list of timestamp lists
 68+ time_norm - a dictionary of normalized times
 69+
6270 """
 71+ def normalize_timestamps(self, time_lists):
 72+
 73+ # Convert lists into dictionaries before processing
 74+ # it is assumed that lists are composed of only simple types
 75+ isList = 0
 76+ if type(time_lists) is list:
 77+ isList = 1
 78+
 79+ old_list = time_lists
 80+ time_lists = mh.AutoVivification()
 81+
 82+ key = 'key'
 83+ time_lists[key] = list()
 84+
 85+ for i in range(len(old_list)):
 86+ time_lists[key].append(old_list[i])
 87+
 88+
 89+ # Find the earliest date
 90+ max_i = 0
 91+
 92+ for key in time_lists.keys():
 93+ for date_str in time_lists[key]:
 94+ day_int = int(date_str[8:10])
 95+ hr_int = int(date_str[11:13])
 96+ date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
 97+ if date_int > max_i:
 98+ max_i = date_int
 99+ max_day = day_int
 100+ max_hr = hr_int
 101+
 102+
 103+ # Normalize dates
 104+ time_norm = mh.AutoVivification()
 105+ for key in time_lists.keys():
 106+ for date_str in time_lists[key]:
 107+ day = int(date_str[8:10])
 108+ hr = int(date_str[11:13])
 109+ # date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
 110+ elem = (day - max_day) * 24 + (hr - max_hr)
 111+ try:
 112+ time_norm[key].append(elem)
 113+ except:
 114+ time_norm[key] = list()
 115+ time_norm[key].append(elem)
 116+
 117+ # If the original argument was a list put it back in that form
 118+ if isList:
 119+ time_norm = time_norm[key]
 120+
 121+ return time_norm
 122+
 123+ """
63124
64125 input - python datetime object
65126 returns - formatted datetime strings
@@ -173,27 +234,53 @@
174235
175236 return new_values
176237
177 - # workaround for issue with tuple objects in HTML.py
178 - # MySQLdb returns unfamiliar tuple elements from its fetchall method
179 - # this is probably a version problem since the issue popped up in 2.5 but not 2.6
 238+ """
 239+
 240+ workaround for issue with tuple objects in HTML.py
 241+ MySQLdb returns unfamiliar tuple elements from its fetchall method
 242+ this is probably a version problem since the issue popped up in 2.5 but not 2.6
 243+
 244+ """
180245 def listify(row):
181246 l = []
182247 for i in row:
183248 l.append(i)
184249 return l
 250+
 251+
 252+ """
185253
 254+ To be overloaded by subclasses for specific types of queries
 255+
 256+ """
186257 def run_query(self, start_time, end_time, query_name, metric_name):
187258 return
 259+
188260
 261+ """
 262+
 263+ To be overloaded by subclasses for different plotting behaviour
 264+
 265+ """
189266 def gen_plot(self,x, y_lists, labels, title, xlabel, ylabel, subplot_index, fname):
190267 return
 268+
 269+ """
191270
192 - def publish_google_sheet(self):
193 - return
194 -
 271+ To be overloaded by subclasses for writing tables - this functionality currently exists outside of this class structure (test_reporting.py)
 272+
 273+ """
195274 def write_to_html_table(self):
196275 return
197276
 277+
 278+
 279+ """
 280+
 281+ The access point of FundraiserReporting and derived objects. Will be used for executing and orchestrating the creation of plots, tables etc.
 282+ To be overloaded by subclasses
 283+
 284+ """
198285 def run(self):
199286 return
200287
@@ -421,6 +508,7 @@
422509
423510 return [labels_temp, counts_temp, title, ylabel]
424511
 512+
425513 """
426514
427515 CLASS :: ^BannerLPReporting^
@@ -684,9 +772,163 @@
685773
686774 return [campaign, timestamp]
687775
 776+
 777+
688778
 779+ """
 780+
 781+ Takes as input and converts it to a set of hours counting back from 0
 782+
 783+ time_lists - a dictionary of timestamp lists
 784+ time_norm - a dictionary of normalized times
 785+
 786+ """
 787+ def normalize_timestamps(self, time_lists):
 788+ # Find the earliest date
 789+ max_i = 0
 790+
 791+ for key in time_lists.keys():
 792+ for date_str in time_lists[key]:
 793+ day_int = int(date_str[8:10])
 794+ hr_int = int(date_str[11:13])
 795+ date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
 796+ if date_int > max_i:
 797+ max_i = date_int
 798+ max_day = day_int
 799+ max_hr = hr_int
 800+
 801+
 802+ # Normalize dates
 803+ time_norm = mh.AutoVivification()
 804+ for key in time_lists.keys():
 805+ for date_str in time_lists[key]:
 806+ day = int(date_str[8:10])
 807+ hr = int(date_str[11:13])
 808+ # date_int = int(date_str[0:4]+date_str[5:7]+date_str[8:10]+date_str[11:13])
 809+ elem = (day - max_day) * 24 + (hr - max_hr)
 810+ try:
 811+ time_norm[key].append(elem)
 812+ except:
 813+ time_norm[key] = list()
 814+ time_norm[key].append(elem)
 815+
 816+ return time_norm
 817+
689818 """
690819
 820+CLASS :: ^MinerReporting^
 821+
 822+This subclass handles reporting on raw values imported into the database.
 823+
 824+"""
 825+
 826+class MinerReporting(FundraiserReporting):
 827+
 828+ def run_query(self, start_time, end_time, query_name):
 829+
 830+ self.init_db()
 831+
 832+ query_obj = qs.query_store()
 833+
 834+ counts = list()
 835+ times = list()
 836+
 837+ # Load the SQL File & Format
 838+ filename = './sql/' + query_name + '.sql'
 839+ sql_stmnt = mh.read_sql(filename)
 840+
 841+ sql_stmnt = query_obj.format_query(query_name, sql_stmnt, [start_time, end_time])
 842+ #print sql_stmnt
 843+
 844+ # Get Indexes into Query
 845+ count_index = query_obj.get_count_index(query_name)
 846+ time_index = query_obj.get_time_index(query_name)
 847+
 848+ # Composes the data for each banner
 849+ try:
 850+ err_msg = sql_stmnt
 851+ self.cur.execute(sql_stmnt)
 852+
 853+ results = self.cur.fetchall()
 854+
 855+ for row in results:
 856+ counts.append(row[count_index])
 857+ times.append(row[time_index])
 858+
 859+ except:
 860+ self.db.rollback()
 861+ sys.exit("Database Interface Exception:\n" + err_msg)
 862+
 863+ """ Convert Times to Integers """
 864+ time_norm = self.normalize_timestamps(times)
 865+
 866+
 867+ self.close_db()
 868+
 869+ return [counts, time_norm]
 870+
 871+
 872+ # Create histograms for hourly counts
 873+ def gen_plot(self,counts, times, title, xlabel, ylabel, ranges, subplot_index, fname):
 874+
 875+ pylab.subplot(subplot_index)
 876+ pylab.figure(num=None,figsize=[26,14])
 877+
 878+ # pylab.plot(times, counts)
 879+ # pylab.hist(counts, times)
 880+ pylab.bar(times, counts, width=0.5)
 881+
 882+ pylab.grid()
 883+ pylab.xlim(ranges[0], ranges[1])
 884+
 885+ pylab.xlabel(xlabel)
 886+ pylab.ylabel(ylabel)
 887+
 888+ pylab.title(title)
 889+ pylab.savefig(fname, format='png')
 890+
 891+
 892+
 893+ def run(self, query_name):
 894+
 895+ query_obj = qs.query_store()
 896+
 897+ # Current date & time
 898+ now = datetime.datetime.now()
 899+ #UTC = 8
 900+ #delta = datetime.timedelta(hours=UTC)
 901+ #now = now + delta
 902+
 903+ # ESTABLISH THE START TIME TO PULL ANALYTICS
 904+ hours_back = 24
 905+ times = self.gen_date_strings_hr(now, hours_back)
 906+
 907+ start_time = times[0]
 908+ end_time = times[1]
 909+
 910+ # Run Query
 911+ return_val = self.run_query(start_time, end_time, query_name)
 912+ counts = return_val[0]
 913+ times = return_val[1]
 914+
 915+ # Normalize times
 916+ min_time = min(times)
 917+ ranges = [min_time, 0]
 918+
 919+ xlabel = 'Hours'
 920+ subplot_index = 111
 921+ fname = query_name + '.png'
 922+
 923+ title = query_obj.get_plot_title(query_name)
 924+ ylabel = query_obj.get_plot_ylabel(query_name)
 925+
 926+ # Generate Histogram
 927+ self.gen_plot(counts, times, title, xlabel, ylabel, ranges, subplot_index, fname)
 928+
 929+
 930+"""
 931+
 932+INCOMPLETE -
691933 CLASS :: ^ConfidenceReporting^
692934
693935 To be called primarily for reporting
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_banner_impressions_by_hour.sql
@@ -0,0 +1,6 @@
 2+select
 3+DATE_FORMAT(on_minute, '%sY-%sm-%sd %sH') as stamp,
 4+sum(counts) as visits
 5+from impression
 6+where on_minute >= '%s' and on_minute < '%s'
 7+group by 1;
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/sql/report_lp_views_by_hour.sql
@@ -0,0 +1,6 @@
 2+select
 3+DATE_FORMAT(request_time, '%sY-%sm-%sd %sH') as stamp,
 4+count(*) as visits
 5+from landing_page
 6+where request_time >= '%s' and request_time < '%s'
 7+group by 1;
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/query_store.py
@@ -85,9 +85,20 @@
8686 cmpgn = args[2]
8787 banner = args[3]
8888 sql_stmnt = sql_stmnt % ('%','%','%','%','10','10', start, end, cmpgn, banner, \
89 - '%','%','%','%','10','10', start, end, cmpgn, banner)
 89+ '%','%','%','%','10','10', start, end, cmpgn, banner)
 90+
 91+ elif query_name == 'report_banner_impressions_by_hour':
 92+ start = args[0]
 93+ end = args[1]
 94+ sql_stmnt = sql_stmnt % ('%','%','%','%', start, end)
 95+
 96+ elif query_name == 'report_lp_views_by_hour':
 97+ start = args[0]
 98+ end = args[1]
 99+ sql_stmnt = sql_stmnt % ('%','%','%','%', start, end)
 100+
90101 else:
91 - print 'no such table\n'
 102+ return 'no such table\n'
92103
93104 return sql_stmnt
94105
@@ -97,7 +108,7 @@
98109 elif query_name == '':
99110 return ''
100111 else:
101 - 'no such table'
 112+ return 'no such table'
102113
103114 def get_query_header(self, query_name):
104115 if query_name == 'report_contribution_tracking':
@@ -105,8 +116,16 @@
106117 elif query_name == '':
107118 return ''
108119 else:
109 - 'no such table'
 120+ return 'no such table'
110121
 122+ def get_count_index(self, query_name):
 123+ if query_name == 'report_lp_views_by_hour':
 124+ return 1
 125+ elif query_name == 'report_banner_impressions_by_hour':
 126+ return 1
 127+ else:
 128+ return -1
 129+
111130 def get_time_index(self, query_name):
112131 if query_name == 'report_campaign_logs_by_min':
113132 return 0
@@ -120,8 +139,12 @@
121140 return 0
122141 elif query_name == 'report_latest_campaign':
123142 return 1
 143+ elif query_name == 'report_banner_impressions_by_hour':
 144+ return 0
 145+ elif query_name == 'report_lp_views_by_hour':
 146+ return 0
124147 else:
125 - 'no such table'
 148+ return -1
126149
127150 def get_campaign_index(self, query_name):
128151 if query_name == 'report_campaign_logs_by_min':
@@ -135,7 +158,7 @@
136159 elif query_name == 'report_latest_campaign':
137160 return 0
138161 else:
139 - 'no such table'
 162+ return -1
140163
141164 def get_banner_index(self, query_name):
142165 if query_name == 'report_campaign_logs_by_min':
@@ -147,7 +170,7 @@
148171 elif query_name == 'report_bannerLP_metrics':
149172 return 1
150173 else:
151 - 'no such table'
 174+ return -1
152175
153176 def get_landing_page_index(self, query_name):
154177 if query_name == 'report_campaign_logs_by_min':
@@ -161,7 +184,7 @@
162185 elif query_name == 'report_bannerLP_metrics':
163186 return 1
164187 else:
165 - 'no such table'
 188+ return -1
166189
167190 def get_metric_index(self, query_name, metric_name):
168191 if query_name == 'report_campaign_logs_by_min':
@@ -213,4 +236,20 @@
214237 else:
215238 return -1
216239 else:
217 - 'no such table'
 240+ return 'no such table'
 241+
 242+ def get_plot_title(self, query_name):
 243+ if query_name == 'report_banner_impressions_by_hour':
 244+ return 'Banner Impressions Over the Past 24 Hours'
 245+ elif query_name == 'report_lp_views_by_hour':
 246+ return 'Landing Page Views Over the Past 24 Hours'
 247+ else:
 248+ return 'no such table'
 249+
 250+ def get_plot_ylabel(self, query_name):
 251+ if query_name == 'report_banner_impressions_by_hour':
 252+ return 'IMPRESSIONS'
 253+ elif query_name == 'report_lp_views_by_hour':
 254+ return 'VIEWS'
 255+ else:
 256+ return'no such table'
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/miner_help.py
@@ -164,4 +164,19 @@
165165 diff = t2 - t1
166166 diff = float(diff.seconds) / 3600
167167
168 - return diff
\ No newline at end of file
 168+ return diff
 169+
 170+""" Converts a list to a dictionary or vice versa -- INCOMPLETE MAY BE USEFUL AT SOME FUTURE POINT """
 171+def convert_list_dict(collection):
 172+
 173+ if type(collection) is dict:
 174+ new_collection = list()
 175+
 176+ elif type(collection) is list:
 177+ new_collection = dict()
 178+
 179+ else:
 180+ print "miner_help::convert_list_dict: Invalid type, must be a list or a dictionary."
 181+ return 0;
 182+
 183+ return new_collection
\ No newline at end of file
Index: trunk/fundraiser-statistics/fundraiser-scripts/run_plots.py
@@ -17,6 +17,7 @@
1818
1919 tar = fa.TotalAmountsReporting()
2020 blpr = fa.BannerLPReporting()
 21+mr = fa.MinerReporting()
2122
2223
2324 # Run the total amount plots
@@ -34,4 +35,8 @@
3536 # Run the banner/lp plots
3637 blpr.run('LP-TEST', 'don_per_view')
3738 blpr.run('BAN-TEST', 'don_per_imp')
38 -blpr.run('BAN-TEST', 'click_rate')
\ No newline at end of file
 39+blpr.run('BAN-TEST', 'click_rate')
 40+
 41+# Run the impression / view counts
 42+mr.run('report_banner_impressions_by_hour')
 43+mr.run('report_lp_views_by_hour')
\ No newline at end of file
Index: trunk/fundraiser-statistics/reporting/banner_test.sql
@@ -83,7 +83,7 @@
8484
8585 on ecomm.banner = lp.utm_source and if(ecomm.landing_page like 'WMFJAcontrol%s','WMFJAcontrol', ecomm.landing_page) = lp.landing_page and ecomm.utm_campaign = lp.utm_campaign
8686
87 -where floor(impressions * views / views_banner) > 50000
 87+where floor(impressions * views / views_banner) > 10000
8888
8989 group by 1,2,3 order by 12 desc;
9090
Index: trunk/fundraiser-statistics/reporting/compare_test.php
@@ -269,10 +269,10 @@
270270
271271 $script_args = $type . ' ' . $cmpgn1 . ' ' . $cmpgn2 . ' ' . $item1 . ' ' . $item2 . ' ' . $start . ' ' . $end . ' ' . $metric_index;
272272 $cmd_output = ' 1>./plotrun_out.txt';
273 -echo 'python ' . $home_path . 'run_confidence_plot.py '. $script_args . $cmd_output;
 273+// echo 'python ' . $home_path . 'run_confidence_plot.py '. $script_args . $cmd_output;
274274 $retval1 = system('python ' . $home_path . 'run_confidence_plot.py '. $script_args . $cmd_output, $retval2);
275275
276 -echo '<br>' .$retval1. '<br>';
277 -echo $retval2;
 276+// echo '<br>' .$retval1. '<br>';
 277+// echo $retval2;
278278
279279 ?>
\ No newline at end of file
Index: trunk/fundraiser-statistics/reporting/fundraiser_analytics.html
@@ -37,6 +37,7 @@
3838 <h3><a href="http://fundraising.wikimedia.org/stats/reporting_totals.html">Reporting - Totals</a></h3>
3939 <h3><a href="http://fundraising.wikimedia.org/stats/reporting_latest.html">Reporting - Latest Test</a></h3>
4040 <h3><a href="http://fundraising.wikimedia.org/stats/reporting_all.html">Reporting - All Banners / LPs</a></h3>
 41+<h3><a href="http://fundraising.wikimedia.org/stats/reporting_all.html">Reporting - Mining Counts</a></h3>
4142
4243 <br>
4344
Index: trunk/fundraiser-statistics/reporting/reporting_mining.html
@@ -0,0 +1,49 @@
 2+
 3+
 4+<!-- Add style sheet -->
 5+
 6+<html>
 7+
 8+<head>
 9+
 10+<title>Wikimedia Fundraiser Reporting</title>
 11+<!--
 12+<script type="text/javascript">
 13+
 14+ var _gaq = _gaq || [];
 15+ _gaq.push(['_setAccount', 'UA-20321172-1']);
 16+ _gaq.push(['_trackPageview']);
 17+
 18+ (function() {
 19+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
 20+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
 21+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
 22+ })();
 23+
 24+</script>
 25+-->
 26+</head>
 27+
 28+<body>
 29+
 30+<h1>Fundraiser Reporting - Mining Counts</h1>
 31+
 32+<br>
 33+
 34+
 35+<h3><u>Banner Impressions - Last 24 Hours</u></h3>
 36+<OBJECT WIDTH="1600" HEIGHT="800" data="report_banner_impressions_by_hour.png" type="image/png">
 37+</OBJECT>
 38+
 39+<br>
 40+
 41+<h3><u>Landing Page Views - Last 24 Hours</u></h3>
 42+<OBJECT WIDTH="1600" HEIGHT="800" data="report_lp_views_by_hour.png" type="image/png">
 43+</OBJECT>
 44+
 45+<br>
 46+
 47+
 48+</body>
 49+
 50+</html>
\ No newline at end of file
Index: trunk/fundraiser-statistics/bash/plot_build_hr.sh
@@ -15,4 +15,7 @@
1616
1717 cp /home/rfaulk/fundraiser-statistics/fundraiser-scripts/report_LP_metrics_don_per_view_latest.png /srv/org.wikimedia.fundraising/stats/
1818 cp /home/rfaulk/fundraiser-statistics/fundraiser-scripts/report_banner_metrics_don_per_imp_latest.png /srv/org.wikimedia.fundraising/stats/
19 -cp /home/rfaulk/fundraiser-statistics/fundraiser-scripts/report_banner_metrics_click_rate_latest.png /srv/org.wikimedia.fundraising/stats/
\ No newline at end of file
 19+cp /home/rfaulk/fundraiser-statistics/fundraiser-scripts/report_banner_metrics_click_rate_latest.png /srv/org.wikimedia.fundraising/stats/
 20+
 21+cp /home/rfaulk/fundraiser-statistics/fundraiser-scripts/report_banner_impressions_by_hour.png /srv/org.wikimedia.fundraising/stats/
 22+cp /home/rfaulk/fundraiser-statistics/fundraiser-scripts/report_lp_views_by_hour.png /srv/org.wikimedia.fundraising/stats/
\ No newline at end of file

Status & tagging log