r91137 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91136‎ | r91137 | r91138 >
Date:23:07, 29 June 2011
Author:ezachte
Status:deferred
Tags:
Comment:
Modified paths:
  • /trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl (modified) (history)
  • /trunk/wikistats/analytics/create_and_use_db_analytics.txt (modified) (history)

Diff [purge]

Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt
@@ -1,5 +1,4 @@
 2+-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite)
23
34 -- tables implemented:
45 -- comscore
@@ -8,19 +7,17 @@
98 -- page_views
109 -- language names
1110
12 -
 11+-- more tables planned (O= optional, not needed for report card stage)
1312 -- project_names
 13+-- binaries per project_code, per language, per month, per extension type
 14+-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
 15+-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
1416
1517
1618 -- open issues:
1719 -- only store basic data in database and calculate all aggregates on the fly or do some aggragation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==)
1820 -- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month)
1921
20 -
2122 -- Create database and two tables from scratch
2223 DROP DATABASE IF EXISTS `analytics` ;
2324
@@ -29,128 +26,138 @@
3027 USE `analytics` ;
3128
3229 CREATE TABLE `comscore` (
33 - `yyyymm` char (7),
34 - `country_code` char (3),
35 - `region_code` char (2),
36 - `property` char (20),
37 - `project` char (10),
38 - `reach` decimal (4,1) DEFAULT '-1',
39 - `visitors` decimal (15) DEFAULT '-1',
40 - PRIMARY KEY (yyyymm,country_code,region_code,property)
 30+ `date` date NOT NULL,
 31+ `country_code` varchar (3),
 32+ `region_code` varchar (3),
 33+ `web_property` varchar (20),
 34+ `project_code` varchar (10),
 35+ `reach` decimal (4,1) DEFAULT NULL,
 36+ `visitors` decimal (15) DEFAULT NULL,
 37+ PRIMARY KEY (date,country_code,region_code,project_code,web_property),
 38+ KEY (`country_code`)
4139 ) ;
4240
4341 CREATE TABLE `comscore_regions` (
44 - `region_code` char (2),
45 - `report_language` char (10),
46 - `region_name` char (18),
 42+ `region_code` varchar (2),
 43+ `report_language` varchar (10),
 44+ `region_name` varchar (18),
4745 PRIMARY KEY (report_language,region_code)
4846 ) ;
4947
5048 CREATE TABLE `wikistats` (
51 - `yyyymm` char (7),
52 - `project` char (2),
53 - `language_code` char (15),
54 - `editors_all_time` decimal (10) DEFAULT '-1',
55 - `editors_new` decimal (7) DEFAULT '-1',
56 - `editors_ge_5` decimal (7) DEFAULT '-1',
57 - `editors_ge_25` decimal (7) DEFAULT '-1',
58 - `editors_ge_100` decimal (7) DEFAULT '-1',
59 - `articles` decimal (12) DEFAULT '-1',
60 - `articles_new_per_day` decimal (9) DEFAULT '-1',
61 - `articles_over_bytes_500` decimal (12) DEFAULT '-1',
62 - `articles_over_bytes_2000` decimal (12) DEFAULT '-1',
63 - `edits_per_article` decimal (9,1) DEFAULT '-1',
64 - `bytes_per_article` decimal (9,1) DEFAULT '-1',
65 - `edits` decimal (12) DEFAULT '-1',
66 - `size_in_bytes` decimal (15) DEFAULT '-1',
67 - `size_in_words` decimal (15) DEFAULT '-1',
68 - `links_internal` decimal (15) DEFAULT '-1',
69 - `links_interwiki` decimal (15) DEFAULT '-1',
70 - `links_image` decimal (15) DEFAULT '-1',
71 - `links_external` decimal (15) DEFAULT '-1',
72 - `redirects` decimal (15) DEFAULT '-1',
73 - PRIMARY KEY (yyyymm,project,language_code)
 49+ `date` date NOT NULL,
 50+ `project_code` varchar (10),
 51+ `language_code` varchar (15),
 52+ `editors_all_time` int (10) DEFAULT NULL,
 53+ `editors_new` int (7) DEFAULT NULL,
 54+ `editors_ge_5` int (7) DEFAULT NULL,
 55+ `editors_ge_25` int (7) DEFAULT NULL,
 56+ `editors_ge_100` int (7) DEFAULT NULL,
 57+ `articles` int (12) DEFAULT NULL,
 58+ `articles_new_per_day` int (9) DEFAULT NULL,
 59+ `articles_over_bytes_500` int (12) DEFAULT NULL,
 60+ `articles_over_bytes_2000` int (12) DEFAULT NULL,
 61+ `edits_per_article` decimal (9,1) DEFAULT NULL,
 62+ `bytes_per_article` decimal (9,1) DEFAULT NULL,
 63+ `edits` int (12) DEFAULT NULL,
 64+ `size_in_bytes` int (15) DEFAULT NULL,
 65+ `size_in_words` int (15) DEFAULT NULL,
 66+ `links_internal` int (15) DEFAULT NULL,
 67+ `links_interwiki` int (15) DEFAULT NULL,
 68+ `links_image` int (15) DEFAULT NULL,
 69+ `links_external` int (15) DEFAULT NULL,
 70+ `redirects` int (15) DEFAULT NULL,
 71+ PRIMARY KEY (date,project_code,language_code)
7472 ) ;
7573
7674 CREATE TABLE `page_views` (
7775 -- analytics data
78 - `yyyymm` char (7),
79 - `project` char (2),
 76+ `date` date NOT NULL,
 77+ `project_code` char (2),
8078 `language_code` char (15),
81 - `views_non_mobile_raw` decimal (15) DEFAULT '-1',
82 - `views_mobile_raw` decimal (15) DEFAULT '-1',
83 - `views_non_mobile_normalized` decimal (15) DEFAULT '-1',
84 - `views_mobile_normalized` decimal (15) DEFAULT '-1',
85 - PRIMARY KEY (yyyymm,project,language_code)
 79+ `views_non_mobile_raw` bigint (15),
 80+ `views_mobile_raw` bigint (15),
 81+ `views_non_mobile_normalized` bigint (15),
 82+ `views_mobile_normalized` bigint (15),
 83+ PRIMARY KEY (date,project_code,language_code)
8684 ) ;
8785
8886 CREATE TABLE `language_names` (
89 - `report_language` char (15),
90 - `language_code` char (15),
91 - `language_name` char (50),
 87+ `report_language` varchar (15),
 88+ `language_code` varchar (15),
 89+ `language_name` varchar (50),
9290 PRIMARY KEY (report_language,language_code)
9391 ) ;
9492
 93+ -- SHOW TABLES ;
 94+ -- DESCRIBE comscore ;
 95+ -- DESCRIBE comscore_regions ;
 96+ -- DESCRIBE wikistats ;
 97+ -- DESCRIBE page_views ;
 98+ -- DESCRIBE language_names ;
9599
96100 -- Database Manipulation
97101 -- Obviously in real world this is a separate script
98102
99 -LOAD DATA LOCAL INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv'
 103+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
100104 INTO TABLE comscore
101105 FIELDS TERMINATED BY ','
102106 OPTIONALLY ENCLOSED BY '"'
103 - (yyyymm,country_code,region_code,property,project,reach,visitors) ;
 107+ (@date,country_code,region_code,web_property,project_code,reach,visitors)
 108+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
104109
105 -LOAD DATA LOCAL INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
 110+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
106111 INTO TABLE comscore_regions
107112 FIELDS TERMINATED BY ','
108113 OPTIONALLY ENCLOSED BY '"'
109114 (report_language,region_code,region_name) ;
110115
111 -SELECT *
112 - FROM comscore_regions ;
113 -
114116 LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
115117 INTO TABLE wikistats
116118 FIELDS TERMINATED BY ','
117119 OPTIONALLY ENCLOSED BY '"'
118 - (project,language_code,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ;
 120+ (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 121+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
119122
120123 LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
121124 INTO TABLE page_views
122125 FIELDS TERMINATED BY ','
123126 OPTIONALLY ENCLOSED BY '"'
124 - (project,language_code,yyyymm,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized) ;
 127+ (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized)
 128+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
125129
 130+
126131 LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
127132 INTO TABLE language_names
128133 FIELDS TERMINATED BY ','
129134 OPTIONALLY ENCLOSED BY '"'
130135 (report_language,language_code,language_name) ;
131136
 137+-- show contents (debugging only)
 138+-- SELECT * FROM comscore ;
 139+-- SELECT * FROM comscore_regions ;
 140+-- SELECT * FROM wikistats ;
 141+-- SELECT * FROM page_views ;
 142+-- SELECT * FROM language_names ;
 143+
 144+
132145 -- view makes join implicit and adds aggregated fields (and provides default order, can be overidden)
 146+
 147+-- define views (experimental only, not for current project)
 148+
 149+CREATE VIEW comscore_v AS
 150+ SELECT date, country_code, /* country_name, */ comscore.region_code, region_name, web_property, project_code, reach, visitors
 151+ FROM comscore LEFT JOIN comscore_regions /* LEFT JOIN country_names */
 152+ ON comscore.region_code = comscore_regions.region_code
 153+ -- AND comscore.country = country.country_code
 154+ ORDER BY date,project_code,region_code ;
 155+
 156+-- test code
 157+-- SELECT * FROM comscore_v
 158+-- WHERE region_code IS NULL ;
 159+
133160 CREATE VIEW page_views_v AS
134 - SELECT yyyymm, project, page_views.language_code, language_name,
 161+ SELECT date, project_code, page_views.language_code, language_name,
135162 views_non_mobile_raw, views_mobile_raw ,
136163 views_non_mobile_normalized, views_mobile_normalized,
137164 views_non_mobile_raw+views_mobile_raw AS views_raw,
@@ -158,72 +165,12 @@
159166 language_names.report_language
160167 FROM page_views LEFT JOIN language_names
161168 ON page_views.language_code = language_names.language_code
162 - ORDER BY yyyymm,project,language_name ;
 169+ ORDER BY date,project_code,language_code ;
163170
164 -PREPARE page_views_s1
165 - FROM
166 - "SELECT yyyymm,project,language_code,language_name,views_normalized
167 - FROM page_views_v
168 - WHERE (project = 'wp') AND
169 - (yyyymm BETWEEN '2011-03' AND '2011-05') AND
170 - (language_code = 'nl') AND
171 - (report_language=?) -- parameter needed
172 - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test1.csv'
173 - FIELDS TERMINATED BY ','" ;
 171+-- test code
 172+-- SELECT *
 173+-- FROM page_views_v
 174+-- WHERE (project_code = 'wv') AND
 175+-- (date >= '2011-01') AND (date <= '2011-03') ;
174176
175 -SET @report_language = 'en' ; -- used as parameter in prepared statements ; for tests databasee contains translations for 'en' and 'de'
176177
177 -EXECUTE page_views_s1 USING @report_language ;
178 -
179 -SET @project = 'wp' ;
180 -SET @yyyymm_first = '2011-03' ;
181 -SET @yyyymm_last = '2011-05' ;
182 -SET @language_code = 'nl' ;
183 -SET @report_language = 'de' ;
184 -
185 -PREPARE page_views_s2
186 - FROM
187 - "SELECT yyyymm,project,language_code,language_name,views_normalized
188 - FROM page_views_v
189 - WHERE (project = ?) AND
190 - (yyyymm BETWEEN ? AND ?) AND
191 - (language_code = ?) AND
192 - (report_language = ?)
193 - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test2.csv'
194 - FIELDS TERMINATED BY ','" ;
195 -
196 -EXECUTE page_views_s2 USING @project, @yyyymm_first, @yyyymm_last, @language_code, @report_language ;
197 -
198 -
199 -/*
200 -
201 -
202 -DROP PROCEDURE IF EXISTS page_views_p1 ;
203 -DELIMITER //
204 -CREATE PROCEDURE page_views_p1 (parm_project VARCHAR(30))
205 - SELECT yyyymm,project,language_code,language_name,views_normalized
206 - FROM page_views_v
207 - WHERE (project = parm_project) ;
208 - //
209 -DELIMITER ;
210 -
211 -
212 -delimiter //
213 -CREATE PROCEDURE dynamic(IN tbl CHAR(64), IN col CHAR(64))
214 -BEGIN
215 - SET @s = CONCAT('SELECT ',col,' FROM ',tbl) ;
216 - PREPARE stmt FROM @s;
217 - EXECUTE stmt;
218 -END
219 -//
220 -delimiter ;
221 -CALL dynamic ('comscore', 'yyyymm')
222 -*/
223 -
224 -
225 -
Index: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
@@ -39,7 +39,7 @@
4040 $false = 0 ;
4141
4242 $script_name = "AnalyticsPrepComscoreData.pl" ;
43 - $script_version = "0.31" ;
 43+ $script_version = "0.32" ;
4444
4545 # EZ test only
4646 # $source = "comscore" ;
@@ -50,17 +50,19 @@
5151 $dir_analytics = $options {"m"} ;
5252 $dir_comscore_updates = $options {"u"} ;
5353
54 -# $dir_analytics = "c:/MySQL/analytics" ; # EZ test only
55 -# $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
 54+ $dir_analytics = "C:/@ Wikimedia/! MySQL/analytics" ; # EZ test only
 55+ $dir_comscore_updates = "C:/@ Wikimedia/@ Report Card/Data" ; # EZ test only
5656
5757 if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
5858 { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; }
5959
60 - $file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ;
61 - $file_comscore_reach_update = "*reach*by*region*csv" ;
62 - $file_comscore_uv_region_master = "excel_out_comscore_UV_regions.csv" ;
63 - $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
64 - $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
 60+ $file_comscore_reach_master = "history_comscore_reach_regions.csv" ;
 61+ $file_comscore_reach_update = "*reach*by*region*csv" ;
 62+
 63+ $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ;
 64+ $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
 65+
 66+ $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
6567 $file_comscore_uv_property_update = "*UV*trend*csv" ;
6668
6769 $layout_csv_reach = 1 ;
@@ -113,33 +115,33 @@
114116 if (! -e "$dir_analytics/$file_comscore_master")
115117 { Abort ("File $file_comscore_master not found!") ; }
116118
117 - $age_all = -M "$dir_analytics/$file_comscore_master" ;
118 - print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;
 119+ $age_master = -M "$dir_analytics/$file_comscore_master" ;
 120+ print "\nLatest comscore master file is " . sprintf ("%.0f", $age_master) . " days old: '$file_comscore_master'\n" ;
119121
120122 my $cwd = getcwd ;
121123 chdir $dir_comscore_updates ;
122124
123125 @files = glob($file_comscore_updates) ;
124 - $min_age_upd = 999999 ;
 126+ $age_update = 999999 ;
125127 $file_comscore_updates_latest = '' ;
126128 foreach $file (@files)
127129 {
128130 $age = -M $file ;
129 - if ($age < $min_age_upd)
 131+ if ($age < $age_update)
130132 {
131 - $min_age_upd = $age ;
 133+ $age_update = $age ;
132134 $file_comscore_updates_latest = $file ;
133135 }
134136 }
135 - print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;
 137+ print "\nLatest comscore update file is " . sprintf ("%.0f", $age_update) . " days old: '$file_comscore_updates_latest'\n" ;
136138
137 - if ($min_age_upd == 999999)
 139+ if ($age_update == 999999)
138140 {
139141 print "No valid update file found. Nothing to update." ;
140142 return ;
141143 }
142144
143 - if ($age_all > $min_age_upd)
 145+ if ($age_master < $age_update)
144146 {
145147 print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
146148 return ;
@@ -423,7 +425,7 @@
424426
425427 sub WriteDataAnalytics
426428 {
427 - open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ;
 429+ open OUT, '>', "$dir_analytics/analytics_in_comscore.csv" ;
428430
429431 $metric = 'unique_visitors' ;
430432 foreach $yyyymm (sort keys %months)
@@ -445,7 +447,7 @@
446448
447449 $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
448450 print OUT $line ;
449 - print $line ;
 451+ # print $line ;
450452 }
451453
452454 foreach $property (sort @properties)

Status & tagging log