Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt |
— | — | @@ -1,5 +1,4 @@ |
| 2 | +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite) |
2 | 3 | |
3 | 4 | -- tables implemented: |
4 | 5 | -- comscore |
— | — | @@ -8,19 +7,17 @@ |
9 | 8 | -- page_views |
10 | 9 | -- language names |
11 | 10 | |
12 | | - |
| 11 | +-- more tables planned (O= optional, not needed for report card stage) |
13 | 12 | -- project_names |
| 13 | +-- binaries per project_code, per language, per month, per extension type |
| 14 | +-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) |
| 15 | +-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) |
14 | 16 | |
15 | 17 | |
16 | 18 | -- open issues: |
17 | 19 | -- only store basic data in database and calculate all aggregates on the fly or do some aggragation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==) |
18 | 20 | -- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month) |
19 | 21 | |
20 | | - |
21 | 22 | -- Create database and two tables from scratch |
22 | 23 | DROP DATABASE IF EXISTS `analytics` ; |
23 | 24 | |
— | — | @@ -29,128 +26,138 @@ |
30 | 27 | USE `analytics` ; |
31 | 28 | |
32 | 29 | CREATE TABLE `comscore` ( |
33 | | - `yyyymm` char (7), |
34 | | - `country_code` char (3), |
35 | | - `region_code` char (2), |
36 | | - `property` char (20), |
37 | | - `project` char (10), |
38 | | - `reach` decimal (4,1) DEFAULT '-1', |
39 | | - `visitors` decimal (15) DEFAULT '-1', |
40 | | - PRIMARY KEY (yyyymm,country_code,region_code,property) |
| 30 | + `date` date NOT NULL, |
| 31 | + `country_code` varchar (3), |
| 32 | + `region_code` varchar (3), |
| 33 | + `web_property` varchar (20), |
| 34 | + `project_code` varchar (10), |
| 35 | + `reach` decimal (4,1) DEFAULT NULL, |
| 36 | + `visitors` decimal (15) DEFAULT NULL, |
| 37 | + PRIMARY KEY (date,country_code,region_code,project_code,web_property), |
| 38 | + KEY (`country_code`) |
41 | 39 | ) ; |
42 | 40 | |
43 | 41 | CREATE TABLE `comscore_regions` ( |
44 | | - `region_code` char (2), |
45 | | - `report_language` char (10), |
46 | | - `region_name` char (18), |
| 42 | + `region_code` varchar (2), |
| 43 | + `report_language` varchar (10), |
| 44 | + `region_name` varchar (18), |
47 | 45 | PRIMARY KEY (report_language,region_code) |
48 | 46 | ) ; |
49 | 47 | |
50 | 48 | CREATE TABLE `wikistats` ( |
51 | | - `yyyymm` char (7), |
52 | | - `project` char (2), |
53 | | - `language_code` char (15), |
54 | | - `editors_all_time` decimal (10) DEFAULT '-1', |
55 | | - `editors_new` decimal (7) DEFAULT '-1', |
56 | | - `editors_ge_5` decimal (7) DEFAULT '-1', |
57 | | - `editors_ge_25` decimal (7) DEFAULT '-1', |
58 | | - `editors_ge_100` decimal (7) DEFAULT '-1', |
59 | | - `articles` decimal (12) DEFAULT '-1', |
60 | | - `articles_new_per_day` decimal (9) DEFAULT '-1', |
61 | | - `articles_over_bytes_500` decimal (12) DEFAULT '-1', |
62 | | - `articles_over_bytes_2000` decimal (12) DEFAULT '-1', |
63 | | - `edits_per_article` decimal (9,1) DEFAULT '-1', |
64 | | - `bytes_per_article` decimal (9,1) DEFAULT '-1', |
65 | | - `edits` decimal (12) DEFAULT '-1', |
66 | | - `size_in_bytes` decimal (15) DEFAULT '-1', |
67 | | - `size_in_words` decimal (15) DEFAULT '-1', |
68 | | - `links_internal` decimal (15) DEFAULT '-1', |
69 | | - `links_interwiki` decimal (15) DEFAULT '-1', |
70 | | - `links_image` decimal (15) DEFAULT '-1', |
71 | | - `links_external` decimal (15) DEFAULT '-1', |
72 | | - `redirects` decimal (15) DEFAULT '-1', |
73 | | - PRIMARY KEY (yyyymm,project,language_code) |
| 49 | + `date` date NOT NULL, |
| 50 | + `project_code` varchar (10), |
| 51 | + `language_code` varchar (15), |
| 52 | + `editors_all_time` int (10) DEFAULT NULL, |
| 53 | + `editors_new` int (7) DEFAULT NULL, |
| 54 | + `editors_ge_5` int (7) DEFAULT NULL, |
| 55 | + `editors_ge_25` int (7) DEFAULT NULL, |
| 56 | + `editors_ge_100` int (7) DEFAULT NULL, |
| 57 | + `articles` int (12) DEFAULT NULL, |
| 58 | + `articles_new_per_day` int (9) DEFAULT NULL, |
| 59 | + `articles_over_bytes_500` int (12) DEFAULT NULL, |
| 60 | + `articles_over_bytes_2000` int (12) DEFAULT NULL, |
| 61 | + `edits_per_article` decimal (9,1) DEFAULT NULL, |
| 62 | + `bytes_per_article` decimal (9,1) DEFAULT NULL, |
| 63 | + `edits` int (12) DEFAULT NULL, |
| 64 | + `size_in_bytes` int (15) DEFAULT NULL, |
| 65 | + `size_in_words` int (15) DEFAULT NULL, |
| 66 | + `links_internal` int (15) DEFAULT NULL, |
| 67 | + `links_interwiki` int (15) DEFAULT NULL, |
| 68 | + `links_image` int (15) DEFAULT NULL, |
| 69 | + `links_external` int (15) DEFAULT NULL, |
| 70 | + `redirects` int (15) DEFAULT NULL, |
| 71 | + PRIMARY KEY (date,project_code,language_code) |
74 | 72 | ) ; |
75 | 73 | |
76 | 74 | CREATE TABLE `page_views` ( |
77 | 75 | -- analytics data |
78 | | - `yyyymm` char (7), |
79 | | - `project` char (2), |
| 76 | + `date` date NOT NULL, |
| 77 | + `project_code` char (2), |
80 | 78 | `language_code` char (15), |
81 | | - `views_non_mobile_raw` decimal (15) DEFAULT '-1', |
82 | | - `views_mobile_raw` decimal (15) DEFAULT '-1', |
83 | | - `views_non_mobile_normalized` decimal (15) DEFAULT '-1', |
84 | | - `views_mobile_normalized` decimal (15) DEFAULT '-1', |
85 | | - PRIMARY KEY (yyyymm,project,language_code) |
| 79 | + `views_non_mobile_raw` bigint (15), |
| 80 | + `views_mobile_raw` bigint (15), |
| 81 | + `views_non_mobile_normalized` bigint (15), |
| 82 | + `views_mobile_normalized` bigint (15), |
| 83 | + PRIMARY KEY (date,project_code,language_code) |
86 | 84 | ) ; |
87 | 85 | |
88 | 86 | CREATE TABLE `language_names` ( |
89 | | - `report_language` char (15), |
90 | | - `language_code` char (15), |
91 | | - `language_name` char (50), |
| 87 | + `report_language` varchar (15), |
| 88 | + `language_code` varchar (15), |
| 89 | + `language_name` varchar (50), |
92 | 90 | PRIMARY KEY (report_language,language_code) |
93 | 91 | ) ; |
94 | 92 | |
| 93 | + -- SHOW TABLES ; |
| 94 | + -- DESCRIBE comscore ; |
| 95 | + -- DESCRIBE comscore_regions ; |
| 96 | + -- DESCRIBE wikistats ; |
| 97 | + -- DESCRIBE page_views ; |
| 98 | + -- DESCRIBE language_names ; |
95 | 99 | |
96 | 100 | -- Database Manipulation |
97 | 101 | -- Obviously in real world this is a separate script |
98 | 102 | |
99 | | -LOAD DATA LOCAL INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv' |
| 103 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' |
100 | 104 | INTO TABLE comscore |
101 | 105 | FIELDS TERMINATED BY ',' |
102 | 106 | OPTIONALLY ENCLOSED BY '"' |
103 | | - (yyyymm,country_code,region_code,property,project,reach,visitors) ; |
| 107 | + (@date,country_code,region_code,web_property,project_code,reach,visitors) |
| 108 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
104 | 109 | |
105 | | -LOAD DATA LOCAL INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv' |
| 110 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' |
106 | 111 | INTO TABLE comscore_regions |
107 | 112 | FIELDS TERMINATED BY ',' |
108 | 113 | OPTIONALLY ENCLOSED BY '"' |
109 | 114 | (report_language,region_code,region_name) ; |
110 | 115 | |
111 | | -SELECT * |
112 | | - FROM comscore_regions ; |
113 | | - |
114 | 116 | LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' |
115 | 117 | INTO TABLE wikistats |
116 | 118 | FIELDS TERMINATED BY ',' |
117 | 119 | OPTIONALLY ENCLOSED BY '"' |
118 | | - (project,language_code,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ; |
| 120 | + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) |
| 121 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
119 | 122 | |
120 | 123 | LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' |
121 | 124 | INTO TABLE page_views |
122 | 125 | FIELDS TERMINATED BY ',' |
123 | 126 | OPTIONALLY ENCLOSED BY '"' |
124 | | - (project,language_code,yyyymm,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized) ; |
| 127 | + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized) |
| 128 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
125 | 129 | |
| 130 | + |
126 | 131 | LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' |
127 | 132 | INTO TABLE language_names |
128 | 133 | FIELDS TERMINATED BY ',' |
129 | 134 | OPTIONALLY ENCLOSED BY '"' |
130 | 135 | (report_language,language_code,language_name) ; |
131 | 136 | |
| 137 | +-- show contents (debugging only) |
| 138 | +-- SELECT * FROM comscore ; |
| 139 | +-- SELECT * FROM comscore_regions ; |
| 140 | +-- SELECT * FROM wikistats ; |
| 141 | +-- SELECT * FROM page_views ; |
| 142 | +-- SELECT * FROM language_names ; |
| 143 | + |
| 144 | + |
132 | 145 | -- view makes join implicit and adds aggregated fields (and provides default order, can be overidden) |
| 146 | + |
| 147 | +-- define views (experimental only, not for current project) |
| 148 | + |
| 149 | +CREATE VIEW comscore_v AS |
| 150 | + SELECT date, country_code, /* country_name, */ comscore.region_code, region_name, web_property, project_code, reach, visitors |
| 151 | + FROM comscore LEFT JOIN comscore_regions /* LEFT JOIN country_names */ |
| 152 | + ON comscore.region_code = comscore_regions.region_code |
| 153 | + -- AND comscore.country = country.country_code |
| 154 | + ORDER BY date,project_code,region_code ; |
| 155 | + |
| 156 | +-- test code |
| 157 | +-- SELECT * FROM comscore_v |
| 158 | +-- WHERE region_code IS NULL ; |
| 159 | + |
133 | 160 | CREATE VIEW page_views_v AS |
134 | | - SELECT yyyymm, project, page_views.language_code, language_name, |
| 161 | + SELECT date, project_code, page_views.language_code, language_name, |
135 | 162 | views_non_mobile_raw, views_mobile_raw , |
136 | 163 | views_non_mobile_normalized, views_mobile_normalized, |
137 | 164 | views_non_mobile_raw+views_mobile_raw AS views_raw, |
— | — | @@ -158,72 +165,12 @@ |
159 | 166 | language_names.report_language |
160 | 167 | FROM page_views LEFT JOIN language_names |
161 | 168 | ON page_views.language_code = language_names.language_code |
162 | | - ORDER BY yyyymm,project,language_name ; |
| 169 | + ORDER BY date,project_code,language_code ; |
163 | 170 | |
164 | | -PREPARE page_views_s1 |
165 | | - FROM |
166 | | - "SELECT yyyymm,project,language_code,language_name,views_normalized |
167 | | - FROM page_views_v |
168 | | - WHERE (project = 'wp') AND |
169 | | - (yyyymm BETWEEN '2011-03' AND '2011-05') AND |
170 | | - (language_code = 'nl') AND |
171 | | - (report_language=?) -- parameter needed |
172 | | - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test1.csv' |
173 | | - FIELDS TERMINATED BY ','" ; |
| 171 | +-- test code |
| 172 | +-- SELECT * |
| 173 | +-- FROM page_views_v |
| 174 | +-- WHERE (project_code = 'wv') AND |
| 175 | +-- (date >= '2011-01') AND (date <= '2011-03') ; |
174 | 176 | |
175 | | -SET @report_language = 'en' ; -- used as parameter in prepared statements ; for tests databasee contains translations for 'en' and 'de' |
176 | 177 | |
177 | | -EXECUTE page_views_s1 USING @report_language ; |
178 | | - |
179 | | -SET @project = 'wp' ; |
180 | | -SET @yyyymm_first = '2011-03' ; |
181 | | -SET @yyyymm_last = '2011-05' ; |
182 | | -SET @language_code = 'nl' ; |
183 | | -SET @report_language = 'de' ; |
184 | | - |
185 | | -PREPARE page_views_s2 |
186 | | - FROM |
187 | | - "SELECT yyyymm,project,language_code,language_name,views_normalized |
188 | | - FROM page_views_v |
189 | | - WHERE (project = ?) AND |
190 | | - (yyyymm BETWEEN ? AND ?) AND |
191 | | - (language_code = ?) AND |
192 | | - (report_language = ?) |
193 | | - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test2.csv' |
194 | | - FIELDS TERMINATED BY ','" ; |
195 | | - |
196 | | -EXECUTE page_views_s2 USING @project, @yyyymm_first, @yyyymm_last, @language_code, @report_language ; |
197 | | - |
198 | | - |
199 | | -/* |
200 | | - |
201 | | - |
202 | | -DROP PROCEDURE IF EXISTS page_views_p1 ; |
203 | | -DELIMITER // |
204 | | -CREATE PROCEDURE page_views_p1 (parm_project VARCHAR(30)) |
205 | | - SELECT yyyymm,project,language_code,language_name,views_normalized |
206 | | - FROM page_views_v |
207 | | - WHERE (project = parm_project) ; |
208 | | - // |
209 | | -DELIMITER ; |
210 | | - |
211 | | - |
212 | | -delimiter // |
213 | | -CREATE PROCEDURE dynamic(IN tbl CHAR(64), IN col CHAR(64)) |
214 | | -BEGIN |
215 | | - SET @s = CONCAT('SELECT ',col,' FROM ',tbl) ; |
216 | | - PREPARE stmt FROM @s; |
217 | | - EXECUTE stmt; |
218 | | -END |
219 | | -// |
220 | | -delimiter ; |
221 | | -CALL dynamic ('comscore', 'yyyymm') |
222 | | -*/ |
223 | | - |
224 | | - |
225 | | - |
Index: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl |
— | — | @@ -39,7 +39,7 @@ |
40 | 40 | $false = 0 ; |
41 | 41 | |
42 | 42 | $script_name = "AnalyticsPrepComscoreData.pl" ; |
43 | | - $script_version = "0.31" ; |
| 43 | + $script_version = "0.32" ; |
44 | 44 | |
45 | 45 | # EZ test only |
46 | 46 | # $source = "comscore" ; |
— | — | @@ -50,17 +50,19 @@ |
51 | 51 | $dir_analytics = $options {"m"} ; |
52 | 52 | $dir_comscore_updates = $options {"u"} ; |
53 | 53 | |
54 | | -# $dir_analytics = "c:/MySQL/analytics" ; # EZ test only |
55 | | -# $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only |
| 54 | + $dir_analytics = "C:/@ Wikimedia/! MySQL/analytics" ; # EZ test only |
| 55 | + $dir_comscore_updates = "C:/@ Wikimedia/@ Report Card/Data" ; # EZ test only |
56 | 56 | |
57 | 57 | if (($dir_analytics eq '') || ($dir_comscore_updates eq '')) |
58 | 58 | { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; } |
59 | 59 | |
60 | | - $file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ; |
61 | | - $file_comscore_reach_update = "*reach*by*region*csv" ; |
62 | | - $file_comscore_uv_region_master = "excel_out_comscore_UV_regions.csv" ; |
63 | | - $file_comscore_uv_region_update = "*UVs*by*region*csv" ; |
64 | | - $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ; |
| 60 | + $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; |
| 61 | + $file_comscore_reach_update = "*reach*by*region*csv" ; |
| 62 | + |
| 63 | + $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; |
| 64 | + $file_comscore_uv_region_update = "*UVs*by*region*csv" ; |
| 65 | + |
| 66 | + $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ; |
65 | 67 | $file_comscore_uv_property_update = "*UV*trend*csv" ; |
66 | 68 | |
67 | 69 | $layout_csv_reach = 1 ; |
— | — | @@ -113,33 +115,33 @@ |
114 | 116 | if (! -e "$dir_analytics/$file_comscore_master") |
115 | 117 | { Abort ("File $file_comscore_master not found!") ; } |
116 | 118 | |
117 | | - $age_all = -M "$dir_analytics/$file_comscore_master" ; |
118 | | - print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ; |
| 119 | + $age_master = -M "$dir_analytics/$file_comscore_master" ; |
| 120 | + print "\nLatest comscore master file is " . sprintf ("%.0f", $age_master) . " days old: '$file_comscore_master'\n" ; |
119 | 121 | |
120 | 122 | my $cwd = getcwd ; |
121 | 123 | chdir $dir_comscore_updates ; |
122 | 124 | |
123 | 125 | @files = glob($file_comscore_updates) ; |
124 | | - $min_age_upd = 999999 ; |
| 126 | + $age_update = 999999 ; |
125 | 127 | $file_comscore_updates_latest = '' ; |
126 | 128 | foreach $file (@files) |
127 | 129 | { |
128 | 130 | $age = -M $file ; |
129 | | - if ($age < $min_age_upd) |
| 131 | + if ($age < $age_update) |
130 | 132 | { |
131 | | - $min_age_upd = $age ; |
| 133 | + $age_update = $age ; |
132 | 134 | $file_comscore_updates_latest = $file ; |
133 | 135 | } |
134 | 136 | } |
135 | | - print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ; |
| 137 | + print "\nLatest comscore update file is " . sprintf ("%.0f", $age_update) . " days old: '$file_comscore_updates_latest'\n" ; |
136 | 138 | |
137 | | - if ($min_age_upd == 999999) |
| 139 | + if ($age_update == 999999) |
138 | 140 | { |
139 | 141 | print "No valid update file found. Nothing to update." ; |
140 | 142 | return ; |
141 | 143 | } |
142 | 144 | |
143 | | - if ($age_all > $min_age_upd) |
| 145 | + if ($age_master < $age_update) |
144 | 146 | { |
145 | 147 | print "File with master data more recent than latest update csv from comScore. Nothing to update." ; |
146 | 148 | return ; |
— | — | @@ -423,7 +425,7 @@ |
424 | 426 | |
425 | 427 | sub WriteDataAnalytics |
426 | 428 | { |
427 | | - open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ; |
| 429 | + open OUT, '>', "$dir_analytics/analytics_in_comscore.csv" ; |
428 | 430 | |
429 | 431 | $metric = 'unique_visitors' ; |
430 | 432 | foreach $yyyymm (sort keys %months) |
— | — | @@ -445,7 +447,7 @@ |
446 | 448 | |
447 | 449 | $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
448 | 450 | print OUT $line ; |
449 | | - print $line ; |
| 451 | + # print $line ; |
450 | 452 | } |
451 | 453 | |
452 | 454 | foreach $property (sort @properties) |