Index: trunk/wikistats/analytics/MySQLPrepComscoreData.pl |
— | — | @@ -16,6 +16,7 @@ |
17 | 17 | # Functionality: |
18 | 18 | # comScore data can be downloaded as csv files, each of which contains 14 months of history |
19 | 19 | # This script uses these files to update 'master' csv files which contain all known history |
| 20 | +# Note: only entities which are already in the master file will be updated! |
20 | 21 | # Then it merges these master files into one csv file which can be loaded into analytics database |
21 | 22 | # Data are: reach by region, unique visitors by region, unique visitors by web property |
22 | 23 | |
— | — | @@ -26,6 +27,8 @@ |
27 | 28 | # Output: |
28 | 29 | # updated master csv files + merged and formatted csv for import in MySQL |
29 | 30 | |
| 31 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/ |
| 32 | + |
30 | 33 | use Getopt::Std ; |
31 | 34 | use Cwd; |
32 | 35 | |
— | — | @@ -38,7 +41,7 @@ |
39 | 42 | $script_name = "MySQLPrepComscoreData.pl" ; |
40 | 43 | $script_version = "0.3" ; |
41 | 44 | |
42 | | -# test |
| 45 | +# EZ test only |
43 | 46 | # $source = "comscore" ; |
44 | 47 | # $server = "ez_test" ; |
45 | 48 | # $generated = "2011-05-06 00:00:00" ; |
— | — | @@ -60,6 +63,10 @@ |
61 | 64 | $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ; |
62 | 65 | $file_comscore_uv_property_update = "*UV*trend*csv" ; |
63 | 66 | |
| 67 | + $layout_csv_reach = 1 ; |
| 68 | + $layout_csv_regions = 2 ; |
| 69 | + $layout_csv_properties = 3 ; |
| 70 | + |
64 | 71 | print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ; |
65 | 72 | |
66 | 73 | %region_codes = ( |
— | — | @@ -69,6 +76,7 @@ |
70 | 77 | "World-Wide" => "W", |
71 | 78 | "Middle East - Africa" => "MA", |
72 | 79 | "Asia Pacific"=> "AS", |
| 80 | + "United States" => "US", |
73 | 81 | "India" => "I", |
74 | 82 | "China" => "C" |
75 | 83 | ) ; |
— | — | @@ -78,16 +86,14 @@ |
79 | 87 | |
80 | 88 | @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ; |
81 | 89 | |
82 | | - # &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f") ; |
83 | | - # %reach_region_code = %data ; |
| 90 | + &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; |
| 91 | + %reach_region_code = %data ; |
84 | 92 | |
85 | | - &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f") ; |
| 93 | + &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; |
86 | 94 | %visitors_region_code = %data ; |
87 | 95 | |
88 | | - exit ; |
89 | | - &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f") ; |
| 96 | + &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; |
90 | 97 | %visitors_web_property = %data ; |
91 | | - exit ; |
92 | 98 | |
93 | 99 | &WriteDataAnalytics ; |
94 | 100 | |
— | — | @@ -96,8 +102,14 @@ |
97 | 103 | |
98 | 104 | sub UpdateFromLatestComscoreData |
99 | 105 | { |
100 | | - my ($file_comscore_master, $file_comscore_updates) = @_ ; |
| 106 | + my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @update_only) = @_ ; |
101 | 107 | |
| 108 | + undef %update_only ; |
| 109 | + undef %do_not_update ; |
| 110 | + |
| 111 | + foreach $id (@update_only) |
| 112 | + { $update_only {$id} = $true ; } |
| 113 | + |
102 | 114 | if (! -e "$dir_analytics/$file_comscore_master") |
103 | 115 | { abort ("File $file_comscore_master not found!") ; } |
104 | 116 | |
— | — | @@ -140,46 +152,85 @@ |
141 | 153 | while ($line = <CSV>) |
142 | 154 | { |
143 | 155 | chomp $line ; |
144 | | - if ($line =~ /^Location.*?-.*?-.*?-/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' |
| 156 | + $line = &GetNumberOnly ($line) ; |
| 157 | + |
| 158 | + if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' |
145 | 159 | { |
146 | | - ($dummy1,$dummy2,@months) = split (',', $line) ; |
| 160 | + if ($layout_csv == $layout_csv_properties) |
| 161 | + { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file |
| 162 | + else |
| 163 | + { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files |
| 164 | + |
147 | 165 | @months = &mmm_yyyy2yyyy_mm (@months) ; |
148 | 166 | } |
149 | | - if ($line =~ /^\d,/) |
| 167 | + |
| 168 | + if ($line =~ /^\d+,/) |
150 | 169 | { |
151 | | - ($index,$region,@data) = split (',', $line) ; |
152 | | - $region =~ s/^\s+// ; |
153 | | - $region =~ s/\s+$// ; |
154 | | - $region_code = $region_codes {$region} ; |
| 170 | + if ($layout_csv == $layout_csv_properties) |
| 171 | + { |
| 172 | + ($index,$dummy,$property,@data) = split (',', $line) ; |
| 173 | + $property =~ s/^\s+// ; |
| 174 | + $property =~ s/\s+$// ; |
155 | 175 | |
| 176 | + $property =~ s/.*Google.*/Google/i ; |
| 177 | + $property =~ s/.*Microsoft.*/Microsoft/i ; |
| 178 | + $property =~ s/.*FACEBOOK.*/Facebook/i ; |
| 179 | + $property =~ s/.*Yahoo.*/Yahoo/i ; |
| 180 | + $property =~ s/.*Amazon.*/Amazon/i ; |
| 181 | + $property =~ s/.*Apple.*/Apple/i ; |
| 182 | + $property =~ s/.*AOL.*/AOL/i ; |
| 183 | + $property =~ s/.*Wikimedia.*/Wikimedia/i ; |
| 184 | + $property =~ s/.*Tencent.*/Tencent/i ; |
| 185 | + $property =~ s/.*Baidu.*/Baidu/i ; |
| 186 | + $property =~ s/.*CBS.*/CBS/i ; |
| 187 | + |
| 188 | + $id = $property ; |
| 189 | + } |
| 190 | + else |
| 191 | + { |
| 192 | + ($index,$region,@data) = split (',', $line) ; |
| 193 | + $region =~ s/^\s+// ; |
| 194 | + $region =~ s/\s+$// ; |
| 195 | + $id = $region_codes {$region} ; |
| 196 | + } |
| 197 | + |
| 198 | + if ($update_only {$id} == 0) |
| 199 | + { |
| 200 | + $do_not_update {$id}++ ; |
| 201 | + next ; |
| 202 | + } |
| 203 | + |
156 | 204 | for ($m = 0 ; $m <= $#months ; $m++) |
157 | 205 | { |
158 | 206 | $yyyymm = $months [$m] ; |
159 | 207 | $months {$yyyymm} ++ ; |
160 | | - $yyyymm_region_code = "$yyyymm,$region_code" ; |
161 | | - $data = $data [$m] ; |
| 208 | + $yyyymm_id = "$yyyymm,$id" ; |
| 209 | + $data = $data [$m] * $multiplier ; |
162 | 210 | |
163 | | - if (! defined $data {$yyyymm_region_code}) |
| 211 | + if (! defined $data {$yyyymm_id}) |
164 | 212 | { |
165 | 213 | $updates_found = $true ; |
166 | | - print "New data found: $yyyymm_region_code = $data\n" ; |
167 | | - $data {$yyyymm_region_code} = $data ; |
| 214 | + print "New data found: $yyyymm_id = $data\n" ; |
| 215 | + $data {$yyyymm_id} = $data ; |
168 | 216 | } |
169 | 217 | } |
170 | 218 | } |
171 | 219 | } |
172 | 220 | |
| 221 | + $ignored = join ', ', sort keys %do_not_update ; |
| 222 | + print "\nEntities ignored:\n$ignored\n\n" ; |
| 223 | + |
173 | 224 | if (! $updates_found) |
174 | | - { print "No new updates found.\n" ; } |
| 225 | + { print "No new updates found\n" ; } |
175 | 226 | else |
176 | | - { print "\nUpdates found, rewrite master file '$file_comscore_master'.\n\n" ; } |
| 227 | + { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } |
177 | 228 | |
178 | 229 | return ($updates_found) ; |
179 | 230 | } |
180 | 231 | |
181 | 232 | sub ReadDataReachPerRegion |
182 | 233 | { |
183 | | - my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ; |
| 234 | + my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; |
184 | 235 | |
185 | 236 | undef %months ; |
186 | 237 | undef %data ; |
— | — | @@ -214,7 +265,7 @@ |
215 | 266 | } |
216 | 267 | close IN ; |
217 | 268 | |
218 | | - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ; |
| 269 | + my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; |
219 | 270 | return if ! $updates_found ; |
220 | 271 | |
221 | 272 | rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
— | — | @@ -241,7 +292,7 @@ |
242 | 293 | |
243 | 294 | sub ReadDataVisitorsPerRegion |
244 | 295 | { |
245 | | - my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ; |
| 296 | + my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; |
246 | 297 | |
247 | 298 | undef %months ; |
248 | 299 | undef %data ; |
— | — | @@ -264,20 +315,24 @@ |
265 | 316 | $field_ndx = 0 ; |
266 | 317 | foreach (@data) |
267 | 318 | { |
268 | | - $region = $regions [$field_ndx++] ; |
| 319 | + $region = $regions [$field_ndx] ; |
269 | 320 | $region_code = $region_codes {$region} ; |
270 | 321 | |
271 | 322 | $data = $data [$field_ndx] ; |
272 | 323 | if ($data eq '') |
273 | 324 | { $data = '0' ; } |
274 | 325 | |
| 326 | + # print "Old data $yyyymm,$region = $data\n" ; |
| 327 | + |
275 | 328 | $months {$yyyymm} ++ ; |
276 | 329 | $data {"$yyyymm,$region_code"} = $data ; |
| 330 | + |
| 331 | + $field_ndx++ ; |
277 | 332 | } |
278 | 333 | } |
279 | 334 | close IN ; |
280 | 335 | |
281 | | - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ; |
| 336 | + my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; |
282 | 337 | return if ! $updates_found ; |
283 | 338 | |
284 | 339 | rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
— | — | @@ -304,10 +359,11 @@ |
305 | 360 | |
306 | 361 | sub ReadDataVisitorsPerProperty |
307 | 362 | { |
308 | | - my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ; |
| 363 | + my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; |
309 | 364 | |
310 | 365 | undef %months ; |
311 | 366 | undef %data ; |
| 367 | + undef @properties ; |
312 | 368 | |
313 | 369 | open IN, '<', "$dir_analytics/$file_comscore_master" ; |
314 | 370 | |
— | — | @@ -317,8 +373,6 @@ |
318 | 374 | { |
319 | 375 | chomp $line ; |
320 | 376 | |
321 | | - $line = &GetNumberOnly ($line) ; |
322 | | - |
323 | 377 | ($yyyymm,@data) = split (',', $line) ; |
324 | 378 | if ($lines++ == 0) |
325 | 379 | { @properties = @data ; next ; } |
— | — | @@ -326,36 +380,40 @@ |
327 | 381 | $field_ndx = 0 ; |
328 | 382 | foreach (@data) |
329 | 383 | { |
330 | | - $property = $properties [$field_ndx++] ; |
331 | | - |
| 384 | + $property = $properties [$field_ndx] ; |
| 385 | + $property =~ s/.*Yahoo.*/Yahoo/ ; |
332 | 386 | $data = $data [$field_ndx] ; |
333 | 387 | if ($data eq '') |
334 | 388 | { $data = '0' ; } |
335 | 389 | |
| 390 | + # print "Old data $yyyymm,$property = $data\n" ; |
| 391 | + |
336 | 392 | $months {$yyyymm} ++ ; |
337 | 393 | $data {"$yyyymm,$property"} = $data ; |
| 394 | + |
| 395 | + $field_ndx++ ; |
338 | 396 | } |
339 | 397 | } |
340 | 398 | close IN ; |
341 | 399 | |
342 | | - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ; |
| 400 | + my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; |
343 | 401 | return if ! $updates_found ; |
344 | 402 | |
345 | 403 | rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
346 | 404 | open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
347 | 405 | |
348 | 406 | $line_out = "yyyymm" ; |
349 | | - foreach $region_name (@regions) |
350 | | - { $line_out .= ",$region_name" ; } |
| 407 | + foreach $property (@properties) |
| 408 | + { $line_out .= ",$property" ; } |
351 | 409 | print OUT "$line_out" ; |
352 | 410 | |
353 | 411 | foreach $yyyymm (sort {$b cmp $a} keys %months) |
354 | 412 | { |
355 | 413 | $line_out = "\n$yyyymm" ; |
356 | | - foreach $region_name (@regions) |
| 414 | + foreach $property (@properties) |
357 | 415 | { |
358 | | - $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; |
359 | | - $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; |
| 416 | + $yyyymm_property = "$yyyymm,$property" ; |
| 417 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ; |
360 | 418 | } |
361 | 419 | print OUT "$line_out" ; |
362 | 420 | } |
— | — | @@ -367,7 +425,7 @@ |
368 | 426 | { |
369 | 427 | open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ; |
370 | 428 | |
371 | | - $metric = 'unique_visitors' ; |
| 429 | + $metric = 'unique_visitors' ; |
372 | 430 | foreach $yyyymm (sort keys %months) |
373 | 431 | { |
374 | 432 | # store meta data elsewhere |
— | — | @@ -400,7 +458,7 @@ |
401 | 459 | |
402 | 460 | $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
403 | 461 | print OUT $line ; |
404 | | - print $line ; |
| 462 | + # print $line ; |
405 | 463 | } |
406 | 464 | } |
407 | 465 | } |
Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt |
— | — | @@ -0,0 +1,77 @@ |
| 2 | +-- invoke this file with "mysql --user=root --password=[...] < create_and_use_db_analytics.txt" |
| 3 | + |
| 4 | +/* Create database and two tables from scratch */ |
| 5 | + |
| 6 | +DROP DATABASE IF EXISTS `analytics` ; |
| 7 | +CREATE DATABASE `analytics` ; |
| 8 | +USE `analytics` ; |
| 9 | + |
| 10 | +CREATE TABLE `comscore` ( |
| 11 | +-- meta data (mostly for auditing, may not be sent over API by default) |
| 12 | +/* |
| 13 | + store meta data elsewhere, tbd |
| 14 | + `id` int auto_increment NOT NULL, |
| 15 | + `generated` timestamp, |
| 16 | + `source` char (20), |
| 17 | + `server` char (20), |
| 18 | + `script_name` char (30), |
| 19 | + `script_version` char (8), |
| 20 | + `user` char (20), |
| 21 | +*/ |
| 22 | +-- analytics data |
| 23 | + `yyyymm` char (7), |
| 24 | + `country_code` char (3), |
| 25 | + `region_code` char (2), |
| 26 | + `property` char (20), |
| 27 | + `project` char (10), |
| 28 | + `reach` decimal (4,1) DEFAULT '-1', |
| 29 | + `visitors` decimal (15) DEFAULT '-1', |
| 30 | + PRIMARY KEY (yyyymm,country_code,region_code,property) |
| 31 | +) ; |
| 32 | + |
| 33 | +CREATE TABLE `comscore_regions` ( |
| 34 | + `region_code` char (2), |
| 35 | + `language_code` char (10), |
| 36 | + `region_name` char (18), |
| 37 | + PRIMARY KEY (language_code,region_code) |
| 38 | +) ; |
| 39 | + |
| 40 | +-- SHOW TABLES ; |
| 41 | +-- DESCRIBE comscore ; |
| 42 | +-- DESCRIBE comscore_regions ; |
| 43 | + |
| 44 | +/* Database Manipulation */ |
| 45 | +/* Obviously in the real world this would be a separate script */ |
| 46 | + |
| 47 | + |
| 48 | +-- show contents (debugging only) |
| 49 | +-- SELECT * |
| 50 | +-- FROM comscore_regions ; |
| 51 | + |
| 52 | +LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv' |
| 53 | + INTO TABLE comscore_regions |
| 54 | + FIELDS TERMINATED BY ',' |
| 55 | + OPTIONALLY ENCLOSED BY '"' |
| 56 | + (language_code,region_code,region_name) ; |
| 57 | + |
| 58 | +LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv' |
| 59 | + INTO TABLE comscore |
| 60 | + FIELDS TERMINATED BY ',' |
| 61 | + OPTIONALLY ENCLOSED BY '"' |
| 62 | + (yyyymm,country_code,region_code,property,project,reach,visitors) ; |
| 63 | + |
| 64 | +-- export full table contents to csv (debugging only) |
| 65 | + SELECT * |
| 66 | + FROM comscore |
| 67 | + ORDER BY yyyymm,country_code,region_code,property,project |
| 68 | + INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_full_table.csv' |
| 69 | + FIELDS TERMINATED BY ',' ; |
| 70 | + |
| 71 | +-- export all relevant non-meta data from comScore's reach by region (comScore treats India and China as regions in this context) |
| 72 | +SELECT yyyymm,region_name,reach |
| 73 | + FROM comscore LEFT JOIN comscore_regions ON comscore.region_code = comscore_regions.region_code AND comscore_regions.language_code = 'en' |
| 74 | + WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05') |
| 75 | + ORDER BY yyyymm,region_name |
| 76 | + INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv' |
| 77 | + FIELDS TERMINATED BY ',' ; |
| 78 | + |
Property changes on: trunk/wikistats/analytics/create_and_use_db_analytics.txt |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 79 | + native |