Index: trunk/wikistats/analytics/MySQLPrepComscoreData.pl |
— | — | @@ -0,0 +1,438 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# Copyright (C) 2011 Wikimedia Foundation |
| 5 | +# This program is free software; you can redistribute it and/or |
| 6 | +# modify it under the terms of the GNU General Public License version 2 |
| 7 | +# as published by the Free Software Foundation. |
| 8 | +# This program is distributed in the hope that it will be useful, |
| 9 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 11 | +# See the GNU General Public License for more details, at |
| 12 | +# http://www.fsf.org/licenses/gpl.html |
| 13 | + |
| 14 | +# Author: |
| 15 | +# Erik Zachte, email ezachte@wikimedia.org |
| 16 | + |
| 17 | +# Functionality: |
| 18 | +# comScore data can be downloaded as csv file, which each contain 14 months history |
| 19 | +# This script uses these files to update 'master' csv files which contain all known history |
| 20 | +# Then it merges these master files into one csv file which can be loaded into analytics database |
| 21 | +# Data are: reach by region, unique visitors by region, unique visitors by web property |
| 22 | + |
| 23 | +# Parameters: |
| 24 | +# -m (required) folder with 'master' csv files (files with all known history) |
| 25 | +# -u (required) folder with 'update' csv files (files with lastest 14 months history, produced by comScore) |
| 26 | + |
| 27 | +# Output: |
| 28 | +# updated master csv files + merged and formatted csv for import in MySQL |
| 29 | + |
| 30 | + use Getopt::Std ; |
| 31 | + use Cwd; |
| 32 | + |
| 33 | + my $options ; |
| 34 | + getopt ("mu", \%options) ; |
| 35 | + |
| 36 | + $true = 1 ; |
| 37 | + $false = 0 ; |
| 38 | + |
| 39 | + $script_name = "MySQLPrepComscoreData.pl" ; |
| 40 | + $script_version = "0.3" ; |
| 41 | + |
| 42 | +# test |
| 43 | +# $source = "comscore" ; |
| 44 | +# $server = "ez_test" ; |
| 45 | +# $generated = "2011-05-06 00:00:00" ; |
| 46 | +# $user = "ezachte" ; |
| 47 | + |
| 48 | + $dir_analytics = $options {"m"} ; |
| 49 | + $dir_comscore_updates = $options {"u"} ; |
| 50 | + |
| 51 | + $dir_analytics = "c:/MySQL/analytics" ; # EZ test only |
| 52 | + $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only |
| 53 | + |
| 54 | + if (($dir_analytics eq '') || ($dir_comscore_updates eq '')) |
| 55 | + { abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; } |
| 56 | + |
| 57 | + $file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ; |
| 58 | + $file_comscore_reach_update = "*reach*by*region*csv" ; |
| 59 | + $file_comscore_uv_region_master = "excel_out_comscore_UV_regions.csv" ; |
| 60 | + $file_comscore_uv_region_update = "*UVs*by*region*csv" ; |
| 61 | + $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ; |
| 62 | + $file_comscore_uv_property_update = "*UV*trend*csv" ; |
| 63 | + |
| 64 | + print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ; |
| 65 | + |
| 66 | + %region_codes = ( |
| 67 | + "Europe"=>"EU", |
| 68 | + "North America"=>"NA", |
| 69 | + "Latin America"=>"LA", |
| 70 | + "World-Wide" => "W", |
| 71 | + "Middle East - Africa" => "MA", |
| 72 | + "Asia Pacific"=> "AS", |
| 73 | + "India" => "I", |
| 74 | + "China" => "C" |
| 75 | + ) ; |
| 76 | + |
| 77 | + foreach $region_name (keys %region_codes) |
| 78 | + { $region_names {$region_codes {$region_name}} = $region_name ; } |
| 79 | + |
| 80 | + @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ; |
| 81 | + |
| 82 | + # &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f") ; |
| 83 | + # %reach_region_code = %data ; |
| 84 | + |
| 85 | + &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f") ; |
| 86 | + %visitors_region_code = %data ; |
| 87 | + |
| 88 | + exit ; |
| 89 | + &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f") ; |
| 90 | + %visitors_web_property = %data ; |
| 91 | + exit ; |
| 92 | + |
| 93 | + &WriteDataAnalytics ; |
| 94 | + |
| 95 | + print "\nReady\n\n" ; |
| 96 | + exit ; |
| 97 | + |
| 98 | +sub UpdateFromLatestComscoreData |
| 99 | +{ |
| 100 | + my ($file_comscore_master, $file_comscore_updates) = @_ ; |
| 101 | + |
| 102 | + if (! -e "$dir_analytics/$file_comscore_master") |
| 103 | + { abort ("File $file_comscore_master not found!") ; } |
| 104 | + |
| 105 | + $age_all = -M "$dir_analytics/$file_comscore_master" ; |
| 106 | + print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ; |
| 107 | + |
| 108 | + my $cwd = getcwd ; |
| 109 | + chdir $dir_comscore_updates ; |
| 110 | + |
| 111 | + @files = glob($file_comscore_updates) ; |
| 112 | + $min_age_upd = 999999 ; |
| 113 | + $file_comscore_updates_latest = '' ; |
| 114 | + foreach $file (@files) |
| 115 | + { |
| 116 | + $age = -M $file ; |
| 117 | + if ($age < $min_age_upd) |
| 118 | + { |
| 119 | + $min_age_upd = $age ; |
| 120 | + $file_comscore_updates_latest = $file ; |
| 121 | + } |
| 122 | + } |
| 123 | + print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ; |
| 124 | + |
| 125 | + if ($min_age_upd == 999999) |
| 126 | + { |
| 127 | + print "No valid update file found. Nothing to update." ; |
| 128 | + return ; |
| 129 | + } |
| 130 | + |
| 131 | + if ($age_all > $min_age_upd) |
| 132 | + { |
| 133 | + print "File with master data more recent than latest update csv from comScore. Nothing to update." ; |
| 134 | + return ; |
| 135 | + } |
| 136 | + |
| 137 | + my $updates_found = $false ; |
| 138 | + |
| 139 | + print "\nRead updates\n\n" ; |
| 140 | + open CSV, '<', $file_comscore_updates_latest ; |
| 141 | + while ($line = <CSV>) |
| 142 | + { |
| 143 | + chomp $line ; |
| 144 | + if ($line =~ /^Location.*?-.*?-.*?-/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' |
| 145 | + { |
| 146 | + ($dummy1,$dummy2,@months) = split (',', $line) ; |
| 147 | + @months = &mmm_yyyy2yyyy_mm (@months) ; |
| 148 | + } |
| 149 | + if ($line =~ /^\d,/) |
| 150 | + { |
| 151 | + ($index,$region,@data) = split (',', $line) ; |
| 152 | + $region =~ s/^\s+// ; |
| 153 | + $region =~ s/\s+$// ; |
| 154 | + $region_code = $region_codes {$region} ; |
| 155 | + |
| 156 | + for ($m = 0 ; $m <= $#months ; $m++) |
| 157 | + { |
| 158 | + $yyyymm = $months [$m] ; |
| 159 | + $months {$yyyymm} ++ ; |
| 160 | + $yyyymm_region_code = "$yyyymm,$region_code" ; |
| 161 | + $data = $data [$m] ; |
| 162 | + |
| 163 | + if (! defined $data {$yyyymm_region_code}) |
| 164 | + { |
| 165 | + $updates_found = $true ; |
| 166 | + print "New data found: $yyyymm_region_code = $data\n" ; |
| 167 | + $data {$yyyymm_region_code} = $data ; |
| 168 | + } |
| 169 | + } |
| 170 | + } |
| 171 | + } |
| 172 | + |
| 173 | + if (! $updates_found) |
| 174 | + { print "No new updates found.\n" ; } |
| 175 | + else |
| 176 | + { print "\nUpdates found, rewrite master file '$file_comscore_master'.\n\n" ; } |
| 177 | + |
| 178 | + return ($updates_found) ; |
| 179 | +} |
| 180 | + |
| 181 | +sub ReadDataReachPerRegion |
| 182 | +{ |
| 183 | + my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ; |
| 184 | + |
| 185 | + undef %months ; |
| 186 | + undef %data ; |
| 187 | + undef @regions ; |
| 188 | + |
| 189 | + open IN, '<', "$dir_analytics/$file_comscore_master" ; |
| 190 | + |
| 191 | + $lines = 0 ; |
| 192 | + while ($line = <IN>) |
| 193 | + { |
| 194 | + chomp $line ; |
| 195 | + |
| 196 | + ($yyyymm,@data) = split (',', $line) ; |
| 197 | + |
| 198 | + if ($lines++ == 0) |
| 199 | + { @regions = @data ; next ; } |
| 200 | + |
| 201 | + $field_ndx = 0 ; |
| 202 | + foreach (@data) |
| 203 | + { |
| 204 | + $region = $regions [$field_ndx] ; |
| 205 | + $region_code = $region_codes {$region} ; |
| 206 | + |
| 207 | + $data = $data [$field_ndx] ; |
| 208 | + if ($data eq '') |
| 209 | + { $data = '0' ; } |
| 210 | + $months {$yyyymm} ++ ; |
| 211 | + $data {"$yyyymm,$region_code"} = $data ; |
| 212 | + # print "Old data $yyyymm,$region_code = $data\n" ; |
| 213 | + $field_ndx++ ; |
| 214 | + } |
| 215 | + } |
| 216 | + close IN ; |
| 217 | + |
| 218 | + my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ; |
| 219 | + return if ! $updates_found ; |
| 220 | + |
| 221 | + rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
| 222 | + open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
| 223 | + |
| 224 | + $line_out = "yyyymm" ; |
| 225 | + foreach $region_name (@regions) |
| 226 | + { $line_out .= ",$region_name" ; } |
| 227 | + print OUT "$line_out" ; |
| 228 | + |
| 229 | + foreach $yyyymm (sort {$b cmp $a} keys %months) |
| 230 | + { |
| 231 | + $line_out = "\n$yyyymm" ; |
| 232 | + foreach $region_name (@regions) |
| 233 | + { |
| 234 | + $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; |
| 235 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; |
| 236 | + } |
| 237 | + print OUT "$line_out" ; |
| 238 | + } |
| 239 | + |
| 240 | + close OUT ; |
| 241 | +} |
| 242 | + |
| 243 | +sub ReadDataVisitorsPerRegion |
| 244 | +{ |
| 245 | + my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ; |
| 246 | + |
| 247 | + undef %months ; |
| 248 | + undef %data ; |
| 249 | + undef @regions ; |
| 250 | + |
| 251 | + open IN, '<', "$dir_analytics/$file_comscore_master" ; |
| 252 | + |
| 253 | + $lines = 0 ; |
| 254 | + $metric = 'unique_visitors' ; |
| 255 | + while ($line = <IN>) |
| 256 | + { |
| 257 | + chomp $line ; |
| 258 | + $line = &GetNumberOnly ($line) ; |
| 259 | + |
| 260 | + ($yyyymm,@data) = split (',', $line) ; |
| 261 | + |
| 262 | + if ($lines++ == 0) |
| 263 | + { @regions = @data ; next ; } |
| 264 | + |
| 265 | + $field_ndx = 0 ; |
| 266 | + foreach (@data) |
| 267 | + { |
| 268 | + $region = $regions [$field_ndx++] ; |
| 269 | + $region_code = $region_codes {$region} ; |
| 270 | + |
| 271 | + $data = $data [$field_ndx] ; |
| 272 | + if ($data eq '') |
| 273 | + { $data = '0' ; } |
| 274 | + |
| 275 | + $months {$yyyymm} ++ ; |
| 276 | + $data {"$yyyymm,$region_code"} = $data ; |
| 277 | + } |
| 278 | + } |
| 279 | + close IN ; |
| 280 | + |
| 281 | + my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ; |
| 282 | + return if ! $updates_found ; |
| 283 | + |
| 284 | + rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
| 285 | + open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
| 286 | + |
| 287 | + $line_out = "yyyymm" ; |
| 288 | + foreach $region_name (@regions) |
| 289 | + { $line_out .= ",$region_name" ; } |
| 290 | + print OUT "$line_out" ; |
| 291 | + |
| 292 | + foreach $yyyymm (sort {$b cmp $a} keys %months) |
| 293 | + { |
| 294 | + $line_out = "\n$yyyymm" ; |
| 295 | + foreach $region_name (@regions) |
| 296 | + { |
| 297 | + $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; |
| 298 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; |
| 299 | + } |
| 300 | + print OUT "$line_out" ; |
| 301 | + } |
| 302 | + |
| 303 | + close OUT ; |
| 304 | +} |
| 305 | + |
| 306 | +sub ReadDataVisitorsPerProperty |
| 307 | +{ |
| 308 | + my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ; |
| 309 | + |
| 310 | + undef %months ; |
| 311 | + undef %data ; |
| 312 | + |
| 313 | + open IN, '<', "$dir_analytics/$file_comscore_master" ; |
| 314 | + |
| 315 | + $lines = 0 ; |
| 316 | + $metric = 'unique_visitors' ; |
| 317 | + while ($line = <IN>) |
| 318 | + { |
| 319 | + chomp $line ; |
| 320 | + |
| 321 | + $line = &GetNumberOnly ($line) ; |
| 322 | + |
| 323 | + ($yyyymm,@data) = split (',', $line) ; |
| 324 | + if ($lines++ == 0) |
| 325 | + { @properties = @data ; next ; } |
| 326 | + |
| 327 | + $field_ndx = 0 ; |
| 328 | + foreach (@data) |
| 329 | + { |
| 330 | + $property = $properties [$field_ndx++] ; |
| 331 | + |
| 332 | + $data = $data [$field_ndx] ; |
| 333 | + if ($data eq '') |
| 334 | + { $data = '0' ; } |
| 335 | + |
| 336 | + $months {$yyyymm} ++ ; |
| 337 | + $data {"$yyyymm,$property"} = $data ; |
| 338 | + } |
| 339 | + } |
| 340 | + close IN ; |
| 341 | + |
| 342 | + my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ; |
| 343 | + return if ! $updates_found ; |
| 344 | + |
| 345 | + rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
| 346 | + open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
| 347 | + |
| 348 | + $line_out = "yyyymm" ; |
| 349 | + foreach $region_name (@regions) |
| 350 | + { $line_out .= ",$region_name" ; } |
| 351 | + print OUT "$line_out" ; |
| 352 | + |
| 353 | + foreach $yyyymm (sort {$b cmp $a} keys %months) |
| 354 | + { |
| 355 | + $line_out = "\n$yyyymm" ; |
| 356 | + foreach $region_name (@regions) |
| 357 | + { |
| 358 | + $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; |
| 359 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; |
| 360 | + } |
| 361 | + print OUT "$line_out" ; |
| 362 | + } |
| 363 | + |
| 364 | + close OUT ; |
| 365 | +} |
| 366 | + |
| 367 | +sub WriteDataAnalytics |
| 368 | +{ |
| 369 | + open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ; |
| 370 | + |
| 371 | + $metric = 'unique_visitors' ; |
| 372 | + foreach $yyyymm (sort keys %months) |
| 373 | + { |
| 374 | + # store meta data elsewhere |
| 375 | + # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ; |
| 376 | + foreach $region_code (sort values %region_codes) |
| 377 | + { |
| 378 | + $country_code = '-' ; |
| 379 | + $property = '-' ; |
| 380 | + $project = '-' ; |
| 381 | + $reach = $reach_region_code {"$yyyymm,$region_code"} ; |
| 382 | + $visitors = $visitors_region_code {"$yyyymm,$region_code"} ; |
| 383 | + |
| 384 | + if (! defined $reach) { $reach = -1 ; } |
| 385 | + if (! defined $visitors) { $reach = -1 ; } |
| 386 | + |
| 387 | + $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
| 388 | + print OUT $line ; |
| 389 | + print $line ; |
| 390 | + } |
| 391 | + |
| 392 | + foreach $property (sort @properties) |
| 393 | + { |
| 394 | + $country_code = '-' ; |
| 395 | + $region_code = '-' ; |
| 396 | + $project = '-' ; |
| 397 | + $reach = '-1' ; |
| 398 | + $visitors = $visitors_web_property {"$yyyymm,$property"} ; |
| 399 | + |
| 400 | + next if ! defined $visitors ; |
| 401 | + |
| 402 | + $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
| 403 | + print OUT $line ; |
| 404 | + print $line ; |
| 405 | + } |
| 406 | + } |
| 407 | +} |
| 408 | + |
| 409 | +sub GetNumberOnly |
| 410 | +{ |
| 411 | + my $line = shift ; |
| 412 | + $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove comma's inside double quotes |
| 413 | + $line =~ s/"//g ; |
| 414 | + return $line ; |
| 415 | +} |
| 416 | + |
| 417 | +sub mmm_yyyy2yyyy_mm |
| 418 | +{ |
| 419 | + my @months = @_ ; |
| 420 | + # Jan -> 01, etc |
| 421 | + foreach my $month (@months) |
| 422 | + { |
| 423 | + my ($mmm,$yyyy) = split ('-', $month) ; |
| 424 | + for ($m = 0 ; $m <= $#months_short ; $m++) |
| 425 | + { |
| 426 | + if ($mmm eq $months_short [$m]) |
| 427 | + { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; } |
| 428 | + } |
| 429 | + } |
| 430 | + return @months ; |
| 431 | +} |
| 432 | + |
| 433 | +sub abort |
| 434 | +{ |
| 435 | + $msg = shift ; |
| 436 | + |
| 437 | + print "\nAbort, reason: $msg\n\n" ; |
| 438 | + exit ; |
| 439 | +} |