r88643 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r88642‎ | r88643 | r88644 >
Date:15:00, 23 May 2011
Author:ezachte
Status:deferred
Tags:
Comment:
scripts and data files related to new 'analytics' MySQL database, which will serve high level aggregated WMF metrics through an API
Modified paths:
  • /trunk/wikistats/analytics (added) (history)
  • /trunk/wikistats/analytics/MySQLPrepComscoreData.pl (added) (history)

Diff [purge]

Index: trunk/wikistats/analytics/MySQLPrepComscoreData.pl
@@ -0,0 +1,438 @@
 2+#!/usr/bin/perl
 3+
 4+# Copyright (C) 2011 Wikimedia Foundation
 5+# This program is free software; you can redistribute it and/or
 6+# modify it under the terms of the GNU General Public License version 2
 7+# as published by the Free Software Foundation.
 8+# This program is distributed in the hope that it will be useful,
 9+# but WITHOUT ANY WARRANTY; without even the implied warranty of
 10+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 11+# See the GNU General Public License for more details, at
 12+# http://www.fsf.org/licenses/gpl.html
 13+
 14+# Author:
 15+# Erik Zachte, email ezachte@wikimedia.org
 16+
 17+# Functionality:
 18+# comScore data can be downloaded as csv file, which each contain 14 months history
 19+# This script uses these files to update 'master' csv files which contain all known history
 20+# Then it merges these master files into one csv file which can be loaded into analytics database
 21+# Data are: reach by region, unique visitors by region, unique visitors by web property
 22+
 23+# Parameters:
 24+# -m (required) folder with 'master' csv files (files with all known history)
 25+# -u (required) folder with 'update' csv files (files with lastest 14 months history, produced by comScore)
 26+
 27+# Output:
 28+# updated master csv files + merged and formatted csv for import in MySQL
 29+
 30+ use Getopt::Std ;
 31+ use Cwd;
 32+
 33+ my $options ;
 34+ getopt ("mu", \%options) ;
 35+
 36+ $true = 1 ;
 37+ $false = 0 ;
 38+
 39+ $script_name = "MySQLPrepComscoreData.pl" ;
 40+ $script_version = "0.3" ;
 41+
 42+# test
 43+# $source = "comscore" ;
 44+# $server = "ez_test" ;
 45+# $generated = "2011-05-06 00:00:00" ;
 46+# $user = "ezachte" ;
 47+
 48+ $dir_analytics = $options {"m"} ;
 49+ $dir_comscore_updates = $options {"u"} ;
 50+
 51+ $dir_analytics = "c:/MySQL/analytics" ; # EZ test only
 52+ $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
 53+
 54+ if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
 55+ { abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; }
 56+
 57+ $file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ;
 58+ $file_comscore_reach_update = "*reach*by*region*csv" ;
 59+ $file_comscore_uv_region_master = "excel_out_comscore_UV_regions.csv" ;
 60+ $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
 61+ $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
 62+ $file_comscore_uv_property_update = "*UV*trend*csv" ;
 63+
 64+ print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ;
 65+
 66+ %region_codes = (
 67+ "Europe"=>"EU",
 68+ "North America"=>"NA",
 69+ "Latin America"=>"LA",
 70+ "World-Wide" => "W",
 71+ "Middle East - Africa" => "MA",
 72+ "Asia Pacific"=> "AS",
 73+ "India" => "I",
 74+ "China" => "C"
 75+ ) ;
 76+
 77+ foreach $region_name (keys %region_codes)
 78+ { $region_names {$region_codes {$region_name}} = $region_name ; }
 79+
 80+ @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
 81+
 82+ # &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f") ;
 83+ # %reach_region_code = %data ;
 84+
 85+ &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f") ;
 86+ %visitors_region_code = %data ;
 87+
 88+ exit ;
 89+ &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f") ;
 90+ %visitors_web_property = %data ;
 91+ exit ;
 92+
 93+ &WriteDataAnalytics ;
 94+
 95+ print "\nReady\n\n" ;
 96+ exit ;
 97+
 98+sub UpdateFromLatestComscoreData
 99+{
 100+ my ($file_comscore_master, $file_comscore_updates) = @_ ;
 101+
 102+ if (! -e "$dir_analytics/$file_comscore_master")
 103+ { abort ("File $file_comscore_master not found!") ; }
 104+
 105+ $age_all = -M "$dir_analytics/$file_comscore_master" ;
 106+ print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;
 107+
 108+ my $cwd = getcwd ;
 109+ chdir $dir_comscore_updates ;
 110+
 111+ @files = glob($file_comscore_updates) ;
 112+ $min_age_upd = 999999 ;
 113+ $file_comscore_updates_latest = '' ;
 114+ foreach $file (@files)
 115+ {
 116+ $age = -M $file ;
 117+ if ($age < $min_age_upd)
 118+ {
 119+ $min_age_upd = $age ;
 120+ $file_comscore_updates_latest = $file ;
 121+ }
 122+ }
 123+ print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;
 124+
 125+ if ($min_age_upd == 999999)
 126+ {
 127+ print "No valid update file found. Nothing to update." ;
 128+ return ;
 129+ }
 130+
 131+ if ($age_all > $min_age_upd)
 132+ {
 133+ print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
 134+ return ;
 135+ }
 136+
 137+ my $updates_found = $false ;
 138+
 139+ print "\nRead updates\n\n" ;
 140+ open CSV, '<', $file_comscore_updates_latest ;
 141+ while ($line = <CSV>)
 142+ {
 143+ chomp $line ;
 144+ if ($line =~ /^Location.*?-.*?-.*?-/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
 145+ {
 146+ ($dummy1,$dummy2,@months) = split (',', $line) ;
 147+ @months = &mmm_yyyy2yyyy_mm (@months) ;
 148+ }
 149+ if ($line =~ /^\d,/)
 150+ {
 151+ ($index,$region,@data) = split (',', $line) ;
 152+ $region =~ s/^\s+// ;
 153+ $region =~ s/\s+$// ;
 154+ $region_code = $region_codes {$region} ;
 155+
 156+ for ($m = 0 ; $m <= $#months ; $m++)
 157+ {
 158+ $yyyymm = $months [$m] ;
 159+ $months {$yyyymm} ++ ;
 160+ $yyyymm_region_code = "$yyyymm,$region_code" ;
 161+ $data = $data [$m] ;
 162+
 163+ if (! defined $data {$yyyymm_region_code})
 164+ {
 165+ $updates_found = $true ;
 166+ print "New data found: $yyyymm_region_code = $data\n" ;
 167+ $data {$yyyymm_region_code} = $data ;
 168+ }
 169+ }
 170+ }
 171+ }
 172+
 173+ if (! $updates_found)
 174+ { print "No new updates found.\n" ; }
 175+ else
 176+ { print "\nUpdates found, rewrite master file '$file_comscore_master'.\n\n" ; }
 177+
 178+ return ($updates_found) ;
 179+}
 180+
 181+sub ReadDataReachPerRegion
 182+{
 183+ my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
 184+
 185+ undef %months ;
 186+ undef %data ;
 187+ undef @regions ;
 188+
 189+ open IN, '<', "$dir_analytics/$file_comscore_master" ;
 190+
 191+ $lines = 0 ;
 192+ while ($line = <IN>)
 193+ {
 194+ chomp $line ;
 195+
 196+ ($yyyymm,@data) = split (',', $line) ;
 197+
 198+ if ($lines++ == 0)
 199+ { @regions = @data ; next ; }
 200+
 201+ $field_ndx = 0 ;
 202+ foreach (@data)
 203+ {
 204+ $region = $regions [$field_ndx] ;
 205+ $region_code = $region_codes {$region} ;
 206+
 207+ $data = $data [$field_ndx] ;
 208+ if ($data eq '')
 209+ { $data = '0' ; }
 210+ $months {$yyyymm} ++ ;
 211+ $data {"$yyyymm,$region_code"} = $data ;
 212+ # print "Old data $yyyymm,$region_code = $data\n" ;
 213+ $field_ndx++ ;
 214+ }
 215+ }
 216+ close IN ;
 217+
 218+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ;
 219+ return if ! $updates_found ;
 220+
 221+ rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ;
 222+ open OUT, '>', "$dir_analytics/$file_comscore_master" ;
 223+
 224+ $line_out = "yyyymm" ;
 225+ foreach $region_name (@regions)
 226+ { $line_out .= ",$region_name" ; }
 227+ print OUT "$line_out" ;
 228+
 229+ foreach $yyyymm (sort {$b cmp $a} keys %months)
 230+ {
 231+ $line_out = "\n$yyyymm" ;
 232+ foreach $region_name (@regions)
 233+ {
 234+ $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
 235+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
 236+ }
 237+ print OUT "$line_out" ;
 238+ }
 239+
 240+ close OUT ;
 241+}
 242+
 243+sub ReadDataVisitorsPerRegion
 244+{
 245+ my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
 246+
 247+ undef %months ;
 248+ undef %data ;
 249+ undef @regions ;
 250+
 251+ open IN, '<', "$dir_analytics/$file_comscore_master" ;
 252+
 253+ $lines = 0 ;
 254+ $metric = 'unique_visitors' ;
 255+ while ($line = <IN>)
 256+ {
 257+ chomp $line ;
 258+ $line = &GetNumberOnly ($line) ;
 259+
 260+ ($yyyymm,@data) = split (',', $line) ;
 261+
 262+ if ($lines++ == 0)
 263+ { @regions = @data ; next ; }
 264+
 265+ $field_ndx = 0 ;
 266+ foreach (@data)
 267+ {
 268+ $region = $regions [$field_ndx++] ;
 269+ $region_code = $region_codes {$region} ;
 270+
 271+ $data = $data [$field_ndx] ;
 272+ if ($data eq '')
 273+ { $data = '0' ; }
 274+
 275+ $months {$yyyymm} ++ ;
 276+ $data {"$yyyymm,$region_code"} = $data ;
 277+ }
 278+ }
 279+ close IN ;
 280+
 281+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ;
 282+ return if ! $updates_found ;
 283+
 284+ rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ;
 285+ open OUT, '>', "$dir_analytics/$file_comscore_master" ;
 286+
 287+ $line_out = "yyyymm" ;
 288+ foreach $region_name (@regions)
 289+ { $line_out .= ",$region_name" ; }
 290+ print OUT "$line_out" ;
 291+
 292+ foreach $yyyymm (sort {$b cmp $a} keys %months)
 293+ {
 294+ $line_out = "\n$yyyymm" ;
 295+ foreach $region_name (@regions)
 296+ {
 297+ $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
 298+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
 299+ }
 300+ print OUT "$line_out" ;
 301+ }
 302+
 303+ close OUT ;
 304+}
 305+
 306+sub ReadDataVisitorsPerProperty
 307+{
 308+ my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
 309+
 310+ undef %months ;
 311+ undef %data ;
 312+
 313+ open IN, '<', "$dir_analytics/$file_comscore_master" ;
 314+
 315+ $lines = 0 ;
 316+ $metric = 'unique_visitors' ;
 317+ while ($line = <IN>)
 318+ {
 319+ chomp $line ;
 320+
 321+ $line = &GetNumberOnly ($line) ;
 322+
 323+ ($yyyymm,@data) = split (',', $line) ;
 324+ if ($lines++ == 0)
 325+ { @properties = @data ; next ; }
 326+
 327+ $field_ndx = 0 ;
 328+ foreach (@data)
 329+ {
 330+ $property = $properties [$field_ndx++] ;
 331+
 332+ $data = $data [$field_ndx] ;
 333+ if ($data eq '')
 334+ { $data = '0' ; }
 335+
 336+ $months {$yyyymm} ++ ;
 337+ $data {"$yyyymm,$property"} = $data ;
 338+ }
 339+ }
 340+ close IN ;
 341+
 342+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ;
 343+ return if ! $updates_found ;
 344+
 345+ rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ;
 346+ open OUT, '>', "$dir_analytics/$file_comscore_master" ;
 347+
 348+ $line_out = "yyyymm" ;
 349+ foreach $region_name (@regions)
 350+ { $line_out .= ",$region_name" ; }
 351+ print OUT "$line_out" ;
 352+
 353+ foreach $yyyymm (sort {$b cmp $a} keys %months)
 354+ {
 355+ $line_out = "\n$yyyymm" ;
 356+ foreach $region_name (@regions)
 357+ {
 358+ $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
 359+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
 360+ }
 361+ print OUT "$line_out" ;
 362+ }
 363+
 364+ close OUT ;
 365+}
 366+
 367+sub WriteDataAnalytics
 368+{
 369+ open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ;
 370+
 371+ $metric = 'unique_visitors' ;
 372+ foreach $yyyymm (sort keys %months)
 373+ {
 374+ # store meta data elsewhere
 375+ # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ;
 376+ foreach $region_code (sort values %region_codes)
 377+ {
 378+ $country_code = '-' ;
 379+ $property = '-' ;
 380+ $project = '-' ;
 381+ $reach = $reach_region_code {"$yyyymm,$region_code"} ;
 382+ $visitors = $visitors_region_code {"$yyyymm,$region_code"} ;
 383+
 384+ if (! defined $reach) { $reach = -1 ; }
 385+ if (! defined $visitors) { $reach = -1 ; }
 386+
 387+ $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
 388+ print OUT $line ;
 389+ print $line ;
 390+ }
 391+
 392+ foreach $property (sort @properties)
 393+ {
 394+ $country_code = '-' ;
 395+ $region_code = '-' ;
 396+ $project = '-' ;
 397+ $reach = '-1' ;
 398+ $visitors = $visitors_web_property {"$yyyymm,$property"} ;
 399+
 400+ next if ! defined $visitors ;
 401+
 402+ $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
 403+ print OUT $line ;
 404+ print $line ;
 405+ }
 406+ }
 407+}
 408+
 409+sub GetNumberOnly
 410+{
 411+ my $line = shift ;
 412+ $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove comma's inside double quotes
 413+ $line =~ s/"//g ;
 414+ return $line ;
 415+}
 416+
 417+sub mmm_yyyy2yyyy_mm
 418+{
 419+ my @months = @_ ;
 420+ # Jan -> 01, etc
 421+ foreach my $month (@months)
 422+ {
 423+ my ($mmm,$yyyy) = split ('-', $month) ;
 424+ for ($m = 0 ; $m <= $#months_short ; $m++)
 425+ {
 426+ if ($mmm eq $months_short [$m])
 427+ { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; }
 428+ }
 429+ }
 430+ return @months ;
 431+}
 432+
 433+sub abort
 434+{
 435+ $msg = shift ;
 436+
 437+ print "\nAbort, reason: $msg\n\n" ;
 438+ exit ;
 439+}

Status & tagging log