r88649 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r88648‎ | r88649 | r88650 >
Date:16:50, 23 May 2011
Author:ezachte
Status:deferred
Tags:
Comment:
perl script ready for integration test + added test file
Modified paths:
  • /trunk/wikistats/analytics/MySQLPrepComscoreData.pl (modified) (history)
  • /trunk/wikistats/analytics/create_and_use_db_analytics.txt (added) (history)

Diff [purge]

Index: trunk/wikistats/analytics/MySQLPrepComscoreData.pl
@@ -16,6 +16,7 @@
1717 # Functionality:
1818 # comScore data can be downloaded as csv file, which each contain 14 months history
1919 # This script uses these files to update 'master' csv files which contain all known history
 20+# Note: only entities which are already in master file will be updated!
2021 # Then it merges these master files into one csv file which can be loaded into analytics database
2122 # Data are: reach by region, unique visitors by region, unique visitors by web property
2223
@@ -26,6 +27,8 @@
2728 # Output:
2829 # updated master csv files + merged and formatted csv for import in MySQL
2930
 31+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
 32+
3033 use Getopt::Std ;
3134 use Cwd;
3235
@@ -38,7 +41,7 @@
3942 $script_name = "MySQLPrepComscoreData.pl" ;
4043 $script_version = "0.3" ;
4144
42 -# test
 45+# EZ test only
4346 # $source = "comscore" ;
4447 # $server = "ez_test" ;
4548 # $generated = "2011-05-06 00:00:00" ;
@@ -60,6 +63,10 @@
6164 $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
6265 $file_comscore_uv_property_update = "*UV*trend*csv" ;
6366
 67+ $layout_csv_reach = 1 ;
 68+ $layout_csv_regions = 2 ;
 69+ $layout_csv_properties = 3 ;
 70+
6471 print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ;
6572
6673 %region_codes = (
@@ -69,6 +76,7 @@
7077 "World-Wide" => "W",
7178 "Middle East - Africa" => "MA",
7279 "Asia Pacific"=> "AS",
 80+ "United States" => "US",
7381 "India" => "I",
7482 "China" => "C"
7583 ) ;
@@ -78,16 +86,14 @@
7987
8088 @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
8189
82 - # &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f") ;
83 - # %reach_region_code = %data ;
 90+ &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
 91+ %reach_region_code = %data ;
8492
85 - &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f") ;
 93+ &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
8694 %visitors_region_code = %data ;
8795
88 - exit ;
89 - &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f") ;
 96+ &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
9097 %visitors_web_property = %data ;
91 - exit ;
9298
9399 &WriteDataAnalytics ;
94100
@@ -96,8 +102,14 @@
97103
98104 sub UpdateFromLatestComscoreData
99105 {
100 - my ($file_comscore_master, $file_comscore_updates) = @_ ;
 106+ my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @update_only) = @_ ;
101107
 108+ undef %update_only ;
 109+ undef %do_not_update ;
 110+
 111+ foreach $id (@update_only)
 112+ { $update_only {$id} = $true ; }
 113+
102114 if (! -e "$dir_analytics/$file_comscore_master")
103115 { abort ("File $file_comscore_master not found!") ; }
104116
@@ -140,46 +152,85 @@
141153 while ($line = <CSV>)
142154 {
143155 chomp $line ;
144 - if ($line =~ /^Location.*?-.*?-.*?-/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
 156+ $line = &GetNumberOnly ($line) ;
 157+
 158+ if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
145159 {
146 - ($dummy1,$dummy2,@months) = split (',', $line) ;
 160+ if ($layout_csv == $layout_csv_properties)
 161+ { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
 162+ else
 163+ { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files
 164+
147165 @months = &mmm_yyyy2yyyy_mm (@months) ;
148166 }
149 - if ($line =~ /^\d,/)
 167+
 168+ if ($line =~ /^\d+,/)
150169 {
151 - ($index,$region,@data) = split (',', $line) ;
152 - $region =~ s/^\s+// ;
153 - $region =~ s/\s+$// ;
154 - $region_code = $region_codes {$region} ;
 170+ if ($layout_csv == $layout_csv_properties)
 171+ {
 172+ ($index,$dummy,$property,@data) = split (',', $line) ;
 173+ $property =~ s/^\s+// ;
 174+ $property =~ s/\s+$// ;
155175
 176+ $property =~ s/.*Google.*/Google/i ;
 177+ $property =~ s/.*Microsoft.*/Microsoft/i ;
 178+ $property =~ s/.*FACEBOOK.*/Facebook/i ;
 179+ $property =~ s/.*Yahoo.*/Yahoo/i ;
 180+ $property =~ s/.*Amazon.*/Amazon/i ;
 181+ $property =~ s/.*Apple.*/Apple/i ;
 182+ $property =~ s/.*AOL.*/AOL/i ;
 183+ $property =~ s/.*Wikimedia.*/Wikimedia/i ;
 184+ $property =~ s/.*Tencent.*/Tencent/i ;
 185+ $property =~ s/.*Baidu.*/Baidu/i ;
 186+ $property =~ s/.*CBS.*/CBS/i ;
 187+
 188+ $id = $property ;
 189+ }
 190+ else
 191+ {
 192+ ($index,$region,@data) = split (',', $line) ;
 193+ $region =~ s/^\s+// ;
 194+ $region =~ s/\s+$// ;
 195+ $id = $region_codes {$region} ;
 196+ }
 197+
 198+ if ($update_only {$id} == 0)
 199+ {
 200+ $do_not_update {$id}++ ;
 201+ next ;
 202+ }
 203+
156204 for ($m = 0 ; $m <= $#months ; $m++)
157205 {
158206 $yyyymm = $months [$m] ;
159207 $months {$yyyymm} ++ ;
160 - $yyyymm_region_code = "$yyyymm,$region_code" ;
161 - $data = $data [$m] ;
 208+ $yyyymm_id = "$yyyymm,$id" ;
 209+ $data = $data [$m] * $multiplier ;
162210
163 - if (! defined $data {$yyyymm_region_code})
 211+ if (! defined $data {$yyyymm_id})
164212 {
165213 $updates_found = $true ;
166 - print "New data found: $yyyymm_region_code = $data\n" ;
167 - $data {$yyyymm_region_code} = $data ;
 214+ print "New data found: $yyyymm_id = $data\n" ;
 215+ $data {$yyyymm_id} = $data ;
168216 }
169217 }
170218 }
171219 }
172220
 221+ $ignored = join ', ', sort keys %do_not_update ;
 222+ print "\nEntities ignored:\n$ignored\n\n" ;
 223+
173224 if (! $updates_found)
174 - { print "No new updates found.\n" ; }
 225+ { print "No new updates found\n" ; }
175226 else
176 - { print "\nUpdates found, rewrite master file '$file_comscore_master'.\n\n" ; }
 227+ { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; }
177228
178229 return ($updates_found) ;
179230 }
180231
181232 sub ReadDataReachPerRegion
182233 {
183 - my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
 234+ my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ;
184235
185236 undef %months ;
186237 undef %data ;
@@ -214,7 +265,7 @@
215266 }
216267 close IN ;
217268
218 - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ;
 269+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ;
219270 return if ! $updates_found ;
220271
221272 rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ;
@@ -241,7 +292,7 @@
242293
243294 sub ReadDataVisitorsPerRegion
244295 {
245 - my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
 296+ my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
246297
247298 undef %months ;
248299 undef %data ;
@@ -264,20 +315,24 @@
265316 $field_ndx = 0 ;
266317 foreach (@data)
267318 {
268 - $region = $regions [$field_ndx++] ;
 319+ $region = $regions [$field_ndx] ;
269320 $region_code = $region_codes {$region} ;
270321
271322 $data = $data [$field_ndx] ;
272323 if ($data eq '')
273324 { $data = '0' ; }
274325
 326+ # print "Old data $yyyymm,$region = $data\n" ;
 327+
275328 $months {$yyyymm} ++ ;
276329 $data {"$yyyymm,$region_code"} = $data ;
 330+
 331+ $field_ndx++ ;
277332 }
278333 }
279334 close IN ;
280335
281 - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ;
 336+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ;
282337 return if ! $updates_found ;
283338
284339 rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ;
@@ -304,10 +359,11 @@
305360
306361 sub ReadDataVisitorsPerProperty
307362 {
308 - my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
 363+ my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
309364
310365 undef %months ;
311366 undef %data ;
 367+ undef @properties ;
312368
313369 open IN, '<', "$dir_analytics/$file_comscore_master" ;
314370
@@ -317,8 +373,6 @@
318374 {
319375 chomp $line ;
320376
321 - $line = &GetNumberOnly ($line) ;
322 -
323377 ($yyyymm,@data) = split (',', $line) ;
324378 if ($lines++ == 0)
325379 { @properties = @data ; next ; }
@@ -326,36 +380,40 @@
327381 $field_ndx = 0 ;
328382 foreach (@data)
329383 {
330 - $property = $properties [$field_ndx++] ;
331 -
 384+ $property = $properties [$field_ndx] ;
 385+ $property =~ s/.*Yahoo.*/Yahoo/ ;
332386 $data = $data [$field_ndx] ;
333387 if ($data eq '')
334388 { $data = '0' ; }
335389
 390+ # print "Old data $yyyymm,$property = $data\n" ;
 391+
336392 $months {$yyyymm} ++ ;
337393 $data {"$yyyymm,$property"} = $data ;
 394+
 395+ $field_ndx++ ;
338396 }
339397 }
340398 close IN ;
341399
342 - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates) ;
 400+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ;
343401 return if ! $updates_found ;
344402
345403 rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ;
346404 open OUT, '>', "$dir_analytics/$file_comscore_master" ;
347405
348406 $line_out = "yyyymm" ;
349 - foreach $region_name (@regions)
350 - { $line_out .= ",$region_name" ; }
 407+ foreach $property (@properties)
 408+ { $line_out .= ",$property" ; }
351409 print OUT "$line_out" ;
352410
353411 foreach $yyyymm (sort {$b cmp $a} keys %months)
354412 {
355413 $line_out = "\n$yyyymm" ;
356 - foreach $region_name (@regions)
 414+ foreach $property (@properties)
357415 {
358 - $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
359 - $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
 416+ $yyyymm_property = "$yyyymm,$property" ;
 417+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
360418 }
361419 print OUT "$line_out" ;
362420 }
@@ -367,7 +425,7 @@
368426 {
369427 open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ;
370428
371 - $metric = 'unique_visitors' ;
 429+ $metric = 'unique_visitors' ;
372430 foreach $yyyymm (sort keys %months)
373431 {
374432 # store meta data elsewhere
@@ -400,7 +458,7 @@
401459
402460 $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
403461 print OUT $line ;
404 - print $line ;
 462+ # print $line ;
405463 }
406464 }
407465 }
Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt
@@ -0,0 +1,77 @@
 2+-- invoke this file with "mysql --user=root --password=[...] < create_and_use_db_analytics.txt"
 3+
 4+/* Create database and two tables from scratch */
 5+
 6+DROP DATABASE IF EXISTS `analytics` ;
 7+CREATE DATABASE `analytics` ;
 8+USE `analytics` ;
 9+
 10+CREATE TABLE `comscore` (
 11+-- meta data (mostly for auditing, may not be sent over API on default)
 12+/*
 13+ store meta dat elsewhere,tbd
 14+ `id` int auto_increment NOT NULL,
 15+ `generated` timestamp,
 16+ `source` char (20),
 17+ `server` char (20),
 18+ `script_name` char (30),
 19+ `script_version` char (8),
 20+ `user` char (20),
 21+*/
 22+-- analytics data
 23+ `yyyymm` char (7),
 24+ `country_code` char (3),
 25+ `region_code` char (2),
 26+ `property` char (20),
 27+ `project` char (10),
 28+ `reach` decimal (4,1) DEFAULT '-1',
 29+ `visitors` decimal (15) DEFAULT '-1',
 30+ PRIMARY KEY (yyyymm,country_code,region_code,property)
 31+) ;
 32+
 33+CREATE TABLE `comscore_regions` (
 34+ `region_code` char (2),
 35+ `language_code` char (10),
 36+ `region_name` char (18),
 37+ PRIMARY KEY (language_code,region_code)
 38+) ;
 39+
 40+-- SHOW TABLES ;
 41+-- DESCRIBE comscore ;
 42+-- DESCRIBE comscore_regions ;
 43+
 44+/* Database Manipulation */
 45+/* Obviously in real world this is a separate script */
 46+
 47+
 48+-- show contents (debugging only)
 49+-- SELECT *
 50+-- FROM comscore_regions ;
 51+
 52+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
 53+ INTO TABLE comscore_regions
 54+ FIELDS TERMINATED BY ','
 55+ OPTIONALLY ENCLOSED BY '"'
 56+ (language_code,region_code,region_name) ;
 57+
 58+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv'
 59+ INTO TABLE comscore
 60+ FIELDS TERMINATED BY ','
 61+ OPTIONALLY ENCLOSED BY '"'
 62+ (yyyymm,country_code,region_code,property,project,reach,visitors) ;
 63+
 64+-- show contents (debugging only)
 65+ SELECT *
 66+ FROM comscore
 67+ ORDER BY yyyymm,country_code,region_code,property,project
 68+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_full_table.csv'
 69+ FIELDS TERMINATED BY ',' ;
 70+
 71+-- export all relevant non-meta data from comsCore's reach by region (and comScore treats India and China as regions in this context)
 72+SELECT yyyymm,region_name,reach
 73+ FROM comscore LEFT JOIN comscore_regions ON comscore.region_code = comscore_regions.region_code AND comscore_regions.language_code = 'en'
 74+ WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
 75+ ORDER BY yyyymm,region_name
 76+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv'
 77+ FIELDS TERMINATED BY ',' ;
 78+
Property changes on: trunk/wikistats/analytics/create_and_use_db_analytics.txt
___________________________________________________________________
Added: svn:eol-style
179 + native

Status & tagging log