r105444 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r105443 | r105444 | r105445 >
Date:17:36, 7 December 2011
Author:diederik
Status:deferred
Tags:
Comment:
Moved reportcard import utils from trunk/wikistats/analytics to trunk/extensions/MetricsReporting
Modified paths:
  • /trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl (added) (history)
  • /trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl (added) (history)
  • /trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl (added) (history)
  • /trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl (added) (history)
  • /trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl (added) (history)
  • /trunk/extensions/MetricsReporting/import/_readme.txt (added) (history)
  • /trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt (added) (history)
  • /trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh (added) (history)
  • /trunk/extensions/MetricsReporting/import/analytics_new.sh (added) (history)
  • /trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt (added) (history)
  • /trunk/extensions/MetricsReporting/import/analytics_upd.sh (added) (history)

Diff [purge]

Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl
@@ -0,0 +1,124 @@
 2+#!/usr/local/bin/perl
 3+
 4+ use Getopt::Std ;
 5+
 6+ &ParseArguments ;
 7+
 8+ print "Write file '$file_csv_out'\n" ;
 9+ open CSV_OUT, '>', $file_csv_out ;
 10+
 11+ foreach $project (qw (wb wk wn wp wq ws wv wx))
 12+ { &ReadStatisticsPerBinariesExtension ($project) ; }
 13+
 14+ close CSV_OUT ;
 15+
 16+ print "\n\nReady\n\n" ;
 17+ exit ;
 18+
 19+sub ParseArguments
 20+{
 21+ my @options ;
 22+ getopt ("io", \%options) ;
 23+
 24+ die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ;
 25+ die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
 26+
 27+ $path_in = $options {"i"} ;
 28+ $path_out = $options {"o"} ;
 29+
 30+ die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
 31+ die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 32+
 33+ # tests only
 34+ # $path_in = "C:/@ Wikimedia/# Out Bayes" ;
 35+ # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
 36+
 37+ print "Input folder: $path_in\n" ;
 38+ print "Output folder: $path_out\n" ;
 39+ print "\n" ;
 40+
 41+ $file_csv_out = "$path_out/analytics_in_binaries.csv" ;
 42+}
 43+
 44+
 45+sub ReadStatisticsPerBinariesExtension
 46+{
 47+ my $project = shift ;
 48+ my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" ;
 49+ $yyyymm_hi = -1 ;
 50+
 51+ if (! -e $file_csv_in)
 52+ { die "Input file '$file_csv_in' not found" ; }
 53+
 54+
 55+ print "Read '$file_csv_in'\n" ;
 56+ open CSV_IN, '<', $file_csv_in ;
 57+
 58+ $language_prev = '' ;
 59+ while ($line = <CSV_IN>)
 60+ {
 61+ chomp $line ;
 62+ next if $line !~ /,.*?,/ ;
 63+
 64+ ($language,$date,$data) = split (',', $line, 3) ;
 65+
 66+ # for each wiki first line shows ext names, not counts
 67+ if ($date eq "00/0000")
 68+ {
 69+ if ($language_prev ne '')
 70+ { &WriteMonthlyData ($project, $language_prev) ; }
 71+ $language_prev = $language ;
 72+
 73+ undef %ext_name ;
 74+ undef %ext_ndx ;
 75+ undef %ext_cnt ;
 76+ undef %months ;
 77+
 78+ @exts = split (',', $data) ;
 79+ $ndx = 0 ;
 80+ foreach $ext (@exts)
 81+ {
 82+ $ext_name {$ndx} = $ext ;
 83+ $ext_ndx {$ext} = $ndx ;
 84+ $ndx ++ ;
 85+ }
 86+ next ;
 87+ }
 88+
 89+ ($month,$year) = split ('\/', $date) ;
 90+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 91+ if ($yyyymm gt $yyyymm_hi)
 92+ { $yyyymm_hi = $yyyymm ; }
 93+ $months {$yyyymm}++ ;
 94+
 95+ @counts = split (',', $data) ;
 96+ $ndx = 0 ;
 97+ foreach $count (@counts)
 98+ {
 99+ $ext_cnt {$yyyymm}{$ext_name {$ndx}} = $count ;
 100+ $ndx ++ ;
 101+ }
 102+ }
 103+ &WriteMonthlyData ($project, $language_prev) ;
 104+
 105+ close CSV_IN ;
 106+}
 107+
 108+sub WriteMonthlyData
 109+{
 110+ my ($project,$language) = @_ ;
 111+ # get sorted array of extensions, order by count for most recent month
 112+ %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ;
 113+ @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi {$a}} keys %ext_cnt_yyyymm_hi) ;
 114+
 115+ foreach $month (sort keys %months)
 116+ {
 117+ $ndx = 0 ;
 118+ foreach $ext (@ext_cnt_yyyymm_hi)
 119+ {
 120+ print CSV_OUT "$project,$language,$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
 121+ # print "$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
 122+ last if (++ $ndx > 25) ;
 123+ }
 124+ }
 125+}
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl
___________________________________________________________________
Added: svn:eol-style
1126 + native
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl
@@ -0,0 +1,162 @@
 2+#!/usr/bin/perl
 3+
 4+# This module prepares a csv file with language names for feed into the analytics database
 5+# The main work of collecting these names from different sources is done by
 6+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm
 7+# which is part of wikistats reporting phase and once each month updates local master csv files from all sources
 8+
 9+# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization
 10+# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle
 11+
 12+# Sources for language names:
 13+# - php files
 14+# - translatewiki
 15+# - English Wikipedia API (interwikilinks)
 16+
 17+# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations
 18+
 19+ use Getopt::Std ;
 20+
 21+ $true = 1 ;
 22+ $false = 0 ;
 23+
 24+ $max_language_name = 50 ; # truncate if longer
 25+
 26+ $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ;
 27+ $file_csv_language_names_wp = "LanguageNamesViaWpEnEdited.csv" ;
 28+ $file_csv_analytics_in = "analytics_in_language_names.csv" ;
 29+
 30+ $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting
 31+ $languages_filter = "de|en" ;
 32+ foreach $language (split '\|', $languages_filter)
 33+ { $include_target_language {$language} = $true ; }
 34+
 35+ &ParseArguments ;
 36+ &ReadCsvFiles ;
 37+ &WriteCsvFile ;
 38+
 39+
 40+# if ($language eq "id") # swap which file takes precedence
 41+
 42+ print "\nReady\n\n" ;
 43+ exit ;
 44+
 45+sub ParseArguments
 46+{
 47+ my (@options, $arguments) ;
 48+
 49+ getopt ("io", \%options) ;
 50+
 51+ foreach $arg (sort keys %options)
 52+ { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
 53+ print ("\nArguments\n$arguments\n") ;
 54+
 55+ $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test
 56+ $options {"o"} = "c:/MySQL/analytics" ; # EZ test
 57+
 58+ die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
 59+ die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
 60+
 61+ ($path_in = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
 62+ ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
 63+
 64+ die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
 65+ die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 66+
 67+ print "Input folder: $path_in\n" ;
 68+ print "Output folder: $path_out\n\n" ;
 69+
 70+ $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ;
 71+ $file_csv_language_names_wp = "$path_in/$file_csv_language_names_wp" ;
 72+ $file_csv_analytics_in = "$path_out/$file_csv_analytics_in" ;
 73+
 74+ die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ;
 75+ die "Input file '$file_csv_language_names_wp' not found" if (! -e $file_csv_language_names_wp) ;
 76+}
 77+
 78+sub ReadCsvFiles
 79+{
 80+ #first read definitions from php message files, then overwrite with definitions from interwiki links when available
 81+ # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable
 82+
 83+ open CSV_IN, "<", $file_csv_language_names_php ;
 84+ while ($line = <CSV_IN>)
 85+ {
 86+ chomp ($line) ;
 87+ ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
 88+
 89+ next if ! $include_target_language {$target_language} ;
 90+
 91+ $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
 92+ }
 93+ close CSV_IN ;
 94+
 95+ open CSV_IN, "<", $file_csv_language_names_wp ;
 96+ while ($line = <CSV_IN>)
 97+ {
 98+ chomp ($line) ;
 99+ ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
 100+
 101+ next if ! $include_target_language {$target_language} ;
 102+
 103+ next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ;
 104+
 105+ # $name_unicode_php = $out_languages {$target_language} {$code} ; # test only
 106+ # $name_unicode_wp = &FormatName ($target_language, $name_unicode) ; # test only
 107+ # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only
 108+ # { print "$name_unicode_php => $name_unicode_wp\n" ; } # test only
 109+
 110+ $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
 111+ }
 112+ close CSV_IN ;
 113+}
 114+
 115+sub FormatName
 116+{
 117+ my ($target_language, $name_unicode) = @_ ;
 118+
 119+ $name_unicode2 = $name_unicode ;
 120+
 121+ if ($target_language eq "de")
 122+ { $name_unicode =~ s/e?\s*\-?sprache//i ; }
 123+
 124+ if ($target_language =~ /^(?:$languages_force_case_uc)/)
 125+ { $name_unicode = ucfirst $name_unicode ; }
 126+ else
 127+ { $name_unicode = lc $name_unicode ; }
 128+
 129+# Test only
 130+# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2))
 131+# { print "$name_unicode2 => $name_unicode\n" ; }
 132+
 133+ return ($name_unicode) ;
 134+}
 135+
 136+sub WriteCsvFile
 137+{
 138+ open CSV_OUT, ">", $file_csv_analytics_in || die ("File '$file_csv_analytics_in' could not be opened") ;
 139+ binmode CSV_OUT ; # force Unix style linebreak \012
 140+
 141+ foreach $target_language (sort keys %out_languages)
 142+ {
 143+ print "\nTarget language '$target_language'\n" ;
 144+ %translations = %{$out_languages {$target_language}} ;
 145+
 146+ foreach $code (sort keys %translations)
 147+ {
 148+ $language_name = $translations{$code} ;
 149+ $language_name2 = substr ($language_name,0,$max_language_name) ;
 150+
 151+ if ($language_name ne $language_name2)
 152+ { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; }
 153+
 154+ if ($language_name2 =~ /,/)
 155+ { $language_name2 = "\"$language_name2\"" ; }
 156+ # test only
 157+ print CSV_OUT "$target_language,$code,$language_name2\n" ;
 158+ }
 159+ }
 160+
 161+ close CSV_OUT ;
 162+}
 163+
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl
___________________________________________________________________
Added: svn:eol-style
1164 + native
Index: trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt
@@ -0,0 +1,55 @@
 2+USE `analytics` ;
 3+
 4+TRUNCATE TABLE comscore ;
 5+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
 6+ INTO TABLE comscore
 7+ FIELDS TERMINATED BY ','
 8+ OPTIONALLY ENCLOSED BY '"'
 9+ (@date,country_code,region_code,web_property,project_code,reach,visitors)
 10+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 11+
 12+TRUNCATE TABLE comscore_regions ;
 13+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
 14+ INTO TABLE comscore_regions
 15+ FIELDS TERMINATED BY ','
 16+ OPTIONALLY ENCLOSED BY '"'
 17+ (report_language,region_code,region_name) ;
 18+
 19+TRUNCATE TABLE wikistats ;
 20+LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
 21+ INTO TABLE wikistats
 22+ FIELDS TERMINATED BY ','
 23+ OPTIONALLY ENCLOSED BY '"'
 24+ (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 25+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 26+
 27+TRUNCATE TABLE page_views ;
 28+LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
 29+ INTO TABLE page_views
 30+ FIELDS TERMINATED BY ','
 31+ OPTIONALLY ENCLOSED BY '"'
 32+ (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
 33+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 34+
 35+TRUNCATE TABLE language_names ;
 36+LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
 37+ INTO TABLE language_names
 38+ FIELDS TERMINATED BY ','
 39+ OPTIONALLY ENCLOSED BY '"'
 40+ (report_language,language_code,language_name) ;
 41+
 42+TRUNCATE TABLE binaries ;
 43+LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv'
 44+ INTO TABLE binaries
 45+ FIELDS TERMINATED BY ','
 46+ OPTIONALLY ENCLOSED BY '"'
 47+ (project_code,language_code,@date,extension,binaries)
 48+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 49+
 50+TRUNCATE TABLE offline ;
 51+LOAD DATA LOCAL INFILE 'analytics_in_offline.csv'
 52+ INTO TABLE offline
 53+ FIELDS TERMINATED BY ','
 54+ OPTIONALLY ENCLOSED BY '"'
 55+ (@date, readers)
 56+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
Property changes on: trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt
___________________________________________________________________
Added: svn:eol-style
157 + native
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl
@@ -0,0 +1,11 @@
 2+#!/usr/bin/perl
 3+
 4+# This file is just a place holder (alias)
 5+# Page views per day/week/month/etc, per project, per language, normalized and not normalized are collected by
 6+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl
 7+#
 8+# Added May 2011:
 9+# For analytics database one file is written for all projects and languages combined,
 10+# with per month, not normalized and normalized page view counts in one row.
 11+# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl
 12+# and needs to be copied to analytics folder later
\ No newline at end of file
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl
___________________________________________________________________
Added: svn:eol-style
113 + native
Index: trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt
@@ -0,0 +1,181 @@
 2+-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite)
 3+
 4+-- tables implemented:
 5+-- comscore
 6+-- comscore_regions
 7+-- wikistats
 8+-- page_views
 9+-- language names
 10+-- binaries
 11+
 12+-- more tables planned (O= optional, not needed for report card stage)
 13+-- project_names
 14+-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
 15+-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
 16+
 17+
 18+-- open issues:
 19+-- only store basic data in database and calculate all aggregates on the fly or do some aggregation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==)
 20+-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month)
 21+
 22+-- Create database and two tables from scratch
 23+DROP DATABASE IF EXISTS `analytics` ;
 24+
 25+CREATE DATABASE `analytics` ;
 26+
 27+USE `analytics` ;
 28+
 29+CREATE TABLE `comscore` (
 30+ `date` date NOT NULL,
 31+ `country_code` varchar (3),
 32+ `region_code` varchar (3),
 33+ `web_property` varchar (20),
 34+ `project_code` varchar (10),
 35+ `reach` decimal (4,1) DEFAULT NULL,
 36+ `visitors` decimal (15) DEFAULT NULL,
 37+ PRIMARY KEY (date,country_code,region_code,project_code,web_property),
 38+ KEY (`country_code`)
 39+) ;
 40+
 41+CREATE TABLE `comscore_regions` (
 42+ `region_code` varchar (2),
 43+ `report_language` varchar (10),
 44+ `region_name` varchar (18),
 45+ PRIMARY KEY (region_code,report_language)
 46+) ;
 47+
 48+CREATE TABLE `wikistats` (
 49+ `date` date NOT NULL,
 50+ `project_code` varchar (10),
 51+ `language_code` varchar (15),
 52+ `editors_all_time` int (10) DEFAULT NULL,
 53+ `editors_new` int (7) DEFAULT NULL,
 54+ `editors_ge_5` int (7) DEFAULT NULL,
 55+ `editors_ge_25` int (7) DEFAULT NULL,
 56+ `editors_ge_100` int (7) DEFAULT NULL,
 57+ `articles` int (12) DEFAULT NULL,
 58+ `articles_new_per_day` int (9) DEFAULT NULL,
 59+ `articles_over_bytes_500` int (12) DEFAULT NULL,
 60+ `articles_over_bytes_2000` int (12) DEFAULT NULL,
 61+ `edits_per_article` decimal (9,1) DEFAULT NULL,
 62+ `bytes_per_article` decimal (9,1) DEFAULT NULL,
 63+ `edits` int (12) DEFAULT NULL,
 64+ `size_in_bytes` int (15) DEFAULT NULL,
 65+ `size_in_words` int (15) DEFAULT NULL,
 66+ `links_internal` int (15) DEFAULT NULL,
 67+ `links_interwiki` int (15) DEFAULT NULL,
 68+ `links_image` int (15) DEFAULT NULL,
 69+ `links_external` int (15) DEFAULT NULL,
 70+ `redirects` int (15) DEFAULT NULL,
 71+ PRIMARY KEY (date,project_code,language_code)
 72+) ;
 73+
 74+CREATE TABLE `page_views` (
 75+ `date` date NOT NULL,
 76+ `project_code` char (2),
 77+ `language_code` char (15),
 78+ `views_non_mobile_raw` bigint (15),
 79+ `views_mobile_raw` bigint (15),
 80+ `views_non_mobile_normalized` bigint (15),
 81+ `views_mobile_normalized` bigint (15),
 82+ `views_raw` bigint (15),
 83+ `views_normalized` bigint (15),
 84+ PRIMARY KEY (date,project_code,language_code),
 85+ KEY (date,language_code,project_code)
 86+) ;
 87+
 88+CREATE TABLE `language_names` (
 89+ `report_language` varchar (15),
 90+ `language_code` varchar (15),
 91+ `language_name` varchar (50),
 92+ PRIMARY KEY (report_language,language_code)
 93+) ;
 94+
 95+CREATE TABLE `binaries` (
 96+ `date` date NOT NULL,
 97+ `project_code` char (2),
 98+ `language_code` char (15),
 99+ `extension` varchar (10),
 100+ `binaries` bigint (15),
 101+ PRIMARY KEY (date,project_code,language_code,extension)
 102+) ;
 103+
 104+CREATE TABLE `offline` (
 105+ `date` date NOT NULL,
 106+ `source` varchar (255),
 107+ `readers` bigint (12),
 108+ PRIMARY KEY (date,source)
 109+) ;
 110+
 111+-- SHOW TABLES ;
 112+-- DESCRIBE comscore ;
 113+-- DESCRIBE comscore_regions ;
 114+-- DESCRIBE wikistats ;
 115+-- DESCRIBE page_views ;
 116+-- DESCRIBE language_names ;
 117+-- DESCRIBE binaries ;
 118+-- DESCRIBE offline ;
 119+
 120+-- Database Manipulation
 121+-- Obviously in real world this is a separate script
 122+
 123+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
 124+ INTO TABLE comscore
 125+ FIELDS TERMINATED BY ','
 126+ OPTIONALLY ENCLOSED BY '"'
 127+ (@date,country_code,region_code,web_property,project_code,reach,visitors)
 128+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 129+
 130+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
 131+ INTO TABLE comscore_regions
 132+ FIELDS TERMINATED BY ','
 133+ OPTIONALLY ENCLOSED BY '"'
 134+ (report_language,region_code,region_name) ;
 135+
 136+LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
 137+ INTO TABLE wikistats
 138+ FIELDS TERMINATED BY ','
 139+ OPTIONALLY ENCLOSED BY '"'
 140+ (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 141+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 142+
 143+LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
 144+ INTO TABLE page_views
 145+ FIELDS TERMINATED BY ','
 146+ OPTIONALLY ENCLOSED BY '"'
 147+ (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
 148+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 149+
 150+
 151+LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
 152+ INTO TABLE language_names
 153+ FIELDS TERMINATED BY ','
 154+ OPTIONALLY ENCLOSED BY '"'
 155+ (report_language,language_code,language_name) ;
 156+
 157+LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv'
 158+ INTO TABLE binaries
 159+ FIELDS TERMINATED BY ','
 160+ OPTIONALLY ENCLOSED BY '"'
 161+ (project_code,language_code,@date,extension,binaries)
 162+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 163+
 164+LOAD DATA LOCAL INFILE 'analytics_in_offline.csv'
 165+ INTO TABLE offline
 166+ FIELDS TERMINATED BY ','
 167+ OPTIONALLY ENCLOSED BY '"'
 168+ (@date,readers)
 169+ SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
 170+
 171+
 172+-- show contents (debugging only)
 173+-- SELECT * FROM comscore ;
 174+-- SELECT * FROM comscore_regions ;
 175+-- SELECT * FROM wikistats ;
 176+-- SELECT * FROM page_views ;
 177+-- SELECT * FROM language_names ;
 178+-- SELECT * FROM binaries
 179+-- WHERE project_code = 'commons' ;
 180+ SELECT * FROM offline ;
 181+
 182+
Property changes on: trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt
___________________________________________________________________
Added: svn:eol-style
1183 + native
Index: trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh
@@ -0,0 +1,47 @@
 2+#!/bin/sh
 3+
 4+# Prepare several csv files, ready for importing into analytics database
 5+# All generated files have _in_ in name signalling these contain data ready for importing into database
 6+# One input record corresponds to one database record
 7+
 8+ulimit -v 8000000
 9+
 10+clear
 11+cd /a/analytics
 12+
 13+
 14+# AnalyticsPrepBinariesData.pl read counts for binaries which were generated by wikistats
 15+# and which reside in /a/wikistats/csv_[project code]/StatisticsPerBinariesExtension.csv
 16+# It filters and reorganizes data and produces analytics_in_binaries.csv
 17+# Output csv contains: project code, language, month, extension name, count
 18+
 19+perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/
 20+
 21+# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore csv files (with data for last 14 months)
 22+# parses those csv files, adds/replaces data from these csv files into master files (containing full history)
 23+# and generates input csv file analytics_in_comscore.csv ready for importing into database
 24+#
 25+# note : these csv files were manually downloaded from http://mymetrix.comscore.com/app/report.aspx
 26+# and given more descriptive names, script finds newest files based on partial name search
 27+#
 28+# -r replace (default is add only)
 29+# -i input folder, contains manually downloaded csv files from comScore (or xls files manually converted to csv)
 30+# -m master files with full history
 31+# -o output csv file, with reach per region, UV's per region and UV's per top web property, ready for import into database
 32+
 33+perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics
 34+
 35+# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv files from wikistats process
 36+# It filters and reorganizes data and produces analytics_in_wikistats.csv, ready for import into analytics database
 37+
 38+perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics
 39+
 40+# analytics_in_page_views.csv is written daily as part of WikiCountsSummarizeProjectCounts.pl
 41+# part of (/home/ezachte/pageviews_monthly.sh job)
 42+# which processes hourly projectcounts files (per wiki page view totals for one hour) from http://dammit.lt/wikistats
 43+# and generates several files on different aggregation levels
 44+# only action here is to copy data to this folder to have everything in one place
 45+# note: unlike folder name suggests this file contains stats for all projects
 46+
 47+cp /a/wikistats/csv_wp/analytics_in_page_views.csv .
 48+
Property changes on: trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh
___________________________________________________________________
Added: svn:eol-style
149 + native
Index: trunk/extensions/MetricsReporting/import/analytics_upd.sh
@@ -0,0 +1,6 @@
 2+clear
 3+cd /a/analytics
 4+
 5+./analytics_generate_csv_files.sh
 6+
 7+mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt
Property changes on: trunk/extensions/MetricsReporting/import/analytics_upd.sh
___________________________________________________________________
Added: svn:eol-style
18 + native
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl
@@ -0,0 +1,334 @@
 2+#!/usr/local/bin/perl
 3+
 4+# Copyright (C) 2011 Wikimedia Foundation
 5+# This program is free software; you can redistribute it and/or
 6+# modify it under the terms of the GNU General Public License version 2
 7+# as published by the Free Software Foundation.
 8+# This program is distributed in the hope that it will be useful,
 9+# but WITHOUT ANY WARRANTY; without even the implied warranty of
 10+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 11+# See the GNU General Public License for more details, at
 12+# http://www.fsf.org/licenses/gpl.html
 13+
 14+# Author:
 15+# Erik Zachte, email ezachte@wikimedia.org
 16+# loosely based on predecessor
 17+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
 18+
 19+# Functionality:
 20+# tba
 21+
 22+# Parameters:
 23+# tba
 24+
 25+# Output:
 26+# updated csv file for import in MySQL
 27+
 28+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
 29+
 30+ use Getopt::Std ;
 31+
 32+ $true = 1 ;
 33+ $false = 0 ;
 34+
 35+ @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
 36+
 37+ $file_csv_monthly_data = "StatisticsMonthly.csv" ;
 38+ $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
 39+ $file_csv_analytics_in = "analytics_in_wikistats.csv" ;
 40+
 41+ &ParseArguments ;
 42+ &ReadStatisticsMonthly ;
 43+ &FindLargestWikis ;
 44+ &WriteMonthlyData ;
 45+
 46+ print "\nReady\n\n" ;
 47+ exit ;
 48+
 49+sub ParseArguments
 50+{
 51+ print "ParseArguments\n" ;
 52+ my (@options, $arguments) ;
 53+
 54+ getopt ("io", \%options) ;
 55+
 56+ foreach $arg (sort keys %options)
 57+ { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
 58+ print ("\nArguments\n$arguments\n") ;
 59+
 60+ if (! -d '/mnt/') # EZ test
 61+ {
 62+ $path_in = "c:/\@ wikimedia/# out bayes" ;
 63+ $path_out = "c:/MySQL/analytics" ;
 64+ }
 65+ else
 66+ {
 67+ die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
 68+ die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
 69+
 70+ $path_in = $options {"i"} ;
 71+ $path_out = $options {"o"} ;
 72+ }
 73+
 74+ die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
 75+ die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 76+
 77+ print "Input folder: $path_in\n" ;
 78+ print "Output folder: $path_out\n\n" ;
 79+
 80+ $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
 81+}
 82+
 83+sub ReadStatisticsMonthly
 84+{
 85+ print "ReadStatisticsMonthly\n" ;
 86+ &ReadStatisticsMonthlyForProject ("wb") ;
 87+ &ReadStatisticsMonthlyForProject ("wk") ;
 88+ &ReadStatisticsMonthlyForProject ("wn") ;
 89+ &ReadStatisticsMonthlyForProject ("wp") ;
 90+ &ReadStatisticsMonthlyForProject ("wq") ;
 91+ &ReadStatisticsMonthlyForProject ("ws") ;
 92+ &ReadStatisticsMonthlyForProject ("wv") ;
 93+ &ReadStatisticsMonthlyForProject ("wx") ;
 94+
 95+# &ReadStatisticsPerBinariesExtensionCommons ;
 96+}
 97+
 98+sub ReadStatisticsMonthlyForProject
 99+{
 100+ my $project = shift;
 101+ $all_projects = "*" ;
 102+
 103+ my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
 104+ my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
 105+
 106+ if (! -e $file_csv_in_1)
 107+ { &Abort ("Input file '$file_csv_in_1' not found") ; }
 108+ if (! -e $file_csv_in_2)
 109+ { &Abort ("Input file '$file_csv_in_2' not found") ; }
 110+
 111+ my $yyyymm ;
 112+
 113+ print "Read '$file_csv_in_1'\n" ;
 114+ open CSV_IN, '<', $file_csv_in_1 ;
 115+ while ($line = <CSV_IN>)
 116+ {
 117+ chomp $line ;
 118+ ($language,$date,$counts) = split (',', $line, 3) ;
 119+ @fields = split (',', $counts) ;
 120+
 121+ next if ! &AcceptWiki ($project,$language) ;
 122+
 123+ ($month,$day,$year) = split ('\/', $date) ;
 124+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 125+
 126+ foreach $field (@fields)
 127+ {
 128+ if ($field eq '-')
 129+ { $field = 0 ; }
 130+ }
 131+
 132+ $data = $fields [0] . ',' . # contributors all time
 133+ $fields [1] . ',' . # new contributors
 134+ 'data2,' . # place holder for more data, to be inserted later
 135+ $fields [4] . ',' . # articles
 136+ $fields [6] . ',' . # articles new per day
 137+ $fields [9] . ',' . # larger than 0.5 kB
 138+ $fields [10] . ',' . # larger than 2.0 kB
 139+ $fields [7] . ',' . # mean edits per article
 140+ $fields [8] . ',' . # mean bytes per article
 141+ $fields [11] . ',' . # edits
 142+ $fields [12] . ',' . # size in bytes
 143+ $fields [13] . ',' . # size in words
 144+ $fields [14] . ',' . # links internal
 145+ $fields [15] . ',' . # links interwiki
 146+ $fields [16] . ',' . # links images
 147+ $fields [17] . ',' . # links external
 148+ $fields [18] ; # redirects
 149+
 150+ $data1 {"$project,$language,$yyyymm"} = $data ;
 151+ }
 152+ close CSV_IN ;
 153+
 154+ # now read (very) active editors from newer more accurate file (split data for reg users and bots, unlike StatisticsMonthly.csv)
 155+
 156+ print "Read '$file_csv_in_2'\n" ;
 157+ open CSV_IN, '<', $file_csv_in_2 ;
 158+ while ($line = <CSV_IN>)
 159+ {
 160+ chomp $line ;
 161+ ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
 162+
 163+ next if ! &AcceptWiki ($project,$language) ;
 164+
 165+ if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot
 166+ if ($group ne "A") { next ; } # A: articles, T: talk pages, O: other namespaces
 167+
 168+ ($month,$day,$year) = split ('\/', $date) ;
 169+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 170+ $months {$yyyymm} ++ ;
 171+# print "YYYYMM $yyyymm\n" ;
 172+
 173+ # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm
 174+ # count user with over x edits
 175+ # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 1000xSQRT(10), etc
 176+ # @thresholds = (1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000) ;
 177+ $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ;
 178+ $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ;
 179+ $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
 180+ $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
 181+
 182+ $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ;
 183+ $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ;
 184+ $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ;
 185+
 186+ # prep string with right amount of comma's
 187+ if ($data2_default eq '')
 188+ {
 189+ $data2_default = $data2 {"$project,$language,$yyyymm"} ;
 190+ $data2_default =~ s/[^,]+/0/g ;
 191+ }
 192+ }
 193+ close CSV_IN ;
 194+}
 195+
 196+#sub ReadStatisticsPerBinariesExtensionCommons
 197+#{
 198+# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
 199+# my $mmax = -1 ;
 200+
 201+# if (! -e $file_csv_in)
 202+# { &Abort ("Input file '$file_csv_in' not found") ; }
 203+
 204+# print "Read '$file_csv_in'\n" ;
 205+# open CSV_IN, '<', $file_csv_in ;
 206+# while ($line = <CSV_IN>)
 207+# {
 208+# chomp $line ;
 209+# ($language,$date,$counts) = split (',', $line, 3) ;
 210+
 211+# if ($language ne "commons") { next ; }
 212+
 213+# if ($date eq "00/0000")
 214+# {
 215+# @fields = split (',', $counts) ;
 216+# $field_ndx = 0 ;
 217+# foreach $field (@fields)
 218+# {
 219+# $ext_cnt {-1}{$field_ndx} = $field ;
 220+# # print "EXT_CNT $field_ndx : $field\n" ;
 221+# $field_ndx ++ ;
 222+# }
 223+# next ;
 224+# }
 225+
 226+# ($month,$year) = split ('\/', $date) ;
 227+# my $m = &months_since_2000_01 ($year,$month) ;
 228+# next if $m < $m_start ;
 229+
 230+# if ($m > $mmax)
 231+# { $mmax = $m ; }
 232+
 233+# @fields = split (',', $counts) ;
 234+# $field_ndx = 0 ;
 235+# foreach $field (@fields)
 236+# {
 237+# $ext_cnt {$m}{$field_ndx} = $field ;
 238+# $ext_tot {$m} += $field ;
 239+# $field_ndx ++ ;
 240+# }
 241+# }
 242+# close CSV_IN ;
 243+
 244+# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
 245+# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys %ext_cnt_mmax) ;
 246+
 247+# $extcnt = 0 ;
 248+# foreach $extndx (@ext_cnt_mmax)
 249+# {
 250+# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax {$extndx}}\n" ;
 251+# push @extndxs, $extndx ;
 252+# if ($extcnt++ >= 9) { last ; }
 253+# }
 254+#}
 255+
sub FindLargestWikis
{
# Rank wikis by accumulated very active editors (>= 100 edits), remember the
# top ranked ones in %largest_projects, then zero %months entries for any
# month from 2011 onwards for which one of these large wikis has no editor
# data yet (so incomplete recent months are skipped downstream).
  print "FindLargestWikis\n" ;
  print "Largest projects (most accumulated very active editors):\n";

  @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ;

  $rank = 0 ;
  foreach $project_language (@total_edits_ge_100)
  {
    $largest_projects {$project_language} = $rank ++ ;
    print "$project_language," ;
    last if $rank > 10 ;
  }
  print "\n\n" ;

  foreach $yyyymm (sort keys %months)
  {
    next if $yyyymm lt '2011' ;

    foreach $project_language (keys %largest_projects)
    {
      ($project,$language) = split (',', $project_language) ;
      next if $data2 {"$project,$language,$yyyymm"} ne '' ;

      print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ;
      $months {$yyyymm} = 0 ;
    }
  }
}
 284+
sub WriteMonthlyData
{
# Write the merged wikistats csv, one row per (project,wiki,month).
# Reads globals %data1 (core metrics with a 'data2' placeholder) and %data2
# (editor activity counts), both filled while reading the input csv files
# earlier in this script; months flagged 0 in %months are skipped.
  print "WriteMonthlyData\n" ;
  my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
  open CSV_OUT, '>', $file_csv_out ;
  foreach $project_wiki_month (sort keys %data1)
  {
    ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ;

    # recent month misses one or more large wikis? (zeroed in FindLargestWikis)
    next if $months {$yyyymm} == 0 ;

    $data1 = $data1 {$project_wiki_month} ;
    $data2 = $data2 {$project_wiki_month} ;
    if ($data2 eq '')
    {
      print "Editor data missing for $project_wiki_month\n" ;
      $data2 = $data2_default ;  # comma string of zeroes, prepped while reading
    }
    $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together
    print CSV_OUT "$project_wiki_month,$data1\n" ;
  }
  # NOTE(review): the three lines below sit OUTSIDE the loop above and use
  # leftover values of $project, $yyyymm and $edits_ge_5/25/100 from earlier
  # subs -- they look misplaced (probably meant to run per record elsewhere);
  # verify before relying on the "$project,*,$yyyymm" totals.
  $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ;
  $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ;
  $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ;
  close CSV_OUT ;
}
 312+
sub AcceptWiki
{
# Decide whether stats for this (project,language) wiki enter the feed.
# Returns $true to accept, $false to skip.
  my ($project,$language) = @_ ;

  # commons also occurs in wikipedia csv files (bug, hard to cleanup) -> only count under wx
  if ($project ne 'wx' and $language eq 'commons')
  { return $false ; }

  # ignore insane bot spam on sr wikinews
  if ($project eq 'wn' and $language eq 'sr')
  { return $false ; }

  # internal / organisational / test wikis
  if ($language =~ /mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i)
  { return $false ; }

  # dk = dumps exist(ed?) but site not, tlh = Klingon, ru_sib = Siberian
  if ($language =~ /^(?:dk|tlh|ru_sib)$/)
  { return $false ; }

  if ($project eq 'wk' and ($language eq "als" or $language eq "tlh"))
  { return $false ; }

  return $true ;
}
 326+
sub Abort
{
# Print a fatal error message and terminate the script.
  my $msg = shift ;
  print "$msg\nExecution aborted." ;
  # to do: log also to file
  exit ;
}
 334+
 335+
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl
___________________________________________________________________
Added: svn:eol-style
1336 + native
Index: trunk/extensions/MetricsReporting/import/analytics_new.sh
@@ -0,0 +1,5 @@
clear
# Abort if the working directory is missing instead of running the load elsewhere.
cd /a/analytics || exit 1
# rm *test*.csv
# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt
# SECURITY(review): the password below is passed on the command line, so it is
# visible in `ps` output and shell history; prefer a ~/.my.cnf [client] section
# or mysql_config_editor. Left unchanged pending credential rotation.
 mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt
Property changes on: trunk/extensions/MetricsReporting/import/analytics_new.sh
___________________________________________________________________
Added: svn:eol-style
17 + native
Index: trunk/extensions/MetricsReporting/import/_readme.txt
@@ -0,0 +1,21 @@
This folder contains bash and Perl files that create and fill the 'analytics' database, used among others for the monthly dashboard.
 3+
 4+== analytics_new.sh ==
 5+Defines the database and tables and loads data from existing csv files.
 6+It executes SQL from analytics_create_and_load_from_csv.txt
 7+
 8+== analytics_upd.sh ==
 9+Prepares new csv files (delegated to analytics_generate_csv_files.sh),
 10+and empties/reloads all tables for which csv files are in this folder.
 11+It executes SQL from analytics_refresh_from_csv.txt
 12+
 13+== CSV files ==
 14+CSV files and where they are generated:
 15+
 16+analytics_in_binaries.csv <- AnalyticsPrepBinariesData.pl
 17+analytics_in_comscore.csv <- AnalyticsPrepComscoreData.pl
 18+analytics_in_comscore_regions.csv <- manual
 19+analytics_in_language_names.csv
 20+analytics_in_offline.csv <- manual
 21+analytics_in_page_views.csv <- /home/ezachte/wikistats/pageviews_monthly.sh (file copy after that)
 22+analytics_in_wikistats.csv <- AnalyticsPrepWikiCountsOutput.pl
Property changes on: trunk/extensions/MetricsReporting/import/_readme.txt
___________________________________________________________________
Added: svn:eol-style
123 + native
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl
@@ -0,0 +1,564 @@
 2+#!/usr/bin/perl
 3+
 4+# Copyright (C) 2011 Wikimedia Foundation
 5+# This program is free software; you can redistribute it and/or
 6+# modify it under the terms of the GNU General Public License version 2
 7+# as published by the Free Software Foundation.
 8+# This program is distributed in the hope that it will be useful,
 9+# but WITHOUT ANY WARRANTY; without even the implied warranty of
 10+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 11+# See the GNU General Public License for more details, at
 12+# http://www.fsf.org/licenses/gpl.html
 13+
 14+# Author:
 15+# Erik Zachte, email ezachte@wikimedia.org
 16+
 17+# Functionality:
# comScore data can be downloaded as csv files, each of which contains 14 months of history
 19+# This script uses these files to update 'master' csv files which contain all known history
 20+# Note: only entities which are already in master file will be updated!
 21+# Then it merges these master files into one csv file which can be loaded into analytics database
 22+# Data are: reach by region, unique visitors by region, unique visitors by web property
 23+
# Parameters:
# -i (required) folder with 'update' csv files (files with latest 14 months history, produced by comScore)
# -m (required) folder with 'master' csv files (files with all known history)
# -o (required) folder for the merged output csv (database feed)
# -r (optional) replace existing months in the master files instead of only adding missing ones
 27+
 28+# Output:
 29+# updated master csv files + merged and formatted csv for import in MySQL
 30+
 31+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
 32+
 33+ use Getopt::Std ;
 34+ use Cwd;
 35+
 36+ my $options ;
 37+ getopt ("imo", \%options) ;
 38+
 39+ $true = 1 ;
 40+ $false = 0 ;
 41+
 42+ $script_name = "AnalyticsPrepComscoreData.pl" ;
 43+ $script_version = "0.31" ;
 44+
 45+# EZ test only
 46+# $source = "comscore" ;
 47+# $server = "ez_test" ;
 48+# $generated = "2011-05-06 00:00:00" ;
 49+# $user = "ezachte" ;
 50+
 51+ $dir_in = $options {"i"} ;
 52+ $dir_upd = $options {"m"} ;
 53+ $dir_out = $options {"o"} ;
 54+ $mode = 'add';
 55+ if (defined $options {'r'})
 56+ { $mode = 'replace'; }
 57+
 58+ print "Mode is $mode (specify '-r' for replace)\n\n";
 59+
 60+ if (! -d "/home/") # EZ test machine
 61+ {
 62+ $dir_in = "C:/@ Wikimedia/@ Report Card/Data" ;
 63+ $dir_upd = "C:/MySQL/analytics" ;
 64+ $dir_out = "C:/MySQL/analytics" ;
 65+ $mode = 'replace' ;
 66+ }
 67+
 68+ if ($dir_in eq '')
 69+ { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; }
 70+ if ($dir_upd eq '')
 71+ { Abort ("Specify folder for master files (full history) as '-m folder'") ; }
 72+ if ($dir_out eq '')
 73+ { Abort ("Specify folder for output file '-o folder'") ; }
 74+
 75+ $file_comscore_reach_master = "history_comscore_reach_regions.csv" ;
 76+ $file_comscore_reach_update = "*reach*by*region*csv" ;
 77+ $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ;
 78+ $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
 79+ $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
 80+ $file_comscore_uv_property_update = "*UV*trend*csv" ;
 81+
 82+ $layout_csv_reach = 1 ;
 83+ $layout_csv_regions = 2 ;
 84+ $layout_csv_properties = 3 ;
 85+
 86+ print "Directories:\n" .
 87+ "Input (new comScore data): '$dir_in'\n".
 88+ "Master files (full history): '$dir_upd'\n" .
 89+ "Output (database feed): '$dir_out'\n\n" ;
 90+
 91+ %region_codes = (
 92+ "Europe"=>"EU",
 93+ "North America"=>"NA",
 94+ "Latin America"=>"LA",
 95+ "World-Wide" => "W",
 96+ "Middle East - Africa" => "MA",
 97+ "Asia Pacific"=> "AS",
 98+ "United States" => "US",
 99+ "India" => "I",
 100+ "China" => "C"
 101+ ) ;
 102+
 103+ foreach $region_name (keys %region_codes)
 104+ { $region_names {$region_codes {$region_name}} = $region_name ; }
 105+
 106+ @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
 107+
 108+ &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
 109+ %reach_region_code = %data ;
 110+
 111+ &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
 112+ %visitors_region_code = %data ;
 113+
 114+ &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
 115+ %visitors_web_property = %data ;
 116+
 117+ &WriteDataAnalytics ;
 118+
 119+ print "\nReady\n\n" ;
 120+ exit ;
 121+
sub ReadMasterComscoreDataReachPerRegion
{
# Read the master csv with reach per region (full history) into %data/%months,
# merge in the latest comScore update file, and if anything changed rewrite
# the master file (previous version kept as "<file>.~"), newest month first.
#
# FIX: the caller passes 5 arguments (master, update glob, precision,
# multiplier, layout id) but the original signature unpacked only 4, dropping
# the layout id and using the multiplier (1) in its place -- which only worked
# because $layout_csv_reach happens to be 1 as well. The multiplier is now
# accepted and forwarded instead of a hard-coded 1.
  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;

  print "ReadMasterComscoreDataReachPerRegion\n\n" ;

  undef %months ;
  undef %data ;
  undef @regions ;

  open IN, '<', "$dir_upd/$file_comscore_master" ;

  $lines = 0 ;
  while ($line = <IN>)
  {
    chomp $line ;
    $line =~ s/\r//g ;

    ($yyyymm,@data) = split (',', $line) ;

    if ($lines++ == 0)
    {
      # header row lists the region names, in column order
      @regions = @data ;
      print "Regions found: " . (join ',', @regions) . "\n";
      next ;
    }

    $field_ndx = 0 ;
    foreach (@data)
    {
      $region = $regions [$field_ndx] ;
      $region_code = $region_codes {$region} ;

      $data = $data [$field_ndx] ;
      if ($data eq '')
      { $data = '0' ; }
      $months {$yyyymm} ++ ;
      $data {"$yyyymm,$region_code"} = $data ;
      # print "Old data $yyyymm,$region_code = $data\n" ;
      $field_ndx++ ;
    }
  }
  close IN ;

  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
  return if ! $updates_found ;

  # keep previous master as backup, then rewrite with merged data
  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
  open OUT, '>', "$dir_upd/$file_comscore_master" ;

  $line_out = "yyyymm" ;
  foreach $region_name (@regions)
  { $line_out .= ",$region_name" ; }
  print OUT "$line_out" ;

  foreach $yyyymm (sort {$b cmp $a} keys %months)
  {
    $line_out = "\n$yyyymm" ;
    foreach $region_name (@regions)
    {
      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
    }
    print OUT "$line_out" ;
  }

  close OUT ;
}
 190+
sub ReadMasterComscoreDataVisitorsPerRegion
{
# Read the master csv with unique visitors per region (full history) into
# %data/%months, merge in the latest comScore update file, and if anything
# changed rewrite the master file (previous version kept as "<file>.~").
#
# FIX: the $multiplier parameter was accepted but a hard-coded 1000 was
# passed downstream; the parameter (1000 at the call site, so behavior is
# unchanged) is now forwarded instead.
  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;

  print "ReadMasterComscoreDataVisitorsPerRegion\n\n";

  undef %months ;
  undef %data ;
  undef @regions ;

  open IN, '<', "$dir_upd/$file_comscore_master" ;

  $lines = 0 ;
  $metric = 'unique_visitors' ;
  while ($line = <IN>)
  {
    chomp $line ;
    $line =~ s/\r//g ;
    $line = &GetNumberOnly ($line) ;

    # only the header row ('yyyymm,...') and data rows ('2011-05,...') matter
    next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ;

    ($yyyymm,@data) = split (',', $line) ;

    if ($lines++ == 0)
    {
      # header row lists the region names, in column order
      @regions = @data ;
      print "Regions found: " . (join ',', @regions) . "\n";
      next ;
    }

    $field_ndx = 0 ;
    foreach (@data)
    {
      $region = $regions [$field_ndx] ;
      $region_code = $region_codes {$region} ;

      $data = $data [$field_ndx] ;
      if ($data eq '')
      { $data = '0' ; }

      # print "Old data $yyyymm,$region = $data\n" ;

      $months {$yyyymm} ++ ;
      $data {"$yyyymm,$region_code"} = $data ;

      $field_ndx++ ;
    }
  }
  close IN ;

  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
  return if ! $updates_found ;

  # keep previous master as backup, then rewrite with merged data
  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
  open OUT, '>', "$dir_upd/$file_comscore_master" ;

  $line_out = "yyyymm" ;
  foreach $region_name (@regions)
  { $line_out .= ",$region_name" ; }
  print OUT "$line_out" ;

  foreach $yyyymm (sort {$b cmp $a} keys %months)
  {
    $line_out = "\n$yyyymm" ;
    foreach $region_name (@regions)
    {
      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
    }
    print OUT "$line_out" ;
  }

  close OUT ;
}
 266+
sub ReadMasterComscoreDataVisitorsPerProperty
{
# Read the master csv with unique visitors per web property (full history)
# into %data/%months, merge in the latest comScore update file, and if
# anything changed rewrite the master file (previous version kept as
# "<file>.~").
#
# FIX: the $multiplier parameter was accepted but a hard-coded 1000 was
# passed downstream; the parameter (1000 at the call site, so behavior is
# unchanged) is now forwarded instead.
  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;

  print "ReadMasterComscoreDataVisitorsPerProperty\n\n";

  undef %months ;
  undef %data ;
  undef @properties ;

  open IN, '<', "$dir_upd/$file_comscore_master" ;

  $lines = 0 ;
  $metric = 'unique_visitors' ;
  while ($line = <IN>)
  {
    chomp $line ;
    $line =~ s/\r//g ;

    ($yyyymm,@data) = split (',', $line) ;
    if ($lines++ == 0)
    { @properties = @data ; next ; } # header row lists the property names

    $field_ndx = 0 ;
    foreach (@data)
    {
      $property = $properties [$field_ndx] ;
      $property =~ s/.*Yahoo.*/Yahoo/ ; # normalize comScore's verbose Yahoo label
      $data = $data [$field_ndx] ;
      if ($data eq '')
      { $data = '0' ; }

      # print "Old data $yyyymm,$property = $data\n" ;

      $months {$yyyymm} ++ ;
      $data {"$yyyymm,$property"} = $data ;

      $field_ndx++ ;
    }
  }
  close IN ;

  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @properties) ;
  return if ! $updates_found ;

  # keep previous master as backup, then rewrite with merged data
  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
  open OUT, '>', "$dir_upd/$file_comscore_master" ;

  $line_out = "yyyymm" ;
  foreach $property (@properties)
  { $line_out .= ",$property" ; }
  print OUT "$line_out" ;

  foreach $yyyymm (sort {$b cmp $a} keys %months)
  {
    $line_out = "\n$yyyymm" ;
    foreach $property (@properties)
    {
      $yyyymm_property = "$yyyymm,$property" ;
      $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
    }
    print OUT "$line_out" ;
  }

  close OUT ;
}
 333+
sub UpdateMasterFileFromRecentComscoreData
{
# Merge the most recent comScore update csv into the in-memory master data
# (%data/%months, filled by the caller).
# Arguments: master file name (for messages / existence check), glob pattern
# for update files, multiplier applied to update values (callers pass 1000
# for the UV files, 1 for reach), csv layout id, white list of entities to
# accept. Entities not on the white list are only reported.
# Returns true if any data was added ('add' mode fills only missing months,
# 'replace' mode overwrites unconditionally, per global $mode).
  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ;

  print "UpdateMasterFileFromRecentComscoreData\n\n";

  undef %white_list ;
  undef %not_white_listed ;

  print "White list: ". (join (',', @white_list)) . "\n\n";

  foreach $id (@white_list)
  { $white_list {$id} = $true ; }

  if (! -e "$dir_upd/$file_comscore_master")
  { Abort ("File $file_comscore_master not found!") ; }

  $age_all = -M "$dir_upd/$file_comscore_master" ;
  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;

  my $cwd = getcwd ;
  chdir $dir_in ; # glob and open below use paths relative to the input folder

  # pick the newest file matching the update pattern
  @files = glob($file_comscore_updates) ;
  $min_age_upd = 999999 ;
  $file_comscore_updates_latest = '' ;
  foreach $file (@files)
  {
    $age = -M $file ;
    if ($age < $min_age_upd)
    {
      $min_age_upd = $age ;
      $file_comscore_updates_latest = $file ;
    }
  }
  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;

  if ($min_age_upd == 999999)
  {
    print "No valid update file found. Nothing to update." ;
    chdir $cwd ; # FIX: restore working directory ($cwd was captured but never used)
    return ;
  }

  #if ($age_all > $min_age_upd)
  #{
  #  print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
  #  return ;
  #}

  my $updates_found = $false ;

  open CSV, '<', $file_comscore_updates_latest ;
  binmode CSV ;
  while ($line = <CSV>)
  {
    chomp $line ;
    $line =~ s/\r//g ;
    $line = &GetNumberOnly ($line) ;

    # header line, e.g. 'Location,Location,Jan-2010,Feb-2010,...' -> collect months
    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/)
    {
      if ($layout_csv == $layout_csv_properties)
      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
      else
      { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files

      @months = &mmm_yyyy2yyyy_mm (@months) ;
    }

    # data line: numeric row index, or the 'Total Internet' summary row
    if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/))
    {
      if ($layout_csv == $layout_csv_properties)
      {
        ($index,$dummy,$property,@data) = split (',', $line) ;
        $property =~ s/^\s+// ;
        $property =~ s/\s+$// ;

        # normalize comScore's verbose property names to short stable labels
        $property =~ s/.*Total Internet.*/Total Internet/i ;
        $property =~ s/.*Google.*/Google/i ;
        $property =~ s/.*Microsoft.*/Microsoft/i ;
        $property =~ s/.*FACEBOOK.*/Facebook/i ;
        $property =~ s/.*Yahoo.*/Yahoo/i ;
        $property =~ s/.*Amazon.*/Amazon/i ;
        $property =~ s/.*Apple.*/Apple/i ;
        $property =~ s/.*AOL.*/AOL/i ;
        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
        $property =~ s/.*Tencent.*/Tencent/i ;
        $property =~ s/.*Baidu.*/Baidu/i ;
        $property =~ s/.*CBS.*/CBS/i ;

        if (! $white_list {$property})
        {
          $not_white_listed {$property}++ ;
          next ;
        }

        $id = $property ;
      }
      else
      {
        ($index,$region,@data) = split (',', $line) ;
        $region =~ s/^\s+// ;
        $region =~ s/\s+$// ;

        if (! $white_list {$region})
        {
          $not_white_listed {$region}++ ;
          next ;
        }

        $id = $region_codes {$region} ;
      }

      for ($m = 0 ; $m <= $#months ; $m++)
      {
        $yyyymm = $months [$m] ;
        $months {$yyyymm} ++ ;
        $yyyymm_id = "$yyyymm,$id" ;
        $data = $data [$m] * $multiplier ;

        if ($mode eq 'add')
        {
          # 'add' mode: only fill months not yet in the master data
          if (! defined $data {$yyyymm_id})
          {
            $updates_found = $true ;
            print "New data found: $yyyymm_id = $data\n" ;
            $data {$yyyymm_id} = $data ;
          }
        }
        else
        {
          # 'replace' mode: overwrite unconditionally
          $updates_found = $true ;
          print "Data found: $yyyymm_id = $data\n" ;
          $data {$yyyymm_id} = $data ;
        }
      }
    }
  }
  close CSV ; # FIX: handle was left open
  chdir $cwd ; # FIX: restore working directory for subsequent file operations

  $entities_not_white_listed = join (', ', sort keys %not_white_listed) ;
  if ($entities_not_white_listed ne '')
  { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; }

  if (! $updates_found)
  { print "No new updates found\n" ; }
  else
  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; }

  return ($updates_found) ;
}
 484+
sub WriteDataAnalytics
{
# Combine the three merged comScore data sets (%reach_region_code,
# %visitors_region_code, %visitors_web_property) into one csv for the
# analytics database: yyyymm,country,region,property,project,reach,visitors.
# '-' marks a dimension that does not apply, -1 a metric with no data.
# @properties is the header of the last property csv read.
  print "WriteDataAnalytics\n\n";

  open OUT, '>', "$dir_out/analytics_in_comscore.csv" ;

  $metric = 'unique_visitors' ; # NOTE(review): set but not used below; leftover from the meta data line (see comment)
  foreach $yyyymm (sort keys %months)
  {
    # store meta data elsewhere
    # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ;

    # one line per region: reach and unique visitors
    foreach $region_code (sort values %region_codes)
    {
      $country_code = '-' ;
      $property = '-' ;
      $project = '-' ;
      $reach = $reach_region_code {"$yyyymm,$region_code"} ;
      $visitors = $visitors_region_code {"$yyyymm,$region_code"} ;

      if (! defined $reach) { $reach = -1 ; }
      if (! defined $visitors) { $visitors = -1 ; }

      next if $reach == -1 and $visitors == -1 ;

      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
      print OUT $line ;
      print $line ;
    }

    # one line per web property: unique visitors only
    foreach $property (sort @properties)
    {
      $country_code = '-' ;
      $region_code = '-' ;
      $project = '-' ;
      $reach = '-1' ;
      $visitors = $visitors_web_property {"$yyyymm,$property"} ;

      next if ! defined $visitors ;

      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
      print OUT $line ;
      # print $line ;
    }
  }
  # NOTE(review): OUT is never closed explicitly; output is flushed at exit
}
 530+
sub GetNumberOnly
{
# Strip thousands separators that comScore embeds inside quoted csv fields,
# then drop the quotes themselves, e.g. '"1,234",x' -> '1234,x'.
  my $line = shift ;
  $line =~ s{("[^"]+")}{ my $field = $1 ; $field =~ s/,//g ; $field }ge ;
  $line =~ tr/"//d ;
  return $line ;
}
 538+
sub mmm_yyyy2yyyy_mm
{
# Convert comScore column headers like 'Jan-2010' to sortable '2010-01'.
# Relies on global @months_short; unrecognized values pass through unchanged.
  my @months = @_ ;
  foreach my $month (@months)  # loop var aliases the element, so edits stick
  {
    my ($mmm,$yyyy) = split ('-', $month) ;
    foreach my $i (0 .. $#months_short)
    {
      next if $mmm ne $months_short [$i] ;
      $month = sprintf ("%s-%02d", $yyyy, $i+1) ;
      last ;
    }
  }
  return @months ;
}
 558+
sub Abort
{
# Report a fatal problem and stop the script.
  my ($reason) = @_ ;

  print "\nAbort, reason: $reason\n\n" ;
  exit ;
}
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl
___________________________________________________________________
Added: svn:eol-style
1566 + native

Status & tagging log