Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl |
— | — | @@ -0,0 +1,124 @@ |
| 2 | +#!/usr/local/bin/perl |
| 3 | + |
| 4 | + use Getopt::Std ; |
| 5 | + |
| 6 | + &ParseArguments ; |
| 7 | + |
| 8 | + print "Write file '$file_csv_out'\n" ; |
| 9 | + open CSV_OUT, '>', $file_csv_out ; |
| 10 | + |
| 11 | + foreach $project (qw (wb wk wn wp wq ws wv wx)) |
| 12 | + { &ReadStatisticsPerBinariesExtension ($project) ; } |
| 13 | + |
| 14 | + close CSV_OUT ; |
| 15 | + |
| 16 | + print "\n\nReady\n\n" ; |
| 17 | + exit ; |
| 18 | + |
| 19 | +sub ParseArguments |
| 20 | +{ |
| 21 | + my @options ; |
| 22 | + getopt ("io", \%options) ; |
| 23 | + |
| 24 | + die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ; |
| 25 | + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; |
| 26 | + |
| 27 | + $path_in = $options {"i"} ; |
| 28 | + $path_out = $options {"o"} ; |
| 29 | + |
| 30 | + die "Input folder '$path_in' does not exist" if (! -d $path_in) ; |
| 31 | + die "Output folder '$path_out' does not exist" if (! -d $path_out) ; |
| 32 | + |
| 33 | + # tests only |
| 34 | + # $path_in = "C:/@ Wikimedia/# Out Bayes" ; |
| 35 | + # $path_out = "C:/analytics" ; # "w:/@ report card/data" ; |
| 36 | + |
| 37 | + print "Input folder: $path_in\n" ; |
| 38 | + print "Output folder: $path_out\n" ; |
| 39 | + print "\n" ; |
| 40 | + |
| 41 | + $file_csv_out = "$path_out/analytics_in_binaries.csv" ; |
| 42 | +} |
| 43 | + |
| 44 | + |
| 45 | +sub ReadStatisticsPerBinariesExtension |
| 46 | +{ |
| 47 | + my $project = shift ; |
| 48 | + my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" ; |
| 49 | + $yyyymm_hi = -1 ; |
| 50 | + |
| 51 | + if (! -e $file_csv_in) |
| 52 | + { die "Input file '$file_csv_in' not found" ; } |
| 53 | + |
| 54 | + |
| 55 | + print "Read '$file_csv_in'\n" ; |
| 56 | + open CSV_IN, '<', $file_csv_in ; |
| 57 | + |
| 58 | + $language_prev = '' ; |
| 59 | + while ($line = <CSV_IN>) |
| 60 | + { |
| 61 | + chomp $line ; |
| 62 | + next if $line !~ /,.*?,/ ; |
| 63 | + |
| 64 | + ($language,$date,$data) = split (',', $line, 3) ; |
| 65 | + |
| 66 | + # for each wiki first line shows ext names, no tcounts |
| 67 | + if ($date eq "00/0000") |
| 68 | + { |
| 69 | + if ($language_prev ne '') |
| 70 | + { &WriteMonthlyData ($project, $language_prev) ; } |
| 71 | + $language_prev = $language ; |
| 72 | + |
| 73 | + undef %ext_name ; |
| 74 | + undef %ext_ndx ; |
| 75 | + undef %ext_cnt ; |
| 76 | + undef %months ; |
| 77 | + |
| 78 | + @exts = split (',', $data) ; |
| 79 | + $ndx = 0 ; |
| 80 | + foreach $ext (@exts) |
| 81 | + { |
| 82 | + $ext_name {$ndx} = $ext ; |
| 83 | + $ext_ndx {$ext} = $ndx ; |
| 84 | + $ndx ++ ; |
| 85 | + } |
| 86 | + next ; |
| 87 | + } |
| 88 | + |
| 89 | + ($month,$year) = split ('\/', $date) ; |
| 90 | + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; |
| 91 | + if ($yyyymm gt $yyyymm_hi) |
| 92 | + { $yyyymm_hi = $yyyymm ; } |
| 93 | + $months {$yyyymm}++ ; |
| 94 | + |
| 95 | + @counts = split (',', $data) ; |
| 96 | + $ndx = 0 ; |
| 97 | + foreach $count (@counts) |
| 98 | + { |
| 99 | + $ext_cnt {$yyyymm}{$ext_name {$ndx}} = $count ; |
| 100 | + $ndx ++ ; |
| 101 | + } |
| 102 | + } |
| 103 | + &WriteMonthlyData ($project, $language_prev) ; |
| 104 | + |
| 105 | + close CSV_IN ; |
| 106 | +} |
| 107 | + |
| 108 | +sub WriteMonthlyData |
| 109 | +{ |
| 110 | + my ($project,$language) = @_ ; |
| 111 | + # get sorted array of extensions, order by count for most recent month |
| 112 | + %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ; |
| 113 | + @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi {$a}} keys %ext_cnt_yyyymm_hi) ; |
| 114 | + |
| 115 | + foreach $month (sort keys %months) |
| 116 | + { |
| 117 | + $ndx = 0 ; |
| 118 | + foreach $ext (@ext_cnt_yyyymm_hi) |
| 119 | + { |
| 120 | + print CSV_OUT "$project,$language,$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ; |
| 121 | + # print "$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ; |
| 122 | + last if (++ $ndx > 25) ; |
| 123 | + } |
| 124 | + } |
| 125 | +} |
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 126 | + native |
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl |
— | — | @@ -0,0 +1,162 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# This module prepares a csv file with language names for feed into the analytics database |
| 5 | +# The main work of collecting these names from different sources is done by |
| 6 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm |
| 7 | +# which is part of wikistats reporting phase and once each month updates local master csv files from all sources |
| 8 | + |
| 9 | +# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization |
| 10 | +# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle |
| 11 | + |
| 12 | +# Sources for language names: |
| 13 | +# - php files |
| 14 | +# - translatewiki |
| 15 | +# - English Wikipedia API (interwikilinks) |
| 16 | + |
| 17 | +# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations |
| 18 | + |
| 19 | + use Getopt::Std ; |
| 20 | + |
| 21 | + $true = 1 ; |
| 22 | + $false = 0 ; |
| 23 | + |
| 24 | + $max_language_name = 50 ; # truncate if longer |
| 25 | + |
| 26 | + $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ; |
| 27 | + $file_csv_language_names_wp = "LanguageNamesViaWpEnEdited.csv" ; |
| 28 | + $file_csv_analytics_in = "analytics_in_language_names.csv" ; |
| 29 | + |
| 30 | + $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting |
| 31 | + $languages_filter = "de|en" ; |
| 32 | + foreach $language (split '\|', $languages_filter) |
| 33 | + { $include_target_language {$language} = $true ; } |
| 34 | + |
| 35 | + &ParseArguments ; |
| 36 | + &ReadCsvFiles ; |
| 37 | + &WriteCsvFile ; |
| 38 | + |
| 39 | + |
| 40 | +# if ($language eq "id") # swap which file takes precedence |
| 41 | + |
| 42 | + print "\nReady\n\n" ; |
| 43 | + exit ; |
| 44 | + |
| 45 | +sub ParseArguments |
| 46 | +{ |
| 47 | + my (@options, $arguments) ; |
| 48 | + |
| 49 | + getopt ("io", \%options) ; |
| 50 | + |
| 51 | + foreach $arg (sort keys %options) |
| 52 | + { $arguments .= " -$arg " . $options {$arg} . "\n" ; } |
| 53 | + print ("\nArguments\n$arguments\n") ; |
| 54 | + |
| 55 | + $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test |
| 56 | + $options {"o"} = "c:/MySQL/analytics" ; # EZ test |
| 57 | + |
| 58 | + die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; |
| 59 | + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; |
| 60 | + |
| 61 | + ($path_in = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any |
| 62 | + ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any |
| 63 | + |
| 64 | + die "Input folder '$path_in' does not exist" if (! -d $path_in) ; |
| 65 | + die "Output folder '$path_out' does not exist" if (! -d $path_out) ; |
| 66 | + |
| 67 | + print "Input folder: $path_in\n" ; |
| 68 | + print "Output folder: $path_out\n\n" ; |
| 69 | + |
| 70 | + $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ; |
| 71 | + $file_csv_language_names_wp = "$path_in/$file_csv_language_names_wp" ; |
| 72 | + $file_csv_analytics_in = "$path_out/$file_csv_analytics_in" ; |
| 73 | + |
| 74 | + die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ; |
| 75 | + die "Input file '$file_csv_language_names_wp' not found" if (! -e $file_csv_language_names_wp) ; |
| 76 | +} |
| 77 | + |
| 78 | +sub ReadCsvFiles |
| 79 | +{ |
| 80 | + #first read definitions from php message files, then overwrite with definitions from interwiki links when available |
| 81 | + # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable |
| 82 | + |
| 83 | + open CSV_IN, "<", $file_csv_language_names_php ; |
| 84 | + while ($line = <CSV_IN>) |
| 85 | + { |
| 86 | + chomp ($line) ; |
| 87 | + ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ; |
| 88 | + |
| 89 | + next if ! $include_target_language {$target_language} ; |
| 90 | + |
| 91 | + $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database |
| 92 | + } |
| 93 | + close CSV_IN ; |
| 94 | + |
| 95 | + open CSV_IN, "<", $file_csv_language_names_wp ; |
| 96 | + while ($line = <CSV_IN>) |
| 97 | + { |
| 98 | + chomp ($line) ; |
| 99 | + ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ; |
| 100 | + |
| 101 | + next if ! $include_target_language {$target_language} ; |
| 102 | + |
| 103 | + next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ; |
| 104 | + |
| 105 | + # $name_unicode_php = $out_languages {$target_language} {$code} ; # test only |
| 106 | + # $name_unicode_wp = &FormatName ($target_language, $name_unicode) ; # test only |
| 107 | + # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only |
| 108 | + # { print "$name_unicode_php => $name_unicode_wp\n" ; } # test only |
| 109 | + |
| 110 | + $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database |
| 111 | + } |
| 112 | + close CSV_IN ; |
| 113 | +} |
| 114 | + |
| 115 | +sub FormatName |
| 116 | +{ |
| 117 | + my ($target_language, $name_unicode) = @_ ; |
| 118 | + |
| 119 | + $name_unicode2 = $name_unicode ; |
| 120 | + |
| 121 | + if ($target_language eq "de") |
| 122 | + { $name_unicode =~ s/e?\s*\-?sprache//i ; } |
| 123 | + |
| 124 | + if ($target_language =~ /^(?:$languages_force_case_uc)/) |
| 125 | + { $name_unicode = ucfirst $name_unicode ; } |
| 126 | + else |
| 127 | + { $name_unicode = lc $name_unicode ; } |
| 128 | + |
| 129 | +# Test only |
| 130 | +# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2)) |
| 131 | +# { print "$name_unicode2 => $name_unicode\n" ; } |
| 132 | + |
| 133 | + return ($name_unicode) ; |
| 134 | +} |
| 135 | + |
| 136 | +sub WriteCsvFile |
| 137 | +{ |
| 138 | + open CSV_OUT, ">", $file_csv_analytics_in || die ("File '$file_csv_analytics_in' could not be opened") ; |
| 139 | + binmode CSV_OUT ; # force Unix style linebreak \012 |
| 140 | + |
| 141 | + foreach $target_language (sort keys %out_languages) |
| 142 | + { |
| 143 | + print "\nTarget language '$target_language'\n" ; |
| 144 | + %translations = %{$out_languages {$target_language}} ; |
| 145 | + |
| 146 | + foreach $code (sort keys %translations) |
| 147 | + { |
| 148 | + $language_name = $translations{$code} ; |
| 149 | + $language_name2 = substr ($language_name,0,$max_language_name) ; |
| 150 | + |
| 151 | + if ($language_name ne $language_name2) |
| 152 | + { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; } |
| 153 | + |
| 154 | + if ($language_name2 =~ /,/) |
| 155 | + { $language_name2 = "\"$language_name2\"" ; } |
| 156 | + # test only |
| 157 | + print CSV_OUT "$target_language,$code,$language_name2\n" ; |
| 158 | + } |
| 159 | + } |
| 160 | + |
| 161 | + close CSV_OUT ; |
| 162 | +} |
| 163 | + |
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 164 | + native |
Index: trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt |
— | — | @@ -0,0 +1,55 @@ |
| 2 | +USE `analytics` ; |
| 3 | + |
| 4 | +TRUNCATE TABLE comscore ; |
| 5 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' |
| 6 | + INTO TABLE comscore |
| 7 | + FIELDS TERMINATED BY ',' |
| 8 | + OPTIONALLY ENCLOSED BY '"' |
| 9 | + (@date,country_code,region_code,web_property,project_code,reach,visitors) |
| 10 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 11 | + |
| 12 | +TRUNCATE TABLE comscore_regions ; |
| 13 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' |
| 14 | + INTO TABLE comscore_regions |
| 15 | + FIELDS TERMINATED BY ',' |
| 16 | + OPTIONALLY ENCLOSED BY '"' |
| 17 | + (report_language,region_code,region_name) ; |
| 18 | + |
| 19 | +TRUNCATE TABLE wikistats ; |
| 20 | +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' |
| 21 | + INTO TABLE wikistats |
| 22 | + FIELDS TERMINATED BY ',' |
| 23 | + OPTIONALLY ENCLOSED BY '"' |
| 24 | + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) |
| 25 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 26 | + |
| 27 | +TRUNCATE TABLE page_views ; |
| 28 | +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' |
| 29 | + INTO TABLE page_views |
| 30 | + FIELDS TERMINATED BY ',' |
| 31 | + OPTIONALLY ENCLOSED BY '"' |
| 32 | + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) |
| 33 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 34 | + |
| 35 | +TRUNCATE TABLE language_names ; |
| 36 | +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' |
| 37 | + INTO TABLE language_names |
| 38 | + FIELDS TERMINATED BY ',' |
| 39 | + OPTIONALLY ENCLOSED BY '"' |
| 40 | + (report_language,language_code,language_name) ; |
| 41 | + |
| 42 | +TRUNCATE TABLE binaries ; |
| 43 | +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' |
| 44 | + INTO TABLE binaries |
| 45 | + FIELDS TERMINATED BY ',' |
| 46 | + OPTIONALLY ENCLOSED BY '"' |
| 47 | + (project_code,language_code,@date,extension,binaries) |
| 48 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 49 | + |
| 50 | +TRUNCATE TABLE offline ; |
| 51 | +LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' |
| 52 | + INTO TABLE offline |
| 53 | + FIELDS TERMINATED BY ',' |
| 54 | + OPTIONALLY ENCLOSED BY '"' |
| 55 | + (@date, readers) |
| 56 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
Property changes on: trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 57 | + native |
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl |
— | — | @@ -0,0 +1,11 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# This file is just a place holder (alias) |
| 5 | +# Page views per day/week/month/etc, per project, per language, nrmalized and not normalized are collected by |
| 6 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl |
| 7 | +# |
| 8 | +# Added May 2001: |
| 9 | +# For analytics database one file is written for all projects and languages combined, |
| 10 | +# with per month, not normalized and normalized page view counts in one row. |
| 11 | +# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl |
| 12 | +# and needs to be copied to analytics folder later |
\ No newline at end of file |
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 13 | + native |
Index: trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt |
— | — | @@ -0,0 +1,181 @@ |
| 2 | +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite) |
| 3 | + |
| 4 | +-- tables implemented: |
| 5 | +-- comscore |
| 6 | +-- comscore_regions |
| 7 | +-- wikistats |
| 8 | +-- page_views |
| 9 | +-- language names |
| 10 | +-- binaries |
| 11 | + |
| 12 | +-- more tables planned (O= optional, not needed for report card stage) |
| 13 | +-- project_names |
| 14 | +-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) |
| 15 | +-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) |
| 16 | + |
| 17 | + |
| 18 | +-- open issues: |
| 19 | +-- only store basic data in database and calculate all aggregates on the fly or do some aggregation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==) |
| 20 | +-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month) |
| 21 | + |
| 22 | +-- Create database and two tables from scratch |
| 23 | +DROP DATABASE IF EXISTS `analytics` ; |
| 24 | + |
| 25 | +CREATE DATABASE `analytics` ; |
| 26 | + |
| 27 | +USE `analytics` ; |
| 28 | + |
| 29 | +CREATE TABLE `comscore` ( |
| 30 | + `date` date NOT NULL, |
| 31 | + `country_code` varchar (3), |
| 32 | + `region_code` varchar (3), |
| 33 | + `web_property` varchar (20), |
| 34 | + `project_code` varchar (10), |
| 35 | + `reach` decimal (4,1) DEFAULT NULL, |
| 36 | + `visitors` decimal (15) DEFAULT NULL, |
| 37 | + PRIMARY KEY (date,country_code,region_code,project_code,web_property), |
| 38 | + KEY (`country_code`) |
| 39 | +) ; |
| 40 | + |
| 41 | +CREATE TABLE `comscore_regions` ( |
| 42 | + `region_code` varchar (2), |
| 43 | + `report_language` varchar (10), |
| 44 | + `region_name` varchar (18), |
| 45 | + PRIMARY KEY (region_code,report_language) |
| 46 | +) ; |
| 47 | + |
| 48 | +CREATE TABLE `wikistats` ( |
| 49 | + `date` date NOT NULL, |
| 50 | + `project_code` varchar (10), |
| 51 | + `language_code` varchar (15), |
| 52 | + `editors_all_time` int (10) DEFAULT NULL, |
| 53 | + `editors_new` int (7) DEFAULT NULL, |
| 54 | + `editors_ge_5` int (7) DEFAULT NULL, |
| 55 | + `editors_ge_25` int (7) DEFAULT NULL, |
| 56 | + `editors_ge_100` int (7) DEFAULT NULL, |
| 57 | + `articles` int (12) DEFAULT NULL, |
| 58 | + `articles_new_per_day` int (9) DEFAULT NULL, |
| 59 | + `articles_over_bytes_500` int (12) DEFAULT NULL, |
| 60 | + `articles_over_bytes_2000` int (12) DEFAULT NULL, |
| 61 | + `edits_per_article` decimal (9,1) DEFAULT NULL, |
| 62 | + `bytes_per_article` decimal (9,1) DEFAULT NULL, |
| 63 | + `edits` int (12) DEFAULT NULL, |
| 64 | + `size_in_bytes` int (15) DEFAULT NULL, |
| 65 | + `size_in_words` int (15) DEFAULT NULL, |
| 66 | + `links_internal` int (15) DEFAULT NULL, |
| 67 | + `links_interwiki` int (15) DEFAULT NULL, |
| 68 | + `links_image` int (15) DEFAULT NULL, |
| 69 | + `links_external` int (15) DEFAULT NULL, |
| 70 | + `redirects` int (15) DEFAULT NULL, |
| 71 | + PRIMARY KEY (date,project_code,language_code) |
| 72 | +) ; |
| 73 | + |
| 74 | +CREATE TABLE `page_views` ( |
| 75 | + `date` date NOT NULL, |
| 76 | + `project_code` char (2), |
| 77 | + `language_code` char (15), |
| 78 | + `views_non_mobile_raw` bigint (15), |
| 79 | + `views_mobile_raw` bigint (15), |
| 80 | + `views_non_mobile_normalized` bigint (15), |
| 81 | + `views_mobile_normalized` bigint (15), |
| 82 | + `views_raw` bigint (15), |
| 83 | + `views_normalized` bigint (15), |
| 84 | + PRIMARY KEY (date,project_code,language_code), |
| 85 | + KEY (date,language_code,project_code) |
| 86 | +) ; |
| 87 | + |
| 88 | +CREATE TABLE `language_names` ( |
| 89 | + `report_language` varchar (15), |
| 90 | + `language_code` varchar (15), |
| 91 | + `language_name` varchar (50), |
| 92 | + PRIMARY KEY (report_language,language_code) |
| 93 | +) ; |
| 94 | + |
| 95 | +CREATE TABLE `binaries` ( |
| 96 | + `date` date NOT NULL, |
| 97 | + `project_code` char (2), |
| 98 | + `language_code` char (15), |
| 99 | + `extension` varchar (10), |
| 100 | + `binaries` bigint (15), |
| 101 | + PRIMARY KEY (date,project_code,language_code,extension) |
| 102 | +) ; |
| 103 | + |
| 104 | +CREATE TABLE `offline` ( |
| 105 | + `date` date NOT NULL, |
| 106 | + `source` varchar (255), |
| 107 | + `readers` bigint (12), |
| 108 | + PRIMARY KEY (date,source) |
| 109 | +) ; |
| 110 | + |
| 111 | +-- SHOW TABLES ; |
| 112 | +-- DESCRIBE comscore ; |
| 113 | +-- DESCRIBE comscore_regions ; |
| 114 | +-- DESCRIBE wikistats ; |
| 115 | +-- DESCRIBE page_views ; |
| 116 | +-- DESCRIBE language_names ; |
| 117 | +-- DESCRIBE binaries ; |
| 118 | +-- DESCRIBE offline ; |
| 119 | + |
| 120 | +-- Database Manipulation |
| 121 | +-- Obviously in real world this is a separate script |
| 122 | + |
| 123 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' |
| 124 | + INTO TABLE comscore |
| 125 | + FIELDS TERMINATED BY ',' |
| 126 | + OPTIONALLY ENCLOSED BY '"' |
| 127 | + (@date,country_code,region_code,web_property,project_code,reach,visitors) |
| 128 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 129 | + |
| 130 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' |
| 131 | + INTO TABLE comscore_regions |
| 132 | + FIELDS TERMINATED BY ',' |
| 133 | + OPTIONALLY ENCLOSED BY '"' |
| 134 | + (report_language,region_code,region_name) ; |
| 135 | + |
| 136 | +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' |
| 137 | + INTO TABLE wikistats |
| 138 | + FIELDS TERMINATED BY ',' |
| 139 | + OPTIONALLY ENCLOSED BY '"' |
| 140 | + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) |
| 141 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 142 | + |
| 143 | +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' |
| 144 | + INTO TABLE page_views |
| 145 | + FIELDS TERMINATED BY ',' |
| 146 | + OPTIONALLY ENCLOSED BY '"' |
| 147 | + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) |
| 148 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 149 | + |
| 150 | + |
| 151 | +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' |
| 152 | + INTO TABLE language_names |
| 153 | + FIELDS TERMINATED BY ',' |
| 154 | + OPTIONALLY ENCLOSED BY '"' |
| 155 | + (report_language,language_code,language_name) ; |
| 156 | + |
| 157 | +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' |
| 158 | + INTO TABLE binaries |
| 159 | + FIELDS TERMINATED BY ',' |
| 160 | + OPTIONALLY ENCLOSED BY '"' |
| 161 | + (project_code,language_code,@date,extension,binaries) |
| 162 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 163 | + |
| 164 | +LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' |
| 165 | + INTO TABLE offline |
| 166 | + FIELDS TERMINATED BY ',' |
| 167 | + OPTIONALLY ENCLOSED BY '"' |
| 168 | + (@date,readers) |
| 169 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; |
| 170 | + |
| 171 | + |
| 172 | +-- show contents (debugging only) |
| 173 | +-- SELECT * FROM comscore ; |
| 174 | +-- SELECT * FROM comscore_regions ; |
| 175 | +-- SELECT * FROM wikistats ; |
| 176 | +-- SELECT * FROM page_views ; |
| 177 | +-- SELECT * FROM language_names ; |
| 178 | +-- SELECT * FROM binaries |
| 179 | +-- WHERE project_code = 'commons' ; |
| 180 | + SELECT * FROM offline ; |
| 181 | + |
| 182 | + |
Property changes on: trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 183 | + native |
Index: trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh |
— | — | @@ -0,0 +1,47 @@ |
| 2 | +#!/bin/sh |
| 3 | + |
| 4 | +# Prepare several csv files, ready for importing into analytics database |
| 5 | +# All generated files have _in_ in name signalling these contain data ready for importing into database |
| 6 | +# One input record corresponds to one database record |
| 7 | + |
| 8 | +ulimit -v 8000000 |
| 9 | + |
| 10 | +clear |
| 11 | +cd /a/analytics |
| 12 | + |
| 13 | + |
| 14 | +# AnalyticsPrepBinariesData.pl read counts for binaries which were generated by wikistats |
| 15 | +# and which reside in /a/wikistats/csv_[project code]/StatisticsPerBinariesExtension.csv |
| 16 | +# It filters and reorganizes data and produces analytics_in_binaries.csv |
| 17 | +# Output csv contains: project code, language, month, extension name, count |
| 18 | + |
| 19 | +perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/ |
| 20 | + |
| 21 | +# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore csv files (with data for last 14 months) |
| 22 | +# parses those csv files, adds/replaces data from these csv files into master files (containing full history) |
| 23 | +# and generates input csv file analytics_in_comscore.csv ready for importing into database |
| 24 | +# |
| 25 | +# note : these csv files were manually downloaded from http://mymetrix.comscore.com/app/report.aspx |
| 26 | +# and given more descriptive names, script finds newest files based on partial name search |
| 27 | +# |
| 28 | +# -r replace (default is add only) |
| 29 | +# -i input folder, contains manually downloaded csv files from comScore (or xls files manually converted to csv) |
| 30 | +# -m master files with full history |
| 31 | +# -o output csv file, with reach per region, UV's per region and UV's per top web property, ready for import into database |
| 32 | + |
| 33 | +perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics |
| 34 | + |
| 35 | +# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv files from wikistats process |
| 36 | +# It filters and reorganizes data and produces analytics_in_wikistats.csv, ready for import into analytics database |
| 37 | + |
| 38 | +perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics |
| 39 | + |
| 40 | +# analytics_in_page_views.csv is written daily as part of WikiCountsSummarizeProjectCounts.pl |
| 41 | +# part of (/home/ezachte/pageviews_monthly.sh job) |
| 42 | +# which processes hourly projectcounts files (per wiki page view totals for one hour) from http://dammit.lt/wikistats |
| 43 | +# and generates several files on different aggregation levels |
| 44 | +# only action here is to copy data to this folder to have everything in one place |
| 45 | +# note: unlike folder name suggests this file contains stats for all projects |
| 46 | + |
| 47 | +cp /a/wikistats/csv_wp/analytics_in_page_views.csv . |
| 48 | + |
Property changes on: trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 49 | + native |
Index: trunk/extensions/MetricsReporting/import/analytics_upd.sh |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +clear |
| 3 | +cd /a/analytics |
| 4 | + |
| 5 | +./analytics_generate_csv_files.sh |
| 6 | + |
| 7 | +mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt |
Property changes on: trunk/extensions/MetricsReporting/import/analytics_upd.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 8 | + native |
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl |
— | — | @@ -0,0 +1,334 @@ |
| 2 | +#!/usr/local/bin/perl |
| 3 | + |
| 4 | +# Copyright (C) 2011 Wikimedia Foundation |
| 5 | +# This program is free software; you can redistribute it and/or |
| 6 | +# modify it under the terms of the GNU General Public License version 2 |
| 7 | +# as published by the Free Software Foundation. |
| 8 | +# This program is distributed in the hope that it will be useful, |
| 9 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 11 | +# See the GNU General Public License for more details, at |
| 12 | +# http://www.fsf.org/licenses/gpl.html |
| 13 | + |
| 14 | +# Author: |
| 15 | +# Erik Zachte, email ezachte@wikimedia.org |
| 16 | +# loosely based on predecessor |
| 17 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl |
| 18 | + |
| 19 | +# Functionality: |
| 20 | +# tba |
| 21 | + |
| 22 | +# Parameters: |
| 23 | +# tba |
| 24 | + |
| 25 | +# Output: |
| 26 | +# updated csv file for import in MySQL |
| 27 | + |
| 28 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/ |
| 29 | + |
| 30 | + use Getopt::Std ; |
| 31 | + |
| 32 | + $true = 1 ; |
| 33 | + $false = 0 ; |
| 34 | + |
| 35 | + @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ; |
| 36 | + |
| 37 | + $file_csv_monthly_data = "StatisticsMonthly.csv" ; |
| 38 | + $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ; |
| 39 | + $file_csv_analytics_in = "analytics_in_wikistats.csv" ; |
| 40 | + |
| 41 | + &ParseArguments ; |
| 42 | + &ReadStatisticsMonthly ; |
| 43 | + &FindLargestWikis ; |
| 44 | + &WriteMonthlyData ; |
| 45 | + |
| 46 | + print "\nReady\n\n" ; |
| 47 | + exit ; |
| 48 | + |
| 49 | +sub ParseArguments |
| 50 | +{ |
| 51 | + print "ParseArguments\n" ; |
| 52 | + my (@options, $arguments) ; |
| 53 | + |
| 54 | + getopt ("io", \%options) ; |
| 55 | + |
| 56 | + foreach $arg (sort keys %options) |
| 57 | + { $arguments .= " -$arg " . $options {$arg} . "\n" ; } |
| 58 | + print ("\nArguments\n$arguments\n") ; |
| 59 | + |
| 60 | + if (! -d '/mnt/') # EZ test |
| 61 | + { |
| 62 | + $path_in = "c:/\@ wikimedia/# out bayes" ; |
| 63 | + $path_out = "c:/MySQL/analytics" ; |
| 64 | + } |
| 65 | + else |
| 66 | + { |
| 67 | + die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; |
| 68 | + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; |
| 69 | + |
| 70 | + $path_in = $options {"i"} ; |
| 71 | + $path_out = $options {"o"} ; |
| 72 | + } |
| 73 | + |
| 74 | + die "Input folder '$path_in' does not exist" if (! -d $path_in) ; |
| 75 | + die "Output folder '$path_out' does not exist" if (! -d $path_out) ; |
| 76 | + |
| 77 | + print "Input folder: $path_in\n" ; |
| 78 | + print "Output folder: $path_out\n\n" ; |
| 79 | + |
| 80 | + $file_csv_out = "$path_out/analytics_in_wikistats.csv" ; |
| 81 | +} |
| 82 | + |
| 83 | +sub ReadStatisticsMonthly |
| 84 | +{ |
| 85 | + print "ReadStatisticsMonthly\n" ; |
| 86 | + &ReadStatisticsMonthlyForProject ("wb") ; |
| 87 | + &ReadStatisticsMonthlyForProject ("wk") ; |
| 88 | + &ReadStatisticsMonthlyForProject ("wn") ; |
| 89 | + &ReadStatisticsMonthlyForProject ("wp") ; |
| 90 | + &ReadStatisticsMonthlyForProject ("wq") ; |
| 91 | + &ReadStatisticsMonthlyForProject ("ws") ; |
| 92 | + &ReadStatisticsMonthlyForProject ("wv") ; |
| 93 | + &ReadStatisticsMonthlyForProject ("wx") ; |
| 94 | + |
| 95 | +# &ReadStatisticsPerBinariesExtensionCommons ; |
| 96 | +} |
| 97 | + |
| 98 | +sub ReadStatisticsMonthlyForProject |
| 99 | +{ |
| 100 | + my $project = shift; |
| 101 | + $all_projects = "*" ; |
| 102 | + |
| 103 | + my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ; |
| 104 | + my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ; |
| 105 | + |
| 106 | + if (! -e $file_csv_in_1) |
| 107 | + { &Abort ("Input file '$file_csv_in_1' not found") ; } |
| 108 | + if (! -e $file_csv_in_2) |
| 109 | + { &Abort ("Input file '$file_csv_in_2' not found") ; } |
| 110 | + |
| 111 | + my $yyyymm ; |
| 112 | + |
| 113 | + print "Read '$file_csv_in_1'\n" ; |
| 114 | + open CSV_IN, '<', $file_csv_in_1 ; |
| 115 | + while ($line = <CSV_IN>) |
| 116 | + { |
| 117 | + chomp $line ; |
| 118 | + ($language,$date,$counts) = split (',', $line, 3) ; |
| 119 | + @fields = split (',', $counts) ; |
| 120 | + |
| 121 | + next if ! &AcceptWiki ($project,$language) ; |
| 122 | + |
| 123 | + ($month,$day,$year) = split ('\/', $date) ; |
| 124 | + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; |
| 125 | + |
| 126 | + foreach $field (@fields) |
| 127 | + { |
| 128 | + if ($field eq '-') |
| 129 | + { $field = 0 ; } |
| 130 | + } |
| 131 | + |
| 132 | + $data = $fields [0] . ',' . # contributors all time |
| 133 | + $fields [1] . ',' . # new contributors |
| 134 | + 'data2,' . # place holder for more data, to be inserted later |
| 135 | + $fields [4] . ',' . # articles |
| 136 | + $fields [6] . ',' . # articles new per day |
| 137 | + $fields [9] . ',' . # larger than 0.5 kB |
| 138 | + $fields [10] . ',' . # larger than 2.0 kB |
| 139 | + $fields [7] . ',' . # mean edits per article |
| 140 | + $fields [8] . ',' . # mean bytes per article |
| 141 | + $fields [11] . ',' . # edits |
| 142 | + $fields [12] . ',' . # size in bytes |
| 143 | + $fields [13] . ',' . # size in words |
| 144 | + $fields [14] . ',' . # links internal |
| 145 | + $fields [15] . ',' . # links interwiki |
| 146 | + $fields [16] . ',' . # links images |
| 147 | + $fields [17] . ',' . # links external |
| 148 | + $fields [18] ; # redirects |
| 149 | + |
| 150 | + $data1 {"$project,$language,$yyyymm"} = $data ; |
| 151 | + } |
| 152 | + close CSV_IN ; |
| 153 | + |
| 154 | + # now read (very) active editors from newer more accurate file (split data for reg users and bots, unlike StatisticsMonthly.csv) |
| 155 | + |
| 156 | + print "Read '$file_csv_in_2'\n" ; |
| 157 | + open CSV_IN, '<', $file_csv_in_2 ; |
| 158 | + while ($line = <CSV_IN>) |
| 159 | + { |
| 160 | + chomp $line ; |
| 161 | + ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ; |
| 162 | + |
| 163 | + next if ! &AcceptWiki ($project,$language) ; |
| 164 | + |
| 165 | + if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot |
| 166 | + if ($group ne "A") { next ; } # A: articles, T: talk pages, O: other namespaces |
| 167 | + |
| 168 | + ($month,$day,$year) = split ('\/', $date) ; |
| 169 | + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; |
| 170 | + $months {$yyyymm} ++ ; |
| 171 | +# print "YYYYMM $yyyymm\n" ; |
| 172 | + |
| 173 | + # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm |
| 174 | + # count user with over x edits |
| 175 | + # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 1000xSQRT(10), etc |
| 176 | + # @thresholds = (1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000) ; |
| 177 | + $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ; |
| 178 | + $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ; |
| 179 | + $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ; |
| 180 | + $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ; |
| 181 | + |
| 182 | + $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ; |
| 183 | + $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ; |
| 184 | + $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ; |
| 185 | + |
| 186 | + # prep string with right amount of comma's |
| 187 | + if ($data2_default eq '') |
| 188 | + { |
| 189 | + $data2_default = $data2 {"$project,$language,$yyyymm"} ; |
| 190 | + $data2_default =~ s/[^,]+/0/g ; |
| 191 | + } |
| 192 | + } |
| 193 | + close CSV_IN ; |
| 194 | +} |
| 195 | + |
| 196 | +#sub ReadStatisticsPerBinariesExtensionCommons |
| 197 | +#{ |
| 198 | +# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ; |
| 199 | +# my $mmax = -1 ; |
| 200 | + |
| 201 | +# if (! -e $file_csv_in) |
| 202 | +# { &Abort ("Input file '$file_csv_in' not found") ; } |
| 203 | + |
| 204 | +# print "Read '$file_csv_in'\n" ; |
| 205 | +# open CSV_IN, '<', $file_csv_in ; |
| 206 | +# while ($line = <CSV_IN>) |
| 207 | +# { |
| 208 | +# chomp $line ; |
| 209 | +# ($language,$date,$counts) = split (',', $line, 3) ; |
| 210 | + |
| 211 | +# if ($language ne "commons") { next ; } |
| 212 | + |
| 213 | +# if ($date eq "00/0000") |
| 214 | +# { |
| 215 | +# @fields = split (',', $counts) ; |
| 216 | +# $field_ndx = 0 ; |
| 217 | +# foreach $field (@fields) |
| 218 | +# { |
| 219 | +# $ext_cnt {-1}{$field_ndx} = $field ; |
| 220 | +# # print "EXT_CNT $field_ndx : $field\n" ; |
| 221 | +# $field_ndx ++ ; |
| 222 | +# } |
| 223 | +# next ; |
| 224 | +# } |
| 225 | + |
| 226 | +# ($month,$year) = split ('\/', $date) ; |
| 227 | +# my $m = &months_since_2000_01 ($year,$month) ; |
| 228 | +# next if $m < $m_start ; |
| 229 | + |
| 230 | +# if ($m > $mmax) |
| 231 | +# { $mmax = $m ; } |
| 232 | + |
| 233 | +# @fields = split (',', $counts) ; |
| 234 | +# $field_ndx = 0 ; |
| 235 | +# foreach $field (@fields) |
| 236 | +# { |
| 237 | +# $ext_cnt {$m}{$field_ndx} = $field ; |
| 238 | +# $ext_tot {$m} += $field ; |
| 239 | +# $field_ndx ++ ; |
| 240 | +# } |
| 241 | +# } |
| 242 | +# close CSV_IN ; |
| 243 | + |
| 244 | +# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ; |
| 245 | +# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys %ext_cnt_mmax) ; |
| 246 | + |
| 247 | +# $extcnt = 0 ; |
| 248 | +# foreach $extndx (@ext_cnt_mmax) |
| 249 | +# { |
| 250 | +# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax {$extndx}}\n" ; |
| 251 | +# push @extndxs, $extndx ; |
| 252 | +# if ($extcnt++ >= 9) { last ; } |
| 253 | +# } |
| 254 | +#} |
| 255 | + |
| 256 | +sub FindLargestWikis |
| 257 | +{ |
| 258 | + print "FindLargestWikis\n" ; |
| 259 | + print "Largest projects (most accumulated very active editors):\n"; |
| 260 | + @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ; |
| 261 | + $rank = 0 ; |
| 262 | + foreach $project_language (@total_edits_ge_100) |
| 263 | + { |
| 264 | + $largest_projects {$project_language} = $rank++ ; |
| 265 | + print "$project_language," ; |
| 266 | + last if $rank > 10 ; |
| 267 | + } |
| 268 | + print "\n\n" ; |
| 269 | + |
| 270 | + foreach $yyyymm (sort keys %months) |
| 271 | + { |
| 272 | + next if $yyyymm lt '2011' ; |
| 273 | + foreach $project_language (keys %largest_projects) |
| 274 | + { |
| 275 | + ($project,$language) = split (',', $project_language) ; |
| 276 | + if ($data2 {"$project,$language,$yyyymm"} eq '') |
| 277 | + { |
| 278 | + print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ; |
| 279 | + $months {$yyyymm} = 0 ; |
| 280 | + } |
| 281 | + } |
| 282 | + } |
| 283 | +} |
| 284 | + |
| 285 | +sub WriteMonthlyData |
| 286 | +{ |
| 287 | + print "WriteMonthlyData\n" ; |
| 288 | + my $file_csv_out = "$path_out/$file_csv_analytics_in" ; |
| 289 | + open CSV_OUT, '>', $file_csv_out ; |
| 290 | + foreach $project_wiki_month (sort keys %data1) |
| 291 | + { |
| 292 | + ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ; |
| 293 | + |
| 294 | + # recent month misses on eor more large wikis? |
| 295 | + next if $months {$yyyymm} == 0 ; |
| 296 | + |
| 297 | + $data1 = $data1 {$project_wiki_month} ; |
| 298 | + $data2 = $data2 {$project_wiki_month} ; |
| 299 | + if ($data2 eq '') |
| 300 | + { |
| 301 | + print "Editor data missing for $project_wiki_month\n" ; |
| 302 | + $data2 = $data2_default ; |
| 303 | + } |
| 304 | + $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together |
| 305 | + print CSV_OUT "$project_wiki_month,$data1\n" ; |
| 306 | + } |
| 307 | + $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ; |
| 308 | + $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ; |
| 309 | + $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ; |
| 310 | + close CSV_OUT ; |
| 311 | +} |
| 312 | + |
| 313 | +sub AcceptWiki |
| 314 | +{ |
| 315 | + my ($project,$language) = @_ ; |
| 316 | + |
| 317 | + return $false if $language eq 'commons' and $project ne 'wx' ; # commons also in wikipedia csv files (bug, hard to cleanup, just skip) |
| 318 | + return $false if $language eq 'sr' and $project eq 'wn' ; # ignore insane bot spam on |
| 319 | + return $false if $language =~ /mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i ; |
| 320 | + |
| 321 | + return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) but site not, tlh=Klignon, ru-sib=Siberian |
| 322 | + return $false if $project eq 'wk' and ($language eq "als" or $language eq "tlh") ; |
| 323 | + |
| 324 | + return $true ; |
| 325 | +} |
| 326 | + |
| 327 | +sub Abort |
| 328 | +{ |
| 329 | + my $msg = shift ; |
| 330 | + print "$msg\nExecution aborted." ; |
| 331 | + # to do: log also to file |
| 332 | + exit ; |
| 333 | +} |
| 334 | + |
| 335 | + |
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 336 | + native |
Index: trunk/extensions/MetricsReporting/import/analytics_new.sh |
— | — | @@ -0,0 +1,5 @@ |
| 2 | +clear |
| 3 | +cd /a/analytics |
| 4 | +# rm *test*.csv |
| 5 | +# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt |
| 6 | + mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt |
Property changes on: trunk/extensions/MetricsReporting/import/analytics_new.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 7 | + native |
Index: trunk/extensions/MetricsReporting/import/_readme.txt |
— | — | @@ -0,0 +1,21 @@ |
| 2 | +This folder contain bash and perl files to create and fill database 'analytics', a.o. for monthly dashboard. |
| 3 | + |
| 4 | +== analytics_new.sh == |
| 5 | +Defines the database and tables and loads data from existing csv files. |
| 6 | +It executes SQL from analytics_create_and_load_from_csv.txt |
| 7 | + |
| 8 | +== analytics_upd.sh == |
| 9 | +Prepares new csv files (delegated to analytics_generate_csv_files.sh), |
| 10 | +and empties/reloads all tables for which csv files are in this folder. |
| 11 | +It executes SQL from analytics_refresh_from_csv.txt |
| 12 | + |
| 13 | +== CSV files == |
| 14 | +CSV files and where they are generated: |
| 15 | + |
| 16 | +analytics_in_binaries.csv <- AnalyticsPrepBinariesData.pl |
| 17 | +analytics_in_comscore.csv <- AnalyticsPrepComscoreData.pl |
| 18 | +analytics_in_comscore_regions.csv <- manual |
| 19 | +analytics_in_language_names.csv |
| 20 | +analytics_in_offline.csv <- manual |
| 21 | +analytics_in_page_views.csv <- /home/ezachte/wikistats/pageviews_monthly.sh (file copy after that) |
| 22 | +analytics_in_wikistats.csv <- AnalyticsPrepWikiCountsOutput.pl |
Property changes on: trunk/extensions/MetricsReporting/import/_readme.txt |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 23 | + native |
Index: trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl |
— | — | @@ -0,0 +1,564 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# Copyright (C) 2011 Wikimedia Foundation |
| 5 | +# This program is free software; you can redistribute it and/or |
| 6 | +# modify it under the terms of the GNU General Public License version 2 |
| 7 | +# as published by the Free Software Foundation. |
| 8 | +# This program is distributed in the hope that it will be useful, |
| 9 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 11 | +# See the GNU General Public License for more details, at |
| 12 | +# http://www.fsf.org/licenses/gpl.html |
| 13 | + |
| 14 | +# Author: |
| 15 | +# Erik Zachte, email ezachte@wikimedia.org |
| 16 | + |
| 17 | +# Functionality: |
| 18 | +# comScore data can be downloaded as csv file, which each contain 14 months history |
| 19 | +# This script uses these files to update 'master' csv files which contain all known history |
| 20 | +# Note: only entities which are already in master file will be updated! |
| 21 | +# Then it merges these master files into one csv file which can be loaded into analytics database |
| 22 | +# Data are: reach by region, unique visitors by region, unique visitors by web property |
| 23 | + |
| 24 | +# Parameters: |
| 25 | +# -m (required) folder with 'master' csv files (files with all known history) |
| 26 | +# -u (required) folder with 'update' csv files (files with lastest 14 months history, produced by comScore) |
| 27 | + |
| 28 | +# Output: |
| 29 | +# updated master csv files + merged and formatted csv for import in MySQL |
| 30 | + |
| 31 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/ |
| 32 | + |
| 33 | + use Getopt::Std ; |
| 34 | + use Cwd; |
| 35 | + |
| 36 | + my $options ; |
| 37 | + getopt ("imo", \%options) ; |
| 38 | + |
| 39 | + $true = 1 ; |
| 40 | + $false = 0 ; |
| 41 | + |
| 42 | + $script_name = "AnalyticsPrepComscoreData.pl" ; |
| 43 | + $script_version = "0.31" ; |
| 44 | + |
| 45 | +# EZ test only |
| 46 | +# $source = "comscore" ; |
| 47 | +# $server = "ez_test" ; |
| 48 | +# $generated = "2011-05-06 00:00:00" ; |
| 49 | +# $user = "ezachte" ; |
| 50 | + |
| 51 | + $dir_in = $options {"i"} ; |
| 52 | + $dir_upd = $options {"m"} ; |
| 53 | + $dir_out = $options {"o"} ; |
| 54 | + $mode = 'add'; |
| 55 | + if (defined $options {'r'}) |
| 56 | + { $mode = 'replace'; } |
| 57 | + |
| 58 | + print "Mode is $mode (specify '-r' for replace)\n\n"; |
| 59 | + |
| 60 | + if (! -d "/home/") # EZ test machine |
| 61 | + { |
| 62 | + $dir_in = "C:/@ Wikimedia/@ Report Card/Data" ; |
| 63 | + $dir_upd = "C:/MySQL/analytics" ; |
| 64 | + $dir_out = "C:/MySQL/analytics" ; |
| 65 | + $mode = 'replace' ; |
| 66 | + } |
| 67 | + |
| 68 | + if ($dir_in eq '') |
| 69 | + { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; } |
| 70 | + if ($dir_upd eq '') |
| 71 | + { Abort ("Specify folder for master files (full history) as '-m folder'") ; } |
| 72 | + if ($dir_out eq '') |
| 73 | + { Abort ("Specify folder for output file '-o folder'") ; } |
| 74 | + |
| 75 | + $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; |
| 76 | + $file_comscore_reach_update = "*reach*by*region*csv" ; |
| 77 | + $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; |
| 78 | + $file_comscore_uv_region_update = "*UVs*by*region*csv" ; |
| 79 | + $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ; |
| 80 | + $file_comscore_uv_property_update = "*UV*trend*csv" ; |
| 81 | + |
| 82 | + $layout_csv_reach = 1 ; |
| 83 | + $layout_csv_regions = 2 ; |
| 84 | + $layout_csv_properties = 3 ; |
| 85 | + |
| 86 | + print "Directories:\n" . |
| 87 | + "Input (new comScore data): '$dir_in'\n". |
| 88 | + "Master files (full history): '$dir_upd'\n" . |
| 89 | + "Output (database feed): '$dir_out'\n\n" ; |
| 90 | + |
| 91 | + %region_codes = ( |
| 92 | + "Europe"=>"EU", |
| 93 | + "North America"=>"NA", |
| 94 | + "Latin America"=>"LA", |
| 95 | + "World-Wide" => "W", |
| 96 | + "Middle East - Africa" => "MA", |
| 97 | + "Asia Pacific"=> "AS", |
| 98 | + "United States" => "US", |
| 99 | + "India" => "I", |
| 100 | + "China" => "C" |
| 101 | + ) ; |
| 102 | + |
| 103 | + foreach $region_name (keys %region_codes) |
| 104 | + { $region_names {$region_codes {$region_name}} = $region_name ; } |
| 105 | + |
| 106 | + @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ; |
| 107 | + |
| 108 | + &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; |
| 109 | + %reach_region_code = %data ; |
| 110 | + |
| 111 | + &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; |
| 112 | + %visitors_region_code = %data ; |
| 113 | + |
| 114 | + &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; |
| 115 | + %visitors_web_property = %data ; |
| 116 | + |
| 117 | + &WriteDataAnalytics ; |
| 118 | + |
| 119 | + print "\nReady\n\n" ; |
| 120 | + exit ; |
| 121 | + |
| 122 | +sub ReadMasterComscoreDataReachPerRegion |
| 123 | +{ |
| 124 | + my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; |
| 125 | + |
| 126 | + print "ReadMasterComscoreDataReachPerRegion\n\n" ; |
| 127 | + |
| 128 | + undef %months ; |
| 129 | + undef %data ; |
| 130 | + undef @regions ; |
| 131 | + |
| 132 | + open IN, '<', "$dir_upd/$file_comscore_master" ; |
| 133 | + |
| 134 | + $lines = 0 ; |
| 135 | + while ($line = <IN>) |
| 136 | + { |
| 137 | + chomp $line ; |
| 138 | + $line =~ s/\r//g ; |
| 139 | + |
| 140 | + ($yyyymm,@data) = split (',', $line) ; |
| 141 | + |
| 142 | + if ($lines++ == 0) |
| 143 | + { |
| 144 | + @regions = @data ; |
| 145 | + print "Regions found: " . (join ',', @regions) . "\n"; |
| 146 | + next ; |
| 147 | + } |
| 148 | + |
| 149 | + $field_ndx = 0 ; |
| 150 | + foreach (@data) |
| 151 | + { |
| 152 | + $region = $regions [$field_ndx] ; |
| 153 | + $region_code = $region_codes {$region} ; |
| 154 | + |
| 155 | + $data = $data [$field_ndx] ; |
| 156 | + if ($data eq '') |
| 157 | + { $data = '0' ; } |
| 158 | + $months {$yyyymm} ++ ; |
| 159 | + $data {"$yyyymm,$region_code"} = $data ; |
| 160 | + # print "Old data $yyyymm,$region_code = $data\n" ; |
| 161 | + $field_ndx++ ; |
| 162 | + } |
| 163 | + } |
| 164 | + close IN ; |
| 165 | + |
| 166 | + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; |
| 167 | + return if ! $updates_found ; |
| 168 | + |
| 169 | + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; |
| 170 | + open OUT, '>', "$dir_upd/$file_comscore_master" ; |
| 171 | + |
| 172 | + $line_out = "yyyymm" ; |
| 173 | + foreach $region_name (@regions) |
| 174 | + { $line_out .= ",$region_name" ; } |
| 175 | + print OUT "$line_out" ; |
| 176 | + |
| 177 | + foreach $yyyymm (sort {$b cmp $a} keys %months) |
| 178 | + { |
| 179 | + $line_out = "\n$yyyymm" ; |
| 180 | + foreach $region_name (@regions) |
| 181 | + { |
| 182 | + $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; |
| 183 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; |
| 184 | + } |
| 185 | + print OUT "$line_out" ; |
| 186 | + } |
| 187 | + |
| 188 | + close OUT ; |
| 189 | +} |
| 190 | + |
| 191 | +sub ReadMasterComscoreDataVisitorsPerRegion |
| 192 | +{ |
| 193 | + my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; |
| 194 | + |
| 195 | + print "ReadMasterComscoreDataVisitorsPerRegion\n\n"; |
| 196 | + |
| 197 | + undef %months ; |
| 198 | + undef %data ; |
| 199 | + undef @regions ; |
| 200 | + |
| 201 | + open IN, '<', "$dir_upd/$file_comscore_master" ; |
| 202 | + |
| 203 | + $lines = 0 ; |
| 204 | + $metric = 'unique_visitors' ; |
| 205 | + while ($line = <IN>) |
| 206 | + { |
| 207 | + chomp $line ; |
| 208 | + $line =~ s/\r//g ; |
| 209 | + $line = &GetNumberOnly ($line) ; |
| 210 | + |
| 211 | + next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ; |
| 212 | + |
| 213 | + ($yyyymm,@data) = split (',', $line) ; |
| 214 | + |
| 215 | + if ($lines++ == 0) |
| 216 | + { |
| 217 | + @regions = @data ; |
| 218 | + print "Regions found: " . (join ',', @regions) . "\n"; |
| 219 | + next ; |
| 220 | + } |
| 221 | + |
| 222 | + $field_ndx = 0 ; |
| 223 | + foreach (@data) |
| 224 | + { |
| 225 | + $region = $regions [$field_ndx] ; |
| 226 | + $region_code = $region_codes {$region} ; |
| 227 | + |
| 228 | + $data = $data [$field_ndx] ; |
| 229 | + if ($data eq '') |
| 230 | + { $data = '0' ; } |
| 231 | + |
| 232 | + # print "Old data $yyyymm,$region = $data\n" ; |
| 233 | + |
| 234 | + $months {$yyyymm} ++ ; |
| 235 | + $data {"$yyyymm,$region_code"} = $data ; |
| 236 | + |
| 237 | + $field_ndx++ ; |
| 238 | + } |
| 239 | + } |
| 240 | + close IN ; |
| 241 | + |
| 242 | + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; |
| 243 | + return if ! $updates_found ; |
| 244 | + |
| 245 | + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; |
| 246 | + open OUT, '>', "$dir_upd/$file_comscore_master" ; |
| 247 | + |
| 248 | + $line_out = "yyyymm" ; |
| 249 | + foreach $region_name (@regions) |
| 250 | + { $line_out .= ",$region_name" ; } |
| 251 | + print OUT "$line_out" ; |
| 252 | + |
| 253 | + foreach $yyyymm (sort {$b cmp $a} keys %months) |
| 254 | + { |
| 255 | + $line_out = "\n$yyyymm" ; |
| 256 | + foreach $region_name (@regions) |
| 257 | + { |
| 258 | + $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; |
| 259 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; |
| 260 | + } |
| 261 | + print OUT "$line_out" ; |
| 262 | + } |
| 263 | + |
| 264 | + close OUT ; |
| 265 | +} |
| 266 | + |
| 267 | +sub ReadMasterComscoreDataVisitorsPerProperty |
| 268 | +{ |
| 269 | + my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; |
| 270 | + |
| 271 | + print "ReadMasterComscoreDataVisitorsPerProperty\n\n"; |
| 272 | + |
| 273 | + undef %months ; |
| 274 | + undef %data ; |
| 275 | + undef @properties ; |
| 276 | + |
| 277 | + open IN, '<', "$dir_upd/$file_comscore_master" ; |
| 278 | + |
| 279 | + $lines = 0 ; |
| 280 | + $metric = 'unique_visitors' ; |
| 281 | + while ($line = <IN>) |
| 282 | + { |
| 283 | + chomp $line ; |
| 284 | + $line =~ s/\r//g ; |
| 285 | + |
| 286 | + ($yyyymm,@data) = split (',', $line) ; |
| 287 | + if ($lines++ == 0) |
| 288 | + { @properties = @data ; next ; } |
| 289 | + |
| 290 | + $field_ndx = 0 ; |
| 291 | + foreach (@data) |
| 292 | + { |
| 293 | + $property = $properties [$field_ndx] ; |
| 294 | + $property =~ s/.*Yahoo.*/Yahoo/ ; |
| 295 | + $data = $data [$field_ndx] ; |
| 296 | + if ($data eq '') |
| 297 | + { $data = '0' ; } |
| 298 | + |
| 299 | + # print "Old data $yyyymm,$property = $data\n" ; |
| 300 | + |
| 301 | + $months {$yyyymm} ++ ; |
| 302 | + $data {"$yyyymm,$property"} = $data ; |
| 303 | + |
| 304 | + $field_ndx++ ; |
| 305 | + } |
| 306 | + } |
| 307 | + close IN ; |
| 308 | + |
| 309 | + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; |
| 310 | + return if ! $updates_found ; |
| 311 | + |
| 312 | + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; |
| 313 | + open OUT, '>', "$dir_upd/$file_comscore_master" ; |
| 314 | + |
| 315 | + $line_out = "yyyymm" ; |
| 316 | + foreach $property (@properties) |
| 317 | + { $line_out .= ",$property" ; } |
| 318 | + print OUT "$line_out" ; |
| 319 | + |
| 320 | + foreach $yyyymm (sort {$b cmp $a} keys %months) |
| 321 | + { |
| 322 | + $line_out = "\n$yyyymm" ; |
| 323 | + foreach $property (@properties) |
| 324 | + { |
| 325 | + $yyyymm_property = "$yyyymm,$property" ; |
| 326 | + $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ; |
| 327 | + } |
| 328 | + print OUT "$line_out" ; |
| 329 | + } |
| 330 | + |
| 331 | + close OUT ; |
| 332 | +} |
| 333 | + |
| 334 | +sub UpdateMasterFileFromRecentComscoreData |
| 335 | +{ |
| 336 | + my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ; |
| 337 | + |
| 338 | + print "UpdateMasterFileFromRecentComscoreData\n\n"; |
| 339 | + |
| 340 | + undef %white_list ; |
| 341 | + undef %not_white_listed ; |
| 342 | + |
| 343 | + print "White list: ". (join (',', @white_list)) . "\n\n"; |
| 344 | + |
| 345 | + foreach $id (@white_list) |
| 346 | + { $white_list {$id} = $true ; } |
| 347 | + |
| 348 | + if (! -e "$dir_upd/$file_comscore_master") |
| 349 | + { Abort ("File $file_comscore_master not found!") ; } |
| 350 | + |
| 351 | + $age_all = -M "$dir_upd/$file_comscore_master" ; |
| 352 | + print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ; |
| 353 | + |
| 354 | + my $cwd = getcwd ; |
| 355 | + chdir $dir_in ; |
| 356 | + |
| 357 | + @files = glob($file_comscore_updates) ; |
| 358 | + $min_age_upd = 999999 ; |
| 359 | + $file_comscore_updates_latest = '' ; |
| 360 | + foreach $file (@files) |
| 361 | + { |
| 362 | + $age = -M $file ; |
| 363 | + if ($age < $min_age_upd) |
| 364 | + { |
| 365 | + $min_age_upd = $age ; |
| 366 | + $file_comscore_updates_latest = $file ; |
| 367 | + } |
| 368 | + } |
| 369 | + print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ; |
| 370 | + |
| 371 | + if ($min_age_upd == 999999) |
| 372 | + { |
| 373 | + print "No valid update file found. Nothing to update." ; |
| 374 | + return ; |
| 375 | + } |
| 376 | + |
| 377 | + #if ($age_all > $min_age_upd) |
| 378 | + #{ |
| 379 | + # print "File with master data more recent than latest update csv from comScore. Nothing to update." ; |
| 380 | + # return ; |
| 381 | + #} |
| 382 | + |
| 383 | + my $updates_found = $false ; |
| 384 | + |
| 385 | + open CSV, '<', $file_comscore_updates_latest ; |
| 386 | + binmode CSV ; |
| 387 | + while ($line = <CSV>) |
| 388 | + { |
| 389 | + chomp $line ; |
| 390 | + $line =~ s/\r//g ; |
| 391 | + $line = &GetNumberOnly ($line) ; |
| 392 | + |
| 393 | + if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' |
| 394 | + { |
| 395 | + if ($layout_csv == $layout_csv_properties) |
| 396 | + { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file |
| 397 | + else |
| 398 | + { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files |
| 399 | + |
| 400 | + @months = &mmm_yyyy2yyyy_mm (@months) ; |
| 401 | + } |
| 402 | + |
| 403 | + if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/)) |
| 404 | + { |
| 405 | + if ($layout_csv == $layout_csv_properties) |
| 406 | + { |
| 407 | + ($index,$dummy,$property,@data) = split (',', $line) ; |
| 408 | + $property =~ s/^\s+// ; |
| 409 | + $property =~ s/\s+$// ; |
| 410 | + |
| 411 | + $property =~ s/.*Total Internet.*/Total Internet/i ; |
| 412 | + $property =~ s/.*Google.*/Google/i ; |
| 413 | + $property =~ s/.*Microsoft.*/Microsoft/i ; |
| 414 | + $property =~ s/.*FACEBOOK.*/Facebook/i ; |
| 415 | + $property =~ s/.*Yahoo.*/Yahoo/i ; |
| 416 | + $property =~ s/.*Amazon.*/Amazon/i ; |
| 417 | + $property =~ s/.*Apple.*/Apple/i ; |
| 418 | + $property =~ s/.*AOL.*/AOL/i ; |
| 419 | + $property =~ s/.*Wikimedia.*/Wikimedia/i ; |
| 420 | + $property =~ s/.*Tencent.*/Tencent/i ; |
| 421 | + $property =~ s/.*Baidu.*/Baidu/i ; |
| 422 | + $property =~ s/.*CBS.*/CBS/i ; |
| 423 | + |
| 424 | + if (! $white_list {$property}) |
| 425 | + { |
| 426 | + $not_white_listed {$property}++ ; |
| 427 | + next ; |
| 428 | + } |
| 429 | + |
| 430 | + $id = $property ; |
| 431 | + } |
| 432 | + else |
| 433 | + { |
| 434 | + ($index,$region,@data) = split (',', $line) ; |
| 435 | + $region =~ s/^\s+// ; |
| 436 | + $region =~ s/\s+$// ; |
| 437 | + |
| 438 | + if (! $white_list {$region}) |
| 439 | + { |
| 440 | + $not_white_listed {$region}++ ; |
| 441 | + next ; |
| 442 | + } |
| 443 | + |
| 444 | + $id = $region_codes {$region} ; |
| 445 | + } |
| 446 | + |
| 447 | + for ($m = 0 ; $m <= $#months ; $m++) |
| 448 | + { |
| 449 | + $yyyymm = $months [$m] ; |
| 450 | + $months {$yyyymm} ++ ; |
| 451 | + $yyyymm_id = "$yyyymm,$id" ; |
| 452 | + $data = $data [$m] * $multiplier ; |
| 453 | + |
| 454 | + if ($mode eq 'add') |
| 455 | + { |
| 456 | + if (! defined $data {$yyyymm_id}) |
| 457 | + { |
| 458 | + $updates_found = $true ; |
| 459 | + print "New data found: $yyyymm_id = $data\n" ; |
| 460 | + $data {$yyyymm_id} = $data ; |
| 461 | + } |
| 462 | + } |
| 463 | + else |
| 464 | + { |
| 465 | + $updates_found = $true ; |
| 466 | + print "Data found: $yyyymm_id = $data\n" ; |
| 467 | + $data {$yyyymm_id} = $data ; |
| 468 | + } |
| 469 | + } |
| 470 | + } |
| 471 | + } |
| 472 | + |
| 473 | + $entities_not_white_listed = join (', ', sort keys %not_white_listed) ; |
| 474 | + if ($entities_not_white_listed ne '') |
| 475 | + { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; } |
| 476 | + |
| 477 | + if (! $updates_found) |
| 478 | + { print "No new updates found\n" ; } |
| 479 | + else |
| 480 | + { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } |
| 481 | + |
| 482 | + return ($updates_found) ; |
| 483 | +} |
| 484 | + |
| 485 | +sub WriteDataAnalytics |
| 486 | +{ |
| 487 | + print "WriteDataAnalytics\n\n"; |
| 488 | + |
| 489 | + open OUT, '>', "$dir_out/analytics_in_comscore.csv" ; |
| 490 | + |
| 491 | + $metric = 'unique_visitors' ; |
| 492 | + foreach $yyyymm (sort keys %months) |
| 493 | + { |
| 494 | + # store meta data elsewhere |
| 495 | + # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ; |
| 496 | + foreach $region_code (sort values %region_codes) |
| 497 | + { |
| 498 | + $country_code = '-' ; |
| 499 | + $property = '-' ; |
| 500 | + $project = '-' ; |
| 501 | + $reach = $reach_region_code {"$yyyymm,$region_code"} ; |
| 502 | + $visitors = $visitors_region_code {"$yyyymm,$region_code"} ; |
| 503 | + |
| 504 | + if (! defined $reach) { $reach = -1 ; } |
| 505 | + if (! defined $visitors) { $visitors = -1 ; } |
| 506 | + |
| 507 | + next if $reach == -1 and $visitors == -1 ; |
| 508 | + |
| 509 | + $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
| 510 | + print OUT $line ; |
| 511 | + print $line ; |
| 512 | + } |
| 513 | + |
| 514 | + foreach $property (sort @properties) |
| 515 | + { |
| 516 | + $country_code = '-' ; |
| 517 | + $region_code = '-' ; |
| 518 | + $project = '-' ; |
| 519 | + $reach = '-1' ; |
| 520 | + $visitors = $visitors_web_property {"$yyyymm,$property"} ; |
| 521 | + |
| 522 | + next if ! defined $visitors ; |
| 523 | + |
| 524 | + $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
| 525 | + print OUT $line ; |
| 526 | + # print $line ; |
| 527 | + } |
| 528 | + } |
| 529 | +} |
| 530 | + |
| 531 | +sub GetNumberOnly |
| 532 | +{ |
| 533 | + my $line = shift ; |
| 534 | + $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove comma's inside double quotes |
| 535 | + $line =~ s/"//g ; |
| 536 | + return $line ; |
| 537 | +} |
| 538 | + |
| 539 | +sub mmm_yyyy2yyyy_mm |
| 540 | +{ |
| 541 | + my @months = @_ ; |
| 542 | + my ($m) ; |
| 543 | + # Jan -> 01, etc |
| 544 | + foreach $month (@months) |
| 545 | + { |
| 546 | + my ($mmm,$yyyy) = split ('-', $month) ; |
| 547 | + for ($m = 0 ; $m <= $#months_short ; $m++) |
| 548 | + { |
| 549 | + if ($mmm eq $months_short [$m]) |
| 550 | + { |
| 551 | + $month = "$yyyy-" . sprintf ("%02d", $m+1) ; |
| 552 | + last ; |
| 553 | + } |
| 554 | + } |
| 555 | + } |
| 556 | + return @months ; |
| 557 | +} |
| 558 | + |
| 559 | +sub Abort |
| 560 | +{ |
| 561 | + $msg = shift ; |
| 562 | + |
| 563 | + print "\nAbort, reason: $msg\n\n" ; |
| 564 | + exit ; |
| 565 | +} |
Property changes on: trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 566 | + native |