Index: trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl |
— | — | @@ -17,21 +17,21 @@ |
18 | 18 |
|
19 | 19 | sub ParseArguments
|
20 | 20 | {
|
21 | | -# my @options ;
|
22 | | -# getopt ("io", \%options) ;
|
| 21 | + my @options ;
|
| 22 | + getopt ("io", \%options) ;
|
23 | 23 |
|
24 | | -# die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ;
|
25 | | -# die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
|
| 24 | + die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ;
|
| 25 | + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
|
26 | 26 |
|
27 | | -# $path_in = $options {"i"} ;
|
28 | | -# $path_out = $options {"o"} ;
|
| 27 | + $path_in = $options {"i"} ;
|
| 28 | + $path_out = $options {"o"} ;
|
29 | 29 |
|
30 | | -# die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
|
31 | | -# die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
|
| 30 | + die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
|
| 31 | + die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
|
32 | 32 |
|
33 | 33 | # tests only
|
34 | | - $path_in = "C:/@ Wikimedia/# Out Bayes" ;
|
35 | | - $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
|
| 34 | + # $path_in = "C:/@ Wikimedia/# Out Bayes" ;
|
| 35 | + # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
|
36 | 36 |
|
37 | 37 | print "Input folder: $path_in\n" ;
|
38 | 38 | print "Output folder: $path_out\n" ;
|
Index: trunk/wikistats/analytics/analytics_refresh_from_csv.txt |
— | — | @@ -0,0 +1,145 @@ |
| 2 | +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite)
|
| 3 | +
|
| 4 | +USE `analytics` ;
|
| 5 | +
|
| 6 | +CREATE TABLE `comscore` (
|
| 7 | + `date` date NOT NULL,
|
| 8 | + `country_code` varchar (3),
|
| 9 | + `region_code` varchar (3),
|
| 10 | + `web_property` varchar (20),
|
| 11 | + `project_code` varchar (10),
|
| 12 | + `reach` decimal (4,1) DEFAULT NULL,
|
| 13 | + `visitors` decimal (15) DEFAULT NULL,
|
| 14 | + PRIMARY KEY (date,country_code,region_code,project_code,web_property),
|
| 15 | + KEY (`country_code`)
|
| 16 | +) ;
|
| 17 | +
|
| 18 | +CREATE TABLE `comscore_regions` (
|
| 19 | + `region_code` varchar (2),
|
| 20 | + `report_language` varchar (10),
|
| 21 | + `region_name` varchar (18),
|
| 22 | + PRIMARY KEY (report_language,region_code)
|
| 23 | +) ;
|
| 24 | +
|
| 25 | +CREATE TABLE `wikistats` (
|
| 26 | + `date` date NOT NULL,
|
| 27 | + `project_code` varchar (10),
|
| 28 | + `language_code` varchar (15),
|
| 29 | + `editors_all_time` int (10) DEFAULT NULL,
|
| 30 | + `editors_new` int (7) DEFAULT NULL,
|
| 31 | + `editors_ge_5` int (7) DEFAULT NULL,
|
| 32 | + `editors_ge_25` int (7) DEFAULT NULL,
|
| 33 | + `editors_ge_100` int (7) DEFAULT NULL,
|
| 34 | + `articles` int (12) DEFAULT NULL,
|
| 35 | + `articles_new_per_day` int (9) DEFAULT NULL,
|
| 36 | + `articles_over_bytes_500` int (12) DEFAULT NULL,
|
| 37 | + `articles_over_bytes_2000` int (12) DEFAULT NULL,
|
| 38 | + `edits_per_article` decimal (9,1) DEFAULT NULL,
|
| 39 | + `bytes_per_article` decimal (9,1) DEFAULT NULL,
|
| 40 | + `edits` int (12) DEFAULT NULL,
|
| 41 | + `size_in_bytes` int (15) DEFAULT NULL,
|
| 42 | + `size_in_words` int (15) DEFAULT NULL,
|
| 43 | + `links_internal` int (15) DEFAULT NULL,
|
| 44 | + `links_interwiki` int (15) DEFAULT NULL,
|
| 45 | + `links_image` int (15) DEFAULT NULL,
|
| 46 | + `links_external` int (15) DEFAULT NULL,
|
| 47 | + `redirects` int (15) DEFAULT NULL,
|
| 48 | + PRIMARY KEY (date,project_code,language_code)
|
| 49 | +) ;
|
| 50 | +
|
| 51 | +CREATE TABLE `page_views` (
|
| 52 | + `date` date NOT NULL,
|
| 53 | + `project_code` char (2),
|
| 54 | + `language_code` char (15),
|
| 55 | + `views_non_mobile_raw` bigint (15),
|
| 56 | + `views_mobile_raw` bigint (15),
|
| 57 | + `views_non_mobile_normalized` bigint (15),
|
| 58 | + `views_mobile_normalized` bigint (15),
|
| 59 | + `views_raw` bigint (15),
|
| 60 | + `views_normalized` bigint (15),
|
| 61 | + PRIMARY KEY (date,project_code,language_code)
|
| 62 | +) ;
|
| 63 | +
|
| 64 | +CREATE TABLE `language_names` (
|
| 65 | + `report_language` varchar (15),
|
| 66 | + `language_code` varchar (15),
|
| 67 | + `language_name` varchar (50),
|
| 68 | + PRIMARY KEY (report_language,language_code)
|
| 69 | +) ;
|
| 70 | +
|
| 71 | +CREATE TABLE `binaries` (
|
| 72 | + `date` date NOT NULL,
|
| 73 | + `project_code` char (2),
|
| 74 | + `language_code` char (15),
|
| 75 | + `extension` varchar (10),
|
| 76 | + `binaries` bigint (15),
|
| 77 | + PRIMARY KEY (date,project_code,language_code,extension)
|
| 78 | +) ;
|
| 79 | +
|
| 80 | +CREATE TABLE `offline` (
|
| 81 | + `date` date NOT NULL,
|
| 82 | + `readers` bigint (12),
|
| 83 | + PRIMARY KEY (date,readers)
|
| 84 | +) ;
|
| 85 | +
|
| 86 | + |
| 87 | +-- SHOW TABLES ;
|
| 88 | +-- DESCRIBE comscore ;
|
| 89 | +-- DESCRIBE comscore_regions ;
|
| 90 | +-- DESCRIBE wikistats ;
|
| 91 | +-- DESCRIBE page_views ;
|
| 92 | +-- DESCRIBE language_names ;
|
| 93 | +-- DESCRIBE binaries ;
|
| 94 | +-- DESCRIBE offline ;
|
| 95 | +
|
| 96 | +-- Database Manipulation
|
| 97 | +-- Obviously in real world this is a separate script
|
| 98 | + |
| 99 | +TRUNCATE TABLE 'analytics_in_comscore.csv' ;
|
| 100 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
|
| 101 | + INTO TABLE comscore
|
| 102 | + FIELDS TERMINATED BY ','
|
| 103 | + OPTIONALLY ENCLOSED BY '"'
|
| 104 | + (@date,country_code,region_code,web_property,project_code,reach,visitors)
|
| 105 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 106 | +
|
| 107 | +TRUNCATE TABLE 'analytics_in_comscore_regions.csv' ; |
| 108 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
|
| 109 | + INTO TABLE comscore_regions
|
| 110 | + FIELDS TERMINATED BY ','
|
| 111 | + OPTIONALLY ENCLOSED BY '"'
|
| 112 | + (report_language,region_code,region_name) ;
|
| 113 | +
|
| 114 | +TRUNCATE TABLE 'analytics_in_wikistats.csv' ; |
| 115 | +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
|
| 116 | + INTO TABLE wikistats
|
| 117 | + FIELDS TERMINATED BY ','
|
| 118 | + OPTIONALLY ENCLOSED BY '"'
|
| 119 | + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
|
| 120 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 121 | +
|
| 122 | +TRUNCATE TABLE 'analytics_in_page_views.csv' ; |
| 123 | +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
|
| 124 | + INTO TABLE page_views
|
| 125 | + FIELDS TERMINATED BY ','
|
| 126 | + OPTIONALLY ENCLOSED BY '"'
|
| 127 | + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
|
| 128 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 129 | +
|
| 130 | +
|
| 131 | +TRUNCATE TABLE 'analytics_in_language_names.csv' ; |
| 132 | +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
|
| 133 | + INTO TABLE language_names
|
| 134 | + FIELDS TERMINATED BY ','
|
| 135 | + OPTIONALLY ENCLOSED BY '"'
|
| 136 | + (report_language,language_code,language_name) ;
|
| 137 | +
|
| 138 | +TRUNCATE TABLE 'analytics_in_binaries.csv' ; |
| 139 | +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv'
|
| 140 | + INTO TABLE binaries
|
| 141 | + FIELDS TERMINATED BY ','
|
| 142 | + OPTIONALLY ENCLOSED BY '"'
|
| 143 | + (project_code,language_code,@date,extension,binaries)
|
| 144 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 145 | +
|
| 146 | +TRUNCATE TABLE 'analytics_in_offline.csv' ; |
Index: trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt |
— | — | @@ -0,0 +1,179 @@ |
| 2 | +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite)
|
| 3 | +
|
| 4 | +-- tables implemented:
|
| 5 | +-- comscore
|
| 6 | +-- comscore_regions
|
| 7 | +-- wikistats
|
| 8 | +-- page_views
|
| 9 | +-- language names
|
| 10 | +-- binaries
|
| 11 | +
|
| 12 | +-- more tables planned (O= optional, not needed for report card stage)
|
| 13 | +-- project_names
|
| 14 | +-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
|
| 15 | +-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
|
| 16 | +
|
| 17 | +
|
| 18 | +-- open issues:
|
| 19 | +-- only store basic data in database and calculate all aggregates on the fly or do some aggragation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==)
|
| 20 | +-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month)
|
| 21 | +
|
| 22 | +-- Create database and two tables from scratch
|
| 23 | +DROP DATABASE IF EXISTS `analytics` ;
|
| 24 | +
|
| 25 | +CREATE DATABASE `analytics` ;
|
| 26 | +
|
| 27 | +USE `analytics` ;
|
| 28 | +
|
| 29 | +CREATE TABLE `comscore` (
|
| 30 | + `date` date NOT NULL,
|
| 31 | + `country_code` varchar (3),
|
| 32 | + `region_code` varchar (3),
|
| 33 | + `web_property` varchar (20),
|
| 34 | + `project_code` varchar (10),
|
| 35 | + `reach` decimal (4,1) DEFAULT NULL,
|
| 36 | + `visitors` decimal (15) DEFAULT NULL,
|
| 37 | + PRIMARY KEY (date,country_code,region_code,project_code,web_property),
|
| 38 | + KEY (`country_code`)
|
| 39 | +) ;
|
| 40 | +
|
| 41 | +CREATE TABLE `comscore_regions` (
|
| 42 | + `region_code` varchar (2),
|
| 43 | + `report_language` varchar (10),
|
| 44 | + `region_name` varchar (18),
|
| 45 | + PRIMARY KEY (report_language,region_code)
|
| 46 | +) ;
|
| 47 | +
|
| 48 | +CREATE TABLE `wikistats` (
|
| 49 | + `date` date NOT NULL,
|
| 50 | + `project_code` varchar (10),
|
| 51 | + `language_code` varchar (15),
|
| 52 | + `editors_all_time` int (10) DEFAULT NULL,
|
| 53 | + `editors_new` int (7) DEFAULT NULL,
|
| 54 | + `editors_ge_5` int (7) DEFAULT NULL,
|
| 55 | + `editors_ge_25` int (7) DEFAULT NULL,
|
| 56 | + `editors_ge_100` int (7) DEFAULT NULL,
|
| 57 | + `articles` int (12) DEFAULT NULL,
|
| 58 | + `articles_new_per_day` int (9) DEFAULT NULL,
|
| 59 | + `articles_over_bytes_500` int (12) DEFAULT NULL,
|
| 60 | + `articles_over_bytes_2000` int (12) DEFAULT NULL,
|
| 61 | + `edits_per_article` decimal (9,1) DEFAULT NULL,
|
| 62 | + `bytes_per_article` decimal (9,1) DEFAULT NULL,
|
| 63 | + `edits` int (12) DEFAULT NULL,
|
| 64 | + `size_in_bytes` int (15) DEFAULT NULL,
|
| 65 | + `size_in_words` int (15) DEFAULT NULL,
|
| 66 | + `links_internal` int (15) DEFAULT NULL,
|
| 67 | + `links_interwiki` int (15) DEFAULT NULL,
|
| 68 | + `links_image` int (15) DEFAULT NULL,
|
| 69 | + `links_external` int (15) DEFAULT NULL,
|
| 70 | + `redirects` int (15) DEFAULT NULL,
|
| 71 | + PRIMARY KEY (date,project_code,language_code)
|
| 72 | +) ;
|
| 73 | +
|
| 74 | +CREATE TABLE `page_views` (
|
| 75 | + `date` date NOT NULL,
|
| 76 | + `project_code` char (2),
|
| 77 | + `language_code` char (15),
|
| 78 | + `views_non_mobile_raw` bigint (15),
|
| 79 | + `views_mobile_raw` bigint (15),
|
| 80 | + `views_non_mobile_normalized` bigint (15),
|
| 81 | + `views_mobile_normalized` bigint (15),
|
| 82 | + `views_raw` bigint (15),
|
| 83 | + `views_normalized` bigint (15),
|
| 84 | + PRIMARY KEY (date,project_code,language_code)
|
| 85 | +) ;
|
| 86 | +
|
| 87 | +CREATE TABLE `language_names` (
|
| 88 | + `report_language` varchar (15),
|
| 89 | + `language_code` varchar (15),
|
| 90 | + `language_name` varchar (50),
|
| 91 | + PRIMARY KEY (report_language,language_code)
|
| 92 | +) ;
|
| 93 | +
|
| 94 | +CREATE TABLE `binaries` (
|
| 95 | + `date` date NOT NULL,
|
| 96 | + `project_code` char (2),
|
| 97 | + `language_code` char (15),
|
| 98 | + `extension` varchar (10),
|
| 99 | + `binaries` bigint (15),
|
| 100 | + PRIMARY KEY (date,project_code,language_code,extension)
|
| 101 | +) ;
|
| 102 | +
|
| 103 | +CREATE TABLE `offline` (
|
| 104 | + `date` date NOT NULL,
|
| 105 | + `readers` bigint (12),
|
| 106 | + PRIMARY KEY (date,readers)
|
| 107 | +) ;
|
| 108 | +
|
| 109 | +-- SHOW TABLES ;
|
| 110 | +-- DESCRIBE comscore ;
|
| 111 | +-- DESCRIBE comscore_regions ;
|
| 112 | +-- DESCRIBE wikistats ;
|
| 113 | +-- DESCRIBE page_views ;
|
| 114 | +-- DESCRIBE language_names ;
|
| 115 | +-- DESCRIBE binaries ;
|
| 116 | +-- DESCRIBE offline ;
|
| 117 | +
|
| 118 | +-- Database Manipulation
|
| 119 | +-- Obviously in real world this is a separate script
|
| 120 | +
|
| 121 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
|
| 122 | + INTO TABLE comscore
|
| 123 | + FIELDS TERMINATED BY ','
|
| 124 | + OPTIONALLY ENCLOSED BY '"'
|
| 125 | + (@date,country_code,region_code,web_property,project_code,reach,visitors)
|
| 126 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 127 | +
|
| 128 | +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
|
| 129 | + INTO TABLE comscore_regions
|
| 130 | + FIELDS TERMINATED BY ','
|
| 131 | + OPTIONALLY ENCLOSED BY '"'
|
| 132 | + (report_language,region_code,region_name) ;
|
| 133 | +
|
| 134 | +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
|
| 135 | + INTO TABLE wikistats
|
| 136 | + FIELDS TERMINATED BY ','
|
| 137 | + OPTIONALLY ENCLOSED BY '"'
|
| 138 | + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
|
| 139 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 140 | +
|
| 141 | +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
|
| 142 | + INTO TABLE page_views
|
| 143 | + FIELDS TERMINATED BY ','
|
| 144 | + OPTIONALLY ENCLOSED BY '"'
|
| 145 | + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
|
| 146 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 147 | +
|
| 148 | +
|
| 149 | +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
|
| 150 | + INTO TABLE language_names
|
| 151 | + FIELDS TERMINATED BY ','
|
| 152 | + OPTIONALLY ENCLOSED BY '"'
|
| 153 | + (report_language,language_code,language_name) ;
|
| 154 | +
|
| 155 | +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv'
|
| 156 | + INTO TABLE binaries
|
| 157 | + FIELDS TERMINATED BY ','
|
| 158 | + OPTIONALLY ENCLOSED BY '"'
|
| 159 | + (project_code,language_code,@date,extension,binaries)
|
| 160 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 161 | +
|
| 162 | +LOAD DATA LOCAL INFILE 'analytics_in_offline.csv'
|
| 163 | + INTO TABLE offline
|
| 164 | + FIELDS TERMINATED BY ','
|
| 165 | + OPTIONALLY ENCLOSED BY '"'
|
| 166 | + (@date,readers)
|
| 167 | + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
|
| 168 | +
|
| 169 | +
|
| 170 | +-- show contents (debugging only)
|
| 171 | +-- SELECT * FROM comscore ;
|
| 172 | +-- SELECT * FROM comscore_regions ;
|
| 173 | +-- SELECT * FROM wikistats ;
|
| 174 | +-- SELECT * FROM page_views ;
|
| 175 | +-- SELECT * FROM language_names ;
|
| 176 | +-- SELECT * FROM binaries
|
| 177 | +-- WHERE project_code = 'commons' ;
|
| 178 | + SELECT * FROM offline ;
|
| 179 | +
|
| 180 | +
|
Index: trunk/wikistats/analytics/analytics_generate_csv_files.sh |
— | — | @@ -0,0 +1,21 @@ |
| 2 | +#!/bin/sh |
| 3 | +ulimit -v 8000000 |
| 4 | + |
| 5 | +clear |
| 6 | +cd /a/analytics |
| 7 | + |
| 8 | +perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/test/ |
| 9 | + |
| 10 | +# add or replace data from newest comScore csv files (last 14 months) into master files (full history) |
| 11 | +# and generate database input csv file from it |
| 12 | + |
| 13 | +# -r replace (default is add only) |
| 14 | +# -i input folder, contains manually downloaded csv files from comScore (or xls, converted to csv) |
| 15 | +# -m master files with full history |
| 16 | +# -o output csv file, with reach and UV's per region and UV's per top web property, ready for import into database |
| 17 | +perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics |
| 18 | + |
| 19 | +perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics |
| 20 | + |
| 21 | +cp /a/wikistats/csv_wp/analytics_in_page_views.csv . |
| 22 | + |
Index: trunk/wikistats/analytics/analytics_upd.sh |
— | — | @@ -0,0 +1,6 @@ |
| 2 | +clear |
| 3 | +cd /a/analytics |
| 4 | + |
| 5 | +./analytics_generate_csv_files.sh |
| 6 | + |
| 7 | +mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt |
\ No newline at end of file |
Index: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl |
— | — | @@ -39,6 +39,7 @@ |
40 | 40 |
|
41 | 41 | &ParseArguments ;
|
42 | 42 | &ReadStatisticsMonthly ;
|
| 43 | + &FindLargestWikis ;
|
43 | 44 | &WriteMonthlyData ;
|
44 | 45 |
|
45 | 46 | print "\nReady\n\n" ;
|
— | — | @@ -54,15 +55,20 @@ |
55 | 56 | { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
|
56 | 57 | print ("\nArguments\n$arguments\n") ;
|
57 | 58 |
|
58 | | -# $options {"i"} = "w:/# out bayes" ; # EZ test
|
59 | | -# $options {"o"} = "c:/MySQL/analytics" ; # EZ test
|
| 59 | + if (! -d '/mnt/') # EZ test
|
| 60 | + {
|
| 61 | + $path_in = "c:/\@ wikimedia/# out bayes" ;
|
| 62 | + $path_out = "c:/MySQL/analytics" ;
|
| 63 | + }
|
| 64 | + else
|
| 65 | + {
|
| 66 | + die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
|
| 67 | + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
|
60 | 68 |
|
61 | | - die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
|
62 | | - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
|
| 69 | + $path_in = $options {"i"} ;
|
| 70 | + $path_out = $options {"o"} ;
|
| 71 | + }
|
63 | 72 |
|
64 | | - $path_in = $options {"i"} ;
|
65 | | - $path_out = $options {"o"} ;
|
66 | | -
|
67 | 73 | die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
|
68 | 74 | die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
|
69 | 75 |
|
— | — | @@ -158,6 +164,8 @@ |
159 | 165 |
|
160 | 166 | ($month,$day,$year) = split ('\/', $date) ;
|
161 | 167 | $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
|
| 168 | + $months {$yyyymm} ++ ;
|
| 169 | +# print "YYYYMM $yyyymm\n" ;
|
162 | 170 |
|
163 | 171 | # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm
|
164 | 172 | # count user with over x edits
|
— | — | @@ -167,6 +175,11 @@ |
168 | 176 | $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ;
|
169 | 177 | $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
|
170 | 178 | $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
|
| 179 | +
|
| 180 | + $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ;
|
| 181 | + $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ;
|
| 182 | + $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ;
|
| 183 | +
|
171 | 184 | # prep string with right amount of comma's
|
172 | 185 | if ($data2_default eq '')
|
173 | 186 | {
|
— | — | @@ -237,12 +250,46 @@ |
238 | 251 | # }
|
239 | 252 | #}
|
240 | 253 |
|
| 254 | +sub FindLargestWikis
|
| 255 | +{
|
| 256 | + print "Largest projects (most accumulated very active editors):\n";
|
| 257 | + @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ;
|
| 258 | + $rank = 0 ;
|
| 259 | + foreach $project_language (@total_edits_ge_100)
|
| 260 | + {
|
| 261 | + $largest_projects {$project_language} = $rank++ ;
|
| 262 | + print "$project_language," ;
|
| 263 | + last if $rank > 10 ;
|
| 264 | + }
|
| 265 | + print "\n\n" ;
|
| 266 | +
|
| 267 | + foreach $yyyymm (sort keys %months)
|
| 268 | + {
|
| 269 | + next if $yyyymm lt '2011' ;
|
| 270 | + foreach $project_language (keys %largest_projects)
|
| 271 | + {
|
| 272 | + ($project,$language) = split (',', $project_language) ;
|
| 273 | + if ($data2 {"$project,$language,$yyyymm"} eq '')
|
| 274 | + {
|
| 275 | + print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ;
|
| 276 | + $months {$yyyymm} = 0 ;
|
| 277 | + }
|
| 278 | + }
|
| 279 | + }
|
| 280 | + exit ;
|
| 281 | +}
|
| 282 | +
|
241 | 283 | sub WriteMonthlyData
|
242 | 284 | {
|
243 | 285 | my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
|
244 | 286 | open CSV_OUT, '>', $file_csv_out ;
|
245 | 287 | foreach $project_wiki_month (sort keys %data1)
|
246 | 288 | {
|
| 289 | + ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ;
|
| 290 | +
|
| 291 | + # recent month misses on eor more large wikis?
|
| 292 | + next if $months {$yyyymm} == 0 ;
|
| 293 | +
|
247 | 294 | $data1 = $data1 {$project_wiki_month} ;
|
248 | 295 | $data2 = $data2 {$project_wiki_month} ;
|
249 | 296 | if ($data2 eq '')
|
— | — | @@ -250,9 +297,12 @@ |
251 | 298 | print "Editor data missing for $project_wiki_month\n" ;
|
252 | 299 | $data2 = $data2_default ;
|
253 | 300 | }
|
254 | | - $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields follow each other
|
| 301 | + $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together
|
255 | 302 | print CSV_OUT "$project_wiki_month,$data1\n" ;
|
256 | 303 | }
|
| 304 | + $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ;
|
| 305 | + $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ;
|
| 306 | + $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ;
|
257 | 307 | close CSV_OUT ;
|
258 | 308 | }
|
259 | 309 |
|
Index: trunk/wikistats/analytics/analytics_new.sh |
— | — | @@ -0,0 +1,5 @@ |
| 2 | +clear |
| 3 | +cd /a/analytics |
| 4 | +# rm *test*.csv |
| 5 | +# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt |
| 6 | + mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt |
Index: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl |
— | — | @@ -33,13 +33,13 @@ |
34 | 34 | use Cwd; |
35 | 35 | |
36 | 36 | my $options ; |
37 | | - getopt ("mu", \%options) ; |
| 37 | + getopt ("imo", \%options) ; |
38 | 38 | |
39 | 39 | $true = 1 ; |
40 | 40 | $false = 0 ; |
41 | 41 | |
42 | 42 | $script_name = "AnalyticsPrepComscoreData.pl" ; |
43 | | - $script_version = "0.32" ; |
| 43 | + $script_version = "0.31" ; |
44 | 44 | |
45 | 45 | # EZ test only |
46 | 46 | # $source = "comscore" ; |
— | — | @@ -47,21 +47,34 @@ |
48 | 48 | # $generated = "2011-05-06 00:00:00" ; |
49 | 49 | # $user = "ezachte" ; |
50 | 50 | |
51 | | - $dir_analytics = $options {"m"} ; |
52 | | - $dir_comscore_updates = $options {"u"} ; |
| 51 | + $dir_in = $options {"i"} ; |
| 52 | + $dir_upd = $options {"m"} ; |
| 53 | + $dir_out = $options {"o"} ; |
| 54 | + $mode = 'add'; |
| 55 | + if (defined $options {'r'}) |
| 56 | + { $mode = 'replace'; } |
53 | 57 | |
54 | | - $dir_analytics = "C:/@ Wikimedia/! MySQL/analytics" ; # EZ test only |
55 | | - $dir_comscore_updates = "C:/@ Wikimedia/@ Report Card/Data" ; # EZ test only |
| 58 | + print "Mode is $mode (specify '-r' for replace)\n\n"; |
56 | 59 | |
57 | | - if (($dir_analytics eq '') || ($dir_comscore_updates eq '')) |
58 | | - { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; } |
| 60 | + if (! -d "/home/") # EZ test machine |
| 61 | + { |
| 62 | + $dir_in = "C:/@ Wikimedia/@ Report Card/Data" ; |
| 63 | + $dir_upd = "C:/MySQL/analytics" ; |
| 64 | + $dir_out = "C:/MySQL/analytics" ; |
| 65 | + $mode = 'replace' ; |
| 66 | + } |
59 | 67 | |
60 | | - $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; |
61 | | - $file_comscore_reach_update = "*reach*by*region*csv" ; |
| 68 | + if ($dir_in eq '') |
| 69 | + { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; } |
| 70 | + if ($dir_upd eq '') |
| 71 | + { Abort ("Specify folder for master files (full history) as '-m folder'") ; } |
| 72 | + if ($dir_out eq '') |
| 73 | + { Abort ("Specify folder for output file '-o folder'") ; } |
62 | 74 | |
63 | | - $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; |
64 | | - $file_comscore_uv_region_update = "*UVs*by*region*csv" ; |
65 | | - |
| 75 | + $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; |
| 76 | + $file_comscore_reach_update = "*reach*by*region*csv" ; |
| 77 | + $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; |
| 78 | + $file_comscore_uv_region_update = "*UVs*by*region*csv" ; |
66 | 79 | $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ; |
67 | 80 | $file_comscore_uv_property_update = "*UV*trend*csv" ; |
68 | 81 | |
— | — | @@ -69,7 +82,10 @@ |
70 | 83 | $layout_csv_regions = 2 ; |
71 | 84 | $layout_csv_properties = 3 ; |
72 | 85 | |
73 | | - print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ; |
| 86 | + print "Directories:\n" . |
| 87 | + "Input (new comScore data): '$dir_in'\n". |
| 88 | + "Master files (full history): '$dir_upd'\n" . |
| 89 | + "Output (database feed): '$dir_out'\n\n" ; |
74 | 90 | |
75 | 91 | %region_codes = ( |
76 | 92 | "Europe"=>"EU", |
— | — | @@ -88,13 +104,13 @@ |
89 | 105 | |
90 | 106 | @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ; |
91 | 107 | |
92 | | - &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; |
| 108 | + &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; |
93 | 109 | %reach_region_code = %data ; |
94 | 110 | |
95 | | - &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; |
| 111 | + &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; |
96 | 112 | %visitors_region_code = %data ; |
97 | 113 | |
98 | | - &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; |
| 114 | + &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; |
99 | 115 | %visitors_web_property = %data ; |
100 | 116 | |
101 | 117 | &WriteDataAnalytics ; |
— | — | @@ -102,153 +118,32 @@ |
103 | 119 | print "\nReady\n\n" ; |
104 | 120 | exit ; |
105 | 121 | |
106 | | -sub UpdateFromLatestComscoreData |
| 122 | +sub ReadMasterComscoreDataReachPerRegion |
107 | 123 | { |
108 | | - my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @update_only) = @_ ; |
| 124 | + my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; |
109 | 125 | |
110 | | - undef %update_only ; |
111 | | - undef %do_not_update ; |
| 126 | + print "ReadMasterComscoreDataReachPerRegion\n\n" ; |
112 | 127 | |
113 | | - foreach $id (@update_only) |
114 | | - { $update_only {$id} = $true ; } |
115 | | - |
116 | | - if (! -e "$dir_analytics/$file_comscore_master") |
117 | | - { Abort ("File $file_comscore_master not found!") ; } |
118 | | - |
119 | | - $age_master = -M "$dir_analytics/$file_comscore_master" ; |
120 | | - print "\nLatest comscore master file is " . sprintf ("%.0f", $age_master) . " days old: '$file_comscore_master'\n" ; |
121 | | - |
122 | | - my $cwd = getcwd ; |
123 | | - chdir $dir_comscore_updates ; |
124 | | - |
125 | | - @files = glob($file_comscore_updates) ; |
126 | | - $age_update = 999999 ; |
127 | | - $file_comscore_updates_latest = '' ; |
128 | | - foreach $file (@files) |
129 | | - { |
130 | | - $age = -M $file ; |
131 | | - if ($age < $age_update) |
132 | | - { |
133 | | - $age_update = $age ; |
134 | | - $file_comscore_updates_latest = $file ; |
135 | | - } |
136 | | - } |
137 | | - print "\nLatest comscore update file is " . sprintf ("%.0f", $age_update) . " days old: '$file_comscore_updates_latest'\n" ; |
138 | | - |
139 | | - if ($age_update == 999999) |
140 | | - { |
141 | | - print "No valid update file found. Nothing to update." ; |
142 | | - return ; |
143 | | - } |
144 | | - |
145 | | - if ($age_master < $age_update) |
146 | | - { |
147 | | - print "File with master data more recent than latest update csv from comScore. Nothing to update." ; |
148 | | - return ; |
149 | | - } |
150 | | - |
151 | | - my $updates_found = $false ; |
152 | | - |
153 | | - print "\nRead updates\n\n" ; |
154 | | - open CSV, '<', $file_comscore_updates_latest ; |
155 | | - while ($line = <CSV>) |
156 | | - { |
157 | | - chomp $line ; |
158 | | - $line = &GetNumberOnly ($line) ; |
159 | | - |
160 | | - if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' |
161 | | - { |
162 | | - if ($layout_csv == $layout_csv_properties) |
163 | | - { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file |
164 | | - else |
165 | | - { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files |
166 | | - |
167 | | - @months = &mmm_yyyy2yyyy_mm (@months) ; |
168 | | - } |
169 | | - |
170 | | - if ($line =~ /^\d+,/) |
171 | | - { |
172 | | - if ($layout_csv == $layout_csv_properties) |
173 | | - { |
174 | | - ($index,$dummy,$property,@data) = split (',', $line) ; |
175 | | - $property =~ s/^\s+// ; |
176 | | - $property =~ s/\s+$// ; |
177 | | - |
178 | | - $property =~ s/.*Google.*/Google/i ; |
179 | | - $property =~ s/.*Microsoft.*/Microsoft/i ; |
180 | | - $property =~ s/.*FACEBOOK.*/Facebook/i ; |
181 | | - $property =~ s/.*Yahoo.*/Yahoo/i ; |
182 | | - $property =~ s/.*Amazon.*/Amazon/i ; |
183 | | - $property =~ s/.*Apple.*/Apple/i ; |
184 | | - $property =~ s/.*AOL.*/AOL/i ; |
185 | | - $property =~ s/.*Wikimedia.*/Wikimedia/i ; |
186 | | - $property =~ s/.*Tencent.*/Tencent/i ; |
187 | | - $property =~ s/.*Baidu.*/Baidu/i ; |
188 | | - $property =~ s/.*CBS.*/CBS/i ; |
189 | | - |
190 | | - $id = $property ; |
191 | | - } |
192 | | - else |
193 | | - { |
194 | | - ($index,$region,@data) = split (',', $line) ; |
195 | | - $region =~ s/^\s+// ; |
196 | | - $region =~ s/\s+$// ; |
197 | | - $id = $region_codes {$region} ; |
198 | | - } |
199 | | - |
200 | | - if ($update_only {$id} == 0) |
201 | | - { |
202 | | - $do_not_update {$id}++ ; |
203 | | - next ; |
204 | | - } |
205 | | - |
206 | | - for ($m = 0 ; $m <= $#months ; $m++) |
207 | | - { |
208 | | - $yyyymm = $months [$m] ; |
209 | | - $months {$yyyymm} ++ ; |
210 | | - $yyyymm_id = "$yyyymm,$id" ; |
211 | | - $data = $data [$m] * $multiplier ; |
212 | | - |
213 | | - if (! defined $data {$yyyymm_id}) |
214 | | - { |
215 | | - $updates_found = $true ; |
216 | | - print "New data found: $yyyymm_id = $data\n" ; |
217 | | - $data {$yyyymm_id} = $data ; |
218 | | - } |
219 | | - } |
220 | | - } |
221 | | - } |
222 | | - |
223 | | - $ignored = join ', ', sort keys %do_not_update ; |
224 | | - print "\nEntities ignored:\n$ignored\n\n" ; |
225 | | - |
226 | | - if (! $updates_found) |
227 | | - { print "No new updates found\n" ; } |
228 | | - else |
229 | | - { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } |
230 | | - |
231 | | - return ($updates_found) ; |
232 | | -} |
233 | | - |
234 | | -sub ReadDataReachPerRegion |
235 | | -{ |
236 | | - my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; |
237 | | - |
238 | 128 | undef %months ; |
239 | 129 | undef %data ; |
240 | 130 | undef @regions ; |
241 | 131 | |
242 | | - open IN, '<', "$dir_analytics/$file_comscore_master" ; |
| 132 | + open IN, '<', "$dir_upd/$file_comscore_master" ; |
243 | 133 | |
244 | 134 | $lines = 0 ; |
245 | 135 | while ($line = <IN>) |
246 | 136 | { |
247 | 137 | chomp $line ; |
| 138 | + $line =~ s/\r//g ; |
248 | 139 | |
249 | 140 | ($yyyymm,@data) = split (',', $line) ; |
250 | 141 | |
251 | 142 | if ($lines++ == 0) |
252 | | - { @regions = @data ; next ; } |
| 143 | + { |
| 144 | + @regions = @data ; |
| 145 | + print "Regions found: " . (join ',', @regions) . "\n"; |
| 146 | + next ; |
| 147 | + } |
253 | 148 | |
254 | 149 | $field_ndx = 0 ; |
255 | 150 | foreach (@data) |
— | — | @@ -267,11 +162,11 @@ |
268 | 163 | } |
269 | 164 | close IN ; |
270 | 165 | |
271 | | - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; |
| 166 | + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; |
272 | 167 | return if ! $updates_found ; |
273 | 168 | |
274 | | - rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
275 | | - open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
| 169 | + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; |
| 170 | + open OUT, '>', "$dir_upd/$file_comscore_master" ; |
276 | 171 | |
277 | 172 | $line_out = "yyyymm" ; |
278 | 173 | foreach $region_name (@regions) |
— | — | @@ -292,27 +187,36 @@ |
293 | 188 | close OUT ; |
294 | 189 | } |
295 | 190 | |
296 | | -sub ReadDataVisitorsPerRegion |
| 191 | +sub ReadMasterComscoreDataVisitorsPerRegion |
297 | 192 | { |
298 | 193 | my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; |
299 | 194 | |
| 195 | + print "ReadMasterComscoreDataVisitorsPerRegion\n\n"; |
| 196 | + |
300 | 197 | undef %months ; |
301 | 198 | undef %data ; |
302 | 199 | undef @regions ; |
303 | 200 | |
304 | | - open IN, '<', "$dir_analytics/$file_comscore_master" ; |
| 201 | + open IN, '<', "$dir_upd/$file_comscore_master" ; |
305 | 202 | |
306 | 203 | $lines = 0 ; |
307 | 204 | $metric = 'unique_visitors' ; |
308 | 205 | while ($line = <IN>) |
309 | 206 | { |
310 | 207 | chomp $line ; |
| 208 | + $line =~ s/\r//g ; |
311 | 209 | $line = &GetNumberOnly ($line) ; |
312 | 210 | |
| 211 | + next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ; |
| 212 | + |
313 | 213 | ($yyyymm,@data) = split (',', $line) ; |
314 | 214 | |
315 | 215 | if ($lines++ == 0) |
316 | | - { @regions = @data ; next ; } |
| 216 | + { |
| 217 | + @regions = @data ; |
| 218 | + print "Regions found: " . (join ',', @regions) . "\n"; |
| 219 | + next ; |
| 220 | + } |
317 | 221 | |
318 | 222 | $field_ndx = 0 ; |
319 | 223 | foreach (@data) |
— | — | @@ -334,11 +238,11 @@ |
335 | 239 | } |
336 | 240 | close IN ; |
337 | 241 | |
338 | | - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; |
| 242 | + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; |
339 | 243 | return if ! $updates_found ; |
340 | 244 | |
341 | | - rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
342 | | - open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
| 245 | + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; |
| 246 | + open OUT, '>', "$dir_upd/$file_comscore_master" ; |
343 | 247 | |
344 | 248 | $line_out = "yyyymm" ; |
345 | 249 | foreach $region_name (@regions) |
— | — | @@ -359,21 +263,24 @@ |
360 | 264 | close OUT ; |
361 | 265 | } |
362 | 266 | |
363 | | -sub ReadDataVisitorsPerProperty |
| 267 | +sub ReadMasterComscoreDataVisitorsPerProperty |
364 | 268 | { |
365 | 269 | my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; |
366 | 270 | |
| 271 | + print "ReadMasterComscoreDataVisitorsPerProperty\n\n"; |
| 272 | + |
367 | 273 | undef %months ; |
368 | 274 | undef %data ; |
369 | 275 | undef @properties ; |
370 | 276 | |
371 | | - open IN, '<', "$dir_analytics/$file_comscore_master" ; |
| 277 | + open IN, '<', "$dir_upd/$file_comscore_master" ; |
372 | 278 | |
373 | 279 | $lines = 0 ; |
374 | 280 | $metric = 'unique_visitors' ; |
375 | 281 | while ($line = <IN>) |
376 | 282 | { |
377 | 283 | chomp $line ; |
| 284 | + $line =~ s/\r//g ; |
378 | 285 | |
379 | 286 | ($yyyymm,@data) = split (',', $line) ; |
380 | 287 | if ($lines++ == 0) |
— | — | @@ -398,11 +305,11 @@ |
399 | 306 | } |
400 | 307 | close IN ; |
401 | 308 | |
402 | | - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; |
| 309 | + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; |
403 | 310 | return if ! $updates_found ; |
404 | 311 | |
405 | | - rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; |
406 | | - open OUT, '>', "$dir_analytics/$file_comscore_master" ; |
| 312 | + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; |
| 313 | + open OUT, '>', "$dir_upd/$file_comscore_master" ; |
407 | 314 | |
408 | 315 | $line_out = "yyyymm" ; |
409 | 316 | foreach $property (@properties) |
— | — | @@ -423,10 +330,163 @@ |
424 | 331 | close OUT ; |
425 | 332 | } |
426 | 333 | |
| 334 | +sub UpdateMasterFileFromRecentComscoreData |
| 335 | +{ |
| 336 | + my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ; |
| 337 | + |
| 338 | + print "UpdateMasterFileFromRecentComscoreData\n\n"; |
| 339 | + |
| 340 | + undef %white_list ; |
| 341 | + undef %not_white_listed ; |
| 342 | + |
| 343 | + print "White list: ". (join (',', @white_list)) . "\n\n"; |
| 344 | + |
| 345 | + foreach $id (@white_list) |
| 346 | + { $white_list {$id} = $true ; } |
| 347 | + |
| 348 | + if (! -e "$dir_upd/$file_comscore_master") |
| 349 | + { Abort ("File $file_comscore_master not found!") ; } |
| 350 | + |
| 351 | + $age_all = -M "$dir_upd/$file_comscore_master" ; |
| 352 | + print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ; |
| 353 | + |
| 354 | + my $cwd = getcwd ; |
| 355 | + chdir $dir_in ; |
| 356 | + |
| 357 | + @files = glob($file_comscore_updates) ; |
| 358 | + $min_age_upd = 999999 ; |
| 359 | + $file_comscore_updates_latest = '' ; |
| 360 | + foreach $file (@files) |
| 361 | + { |
| 362 | + $age = -M $file ; |
| 363 | + if ($age < $min_age_upd) |
| 364 | + { |
| 365 | + $min_age_upd = $age ; |
| 366 | + $file_comscore_updates_latest = $file ; |
| 367 | + } |
| 368 | + } |
| 369 | + print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ; |
| 370 | + |
| 371 | + if ($min_age_upd == 999999) |
| 372 | + { |
| 373 | + print "No valid update file found. Nothing to update." ; |
| 374 | + return ; |
| 375 | + } |
| 376 | + |
| 377 | + #if ($age_all > $min_age_upd) |
| 378 | + #{ |
| 379 | + # print "File with master data more recent than latest update csv from comScore. Nothing to update." ; |
| 380 | + # return ; |
| 381 | + #} |
| 382 | + |
| 383 | + my $updates_found = $false ; |
| 384 | + |
| 385 | + open CSV, '<', $file_comscore_updates_latest ; |
| 386 | + binmode CSV ; |
| 387 | + while ($line = <CSV>) |
| 388 | + { |
| 389 | + chomp $line ; |
| 390 | + $line =~ s/\r//g ; |
| 391 | + $line = &GetNumberOnly ($line) ; |
| 392 | + |
| 393 | + if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' |
| 394 | + { |
| 395 | + if ($layout_csv == $layout_csv_properties) |
| 396 | + { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file |
| 397 | + else |
| 398 | + { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files |
| 399 | + |
| 400 | + @months = &mmm_yyyy2yyyy_mm (@months) ; |
| 401 | + } |
| 402 | + |
| 403 | + if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/)) |
| 404 | + { |
| 405 | + if ($layout_csv == $layout_csv_properties) |
| 406 | + { |
| 407 | + ($index,$dummy,$property,@data) = split (',', $line) ; |
| 408 | + $property =~ s/^\s+// ; |
| 409 | + $property =~ s/\s+$// ; |
| 410 | + |
| 411 | + $property =~ s/.*Total Internet.*/Total Internet/i ; |
| 412 | + $property =~ s/.*Google.*/Google/i ; |
| 413 | + $property =~ s/.*Microsoft.*/Microsoft/i ; |
| 414 | + $property =~ s/.*FACEBOOK.*/Facebook/i ; |
| 415 | + $property =~ s/.*Yahoo.*/Yahoo/i ; |
| 416 | + $property =~ s/.*Amazon.*/Amazon/i ; |
| 417 | + $property =~ s/.*Apple.*/Apple/i ; |
| 418 | + $property =~ s/.*AOL.*/AOL/i ; |
| 419 | + $property =~ s/.*Wikimedia.*/Wikimedia/i ; |
| 420 | + $property =~ s/.*Tencent.*/Tencent/i ; |
| 421 | + $property =~ s/.*Baidu.*/Baidu/i ; |
| 422 | + $property =~ s/.*CBS.*/CBS/i ; |
| 423 | + |
| 424 | + if (! $white_list {$property}) |
| 425 | + { |
| 426 | + $not_white_listed {$property}++ ; |
| 427 | + next ; |
| 428 | + } |
| 429 | + |
| 430 | + $id = $property ; |
| 431 | + } |
| 432 | + else |
| 433 | + { |
| 434 | + ($index,$region,@data) = split (',', $line) ; |
| 435 | + $region =~ s/^\s+// ; |
| 436 | + $region =~ s/\s+$// ; |
| 437 | + |
| 438 | + if (! $white_list {$region}) |
| 439 | + { |
| 440 | + $not_white_listed {$region}++ ; |
| 441 | + next ; |
| 442 | + } |
| 443 | + |
| 444 | + $id = $region_codes {$region} ; |
| 445 | + } |
| 446 | + |
| 447 | + for ($m = 0 ; $m <= $#months ; $m++) |
| 448 | + { |
| 449 | + $yyyymm = $months [$m] ; |
| 450 | + $months {$yyyymm} ++ ; |
| 451 | + $yyyymm_id = "$yyyymm,$id" ; |
| 452 | + $data = $data [$m] * $multiplier ; |
| 453 | + |
| 454 | + if ($mode eq 'add') |
| 455 | + { |
| 456 | + if (! defined $data {$yyyymm_id}) |
| 457 | + { |
| 458 | + $updates_found = $true ; |
| 459 | + print "New data found: $yyyymm_id = $data\n" ; |
| 460 | + $data {$yyyymm_id} = $data ; |
| 461 | + } |
| 462 | + } |
| 463 | + else |
| 464 | + { |
| 465 | + $updates_found = $true ; |
| 466 | + print "Data found: $yyyymm_id = $data\n" ; |
| 467 | + $data {$yyyymm_id} = $data ; |
| 468 | + } |
| 469 | + } |
| 470 | + } |
| 471 | + } |
| 472 | + |
| 473 | + $entities_not_white_listed = join (', ', sort keys %not_white_listed) ; |
| 474 | + if ($entities_not_white_listed ne '') |
| 475 | + { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; } |
| 476 | + |
| 477 | + if (! $updates_found) |
| 478 | + { print "No new updates found\n" ; } |
| 479 | + else |
| 480 | + { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } |
| 481 | + |
| 482 | + return ($updates_found) ; |
| 483 | +} |
| 484 | + |
427 | 485 | sub WriteDataAnalytics |
428 | 486 | { |
429 | | - open OUT, '>', "$dir_analytics/analytics_in_comscore.csv" ; |
| 487 | + print "WriteDataAnalytics\n\n"; |
430 | 488 | |
| 489 | + open OUT, '>', "$dir_out/analytics_in_comscore.csv" ; |
| 490 | + |
431 | 491 | $metric = 'unique_visitors' ; |
432 | 492 | foreach $yyyymm (sort keys %months) |
433 | 493 | { |
— | — | @@ -447,7 +507,7 @@ |
448 | 508 | |
449 | 509 | $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; |
450 | 510 | print OUT $line ; |
451 | | - # print $line ; |
| 511 | + print $line ; |
452 | 512 | } |
453 | 513 | |
454 | 514 | foreach $property (sort @properties) |
— | — | @@ -478,14 +538,18 @@ |
479 | 539 | sub mmm_yyyy2yyyy_mm |
480 | 540 | { |
481 | 541 | my @months = @_ ; |
| 542 | + my ($m) ; |
482 | 543 | # Jan -> 01, etc |
483 | | - foreach my $month (@months) |
| 544 | + foreach $month (@months) |
484 | 545 | { |
485 | 546 | my ($mmm,$yyyy) = split ('-', $month) ; |
486 | 547 | for ($m = 0 ; $m <= $#months_short ; $m++) |
487 | 548 | { |
488 | 549 | if ($mmm eq $months_short [$m]) |
489 | | - { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; } |
| 550 | + { |
| 551 | + $month = "$yyyy-" . sprintf ("%02d", $m+1) ; |
| 552 | + last ; |
| 553 | + } |
490 | 554 | } |
491 | 555 | } |
492 | 556 | return @months ; |