r105445 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r105444‎ | r105445 | r105446 >
Date:17:39, 7 December 2011
Author:diederik
Status:deferred
Tags:
Comment:
Moved files to trunk/extensions/MetricsReporting/import
Modified paths:
  • /trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl (deleted) (history)
  • /trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl (deleted) (history)
  • /trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl (deleted) (history)
  • /trunk/wikistats/analytics/AnalyticsPrepPageViews.pl (deleted) (history)
  • /trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl (deleted) (history)
  • /trunk/wikistats/analytics/README (added) (history)
  • /trunk/wikistats/analytics/_readme.txt (deleted) (history)
  • /trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt (deleted) (history)
  • /trunk/wikistats/analytics/analytics_generate_csv_files.sh (deleted) (history)
  • /trunk/wikistats/analytics/analytics_new.sh (deleted) (history)
  • /trunk/wikistats/analytics/analytics_refresh_from_csv.txt (deleted) (history)
  • /trunk/wikistats/analytics/analytics_upd.sh (deleted) (history)

Diff [purge]

Index: trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl
@@ -1,124 +0,0 @@
2 -#!/usr/local/bin/perl
3 -
4 - use Getopt::Std ;
5 -
6 - &ParseArguments ;
7 -
8 - print "Write file '$file_csv_out'\n" ;
9 - open CSV_OUT, '>', $file_csv_out ;
10 -
11 - foreach $project (qw (wb wk wn wp wq ws wv wx))
12 - { &ReadStatisticsPerBinariesExtension ($project) ; }
13 -
14 - close CSV_OUT ;
15 -
16 - print "\n\nReady\n\n" ;
17 - exit ;
18 -
19 -sub ParseArguments
20 -{
21 - my @options ;
22 - getopt ("io", \%options) ;
23 -
24 - die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ;
25 - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
26 -
27 - $path_in = $options {"i"} ;
28 - $path_out = $options {"o"} ;
29 -
30 - die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
31 - die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
32 -
33 - # tests only
34 - # $path_in = "C:/@ Wikimedia/# Out Bayes" ;
35 - # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
36 -
37 - print "Input folder: $path_in\n" ;
38 - print "Output folder: $path_out\n" ;
39 - print "\n" ;
40 -
41 - $file_csv_out = "$path_out/analytics_in_binaries.csv" ;
42 -}
43 -
44 -
45 -sub ReadStatisticsPerBinariesExtension
46 -{
47 - my $project = shift ;
48 - my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" ;
49 - $yyyymm_hi = -1 ;
50 -
51 - if (! -e $file_csv_in)
52 - { die "Input file '$file_csv_in' not found" ; }
53 -
54 -
55 - print "Read '$file_csv_in'\n" ;
56 - open CSV_IN, '<', $file_csv_in ;
57 -
58 - $language_prev = '' ;
59 - while ($line = <CSV_IN>)
60 - {
61 - chomp $line ;
62 - next if $line !~ /,.*?,/ ;
63 -
64 - ($language,$date,$data) = split (',', $line, 3) ;
65 -
66 - # for each wiki first line shows ext names, not counts
67 - if ($date eq "00/0000")
68 - {
69 - if ($language_prev ne '')
70 - { &WriteMonthlyData ($project, $language_prev) ; }
71 - $language_prev = $language ;
72 -
73 - undef %ext_name ;
74 - undef %ext_ndx ;
75 - undef %ext_cnt ;
76 - undef %months ;
77 -
78 - @exts = split (',', $data) ;
79 - $ndx = 0 ;
80 - foreach $ext (@exts)
81 - {
82 - $ext_name {$ndx} = $ext ;
83 - $ext_ndx {$ext} = $ndx ;
84 - $ndx ++ ;
85 - }
86 - next ;
87 - }
88 -
89 - ($month,$year) = split ('\/', $date) ;
90 - $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
91 - if ($yyyymm gt $yyyymm_hi)
92 - { $yyyymm_hi = $yyyymm ; }
93 - $months {$yyyymm}++ ;
94 -
95 - @counts = split (',', $data) ;
96 - $ndx = 0 ;
97 - foreach $count (@counts)
98 - {
99 - $ext_cnt {$yyyymm}{$ext_name {$ndx}} = $count ;
100 - $ndx ++ ;
101 - }
102 - }
103 - &WriteMonthlyData ($project, $language_prev) ;
104 -
105 - close CSV_IN ;
106 -}
107 -
108 -sub WriteMonthlyData
109 -{
110 - my ($project,$language) = @_ ;
111 - # get sorted array of extensions, order by count for most recent month
112 - %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ;
113 - @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi {$a}} keys %ext_cnt_yyyymm_hi) ;
114 -
115 - foreach $month (sort keys %months)
116 - {
117 - $ndx = 0 ;
118 - foreach $ext (@ext_cnt_yyyymm_hi)
119 - {
120 - print CSV_OUT "$project,$language,$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
121 - # print "$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
122 - last if (++ $ndx > 25) ;
123 - }
124 - }
125 -}
Index: trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl
@@ -1,162 +0,0 @@
2 -#!/usr/bin/perl
3 -
4 -# This module prepares a csv file with language names for feed into the analytics database
5 -# The main work of collecting these names from different sources is done by
6 -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm
7 -# which is part of wikistats reporting phase and once each month updates local master csv files from all sources
8 -
9 -# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization
10 -# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle
11 -
12 -# Sources for language names:
13 -# - php files
14 -# - translatewiki
15 -# - English Wikipedia API (interwikilinks)
16 -
17 -# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations
18 -
19 - use Getopt::Std ;
20 -
21 - $true = 1 ;
22 - $false = 0 ;
23 -
24 - $max_language_name = 50 ; # truncate if longer
25 -
26 - $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ;
27 - $file_csv_language_names_wp = "LanguageNamesViaWpEnEdited.csv" ;
28 - $file_csv_analytics_in = "analytics_in_language_names.csv" ;
29 -
30 - $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting
31 - $languages_filter = "de|en" ;
32 - foreach $language (split '\|', $languages_filter)
33 - { $include_target_language {$language} = $true ; }
34 -
35 - &ParseArguments ;
36 - &ReadCsvFiles ;
37 - &WriteCsvFile ;
38 -
39 -
40 -# if ($language eq "id") # swap which file takes precedence
41 -
42 - print "\nReady\n\n" ;
43 - exit ;
44 -
45 -sub ParseArguments
46 -{
47 - my (@options, $arguments) ;
48 -
49 - getopt ("io", \%options) ;
50 -
51 - foreach $arg (sort keys %options)
52 - { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
53 - print ("\nArguments\n$arguments\n") ;
54 -
55 - $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test
56 - $options {"o"} = "c:/MySQL/analytics" ; # EZ test
57 -
58 - die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
59 - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
60 -
61 - ($path_in = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
62 - ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
63 -
64 - die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
65 - die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
66 -
67 - print "Input folder: $path_in\n" ;
68 - print "Output folder: $path_out\n\n" ;
69 -
70 - $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ;
71 - $file_csv_language_names_wp = "$path_in/$file_csv_language_names_wp" ;
72 - $file_csv_analytics_in = "$path_out/$file_csv_analytics_in" ;
73 -
74 - die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ;
75 - die "Input file '$file_csv_language_names_wp' not found" if (! -e $file_csv_language_names_wp) ;
76 -}
77 -
78 -sub ReadCsvFiles
79 -{
80 - #first read definitions from php message files, then overwrite with definitions from interwiki links when available
81 - # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable
82 -
83 - open CSV_IN, "<", $file_csv_language_names_php ;
84 - while ($line = <CSV_IN>)
85 - {
86 - chomp ($line) ;
87 - ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
88 -
89 - next if ! $include_target_language {$target_language} ;
90 -
91 - $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
92 - }
93 - close CSV_IN ;
94 -
95 - open CSV_IN, "<", $file_csv_language_names_wp ;
96 - while ($line = <CSV_IN>)
97 - {
98 - chomp ($line) ;
99 - ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
100 -
101 - next if ! $include_target_language {$target_language} ;
102 -
103 - next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ;
104 -
105 - # $name_unicode_php = $out_languages {$target_language} {$code} ; # test only
106 - # $name_unicode_wp = &FormatName ($target_language, $name_unicode) ; # test only
107 - # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only
108 - # { print "$name_unicode_php => $name_unicode_wp\n" ; } # test only
109 -
110 - $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
111 - }
112 - close CSV_IN ;
113 -}
114 -
115 -sub FormatName
116 -{
117 - my ($target_language, $name_unicode) = @_ ;
118 -
119 - $name_unicode2 = $name_unicode ;
120 -
121 - if ($target_language eq "de")
122 - { $name_unicode =~ s/e?\s*\-?sprache//i ; }
123 -
124 - if ($target_language =~ /^(?:$languages_force_case_uc)/)
125 - { $name_unicode = ucfirst $name_unicode ; }
126 - else
127 - { $name_unicode = lc $name_unicode ; }
128 -
129 -# Test only
130 -# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2))
131 -# { print "$name_unicode2 => $name_unicode\n" ; }
132 -
133 - return ($name_unicode) ;
134 -}
135 -
136 -sub WriteCsvFile
137 -{
138 - open CSV_OUT, ">", $file_csv_analytics_in || die ("File '$file_csv_analytics_in' could not be opened") ;
139 - binmode CSV_OUT ; # force Unix style linebreak \012
140 -
141 - foreach $target_language (sort keys %out_languages)
142 - {
143 - print "\nTarget language '$target_language'\n" ;
144 - %translations = %{$out_languages {$target_language}} ;
145 -
146 - foreach $code (sort keys %translations)
147 - {
148 - $language_name = $translations{$code} ;
149 - $language_name2 = substr ($language_name,0,$max_language_name) ;
150 -
151 - if ($language_name ne $language_name2)
152 - { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; }
153 -
154 - if ($language_name2 =~ /,/)
155 - { $language_name2 = "\"$language_name2\"" ; }
156 - # test only
157 - print CSV_OUT "$target_language,$code,$language_name2\n" ;
158 - }
159 - }
160 -
161 - close CSV_OUT ;
162 -}
163 -
Index: trunk/wikistats/analytics/analytics_refresh_from_csv.txt
@@ -1,55 +0,0 @@
2 -USE `analytics` ;
3 -
4 -TRUNCATE TABLE comscore ;
5 -LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
6 - INTO TABLE comscore
7 - FIELDS TERMINATED BY ','
8 - OPTIONALLY ENCLOSED BY '"'
9 - (@date,country_code,region_code,web_property,project_code,reach,visitors)
10 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
11 -
12 -TRUNCATE TABLE comscore_regions ;
13 -LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
14 - INTO TABLE comscore_regions
15 - FIELDS TERMINATED BY ','
16 - OPTIONALLY ENCLOSED BY '"'
17 - (report_language,region_code,region_name) ;
18 -
19 -TRUNCATE TABLE wikistats ;
20 -LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
21 - INTO TABLE wikistats
22 - FIELDS TERMINATED BY ','
23 - OPTIONALLY ENCLOSED BY '"'
24 - (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
25 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
26 -
27 -TRUNCATE TABLE page_views ;
28 -LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
29 - INTO TABLE page_views
30 - FIELDS TERMINATED BY ','
31 - OPTIONALLY ENCLOSED BY '"'
32 - (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
33 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
34 -
35 -TRUNCATE TABLE language_names ;
36 -LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
37 - INTO TABLE language_names
38 - FIELDS TERMINATED BY ','
39 - OPTIONALLY ENCLOSED BY '"'
40 - (report_language,language_code,language_name) ;
41 -
42 -TRUNCATE TABLE binaries ;
43 -LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv'
44 - INTO TABLE binaries
45 - FIELDS TERMINATED BY ','
46 - OPTIONALLY ENCLOSED BY '"'
47 - (project_code,language_code,@date,extension,binaries)
48 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
49 -
50 -TRUNCATE TABLE offline ;
51 -LOAD DATA LOCAL INFILE 'analytics_in_offline.csv'
52 - INTO TABLE offline
53 - FIELDS TERMINATED BY ','
54 - OPTIONALLY ENCLOSED BY '"'
55 - (@date, readers)
56 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
Index: trunk/wikistats/analytics/AnalyticsPrepPageViews.pl
@@ -1,11 +0,0 @@
2 -#!/usr/bin/perl
3 -
4 -# This file is just a place holder (alias)
5 -# Page views per day/week/month/etc, per project, per language, normalized and not normalized are collected by
6 -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl
7 -#
8 -# Added May 2011:
9 -# For analytics database one file is written for all projects and languages combined,
10 -# with per month, not normalized and normalized page view counts in one row.
11 -# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl
12 -# and needs to be copied to analytics folder later
\ No newline at end of file
Index: trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt
@@ -1,181 +0,0 @@
2 -
3 -
4 -
5 -
6 -
7 -DROP DATABASE IF EXISTS `analytics` ;
8 -
9 -CREATE DATABASE `analytics` ;
10 -
11 -USE `analytics` ;
12 -
13 -CREATE TABLE `comscore` (
14 - `date` date NOT NULL,
15 - `country_code` varchar (3),
16 - `region_code` varchar (3),
17 - `web_property` varchar (20),
18 - `project_code` varchar (10),
19 - `reach` decimal (4,1) DEFAULT NULL,
20 - `visitors` decimal (15) DEFAULT NULL,
21 - PRIMARY KEY (date,country_code,region_code,project_code,web_property),
22 - KEY (`country_code`)
23 -) ;
24 -
25 -CREATE TABLE `comscore_regions` (
26 - `region_code` varchar (2),
27 - `report_language` varchar (10),
28 - `region_name` varchar (18),
29 - PRIMARY KEY (region_code,report_language)
30 -) ;
31 -
32 -CREATE TABLE `wikistats` (
33 - `date` date NOT NULL,
34 - `project_code` varchar (10),
35 - `language_code` varchar (15),
36 - `editors_all_time` int (10) DEFAULT NULL,
37 - `editors_new` int (7) DEFAULT NULL,
38 - `editors_ge_5` int (7) DEFAULT NULL,
39 - `editors_ge_25` int (7) DEFAULT NULL,
40 - `editors_ge_100` int (7) DEFAULT NULL,
41 - `articles` int (12) DEFAULT NULL,
42 - `articles_new_per_day` int (9) DEFAULT NULL,
43 - `articles_over_bytes_500` int (12) DEFAULT NULL,
44 - `articles_over_bytes_2000` int (12) DEFAULT NULL,
45 - `edits_per_article` decimal (9,1) DEFAULT NULL,
46 - `bytes_per_article` decimal (9,1) DEFAULT NULL,
47 - `edits` int (12) DEFAULT NULL,
48 - `size_in_bytes` int (15) DEFAULT NULL,
49 - `size_in_words` int (15) DEFAULT NULL,
50 - `links_internal` int (15) DEFAULT NULL,
51 - `links_interwiki` int (15) DEFAULT NULL,
52 - `links_image` int (15) DEFAULT NULL,
53 - `links_external` int (15) DEFAULT NULL,
54 - `redirects` int (15) DEFAULT NULL,
55 - PRIMARY KEY (date,project_code,language_code)
56 -) ;
57 -
58 -CREATE TABLE `page_views` (
59 - `date` date NOT NULL,
60 - `project_code` char (2),
61 - `language_code` char (15),
62 - `views_non_mobile_raw` bigint (15),
63 - `views_mobile_raw` bigint (15),
64 - `views_non_mobile_normalized` bigint (15),
65 - `views_mobile_normalized` bigint (15),
66 - `views_raw` bigint (15),
67 - `views_normalized` bigint (15),
68 - PRIMARY KEY (date,project_code,language_code),
69 - KEY (date,language_code,project_code)
70 -) ;
71 -
72 -CREATE TABLE `language_names` (
73 - `report_language` varchar (15),
74 - `language_code` varchar (15),
75 - `language_name` varchar (50),
76 - PRIMARY KEY (report_language,language_code)
77 -) ;
78 -
79 -CREATE TABLE `binaries` (
80 - `date` date NOT NULL,
81 - `project_code` char (2),
82 - `language_code` char (15),
83 - `extension` varchar (10),
84 - `binaries` bigint (15),
85 - PRIMARY KEY (date,project_code,language_code,extension)
86 -) ;
87 -
88 -CREATE TABLE `offline` (
89 - `date` date NOT NULL,
90 - `source` varchar (255),
91 - `readers` bigint (12),
92 - PRIMARY KEY (date,source)
93 -) ;
94 -
95 -
96 -
97 -LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv'
98 - INTO TABLE comscore
99 - FIELDS TERMINATED BY ','
100 - OPTIONALLY ENCLOSED BY '"'
101 - (@date,country_code,region_code,web_property,project_code,reach,visitors)
102 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
103 -
104 -LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv'
105 - INTO TABLE comscore_regions
106 - FIELDS TERMINATED BY ','
107 - OPTIONALLY ENCLOSED BY '"'
108 - (report_language,region_code,region_name) ;
109 -
110 -LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv'
111 - INTO TABLE wikistats
112 - FIELDS TERMINATED BY ','
113 - OPTIONALLY ENCLOSED BY '"'
114 - (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
115 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
116 -
117 -LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv'
118 - INTO TABLE page_views
119 - FIELDS TERMINATED BY ','
120 - OPTIONALLY ENCLOSED BY '"'
121 - (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
122 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
123 -
124 -
125 -LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv'
126 - INTO TABLE language_names
127 - FIELDS TERMINATED BY ','
128 - OPTIONALLY ENCLOSED BY '"'
129 - (report_language,language_code,language_name) ;
130 -
131 -LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv'
132 - INTO TABLE binaries
133 - FIELDS TERMINATED BY ','
134 - OPTIONALLY ENCLOSED BY '"'
135 - (project_code,language_code,@date,extension,binaries)
136 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
137 -
138 -LOAD DATA LOCAL INFILE 'analytics_in_offline.csv'
139 - INTO TABLE offline
140 - FIELDS TERMINATED BY ','
141 - OPTIONALLY ENCLOSED BY '"'
142 - (@date,readers)
143 - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
144 -
145 -
146 - SELECT * FROM offline ;
147 -
148 -
Index: trunk/wikistats/analytics/analytics_generate_csv_files.sh
@@ -1,47 +0,0 @@
2 -#!/bin/sh
3 -
4 -# Prepare several csv files, ready for importing into analytics database
5 -# All generated files have _in_ in name signalling these contain data ready for importing into database
6 -# One input record corresponds to one database record
7 -
8 -ulimit -v 8000000
9 -
10 -clear
11 -cd /a/analytics
12 -
13 -
14 -# AnalyticsPrepBinariesData.pl reads counts for binaries which were generated by wikistats
15 -# and which reside in /a/wikistats/csv_[project code]/StatisticsPerBinariesExtension.csv
16 -# It filters and reorganizes data and produces analytics_in_binaries.csv
17 -# Output csv contains: project code, language, month, extension name, count
18 -
19 -perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/
20 -
21 -# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore csv files (with data for last 14 months)
22 -# parses those csv files, adds/replaces data from these csv files into master files (containing full history)
23 -# and generates input csv file analytics_in_comscore.csv ready for importing into database
24 -#
25 -# note : these csv files were manually downloaded from http://mymetrix.comscore.com/app/report.aspx
26 -# and given more descriptive names, script finds newest files based on partial name search
27 -#
28 -# -r replace (default is add only)
29 -# -i input folder, contains manually downloaded csv files from comScore (or xls files manually converted to csv)
30 -# -m master files with full history
31 -# -o output csv file, with reach per region, UV's per region and UV's per top web property, ready for import into database
32 -
33 -perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics
34 -
35 -# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv files from wikistats process
36 -# It filters and reorganizes data and produces analytics_in_wikistats.csv, ready for import into analytics database
37 -
38 -perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics
39 -
40 -# analytics_in_page_views.csv is written daily as part of WikiCountsSummarizeProjectCounts.pl
41 -# part of (/home/ezachte/pageviews_monthly.sh job)
42 -# which processes hourly projectcounts files (per wiki page view totals for one hour) from http://dammit.lt/wikistats
43 -# and generates several files on different aggregation levels
44 -# only action here is to copy data to this folder to have everything in one place
45 -# note: unlike folder name suggests this file contains stats for all projects
46 -
47 -cp /a/wikistats/csv_wp/analytics_in_page_views.csv .
48 -
Index: trunk/wikistats/analytics/analytics_upd.sh
@@ -1,6 +0,0 @@
2 -clear
3 -cd /a/analytics
4 -
5 -./analytics_generate_csv_files.sh
6 -
7 -mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt
Index: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
@@ -1,334 +0,0 @@
2 -#!/usr/local/bin/perl
3 -
4 -# Copyright (C) 2011 Wikimedia Foundation
5 -# This program is free software; you can redistribute it and/or
6 -# modify it under the terms of the GNU General Public License version 2
7 -# as published by the Free Software Foundation.
8 -# This program is distributed in the hope that it will be useful,
9 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
10 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 -# See the GNU General Public License for more details, at
12 -# http://www.fsf.org/licenses/gpl.html
13 -
14 -# Author:
15 -# Erik Zachte, email ezachte@wikimedia.org
16 -# loosely based on predecessor
17 -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
18 -
19 -# Functionality:
20 -# tba
21 -
22 -# Parameters:
23 -# tba
24 -
25 -# Output:
26 -# updated csv file for import in MySQL
27 -
28 -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
29 -
30 - use Getopt::Std ;
31 -
32 - $true = 1 ;
33 - $false = 0 ;
34 -
35 - @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
36 -
37 - $file_csv_monthly_data = "StatisticsMonthly.csv" ;
38 - $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
39 - $file_csv_analytics_in = "analytics_in_wikistats.csv" ;
40 -
41 - &ParseArguments ;
42 - &ReadStatisticsMonthly ;
43 - &FindLargestWikis ;
44 - &WriteMonthlyData ;
45 -
46 - print "\nReady\n\n" ;
47 - exit ;
48 -
49 -sub ParseArguments
50 -{
51 - print "ParseArguments\n" ;
52 - my (@options, $arguments) ;
53 -
54 - getopt ("io", \%options) ;
55 -
56 - foreach $arg (sort keys %options)
57 - { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
58 - print ("\nArguments\n$arguments\n") ;
59 -
60 - if (! -d '/mnt/') # EZ test
61 - {
62 - $path_in = "c:/\@ wikimedia/# out bayes" ;
63 - $path_out = "c:/MySQL/analytics" ;
64 - }
65 - else
66 - {
67 - die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
68 - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
69 -
70 - $path_in = $options {"i"} ;
71 - $path_out = $options {"o"} ;
72 - }
73 -
74 - die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
75 - die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
76 -
77 - print "Input folder: $path_in\n" ;
78 - print "Output folder: $path_out\n\n" ;
79 -
80 - $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
81 -}
82 -
83 -sub ReadStatisticsMonthly
84 -{
85 - print "ReadStatisticsMonthly\n" ;
86 - &ReadStatisticsMonthlyForProject ("wb") ;
87 - &ReadStatisticsMonthlyForProject ("wk") ;
88 - &ReadStatisticsMonthlyForProject ("wn") ;
89 - &ReadStatisticsMonthlyForProject ("wp") ;
90 - &ReadStatisticsMonthlyForProject ("wq") ;
91 - &ReadStatisticsMonthlyForProject ("ws") ;
92 - &ReadStatisticsMonthlyForProject ("wv") ;
93 - &ReadStatisticsMonthlyForProject ("wx") ;
94 -
95 -# &ReadStatisticsPerBinariesExtensionCommons ;
96 -}
97 -
98 -sub ReadStatisticsMonthlyForProject
99 -{
100 - my $project = shift;
101 - $all_projects = "*" ;
102 -
103 - my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
104 - my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
105 -
106 - if (! -e $file_csv_in_1)
107 - { &Abort ("Input file '$file_csv_in_1' not found") ; }
108 - if (! -e $file_csv_in_2)
109 - { &Abort ("Input file '$file_csv_in_2' not found") ; }
110 -
111 - my $yyyymm ;
112 -
113 - print "Read '$file_csv_in_1'\n" ;
114 - open CSV_IN, '<', $file_csv_in_1 ;
115 - while ($line = <CSV_IN>)
116 - {
117 - chomp $line ;
118 - ($language,$date,$counts) = split (',', $line, 3) ;
119 - @fields = split (',', $counts) ;
120 -
121 - next if ! &AcceptWiki ($project,$language) ;
122 -
123 - ($month,$day,$year) = split ('\/', $date) ;
124 - $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
125 -
126 - foreach $field (@fields)
127 - {
128 - if ($field eq '-')
129 - { $field = 0 ; }
130 - }
131 -
132 - $data = $fields [0] . ',' . # contributors all time
133 - $fields [1] . ',' . # new contributors
134 - 'data2,' . # place holder for more data, to be inserted later
135 - $fields [4] . ',' . # articles
136 - $fields [6] . ',' . # articles new per day
137 - $fields [9] . ',' . # larger than 0.5 kB
138 - $fields [10] . ',' . # larger than 2.0 kB
139 - $fields [7] . ',' . # mean edits per article
140 - $fields [8] . ',' . # mean bytes per article
141 - $fields [11] . ',' . # edits
142 - $fields [12] . ',' . # size in bytes
143 - $fields [13] . ',' . # size in words
144 - $fields [14] . ',' . # links internal
145 - $fields [15] . ',' . # links interwiki
146 - $fields [16] . ',' . # links images
147 - $fields [17] . ',' . # links external
148 - $fields [18] ; # redirects
149 -
150 - $data1 {"$project,$language,$yyyymm"} = $data ;
151 - }
152 - close CSV_IN ;
153 -
154 - # now read (very) active editors from newer more accurate file (split data for reg users and bots, unlike StatisticsMonthly.csv)
155 -
156 - print "Read '$file_csv_in_2'\n" ;
157 - open CSV_IN, '<', $file_csv_in_2 ;
158 - while ($line = <CSV_IN>)
159 - {
160 - chomp $line ;
161 - ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
162 -
163 - next if ! &AcceptWiki ($project,$language) ;
164 -
165 - if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot
166 - if ($group ne "A") { next ; } # A: articles, T: talk pages, O: other namespaces
167 -
168 - ($month,$day,$year) = split ('\/', $date) ;
169 - $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
170 - $months {$yyyymm} ++ ;
171 -# print "YYYYMM $yyyymm\n" ;
172 -
173 - # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm
174 - # count user with over x edits
175 - # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 1000xSQRT(10), etc
176 - # @thresholds = (1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000) ;
177 - $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ;
178 - $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ;
179 - $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
180 - $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
181 -
182 - $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ;
183 - $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ;
184 - $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ;
185 -
186 - # prep string with right amount of comma's
187 - if ($data2_default eq '')
188 - {
189 - $data2_default = $data2 {"$project,$language,$yyyymm"} ;
190 - $data2_default =~ s/[^,]+/0/g ;
191 - }
192 - }
193 - close CSV_IN ;
194 -}
195 -
196 -#sub ReadStatisticsPerBinariesExtensionCommons
197 -#{
198 -# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
199 -# my $mmax = -1 ;
200 -
201 -# if (! -e $file_csv_in)
202 -# { &Abort ("Input file '$file_csv_in' not found") ; }
203 -
204 -# print "Read '$file_csv_in'\n" ;
205 -# open CSV_IN, '<', $file_csv_in ;
206 -# while ($line = <CSV_IN>)
207 -# {
208 -# chomp $line ;
209 -# ($language,$date,$counts) = split (',', $line, 3) ;
210 -
211 -# if ($language ne "commons") { next ; }
212 -
213 -# if ($date eq "00/0000")
214 -# {
215 -# @fields = split (',', $counts) ;
216 -# $field_ndx = 0 ;
217 -# foreach $field (@fields)
218 -# {
219 -# $ext_cnt {-1}{$field_ndx} = $field ;
220 -# # print "EXT_CNT $field_ndx : $field\n" ;
221 -# $field_ndx ++ ;
222 -# }
223 -# next ;
224 -# }
225 -
226 -# ($month,$year) = split ('\/', $date) ;
227 -# my $m = &months_since_2000_01 ($year,$month) ;
228 -# next if $m < $m_start ;
229 -
230 -# if ($m > $mmax)
231 -# { $mmax = $m ; }
232 -
233 -# @fields = split (',', $counts) ;
234 -# $field_ndx = 0 ;
235 -# foreach $field (@fields)
236 -# {
237 -# $ext_cnt {$m}{$field_ndx} = $field ;
238 -# $ext_tot {$m} += $field ;
239 -# $field_ndx ++ ;
240 -# }
241 -# }
242 -# close CSV_IN ;
243 -
244 -# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
245 -# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys %ext_cnt_mmax) ;
246 -
247 -# $extcnt = 0 ;
248 -# foreach $extndx (@ext_cnt_mmax)
249 -# {
250 -# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax {$extndx}}\n" ;
251 -# push @extndxs, $extndx ;
252 -# if ($extcnt++ >= 9) { last ; }
253 -# }
254 -#}
255 -
256 -sub FindLargestWikis
257 -{
258 - print "FindLargestWikis\n" ;
259 - print "Largest projects (most accumulated very active editors):\n";
260 - @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ;
261 - $rank = 0 ;
262 - foreach $project_language (@total_edits_ge_100)
263 - {
264 - $largest_projects {$project_language} = $rank++ ;
265 - print "$project_language," ;
266 - last if $rank > 10 ;
267 - }
268 - print "\n\n" ;
269 -
270 - foreach $yyyymm (sort keys %months)
271 - {
272 - next if $yyyymm lt '2011' ;
273 - foreach $project_language (keys %largest_projects)
274 - {
275 - ($project,$language) = split (',', $project_language) ;
276 - if ($data2 {"$project,$language,$yyyymm"} eq '')
277 - {
278 - print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ;
279 - $months {$yyyymm} = 0 ;
280 - }
281 - }
282 - }
283 -}
284 -
285 -sub WriteMonthlyData
286 -{
287 - print "WriteMonthlyData\n" ;
288 - my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
289 - open CSV_OUT, '>', $file_csv_out ;
290 - foreach $project_wiki_month (sort keys %data1)
291 - {
292 - ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ;
293 -
294 - # recent month misses one or more large wikis?
295 - next if $months {$yyyymm} == 0 ;
296 -
297 - $data1 = $data1 {$project_wiki_month} ;
298 - $data2 = $data2 {$project_wiki_month} ;
299 - if ($data2 eq '')
300 - {
301 - print "Editor data missing for $project_wiki_month\n" ;
302 - $data2 = $data2_default ;
303 - }
304 - $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together
305 - print CSV_OUT "$project_wiki_month,$data1\n" ;
306 - }
307 - $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ;
308 - $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ;
309 - $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ;
310 - close CSV_OUT ;
311 -}
312 -
313 -sub AcceptWiki
314 -{
315 - my ($project,$language) = @_ ;
316 -
317 - return $false if $language eq 'commons' and $project ne 'wx' ; # commons also in wikipedia csv files (bug, hard to cleanup, just skip)
318 - return $false if $language eq 'sr' and $project eq 'wn' ; # ignore insane bot spam on
319 - return $false if $language =~ /mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i ;
320 -
321 - return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) but site not, tlh=Klingon, ru-sib=Siberian
322 - return $false if $project eq 'wk' and ($language eq "als" or $language eq "tlh") ;
323 -
324 - return $true ;
325 -}
326 -
327 -sub Abort
328 -{
329 - my $msg = shift ;
330 - print "$msg\nExecution aborted." ;
331 - # to do: log also to file
332 - exit ;
333 -}
334 -
335 -
Index: trunk/wikistats/analytics/analytics_new.sh
@@ -1,5 +0,0 @@
2 -clear
3 -cd /a/analytics
4 -# rm *test*.csv
5 -# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt
6 - mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt
Index: trunk/wikistats/analytics/_readme.txt
@@ -1,21 +0,0 @@
2 -This folder contains bash and perl files to create and fill database 'analytics', a.o. for monthly dashboard.
3 -
4 -== analytics_new.sh ==
5 -Defines the database and tables and loads data from existing csv files.
6 -It executes SQL from analytics_create_and_load_from_csv.txt
7 -
8 -== analytics_upd.sh ==
9 -Prepares new csv files (delegated to analytics_generate_csv_files.sh),
10 -and empties/reloads all tables for which csv files are in this folder.
11 -It executes SQL from analytics_refresh_from_csv.txt
12 -
13 -== CSV files ==
14 -CSV files and where they are generated:
15 -
16 -analytics_in_binaries.csv <- AnalyticsPrepBinariesData.pl
17 -analytics_in_comscore.csv <- AnalyticsPrepComscoreData.pl
18 -analytics_in_comscore_regions.csv <- manual
19 -analytics_in_language_names.csv
20 -analytics_in_offline.csv <- manual
21 -analytics_in_page_views.csv <- /home/ezachte/wikistats/pageviews_monthly.sh (file copy after that)
22 -analytics_in_wikistats.csv <- AnalyticsPrepWikiCountsOutput.pl
Index: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
@@ -1,564 +0,0 @@
2 -#!/usr/bin/perl
3 -
4 -# Copyright (C) 2011 Wikimedia Foundation
5 -# This program is free software; you can redistribute it and/or
6 -# modify it under the terms of the GNU General Public License version 2
7 -# as published by the Free Software Foundation.
8 -# This program is distributed in the hope that it will be useful,
9 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
10 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 -# See the GNU General Public License for more details, at
12 -# http://www.fsf.org/licenses/gpl.html
13 -
14 -# Author:
15 -# Erik Zachte, email ezachte@wikimedia.org
16 -
17 -# Functionality:
18 -# comScore data can be downloaded as csv file, which each contain 14 months history
19 -# This script uses these files to update 'master' csv files which contain all known history
20 -# Note: only entities which are already in master file will be updated!
21 -# Then it merges these master files into one csv file which can be loaded into analytics database
22 -# Data are: reach by region, unique visitors by region, unique visitors by web property
23 -
24 -# Parameters:
25 -# -m (required) folder with 'master' csv files (files with all known history)
26 -# -i (required) folder with 'update' csv files (files with latest 14 months history, produced by comScore)
27 -
28 -# Output:
29 -# updated master csv files + merged and formatted csv for import in MySQL
30 -
31 -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
32 -
33 - use Getopt::Std ;
34 - use Cwd;
35 -
36 - my $options ;
37 - getopt ("imo", \%options) ;
38 -
39 - $true = 1 ;
40 - $false = 0 ;
41 -
42 - $script_name = "AnalyticsPrepComscoreData.pl" ;
43 - $script_version = "0.31" ;
44 -
45 -# EZ test only
46 -# $source = "comscore" ;
47 -# $server = "ez_test" ;
48 -# $generated = "2011-05-06 00:00:00" ;
49 -# $user = "ezachte" ;
50 -
51 - $dir_in = $options {"i"} ;
52 - $dir_upd = $options {"m"} ;
53 - $dir_out = $options {"o"} ;
54 - $mode = 'add';
55 - if (defined $options {'r'})
56 - { $mode = 'replace'; }
57 -
58 - print "Mode is $mode (specify '-r' for replace)\n\n";
59 -
60 - if (! -d "/home/") # EZ test machine
61 - {
62 - $dir_in = "C:/@ Wikimedia/@ Report Card/Data" ;
63 - $dir_upd = "C:/MySQL/analytics" ;
64 - $dir_out = "C:/MySQL/analytics" ;
65 - $mode = 'replace' ;
66 - }
67 -
68 - if ($dir_in eq '')
69 - { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; }
70 - if ($dir_upd eq '')
71 - { Abort ("Specify folder for master files (full history) as '-m folder'") ; }
72 - if ($dir_out eq '')
73 - { Abort ("Specify folder for output file '-o folder'") ; }
74 -
75 - $file_comscore_reach_master = "history_comscore_reach_regions.csv" ;
76 - $file_comscore_reach_update = "*reach*by*region*csv" ;
77 - $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ;
78 - $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
79 - $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
80 - $file_comscore_uv_property_update = "*UV*trend*csv" ;
81 -
82 - $layout_csv_reach = 1 ;
83 - $layout_csv_regions = 2 ;
84 - $layout_csv_properties = 3 ;
85 -
86 - print "Directories:\n" .
87 - "Input (new comScore data): '$dir_in'\n".
88 - "Master files (full history): '$dir_upd'\n" .
89 - "Output (database feed): '$dir_out'\n\n" ;
90 -
91 - %region_codes = (
92 - "Europe"=>"EU",
93 - "North America"=>"NA",
94 - "Latin America"=>"LA",
95 - "World-Wide" => "W",
96 - "Middle East - Africa" => "MA",
97 - "Asia Pacific"=> "AS",
98 - "United States" => "US",
99 - "India" => "I",
100 - "China" => "C"
101 - ) ;
102 -
103 - foreach $region_name (keys %region_codes)
104 - { $region_names {$region_codes {$region_name}} = $region_name ; }
105 -
106 - @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
107 -
108 - &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
109 - %reach_region_code = %data ;
110 -
111 - &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
112 - %visitors_region_code = %data ;
113 -
114 - &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
115 - %visitors_web_property = %data ;
116 -
117 - &WriteDataAnalytics ;
118 -
119 - print "\nReady\n\n" ;
120 - exit ;
121 -
122 -sub ReadMasterComscoreDataReachPerRegion
123 -{
124 - my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ;
125 -
126 - print "ReadMasterComscoreDataReachPerRegion\n\n" ;
127 -
128 - undef %months ;
129 - undef %data ;
130 - undef @regions ;
131 -
132 - open IN, '<', "$dir_upd/$file_comscore_master" ;
133 -
134 - $lines = 0 ;
135 - while ($line = <IN>)
136 - {
137 - chomp $line ;
138 - $line =~ s/\r//g ;
139 -
140 - ($yyyymm,@data) = split (',', $line) ;
141 -
142 - if ($lines++ == 0)
143 - {
144 - @regions = @data ;
145 - print "Regions found: " . (join ',', @regions) . "\n";
146 - next ;
147 - }
148 -
149 - $field_ndx = 0 ;
150 - foreach (@data)
151 - {
152 - $region = $regions [$field_ndx] ;
153 - $region_code = $region_codes {$region} ;
154 -
155 - $data = $data [$field_ndx] ;
156 - if ($data eq '')
157 - { $data = '0' ; }
158 - $months {$yyyymm} ++ ;
159 - $data {"$yyyymm,$region_code"} = $data ;
160 - # print "Old data $yyyymm,$region_code = $data\n" ;
161 - $field_ndx++ ;
162 - }
163 - }
164 - close IN ;
165 -
166 - my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ;
167 - return if ! $updates_found ;
168 -
169 - rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
170 - open OUT, '>', "$dir_upd/$file_comscore_master" ;
171 -
172 - $line_out = "yyyymm" ;
173 - foreach $region_name (@regions)
174 - { $line_out .= ",$region_name" ; }
175 - print OUT "$line_out" ;
176 -
177 - foreach $yyyymm (sort {$b cmp $a} keys %months)
178 - {
179 - $line_out = "\n$yyyymm" ;
180 - foreach $region_name (@regions)
181 - {
182 - $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
183 - $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
184 - }
185 - print OUT "$line_out" ;
186 - }
187 -
188 - close OUT ;
189 -}
190 -
191 -sub ReadMasterComscoreDataVisitorsPerRegion
192 -{
193 - my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
194 -
195 - print "ReadMasterComscoreDataVisitorsPerRegion\n\n";
196 -
197 - undef %months ;
198 - undef %data ;
199 - undef @regions ;
200 -
201 - open IN, '<', "$dir_upd/$file_comscore_master" ;
202 -
203 - $lines = 0 ;
204 - $metric = 'unique_visitors' ;
205 - while ($line = <IN>)
206 - {
207 - chomp $line ;
208 - $line =~ s/\r//g ;
209 - $line = &GetNumberOnly ($line) ;
210 -
211 - next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ;
212 -
213 - ($yyyymm,@data) = split (',', $line) ;
214 -
215 - if ($lines++ == 0)
216 - {
217 - @regions = @data ;
218 - print "Regions found: " . (join ',', @regions) . "\n";
219 - next ;
220 - }
221 -
222 - $field_ndx = 0 ;
223 - foreach (@data)
224 - {
225 - $region = $regions [$field_ndx] ;
226 - $region_code = $region_codes {$region} ;
227 -
228 - $data = $data [$field_ndx] ;
229 - if ($data eq '')
230 - { $data = '0' ; }
231 -
232 - # print "Old data $yyyymm,$region = $data\n" ;
233 -
234 - $months {$yyyymm} ++ ;
235 - $data {"$yyyymm,$region_code"} = $data ;
236 -
237 - $field_ndx++ ;
238 - }
239 - }
240 - close IN ;
241 -
242 - my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ;
243 - return if ! $updates_found ;
244 -
245 - rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
246 - open OUT, '>', "$dir_upd/$file_comscore_master" ;
247 -
248 - $line_out = "yyyymm" ;
249 - foreach $region_name (@regions)
250 - { $line_out .= ",$region_name" ; }
251 - print OUT "$line_out" ;
252 -
253 - foreach $yyyymm (sort {$b cmp $a} keys %months)
254 - {
255 - $line_out = "\n$yyyymm" ;
256 - foreach $region_name (@regions)
257 - {
258 - $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
259 - $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
260 - }
261 - print OUT "$line_out" ;
262 - }
263 -
264 - close OUT ;
265 -}
266 -
267 -sub ReadMasterComscoreDataVisitorsPerProperty
268 -{
269 - my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
270 -
271 - print "ReadMasterComscoreDataVisitorsPerProperty\n\n";
272 -
273 - undef %months ;
274 - undef %data ;
275 - undef @properties ;
276 -
277 - open IN, '<', "$dir_upd/$file_comscore_master" ;
278 -
279 - $lines = 0 ;
280 - $metric = 'unique_visitors' ;
281 - while ($line = <IN>)
282 - {
283 - chomp $line ;
284 - $line =~ s/\r//g ;
285 -
286 - ($yyyymm,@data) = split (',', $line) ;
287 - if ($lines++ == 0)
288 - { @properties = @data ; next ; }
289 -
290 - $field_ndx = 0 ;
291 - foreach (@data)
292 - {
293 - $property = $properties [$field_ndx] ;
294 - $property =~ s/.*Yahoo.*/Yahoo/ ;
295 - $data = $data [$field_ndx] ;
296 - if ($data eq '')
297 - { $data = '0' ; }
298 -
299 - # print "Old data $yyyymm,$property = $data\n" ;
300 -
301 - $months {$yyyymm} ++ ;
302 - $data {"$yyyymm,$property"} = $data ;
303 -
304 - $field_ndx++ ;
305 - }
306 - }
307 - close IN ;
308 -
309 - my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ;
310 - return if ! $updates_found ;
311 -
312 - rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
313 - open OUT, '>', "$dir_upd/$file_comscore_master" ;
314 -
315 - $line_out = "yyyymm" ;
316 - foreach $property (@properties)
317 - { $line_out .= ",$property" ; }
318 - print OUT "$line_out" ;
319 -
320 - foreach $yyyymm (sort {$b cmp $a} keys %months)
321 - {
322 - $line_out = "\n$yyyymm" ;
323 - foreach $property (@properties)
324 - {
325 - $yyyymm_property = "$yyyymm,$property" ;
326 - $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
327 - }
328 - print OUT "$line_out" ;
329 - }
330 -
331 - close OUT ;
332 -}
333 -
334 -sub UpdateMasterFileFromRecentComscoreData
335 -{
336 - my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ;
337 -
338 - print "UpdateMasterFileFromRecentComscoreData\n\n";
339 -
340 - undef %white_list ;
341 - undef %not_white_listed ;
342 -
343 - print "White list: ". (join (',', @white_list)) . "\n\n";
344 -
345 - foreach $id (@white_list)
346 - { $white_list {$id} = $true ; }
347 -
348 - if (! -e "$dir_upd/$file_comscore_master")
349 - { Abort ("File $file_comscore_master not found!") ; }
350 -
351 - $age_all = -M "$dir_upd/$file_comscore_master" ;
352 - print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;
353 -
354 - my $cwd = getcwd ;
355 - chdir $dir_in ;
356 -
357 - @files = glob($file_comscore_updates) ;
358 - $min_age_upd = 999999 ;
359 - $file_comscore_updates_latest = '' ;
360 - foreach $file (@files)
361 - {
362 - $age = -M $file ;
363 - if ($age < $min_age_upd)
364 - {
365 - $min_age_upd = $age ;
366 - $file_comscore_updates_latest = $file ;
367 - }
368 - }
369 - print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;
370 -
371 - if ($min_age_upd == 999999)
372 - {
373 - print "No valid update file found. Nothing to update." ;
374 - return ;
375 - }
376 -
377 - #if ($age_all > $min_age_upd)
378 - #{
379 - # print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
380 - # return ;
381 - #}
382 -
383 - my $updates_found = $false ;
384 -
385 - open CSV, '<', $file_comscore_updates_latest ;
386 - binmode CSV ;
387 - while ($line = <CSV>)
388 - {
389 - chomp $line ;
390 - $line =~ s/\r//g ;
391 - $line = &GetNumberOnly ($line) ;
392 -
393 - if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
394 - {
395 - if ($layout_csv == $layout_csv_properties)
396 - { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
397 - else
398 - { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files
399 -
400 - @months = &mmm_yyyy2yyyy_mm (@months) ;
401 - }
402 -
403 - if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/))
404 - {
405 - if ($layout_csv == $layout_csv_properties)
406 - {
407 - ($index,$dummy,$property,@data) = split (',', $line) ;
408 - $property =~ s/^\s+// ;
409 - $property =~ s/\s+$// ;
410 -
411 - $property =~ s/.*Total Internet.*/Total Internet/i ;
412 - $property =~ s/.*Google.*/Google/i ;
413 - $property =~ s/.*Microsoft.*/Microsoft/i ;
414 - $property =~ s/.*FACEBOOK.*/Facebook/i ;
415 - $property =~ s/.*Yahoo.*/Yahoo/i ;
416 - $property =~ s/.*Amazon.*/Amazon/i ;
417 - $property =~ s/.*Apple.*/Apple/i ;
418 - $property =~ s/.*AOL.*/AOL/i ;
419 - $property =~ s/.*Wikimedia.*/Wikimedia/i ;
420 - $property =~ s/.*Tencent.*/Tencent/i ;
421 - $property =~ s/.*Baidu.*/Baidu/i ;
422 - $property =~ s/.*CBS.*/CBS/i ;
423 -
424 - if (! $white_list {$property})
425 - {
426 - $not_white_listed {$property}++ ;
427 - next ;
428 - }
429 -
430 - $id = $property ;
431 - }
432 - else
433 - {
434 - ($index,$region,@data) = split (',', $line) ;
435 - $region =~ s/^\s+// ;
436 - $region =~ s/\s+$// ;
437 -
438 - if (! $white_list {$region})
439 - {
440 - $not_white_listed {$region}++ ;
441 - next ;
442 - }
443 -
444 - $id = $region_codes {$region} ;
445 - }
446 -
447 - for ($m = 0 ; $m <= $#months ; $m++)
448 - {
449 - $yyyymm = $months [$m] ;
450 - $months {$yyyymm} ++ ;
451 - $yyyymm_id = "$yyyymm,$id" ;
452 - $data = $data [$m] * $multiplier ;
453 -
454 - if ($mode eq 'add')
455 - {
456 - if (! defined $data {$yyyymm_id})
457 - {
458 - $updates_found = $true ;
459 - print "New data found: $yyyymm_id = $data\n" ;
460 - $data {$yyyymm_id} = $data ;
461 - }
462 - }
463 - else
464 - {
465 - $updates_found = $true ;
466 - print "Data found: $yyyymm_id = $data\n" ;
467 - $data {$yyyymm_id} = $data ;
468 - }
469 - }
470 - }
471 - }
472 -
473 - $entities_not_white_listed = join (', ', sort keys %not_white_listed) ;
474 - if ($entities_not_white_listed ne '')
475 - { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; }
476 -
477 - if (! $updates_found)
478 - { print "No new updates found\n" ; }
479 - else
480 - { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; }
481 -
482 - return ($updates_found) ;
483 -}
484 -
485 -sub WriteDataAnalytics
486 -{
487 - print "WriteDataAnalytics\n\n";
488 -
489 - open OUT, '>', "$dir_out/analytics_in_comscore.csv" ;
490 -
491 - $metric = 'unique_visitors' ;
492 - foreach $yyyymm (sort keys %months)
493 - {
494 - # store meta data elsewhere
495 - # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ;
496 - foreach $region_code (sort values %region_codes)
497 - {
498 - $country_code = '-' ;
499 - $property = '-' ;
500 - $project = '-' ;
501 - $reach = $reach_region_code {"$yyyymm,$region_code"} ;
502 - $visitors = $visitors_region_code {"$yyyymm,$region_code"} ;
503 -
504 - if (! defined $reach) { $reach = -1 ; }
505 - if (! defined $visitors) { $visitors = -1 ; }
506 -
507 - next if $reach == -1 and $visitors == -1 ;
508 -
509 - $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
510 - print OUT $line ;
511 - print $line ;
512 - }
513 -
514 - foreach $property (sort @properties)
515 - {
516 - $country_code = '-' ;
517 - $region_code = '-' ;
518 - $project = '-' ;
519 - $reach = '-1' ;
520 - $visitors = $visitors_web_property {"$yyyymm,$property"} ;
521 -
522 - next if ! defined $visitors ;
523 -
524 - $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
525 - print OUT $line ;
526 - # print $line ;
527 - }
528 - }
529 -}
530 -
531 -sub GetNumberOnly
532 -{
533 - my $line = shift ;
534 - $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove comma's inside double quotes
535 - $line =~ s/"//g ;
536 - return $line ;
537 -}
538 -
539 -sub mmm_yyyy2yyyy_mm
540 -{
541 - my @months = @_ ;
542 - my ($m) ;
543 - # Jan -> 01, etc
544 - foreach $month (@months)
545 - {
546 - my ($mmm,$yyyy) = split ('-', $month) ;
547 - for ($m = 0 ; $m <= $#months_short ; $m++)
548 - {
549 - if ($mmm eq $months_short [$m])
550 - {
551 - $month = "$yyyy-" . sprintf ("%02d", $m+1) ;
552 - last ;
553 - }
554 - }
555 - }
556 - return @months ;
557 -}
558 -
559 -sub Abort
560 -{
561 - $msg = shift ;
562 -
563 - print "\nAbort, reason: $msg\n\n" ;
564 - exit ;
565 -}
Index: trunk/wikistats/analytics/README
@@ -0,0 +1 @@
 2+The files in this folder have been moved to trunk/extensions/MetricsReporting/import.
Property changes on: trunk/wikistats/analytics/README
___________________________________________________________________
Added: svn:eol-style
13 + native

Status & tagging log