Index: trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl |
— | — | @@ -0,0 +1,162 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# This module prepares a csv file with language names for feed into the analytics database |
| 5 | +# The main work of collecting these names from different sources is done by |
| 6 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm |
| 7 | +# which is part of wikistats reporting phase and once each month updates local master csv files from all sources |
| 8 | + |
| 9 | +# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization |
| 10 | +# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle |
| 11 | + |
| 12 | +# Sources for language names: |
| 13 | +# - php files |
| 14 | +# - translatewiki |
| 15 | +# - English Wikipedia API (interwikilinks) |
| 16 | + |
| 17 | +# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations |
| 18 | + |
| 19 | + use Getopt::Std ; |
| 20 | + |
| 21 | + $true = 1 ; |
| 22 | + $false = 0 ; |
| 23 | + |
| 24 | + $max_language_name = 50 ; # truncate if longer |
| 25 | + |
| 26 | + $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ; |
| 27 | + $file_csv_language_names_wp = "LanguageNamesViaWpEnEdited.csv" ; |
| 28 | + $file_csv_analytics_in = "analytics_in_language_names.csv" ; |
| 29 | + |
| 30 | + $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting |
| 31 | + $languages_filter = "de|en" ; |
| 32 | + foreach $language (split '\|', $languages_filter) |
| 33 | + { $include_target_language {$language} = $true ; } |
| 34 | + |
| 35 | + &ParseArguments ; |
| 36 | + &ReadCsvFiles ; |
| 37 | + &WriteCsvFile ; |
| 38 | + |
| 39 | + |
| 40 | +# if ($language eq "id") # swap which file takes precedence |
| 41 | + |
| 42 | + print "\nReady\n\n" ; |
| 43 | + exit ; |
| 44 | + |
| 45 | +sub ParseArguments |
| 46 | +{ |
| 47 | + my (@options, $arguments) ; |
| 48 | + |
| 49 | + getopt ("io", \%options) ; |
| 50 | + |
| 51 | + foreach $arg (sort keys %options) |
| 52 | + { $arguments .= " -$arg " . $options {$arg} . "\n" ; } |
| 53 | + print ("\nArguments\n$arguments\n") ; |
| 54 | + |
| 55 | + $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test |
| 56 | + $options {"o"} = "c:/MySQL/analytics" ; # EZ test |
| 57 | + |
| 58 | + die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; |
| 59 | + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; |
| 60 | + |
| 61 | + ($path_in = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any |
| 62 | + ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any |
| 63 | + |
| 64 | + die "Input folder '$path_in' does not exist" if (! -d $path_in) ; |
| 65 | + die "Output folder '$path_out' does not exist" if (! -d $path_out) ; |
| 66 | + |
| 67 | + print "Input folder: $path_in\n" ; |
| 68 | + print "Output folder: $path_out\n\n" ; |
| 69 | + |
| 70 | + $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ; |
| 71 | + $file_csv_language_names_wp = "$path_in/$file_csv_language_names_wp" ; |
| 72 | + $file_csv_analytics_in = "$path_out/$file_csv_analytics_in" ; |
| 73 | + |
| 74 | + die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ; |
| 75 | + die "Input file '$file_csv_language_names_wp' not found" if (! -e $file_csv_language_names_wp) ; |
| 76 | +} |
| 77 | + |
| 78 | +sub ReadCsvFiles |
| 79 | +{ |
| 80 | + #first read definitions from php message files, then overwrite with definitions from interwiki links when available |
| 81 | + # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable |
| 82 | + |
| 83 | + open CSV_IN, "<", $file_csv_language_names_php ; |
| 84 | + while ($line = <CSV_IN>) |
| 85 | + { |
| 86 | + chomp ($line) ; |
| 87 | + ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ; |
| 88 | + |
| 89 | + next if ! $include_target_language {$target_language} ; |
| 90 | + |
| 91 | + $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database |
| 92 | + } |
| 93 | + close CSV_IN ; |
| 94 | + |
| 95 | + open CSV_IN, "<", $file_csv_language_names_wp ; |
| 96 | + while ($line = <CSV_IN>) |
| 97 | + { |
| 98 | + chomp ($line) ; |
| 99 | + ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ; |
| 100 | + |
| 101 | + next if ! $include_target_language {$target_language} ; |
| 102 | + |
| 103 | + next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ; |
| 104 | + |
| 105 | + # $name_unicode_php = $out_languages {$target_language} {$code} ; # test only |
| 106 | + # $name_unicode_wp = &FormatName ($target_language, $name_unicode) ; # test only |
| 107 | + # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only |
| 108 | + # { print "$name_unicode_php => $name_unicode_wp\n" ; } # test only |
| 109 | + |
| 110 | + $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database |
| 111 | + } |
| 112 | + close CSV_IN ; |
| 113 | +} |
| 114 | + |
| 115 | +sub FormatName |
| 116 | +{ |
| 117 | + my ($target_language, $name_unicode) = @_ ; |
| 118 | + |
| 119 | + $name_unicode2 = $name_unicode ; |
| 120 | + |
| 121 | + if ($target_language eq "de") |
| 122 | + { $name_unicode =~ s/e?\s*\-?sprache//i ; } |
| 123 | + |
| 124 | + if ($target_language =~ /^(?:$languages_force_case_uc)/) |
| 125 | + { $name_unicode = ucfirst $name_unicode ; } |
| 126 | + else |
| 127 | + { $name_unicode = lc $name_unicode ; } |
| 128 | + |
| 129 | +# Test only |
| 130 | +# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2)) |
| 131 | +# { print "$name_unicode2 => $name_unicode\n" ; } |
| 132 | + |
| 133 | + return ($name_unicode) ; |
| 134 | +} |
| 135 | + |
| 136 | +sub WriteCsvFile |
| 137 | +{ |
| 138 | + open CSV_OUT, ">", $file_csv_analytics_in || die ("File '$file_csv_analytics_in' could not be opened") ; |
| 139 | + binmode CSV_OUT ; # force Unix style linebreak \012 |
| 140 | + |
| 141 | + foreach $target_language (sort keys %out_languages) |
| 142 | + { |
| 143 | + print "\nTarget language '$target_language'\n" ; |
| 144 | + %translations = %{$out_languages {$target_language}} ; |
| 145 | + |
| 146 | + foreach $code (sort keys %translations) |
| 147 | + { |
| 148 | + $language_name = $translations{$code} ; |
| 149 | + $language_name2 = substr ($language_name,0,$max_language_name) ; |
| 150 | + |
| 151 | + if ($language_name ne $language_name2) |
| 152 | + { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; } |
| 153 | + |
| 154 | + if ($language_name2 =~ /,/) |
| 155 | + { $language_name2 = "\"$language_name2\"" ; } |
| 156 | + # test only |
| 157 | + print CSV_OUT "$target_language,$code,$language_name2\n" ; |
| 158 | + } |
| 159 | + } |
| 160 | + |
| 161 | + close CSV_OUT ; |
| 162 | +} |
| 163 | + |
Index: trunk/wikistats/analytics/AnalyticsPrepPageViews.pl |
— | — | @@ -0,0 +1,11 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# This file is just a place holder (alias) |
| 5 | +# Page views per day/week/month/etc, per project, per language, nrmalized and not normalized are collected by |
| 6 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl |
| 7 | +# |
| 8 | +# Added May 2001: |
| 9 | +# For analytics database one file is written for all projects and languages combined, |
| 10 | +# with per month, not normalized and normalized page view counts in one row. |
| 11 | +# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl |
| 12 | +# and needs to be copied to analytics folder later |
\ No newline at end of file |
Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt |
— | — | @@ -1,5 +1,25 @@ |
2 | 2 | -- invoke this file with "mysql --user=root --password=[...] < create_and_use_db_analytics.txt" |
3 | 3 | |
| 4 | +-- tables implemented: |
| 5 | +-- comscore |
| 6 | +-- comscore_regions |
| 7 | +-- wikistats |
| 8 | +-- page_views |
| 9 | +-- language names |
| 10 | + |
| 11 | + |
| 12 | +-- more tables needed (O= optional, not needed for report card stage) |
| 13 | +-- project_names |
| 14 | +-- binaries per project, per language, per month, per extension type |
| 15 | +-- O edits per project, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) |
| 16 | +-- O editors per project, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) |
| 17 | + |
| 18 | + |
| 19 | +-- open issues: |
| 20 | +-- only store basic data in database and calculate all aggregates on the fly or do some aggragation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==) |
| 21 | +-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month) |
| 22 | + |
| 23 | + |
4 | 24 | -- Create database and two tables from scratch |
5 | 25 | DROP DATABASE IF EXISTS `analytics` ; |
6 | 26 | CREATE DATABASE `analytics` ; |
— | — | @@ -30,43 +50,63 @@ |
31 | 51 | ) ; |
32 | 52 | |
33 | 53 | CREATE TABLE `comscore_regions` ( |
34 | | - `region_code` char (2), |
35 | | - `language_code` char (10), |
36 | | - `region_name` char (18), |
37 | | - PRIMARY KEY (language_code,region_code) |
| 54 | + `region_code` char (2), |
| 55 | + `target_language` char (10), |
| 56 | + `region_name` char (18), |
| 57 | + PRIMARY KEY (target_language,region_code) |
38 | 58 | ) ; |
39 | 59 | |
40 | 60 | CREATE TABLE `wikistats` ( |
| 61 | + `yyyymm` char (7), |
| 62 | + `project` char (2), |
| 63 | + `language_code` char (15), |
| 64 | + `editors_all_time` decimal (10) DEFAULT '-1', |
| 65 | + `editors_new` decimal (7) DEFAULT '-1', |
| 66 | + `editors_ge_5` decimal (7) DEFAULT '-1', |
| 67 | + `editors_ge_25` decimal (7) DEFAULT '-1', |
| 68 | + `editors_ge_100` decimal (7) DEFAULT '-1', |
| 69 | + `articles` decimal (12) DEFAULT '-1', |
| 70 | + `articles_new_per_day` decimal (9) DEFAULT '-1', |
| 71 | + `articles_over_bytes_500` decimal (12) DEFAULT '-1', |
| 72 | + `articles_over_bytes_2000` decimal (12) DEFAULT '-1', |
| 73 | + `edits_per_article` decimal (9,1) DEFAULT '-1', |
| 74 | + `bytes_per_article` decimal (9,1) DEFAULT '-1', |
| 75 | + `edits` decimal (12) DEFAULT '-1', |
| 76 | + `size_in_bytes` decimal (15) DEFAULT '-1', |
| 77 | + `size_in_words` decimal (15) DEFAULT '-1', |
| 78 | + `links_internal` decimal (15) DEFAULT '-1', |
| 79 | + `links_interwiki` decimal (15) DEFAULT '-1', |
| 80 | + `links_image` decimal (15) DEFAULT '-1', |
| 81 | + `links_external` decimal (15) DEFAULT '-1', |
| 82 | + `redirects` decimal (15) DEFAULT '-1', |
| 83 | + PRIMARY KEY (yyyymm,project,language_code) |
| 84 | +) ; |
| 85 | + |
| 86 | +CREATE TABLE `page_views` ( |
41 | 87 | -- analytics data |
42 | | - `yyyymm` char (7), |
43 | | - `project` char (2), |
44 | | - `wiki` char (15), |
45 | | - `editors_all_time` decimal (10) DEFAULT '-1', |
46 | | - `editors_new` decimal (7) DEFAULT '-1', |
47 | | - `editors_ge_5` decimal (7) DEFAULT '-1', |
48 | | - `editors_ge_25` decimal (7) DEFAULT '-1', |
49 | | - `editors_ge_100` decimal (7) DEFAULT '-1', |
50 | | - `articles` decimal (12) DEFAULT '-1', |
51 | | - `articles_new_per_day` decimal (9) DEFAULT '-1', |
52 | | - `articles_over_bytes_500` decimal (12) DEFAULT '-1', |
53 | | - `articles_over_bytes_2000` decimal (12) DEFAULT '-1', |
54 | | - `edits_per_article` decimal (9,1) DEFAULT '-1', |
55 | | - `bytes_per_article` decimal (9,1) DEFAULT '-1', |
56 | | - `edits` decimal (12) DEFAULT '-1', |
57 | | - `size_in_bytes` decimal (15) DEFAULT '-1', |
58 | | - `size_in_words` decimal (15) DEFAULT '-1', |
59 | | - `links_internal` decimal (15) DEFAULT '-1', |
60 | | - `links_interwiki` decimal (15) DEFAULT '-1', |
61 | | - `links_image` decimal (15) DEFAULT '-1', |
62 | | - `links_external` decimal (15) DEFAULT '-1', |
63 | | - `redirects` decimal (15) DEFAULT '-1', |
64 | | - PRIMARY KEY (yyyymm,project,wiki) |
| 88 | + `yyyymm` char (7), |
| 89 | + `project` char (2), |
| 90 | + `language_code` char (15), |
| 91 | + `views_non_mobile_raw` decimal (15) DEFAULT '-1', |
| 92 | + `views_mobile_raw` decimal (15) DEFAULT '-1', |
| 93 | + `views_non_mobile_normalized` decimal (15) DEFAULT '-1', |
| 94 | + `views_mobile_normalized` decimal (15) DEFAULT '-1', |
| 95 | + PRIMARY KEY (yyyymm,project,language_code) |
65 | 96 | ) ; |
66 | 97 | |
| 98 | +CREATE TABLE `language_names` ( |
| 99 | + `target_language` char (15), |
| 100 | + `language_code` char (15), |
| 101 | + `language_name` char (50), |
| 102 | + PRIMARY KEY (target_language,language_code) |
| 103 | +) ; |
| 104 | + |
67 | 105 | -- SHOW TABLES ; |
68 | 106 | -- DESCRIBE comscore ; |
69 | 107 | -- DESCRIBE comscore_regions ; |
70 | | - DESCRIBE wikistats ; |
| 108 | +-- DESCRIBE wikistats ; |
| 109 | +-- DESCRIBE page_views ; |
| 110 | +-- DESCRIBE language_names ; |
71 | 111 | |
72 | 112 | -- Database Manipulation |
73 | 113 | -- Obviously in real world this is a separate script |
— | — | @@ -76,41 +116,85 @@ |
77 | 117 | -- FROM comscore_regions ; |
78 | 118 | |
79 | 119 | |
80 | | -LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv' |
81 | | - INTO TABLE comscore_regions |
82 | | - FIELDS TERMINATED BY ',' |
83 | | - OPTIONALLY ENCLOSED BY '"' |
84 | | - (language_code,region_code,region_name) ; |
85 | | - |
86 | 120 | LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv' |
87 | 121 | INTO TABLE comscore |
88 | 122 | FIELDS TERMINATED BY ',' |
89 | 123 | OPTIONALLY ENCLOSED BY '"' |
90 | 124 | (yyyymm,country_code,region_code,property,project,reach,visitors) ; |
91 | 125 | |
| 126 | +LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv' |
| 127 | + INTO TABLE comscore_regions |
| 128 | + FIELDS TERMINATED BY ',' |
| 129 | + OPTIONALLY ENCLOSED BY '"' |
| 130 | + (target_language,region_code,region_name) ; |
| 131 | + |
92 | 132 | LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_wikistats.csv' |
93 | 133 | INTO TABLE wikistats |
94 | 134 | FIELDS TERMINATED BY ',' |
95 | 135 | OPTIONALLY ENCLOSED BY '"' |
96 | | - (project,wiki,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ; |
| 136 | + (project,language_code,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ; |
| 137 | + |
| 138 | +LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_page_views.csv' |
| 139 | + INTO TABLE page_views |
| 140 | + FIELDS TERMINATED BY ',' |
| 141 | + OPTIONALLY ENCLOSED BY '"' |
| 142 | + (project,language_code,yyyymm,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized) ; |
| 143 | + |
| 144 | +LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_language_names.csv' |
| 145 | + INTO TABLE language_names |
| 146 | + FIELDS TERMINATED BY ',' |
| 147 | + OPTIONALLY ENCLOSED BY '"' |
| 148 | + (target_language,language_code,language_name) ; |
| 149 | + |
97 | 150 | -- show contents (debugging only) |
98 | | - SELECT * |
| 151 | +SELECT * |
99 | 152 | FROM comscore |
100 | 153 | ORDER BY yyyymm,country_code,region_code,property,project |
101 | 154 | INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_full_table.csv' |
102 | 155 | FIELDS TERMINATED BY ',' ; |
103 | 156 | |
104 | | -SELECT * |
105 | | - FROM wikistats |
106 | | - WHERE (project = 'wp') AND (wiki = 'en') AND (yyyymm BETWEEN '2010-06' AND '2011-05') |
107 | | - ORDER BY project,wiki,yyyymm |
108 | | - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_wikistats_test1.csv' |
| 157 | +-- test query (to be described in detail) |
| 158 | +-- make sure to delete output file first |
| 159 | +SELECT yyyymm, project, page_views.language_code, language_name, views_non_mobile_normalized, views_mobile_normalized |
| 160 | + FROM page_views LEFT JOIN language_names |
| 161 | + ON page_views.language_code = language_names.language_code |
| 162 | + WHERE (project = 'wp') AND |
| 163 | + (yyyymm BETWEEN '2011-03' AND '2011-05') AND |
| 164 | + (page_views.language_code = 'nl') AND |
| 165 | + (language_names.target_language = 'en') |
| 166 | + ORDER BY project,language_name,yyyymm |
| 167 | + INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test1.csv' |
109 | 168 | FIELDS TERMINATED BY ',' ; |
| 169 | + |
| 170 | +-- expected result: |
| 171 | +-- 2011-03,wp,nl,Dutch,164544225,4994050 |
| 172 | +-- 2011-04,wp,nl,Dutch,153550168,5427629 |
| 173 | +-- 2011-05,wp,nl,Dutch,1957423,73106 |
| 174 | + |
| 175 | +-- test query (to be described in detail) |
| 176 | +-- make sure to delete output file first |
| 177 | +SELECT yyyymm, project, page_views.language_code, language_name, views_non_mobile_normalized, views_mobile_normalized |
| 178 | + FROM page_views LEFT JOIN language_names |
| 179 | + ON page_views.language_code = language_names.language_code |
| 180 | + WHERE (project = 'wp') AND |
| 181 | + (yyyymm BETWEEN '2011-03' AND '2011-05') AND |
| 182 | + (page_views.language_code = 'nl') AND |
| 183 | + (language_names.target_language = 'de') |
| 184 | + ORDER BY project,language_name,yyyymm |
| 185 | + INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test2.csv' |
| 186 | + FIELDS TERMINATED BY ',' ; |
| 187 | + |
| 188 | +-- expected result: |
| 189 | +-- 2011-03,wp,nl,Niederländisch,164544225,4994050 |
| 190 | +-- 2011-04,wp,nl,Niederländisch,153550168,5427629 |
| 191 | +-- 2011-05,wp,nl,Niederländisch,1957423,73106 |
| 192 | + |
110 | 193 | -- export all relevant non-meta data from comsCore's reach by region (and comScore treats India and China as regions in this context) |
| 194 | +-- make sure to delete output file first |
111 | 195 | SELECT yyyymm,region_name,reach |
112 | | - FROM comscore LEFT JOIN comscore_regions ON comscore.region_code = comscore_regions.region_code AND comscore_regions.language_code = 'en' |
113 | | - WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05') |
| 196 | + FROM comscore LEFT JOIN comscore_regions |
| 197 | + ON comscore.region_code = comscore_regions.region_code AND comscore_regions.target_language = 'en' |
| 198 | + WHERE (region_name != '') AND (yyyymm BETWEEN '2011-03' AND '2011-05') |
114 | 199 | ORDER BY yyyymm,region_name |
115 | | - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv' |
| 200 | + INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach_test1.csv' |
116 | 201 | FIELDS TERMINATED BY ',' ; |