r88897 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r88896‎ | r88897 | r88898 >
Date:16:29, 26 May 2011
Author:ezachte
Status:deferred
Tags:
Comment:
new file AnalyticsPrepLanguageNames.pl + binmode on analytics input files (MySQL is picky)
Modified paths:
  • /trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl (added) (history)
  • /trunk/wikistats/analytics/AnalyticsPrepPageViews.pl (added) (history)
  • /trunk/wikistats/analytics/create_and_use_db_analytics.txt (modified) (history)

Diff [purge]

Index: trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl
@@ -0,0 +1,162 @@
 2+#!/usr/bin/perl
 3+
 4+# This module prepares a csv file with language names for feed into the analytics database
 5+# The main work of collecting these names from different sources is done by
 6+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm
 7+# which is part of wikistats reporting phase and once each month updates local master csv files from all sources
 8+
 9+# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization
 10+# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle
 11+
 12+# Sources for language names:
 13+# - php files
 14+# - translatewiki
 15+# - English Wikipedia API (interwikilinks)
 16+
 17+# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations
 18+
 19+ use Getopt::Std ;
 20+
 21+ $true = 1 ;
 22+ $false = 0 ;
 23+
 24+ $max_language_name = 50 ; # truncate if longer
 25+
 26+ $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ;
 27+ $file_csv_language_names_wp = "LanguageNamesViaWpEnEdited.csv" ;
 28+ $file_csv_analytics_in = "analytics_in_language_names.csv" ;
 29+
 30+ $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting
 31+ $languages_filter = "de|en" ;
 32+ foreach $language (split '\|', $languages_filter)
 33+ { $include_target_language {$language} = $true ; }
 34+
 35+ &ParseArguments ;
 36+ &ReadCsvFiles ;
 37+ &WriteCsvFile ;
 38+
 39+
 40+# if ($language eq "id") # swap which file takes precedence
 41+
 42+ print "\nReady\n\n" ;
 43+ exit ;
 44+
 45+sub ParseArguments
 46+{
 47+ my (@options, $arguments) ;
 48+
 49+ getopt ("io", \%options) ;
 50+
 51+ foreach $arg (sort keys %options)
 52+ { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
 53+ print ("\nArguments\n$arguments\n") ;
 54+
 55+ $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test
 56+ $options {"o"} = "c:/MySQL/analytics" ; # EZ test
 57+
 58+ die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
 59+ die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
 60+
 61+ ($path_in = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
 62+ ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
 63+
 64+ die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
 65+ die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 66+
 67+ print "Input folder: $path_in\n" ;
 68+ print "Output folder: $path_out\n\n" ;
 69+
 70+ $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ;
 71+ $file_csv_language_names_wp = "$path_in/$file_csv_language_names_wp" ;
 72+ $file_csv_analytics_in = "$path_out/$file_csv_analytics_in" ;
 73+
 74+ die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ;
 75+ die "Input file '$file_csv_language_names_wp' not found" if (! -e $file_csv_language_names_wp) ;
 76+}
 77+
 78+sub ReadCsvFiles
 79+{
 80+ #first read definitions from php message files, then overwrite with definitions from interwiki links when available
 81+ # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable
 82+
 83+ open CSV_IN, "<", $file_csv_language_names_php ;
 84+ while ($line = <CSV_IN>)
 85+ {
 86+ chomp ($line) ;
 87+ ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
 88+
 89+ next if ! $include_target_language {$target_language} ;
 90+
 91+ $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
 92+ }
 93+ close CSV_IN ;
 94+
 95+ open CSV_IN, "<", $file_csv_language_names_wp ;
 96+ while ($line = <CSV_IN>)
 97+ {
 98+ chomp ($line) ;
 99+ ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
 100+
 101+ next if ! $include_target_language {$target_language} ;
 102+
 103+ next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ;
 104+
 105+ # $name_unicode_php = $out_languages {$target_language} {$code} ; # test only
 106+ # $name_unicode_wp = &FormatName ($target_language, $name_unicode) ; # test only
 107+ # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only
 108+ # { print "$name_unicode_php => $name_unicode_wp\n" ; } # test only
 109+
 110+ $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
 111+ }
 112+ close CSV_IN ;
 113+}
 114+
 115+sub FormatName
 116+{
 117+ my ($target_language, $name_unicode) = @_ ;
 118+
 119+ $name_unicode2 = $name_unicode ;
 120+
 121+ if ($target_language eq "de")
 122+ { $name_unicode =~ s/e?\s*\-?sprache//i ; }
 123+
 124+ if ($target_language =~ /^(?:$languages_force_case_uc)/)
 125+ { $name_unicode = ucfirst $name_unicode ; }
 126+ else
 127+ { $name_unicode = lc $name_unicode ; }
 128+
 129+# Test only
 130+# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2))
 131+# { print "$name_unicode2 => $name_unicode\n" ; }
 132+
 133+ return ($name_unicode) ;
 134+}
 135+
 136+sub WriteCsvFile
 137+{
 138+ open CSV_OUT, ">", $file_csv_analytics_in || die ("File '$file_csv_analytics_in' could not be opened") ;
 139+ binmode CSV_OUT ; # force Unix style linebreak \012
 140+
 141+ foreach $target_language (sort keys %out_languages)
 142+ {
 143+ print "\nTarget language '$target_language'\n" ;
 144+ %translations = %{$out_languages {$target_language}} ;
 145+
 146+ foreach $code (sort keys %translations)
 147+ {
 148+ $language_name = $translations{$code} ;
 149+ $language_name2 = substr ($language_name,0,$max_language_name) ;
 150+
 151+ if ($language_name ne $language_name2)
 152+ { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; }
 153+
 154+ if ($language_name2 =~ /,/)
 155+ { $language_name2 = "\"$language_name2\"" ; }
 156+ # test only
 157+ print CSV_OUT "$target_language,$code,$language_name2\n" ;
 158+ }
 159+ }
 160+
 161+ close CSV_OUT ;
 162+}
 163+
Index: trunk/wikistats/analytics/AnalyticsPrepPageViews.pl
@@ -0,0 +1,11 @@
 2+#!/usr/bin/perl
 3+
 4+# This file is just a place holder (alias)
 5+# Page views per day/week/month/etc, per project, per language, nrmalized and not normalized are collected by
 6+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl
 7+#
 8+# Added May 2001:
 9+# For analytics database one file is written for all projects and languages combined,
 10+# with per month, not normalized and normalized page view counts in one row.
 11+# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl
 12+# and needs to be copied to analytics folder later
\ No newline at end of file
Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt
@@ -1,5 +1,25 @@
22 -- invoke this file with "mysql --user=root --password=[...] < create_and_use_db_analytics.txt"
33
 4+-- tables implemented:
 5+-- comscore
 6+-- comscore_regions
 7+-- wikistats
 8+-- page_views
 9+-- language names
 10+
 11+
 12+-- more tables needed (O= optional, not needed for report card stage)
 13+-- project_names
 14+-- binaries per project, per language, per month, per extension type
 15+-- O edits per project, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
 16+-- O editors per project, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
 17+
 18+
 19+-- open issues:
 20+-- only store basic data in database and calculate all aggregates on the fly or do some aggragation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==)
 21+-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month)
 22+
 23+
424 -- Create database and two tables from scratch
525 DROP DATABASE IF EXISTS `analytics` ;
626 CREATE DATABASE `analytics` ;
@@ -30,43 +50,63 @@
3151 ) ;
3252
3353 CREATE TABLE `comscore_regions` (
34 - `region_code` char (2),
35 - `language_code` char (10),
36 - `region_name` char (18),
37 - PRIMARY KEY (language_code,region_code)
 54+ `region_code` char (2),
 55+ `target_language` char (10),
 56+ `region_name` char (18),
 57+ PRIMARY KEY (target_language,region_code)
3858 ) ;
3959
4060 CREATE TABLE `wikistats` (
 61+ `yyyymm` char (7),
 62+ `project` char (2),
 63+ `language_code` char (15),
 64+ `editors_all_time` decimal (10) DEFAULT '-1',
 65+ `editors_new` decimal (7) DEFAULT '-1',
 66+ `editors_ge_5` decimal (7) DEFAULT '-1',
 67+ `editors_ge_25` decimal (7) DEFAULT '-1',
 68+ `editors_ge_100` decimal (7) DEFAULT '-1',
 69+ `articles` decimal (12) DEFAULT '-1',
 70+ `articles_new_per_day` decimal (9) DEFAULT '-1',
 71+ `articles_over_bytes_500` decimal (12) DEFAULT '-1',
 72+ `articles_over_bytes_2000` decimal (12) DEFAULT '-1',
 73+ `edits_per_article` decimal (9,1) DEFAULT '-1',
 74+ `bytes_per_article` decimal (9,1) DEFAULT '-1',
 75+ `edits` decimal (12) DEFAULT '-1',
 76+ `size_in_bytes` decimal (15) DEFAULT '-1',
 77+ `size_in_words` decimal (15) DEFAULT '-1',
 78+ `links_internal` decimal (15) DEFAULT '-1',
 79+ `links_interwiki` decimal (15) DEFAULT '-1',
 80+ `links_image` decimal (15) DEFAULT '-1',
 81+ `links_external` decimal (15) DEFAULT '-1',
 82+ `redirects` decimal (15) DEFAULT '-1',
 83+ PRIMARY KEY (yyyymm,project,language_code)
 84+) ;
 85+
 86+CREATE TABLE `page_views` (
4187 -- analytics data
42 - `yyyymm` char (7),
43 - `project` char (2),
44 - `wiki` char (15),
45 - `editors_all_time` decimal (10) DEFAULT '-1',
46 - `editors_new` decimal (7) DEFAULT '-1',
47 - `editors_ge_5` decimal (7) DEFAULT '-1',
48 - `editors_ge_25` decimal (7) DEFAULT '-1',
49 - `editors_ge_100` decimal (7) DEFAULT '-1',
50 - `articles` decimal (12) DEFAULT '-1',
51 - `articles_new_per_day` decimal (9) DEFAULT '-1',
52 - `articles_over_bytes_500` decimal (12) DEFAULT '-1',
53 - `articles_over_bytes_2000` decimal (12) DEFAULT '-1',
54 - `edits_per_article` decimal (9,1) DEFAULT '-1',
55 - `bytes_per_article` decimal (9,1) DEFAULT '-1',
56 - `edits` decimal (12) DEFAULT '-1',
57 - `size_in_bytes` decimal (15) DEFAULT '-1',
58 - `size_in_words` decimal (15) DEFAULT '-1',
59 - `links_internal` decimal (15) DEFAULT '-1',
60 - `links_interwiki` decimal (15) DEFAULT '-1',
61 - `links_image` decimal (15) DEFAULT '-1',
62 - `links_external` decimal (15) DEFAULT '-1',
63 - `redirects` decimal (15) DEFAULT '-1',
64 - PRIMARY KEY (yyyymm,project,wiki)
 88+ `yyyymm` char (7),
 89+ `project` char (2),
 90+ `language_code` char (15),
 91+ `views_non_mobile_raw` decimal (15) DEFAULT '-1',
 92+ `views_mobile_raw` decimal (15) DEFAULT '-1',
 93+ `views_non_mobile_normalized` decimal (15) DEFAULT '-1',
 94+ `views_mobile_normalized` decimal (15) DEFAULT '-1',
 95+ PRIMARY KEY (yyyymm,project,language_code)
6596 ) ;
6697
 98+CREATE TABLE `language_names` (
 99+ `target_language` char (15),
 100+ `language_code` char (15),
 101+ `language_name` char (50),
 102+ PRIMARY KEY (target_language,language_code)
 103+) ;
 104+
67105 -- SHOW TABLES ;
68106 -- DESCRIBE comscore ;
69107 -- DESCRIBE comscore_regions ;
70 - DESCRIBE wikistats ;
 108+-- DESCRIBE wikistats ;
 109+-- DESCRIBE page_views ;
 110+-- DESCRIBE language_names ;
71111
72112 -- Database Manipulation
73113 -- Obviously in real world this is a separate script
@@ -76,41 +116,85 @@
77117 -- FROM comscore_regions ;
78118
79119
80 -LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
81 - INTO TABLE comscore_regions
82 - FIELDS TERMINATED BY ','
83 - OPTIONALLY ENCLOSED BY '"'
84 - (language_code,region_code,region_name) ;
85 -
86120 LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv'
87121 INTO TABLE comscore
88122 FIELDS TERMINATED BY ','
89123 OPTIONALLY ENCLOSED BY '"'
90124 (yyyymm,country_code,region_code,property,project,reach,visitors) ;
91125
 126+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
 127+ INTO TABLE comscore_regions
 128+ FIELDS TERMINATED BY ','
 129+ OPTIONALLY ENCLOSED BY '"'
 130+ (target_language,region_code,region_name) ;
 131+
92132 LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_wikistats.csv'
93133 INTO TABLE wikistats
94134 FIELDS TERMINATED BY ','
95135 OPTIONALLY ENCLOSED BY '"'
96 - (project,wiki,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ;
 136+ (project,language_code,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ;
 137+
 138+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_page_views.csv'
 139+ INTO TABLE page_views
 140+ FIELDS TERMINATED BY ','
 141+ OPTIONALLY ENCLOSED BY '"'
 142+ (project,language_code,yyyymm,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized) ;
 143+
 144+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_language_names.csv'
 145+ INTO TABLE language_names
 146+ FIELDS TERMINATED BY ','
 147+ OPTIONALLY ENCLOSED BY '"'
 148+ (target_language,language_code,language_name) ;
 149+
97150 -- show contents (debugging only)
98 - SELECT *
 151+SELECT *
99152 FROM comscore
100153 ORDER BY yyyymm,country_code,region_code,property,project
101154 INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_full_table.csv'
102155 FIELDS TERMINATED BY ',' ;
103156
104 -SELECT *
105 - FROM wikistats
106 - WHERE (project = 'wp') AND (wiki = 'en') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
107 - ORDER BY project,wiki,yyyymm
108 - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_wikistats_test1.csv'
 157+-- test query (to be described in detail)
 158+-- make sure to delete output file first
 159+SELECT yyyymm, project, page_views.language_code, language_name, views_non_mobile_normalized, views_mobile_normalized
 160+ FROM page_views LEFT JOIN language_names
 161+ ON page_views.language_code = language_names.language_code
 162+ WHERE (project = 'wp') AND
 163+ (yyyymm BETWEEN '2011-03' AND '2011-05') AND
 164+ (page_views.language_code = 'nl') AND
 165+ (language_names.target_language = 'en')
 166+ ORDER BY project,language_name,yyyymm
 167+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test1.csv'
109168 FIELDS TERMINATED BY ',' ;
 169+
 170+-- expected result:
 171+-- 2011-03,wp,nl,Dutch,164544225,4994050
 172+-- 2011-04,wp,nl,Dutch,153550168,5427629
 173+-- 2011-05,wp,nl,Dutch,1957423,73106
 174+
 175+-- test query (to be described in detail)
 176+-- make sure to delete output file first
 177+SELECT yyyymm, project, page_views.language_code, language_name, views_non_mobile_normalized, views_mobile_normalized
 178+ FROM page_views LEFT JOIN language_names
 179+ ON page_views.language_code = language_names.language_code
 180+ WHERE (project = 'wp') AND
 181+ (yyyymm BETWEEN '2011-03' AND '2011-05') AND
 182+ (page_views.language_code = 'nl') AND
 183+ (language_names.target_language = 'de')
 184+ ORDER BY project,language_name,yyyymm
 185+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_page_views_test2.csv'
 186+ FIELDS TERMINATED BY ',' ;
 187+
 188+-- expected result:
 189+-- 2011-03,wp,nl,Niederländisch,164544225,4994050
 190+-- 2011-04,wp,nl,Niederländisch,153550168,5427629
 191+-- 2011-05,wp,nl,Niederländisch,1957423,73106
 192+
110193 -- export all relevant non-meta data from comsCore's reach by region (and comScore treats India and China as regions in this context)
 194+-- make sure to delete output file first
111195 SELECT yyyymm,region_name,reach
112 - FROM comscore LEFT JOIN comscore_regions ON comscore.region_code = comscore_regions.region_code AND comscore_regions.language_code = 'en'
113 - WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
 196+ FROM comscore LEFT JOIN comscore_regions
 197+ ON comscore.region_code = comscore_regions.region_code AND comscore_regions.target_language = 'en'
 198+ WHERE (region_name != '') AND (yyyymm BETWEEN '2011-03' AND '2011-05')
114199 ORDER BY yyyymm,region_name
115 - INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv'
 200+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach_test1.csv'
116201 FIELDS TERMINATED BY ',' ;

Status & tagging log