r88720 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r88719‎ | r88720 | r88721 >
Date:15:52, 24 May 2011
Author:ezachte
Status:deferred
Tags:
Comment:
Modified paths:
  • /trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl (modified) (history)
  • /trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl (added) (history)
  • /trunk/wikistats/analytics/create_and_use_db_analytics.txt (modified) (history)

Diff [purge]

Index: trunk/wikistats/analytics/create_and_use_db_analytics.txt
@@ -1,23 +1,23 @@
22 -- invoke this file with "mysql --user=root --password=[...] < create_and_use_db_analytics.txt"
33
4 -/* Create database and two tables from scratch */
5 -
 4+-- Create database and two tables from scratch
65 DROP DATABASE IF EXISTS `analytics` ;
76 CREATE DATABASE `analytics` ;
 7+
88 USE `analytics` ;
99
1010 CREATE TABLE `comscore` (
1111 -- meta data (mostly for auditing, may not be sent over API on default)
12 -/*
13 - store meta dat elsewhere,tbd
14 - `id` int auto_increment NOT NULL,
15 - `generated` timestamp,
16 - `source` char (20),
17 - `server` char (20),
18 - `script_name` char (30),
19 - `script_version` char (8),
20 - `user` char (20),
21 -*/
 12+-- not used right now, store meta dat elsewhere,tbd, kept for brainstorm
 13+--
 14+--`id` int auto_increment NOT NULL,
 15+--`generated` timestamp,
 16+--`source` char (20),
 17+--`server` char (20),
 18+--`script_name` char (30),
 19+--`script_version` char (8),
 20+--`user` char (20),
 21+--
2222 -- analytics data
2323 `yyyymm` char (7),
2424 `country_code` char (3),
@@ -36,18 +36,46 @@
3737 PRIMARY KEY (language_code,region_code)
3838 ) ;
3939
 40+CREATE TABLE `wikistats` (
 41+-- analytics data
 42+ `yyyymm` char (7),
 43+ `project` char (2),
 44+ `wiki` char (15),
 45+ `editors_all_time` decimal (10) DEFAULT '-1',
 46+ `editors_new` decimal (7) DEFAULT '-1',
 47+ `editors_ge_5` decimal (7) DEFAULT '-1',
 48+ `editors_ge_25` decimal (7) DEFAULT '-1',
 49+ `editors_ge_100` decimal (7) DEFAULT '-1',
 50+ `articles` decimal (12) DEFAULT '-1',
 51+ `articles_new_per_day` decimal (9) DEFAULT '-1',
 52+ `articles_over_bytes_500` decimal (12) DEFAULT '-1',
 53+ `articles_over_bytes_2000` decimal (12) DEFAULT '-1',
 54+ `edits_per_article` decimal (9,1) DEFAULT '-1',
 55+ `bytes_per_article` decimal (9,1) DEFAULT '-1',
 56+ `edits` decimal (12) DEFAULT '-1',
 57+ `size_in_bytes` decimal (15) DEFAULT '-1',
 58+ `size_in_words` decimal (15) DEFAULT '-1',
 59+ `links_internal` decimal (15) DEFAULT '-1',
 60+ `links_interwiki` decimal (15) DEFAULT '-1',
 61+ `links_image` decimal (15) DEFAULT '-1',
 62+ `links_external` decimal (15) DEFAULT '-1',
 63+ `redirects` decimal (15) DEFAULT '-1',
 64+ PRIMARY KEY (yyyymm,project,wiki)
 65+) ;
 66+
4067 -- SHOW TABLES ;
4168 -- DESCRIBE comscore ;
4269 -- DESCRIBE comscore_regions ;
 70+ DESCRIBE wikistats ;
4371
44 -/* Database Manipulation */
45 -/* Obviously in real world this is a separate script */
 72+-- Database Manipulation
 73+-- Obviously in real world this is a separate script
4674
47 -
4875 -- show contents (debugging only)
4976 -- SELECT *
5077 -- FROM comscore_regions ;
5178
 79+
5280 LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
5381 INTO TABLE comscore_regions
5482 FIELDS TERMINATED BY ','
@@ -60,6 +88,11 @@
6189 OPTIONALLY ENCLOSED BY '"'
6290 (yyyymm,country_code,region_code,property,project,reach,visitors) ;
6391
 92+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_wikistats.csv'
 93+ INTO TABLE wikistats
 94+ FIELDS TERMINATED BY ','
 95+ OPTIONALLY ENCLOSED BY '"'
 96+ (project,wiki,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) ;
6497 -- show contents (debugging only)
6598 SELECT *
6699 FROM comscore
@@ -68,10 +101,16 @@
69102 FIELDS TERMINATED BY ',' ;
70103
71104 -- export all relevant non-meta data from comsCore's reach by region (and comScore treats India and China as regions in this context)
 105+SELECT *
 106+ FROM wikistats
 107+ WHERE (project = 'wp') AND (wiki = 'en') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
 108+ ORDER BY project,wiki,yyyymm
 109+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_wikistats_test1.csv'
 110+ FIELDS TERMINATED BY ',' ;
 111+-- export all relevant non-meta data from comsCore's reach by region (and comScore treats India and China as regions in this context)
72112 SELECT yyyymm,region_name,reach
73113 FROM comscore LEFT JOIN comscore_regions ON comscore.region_code = comscore_regions.region_code AND comscore_regions.language_code = 'en'
74114 WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
75115 ORDER BY yyyymm,region_name
76116 INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv'
77117 FIELDS TERMINATED BY ',' ;
78 -
Index: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
@@ -54,7 +54,7 @@
5555 # $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
5656
5757 if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
58 - { abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; }
 58+ { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; }
5959
6060 $file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ;
6161 $file_comscore_reach_update = "*reach*by*region*csv" ;
@@ -111,7 +111,7 @@
112112 { $update_only {$id} = $true ; }
113113
114114 if (! -e "$dir_analytics/$file_comscore_master")
115 - { abort ("File $file_comscore_master not found!") ; }
 115+ { Abort ("File $file_comscore_master not found!") ; }
116116
117117 $age_all = -M "$dir_analytics/$file_comscore_master" ;
118118 print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;
@@ -438,9 +438,11 @@
439439 $reach = $reach_region_code {"$yyyymm,$region_code"} ;
440440 $visitors = $visitors_region_code {"$yyyymm,$region_code"} ;
441441
442 - if (! defined $reach) { $reach = -1 ; }
443 - if (! defined $visitors) { $reach = -1 ; }
 442+ if (! defined $reach) { $reach = -1 ; }
 443+ if (! defined $visitors) { $visitors = -1 ; }
444444
 445+ next if $reach == -1 and $visitors == -1 ;
 446+
445447 $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
446448 print OUT $line ;
447449 print $line ;
@@ -487,7 +489,7 @@
488490 return @months ;
489491 }
490492
491 -sub abort
 493+sub Abort
492494 {
493495 $msg = shift ;
494496
Index: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
@@ -0,0 +1,281 @@
 2+#!/usr/local/bin/perl
 3+
 4+# Copyright (C) 2011 Wikimedia Foundation
 5+# This program is free software; you can redistribute it and/or
 6+# modify it under the terms of the GNU General Public License version 2
 7+# as published by the Free Software Foundation.
 8+# This program is distributed in the hope that it will be useful,
 9+# but WITHOUT ANY WARRANTY; without even the implied warranty of
 10+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 11+# See the GNU General Public License for more details, at
 12+# http://www.fsf.org/licenses/gpl.html
 13+
 14+# Author:
 15+# Erik Zachte, email ezachte@wikimedia.org
 16+# loosely based on predecessor
 17+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
 18+
 19+# Functionality:
 20+# tba
 21+
 22+# Parameters:
 23+# tba
 24+
 25+# Output:
 26+# updated csv file for import in MySQL
 27+
 28+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
 29+
 30+ use Getopt::Std ;
 31+
 32+ $true = 1 ;
 33+ $false = 0 ;
 34+
 35+ @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
 36+
 37+ $file_csv_monthly_data = "StatisticsMonthly.csv" ;
 38+ $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
 39+ $file_csv_analytics_in = "analytics_in_wikistats.csv" ;
 40+
 41+ &ParseArguments ;
 42+ &ReadStatisticsMonthly ;
 43+ &WriteMonthlyData ;
 44+
 45+ print "\nReady\n\n" ;
 46+ exit ;
 47+
 48+sub ParseArguments
 49+{
 50+ my (@options, $arguments) ;
 51+
 52+ getopt ("io", \%options) ;
 53+
 54+ foreach $arg (sort keys %options)
 55+ { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
 56+ print ("\nArguments\n$arguments\n") ;
 57+
 58+# $options {"i"} = "w:/# out bayes" ; # EZ test
 59+# $options {"o"} = "c:/MySQL/analytics" ; # EZ test
 60+
 61+ die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
 62+ die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
 63+
 64+ $path_in = $options {"i"} ;
 65+ $path_out = $options {"o"} ;
 66+
 67+ die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
 68+ die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 69+
 70+ print "Input folder: $path_in\n" ;
 71+ print "Output folder: $path_out\n\n" ;
 72+
 73+ $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
 74+}
 75+
 76+sub ReadStatisticsMonthly
 77+{
 78+ &ReadStatisticsMonthlyForProject ("wb") ;
 79+ &ReadStatisticsMonthlyForProject ("wk") ;
 80+ &ReadStatisticsMonthlyForProject ("wn") ;
 81+ &ReadStatisticsMonthlyForProject ("wp") ;
 82+ &ReadStatisticsMonthlyForProject ("wq") ;
 83+ &ReadStatisticsMonthlyForProject ("ws") ;
 84+ &ReadStatisticsMonthlyForProject ("wv") ;
 85+ &ReadStatisticsMonthlyForProject ("wx") ;
 86+
 87+# &ReadStatisticsPerBinariesExtensionCommons ;
 88+}
 89+
 90+sub ReadStatisticsMonthlyForProject
 91+{
 92+ my $project = shift;
 93+ $all_projects = "*" ;
 94+
 95+ my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
 96+ my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
 97+
 98+ if (! -e $file_csv_in_1)
 99+ { &Abort ("Input file '$file_csv_in_1' not found") ; }
 100+ if (! -e $file_csv_in_2)
 101+ { &Abort ("Input file '$file_csv_in_2' not found") ; }
 102+
 103+ my $yyyymm ;
 104+
 105+ print "Read '$file_csv_in_1'\n" ;
 106+ open CSV_IN, '<', $file_csv_in_1 ;
 107+ while ($line = <CSV_IN>)
 108+ {
 109+ chomp $line ;
 110+ ($language,$date,$counts) = split (',', $line, 3) ;
 111+ @fields = split (',', $counts) ;
 112+
 113+ next if ! &AcceptWiki ($project,$language) ;
 114+
 115+ ($month,$day,$year) = split ('\/', $date) ;
 116+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 117+
 118+ foreach $field (@fields)
 119+ {
 120+ if ($field eq '-')
 121+ { $field = 0 ; }
 122+ }
 123+
 124+ $data = $fields [0] . ',' . # contributors all time
 125+ $fields [1] . ',' . # new contributors
 126+ 'data2,' . # place holder for more data, to be inserted later
 127+ $fields [4] . ',' . # articles
 128+ $fields [6] . ',' . # articles new per day
 129+ $fields [9] . ',' . # larger than 0.5 kB
 130+ $fields [10] . ',' . # larger than 2.0 kB
 131+ $fields [7] . ',' . # mean edits per article
 132+ $fields [8] . ',' . # mean bytes per article
 133+ $fields [11] . ',' . # edits
 134+ $fields [12] . ',' . # size in bytes
 135+ $fields [13] . ',' . # size in words
 136+ $fields [14] . ',' . # links internal
 137+ $fields [15] . ',' . # links interwiki
 138+ $fields [16] . ',' . # links images
 139+ $fields [17] . ',' . # links external
 140+ $fields [18] ; # redirects
 141+
 142+ $data1 {"$project,$language,$yyyymm"} = $data ;
 143+ }
 144+ close CSV_IN ;
 145+
 146+ # now read (very) active editors from newer more accurate file (split data for reg users and bots, unlike StatisticsMonthly.csv)
 147+
 148+ print "Read '$file_csv_in_2'\n" ;
 149+ open CSV_IN, '<', $file_csv_in_2 ;
 150+ while ($line = <CSV_IN>)
 151+ {
 152+ chomp $line ;
 153+ ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
 154+
 155+ next if ! &AcceptWiki ($project,$language) ;
 156+
 157+ if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot
 158+ if ($group ne "A") { next ; } # A: articles, T: talk pages, O: other namespaces
 159+
 160+ ($month,$day,$year) = split ('\/', $date) ;
 161+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 162+
 163+ # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm
 164+ # count user with over x edits
 165+ # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 1000xSQRT(10), etc
 166+ # @thresholds = (1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000) ;
 167+ $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ;
 168+ $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ;
 169+ $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
 170+ $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
 171+ # prep string with right amount of comma's
 172+ if ($data2_default eq '')
 173+ {
 174+ $data2_default = $data2 {"$project,$language,$yyyymm"} ;
 175+ $data2_default =~ s/[^,]+/0/g ;
 176+ }
 177+ }
 178+ close CSV_IN ;
 179+}
 180+
 181+#sub ReadStatisticsPerBinariesExtensionCommons
 182+#{
 183+# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
 184+# my $mmax = -1 ;
 185+
 186+# if (! -e $file_csv_in)
 187+# { &Abort ("Input file '$file_csv_in' not found") ; }
 188+
 189+# print "Read '$file_csv_in'\n" ;
 190+# open CSV_IN, '<', $file_csv_in ;
 191+# while ($line = <CSV_IN>)
 192+# {
 193+# chomp $line ;
 194+# ($language,$date,$counts) = split (',', $line, 3) ;
 195+
 196+# if ($language ne "commons") { next ; }
 197+
 198+# if ($date eq "00/0000")
 199+# {
 200+# @fields = split (',', $counts) ;
 201+# $field_ndx = 0 ;
 202+# foreach $field (@fields)
 203+# {
 204+# $ext_cnt {-1}{$field_ndx} = $field ;
 205+# # print "EXT_CNT $field_ndx : $field\n" ;
 206+# $field_ndx ++ ;
 207+# }
 208+# next ;
 209+# }
 210+
 211+# ($month,$year) = split ('\/', $date) ;
 212+# my $m = &months_since_2000_01 ($year,$month) ;
 213+# next if $m < $m_start ;
 214+
 215+# if ($m > $mmax)
 216+# { $mmax = $m ; }
 217+
 218+# @fields = split (',', $counts) ;
 219+# $field_ndx = 0 ;
 220+# foreach $field (@fields)
 221+# {
 222+# $ext_cnt {$m}{$field_ndx} = $field ;
 223+# $ext_tot {$m} += $field ;
 224+# $field_ndx ++ ;
 225+# }
 226+# }
 227+# close CSV_IN ;
 228+
 229+# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
 230+# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys %ext_cnt_mmax) ;
 231+
 232+# $extcnt = 0 ;
 233+# foreach $extndx (@ext_cnt_mmax)
 234+# {
 235+# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax {$extndx}}\n" ;
 236+# push @extndxs, $extndx ;
 237+# if ($extcnt++ >= 9) { last ; }
 238+# }
 239+#}
 240+
 241+sub WriteMonthlyData
 242+{
 243+ my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
 244+ open CSV_OUT, '>', $file_csv_out ;
 245+ foreach $project_wiki_month (sort keys %data1)
 246+ {
 247+ $data1 = $data1 {$project_wiki_month} ;
 248+ $data2 = $data2 {$project_wiki_month} ;
 249+ if ($data2 eq '')
 250+ {
 251+ print "Editor data missing for $project_wiki_month\n" ;
 252+ $data2 = $data2_default ;
 253+ }
 254+ $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields follow each other
 255+ print CSV_OUT "$project_wiki_month,$data1\n" ;
 256+ }
 257+ close CSV_OUT ;
 258+}
 259+
 260+sub AcceptWiki
 261+{
 262+ my ($project,$language) = @_ ;
 263+
 264+ return $false if $language eq 'commons' and $project ne 'wx' ; # commons also in wikipedia csv files (bug, hard to cleanup, just skip)
 265+ return $false if $language eq 'sr' and $project eq 'wn' ; # ignore insane bot spam on
 266+ return $false if $language =~ /mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i ;
 267+
 268+ return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) but site not, tlh=Klignon, ru-sib=Siberian
 269+ return $false if $project eq 'wk' and ($language eq "als" or $language eq "tlh") ;
 270+
 271+ return $true ;
 272+}
 273+
 274+sub Abort
 275+{
 276+ my $msg = shift ;
 277+ print "$msg\nExecution aborted." ;
 278+ # to do: log also to file
 279+ exit ;
 280+}
 281+
 282+

Status & tagging log