r112312 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r112311‎ | r112312 | r112313 >
Date:11:15, 24 February 2012
Author:ezachte
Status:deferred
Tags:
Comment:
simple file to collect historic trend data for charting (right now only browser distribution stats, in daily/weekly/monthly versions)
Modified paths:
  • /trunk/wikistats/squids/SquidScanCsvFiles.pl (added) (history)

Diff [purge]

Index: trunk/wikistats/squids/SquidScanCsvFiles.pl
@@ -0,0 +1,232 @@
 2+#!/usr/bin/perl
 3+
 4+# Rather quick and simple script to collect browserstats for Excel chart, see for Excel output example:
 5+# http://infodisiac.com/blog/2012/02/wikimedia-usage-share-per-browserstraffic-breakdown-by-browser/
 6+
 7+use Time::Local ;
 8+
 9+$mode_all_pages = 0 ; # to do: make runtime argument
 10+
 11+if ($mode_all_pages)
 12+{ $time = timegm (0,0,0,1,2,109) ; } # start 2009-3-1 - oldest month with counts
 13+else
 14+{ $time = timegm (0,0,0,1,4,111) ; } # start 2011-5-1 - oldest month with mime-type column (page,image,other)
 15+
 16+ if ($mode_all_pages)
 17+ { $mime_filter = "AllRequests" ; }
 18+ else
 19+ { $mime_filter = "HtmlRequests" ; }
 20+
 21+ open CSV_OUT_DAILY , '>', "SquidScanClientsDaily$mime_filter.csv" ;
 22+ open CSV_OUT_WEEKLY, '>', "SquidScanClientsWeekly$mime_filter.csv" ;
 23+ open CSV_OUT_MONTHLY, '>', "SquidScanClientsMonthly$mime_filter.csv" ;
 24+
 25+ $days_done = 0 ;
 26+ while ($time < time)
 27+ {
 28+ ($day,$month,$year,$yearday) = (gmtime ($time))[3,4,5,7] ;
 29+ $yyyy_mm_dd = sprintf ("%04d-%02d-%02d", $year+1900, $month+1, $day) ;
 30+ $yyyy_mm = sprintf ("%04d-%02d", $year+1900, $month+1) ;
 31+ $date_excel = sprintf ("\"=DATE(%d,%d,%d)\"", $year+1900, $month+1, $day) ;
 32+
 33+ $days_done++ ;
 34+ $weeknum = int ($days_done / 7) ;
 35+
 36+ # remember first day of week
 37+ if ($weeknums {$weeknum} eq '')
 38+ { $weeknums {$weeknum} = $yyyy_mm_dd ; }
 39+ $months {$yyyy_mm} ++ ;
 40+
 41+ # next if $yyyy_mm eq "2011-09" and $yyyy_mm_dd ge "2011-09-08" ; # " Sep 2011: varnish bug could not be repaired, as logs were gone when bug was found Dec 2011
 42+
 43+ $days {$yyyy_mm_dd}++ ; # collect days found
 44+ $dates_ascii {$yyyy_mm_dd} = $yyyy_mm_dd ;
 45+ $dates_excel {$yyyy_mm_dd} = $date_excel ;
 46+ $dates_ascii {$yyyy_mm} = $yyyy_mm_dd ;
 47+ $dates_excel {$yyyy_mm} = $date_excel ;
 48+
 49+ print "$yyyy_mm_dd\n" ;
 50+
 51+ $folder = "/a/ezachte/$yyyy_mm/$yyyy_mm_dd" ;
 52+
 53+ if ($yyyy_mm ge "2010-07")
 54+ { $folder .= "/public" ; }
 55+
 56+ $file = "$folder/SquidDataClients.csv" ;
 57+
 58+ $count = '-' ;
 59+ if (-e $file)
 60+ {
 61+ $files {$weeknum} ++ ;
 62+
 63+ open CSV_IN, '<', $file ;
 64+
 65+ while ($line = <CSV_IN>)
 66+ {
 67+ chomp $line ;
 68+ @fields = split (',', $line) ;
 69+ next if $fields [0] ne 'G' ; # grouped stats only (irrespective of version)
 70+
 71+
 72+ if ($mode_all_pages)
 73+ {
 74+ if ($yyyy_mm ge "2011-05")
 75+ { $count = $fields [4] ; }
 76+ else
 77+ { $count = $fields [3] ; }
 78+ }
 79+ else
 80+ {
 81+ next if $fields [3] ne 'page' ; # html requests only
 82+
 83+ $count = $fields [4] ;
 84+ }
 85+
 86+ # next if $count < 1000 ; # request count in 1:1000 sampled file, so less than 1 million per day
 87+
 88+ $totals_weekly {$weeknum} += $count ;
 89+ $totals_monthly {$yyyy_mm} += $count ;
 90+
 91+ $group = ucfirst (lc ($fields [2])) ;
 92+
 93+ if ($fields [1] eq 'M')
 94+ {
 95+ if ($group !~ /^(?:safari|android|opera)$/i)
 96+ { $group = 'other' ; }
 97+ $group = "$group (Mobile)" ;
 98+
 99+ $mobile_weekly {$weeknum} += $count ;
 100+ $mobile_monthly {$yyyy_mm} += $count ;
 101+
 102+ $group_daily {"Mobile,$yyyy_mm_dd"} += $count ;
 103+ $group_weekly {"Mobile,$weeknum"} += $count ;
 104+ $group_monthly {"Mobile,$yyyy_mm"} += $count ;
 105+ }
 106+ else
 107+ {
 108+ if ($group !~ /^(?:msie|firefox|chrome|opera)$/i)
 109+ { $group = 'other' ; }
 110+ $group = "$group" ;
 111+
 112+ $non_mobile_weekly {$weeknum} += $count ;
 113+ $non_mobile_monthly {$yyyy_mm} += $count ;
 114+
 115+ $group_daily {"Non-Mobile,$yyyy_mm_dd"} += $count ;
 116+ $group_weekly {"Non-Mobile,$weeknum"} += $count ;
 117+ $group_monthly {"Non-Mobile,$yyyy_mm"} += $count ;
 118+ }
 119+ # next if $fields [2] eq 'NetFront' ; # skip, occurs on few days only
 120+
 121+
 122+ $groups {$group}++ ;
 123+ $group_daily {"$group,$yyyy_mm_dd"} += $count ;
 124+ $group_weekly {"$group,$weeknum"} += $count ;
 125+ $group_monthly {"$group,$yyyy_mm"} += $count ;
 126+ $totals {$group} += $count ;
 127+
 128+ # print "$group,$count\n" ;
 129+ }
 130+ }
 131+
 132+ $time += 3600 * 24 ; # next day
 133+ }
 134+
 135+ $groups = 0 ;
 136+# push @group_list, "Non-Mobile" ;
 137+ push @group_list, "Mobile" ;
 138+ for $group (sort {$totals {$b} <=> $totals {$a}} keys %totals)
 139+ {
 140+ print "$group: " . $totals {$group} . "\n" ;
 141+ last if ++$groups > 15 ;
 142+ push @group_list, $group ;
 143+ }
 144+
 145+ # daily counts
 146+ print CSV_OUT_DAILY 'date ascii,date,' ;
 147+ for $group (@group_list)
 148+ { print CSV_OUT_DAILY "$group," ; }
 149+
 150+ print CSV_OUT_DAILY "\n" ;
 151+
 152+ for $yyyy_mm_dd (sort keys %days)
 153+ {
 154+ print CSV_OUT_DAILY $dates_ascii {$yyyy_mm_dd} . ',' ;
 155+ print CSV_OUT_DAILY $dates_excel {$yyyy_mm_dd} . ',' ;
 156+
 157+ for $group (@group_list)
 158+ {
 159+ print CSV_OUT_DAILY $group_daily {"$group,$yyyy_mm_dd"} . ',' ;
 160+ }
 161+
 162+ print CSV_OUT_DAILY "\n" ;
 163+ }
 164+
 165+ # monthly counts
 166+ print CSV_OUT_MONTHLY 'date ascii,date,' ;
 167+ for $group (@group_list)
 168+ { print CSV_OUT_MONTHLY "$group," ; }
 169+ print CSV_OUT_MONTHLY "\n" ;
 170+
 171+ for $month (sort {$a cmp $b} keys %months)
 172+ {
 173+ print CSV_OUT_MONTHLY $dates_ascii {$month} . ',' ;
 174+ print CSV_OUT_MONTHLY $dates_excel {$month} . ',' ;
 175+
 176+ last if $totals_monthly {$month} == 0 ;
 177+
 178+ for $group (@group_list)
 179+ {
 180+ if ($totals_monthly {$month} > 0)
 181+ { print CSV_OUT_MONTHLY sprintf ("%.2f", 100 * $group_monthly {"$group,$month"}/$totals_monthly {$month}) . ',' ; } }
 182+
 183+ print CSV_OUT_MONTHLY "\n" ;
 184+ }
 185+
 186+
 187+ # weekly counts
 188+ print CSV_OUT_WEEKLY 'date ascii,date,' ;
 189+ for $group (@group_list)
 190+ { print CSV_OUT_WEEKLY "$group," ; }
 191+ print CSV_OUT_WEEKLY "\n" ;
 192+
 193+ for $weeknum (sort {$a <=> $b} keys %weeknums)
 194+ {
 195+ print CSV_OUT_WEEKLY $dates_ascii {$weeknums {$weeknum}} . ',' ;
 196+ print CSV_OUT_WEEKLY $dates_excel {$weeknums {$weeknum}} . ',' ;
 197+
 198+ if ($files {$weeknum} > 0)
 199+ {
 200+ for $group (@group_list)
 201+ {
 202+ if ($totals_weekly {$weeknum} == 0)
 203+ { print CSV_OUT_WEEKLY ',' ; }
 204+ else
 205+ { print CSV_OUT_WEEKLY sprintf ("%.2f", 100 * $group_weekly {"$group,$weeknum"}/$totals_weekly {$weeknum}) . ',' ; }
 206+ }
 207+ }
 208+
 209+ print CSV_OUT_WEEKLY "\n" ;
 210+ }
 211+
 212+
 213+ # daily averages from weekly counts, adjusted for missing days
 214+ print CSV_OUT_WEEKLY "\n\n" ;
 215+ print CSV_OUT_WEEKLY 'date ascii,date,' ;
 216+ for $group (@group_list)
 217+ { print CSV_OUT_WEEKLY "$group," ; }
 218+ print CSV_OUT_WEEKLY "\n" ;
 219+
 220+ for $weeknum (sort {$a <=> $b} keys %weeknums)
 221+ {
 222+ print CSV_OUT_WEEKLY $dates_ascii {$weeknums {$weeknum}} . ',' ;
 223+ print CSV_OUT_WEEKLY $dates_excel {$weeknums {$weeknum}} . ',' ;
 224+
 225+ if ($files {$weeknum} > 0)
 226+ {
 227+ for $group (@group_list)
 228+ { print CSV_OUT_WEEKLY int ($group_weekly {"$group,$weeknum"}/$files {$weeknum}) . ',' ; }
 229+ }
 230+
 231+ print CSV_OUT_WEEKLY "\n" ;
 232+ }
 233+

Status & tagging log