r99035 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r99034‎ | r99035 | r99036 >
Date:19:41, 5 October 2011
Author:ezachte
Status:deferred
Tags:
Comment:
Collect per squid metrics from 1:1000 sampled log (# events, average sequence number gap), write several aggregations, e.g. to patch dammit.lt projectcounts hourly per wiki totals to compensate for server overload
Modified paths:
  • /trunk/wikistats/squids/SquidLoadScan.pl (added) (history)

Diff [purge]

Index: trunk/wikistats/squids/SquidLoadScan.pl
@@ -0,0 +1,189 @@
 2+#! /usr/bin/perl
 3+
 4+ $| = 1; # flush screen output
 5+
 6+# read all files on squid log aggregator with hourly counts for
 7+# - number of events received per squid
 8+# - average gap in sequence numbers (this should be 1000 idealy on a 1:1000 sampled log)
 9+# write several aggregations of these data
 10+
 11+ &ReadData ;
 12+ &ProcessData ;
 13+ &WriteHourlyAveragedDeltaSequenceNumbers ;
 14+ &WriteMonthlyAveragedEventsPerSquidPerHour ;
 15+ &WriteMonthlyMetricsPerSquidSet ;
 16+
 17+ print "\n\nReady\n\n" ;
 18+
 19+sub ProcessData
 20+{
 21+ my ($date_yyyy_mm_dd, $file) = @_ ;
 22+ $date_yyyy_mm = substr ($date_yyyy_mm_dd,0,7) ;
 23+ $months {$date_yyyy_mm}++ ;
 24+
 25+ open CSV, '<', $file ;
 26+ while ($line = <CSV>)
 27+ {
 28+ next if $line =~ /events/i ; # headers + totals
 29+ chomp $line ;
 30+ ($squid,$hour,$events,$tot_delta,$avg_delta) = split (',', $line) ;
 31+
 32+ $squid2 = $squid ;
 33+ $squid2 =~ s/\..*// ;
 34+
 35+ ($digits = $squid2) =~ s/[^\d]//g ;
 36+ $digits =~ s/\d?\d$/*/ ;
 37+ ($name = $squid2) =~ s/[\d]//g ;
 38+
 39+ $squid_set = $name . $digits ;
 40+ $squid_sets {$squid_set}++ ;
 41+ if ($squid_sets_lo {$squid_set} eq '')
 42+ { $squid_sets_lo {$squid_set} = $squid2 ; }
 43+ if ($squid_sets_hi {$squid_set} eq '')
 44+ { $squid_sets_hi {$squid_set} = $squid2 ; }
 45+ if ($squid_sets_lo {$squid_set} gt $squid2)
 46+ { $squid_sets_lo {$squid_set} = $squid2 ; }
 47+ if ($squid_sets_hi {$squid_set} lt $squid2)
 48+ { $squid_sets_hi {$squid_set} = $squid2 ; }
 49+
 50+ # if ($squid ne '')
 51+ # { $squids {"$squid,$date_yyyy_mm"} += $events ; }
 52+
 53+ $squid_events_month {"$squid,$date_yyyy_mm"} += $events ;
 54+ $squid_hours_month {"$squid,$date_yyyy_mm"} ++ ;
 55+
 56+ $squid_set_delta_month {"$squid_set,$date_yyyy_mm"} += $avg_delta ;
 57+ $squid_set_events_month {"$squid_set,$date_yyyy_mm"} += $events ;
 58+ $squid_set_hours_month {"$squid_set,$date_yyyy_mm"} ++ ;
 59+
 60+ if ($squid =~ /sq/) # only for regular squids for clearer correction data
 61+ {
 62+ $all_regular_squids_delta_hour {"$date_yyyy_mm_dd,$hour"} += $avg_delta ;
 63+ $all_regular_squids_active {"$date_yyyy_mm_dd,$hour"} ++ ;
 64+ }
 65+ }
 66+ close CSV ;
 67+}
 68+
 69+sub ReadData
 70+{
 71+ $path_squid_counts = "/a/ezachte" ;
 72+
 73+ @files = <*>;
 74+ foreach $file (@files)
 75+ {
 76+ next if ! -d $file ;
 77+ next if $file !~ /^\d\d\d\d-\d\d$/ ;
 78+ push @folders, $file ;
 79+ }
 80+
 81+ foreach $folder (@folders)
 82+ {
 83+ print "Scanning $folder\n" ;
 84+ chdir "$path_squid_counts/$folder" ;
 85+ @files = <*>;
 86+
 87+ foreach $file (@files)
 88+ {
 89+ next if ! -d $file ;
 90+ next if $file !~ /^\d\d\d\d-\d\d-\d\d$/ ;
 91+ $folder2 = $file ;
 92+ $file_csv = "$path_squid_counts/$folder/$folder2/SquidDataSequenceNumbersPerSquidHour.csv" ;
 93+ if (-e $file_csv)
 94+ { &ProcessData ($folder2, $file_csv) ; }
 95+ }
 96+ }
 97+}
 98+
 99+# this file can be used to patch projectcounts files from dammit.lt/wikistats to make up for missing events (due to server overload)
 100+# if for some hour average gap in sequence numbers is 1200 instead of 1000 this means all per wiki counts in projectcount file for that hour need correction: * 1200/1000
 101+sub WriteHourlyAveragedDeltaSequenceNumbers
 102+{
 103+ open CSV , '>', "$path_squid_counts/SquidDataHourlyAverageDeltaSequenceNumbers.csv" ;
 104+ foreach $date_hour (sort keys %all_regular_squids_active)
 105+ {
 106+ $avg_delta_all_regular_squids = sprintf ("%.0f", $all_regular_squids_delta_hour {$date_hour} / $all_regular_squids_active {$date_hour}) ;
 107+ print CSV "$date_hour,$avg_delta_all_regular_squids\n" ;
 108+ }
 109+ close CSV ;
 110+}
 111+
 112+sub WriteMonthlyAveragedEventsPerSquidPerHour
 113+{
 114+ open CSV , '>', "$path_squid_counts/SquidDataMonthlyEventsPerSquidPerHour.csv" ;
 115+ foreach $key (sort keys %squid_events_month)
 116+ {
 117+ $events_per_hour = sprintf ("%.0f", $squid_events_month {$key} / $squid_hours_month {$key}) ;
 118+ $key =~ s/(\w)0(\d\.)/$1$2/ ;
 119+
 120+ print CSV "$key,$events_per_hour\n" ;
 121+ }
 122+ close CSV ;
 123+}
 124+
 125+# monthly data per squid set, first average hourly delta between sequence numbers, then hourly number of events
 126+sub WriteMonthlyMetricsPerSquidSet
 127+{
 128+ open CSV , '>', "$path_squid_counts/SquidDataMonthlyPerSquidSet.csv" ;
 129+ print CSV "\nAverage delta in sequence numbers per squid per active hour \n\n" ;
 130+ $line = "month" ;
 131+ foreach $squid_set (sort keys %squid_sets)
 132+ {
 133+ if ($squid_sets_lo {$squid_set} eq $squid_sets_hi {$squid_set})
 134+ { $squid_range = $squid_sets_lo {$squid_set} ; }
 135+ else
 136+ {
 137+ ($squid_sets_hi_num = $squid_sets_hi {$squid_set}) =~ s/[^\d]//g ;
 138+ $squid_range = $squid_sets_lo {$squid_set} . "-" . $squid_sets_hi_num ;
 139+ }
 140+ $line .= ",$squid_range" ;
 141+ }
 142+ print CSV "$line\n" ;
 143+
 144+ foreach $month (sort keys %months)
 145+ {
 146+ $line = $month ;
 147+ foreach $squid_set (sort keys %squid_sets)
 148+ {
 149+ $key = "$squid_set,$month" ;
 150+ if ($squid_set_hours_month {$key} == 0)
 151+ { $line .= "," ; }
 152+ else
 153+ { $line .= "," . sprintf ("%.0f", $squid_set_delta_month {$key} / $squid_set_hours_month {$key}) ; }
 154+ }
 155+ print CSV "$line\n" ;
 156+ }
 157+
 158+ print CSV "\n\nAverage events per squid per active hour \n\n" ;
 159+
 160+ $line = "month" ;
 161+ foreach $squid_set (sort keys %squid_sets)
 162+ {
 163+ if ($squid_sets_lo {$squid_set} eq $squid_sets_hi {$squid_set})
 164+ { $squid_range = $squid_sets_lo {$squid_set} ; }
 165+ else
 166+ {
 167+ ($squid_sets_hi_num = $squid_sets_hi {$squid_set}) =~ s/[^\d]//g ;
 168+ $squid_range = $squid_sets_lo {$squid_set} . "-" . $squid_sets_hi_num ;
 169+ }
 170+ $line .= ",$squid_range" ;
 171+ }
 172+ print CSV "$line\n" ;
 173+
 174+ foreach $month (sort keys %months)
 175+ {
 176+ $line = $month ;
 177+ foreach $squid_set (sort keys %squid_sets)
 178+ {
 179+ $key = "$squid_set,$month" ;
 180+ if ($squid_set_hours_month {$key} == 0)
 181+ { $line .= "," ; }
 182+ else
 183+ { $line .= "," . sprintf ("%.0f", $squid_set_events_month {$key} / $squid_set_hours_month {$key}) ; }
 184+ }
 185+ print CSV "$line\n" ;
 186+ }
 187+
 188+ print CSV "\n\nSquid names grouped by first 2 digits\n" ;
 189+ close CSV ;
 190+}

Status & tagging log