Index: trunk/wikistats/squids/SquidLoadScan.pl |
— | — | @@ -0,0 +1,189 @@ |
| 2 | +#! /usr/bin/perl |
| 3 | + |
$| = 1 ; # flush screen output

# Read all files on the squid log aggregator that hold hourly counts for
# - number of events received per squid
# - average gap in sequence numbers (this should ideally be 1000 on a 1:1000 sampled log)
# and write several aggregations of these data.

# ReadData feeds every csv file it finds to ProcessData (date, file).
# ProcessData is therefore not called from here: it needs those two arguments,
# and a call without them would only register a bogus empty month.
ReadData () ;
WriteHourlyAveragedDeltaSequenceNumbers () ;
WriteMonthlyAveragedEventsPerSquidPerHour () ;
WriteMonthlyMetricsPerSquidSet () ;

print "\n\nReady\n\n" ;
| 18 | + |
# Aggregate the hourly counts from one csv file into the global summary hashes.
#
# Arguments:
#   $date_yyyy_mm_dd - date the file belongs to (first 7 characters are used as month key)
#   $file            - path of the csv file, lines are 'squid,hour,events,tot_delta,avg_delta'
#
# Updates these globals (read later by the Write... subs):
#   %months, %squid_sets, %squid_sets_lo, %squid_sets_hi,
#   %squid_events_month, %squid_hours_month,
#   %squid_set_delta_month, %squid_set_events_month, %squid_set_hours_month,
#   %all_regular_squids_delta_hour, %all_regular_squids_active
#
# A call without arguments is ignored. An unreadable file yields a warning and no counts.
sub ProcessData
{
  my ($date_yyyy_mm_dd, $file) = @_ ;

  # nothing to do without input (and do not register an empty month)
  return unless defined ($date_yyyy_mm_dd) && defined ($file) ;

  my $date_yyyy_mm = substr ($date_yyyy_mm_dd,0,7) ;
  $months {$date_yyyy_mm}++ ;

  open my $csv, '<', $file or do { warn "Could not open $file: $!\n" ; return ; } ;

  # order squid names by the number they contain, so that sq9 comes before sq10 ;
  # names with equal numbers (e.g. sq5 / sq05) are ordered as strings
  my $compare_squids = sub
  {
    my ($name_1, $name_2) = @_ ;
    (my $num_1 = $name_1) =~ s/[^\d]//g ;
    (my $num_2 = $name_2) =~ s/[^\d]//g ;
    return (($num_1 || 0) <=> ($num_2 || 0)) || ($name_1 cmp $name_2) ;
  } ;

  while (my $line = <$csv>)
  {
    next if $line =~ /events/i ; # headers + totals
    chomp $line ;
    next if $line !~ /\S/ ;      # blank line
    my ($squid, $hour, $events, undef, $avg_delta) = split (',', $line) ; # 4th column (tot_delta) is not used

    # squid name without domain, e.g. 'sq18.wikimedia.org' -> 'sq18'
    (my $squid2 = $squid) =~ s/\..*// ;

    # squid set = name with last one or two digits replaced by '*', e.g. 'sq18' -> 'sq*', 'sq118' -> 'sq1*'
    (my $digits = $squid2) =~ s/[^\d]//g ;
    $digits =~ s/\d?\d$/*/ ;
    (my $name = $squid2) =~ s/[\d]//g ;
    my $squid_set = $name . $digits ;

    $squid_sets {$squid_set}++ ;

    # remember lowest and highest squid seen for this set
    my $lo = $squid_sets_lo {$squid_set} ;
    if (! defined ($lo) || $lo eq '' || $compare_squids->($squid2, $lo) < 0)
    { $squid_sets_lo {$squid_set} = $squid2 ; }

    my $hi = $squid_sets_hi {$squid_set} ;
    if (! defined ($hi) || $hi eq '' || $compare_squids->($squid2, $hi) > 0)
    { $squid_sets_hi {$squid_set} = $squid2 ; }

    $squid_events_month {"$squid,$date_yyyy_mm"} += $events ;
    $squid_hours_month  {"$squid,$date_yyyy_mm"} ++ ;

    $squid_set_delta_month  {"$squid_set,$date_yyyy_mm"} += $avg_delta ;
    $squid_set_events_month {"$squid_set,$date_yyyy_mm"} += $events ;
    $squid_set_hours_month  {"$squid_set,$date_yyyy_mm"} ++ ;

    if ($squid =~ /sq/) # only for regular squids for clearer correction data
    {
      $all_regular_squids_delta_hour {"$date_yyyy_mm_dd,$hour"} += $avg_delta ;
      $all_regular_squids_active     {"$date_yyyy_mm_dd,$hour"} ++ ;
    }
  }
  close $csv ;
}
| 68 | + |
# Find all csv files with hourly counts per squid and feed them to ProcessData.
#
# Expected layout below $path_squid_counts:
#   yyyy-mm/yyyy-mm-dd/SquidDataSequenceNumbersPerSquidHour.csv
#
# Sets global $path_squid_counts, which the Write... subs use for their output files.
# Folders are listed by absolute path, so the result does not depend on the folder
# the script was started from, and the current working directory is left alone.
sub ReadData
{
  $path_squid_counts = "/a/ezachte" ;

  # names (sorted) of all subfolders of $dir whose name matches $pattern
  my $subfolders = sub
  {
    my ($dir, $pattern) = @_ ;
    opendir my $dh, $dir or do { warn "Could not read folder $dir: $!\n" ; return () ; } ;
    my @names = sort grep { $_ =~ $pattern && -d "$dir/$_" } readdir $dh ;
    closedir $dh ;
    return @names ;
  } ;

  foreach my $folder ($subfolders->($path_squid_counts, qr/^\d\d\d\d-\d\d$/))
  {
    print "Scanning $folder\n" ;

    foreach my $folder2 ($subfolders->("$path_squid_counts/$folder", qr/^\d\d\d\d-\d\d-\d\d$/))
    {
      my $file_csv = "$path_squid_counts/$folder/$folder2/SquidDataSequenceNumbersPerSquidHour.csv" ;
      if (-e $file_csv)
      { ProcessData ($folder2, $file_csv) ; }
    }
  }
}
| 98 | + |
# Write SquidDataHourlyAverageDeltaSequenceNumbers.csv: one line 'date,hour,average delta' per hour,
# where the average is taken over all regular squids that were active in that hour.
#
# This file can be used to patch projectcounts files from dammit.lt/wikistats to make up for missing events (due to server overload).
# If for some hour the average gap in sequence numbers is 1200 instead of 1000, this means all per wiki counts
# in the projectcounts file for that hour need correction: * 1200/1000
#
# Reads globals $path_squid_counts, %all_regular_squids_delta_hour, %all_regular_squids_active.
# Warns and returns without writing when the file can not be created.
sub WriteHourlyAveragedDeltaSequenceNumbers
{
  my $file_csv = "$path_squid_counts/SquidDataHourlyAverageDeltaSequenceNumbers.csv" ;
  open my $csv, '>', $file_csv or do { warn "Could not write $file_csv: $!\n" ; return ; } ;

  foreach my $date_hour (sort keys %all_regular_squids_active)
  {
    # every key was counted at least once, so the divisor is never zero
    my $avg_delta = $all_regular_squids_delta_hour {$date_hour} / $all_regular_squids_active {$date_hour} ;
    print $csv $date_hour . "," . sprintf ("%.0f", $avg_delta) . "\n" ;
  }

  close $csv or warn "Could not close $file_csv: $!\n" ;
}
| 111 | + |
# Write SquidDataMonthlyEventsPerSquidPerHour.csv: one line 'squid,month,events per hour' per squid per month,
# where events per hour = events received in that month / hours for which the squid was listed.
#
# Reads globals $path_squid_counts, %squid_events_month, %squid_hours_month (both keyed by 'squid,month').
# Warns and returns without writing when the file can not be created.
sub WriteMonthlyAveragedEventsPerSquidPerHour
{
  my $file_csv = "$path_squid_counts/SquidDataMonthlyEventsPerSquidPerHour.csv" ;
  open my $csv, '>', $file_csv or do { warn "Could not write $file_csv: $!\n" ; return ; } ;

  foreach my $key (sort keys %squid_events_month)
  {
    # every key was counted at least once, so the divisor is never zero
    my $events_per_hour = sprintf ("%.0f", $squid_events_month {$key} / $squid_hours_month {$key}) ;

    # remove zero padding from squid number: 'sq01.' -> 'sq1.'
    # only a zero that directly follows a non-digit is padding: 'sq101.' and 'sq100.' stay as they are
    (my $label = $key) =~ s/([^\W\d])0(\d\.)/$1$2/ ;

    print $csv "$label,$events_per_hour\n" ;
  }

  close $csv or warn "Could not close $file_csv: $!\n" ;
}
| 124 | + |
# Write SquidDataMonthlyPerSquidSet.csv: monthly data per squid set in two tables,
# first the average hourly delta between sequence numbers, then the average hourly number of events.
# Each table has one row per month and one column per squid set; a cell is left empty
# when no hours were counted for that set in that month.
#
# Reads globals $path_squid_counts, %months, %squid_sets, %squid_sets_lo, %squid_sets_hi,
# %squid_set_delta_month, %squid_set_events_month, %squid_set_hours_month.
# Warns and returns without writing when the file can not be created.
sub WriteMonthlyMetricsPerSquidSet
{
  my $file_csv = "$path_squid_counts/SquidDataMonthlyPerSquidSet.csv" ;
  open my $csv, '>', $file_csv or do { warn "Could not write $file_csv: $!\n" ; return ; } ;

  my @set_names   = sort keys %squid_sets ;
  my @month_names = sort keys %months ;

  # header line (same for both tables): each squid set is labelled with its lowest squid,
  # followed by '-' and the number of its highest squid when these differ
  my $header = "month" ;
  foreach my $squid_set (@set_names)
  {
    my $squid_range = $squid_sets_lo {$squid_set} ;
    if ($squid_sets_lo {$squid_set} ne $squid_sets_hi {$squid_set})
    {
      (my $squid_sets_hi_num = $squid_sets_hi {$squid_set}) =~ s/[^\d]//g ;
      $squid_range .= "-" . $squid_sets_hi_num ;
    }
    $header .= ",$squid_range" ;
  }

  # write header plus one row per month, each cell = monthly total / active hours
  # $totals is a reference to a hash keyed by 'squid set,month'
  my $write_table = sub
  {
    my ($totals) = @_ ;
    print $csv "$header\n" ;

    foreach my $month (@month_names)
    {
      my $line = $month ;
      foreach my $squid_set (@set_names)
      {
        my $key   = "$squid_set,$month" ;
        my $hours = $squid_set_hours_month {$key} || 0 ;
        $line .= "," ;
        if ($hours != 0)
        { $line .= sprintf ("%.0f", $totals->{$key} / $hours) ; }
      }
      print $csv "$line\n" ;
    }
  } ;

  print $csv "\nAverage delta in sequence numbers per squid per active hour \n\n" ;
  $write_table->(\%squid_set_delta_month) ;

  print $csv "\n\nAverage events per squid per active hour \n\n" ;
  $write_table->(\%squid_set_events_month) ;

  print $csv "\n\nSquid names grouped by first 2 digits\n" ;
  close $csv or warn "Could not close $file_csv: $!\n" ;
}