Index: trunk/wikistats/squids/EzLib.pm |
— | — | @@ -0,0 +1,594 @@ |
| 2 | +# Erik Zachte - general purpose routines
|
| 3 | +# subroutines in this module have names in lowercase (I usually name my own routines in mixed case, though not consistently yet)
|
| 4 | +
|
| 5 | +no warnings 'uninitialized';
|
| 6 | +
|
| 7 | +#use POSIX qw (locale_h);
|
| 8 | +#$old_locale = setlocale(LC_LANGUAGE) ;
|
| 9 | +#print "old locale LC_LANGUAGE $old_locale\n" ;
|
| 10 | +#$new_locale = setlocale(LC_LANGUAGE, "en_US.UTF-8");
|
| 11 | +#print "new locale LC_LANGUAGE $new_locale\n" ;
|
| 12 | +
|
$ez_lib_version = 14 ;

# Stop the calling script when it needs a newer EzLib than the one that is loaded.
# Argument: lowest library version the caller can work with.
# Returns normally when the loaded version is recent enough; otherwise prints a
# message and ends the program.
sub ez_lib_version
{
  $ez_lib_version_required = shift ; # stays a global, as before

  return if $ez_lib_version >= $ez_lib_version_required ;

  # The message now ends with a newline, and the exit status is non-zero so that a
  # calling shell or scheduler can see that the run failed (a bare 'exit' reports success).
  print "EzLib out of date: version $ez_lib_version_required required, version $ez_lib_version found\n" ;
  exit (1) ;
}
|
| 21 | +
|
| 22 | +use lib "/home/ezachte/lib" ;
|
| 23 | +
|
| 24 | +use Time::HiRes ;
|
| 25 | +use Time::Local ;
|
| 26 | +use Getopt::Std ;
|
| 27 | +use Carp ;
|
| 28 | +use Net::Domain qw (hostname);
|
| 29 | +use Digest::MD5 qw (md5_hex);
|
| 30 | +use Cwd ;
|
| 31 | +use Benchmark qw (timesum timediff timestr timethis timethese cmpthese) ;
|
| 32 | +use POSIX ;
|
| 33 | +
|
| 34 | +sub date_time_english ($) ;
|
| 35 | +
|
# Boolean shorthands used throughout the scripts
$true  = 1 ;
$false = 0 ;

# CPU times at start-up; the END block subtracts these from the times at exit
($app_start_user,$app_start_system) = times ;

# Host name as reported by the 'hostname' command
chomp ($hostname = `hostname`) ;

# Operating system flags
$os = $^O ;
if ($os =~ /linux/i) { $os_linux   = $true ; }
if ($os =~ /win32/i) { $os_windows = $true ; }

# Split the script path on its last slash or backslash into folder and file name.
# When $0 holds no path separator the whole value lands in $path_program.
$path_program = $0 ;
if ($os_windows)
{ $path_program = Win32::GetLongPathName ($path_program) ; }
($path_program,$name_program) = split '[\\\/](?=[^\\\/]*$)', $path_program ;

die "Operating system '$os' not supported" unless ($os_linux or $os_windows) ;

# Every Linux host is treated as the production server (to be refined);
# elsewhere the current folder serves as home folder.
if (! $os_linux)
{ $path_home = getcwd () ; }
else
{
  $job_runs_on_production_server = $true ;
  $path_home = "/home/ezachte" ;
}

# Switches for the report that the END block prints; all off by default
$trace_on_exit         = $false ; # shorthand for $trace_on_exit_concise
$trace_on_exit_concise = $false ;
$trace_on_exit_verbose = $false ;
$trace_on_exit_libs    = $false ;
|
| 67 | +
|
# Stand-in for the 'say' function of perl 5.10 and later:
# prints its first argument followed by a newline.
# Side effect: the text is also left behind in the global $msg.
sub say
{
  $msg = $_ [0] ;
  print $msg . "\n" ;
}
|
| 71 | +
|
# If the script was started without arguments, and not on the production server,
# fill @ARGV from a string with default arguments (mainly for tests).
# Argument: default arguments in one string. Arguments are separated by white space
#           and/or '|'; an argument in single or double quotes may contain spaces
#           (the quotes stay part of the argument).
# Always prints a start banner with the arguments in effect and copies @ARGV to @ARGV_BAK.
sub default_argv
{
  my $argv = shift ;

  if ((@ARGV == 0) && (! $job_runs_on_production_server))
  {
    # Hide the spaces inside quoted arguments as `` so that split leaves them in one piece.
    # The pattern for double quotes used to exclude ' rather than ", so two quoted
    # arguments in a row were merged and "it's here" was not recognised at all.
    # A lexical helper replaces the former use of the sort variable $a.
    my $hide_spaces = sub { my $quoted = shift ; $quoted =~ s# #``#g ; return $quoted ; } ;
    $argv =~ s/('[^'|]+')/$hide_spaces->($1)/ge ;
    $argv =~ s/("[^"|]+")/$hide_spaces->($1)/ge ;

    $argv =~ s/\s*\|/ /g ;
    $argv =~ s/\|\s*/ /g ;

    @ARGV = split ' ', $argv ;
    foreach my $arg (@ARGV)
    { $arg =~ s/``/ /g ; }
    $argv =~ s/``/ /g ;
  }
  else
  { $argv = join ' | ', @ARGV ; }

  print "\nScript $name_program started at " . date_time_english (time) . "\n" ;
  print "Arguments: $argv\n" ;
  print "\n" . '=' x 80 . "\n\n" ;
  @ARGV_BAK = @ARGV ;
}
|
| 95 | +
|
# Get the time a file was last modified.
# Argument: path of the file.
# Returns:  modification time in seconds since the epoch,
#           or '?' when the file does not exist (or cannot be examined).
sub file_time ($)
{
  my $path = shift ;

  # -M counts days back from the start of the script ($^T), not from the current time,
  # so 'time - (-M $path) * 24 * 60 * 60' was off by the run time of the script so far.
  # stat returns the modification time itself.
  my $modified = (stat ($path)) [9] ;

  return '?' unless defined $modified ;
  return ($modified) ;
}
|
| 106 | +
|
# Print when the installed copy of this library (/home/ezachte/lib/EzLib.pm) was last modified.
# Leaves file name, path and modification time in the globals $file_pm, $path_pm and $path_pm_age.
sub trace_ez_lib
{
  $file_pm = 'EzLib.pm' ;
  $path_pm = "/home/ezachte/lib/$file_pm" ;

  # stat instead of -M: -M is relative to the start of the script, not to the current time
  my $modified = (stat ($path_pm)) [9] ;

  # Without the file there is no date to report; this used to continue and print a bogus date
  if (! defined $modified)
  {
    print "File $path_pm not found\n" ;
    return ;
  }

  $path_pm_age = $modified ;
  print "\n$file_pm last modified: " . date_time_english ($path_pm_age) . "\n\n" ;
}
|
| 116 | +
|
| 117 | +# Print current file and line number
|
| 118 | +# print "File: ", __FILE__, " Line: ", __LINE__, "\n";
|
| 119 | +
|
| 120 | +# Flush output
|
| 121 | +$| = 1;
|
| 122 | +
|
| 123 | +# prototype (\%) forces supplying one variable argument, which also is auto converted to reference
|
| 124 | +# Pro Perl page 226: Requiring Variables Rather Than Values
|
| 125 | +
|
# Keys of a hash, ordered by their values as strings, ascending.
# invocation: @array = keys_sorted_by_value_alpha_asc (%hash) ;
# replaces:   @array = sort {$hash{$a} cmp $hash{$b}} keys %hash ;
sub keys_sorted_by_value_alpha_asc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $lookup->{$a} cmp $lookup->{$b} } keys %{$lookup} ;
}
|
| 133 | +
|
# Keys of a hash, ordered by their values as strings, descending.
# invocation: @array = keys_sorted_by_value_alpha_desc (%hash) ;
# replaces:   @array = sort {$hash{$b} cmp $hash{$a}} keys %hash ;
sub keys_sorted_by_value_alpha_desc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $lookup->{$b} cmp $lookup->{$a} } keys %{$lookup} ;
}
|
| 141 | +
|
# Keys of a hash, ordered by their values as numbers, ascending.
# invocation: @array = keys_sorted_by_value_num_asc (%hash) ;
# replaces:   @array = sort {$hash{$a} <=> $hash{$b}} keys %hash ;
sub keys_sorted_by_value_num_asc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $lookup->{$a} <=> $lookup->{$b} } keys %{$lookup} ;
}
|
| 149 | +
|
# Keys of a hash, ordered by their values as numbers, descending.
# invocation: @array = keys_sorted_by_value_num_desc (%hash) ;
# replaces:   @array = sort {$hash{$b} <=> $hash{$a}} keys %hash ;
sub keys_sorted_by_value_num_desc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $lookup->{$b} <=> $lookup->{$a} } keys %{$lookup} ;
}
|
| 157 | +
|
# Keys of a hash in ascending string order.
# Almost trivial, but offered to match the keys_sorted_by_value_... subroutines.
# invocation: @array = keys_sorted_alpha_asc (%hash) ;
# replaces:   @array = sort {$a cmp $b} keys %hash ;
sub keys_sorted_alpha_asc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $a cmp $b } keys %{$lookup} ;
}
|
| 166 | +
|
# Keys of a hash in descending string order.
# Almost trivial, but offered to match the keys_sorted_by_value_... subroutines.
# invocation: @array = keys_sorted_alpha_desc (%hash) ;
# replaces:   @array = sort {$b cmp $a} keys %hash ;
sub keys_sorted_alpha_desc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $b cmp $a } keys %{$lookup} ;
}
|
| 175 | +
|
# Keys of a hash in ascending numeric order.
# Almost trivial, but offered to match the keys_sorted_by_value_... subroutines.
# invocation: @array = keys_sorted_num_asc (%hash) ;
# replaces:   @array = sort {$a <=> $b} keys %hash ;
sub keys_sorted_num_asc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $a <=> $b } keys %{$lookup} ;
}
|
| 184 | +
|
# Keys of a hash in descending numeric order.
# Almost trivial, but offered to match the keys_sorted_by_value_... subroutines.
# invocation: @array = keys_sorted_num_desc (%hash) ;
# replaces:   @array = sort {$b <=> $a} keys %hash ;
sub keys_sorted_num_desc (\%)
{
  my ($lookup) = @_ ; # the prototype turns the hash argument into a reference
  return sort { $b <=> $a } keys %{$lookup} ;
}
|
| 193 | +
|
# Format a time as short English text in the local time zone:
# abbreviated weekday, abbreviated month, day, year, hours and minutes.
# Argument: time in seconds since the epoch.
# For a multilingual version see wikiReportsDate.pl / sub GetDate
sub date_time_english ($)
{
  my $time = shift ;

  my @weekdays = qw (Sun Mon Tue Wed Thu Fri Sat) ;
  my @months   = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) ;

  my ($min,$hour,$mday,$mon,$year,$wday) = (localtime ($time)) [1..6] ;

  return sprintf ("%s, %s %s, %s %2d:%02d", $weekdays [$wday], $months [$mon], $mday, 1900 + $year, $hour, $min) ;
}
|
| 208 | +
|
# Abbreviated English month name for a month number.
# The number is taken modulo 12 as index into Jan..Dec, so 0 and 12 give 'Jan' and 1 gives 'Feb'.
# NOTE(review): callers presumably pass a zero-based month - confirm.
# Returns '?' when the argument does not consist of digits only.
# For a multilingual version see wikiReportsDate.pl / sub GetMonthShort
sub month_english_short ($)
{
  my $month = shift ;

  return ("?") unless $month =~ /^\d+$/ ;

  return ((qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) [$month % 12]) ;
}
|
| 220 | +
|
# Abbreviated English month plus year for a month count in which 1 stands for January 2000.
# Argument: month count (digits only), e.g. 1 -> 'Jan 2000', 13 -> 'Jan 2001'.
# Returns '?' when the argument does not consist of digits only.
sub month_year_english_short ($)
{
  my @months_en = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

  my $month = shift ;
  if ($month !~ /^\d+$/)
  { return ("?") ; }
  $month-- ; # from here on 0 = Jan 2000

  # % always yields 0..11, also for month count 0 (December 1999). The year must then be
  # rounded down as well: int (-1 / 12) is 0, which used to report 'Dec 2000'.
  my $index = $month % 12 ;
  my $year  = 2000 + ($month - $index) / 12 ; # exact division

  return ($months_en [$index] . " " . $year) ;
}
|
| 232 | +
|
# Convert a number of seconds to text.
# Arguments: seconds - duration in seconds
#            format  - optional sprintf format; the number of '%' in it determines the fields:
#                      4: days, hours, minutes, seconds   3: hours, minutes, seconds
#                      2: minutes, seconds                1 (or 0): seconds
# Without format the result looks like '1 day, 2 hrs, 3 min, 4 sec'; leading units that
# are zero are left out.
# With fewer than four fields the largest field now carries the whole remaining duration
# (3725 seconds as 'min, sec' gives 62 min 5 sec); it used to hold only the remainder
# (2 min 5 sec), silently dropping hours and days.
# All work variables are lexical: the global $days, $hrs, $min and $sec are no longer overwritten.
sub ddhhmmss
{
  my $seconds = shift ;
  my $format  = shift ;

  my $days = int ($seconds / (24*3600)) ;
  $seconds -= $days * 24*3600 ;
  my $hrs  = int ($seconds / 3600) ;
  $seconds -= $hrs * 3600 ;
  my $min  = int ($seconds / 60) ;
  my $sec  = $seconds % 60 ;

  if ($format eq '')
  {
    my $text = '' ;
    if ($days > 0)
    { $text .= $days . (($days > 1) ? " days, " : " day, ") ; }
    if ($days + $hrs > 0)
    { $text .= "$hrs hrs, " ; } # 2 hrs/1 hr ?
    if ($days + $hrs + $min > 0)
    { $text .= "$min min, " ; }
    return ($text . "$sec sec") ;
  }

  my $fields = () = $format =~ /%/g ; # number of '%' in the format

  return sprintf ($format, $days, $hrs, $min, $sec) if $fields >= 4 ;
  $hrs += 24 * $days ;
  return sprintf ($format, $hrs, $min, $sec)        if $fields == 3 ;
  $min += 60 * $hrs ;
  return sprintf ($format, $min, $sec)              if $fields == 2 ;
  return sprintf ($format, $sec + 60 * $min) ;
}
|
| 261 | +
|
# Difference in seconds between two timestamps of the form yyyy-mm-ddThh:mm:ss.
# Arguments: end time, start time. Both are taken as UTC (timegm).
# Returns:   end time minus start time, in seconds.
sub yyyymmddThhmmssDiff
{
  my @stamps = @_ [0,1] ;
  my @parts ;

  # parse both timestamps first, convert afterwards
  foreach my $stamp (@stamps)
  { push @parts, [$stamp =~ /(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)/] ; }

  my ($till, $from) = map { my ($yy,$mm,$dd,$hh,$nn,$ss) = @{$_} ; timegm ($ss,$nn,$hh,$dd, $mm-1, $yy) } @parts ;

  return ($till - $from) ;
}
|
| 271 | +
|
# Difference in seconds between two timestamps of the form yyyymmddhhmmss.
# Arguments: end time, start time. Both are taken as UTC (timegm).
# Returns:   end time minus start time, in seconds.
sub yyyymmddhhmmssDiff
{
  my @stamps = @_ [0,1] ;
  my @parts ;

  # parse both timestamps first, convert afterwards
  foreach my $stamp (@stamps)
  { push @parts, [$stamp =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/] ; }

  my ($till, $from) = map { my ($yy,$mm,$dd,$hh,$nn,$ss) = @{$_} ; timegm ($ss,$nn,$hh,$dd, $mm-1, $yy) } @parts ;

  return ($till - $from) ;
}
|
| 281 | +
|
# Difference in days between two dates of the form yyyy-mm-dd.
# Arguments: end date, start date.
# Returns:   end date minus start date, in days.
sub yyyymmddDiffDays
{
  my @dates = @_ [0,1] ;
  my @parts ;

  # parse both dates first, convert afterwards
  foreach my $date (@dates)
  { push @parts, [$date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/] ; }

  my ($till, $from) = map { my ($yy,$mm,$dd) = @{$_} ; timegm (0,0,0,$dd, $mm-1, $yy) } @parts ;

  return (($till - $from) / (24 * 60 * 60)) ;
}
|
| 291 | +
|
# Number of days from the start of one month up to and including the end of another.
# Arguments: last month, first month, both of the form yyyy-mm.
# Returns:   days between the first day of the first month and the first day of the
#            month that follows the last month, e.g. ('2010-12','2010-12') gives 31.
sub yyyymmDiffDays
{
  my ($time_till, $time_from) = @_ ;
  my ($yy1,$mm1) = $time_till =~ /(\d\d\d\d)-(\d\d)/ ;
  my ($yy2,$mm2) = $time_from =~ /(\d\d\d\d)-(\d\d)/ ;

  # move on to the month after the last month, so that the last month counts in full
  $mm1++ ;
  if ($mm1 > 12) { $mm1 = 1 ; $yy1++ ; } # was '$yy++': for December the year never moved on

  $time_till = timegm (0,0,0,1, $mm1-1, $yy1) ;
  $time_from = timegm (0,0,0,1, $mm2-1, $yy2) ;
  return (($time_till - $time_from) / (24 * 60 * 60)) ;
}
|
| 303 | +
|
# Number of days in a month.
# Arguments: year (four digits), month (1..12).
# Results are remembered in the global %days_in_month_cached.
sub days_in_month
{
  my $year  = shift ;
  my $month = shift ;
  my $days  = $days_in_month_cached {"$year $month"} ;
  return $days if $days > 0 ;

  # first day of the following month
  my $month2 = $month+1 ;
  my $year2  = $year ;
  if ($month2 > 12)
  { $month2 = 1 ; $year2++ }

  # The four digit year is handed to timegm as is. Subtracting 1900 first turned the years
  # 1900..1999 into two digit years, which Time::Local assigns to a century near the
  # current year, so that e.g. February 1900 came out with 29 days.
  my $timegm1 = timegm (0,0,0,1,$month-1,$year) ;
  my $timegm2 = timegm (0,0,0,1,$month2-1,$year2) ;
  $days = ($timegm2-$timegm1) / (24*60*60) ;

  $days_in_month_cached {"$year $month"} = $days ;
  return ($days) ;
}
|
| 323 | +
|
| 324 | +
|
# End the program with an error message and a stack trace.
# Argument: message text; it is also left in the global $msg.
# Does not return: confess (module Carp) dies, which made the 'exit' that used to
# follow it unreachable; that statement has been removed.
sub abort
{
  $msg = shift ;
  confess ("\nAbort: $msg\n\n") ;
}
|
| 332 | +
|
| 333 | +
|
# Suppress a message once a date has been reached, e.g. for a note like "New feature".
# Tested on each run of the script.
# Arguments: date - day, month and four digit year, in that order, with any separator
#            text - the message
# Returns:   the text while the current time is not past 00:00 UTC at the start of that
#            date, an empty string afterwards.
sub blank_text_after
{
  my ($date, $text) = @_ ;

  my ($day,$month,$year) = $date =~ /(\d+).*?(\d+).*?(\d+)/ ;
  my $till = timegm (0,0,0,$day,$month-1,$year-1900) ;

  return ((time > $till) ? "" : $text) ;
}
|
| 346 | +
|
# Test whether a string has the shape of an IPv4 address: four groups of one to three
# digits separated by dots, optionally followed by a colon and a port number.
# The value of the groups is not checked (999.999.999.999 passes).
# Returns the result of the pattern match.
sub is_valid_ip_address
{
  my ($candidate) = @_ ;
  return ($candidate =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?:\:\d+)?$/) ;
}
|
| 353 | +
|
# Return the current high resolution time, for benchmarking.
# Hand the result to code_complete once the measured code has finished.
sub code_started
{
  my $now = Time::HiRes::time () ;
  return $now ;
}
|
| 357 | +
|
# Register one finished pass of a piece of code that is being timed.
# Arguments: label - name under which passes and time are collected
#            start - high resolution time at which the pass started (see code_started)
# Adds to the globals %code_passes and %code_time_spent, which the END block reports.
sub code_complete
{
  my $label   = shift ;
  my $start   = shift ;
  my $elapsed = Time::HiRes::time - $start ;

  $code_passes     {$label} ++ ;
  $code_time_spent {$label} += $elapsed ;
}
|
| 364 | +
|
# Divide two numbers; the only purpose is to avoid a runtime error on division by zero.
# Arguments: dividend, divisor.
# Returns:   the quotient, or nothing (empty list / undef) when the divisor is zero.
sub divide_if_allowed
{
  my ($x, $y) = @_ ;

  return if $y == 0 ;
  return ($x/$y) ;
}
|
| 375 | +
|
| 376 | +# use Encode qw(encode);
|
| 377 | +# $eckey=encode('utf8',$key);
|
# Percent-encode a url.
# Every character other than digits, ASCII letters and % : / . is replaced by % plus
# its code in hexadecimal. A % in the input is left alone, so existing escapes survive.
# The code is written with at least two digits: characters below 0x10 (tab, newline)
# used to come out as a single digit, e.g. '%9', which is not a valid escape.
sub encode_url
{
  my $url = shift ;
  $url =~ s/([^0-9a-zA-Z\%\:\/\.])/sprintf ("%%%02X",ord($1))/ge ;
  return ($url) ;
}
|
| 384 | +
|
# Percent-encode runs of two or more bytes in the range 0x80-0xFF.
# Each byte of such a run becomes % plus two hexadecimal digits. The whole run used to
# be replaced by the code of its first byte only, which lost the remaining bytes.
# A single high byte between ASCII characters is left as it is, as before.
# NOTE(review): the minimum of two presumably targets multi-byte UTF-8 sequences - confirm.
sub encode_non_ascii
{
  my $msg = shift ;
  $msg =~ s{([\x80-\xFF]{2,})}{ join ('', map { sprintf ("%%%02X", ord ($_)) } split (//, $1)) }ge ;
  return ($msg) ;
}
|
| 391 | +
|
# Replace the non-ASCII bytes of a string by html character codes (&#nnnn;), but only
# when the string contains at least one byte sequence shaped like a 2, 3 or 4 byte
# UTF-8 character. Other strings are returned unchanged.
sub convert_unicode
{
  my $string = shift ;

  my $looks_like_utf8 = ($string =~ m/[\xc0-\xdf][\x80-\xbf]|
                                      [\xe0-\xef][\x80-\xbf]{2}|
                                      [\xf0-\xf7][\x80-\xbf]{3}/x) ;
  return ($string) unless $looks_like_utf8 ;

  # unicode -> html character codes &#nnnn;
  $string =~ s/([\x80-\xFF]+)/unicode_to_html($1)/ge ;
  return ($string) ;
}
|
| 405 | +
|
# Convert a string of UTF-8 bytes to html character codes (&#nnnn;).
# Argument: byte string; ASCII characters are copied unchanged.
# Returns:  the string with each multi-byte sequence replaced by one character code,
#           as produced by unicode_to_html_tag.
# The length of a sequence follows from its first byte: below 224 two bytes, below 240
# three, below 248 four, below 252 five, otherwise six.
sub unicode_to_html
{
  my $text = shift ;
  my $html = "" ;
  my ($c, $len, $byte, $ord, $unicode, $bytes) ;

  $len = length ($text) ;
  for ($c = 0 ; $c < $len ; $c++)
  {
    $byte = substr ($text,$c,1) ;
    $ord  = ord ($byte) ;

    if ($ord < 128) # plain ascii character
    {
      $html .= $byte ;
      next ;
    }

    # single byte left >= 0x80 ? should never occur but does a few times
    # treat as pre-unicode high ascii character
    if ($c == $len - 1)
    {
      # append: this used to assign, which threw away everything converted so far
      $html .= "\&\#". $ord . ";" ;
      next ;
    }

    if    ($ord < 224) { $bytes = 2 ; }
    elsif ($ord < 240) { $bytes = 3 ; }
    elsif ($ord < 248) { $bytes = 4 ; }
    elsif ($ord < 252) { $bytes = 5 ; }
    else               { $bytes = 6 ; }

    $unicode = substr ($text,$c,$bytes) ;
    $html .= unicode_to_html_tag ($unicode) ;
    $c += $bytes - 1 ; # the loop itself skips the first byte
  }
  return ($html) ;
}
|
| 448 | +
|
| 449 | +
|
# Convert the bytes of one UTF-8 character to an html character code (&#nnnn;).
# Argument: the bytes of the character.
# Returns:  the character code, or the argument itself when its first byte is plain ASCII.
sub unicode_to_html_tag
{
  my $unicode = shift ;
  my ($lead, @tail) = map { ord ($_) } split (//, $unicode) ;

  return ($unicode) if $lead < 128 ; # plain ascii character

  # strip the length marker from the first byte: the highest of these values that is
  # not above the byte, and 192 when there is none
  my ($marker) = grep { $lead >= $_ } (252, 248, 240, 224) ;
  $marker = 192 unless defined $marker ;
  my $value = $lead - $marker ;

  # each following byte adds six bits
  foreach my $next (@tail)
  { $value = $value * 64 + $next - 128 ; }

  return ("\&\#" . $value . ";") ;
}
|
| 478 | +
|
| 479 | +
|
| 480 | +
|
| 481 | +
|
BEGIN
{
}

# Optionally print program meta data when the program ends.
# What is printed depends on the global switches:
#   $trace_on_exit, $trace_on_exit_verbose or $trace_on_exit_concise:
#     timing summary, program name and path, host, arguments, OS, perl and EzLib version
#   $trace_on_exit, $trace_on_exit_verbose or $trace_on_exit_libs:
#     library paths and own modules (path contains 'home' or 'wiki') with modification time
# Timings collected with code_started / code_complete and the closing separator lines
# are always printed.
# The timing summary is now built before it is printed. It used to be built at the very
# end and never shown, while the print statement showed whatever an earlier call of
# say or abort had left in $msg.
END
{
# if ($os_windows)
# { use Win32 ; }

  my ($time, $path) ;

  my $trace_program = ($trace_on_exit || $trace_on_exit_verbose || $trace_on_exit_concise) ;
  my $trace_modules = ($trace_on_exit || $trace_on_exit_verbose || $trace_on_exit_libs) ;

  if ($trace_program)
  {
    $time_elapsed_total = time - $^T ; # $^T is program start time
    ($app_end_user,$app_end_system) = times ;

    $time_active_user_processes   = $app_end_user   - $app_start_user ;
    $time_active_system_processes = $app_end_system - $app_start_system ;
    $time_active_total            = $time_active_user_processes + $time_active_system_processes ;

    # percentages only for runs of 5 seconds or more (also avoids division by zero)
    if ($time_elapsed_total < 5)
    { $msg = "Ready in " . ddhhmmss ($time_elapsed_total) . "\n" ; }
    else
    {
      $perc_active_user_processes   = sprintf ("%4.1f", 100 *$time_active_user_processes/$time_elapsed_total) ;
      $perc_active_system_processes = sprintf ("%4.1f", 100 *$time_active_system_processes/$time_elapsed_total) ;
      $perc_active_total            = sprintf ("%4.1f", 100 *$time_active_total/$time_elapsed_total) ;
      $msg = "Ready in " . ddhhmmss ($time_elapsed_total) . "\n\nTime spent:\n" .
             "User: $perc_active_user_processes\% (" . ddhhmmss ($time_active_user_processes) . ")\n" .
             "System: $perc_active_system_processes\% (" . ddhhmmss ($time_active_system_processes) . ")\n" .
             "Total: $perc_active_total\% (" . ddhhmmss ($time_active_total) . ")\n" ;
    }

    print "\n" . '=' x 80 . "\n\n$msg\n\n" ;

    print "Prog: $name_program\n" ;
    print "Path: $path_program\n" ;
    if ($job_runs_on_production_server)
    { print "Host: $hostname (production)\n\n" ; }
    else
    { print "Host: $hostname (test run)\n\n" ; }
    print "Args:\n\n", map {" $_\n"} @ARGV_BAK ;
    print "OS: $os\n" ;
    my $perl_version = sprintf ("%.9f",$^V) ; # lexical instead of the sort variable $a
    print "Perl: " . $perl_version . "\n" ; # perl version
    print "Perl: $^X\n" ;                   # perl exe path
    print "EzLib: $ez_lib_version\n" ;      # version of this library
  }

  if ($trace_modules)
  {
    # Get library paths
    print "\nLibs:\n", map {" $_\n"} @INC ;

    $cwd = cwd () ;
    foreach my $module (grep {$_ =~ /home|wiki/i} values %INC) # own modules
    {
      my $file = $module ;
      if ($file !~ /[\\\/]/) # no folder in the path: file lives in the current folder
      { $file = "$cwd/$file" ; }
      $time = file_time ($file) ;
      push @own_modules, "$time|$file" ;
    }

    # the numeric comparison uses the leading time of each entry: newest module first
    @own_modules = sort {$b <=> $a} @own_modules ;
    print "\nOwn modules (d/m/y h:m):\n" ;
    foreach my $own_module (@own_modules)
    {
      ($time,$path) = split '\|', $own_module ;
      my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time);
      print sprintf (" %2d/%02d/%4d %2d:%02d %s\n", $mday,$mon+1,$year+1900,$hour,$min, $path) ;
    }
    print "\n\n" ;
  }

  # time spent in code sections that were measured with code_started / code_complete
  my $lines = 0 ;
  foreach my $key (sort keys %code_passes)
  {
    if ($lines++ == 0)
    { print "Executing times:\n" ; }
    print sprintf (" %-26s","$key:") . ddhhmmss($code_time_spent{$key},'%3d min, %2d sec') . " / " .
          sprintf ("%10d",$code_passes{$key}) . " calls = " . sprintf ("%6f", divide_if_allowed ($code_time_spent{$key}, $code_passes {$key})) . " sec/pass\n" ;
  }
  print "\n" ;

  print "\n\n" . '=' x 80 . "\n" . '=' x 80 . "\n\n" ;
}
|
| 579 | +
|
# Print a progress line: an empty line, then the local time (hh:mm:ss) and the given name.
# Argument: name of the function (or step) that is about to run.
sub trace
{
  my ($function_name) = @_ ;

  # localtime yields seconds, minutes, hours; reverse puts them in display order
  my $time = sprintf ("%02d:%02d:%02d", reverse ((localtime (time)) [0..2])) ;

  print "\n$time $function_name\n" ;
}
|
| 589 | +
|
| 590 | +# only when perl compiled with malloc
|
| 591 | +# use Devel::Peek ;
|
| 592 | +# $ENV {PERL_DEBUG_MSTATS} = 2;
|
| 593 | +# mstat() ;
|
| 594 | +
|
| 595 | +1 ;
|
Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm |
— | — | @@ -1,5 +1,10 @@ |
2 | 2 | #!/usr/bin/perl |
3 | 3 | |
| 4 | +# /usr/local/bin/geoiplogtag uses /usr/share/GeoIP/GeoIP.dat |
| 5 | +# test: |
| 6 | +# echo 125.123.123.123 | /usr/local/bin/geoiplogtag 1 |
| 7 | +# refresh: bayes:/usr/share/GeoIP> wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz |
| 8 | + |
4 | 9 | sub CollectFilesToProcess |
5 | 10 | { |
6 | 11 | trace CollectFilesToProcess ; |
— | — | @@ -161,7 +166,7 @@ |
162 | 167 | if ($job_runs_on_production_server) |
163 | 168 | { |
164 | 169 | if ($file_in =~ /\.gz$/o) |
165 | | - { open IN, "-|", "gzip -dc $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html |
| 170 | + { open IN, "-|", "gzip -dc $file_in | sed s/\\ \\ */\\ /g | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html |
166 | 171 | else |
167 | 172 | { open IN, "-|", "cat $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html |
168 | 173 | $fields_expected = 14 ; |
— | — | @@ -184,10 +189,36 @@ |
185 | 190 | # print $line ; |
186 | 191 | # } |
187 | 192 | |
| 193 | + |
| 194 | +# ugly Q&D code to circumvent spaces in agent string |
| 195 | +# $line2 = $line ; |
| 196 | + chomp $line ; |
188 | 197 | @fields = split (' ', $line) ; |
189 | | - if ($#fields < $fields_expected) { $fields_too_few ++ ; next ; } |
190 | | - if ($#fields > $fields_expected) { $fields_too_many ++ ; next ; } |
| 198 | +# next if $line =~ /upload/ ; |
| 199 | +# next if $line !~ /en\.m\.wikipedia/ ; |
| 200 | +# next if $fields[10] eq '-' ; |
| 201 | +# print "mime " . $fields[10] . "\n" ; |
| 202 | +#next if $fields [9] eq '-' ; |
| 203 | +#next if $fields [9] =~ /NONE/ ; |
| 204 | + if ($#fields > 14) |
| 205 | + { |
| 206 | +# print "line $line2\n" ; |
| 207 | +# print "fields " . $#fields . "\n$line\n" ; |
| 208 | + $country_code = $fields [$#fields] ; |
| 209 | + $fields [$#fields] = '' ; |
| 210 | + $line = join (' ', @fields) ; |
| 211 | +# print "2 $line\n" ; |
| 212 | + @fields = split (' ', $line, 14) ; |
| 213 | + $fields [14] = $country_code ; |
| 214 | +# print "\n\n12: " . $fields [12] . "\n" ; |
| 215 | +# print "13: " . $fields [13] . "\n" ; |
| 216 | +# print "14: " . $fields [14] . "\n" ; |
| 217 | +# print "15: " . $fields [15] . "\n" ; |
| 218 | + } |
191 | 219 | |
| 220 | + if ($#fields < $fields_expected) { $fields_too_few ++ ; print "invalid field count " . $#fields . "\n" ; next ; } |
| 221 | + if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid field count " . $#fields . "\n" ; next ; } |
| 222 | + |
192 | 223 | $time = $fields [2] ; |
193 | 224 | |
194 | 225 | if (($oldest_time_read eq "") || ($time lt $oldest_time_read)) |
Index: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm |
— | — | @@ -0,0 +1,922 @@ |
| 2 | + #!/usr/bin/perl |
| 3 | + |
| 4 | + use lib "/home/ezachte/lib" ; |
| 5 | + use EzLib ; |
| 6 | + |
# Write how often each ip address occurs to the file $file_ip_frequencies in folder $path_out.
# Argument: path of the output folder.
# Reads the globals %ip_frequencies (address => count) and $html_pages_found.
# The file contains, as comment lines, the number of addresses per count (up to 20) and
# the number of addresses that occur at least 2, 3, ... 10000 times, followed by
# 'count,address' for each address that occurs more than once, highest count first.
# On the production server the file is compressed with bzip2 afterwards.
#
# Changes compared to the earlier version:
# - a failing chdir or open is reported and ends the subroutine; output used to go to
#   the wrong folder or was lost without any message
# - %ip_distribution is rebuilt on every call instead of added to
# - bzip2 is started without a shell, so special characters in the file name are harmless
sub WriteOutputIpFrequencies
{
  trace ('WriteOutputIpFrequencies') ;

  my $path_out = shift ;
  print "\ncd $path_out\n\n" ;
  if (! chdir ($path_out))
  {
    print "Could not change to folder '$path_out': $!\n" ;
    return ;
  }

  $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;

  my $csv ;
  if (! open ($csv, '>', $file_ip_frequencies))
  {
    print "Could not open file '$file_ip_frequencies' for writing: $!\n" ;
    return ;
  }

  print "# html pages found: $html_pages_found\n" ;
  print $csv "# html pages found: $html_pages_found\n" ;
  print $csv "#\n" ;

  # how many addresses occur once, twice, ...
  %ip_distribution = () ;
  foreach my $address (keys %ip_frequencies)
  { $ip_distribution {$ip_frequencies {$address}} ++ ; }

  # how many addresses occur at least so many times
  my @thresholds = (2, 3, 4, 5, 10, 20, 50, 100, 250, 1000, 2500, 10000) ;
  my %addresses_ge = map { $_ => 0 } @thresholds ;

  foreach my $frequency (sort {$a <=> $b} keys %ip_distribution)
  {
    my $metafreq = $ip_distribution {$frequency} ;
    foreach my $threshold (@thresholds)
    { $addresses_ge {$threshold} += $metafreq if $frequency >= $threshold ; }

    next if $frequency > 20 ; # individual counts are listed up to 20 only
    print "# $metafreq addresses occur $frequency times\n" ;
    print $csv "# $metafreq addresses occur $frequency times\n" ;
  }

  # keep the totals available under their established global names
  ($ip_distribution_ge_2,   $ip_distribution_ge_3,    $ip_distribution_ge_4,    $ip_distribution_ge_5,
   $ip_distribution_ge_10,  $ip_distribution_ge_20,   $ip_distribution_ge_50,   $ip_distribution_ge_100,
   $ip_distribution_ge_250, $ip_distribution_ge_1000, $ip_distribution_ge_2500, $ip_distribution_ge_10000) = @addresses_ge {@thresholds} ;

  print $csv "#\n" ;
  foreach my $threshold (@thresholds)
  { print $csv "# $addresses_ge{$threshold} addresses occur $threshold+ times\n" ; }

  # addresses that occur more than once, highest count first
  foreach my $address (sort {$ip_frequencies {$b} <=> $ip_frequencies {$a}} keys %ip_frequencies)
  {
    my $frequency = $ip_frequencies {$address} ;
    if ($frequency > 1)
    { print $csv "$frequency,$address\n" ; }
  }

  close $csv or print "Could not close file '$file_ip_frequencies': $!\n" ;

  if ($job_runs_on_production_server)
  {
    $cmd = "bzip2 -f $file_ip_frequencies" ;
    print "\ncmd = '$cmd'\n" ;
    system ('bzip2', '-f', $file_ip_frequencies) ;
  }
}
| 91 | + |
| 92 | +sub WriteOutputSquidSequenceGaps |
| 93 | +{ |
| 94 | + trace WriteOutputSquidSequenceGaps ; |
| 95 | + |
| 96 | + my $path_out = shift ; |
| 97 | + print "\ncd $path_out\n\n" ; |
| 98 | + chdir ($path_out) ; |
| 99 | + |
| 100 | + my ($tot_events_all_day, $tot_delta_all_day, %all_squids_events, %all_squids_delta, %squids, $tot_squids) ; |
| 101 | + |
| 102 | + $yyyy = substr ($time_to_start,0,4) ; |
| 103 | + $mm = substr ($time_to_start,5,2) ; |
| 104 | + $dd = substr ($time_to_start,8,2) ; |
| 105 | + $date = substr ($time_to_start,0,10) ; |
| 106 | + $date_excel = "\"=DATE($yyyy,$mm,$dd)\"" ; |
| 107 | + |
| 108 | + open CSV_SEQNO_PER_SQUIDHOUR, '>', $file_seqno_per_squidhour ; |
| 109 | + print CSV_SEQNO_PER_SQUIDHOUR "squid,hour,events,tot delta,avg delta\n" ; |
| 110 | + |
| 111 | + $squid_hour = 0 ; |
| 112 | + foreach $squid_hour (sort keys %squid_events) |
| 113 | + { |
| 114 | + $events = $squid_events {$squid_hour} ; |
| 115 | + next if $events == 0 ; |
| 116 | + |
| 117 | + $delta = $squid_delta {$squid_hour} ; |
| 118 | + $avg_delta = sprintf ("%.0f", $delta / $events) ; |
| 119 | + |
| 120 | + print CSV_SEQNO_PER_SQUIDHOUR "$squid_hour,$events,$delta,$avg_delta\n" ; |
| 121 | + print "$squid_hour,$events,$delta,$avg_delta\n" ; |
| 122 | + |
| 123 | + $tot_events_all_day += $events ; |
| 124 | + $tot_delta_all_day += $delta ; |
| 125 | + ($squid,$hour) = split (',', $squid_hour) ; |
| 126 | + $squids {$squid} ++ ; |
| 127 | + |
| 128 | + $all_squids_events {$hour} += $events ; |
| 129 | + $all_squids_delta {$hour} += $delta ; |
| 130 | + } |
| 131 | + |
| 132 | + foreach $squid (keys %squids) |
| 133 | + { $tot_squids++ ; } |
| 134 | + |
| 135 | + |
| 136 | + if ($tot_events_all_day > 0) |
| 137 | + { |
| 138 | + $avg_delta_all_day = sprintf ("%.0f", $tot_delta_all_day / $tot_events_all_day) ; |
| 139 | + $tot_events_all_day_corrected = sprintf ("%.0f", ($avg_delta_all_day / 1000) * $tot_events_all_day) ; |
| 140 | + |
| 141 | + print CSV_SEQNO_PER_SQUIDHOUR "# Squids: $tot_squids Events: $tot_events_all_day Avg delta: $avg_delta_all_day\n\n" ; |
| 142 | + print "\nSquids: $tot_squids\nEvents: $tot_events_all_day\nAvg delta: $avg_delta_all_day\n\n" ; |
| 143 | + } |
| 144 | + else |
| 145 | + { |
| 146 | + print CSV_SEQNO_PER_SQUIDHOUR "# Squids: $tot_squids Events: 0\n\n" ; |
| 147 | + print "\nSquids: $tot_squids\nEvents: 0\n\n" ; |
| 148 | + } |
| 149 | + close CSV_SEQNO_PER_SQUIDHOUR ; |
| 150 | + |
| 151 | + # now same thing for all squids combined, hourly |
| 152 | + |
| 153 | + undef @csv ; |
| 154 | + |
| 155 | + open CSV_SEQNO_ALL_SQUIDS_DAY, '>', $file_seqno_all_squids ; |
| 156 | + print CSV_SEQNO_ALL_SQUIDS_DAY "date,time,events,avg delta seqno\n" ; |
| 157 | + |
| 158 | + open CSV_SEQNO_ALL_SQUIDS_MONTH, '<', "../$file_seqno_all_squids" ; |
| 159 | + while ($line = <CSV_SEQNO_ALL_SQUIDS_MONTH>) |
| 160 | + { |
| 161 | + next if $line =~ /^$date/ ; |
| 162 | + next if $line =~ /^date/ ; |
| 163 | + push @csv, $line ; |
| 164 | + } |
| 165 | + close CSV_SEQNO_ALL_SQUIDS_MONTH ; |
| 166 | + |
| 167 | + open CSV_SEQNO_ALL_SQUIDS_MONTH, '>', "../$file_seqno_all_squids" ; |
| 168 | + print CSV_SEQNO_ALL_SQUIDS_MONTH "date,time,events (x 1000),avg delta seqno,date excel,events corrected (x 1000)\n" ; |
| 169 | + foreach $line (sort @csv) |
| 170 | + { print CSV_SEQNO_ALL_SQUIDS_MONTH $line ; } |
| 171 | + |
| 172 | + $hour = '' ; |
| 173 | + foreach $hour (sort keys %all_squids_events) |
| 174 | + { |
| 175 | + $avg_delta = 0 ; |
| 176 | + $events = $all_squids_events {$hour} ; |
| 177 | + $delta = $all_squids_delta {$hour} ; |
| 178 | + if ($events > 0) |
| 179 | + { $avg_delta = sprintf ("%.0f", $delta / $events) ; } |
| 180 | + |
| 181 | + print CSV_SEQNO_ALL_SQUIDS_DAY "$date,$hour,$events,$avg_delta\n" ; |
| 182 | + print CSV_SEQNO_ALL_SQUIDS_MONTH "$date,$hour,$events,$avg_delta\n" ; |
| 183 | + print "$date,$hour,$events,$avg_delta\n" ; |
| 184 | + } |
| 185 | + |
| 186 | + print CSV_SEQNO_ALL_SQUIDS_MONTH "$date,*,$tot_events_all_day,$avg_delta_all_day,$date_excel,$tot_events_all_day_corrected\n" ; |
| 187 | + print "$date,*,$tot_events_all_day,$avg_delta_all_day,$tot_events_all_day_corrected\n" ; |
| 188 | + |
| 189 | + close CSV_SEQNO_ALL_SQUIDS_DAY ; |
| 190 | + close CSV_SEQNO_ALL_SQUIDS_MONTH ; |
| 191 | +} |
| 192 | + |
# Write all squid log counters collected for the current run to the report files.
#
# Parameter:
#   $path_out - directory that is made the current directory (chdir) before the
#               csv/txt files named by the $file_csv_... / $file_out_... globals
#               are opened for writing.
#
# Reads the package-level counter hashes (%statusses, %skins, %scripts, ...),
# writes one csv file per category and prints plain-text listings to the handles
# OUT and OUT2 (neither is opened here) and to the currently selected handle.
#
# Differences from the earlier version:
#   - every output file is opened from one table, a failed open is reported with
#     warn, and every successfully opened file is closed again (CSV_REQUESTS_WAP
#     and CSV_REQUESTS_M used to stay open);
#   - %total_clients is cleared before it is summed, so totals no longer
#     accumulate over repeated calls;
#   - percentages are 0 instead of a fatal division by zero when a total is 0.
sub WriteOutputSquidLogs
{
  trace WriteOutputSquidLogs ;

  my $path_out = shift ;
  print "\ncd $path_out\n\n" ;
  chdir ($path_out) or warn "WriteOutputSquidLogs: could not chdir to '$path_out': $!\n" ;

  # Share of $part in $whole as a percentage; 0 when $whole is 0 so that an
  # empty counter hash cannot abort the run with "Illegal division by zero".
  my $percentage = sub
  {
    my ($part, $whole) = @_ ;
    return ($whole != 0) ? 100 * $part / $whole : 0 ;
  } ;

  $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;

  # Every file written by this sub: [handle, file name]. Opening and closing from
  # the same table keeps the two lists from drifting apart.
  my @outputs =
  (
    [\*CSV_METHODS,         $file_csv_methods],
    [\*CSV_SKINS,           $file_csv_skins],
    [\*CSV_SCRIPTS,         $file_csv_scripts],
    [\*CSV_IMAGES,          $file_csv_images],
    [\*CSV_BANNERS,         $file_csv_banners],
    [\*CSV_BINARIES,        $file_csv_binaries],
    [\*CSV_EXTENSIONS,      $file_csv_extensions],
    [\*CSV_REQUESTS,        $file_csv_requests],
    [\*CSV_REQUESTS_WAP,    $file_csv_requests_wap],
    [\*CSV_REQUESTS_M,      $file_csv_requests_m],
    [\*CSV_ORIGINS,         $file_csv_origins],
    [\*CSV_SEARCH,          $file_csv_search],
    [\*CSV_BOTS,            $file_csv_bots],
    [\*CSV_GOOGLEBOTS,      $file_csv_googlebots],
    [\*CSV_OPSYS,           $file_csv_opsys],
    [\*CSV_CLIENTS,         $file_csv_clients],
    [\*CSV_LANGUAGES,       $file_csv_languages],
    [\*CSV_COUNTRIES_VIEWS, $file_csv_countries_views],
    [\*CSV_COUNTRIES_SAVES, $file_csv_countries_saves],
    [\*CSV_COUNTRIESTIMED,  $file_csv_countries_timed],
    [\*OUT_REFERERS,        $file_out_referers],
    [\*CSV_CLIENTS_BY_WIKI, $file_csv_clients_by_wiki],
    [\*CSV_AGENTS,          $file_csv_agents],
  ) ;

  my @opened ;
  foreach my $output (@outputs)
  {
    my ($handle, $file) = @$output ;
    if (open ($handle, '>', $file))
    { push @opened, $output ; }
    else
    { warn "WriteOutputSquidLogs: could not open '$file' for writing: $!\n" ; }

    # every file starts with the same comment line
    print $handle $comment ;
  }

  # the operating systems file gets two extra legend lines
  print CSV_OPSYS "# mobile: $tags_mobile ($tags_mobile_upd)\n" .
                  "# pos 1: - = non mobile, M = mobile ('-'+'M'=100%), G = aggregated Group\n" ;

  # CSV_METHODS
  # Keys of %statusses containing ':total' hold the total per method, all other keys are csv detail lines.
  print OUT "\nMETHODS:\n\n" ;
  print "\nMethods:\n\n" ;
  $method_all = 0 ;
  foreach $key (sort keys %statusses)
  {
    if ($key =~ /:total/)
    {
      $total = $statusses {$key} ;
      $method_all += $total ;
      ($method = $key) =~ s/:.*$// ;
      print OUT sprintf ("%-8s", "$method:") . sprintf ("%6d",$total) . "\n" ;
      print sprintf ("%-8s", "$method:") . sprintf ("%6d",$total) . "\n" ;
    }
  }
  print OUT "TOTAL: " . sprintf ("%6d",$method_all) . "\n" ;
  print "TOTAL: " . sprintf ("%6d",$method_all) . "\n" ;

  print CSV_METHODS ":method,status,count\n" ;
  foreach $key (sort keys %statusses)
  {
    if ($key =~ /:total/)
    {
      $total = $statusses {$key} ;
      ($method = $key) =~ s/:.*$// ;
    }
    else
    {
      $total = $statusses {$key} ;

      print OUT sprintf ("%6d",$total) . " : " . $key . "\n" ;
      $key2 = $key ;
      # NOTE(review): as it stands this substitution replaces a comma by a comma (no-op);
      # presumably it once escaped commas for csv - confirm against the repository copy.
      $key2 =~ s/,/,/g ;
      $key2 =~ s/\:/,/g ;
      print CSV_METHODS "$key2,$total\n" ;
    }
  }

  # CSV_SKINS
  print OUT "\nSKINS:\n\n" ;
  print CSV_SKINS ":scripts,parameters,count\n" ;
  $total_skins = 0 ;
  foreach $key (sort keys %skins)
  {
    $total = $skins{$key} ;
    $total_skins += $total ;
    print OUT sprintf ("%5d",$total) . " : " . $key . "\n" ;
    print CSV_SKINS "$key,$total\n" ;
  }
  print OUT sprintf ("%5d",$total_skins) . " : total\n" ;

  # CSV_SCRIPTS
  print OUT "\nSCRIPTS:\n\n" ;
  print CSV_SCRIPTS ":scripts,parameters,count\n" ;
  foreach $key (sort keys %scripts)
  {
    print OUT sprintf ("%5d",$scripts{$key}) . " : " . $key . "\n" ;
    print CSV_SCRIPTS "$key,${scripts{$key}}\n" ;
  }

  print OUT "\nSCRIPTS NO FURTHER PROCESSED:\n\n" ;
  foreach $key (sort keys %scripts_no_further_processing)
  {
    print OUT sprintf ("%5d",$scripts_no_further_processing{$key}) . " : " . $key . "\n" ;
  }

  # CSV_IMAGES
  print OUT "\nIMAGE SIZES:\n\n" ;
  print CSV_IMAGES ":size range,count\n" ;
  foreach $range (sort keys %imagesizes)
  {
    ($range2 = $range) =~ s/ //g ; # csv gets the range without spaces
    $count = $imagesizes {$range} ;
    print OUT sprintf ("%5d",$count) . " : $range\n" ;
    print CSV_IMAGES "$range2,$count\n" ;
  }

  # CSV_BANNERS, most requested first
  print OUT "\nBANNERS:\n\n" ;
  print CSV_BANNERS ":country,url\n" ;
  foreach $key (sort {$banners {$b} <=> $banners {$a}} keys %banners)
  {
    print OUT sprintf ("%5d",$banners{$key}) . " : " . $key . "\n" ;
    print CSV_BANNERS "$key,${banners{$key}}\n" ;
  }

  # CSV_BINARIES, most requested first; the text listing stops after 500 entries, the csv is complete
  print OUT "\nBINARIES:\n\n" ;
  print CSV_BINARIES ":file,count\n" ;
  $cnt_binaries = 0 ;
  foreach $key (sort {$binaries {$b} <=> $binaries {$a}} keys %binaries)
  {
    if (++$cnt_binaries <= 500)
    { print OUT sprintf ("%5d",$binaries{$key}) . " : " . $key . "\n" ; }

    print CSV_BINARIES "$key,${binaries{$key}}\n" ;
  }

  # image cross reference, text listing only (follows the binaries without a heading of its own)
  foreach $key (sort keys %images_xref)
  {
    print OUT sprintf ("%5d",$images_xref{$key}) . " : " . $key . "\n" ;
  }

  # CSV_EXTENSIONS, most requested first
  print OUT "\nEXTENSIONS:\n\n" ;
  print "\nExtensions:\n\n" ;
  print CSV_EXTENSIONS ":extension,count\n" ;
  $total = 0 ;
  foreach $key (sort {$exts {$b} <=> $exts {$a}} keys %exts)
  {
    $count = $exts {$key} ;
    $total += $count ;
    print OUT sprintf ("%6d",$count) . " : $key\n" ;
    print sprintf ("%6d",$count) . " : $key\n" ;
    print CSV_EXTENSIONS "$key,$count\n" ;
  }
  print OUT sprintf ("%6d",$total) . " : total\n" ;
  print sprintf ("%6d",$total) . " : total\n" ;

  # CSV_REQUESTS, CSV_REQUESTS_WAP, CSV_REQUESTS_M
  # Same layout three times: text listing on OUT in key order, csv lines with '|'
  # turned into ',' and sorted after that conversion. @csv, $key2 and $line stay
  # package variables as before.
  my $write_requests = sub
  {
    my ($handle, $title, $header, $counts) = @_ ;

    undef @csv ;
    print OUT "\n" . $title . ":\n\n" ;
    print $handle $legend ;
    print $handle $header ;
    foreach $key (sort keys %$counts)
    {
      print OUT sprintf ("%5d",$counts->{$key}) . " : " . $key . "\n" ;
      $key2 = $key ;
      # NOTE(review): comma-for-comma substitution is a no-op as written - confirm intent.
      $key2 =~ s/,/,/g ;
      $key2 =~ s/\|/,/g ;
      push @csv, "$key2," . $counts->{$key} ;
    }
    @csv = sort @csv ;
    foreach $line (@csv)
    { print $handle "$line\n" ; }
  } ;

  $write_requests->(\*CSV_REQUESTS,     "REQUESTS",     ":project,referer,ext,mime,parms,count\n", \%requests) ;
  $write_requests->(\*CSV_REQUESTS_WAP, "REQUESTS_WAP", ":project,ext,mime,parms,country,count\n", \%requests_wap) ;
  $write_requests->(\*CSV_REQUESTS_M,   "REQUESTS_M",   ":project,ext,mime,parms,country,count\n", \%requests_m) ;

  # CSV_BOTS, most active first
  foreach $key (sort {$bots {$b} <=> $bots {$a}} keys %bots)
  { print CSV_BOTS $bots{$key} . ",$key\n" ; }

  # CSV_GOOGLEBOTS
  # The key starts with a timestamp (year at 0, month at 5, day at 8, hour at 11);
  # whatever follows the first comma is written as ip range.
  print CSV_GOOGLEBOTS "# Hits for googlebot from Google ip address\n" ;
  print CSV_GOOGLEBOTS ":date,:ip range,:hits\n" ;
  foreach $key (sort {$a cmp $b} keys %google_bot_hits)
  {
    my $year = substr ($key,0,4) ;
    my $mon  = substr ($key,5,2) ;
    my $mday = substr ($key,8,2) ;
    my $hour = substr ($key,11,2) ;
    my $date = "$year/$mon/$mday $hour:00:00" ;
    my $iprange = $key ;
    $iprange =~ s/^[^,]*,// ;

    print CSV_GOOGLEBOTS "$date,$iprange,${google_bot_hits{$key}}\n" ;
  }

  # OUT_INTERWIKI
  print OUT "\nINTERWIKI:\n\n" ;
  foreach $key (sort keys %interwiki)
  { print OUT sprintf ("%5d",$interwiki{$key}) . " : " . $key . "\n" ; }

  print OUT "\nREFERER UPLOAD:\n\n" ;
  foreach $key (sort keys %referer_upload)
  { print OUT sprintf ("%5d",$referer_upload{$key}) . " : " . $key . "\n" ; }

  # OUT_REFERERS
  print OUT_REFERERS $legend ;
  print OUT_REFERERS "referer,count\n" ;

  print OUT_REFERERS "# internal\n" ;
  foreach $key (sort keys %referers_internal)
  { print OUT_REFERERS sprintf ("%5d",$referers_internal{$key}) . " : " . $key . "\n" ; }

  print OUT_REFERERS "# external\n" ;
  foreach $key (sort {$origins_external {$b} <=> $origins_external {$a} } keys %origins_external)
  { print OUT_REFERERS sprintf ("%5d",$origins_external{$key}) . " : " . $key . "\n" ; }

  print OUT_REFERERS "# unsimplified\n" ;
  foreach $key (sort keys %origins_unsimplified)
  { print OUT_REFERERS sprintf ("%5d",$origins_unsimplified{$key}) . " : " . $key . "\n" ; }

  print OUT_REFERERS "# simplified\n" ;
  foreach $key (sort keys %origin_simplified)
  { print OUT_REFERERS sprintf ("%5d",$origin_simplified{$key}) . " : " . $key . "\n" ; }

  print "\nLook alikes:\n\n" ;
  print OUT_REFERERS "# look alikes\n" ;
  foreach $key (sort {$wikis {$b} <=> $wikis {$a}} keys %wikis)
  {
    print OUT_REFERERS sprintf ("%5d",$wikis{$key}) . " : " . $key . "\n" ;
    print sprintf ("%5d",$wikis{$key}) . " : " . $key . "\n" ;
  }

  # CSV_ORIGINS
  print OUT "\nORIGINS:\n\n" ;
  print CSV_ORIGINS ":toplevel,count\n" ;
  foreach $key (sort keys %origins)
  {
    print OUT sprintf ("%8d",$origins{$key}) . " : " . $key . "\n" ;
    print CSV_ORIGINS "$key,${origins{$key}}\n" ;
  }

  # CSV_SEARCH
  print OUT "\nSEARCHES:\n" ;
  print CSV_SEARCH ":matches (ip range|referer|agent string),site,referer group,bot,agent match,mime group,top level domain,count\n" ;
  foreach $key (sort keys %search)
  {
    print OUT sprintf ("%8d",$search{$key}) . " : " . $key . "\n" ;
    print CSV_SEARCH "$key,${search{$key}}\n" ;
  }

  # CSV_LANGUAGES
  print OUT "\nLANGUAGES:\n\n" ;
  print CSV_LANGUAGES ":browser,:language,:count\n" ;
  foreach $key (sort keys %languages)
  {
    print OUT sprintf ("%8d",$languages{$key}) . " : " . $key . "\n" ;
    print CSV_LANGUAGES "$key,${languages{$key}}\n" ;
  }

  print OUT "\nGOOGLE BOTS:\n\n" ;
  foreach $key (sort keys %googlebots)
  { print OUT sprintf ("%5d",$googlebots{$key}) . " : " . $key . "\n" ; }

  print OUT "\nGOOGLE BINS:\n\n" ;
  print "\nGoogle bins:\n\n" ;
  foreach $key (sort {$googlebins {$b} <=> $googlebins {$a}} keys %googlebins)
  {
    print OUT sprintf ("%5d",$googlebins{$key}) . " : " . $key . "\n" ;
    print sprintf ("%5d",$googlebins{$key}) . " : " . $key . "\n" ;
  }

  print OUT "\nGOOGLE BINS 2:\n\n" ;
  print "\nGoogle bins 2:\n\n" ;
  foreach $key (sort {$googlebins2 {$b} <=> $googlebins2 {$a}} keys %googlebins2)
  {
    print OUT sprintf ("%5d",$googlebins2{$key}) . " : " . $key . "\n" ;
    print sprintf ("%5d",$googlebins2{$key}) . " : " . $key . "\n" ;
  }

  print OUT "\nDOMAIN ERRORS:\n\n" ;
  foreach $key (sort { $domain_errors {$b} <=> $domain_errors {$a}} keys %domain_errors)
  { print OUT sprintf ("%5d",$domain_errors{$key}) . " : " . $key . "\n" ; }

  print OUT "\nUNRECOGNIZED GOOGLE AGENTS:\n\n" ;
  foreach $key (sort { $googleagents {$b} <=> $googleagents {$a}} keys %googleagents)
  { print OUT sprintf ("%5d",$googleagents{$key}) . " : " . $key . "\n" ; }

  print OUT "\nGOOGLE LOOK ALIKES:\n\n" ;
  foreach $key (sort { $google_imposters {$b} <=> $google_imposters {$a}} keys %google_imposters)
  { print OUT sprintf ("%5d",$google_imposters{$key}) . " : " . $key . "\n" ; }

  print OUT "\nYAHOO BOTS:\n\n" ;
  foreach $key (sort keys %yahoobots)
  { print OUT sprintf ("%5d",$yahoobots{$key}) . " : " . $key . "\n" ; }

  # ip ranges with fewer than 10 hits are left out of both listings
  if ($count_hits_per_ip_range)
  {
    print OUT "\nIP ACTIVITY BY COUNT:\n\n" ;
    foreach $key (sort {$cnt_ip_ranges {$b} <=> $cnt_ip_ranges {$a}}keys %cnt_ip_ranges)
    {
      if ($cnt_ip_ranges {$key} >= 10)
      { print OUT sprintf ("%5d",$cnt_ip_ranges{$key}) . " : " . $key . "\n" ; }
    }
  }

  print OUT "\nIP ACTIVITY BY ADDRESS:\n\n" ;
  foreach $key (sort keys %cnt_ip_ranges)
  {
    if ($cnt_ip_ranges {$key} >= 10)
    { print OUT sprintf ("%5d",$cnt_ip_ranges{$key}) . " : " . $key . "\n" ; }
  }

  # CSV_OPSYS
  print OUT2 "\nOPERATING SYSTEMS:\n\n" ;
  print CSV_OPSYS ":rectype,opsys,count\n" ;
  $total_operating_systems = 0 ;

  foreach $key (keys %operating_systems)
  { $total_operating_systems += $operating_systems{$key} ; }

  print OUT2 "\nTOTAL_OPERATING_SYSTEMS: $total_operating_systems\n\n" ;
  foreach $key (sort keys %operating_systems)
  {
    my $count  = $operating_systems {$key} ;
    my $count2 = sprintf ("%5d",$count) ;
    my $share  = $percentage->($count, $total_operating_systems) ;
    my $perc1  = sprintf ("%6.2f",$share) . "%" ;
    my $perc2  = sprintf ("%.2f",$share) . "%" ;

    if ($count >= 1)
    { print OUT2 "$count2 = $perc1: $key \n" ; }

    print CSV_OPSYS "$key,$count,$perc2\n" ;
  }

  # Grouped lines; each call adds its group total to $total_operating_systems_printed.
  print OUT2 "\nOPERATING SYSTEMS GROUPED:\n\n" ;
  $total_operating_systems_printed = 0 ;
  foreach my $os_group ("BlackBerry", "DoCoMo", "FreeBSD", "iPad", "iPhone", "Linux", "Mac", "OpenBSD", "SunOS", "SymbianOS", "Windows")
  { WriteOutputLineToCsvSharePerOs ($total_operating_systems, $os_group) ; }

  print OUT2 sprintf ("%6d",$total_operating_systems_printed) . "=" . sprintf ("%5.2f",$percentage->($total_operating_systems_printed, $total_operating_systems)) . "% : Total\n\n" ;

  # NOTE(review): 'Mandriva' is listed twice, so its group line is written twice - confirm whether intended.
  @LinuxVersions = split (',', 'Android,Xubuntu,Kubuntu,Ubuntu,Gentoo,PCLinuxOS,CentOS,Oracle,Mandriva,Red Hat,Mandriva,openSUSE,SUSE,Fedora,Epiphany,Mint,Mips,Arch,Debian,Slackware,Motor,Other') ;

  WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac Intel") ;
  WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac PowerPC") ;

  foreach $LinuxVersion (@LinuxVersions)
  { WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Linux $LinuxVersion") ; }

  # CSV_CLIENTS
  # Keys of %clients are 'mobile,version,mimecat'. Percentages are relative to the total per mime category.
  print OUT2 "\nCLIENTS:\n\n" ;
  print CSV_CLIENTS ":mobile,engine,client,mime-cat\n" ;
  $total_clients = 0 ;
  undef %total_clients ; # the per mime category totals live in this hash, not in the scalar reset above
  # NOTE(review): %grouped_clients is added to below without being cleared here - confirm the caller resets it.
  foreach $key (keys %clients)
  {
    ($mobile,$version,$mimecat) = split (',', $key) ;
    print OUT2 "\%CLIENTS: '$mobile','$version','$mimecat': " . $clients{$key} . "\n" ;
    $total_clients {$mimecat} += $clients{$key} ;
    # group on client name only: drop everything from the first space or slash onwards
    $version =~ s/ .*$// ;
    $version =~ s/\/.*$// ;
    $version =~ s/,/,/g ; # NOTE(review): no-op as written - confirm intent
    $group = "$mobile,$version,$mimecat" ;
    $grouped_clients {$group} += $clients{$key} ;
  }
  foreach $key (sort keys %clients)
  {
    ($mobile,$version,$mimecat) = split (',', $key) ;
    my $count  = $clients {$key} ;
    my $count2 = sprintf ("%5d",$count) ;
    my $share  = $percentage->($count, $total_clients {$mimecat}) ;
    my $perc1  = sprintf ("%6.2f",$share) . "%" ;
    my $perc2  = sprintf ("%.2f" ,$share) . "%" ;

    if ($clients {$key} >= 3)
    { print OUT2 "$count2 = $perc1: $key\n" ; }

    print CSV_CLIENTS "$key,$count,$perc2\n" ;
  }
  foreach $key (sort keys %engines)
  {
    my $count = $engines {$key} ;
    print CSV_CLIENTS "E,$key,$count\n" ;
  }
  foreach $key (sort keys %grouped_clients)
  {
    ($group,$version,$mimecat) = split (',', $key) ;
    my $count = $grouped_clients {$key} ;
    my $perc2 = sprintf ("%.2f",$percentage->($count, $total_clients {$mimecat})) . "%" ;
    print CSV_CLIENTS "G,$key,$count,$perc2\n" ;
  }

  # CSV_CLIENTS_BY_WIKI, percentages relative to the overall total
  print OUT2 "\nCLIENTS BY WIKI:\n\n" ;
  print CSV_CLIENTS_BY_WIKI ":mobile,client,mime-cat\n" ;
  $total_clients = 0 ;
  foreach $key (keys %clients_by_wiki)
  { $total_clients += $clients_by_wiki{$key} ; }

  foreach $key (sort keys %clients_by_wiki)
  {
    my $count  = $clients_by_wiki {$key} ;
    my $count2 = sprintf ("%5d",$count) ;
    my $share  = $percentage->($count, $total_clients) ;
    my $perc1  = sprintf ("%6.2f",$share) . "%" ;
    my $perc2  = sprintf ("%.2f",$share) . "%" ;
    if ($clients_by_wiki {$key} >= 3)
    { print OUT2 "$count2 = $perc1: $key\n" ; }
    ($mobile,$version,$domain,$mimecat) = split (',', $key) ;
    $domain = ExpandAbbreviation ($domain) ;
    $domain =~ s/:/,/ ;
    $domain =~ s/\ /--/ ;
    print CSV_CLIENTS_BY_WIKI "$mobile,$version,$domain,$mimecat,$count,$perc2\n" ;
  }

  foreach $key (sort keys %grouped_clients_by_wiki)
  {
    my $count = $grouped_clients_by_wiki {$key} ;
    my $perc2 = sprintf ("%.2f",$percentage->($count, $total_clients)) . "%" ;
    print CSV_CLIENTS_BY_WIKI "G,$key,$count,$perc2\n" ;
  }

  print OUT2 "\nGOOGLEBOT NOT FROM GOOGLE\n\n" ;
  foreach $key (sort keys %ip_bot_no_google)
  {
    if ($ip_bot_no_google {$key} >= 3)
    { print OUT2 sprintf ("%5d",$ip_bot_no_google{$key}) . " : " . $key . "\n" ; }
  }

  print OUT2 "\nMOBILE OTHER\n\n" ;
  foreach $key (sort keys %mobile_other)
  { print OUT2 sprintf ("%5d",$mobile_other{$key}) . " : " . $key . "\n" ; }

  # CSV_COUNTRIES_VIEWS
  foreach $key (sort keys %countries_views)
  {
    my $count = $countries_views {$key} ;
    print CSV_COUNTRIES_VIEWS "$key,$count\n" ;
  }

  # CSV_COUNTRIES_SAVES, echoed to the selected handle as well
  foreach $key (sort keys %countries_saves)
  {
    my $count = $countries_saves {$key} ;
    print CSV_COUNTRIES_SAVES "$key,$count\n" ;
    print "$key,$count\n" ;
  }

  # CSV_COUNTRIESTIMED
  foreach $key (sort keys %countries_timed)
  {
    my $count = $countries_timed {$key} ;
    print CSV_COUNTRIESTIMED "$key,$count\n" ;
  }

  # CSV_AGENTS: agents with a count below 5 are skipped; commas in the agent
  # string become semicolons so the count stays the last csv field
  foreach $key (keys_sorted_by_value_num_desc %agents_raw)
  {
    my $count = $agents_raw {$key} ;
    $key =~ s/,/;/g ;
    next if $count < 5 ;
    print CSV_AGENTS "$key,$count\n" ;
  }

  # Buffered write errors only show up on close, so report those too.
  foreach my $output (@opened)
  {
    my ($handle, $file) = @$output ;
    close ($handle) or warn "WriteOutputSquidLogs: error closing '$file': $!\n" ;
  }
}
| 748 | + |
# Write the filtered edit/submit counters to "$path_out/$file_csv_indexphp".
#
# Parameter:
#   $path_out - directory that is made the current directory and that is also
#               used as prefix for the output file name.
#
# Entries of %index_php_raw (key: 'client ip,rest') are only kept when
# %client_ip_record_cnt holds a count below 2 for that ip address; they are
# summed per 'rest' into %index_php. %index_php_raw is emptied afterwards.
# Progress figures are printed to the currently selected handle.
sub WriteOutputEditsSavesFile
{
  trace WriteOutputEditsSavesFile ;

  my ($path_out) = @_ ;
  print "\ncd $path_out\n\n" ;
  chdir ($path_out) ;

  $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;

  # histogram: how many ip addresses were seen once, twice, ...
  foreach my $occurrences (values %client_ip_record_cnt)
  { $client_ip_record_cnt_total {$occurrences}++ ; }

  print "\n\nEdit submit lines:\n" ;
  foreach my $times (sort {$b <=> $a} keys %client_ip_record_cnt_total)
  {
    my $addresses = $client_ip_record_cnt_total {$times} ;
    print sprintf ("%5d", $addresses) . " ip address(es) occur $times times\n" ;
    $lines_edit_submit_total += $times * $addresses ;
  }
  print "Total edit submit lines: $lines_edit_submit_total\n\n" ;

  # only keep edits/submits for ip addresses which occur less than twice
  foreach my $raw_key (keys %index_php_raw)
  {
    ($client_ip,$key2) = split (',', $raw_key, 2) ;
    next unless $client_ip_record_cnt {$client_ip} < 2 ;

    my $hits = $index_php_raw {$raw_key} ;
    $index_php {$key2}    += $hits ;
    $edit_submit_filtered += $hits ;
  }
  undef %index_php_raw ;

  open CSV_INDEXPHP, '>', "$path_out/$file_csv_indexphp" ;
  print CSV_INDEXPHP $comment ;

  # highest count first
  my @ranked = sort {$index_php {$b} <=> $index_php {$a}} keys %index_php ;
  foreach my $entry (@ranked)
  {
    print CSV_INDEXPHP $entry . "," . $index_php {$entry} . "\n" ;
    $lines_edit_submit_filtered ++ ;
  }
  print "Filtered edits+submits: $edit_submit_filtered in $lines_edit_submit_filtered lines\n\n" ;

  close CSV_INDEXPHP ;
}
| 796 | + |
# Write the saves-per-country counters to "$path_out/$file_csv_countries_saves".
#
# Parameter:
#   $path_out - directory in which the csv file is created.
#
# The file starts with a comment line naming the period $time_to_start till
# $time_to_stop, followed by one 'key,count' line per entry of %countries_saves
# in sorted key order.
#
# Returns the result of close on success. When the file cannot be opened a
# warning is issued and nothing is written (before, the failure went unnoticed).
sub WriteOutputCountriesSaves
{
  my $path_out = shift ;

  $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;

  my $file = "$path_out/$file_csv_countries_saves" ;
  unless (open (CSV_COUNTRIES_SAVES, '>', $file))
  {
    warn "WriteOutputCountriesSaves: could not open '$file' for writing: $!\n" ;
    return ;
  }

  print CSV_COUNTRIES_SAVES $comment ;

  foreach my $key (sort keys %countries_saves)
  {
    my $count = $countries_saves {$key} ;
    print CSV_COUNTRIES_SAVES "$key,$count\n" ;
  }

  # buffered write errors (disk full etc.) only surface on close
  my $closed = close (CSV_COUNTRIES_SAVES) ;
  warn "WriteOutputCountriesSaves: error closing '$file': $!\n" unless $closed ;
  return $closed ;
}
| 813 | + |
# Print end-of-run diagnostics.
#
# ERR receives the record-level problem counters (each only when above zero)
# and the list of unrecognized domains; OUT receives the lines read per date;
# the currently selected handle receives the lines read per date plus the
# referer, origin and html request totals. None of the handles is opened here.
sub WriteDiagnostics
{
  print ERR "Statusses non 'TCP..' : $statusses_non_tcp\n" if $statusses_non_tcp > 0 ;
  print ERR "Too many fields on $fields_too_many records. (space in article name?)\n" if $fields_too_many > 0 ;
  print ERR "Too few fields on $fields_too_few records.\n" if $fields_too_few > 0 ;

  my $heading = "\nLines read per date:\n" ;
  print $heading ;
  print OUT $heading ;
  foreach my $date (sort keys %lines_read)
  {
    my $line = sprintf ("%s: %8d\n", $date, $lines_read{$date}) ;
    print OUT $line ;
    print $line ;
  }
  print OUT "\n" ;
  print "\n" ;

  print "Referers internal $tot_referers_internal\n" .
        "Referers external $tot_referers_external\n" .
        "Origins counted $tot_origins_external_counted\n" ;

  print ERR "\nUnrecognized domains:\n\n" ;
  foreach my $domain (sort keys %unrecognized_domains)
  { print ERR sprintf ("%5d : %s\n", $unrecognized_domains{$domain}, $domain) ; }

  print "\n$tot_mime_html html requests found.\n" .
        "country info stored for $tot_mime_html2 html requests.\n" ;
}
| 848 | + |
# Write one grouped ('G') operating system line to OUT2 and CSV_OPSYS.
#
# Parameters:
#   $total_all - total to express the group as a percentage of; when it is not
#                above zero the percentage is written as '..'
#   $criteria  - group name; spaces are turned into '.*' and the result is used
#                as a regular expression on the keys of %operating_systems
#
# The counts of all matching keys are summed, written with their share, and
# added to the package variable $total_operating_systems_printed. For criteria
# containing 'Linux' the first 20 match attempts are traced on the selected handle.
sub WriteOutputLineToCsvSharePerOs
{
  my ($total_all, $criteria) = @_ ;

  my $pattern_text = $criteria ;
  $pattern_text =~ s/ /.*/g ;
  my @patterns = split (' ', $pattern_text) ;

  my $trace_linux   = ($criteria =~ /Linux/) ;
  my $traced        = 0 ;
  my $matched_total = 0 ;

  print "WriteOutputLineToCsvSharePerOs $criteria\n" ;
  foreach my $os (keys %operating_systems)
  {
    my $all_match = $true ;
    foreach my $pattern (@patterns)
    {
      my $hit = ($os =~ /$pattern/) ;

      if (($traced++ < 20) && $trace_linux)
      { print "key $os criterion $pattern " . ($hit ? "TRUE" : "FALSE") . "\n" ; }

      next if $hit ;

      # first failing pattern decides, no need to test the others
      $all_match = $false ;
      last ;
    }
    $matched_total += $operating_systems {$os} if $all_match ;
  }

  my $share_known = ($total_all > 0) ;
  $perc_operating_systems1 = $share_known ? sprintf ("%5.2f",(100*$matched_total/$total_all)) : ".." ;
  $perc_operating_systems2 = $share_known ? sprintf ("%.2f",(100*$matched_total/$total_all)) : ".." ;

  print OUT2 sprintf ("%6d",$matched_total) . "= $perc_operating_systems1\% : $criteria\n" ;
  print CSV_OPSYS "G,$criteria,$matched_total,$perc_operating_systems2\%\n" ;
  $total_operating_systems_printed += $matched_total ;
}
| 892 | + |
| 893 | +# Change into $path_out_month, then compress the files $file_edits_saves and |
| 894 | +# $file_csv_agents (names set outside this sub) inside $path_out with 'bzip2 -f'. |
| 895 | +# $date_collect_files is accepted for backward compatibility only: it was referenced |
| 896 | +# solely by the disabled rename and 'tar + bzip2 all csv files' steps, dropped here. |
| 897 | +# Failures are reported with warn but never abort the job. |
| 898 | +sub MoveAndCompressFiles |
| 899 | +{ |
| 900 | +  trace MoveAndCompressFiles ; |
| 901 | + |
| 902 | +  my ($path_out, $path_out_month, $date_collect_files) = @_ ; |
| 903 | + |
| 904 | +  print "\ncd $path_out_month\n" ; |
| 905 | +  chdir ($path_out_month) or warn "chdir '$path_out_month' failed: $!\n" ; |
| 906 | + |
| 907 | +  foreach my $file ($file_edits_saves, $file_csv_agents) |
| 908 | +  { |
| 909 | +    $cmd = "bzip2 -f $path_out/$file" ; |
| 910 | +    print "\ncmd = '$cmd'\n" ; |
| 911 | + |
| 912 | +    # list form of system: no shell is involved, so blanks or shell metacharacters |
| 913 | +    # in the path cannot split or alter the command |
| 914 | +    my $status = system ('bzip2', '-f', "$path_out/$file") ; |
| 915 | +    next if $status == 0 ; |
| 916 | + |
| 917 | +    # system returns -1 when bzip2 could not be started, otherwise its wait status |
| 918 | +    my $reason = ($status == -1) ? $! : "wait status $status" ; |
| 919 | +    warn "'$cmd' failed: $reason\n" ; |
| 920 | +  } |
| 921 | +} |
| 922 | + |
| 923 | +1 ; |
Index: trunk/wikistats/squids/SquidCountArchive.pl |
— | — | @@ -510,6 +510,7 @@ |
511 | 511 | undef %squid_events ; |
512 | 512 | undef %squid_seqno ; |
513 | 513 | undef %statusses ; |
| 514 | + undef %total_clients ; |
514 | 515 | undef %unrecognized_domains ; |
515 | 516 | undef %wikis ; |
516 | 517 | # undef @files ; |
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm |
— | — | @@ -4,13 +4,20 @@ |
5 | 5 | { |
6 | 6 | my $line = shift ; |
7 | 7 | |
8 | | - my @fields = split (' ', $line) ; |
9 | 8 | $time = $fields [2] ; |
10 | 9 | $date = substr ($time,0,10) ; |
11 | 10 | |
12 | 11 | $client_ip = $fields [4] ; |
13 | 12 | $mime = $fields [10] ; |
| 13 | + $url = lc ($fields [8]) ; |
14 | 14 | |
| 15 | + if ($mime eq '-') |
| 16 | + { |
| 17 | + # no mime type on log records from varnish, assume 'page request' on most, until that stream has been fixed |
| 18 | + if (($url =~ /\.m\..*?\/wiki\//) || ($url =~ /\.m\..*?\/w\/index.php/)) |
| 19 | + { $mime = "text/html" ; } |
| 20 | + } |
| 21 | + |
15 | 22 | if ($scan_ip_frequencies) # phase 1 |
16 | 23 | { |
17 | 24 | return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ; |
— | — | @@ -49,8 +56,12 @@ |
50 | 57 | $status = $fields [5] ; |
51 | 58 | $size = $fields [6] ; |
52 | 59 | $method = $fields [7] ; |
53 | | - $url = lc ($fields [8]) ; |
54 | 60 | |
| 61 | + $referer = lc ($fields [11]) ; |
| 62 | + $agent = $fields [13] ; |
| 63 | + |
| 64 | +# print "\ntime '$time', client_ip '$client_ip', mime '$mime', squid '$squid', seqno '$seqno', \nstatus '$status', size '$size', method '$method', referer '$referer',\nurl '$url', agent '$agent'\n" ; |
| 65 | + |
55 | 66 | if ($url =~ /\.m\.wikipedia.org/) |
56 | 67 | { |
57 | 68 | $url_wikipedia_mobile ++ ; |
— | — | @@ -63,9 +74,6 @@ |
64 | 75 | } |
65 | 76 | } |
66 | 77 | |
67 | | - $referer = lc ($fields [11]) ; |
68 | | - $agent = $fields [13] ; |
69 | | - |
70 | 78 | $url =~ s/^http\w?\:\/\///o ; |
71 | 79 | $url =~ s/\%3A/:/gio ; |
72 | 80 | $url =~ s/\%3B/;/gio ; |
— | — | @@ -355,7 +363,11 @@ |
356 | 364 | |
357 | 365 | if ($os =~ /Linux/o) |
358 | 366 | { |
359 | | - ($osx = $agent2) =~ s/^.*?((?:Android|Ubuntu|Gentoo|PCLinuxOS|CentOS|Red Hat|Mandriva|SUSE|Fedora|Epiphany|Debian|Motor\w+)[^\s;\[\]\(\)]*).*$/ucfirst($1)/ieo ; |
| 367 | + ($cpu = $agent2) =~ s/^.*?(armv\d+|i\d+|x[0-9_]+).*$/$1/o ; |
| 368 | + if ($cpu eq $agent2) |
| 369 | + { $cpu = '' ; } |
| 370 | + |
| 371 | + ($osx = $agent2) =~ s/^.*?((?:Android|Xubuntu|Kubuntu|Ubuntu|Gentoo|PCLinuxOS|CentOS|Oracle|Mandriva|Red Hat|Mandriva|openSUSE|SUSE|Fedora|Epiphany|Mint|Mips|Arch|Debian|Slackware|Motor\w+)[^\s;\[\]\(\)]*).*$/ucfirst($1)/ieo ; |
360 | 372 | if ($osx ne $agent2) |
361 | 373 | { |
362 | 374 | $osx =~ s/(\d+\_\d+).*$/$1/o ; |
— | — | @@ -363,8 +375,18 @@ |
364 | 376 | $osx =~ s/_/\./o ; |
365 | 377 | $osx =~ s/(\d+\.\d+).*$/$1/o ; |
366 | 378 | $osx =~ s/^(Motor)(\w+).*$/ucfirst(lc($1)).uc($2)/ieo ; |
367 | | - $os = "$os $osx" ; |
368 | 379 | } |
| 380 | + else |
| 381 | + { $osx = "Other" ; } |
| 382 | + |
| 383 | + $os = "$os $cpu $osx" ; |
| 384 | + $os =~ s/\s\s+/ /g ; |
| 385 | + |
| 386 | + # testing: |
| 387 | + # if ($osx eq $agent2) |
| 388 | + # { print "Linux ?? -> $agent2\n" ; } |
| 389 | + # elsif ($osx !~ /(?:Android|Ubuntu)/i) |
| 390 | + # { print "Linux !! $cpu $osx -> $agent2\n" ; } |
369 | 391 | } |
370 | 392 | |
371 | 393 | $os =~ s/(Windows NT \d+\.\d+).*$/$1/o ; |
— | — | @@ -1189,7 +1211,7 @@ |
1190 | 1212 | ($path = $url) =~ s/^.*?\.org\///o ; |
1191 | 1213 | ($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path |
1192 | 1214 | |
1193 | | - $binaries {$file} ++ ; |
| 1215 | + $binaries {$path} ++ ; # Jan 2012 store path, not file only |
1194 | 1216 | |
1195 | 1217 | if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io) |
1196 | 1218 | { |
Index: trunk/wikistats/squids/SquidReportArchive.pl |
— | — | @@ -6,8 +6,10 @@ |
7 | 7 | ez_lib_version (2) ; |
8 | 8 | |
9 | 9 | # set defaults mainly for tests on local machine |
10 | | - default_argv "-m 2011-07 " ; |
11 | | -# default_argv "-c -q 2010Q4" ; |
| 10 | +# default_argv "-m 2011-07 " ; |
| 11 | +# default_argv "-c -q 2010Q1" ; |
| 12 | +# default_argv "-w" ; # refresh country info from Wikipedia (population etc) |
| 13 | + default_argv "-c" ; |
12 | 14 | |
13 | 15 | # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs |
14 | 16 | # ReportOrigin how to handle '!error <-> other |
— | — | @@ -30,6 +32,8 @@ |
31 | 33 | |
32 | 34 | getopt ("dmq", \%options) ; |
33 | 35 | |
| 36 | + undef %country_code_not_specified_reported ; |
| 37 | + |
34 | 38 | if (-d "/a/squid") |
35 | 39 | { |
36 | 40 | print "\n\nJob runs on server $hostname\n\n" ; |
— | — | @@ -52,11 +56,13 @@ |
53 | 57 | print "Path in = $path_in\n" ; |
54 | 58 | print "Path out = $path_out\n" ; |
55 | 59 | |
| 60 | + $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ; |
| 61 | + |
56 | 62 | # periodically harvest updated metrics from |
57 | 63 | # 'http://en.wikipedia.org/wiki/List_of_countries_by_population' |
58 | 64 | # 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users' |
59 | 65 | if (defined ($options {"w"})) |
60 | | - { &ReadWikipedia ; exit ; } |
| 66 | + { &ReadWikipedia ; print "Ready\n" ; exit ; } |
61 | 67 | |
62 | 68 | if (defined ($options {"c"})) |
63 | 69 | { $reportcountries = $true ; } |
— | — | @@ -77,7 +83,6 @@ |
78 | 84 | &InitProjectNames ; |
79 | 85 | |
80 | 86 | $file_csv_country_codes = "CountryCodes.csv" ; |
81 | | - $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ; |
82 | 87 | |
83 | 88 | &ReadInputCountriesNames ; |
84 | 89 | |
— | — | @@ -384,7 +389,7 @@ |
385 | 390 | $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Trends" ; |
386 | 391 | &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,$offset_links+3)) ; |
387 | 392 | |
388 | | - $links =~ s/,.*$// ; |
| 393 | +# $links =~ s/,.*$// ; |
389 | 394 | $title = "$title_main - <font color=#008000>$views_edits Per Wikipedia Language</font> - Breakdown" ; |
390 | 395 | &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,$offset_links+4)) ; |
391 | 396 | } |
— | — | @@ -480,8 +485,23 @@ |
481 | 486 | "<input type='button' value=' Archive ' onclick='window.location=\"http://stats.wikimedia.org/archive/squid_reports\"'> " . |
482 | 487 | "<input type='button' value=' Wikimedia Statistics ' onclick='window.location=\"http://stats.wikimedia.org\"'>" . |
483 | 488 | "</td></tr>\n</table><hr>" . |
484 | | - " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ; |
| 489 | + # " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ; |
| 490 | + " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<br>" ; |
485 | 491 | |
| 492 | + if ($reportcountries) # country reports: mention both periods of data loss |
| 493 | + { |
| 494 | +   $header .= "<p> <font color=#900000>WMF traffic logging service suffered from server capacity problems from Nov 2009 till July 2010 and again in Aug/Sep/Oct 2011.<br>" . |
| 495 | +              " Data loss only occurred during peak hours. It therefore may have had somewhat different impact for traffic from different parts of the world.</font>" ; # close the font tag, like the notices below do |
| 496 | + } |
| 497 | + else # other reports: data loss in 2011 plus the undercount of mobile traffic |
| 498 | + { |
| 499 | +   $header .= "<font color=#900000>WMF traffic logging service suffered from server capacity problems in Aug/Sep/Oct 2011.<br>" . |
| 500 | +              "Absolute traffic counts for October 2011 are approximately 7% too low.<br>" . |
| 501 | +              "Data loss only occurred during peak hours. It therefore may have had somewhat different impact for traffic from different parts of the world,<br>" . |
| 502 | +              "and may have also skewed relative figures like share of traffic per browser or operating system.</font><p>" ; |
| 503 | +   $header .= "<font color=#900000>In an unrelated server outage precisely half of traffic to WMF mobile sites was not counted from Oct 16 - Nov 29 (one of two load-balanced servers did not report traffic).<br>" . |
| 504 | +              "WMF has since improved server monitoring, so that similar outages should be detected and fixed much faster from now on.</font><p>" ; |
| 505 | + } |
486 | 506 | # to be localized some day like any reports |
487 | 507 | $out_license = "All data and images on this page are in the public domain." ; |
488 | 508 | $out_generated = "Generated on " ; |
— | — | @@ -619,7 +639,7 @@ |
620 | 640 | |
621 | 641 | $client =~ s/_/./g ; |
622 | 642 | $client =~ s/\.\./Other/g ; |
623 | | - if ($client !=~ / \d/) |
| 643 | + if ($client !~ / \d/) |
624 | 644 | { $client =~ s/\// / ; } |
625 | 645 | if ($rectype eq "-") { $total_clients_non_mobile += $count ; } |
626 | 646 | if ($rectype eq "M") { $total_clients_mobile += $count ; } |
— | — | @@ -1332,6 +1352,7 @@ |
1333 | 1353 | { |
1334 | 1354 | # http://en.wikipedia.org/wiki/List_of_countries_by_population |
1335 | 1355 | # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users |
| 1356 | + print "Read $path_in/$file_csv_country_meta_info\n" ; |
1336 | 1357 | open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ; |
1337 | 1358 | binmode COUNTRY_META_INFO ; |
1338 | 1359 | while ($line = <COUNTRY_META_INFO>) |
— | — | @@ -1342,6 +1363,7 @@ |
1343 | 1364 | $line =~ s/C..?te d'Ivoire/Côte d'Ivoire/g ; |
1344 | 1365 | |
1345 | 1366 | ($country,$link,$population,$connected,$icon) = split ',', $line ; |
| 1367 | + print "COUNTRY $country\nLINK $link\nPOPULATION $population\nCONNECTED $connected\n\n" ; |
1346 | 1368 | $country =~ s/,/,/g ; |
1347 | 1369 | |
1348 | 1370 | # use country names as given by MaxMind |
— | — | @@ -1533,10 +1555,10 @@ |
1534 | 1556 | $months_recently = keys %months_recently ; |
1535 | 1557 | if ($months_recently == 0) { die "\$months_recently == 0\n" ; } |
1536 | 1558 | |
1537 | | - $requests_recently_start = substr ($requests_recently_start,5,2) . "/" . substr ($requests_recently_start,2,2) ; |
1538 | | - $requests_recently_stop = substr ($requests_recently_stop ,5,2) . "/" . substr ($requests_recently_stop ,2,2) ; |
1539 | | - $requests_start = substr ($requests_start,5,2) . "/" . substr ($requests_start,2,2) ; |
1540 | | - $requests_stop = substr ($requests_stop ,5,2) . "/" . substr ($requests_stop ,2,2) ; |
| 1559 | + $requests_recently_start = substr ($requests_recently_start,0,4) . '/' . substr ($requests_recently_start,5,2); |
| 1560 | + $requests_recently_stop = substr ($requests_recently_stop ,0,4) . '/' . substr ($requests_recently_stop ,5,2) ; |
| 1561 | + $requests_start = substr ($requests_start,0,4) . '/' . substr ($requests_start,5,2) ; |
| 1562 | + $requests_stop = substr ($requests_stop ,0,4) . '/' . substr ($requests_stop ,5,2) ; |
1541 | 1563 | |
1542 | 1564 | foreach $yyyymm (keys %$yyyymm) |
1543 | 1565 | { |
— | — | @@ -4644,9 +4666,6 @@ |
4645 | 4667 | $html_total .= "<tr><td colspan=99> </td></tr>" ; |
4646 | 4668 | |
4647 | 4669 | |
4648 | | - undef @keys_regions ; |
4649 | | -# foreach $key (sort keys %population_per_hemisphere) |
4650 | | -# { push @keys_regions, $key ; } |
4651 | 4670 | $html_regions = '' ; |
4652 | 4671 | foreach $key (qw (N S AF AS AU EU CA NA SA OC)) |
4653 | 4672 | { |
— | — | @@ -5150,19 +5169,19 @@ |
5151 | 5170 | my $views_edits_lc = lc $views_edits ; |
5152 | 5171 | |
5153 | 5172 | if ($show_logcount) |
5154 | | - { $report_version = "<p>This is the extended version of this report, with even small percentages included (> $cutoff_percentage\%) (see also bottom of page). " . |
5155 | | - "Switch to <a href='$file_html_per_country_breakdown'>regular version</a>" ; } |
| 5173 | + { $report_version = "<p>Showing even small percentages (> $cutoff_percentage\%) (read <a href='#more'>more</a>). " . |
| 5174 | + "Switch to <a href='$file_html_per_country_breakdown'>concise version</a>" ; } |
5156 | 5175 | else |
5157 | | - { $report_version = "<p>This is the regular version of this report, with only major percentages (> $cutoff_percentage\%) included." . |
5158 | | - " Switch to <a href='$file_html_per_country_breakdown_huge'>extended version</a>" ; } |
| 5176 | + { $report_version = "<p>Showing only major percentages (> $cutoff_percentage\%) (read <a href='#more'>more</a>). " . # concise report: offer the detailed one |
| 5177 | + " Switch to <a href='$file_html_per_country_breakdown_huge'>detailed version</a>" ; } |
5159 | 5178 | |
5160 | 5179 | $html = $header ; |
5161 | 5180 | $html =~ s/TITLE/$title/ ; |
5162 | 5181 | $html =~ s/HEADER/$title/ ; |
5163 | 5182 | $html =~ s/LINKS// ; |
5164 | | - $html =~ s/ALSO/$links/ ; |
| 5183 | + $html =~ s/ALSO/$links$report_version/ ; |
5165 | 5184 | $html =~ s/NOTES// ; |
5166 | | - $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ; |
| 5185 | + $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ; |
5167 | 5186 | $html =~ s/DATE// ; |
5168 | 5187 | |
5169 | 5188 | $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ; |
— | — | @@ -5262,7 +5281,7 @@ |
5263 | 5282 | # $html .= "<tr><td colspan=99> </td></tr>\n" ; |
5264 | 5283 | } |
5265 | 5284 | $html .= "</table>" ; |
5266 | | - $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" . |
| 5285 | + $html .= "<p><a name='more' id='more'></a><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" . |
5267 | 5286 | "<br> Further percentages show per country share of $views_edits_lc per Wikipedia visited" ; |
5268 | 5287 | $html .= "<p><b>Countries</b> are only included if the number of requests in the period exceeds $cutoff_requests,000 ($cutoff_requests matching records in 1:1000 sampled log)" ; |
5269 | 5288 | $html .= "<p><b>Wikipedia's</b> are only listed for some country if the share of visitors for that particular country exceeds $cutoff_percentage\%." ; |
— | — | @@ -5309,6 +5328,12 @@ |
5310 | 5329 | $html =~ s/X1000/. Period <b>$requests_start - $requests_stop<\/b>/ ; |
5311 | 5330 | $html =~ s/DATE// ; |
5312 | 5331 | |
| 5332 | + if ($views_edits eq 'Page Views') |
| 5333 | + { |
| 5334 | + $html .= "<p><font color=#800000>Nov 2011: For some countries the share of page views on the English Wikipedia was significantly higher in 2010 than in 2009 and 2011,<br>" . |
| 5335 | + "especially in Q1 and Q2. We don't know yet what caused this, this might be an artifact. Please be cautious to draw conclusions from this.</font>" ; |
| 5336 | + } |
| 5337 | + |
5313 | 5338 | $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ; |
5314 | 5339 | |
5315 | 5340 | $html .= "<p><table border=1 width=800>INDEX\n" ; |
— | — | @@ -5892,10 +5917,15 @@ |
5893 | 5918 | |
5894 | 5919 | sub ReadWikipedia |
5895 | 5920 | { |
| 5921 | + print "ReadWikipedia\n\n" ; |
| 5922 | + |
5896 | 5923 | use LWP::Simple qw($ua get); |
5897 | 5924 | |
5898 | 5925 | $ua->agent('Wikipedia Wikicounts job'); |
5899 | 5926 | $ua->timeout(60); |
| 5927 | + |
| 5928 | + |
| 5929 | + print "Read List_of_countries_by_population\n\n" ; |
5900 | 5930 | my $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_population'; |
5901 | 5931 | my $html = get $url || die "Timed out!"; |
5902 | 5932 | |
— | — | @@ -5955,9 +5985,12 @@ |
5956 | 5986 | $link =~ s/,/,/g ; |
5957 | 5987 | $icon =~ s/,/,/g ; |
5958 | 5988 | |
| 5989 | + print "country: $country\nlink: $link\npopulation: $population\nconnected: $connected\nicon: $icon\n\n" ; |
5959 | 5990 | $countries {$country} = "$country,$link,$population,connected,$icon\n" ; |
5960 | 5991 | } |
5961 | 5992 | |
| 5993 | + print "List_of_countries_by_number_of_Internet_users\n\n" ; |
| 5994 | + |
5962 | 5995 | $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'; |
5963 | 5996 | $html = get $url || die "Timed out!"; |
5964 | 5997 | |
— | — | @@ -5995,10 +6028,12 @@ |
5996 | 6029 | $country =~ s/Timor Leste/Timor-Leste/ ; |
5997 | 6030 | $country =~ s/UAE/United Arab Emirates/ ; |
5998 | 6031 | |
| 6032 | + print "country: $country\nconnected: $connected\n\n" ; |
5999 | 6033 | $countries {$country} =~ s/connected/$connected/ ; |
6000 | 6034 | } |
6001 | 6035 | |
6002 | | - open COUNTRY_META_INFO, '>', "$path_out/SquidReportCountryMetaInfo.csv" ; |
| 6036 | + print "Write $path_in/$file_csv_country_meta_info\n\n" ; # use $path_in, not $path_out so that next step picks up proper file |
| 6037 | + open COUNTRY_META_INFO, '>', "$path_in/$file_csv_country_meta_info" ; |
6003 | 6038 | foreach $country (sort keys %countries) |
6004 | 6039 | { print COUNTRY_META_INFO $countries {$country} ; } |
6005 | 6040 | close COUNTRY_META_INFO ; |
— | — | @@ -6086,11 +6121,11 @@ |
6087 | 6122 | sub UnLink |
6088 | 6123 | { |
6089 | 6124 | my ($links,$index) = @_ ; |
6090 | | -# print "\n\nUnLink $index\n\n" ; |
| 6125 | + # print "\n\nUnLink $index\n\n" ; |
6091 | 6126 | my @segments = split '(?=<a )', $links ; |
6092 | | -# print "SEGMENT 1 $segments[$index]\n" ; |
| 6127 | + # print "SEGMENT 1 $segments[$index]\n" ; |
6093 | 6128 | $segments [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/<font color=#008000><b>$1<\/b><\/font>/ ; |
6094 | | -# print "SEGMENT 2 $segments[$index]\n" ; |
| 6129 | + # print "SEGMENT 2 $segments[$index]\n" ; |
6095 | 6130 | $links = join '', @segments ; |
6096 | 6131 | return ($links) ; |
6097 | 6132 | } |
— | — | @@ -6139,8 +6174,8 @@ |
6140 | 6175 | id: "millions", |
6141 | 6176 | is: function(s) { return false; }, |
6142 | 6177 | //failed so far to turn 1.2M into 1200000, so figures with decimal point are sorted out of place |
6143 | | -//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/\\.(\d)M/,$1+"00000").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
6144 | | - format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,""). replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
| 6178 | +//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/\\.(\\d)M/,$1+"00000").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
| 6179 | + format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,""). replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
6145 | 6180 | type: "numeric" |
6146 | 6181 | }); |
6147 | 6182 | |
— | — | @@ -6211,7 +6246,7 @@ |
6212 | 6247 | } |
6213 | 6248 | </style> |
6214 | 6249 | __HTML_SORT_TABLE__ |
6215 | | -return ($html) ; |
| 6250 | + return ($html) ; |
6216 | 6251 | } |
6217 | 6252 | |
6218 | 6253 | sub HtmlSortTableColumns |