r109171 MediaWiki - Code Review archive

Revision:r109170‎ | r109171 | r109172 >
Date:18:21, 17 January 2012
missing file added + patches for incompatible varnish (mobile) log
Modified paths:
  • /trunk/wikistats/squids/EzLib.pm (added) (history)
  • /trunk/wikistats/squids/SquidCountArchive.pl (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveReadInput.pm (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm (added) (history)
  • /trunk/wikistats/squids/SquidReportArchive.pl (modified) (history)

Diff [purge]

Index: trunk/wikistats/squids/EzLib.pm
@@ -0,0 +1,594 @@
 2+# Erik Zachte - general purpose routines
 3+# subroutines in this module have names in lowercase (I usually name own routines mixed case, though not consistently (yet))
 5+no warnings 'uninitialized';
 7+#use POSIX qw (locale_h);
 8+#$old_locale = setlocale(LC_LANGUAGE) ;
 9+#print "old locale LC_LANGUAGE $old_locale\n" ;
 10+#$new_locale = setlocale(LC_LANGUAGE, "en_US.UTF-8");
 11+#print "new locale LC_LANGUAGE $new_locale\n" ;
 13+$ez_lib_version = 14 ;
 15+sub ez_lib_version
 17+ $ez_lib_version_required = shift ;
 18+ if ($ez_lib_version < $ez_lib_version_required)
 19+ { print "EzLib out of date: version $ez_lib_version_required required" ; exit ;}
 22+use lib "/home/ezachte/lib" ;
 24+use Time::HiRes ;
 25+use Time::Local ;
 26+use Getopt::Std ;
 27+use Carp ;
 28+use Net::Domain qw (hostname);
 29+use Digest::MD5 qw (md5_hex);
 30+use Cwd ;
 31+use Benchmark qw (timesum timediff timestr timethis timethese cmpthese) ;
 32+use POSIX ;
 34+sub date_time_english ($) ;
 36+$true = 1 ;
 37+$false = 0 ;
 39+($app_start_user,$app_start_system) = times ;
 41+# Get host name
 42+$hostname = `hostname` ;
 43+chomp ($hostname) ;
 45+$os = $^O ;
 46+$os_linux = $true if $os =~ /linux/i ;
 47+$os_windows = $true if $os =~ /win32/i ;
 49+$path_program = $0 ;
 50+$path_program = Win32::GetLongPathName ($path_program) if $os_windows ;
 51+($path_program,$name_program) = split '[\\\/](?=[^\\\/]*$)', $path_program ;
 53+die "Operating system '$os' not supported" if (! $os_linux and ! $os_windows) ;
 55+if ($os_linux) # && (-d "/home/ezachte")) # runs on server, to be refined
 57+ $job_runs_on_production_server = $true ;
 58+ $path_home = "/home/ezachte" ;
 61+{ $path_home = getcwd () ; }
 63+$trace_on_exit = $false ; # shorthand for $trace_on_exit_concise
 64+$trace_on_exit_concise = $false ;
 65+$trace_on_exit_verbose = $false ;
 66+$trace_on_exit_libs = $false ;
 68+# emulate new perl 5.10 function
 69+sub say
 70+{ $msg = shift ; print "$msg\n" ; }
 72+# if no explicit parameters specified use these defaults (mainly for tests)
 73+sub default_argv
 75+ my $argv = shift ;
 76+ if (($#ARGV == -1) && (! $job_runs_on_production_server))
 77+ {
 78+ $argv =~ s/('[^'|]+')/($a=$1,$a=~s# #``#g,$a)/ge ;
 79+ $argv =~ s/("[^'|]+")/($a=$1,$a=~s# #``#g,$a)/ge ;
 80+ $argv =~ s/\s*\|/ /g ;
 81+ $argv =~ s/\|\s*/ /g ;
 82+ # @ARGV = split '\|', $argv ;
 83+ @ARGV = split ' ', $argv ;
 84+ foreach $arg (@ARGV)
 85+ { $arg =~ s/``/ /g ; }
 86+ $argv =~ s/``/ /g ;
 87+ }
 88+ else
 89+ { $argv = join ' | ', @ARGV ; }
 90+ print "\nScript $name_program started at " . date_time_english (time) . "\n" ;
 91+ print "Arguments: $argv\n" ;
 92+ print "\n" . '=' x 80 . "\n\n" ;
 93+ @ARGV_BAK = @ARGV ;
 96+# Get file time
 97+sub file_time ($)
 99+ my $path = shift ;
 101+ if (! -e $path)
 102+ { return '?' ; }
 103+ else
 104+ { return (time - (-M $path) * 24 * 60 * 60) ; }
 107+# Get last modification of this file
 108+sub trace_ez_lib
 110+ $file_pm = 'EzLib.pm' ;
 111+ $path_pm = "/home/ezachte/lib/$file_pm" ;
 112+ print "File $path_pm not found" unless -e $path_pm ;
 113+ $path_pm_age = time - ((-M $path_pm) * 24 * 60 * 60 ) ;
 114+ print "\n$file_pm last modified: " . date_time_english ($path_pm_age) . "\n\n" ;
 117+# Print current file and line number
 118+# print "File: ", __FILE__, " Line: ", __LINE__, "\n";
 120+# Flush output
 121+$| = 1;
 123+# prototype (\%) forces supplying one variable argument, which also is auto converted to reference
 124+# Pro Perl page 226: Requiring Variables Rather Than Values
 126+# invocation: @array = keys_sorted_by_value_alpha_asc (%hash) ;
 127+# replaces: @array = sort {$hash{$a} cmp $hash{$b}} keys %hash ;
 128+sub keys_sorted_by_value_alpha_asc (\%)
 130+ my $hashref = shift ;
 131+ return (sort {$hashref->{$a} cmp $hashref->{$b}} keys %$hashref) ;
 134+# invocation: @array = keys_sorted_by_value_alpha_desc (%hash) ;
 135+# replaces: @array = sort {$hash{$b} cmp $hash{$a}} keys %hash ;
 136+sub keys_sorted_by_value_alpha_desc (\%)
 138+ my $hashref = shift ;
 139+ return (sort {$hashref->{$b} cmp $hashref->{$a}} keys %$hashref) ;
 142+# invocation: @array = keys_sorted_by_value_num_asc (%hash) ;
 143+# replaces: @array = sort {$hash{$a} <=> $hash{$b}} keys %hash ;
 144+sub keys_sorted_by_value_num_asc (\%)
 146+ my $hashref = shift ;
 147+ return (sort {$hashref->{$a} <=> $hashref->{$b}} keys %$hashref) ;
 150+# invocation: @array = keys_sorted_by_value_num_desc (%hash) ;
 151+# replaces: @array = sort {$hash{$b} <=> $hash{$a}} keys %hash ;
 152+sub keys_sorted_by_value_num_desc (\%)
 154+ my $hashref = shift ;
 155+ return (sort {$hashref->{$b} <=> $hashref->{$a}} keys %$hashref) ;
 158+# almost trivial but to match keys_sorted_by_value_... subroutines
 159+# invocation: @array = keys_sorted_alpha_asc (%hash) ;
 160+# replaces: @array = sort {$a cmp $b} keys %hash ;
 161+sub keys_sorted_alpha_asc (\%)
 163+ my $hashref = shift ;
 164+ return (sort {$a cmp $b} keys %$hashref) ;
 167+# almost trivial but to match keys_sorted_by_value_... subroutines
 168+# invocation: @array = keys_sorted_alpha_desc (%hash) ;
 169+# replaces: @array = sort {$b cmp $a} keys %hash ;
 170+sub keys_sorted_alpha_desc (\%)
 172+ my $hashref = shift ;
 173+ return (sort {$b cmp $a} keys %$hashref) ;
 176+# almost trivial but to match keys_sorted_by_value_... subroutines
 177+# invocation: @array = keys_sorted_num_asc (%hash) ;
 178+# replaces: @array = sort {$a <=> $b} keys %hash ;
 179+sub keys_sorted_num_asc (\%)
 181+ my $hashref = shift ;
 182+ return (sort {$a <=> $b} keys %$hashref) ;
 185+# almost trivial but to match keys_sorted_by_value_... subroutines
 186+# invocation: @array = keys_sorted_num_desc (%hash) ;
 187+# replaces: @array = sort {$b <=> $a} keys %hash ;
 188+sub keys_sorted_num_desc (\%)
 190+ my $hashref = shift ;
 191+ return (sort {$b <=> $a} keys %$hashref) ;
 194+# for multilingual version see wikiReportsDate.pl / sub GetDate
 195+sub date_time_english ($)
 197+ my @weekdays_en = qw (Sunday Monday Tuesday Wednesday Thursday Friday Saturday);
 198+ my @months_en = qw (January February March April May June July
 199+ August September October November December);
 200+ my $time = shift ;
 201+ my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time);
 202+ return (substr ($weekdays_en[$wday],0,3) . ", " .
 203+ substr ($months_en[$mon],0,3) . " " .
 204+ $mday . ", " .
 205+ (1900 + $year) .
 206+ " " . sprintf ("%2d:%02d", $hour, $min)) ;
 209+# for multilingual version see wikiReportsDate.pl / sub GetMonthShort
 210+sub month_english_short ($)
 212+ my @months_en = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
 214+ my $month = shift ;
 215+ if ($month !~ /^\d+$/)
 216+ { return ("?") ; }
 218+ return ($months_en [$month % 12]) ;
 221+sub month_year_english_short ($)
 223+ my @months_en = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
 225+ my $month = shift ;
 226+ if ($month !~ /^\d+$/)
 227+ { return ("?") ; }
 228+ $month-- ;
 230+ return ($months_en [$month % 12] . " " . (2000 + int ($month / 12)) ) ;
 233+sub ddhhmmss
 235+ my $seconds = shift ;
 236+ my $format = shift ;
 238+ $days = int ($seconds / (24*3600)) ;
 239+ $seconds -= $days * 24*3600 ;
 240+ $hrs = int ($seconds / 3600) ;
 241+ $seconds -= $hrs * 3600 ;
 242+ $min = int ($seconds / 60) ;
 243+ $sec = $seconds % 60 ;
 245+ if ($format eq '')
 246+ {
 247+ $days = ($days > 0) ? (($days > 1) ? "$days days, " : "$days day, ") : "" ;
 248+ $hrs = (($days + $hrs > 0) ? (($hrs > 1) ? "$hrs hrs" : "$hrs hrs") : "") . ($days + $hrs > 0 ? ", " : ""); # 2 hrs/1 hr ?
 249+ $min = ($days + $hrs + $min > 0) ? "$min min, " : "" ;
 250+ $sec = "$sec sec" ;
 251+ return ("$days$hrs$min$sec") ;
 252+ }
 253+ else
 254+ {
 255+ return sprintf ($format,$days,$hrs,$min,$sec) if $format =~ /%.*%.*%.*%/ ;
 256+ return sprintf ($format, $hrs,$min,$sec) if $format =~ /%.*%.*%/ ;
 257+ return sprintf ($format, $min,$sec) if $format =~ /%.*%/ ;
 258+ return sprintf ($format, $sec) ;
 259+ }
 262+sub yyyymmddThhmmssDiff
 264+ my ($time_till, $time_from) = @_ ;
 265+ my ($yy1,$mm1,$dd1,$hh1,$nn1,$ss1) = $time_till =~ /(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)/ ;
 266+ my ($yy2,$mm2,$dd2,$hh2,$nn2,$ss2) = $time_from =~ /(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)/ ;
 267+ $time_till = timegm ($ss1,$nn1,$hh1,$dd1, $mm1-1, $yy1) ;
 268+ $time_from = timegm ($ss2,$nn2,$hh2,$dd2, $mm2-1, $yy2) ;
 269+ return ($time_till - $time_from) ;
 272+sub yyyymmddhhmmssDiff
 274+ my ($time_till, $time_from) = @_ ;
 275+ my ($yy1,$mm1,$dd1,$hh1,$nn1,$ss1) = $time_till =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/ ;
 276+ my ($yy2,$mm2,$dd2,$hh2,$nn2,$ss2) = $time_from =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/ ;
 277+ $time_till = timegm ($ss1,$nn1,$hh1,$dd1, $mm1-1, $yy1) ;
 278+ $time_from = timegm ($ss2,$nn2,$hh2,$dd2, $mm2-1, $yy2) ;
 279+ return ($time_till - $time_from) ;
 282+sub yyyymmddDiffDays
 284+ my ($time_till, $time_from) = @_ ;
 285+ my ($yy1,$mm1,$dd1) = $time_till =~ /(\d\d\d\d)-(\d\d)-(\d\d)/ ;
 286+ my ($yy2,$mm2,$dd2) = $time_from =~ /(\d\d\d\d)-(\d\d)-(\d\d)/ ;
 287+ $time_till = timegm (0,0,0,$dd1, $mm1-1, $yy1) ;
 288+ $time_from = timegm (0,0,0,$dd2, $mm2-1, $yy2) ;
 289+ return (($time_till - $time_from) / (24 * 60 * 60));
# Number of days from the first day of month $time_from up to and including
# the last day of month $time_till.
#
# Arguments: two strings that each contain a 'yyyy-mm' date (end month first,
#            start month second); anything after 'yyyy-mm' is ignored.
# Returns:   difference in days (e.g. '2012-02','2012-02' -> 29).
#
# Fix: the December roll-over used to increment the unrelated variable $yy
# instead of $yy1, so a span ending in December was computed against January
# of the *same* year and came out wrong (usually negative).
sub yyyymmDiffDays
{
  my ($time_till, $time_from) = @_ ;
  my ($yy1,$mm1) = $time_till =~ /(\d\d\d\d)-(\d\d)/ ;
  my ($yy2,$mm2) = $time_from =~ /(\d\d\d\d)-(\d\d)/ ;

  # move the end point to the first day of the month after $time_till,
  # so that the whole of month $time_till is included in the span
  $mm1++ ;
  if ($mm1 > 12) { $mm1 = 1 ; $yy1++ ; } # December -> January of next year

  $time_till = timegm (0,0,0,1, $mm1-1, $yy1) ;
  $time_from = timegm (0,0,0,1, $mm2-1, $yy2) ;
  return (($time_till - $time_from) / (24 * 60 * 60)) ;
}
 304+sub days_in_month
 306+ my $year = shift ;
 307+ my $month = shift ;
 308+ my $days = $days_in_month_cached {"$year $month"} ;
 309+ return $days if $days > 0 ;
 311+ my $month2 = $month+1 ;
 312+ my $year2 = $year ;
 313+ if ($month2 > 12)
 314+ { $month2 = 1 ; $year2++ }
 316+ my $timegm1 = timegm (0,0,0,1,$month-1,$year-1900) ;
 317+ my $timegm2 = timegm (0,0,0,1,$month2-1,$year2-1900) ;
 318+ $days = ($timegm2-$timegm1) / (24*60*60) ;
 320+ $days_in_month_cached {"$year $month"} = $days ;
 321+ return ($days) ;
 325+sub abort
 328+ $msg = shift ;
 329+ confess ("\nAbort: $msg\n\n") ;
 330+ exit ;
 334+# test on each run of script whether message should still be displayed, e.g. "New feature"
 335+sub blank_text_after
 337+ my $date = shift ;
 338+ my $text = shift ;
 339+ my ($day,$month,$year) = $date =~ /(\d+).*?(\d+).*?(\d+)/ ;
 340+ my $till = timegm (0,0,0,$day,$month-1,$year-1900) ;
 341+ if (time > $till)
 342+ { return ("") ; }
 343+ else
 344+ { return ($text) ; }
 347+# test for four triplets and optional port number
 348+sub is_valid_ip_address
 350+ my $address = shift ;
 351+ return ($address =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?:\:\d+)?$/) ;
 354+# store elapsed high resolution time, for benchmarking
 355+sub code_started
 356+{ return Time::HiRes::time() ; }
 358+sub code_complete
 360+ my ($label, $start) = @_ ;
 361+ $code_passes {$label} ++ ;
 362+ $code_time_spent {$label} += Time::HiRes::time - $start ;
 365+# only protect division against runtime error
 366+sub divide_if_allowed
 368+ my $x = shift ;
 369+ my $y = shift ;
 370+ if ($y == 0)
 371+ { return () ; }
 372+ else
 373+ { return ($x/$y) ; }
 376+# use Encode qw(encode);
 377+# $eckey=encode('utf8',$key);
# Percent-encode a url.
#
# Every character other than 0-9, a-z, A-Z and the four characters % : / .
# is replaced by '%' followed by its ordinal value in upper case hex.
# An existing '%' is left alone, so already encoded input is not encoded twice.
#
# Argument: url (string)
# Returns:  encoded copy of the url
#
# Fix: ordinals below 0x10 (tab, newline, ...) used to be written with a
# single hex digit ('%9'), which is not a valid percent escape; the value is
# now always zero padded to at least two digits ('%09').
sub encode_url
{
  my $url = shift ;
  $url =~ s/([^0-9a-zA-Z\%\:\/\.])/"%".sprintf ("%02X",ord($1))/ge ;
  return ($url) ;
}
 385+sub encode_non_ascii
 387+ my $msg = shift ;
 388+ $msg =~ s/([\x80-\xFF]{2,})/"%".sprintf ("%X",ord($1))/ge ;
 389+ return ($msg) ;
 392+sub convert_unicode
 394+ my $string = shift ;
 395+ my $input_unicoded = ($string =~ m/[\xc0-\xdf][\x80-\xbf]|
 396+ [\xe0-\xef][\x80-\xbf]{2}|
 397+ [\xf0-\xf7][\x80-\xbf]{3}/x) ;
 400+ # unicode -> html character codes &#nnnn;
 401+ if ($input_unicoded)
 402+ { $string =~ s/([\x80-\xFF]+)/unicode_to_html($1)/ge ; }
 403+ return ($string) ;
# Convert a string of bytes to html, replacing multi-byte sequences by
# numeric character references (&#nnnn;).
#
# The string is scanned byte by byte:
# - bytes below 128 are copied unchanged
# - a byte >= 128 starts a sequence whose length (2 to 6 bytes) is derived
#   from the value of that lead byte; the sequence is handed to
#   unicode_to_html_tag and the returned text is appended
# - a byte >= 128 in the very last position cannot start a sequence and is
#   emitted as &#<byte value>; (treated as pre-unicode high ascii)
#
# Argument: text (string of bytes)
# Returns:  converted string
#
# Fix: the 'last byte' branch assigned to $html instead of appending, which
# threw away everything that had been converted before that byte.
sub unicode_to_html
{
  my $text = shift ;
  my $html = "" ;
  my ($c, $len, $byte, $ord, $unicode, $bytes) ;

  $len = length ($text) ;
  for ($c = 0 ; $c < $len ; $c++)
  {
    $byte = substr ($text,$c,1) ;
    $ord = ord ($byte) ;
    if ($ord < 128) # plain ascii character
    { $html .= $byte ; }
    else
    {
      # single byte left >= 0x80 ? should never occur but does a few times
      # treat as pre-unicode high ascii character
      if ($c == $len - 1)
      {
        $html .= "\&\#". $ord . ";" ;
        # print FILE_ERR $title .":invalid unicode char ".$text. "\n"
      }
      else
      {
        # lead byte determines how many bytes make up this character
        if ($ord < 224)
        { $bytes = 2 ; }
        elsif ($ord < 240)
        { $bytes = 3 ; }
        elsif ($ord < 248)
        { $bytes = 4 ; }
        elsif ($ord < 252)
        { $bytes = 5 ; }
        else
        { $bytes = 6 ; }
        $unicode = substr ($text,$c,$bytes) ;
        $html .= unicode_to_html_tag ($unicode) ;
        $c += $bytes - 1 ; # skip the bytes just consumed (loop adds the final +1)
      }
    }
  }
  return ($html) ;
}
 450+sub unicode_to_html_tag
 452+ my $unicode = shift ;
 453+ my $char = substr ($unicode,0,1) ;
 454+ my $ord = ord ($char) ;
 455+ my ($c, $value, $html) ;
 457+ if ($ord < 128) # plain ascii character
 458+ { return ($unicode) ; } # (will not occur in this script)
 459+ else
 460+ {
 461+ if ($ord >= 252)
 462+ { $value = $ord - 252 ; }
 463+ elsif ($ord >= 248)
 464+ { $value = $ord - 248 ; }
 465+ elsif ($ord >= 240)
 466+ { $value = $ord - 240 ; }
 467+ elsif ($ord >= 224)
 468+ { $value = $ord - 224 ; }
 469+ else
 470+ { $value = $ord - 192 ; }
 471+ for ($c = 1 ; $c < length ($unicode) ; $c++)
 472+ { $value = $value * 64 + ord (substr ($unicode, $c,1)) - 128 ; }
 473+ $html = "\&\#" . $value . ";" ;
 475+ return ($html) ;
 476+ }
 486+# optionally print program meta data when program ends
 489+# if ($os_windows)
 490+# { use Win32 ; }
 492+ my ($time, $path,$program) ;
 494+ if ($trace_on_exit || $trace_on_exit_verbose || $trace_on_exit_concise)
 495+ {
 496+ $time_elapsed_total = time - $^T ; # $^T is program start time
 497+ ($app_end_user,$app_end_system) = times ;
 499+ $time_active_user_processes = $app_end_user - $app_start_user ;
 500+ $time_active_system_processes = $app_end_system - $app_start_system ;
 501+ $time_active_total = $time_active_user_processes + $time_active_system_processes ;
 503+ # print "\n" . '=' x (length ($msg) -1) . "\n\n$msg\n\n" ;
 504+ print "\n" . '=' x 80 . "\n\n$msg\n\n" ;
 505+ }
 507+ if ($trace_on_exit || $trace_on_exit_verbose || $trace_on_exit_concise)
 508+ {
 509+ print "Prog: $name_program\n" ;
 510+ print "Path: $path_program\n" ;
 511+ if ($job_runs_on_production_server)
 512+ { print "Host: $hostname (production)\n\n" ; }
 513+ else
 514+ { print "Host: $hostname (test run)\n\n" ; }
 515+ print "Args:\n\n", map {" $_\n"} @ARGV_BAK ;
 516+ # print "Host: $hostname\n" ;
 517+ print "OS: $os\n" ;
 518+ print "Perl: " . ($a = sprintf ("%.9f",$^V), $a =~ s/\_/_/g,$a) . "\n" ; # perl version
 519+ print "Perl: $^X\n" ; # perl exe path
 520+ print "EzLib: $ez_lib_version\n" ; # perl exe path
 521+ }
 523+ if ($trace_on_exit || $trace_on_exit_verbose || $trace_on_exit_libs)
 524+ {
 525+ # Get library paths
 526+ print "\nLibs:\n", map {" $_\n"} @INC ;
 528+ $cwd = cwd () ;
 529+ foreach (grep {$_ =~ /home|wiki/i} values %INC) # own modules
 530+ # foreach (values %INC) # all modules
 531+ {
 532+ $file = $_ ;
 533+ if ($file !~ /[\\\/]/)
 534+ { $file = "$cwd/$file" ; }
 535+ $time = file_time ($file) ;
 536+ # $file = Win32::GetLongPathName ($_) if $os_windows ;
 537+ push @own_modules, "$time|$file" ;
 538+ }
 540+ @own_modules = sort {$b <=> $a} @own_modules ;
 541+ print "\nOwn modules (d/m/y h:m):\n" ;
 542+ foreach (@own_modules)
 543+ {
 544+ ($time,$path) = split '\|', $_ ;
 545+ my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time);
 546+ print sprintf (" %2d/%02d/%4d %2d:%02d %s\n", $mday,$mon+1,$year+1900,$hour,$min, $path) ;
 547+ }
 548+ print "\n\n" ;
 549+ }
 551+ $lines = 0 ;
 552+ foreach $key (sort keys %code_passes)
 553+ {
 554+ if ($lines++ == 0)
 555+ { print "Executing times:\n" ; }
 556+ print sprintf (" %-26s","$key:") . ddhhmmss($code_time_spent{$key},'%3d min, %2d sec') . " / " .
 557+ sprintf ("%10d",$code_passes{$key}) . " calls = " . sprintf ("%6f", divide_if_allowed ($code_time_spent{$key}, $code_passes {$key})) . " sec/pass\n" ;
 558+ }
 559+ print "\n" ;
 561+# $locale = setlocale(LC_LANGUAGE, $old_locale);
 562+# print "locale LC_LANGUAGE back to $locale\n" ;
 564+ if ($time_elapsed_total < 5)
 565+ { $msg = "Ready in " . ddhhmmss ($time_elapsed_total) . "\n" ; }
 566+ else
 567+ {
 568+ $perc_active_user_processes = sprintf ("%4.1f", 100 *$time_active_user_processes/$time_elapsed_total) ;
 569+ $perc_active_system_processes = sprintf ("%4.1f", 100 *$time_active_system_processes/$time_elapsed_total) ;
 570+ $perc_active_total = sprintf ("%4.1f", 100 *$time_active_total/$time_elapsed_total) ;
 571+ $msg = "Ready in " . ddhhmmss ($time_elapsed_total) . "\n\nTime spent:\n" .
 572+ "User: $perc_active_user_processes\% (" . ddhhmmss ($time_active_user_processes) . ")\n" .
 573+ "System: $perc_active_system_processes\% (" . ddhhmmss ($time_active_system_processes) . ")\n" .
 574+ "Total: $perc_active_total\% (" . ddhhmmss ($time_active_total) . ")\n" ;
 575+ }
 577+ print "\n\n" . '=' x 80 . "\n" . '=' x 80 . "\n\n" ;
 580+sub trace
 582+ my $function_name = shift ;
 584+ my ($ss,$mm,$hh) = (localtime (time))[0,1,2] ;
 585+ my $time = sprintf ("%02d:%02d:%02d", $hh, $mm, $ss) ;
 587+ print "\n$time $function_name\n" ;
 590+# only when perl compiled with malloc
 591+# use Devel::Peek ;
 592+# $ENV {PERL_DEBUG_MSTATS} = 2;
 593+# mstat() ;
 595+1 ;
Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
@@ -1,5 +1,10 @@
22 #!/usr/bin/perl
 4+# /usr/local/bin/geoiplogtag uses /usr/share/GeoIP/GeoIP.dat
 5+# test:
 6+# echo | /usr/local/bin/geoiplogtag 1
 7+# refresh: bayes:/usr/share/GeoIP> wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
49 sub CollectFilesToProcess
510 {
611 trace CollectFilesToProcess ;
@@ -161,7 +166,7 @@
162167 if ($job_runs_on_production_server)
163168 {
164169 if ($file_in =~ /\.gz$/o)
165 - { open IN, "-|", "gzip -dc $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
 170+ { open IN, "-|", "gzip -dc $file_in | sed s/\\ \\ */\\ /g | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
166171 else
167172 { open IN, "-|", "cat $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
168173 $fields_expected = 14 ;
@@ -184,10 +189,36 @@
185190 # print $line ;
186191 # }
 194+# ugly Q&D code to circumvent spaces in agent string
 195+# $line2 = $line ;
 196+ chomp $line ;
188197 @fields = split (' ', $line) ;
189 - if ($#fields < $fields_expected) { $fields_too_few ++ ; next ; }
190 - if ($#fields > $fields_expected) { $fields_too_many ++ ; next ; }
 198+# next if $line =~ /upload/ ;
 199+# next if $line !~ /en\.m\.wikipedia/ ;
 200+# next if $fields[10] eq '-' ;
 201+# print "mime " . $fields[10] . "\n" ;
 202+#next if $fields [9] eq '-' ;
 203+#next if $fields [9] =~ /NONE/ ;
 204+ if ($#fields > 14)
 205+ {
 206+# print "line $line2\n" ;
 207+# print "fields " . $#fields . "\n$line\n" ;
 208+ $country_code = $fields [$#fields] ;
 209+ $fields [$#fields] = '' ;
 210+ $line = join (' ', @fields) ;
 211+# print "2 $line\n" ;
 212+ @fields = split (' ', $line, 14) ;
 213+ $fields [14] = $country_code ;
 214+# print "\n\n12: " . $fields [12] . "\n" ;
 215+# print "13: " . $fields [13] . "\n" ;
 216+# print "14: " . $fields [14] . "\n" ;
 217+# print "15: " . $fields [15] . "\n" ;
 218+ }
 220+ if ($#fields < $fields_expected) { $fields_too_few ++ ; print "invalid field count " . $#fields . "\n" ; next ; }
 221+ if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid field count " . $#fields . "\n" ; next ; }
192223 $time = $fields [2] ;
194225 if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
Index: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
@@ -0,0 +1,922 @@
 2+ #!/usr/bin/perl
 4+ use lib "/home/ezachte/lib" ;
 5+ use EzLib ;
 7+sub WriteOutputIpFrequencies
 9+ trace WriteOutputIpFrequencies ;
 11+ my $path_out = shift ;
 12+ print "\ncd $path_out\n\n" ;
 13+ chdir ($path_out) ;
 15+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
 17+ open CSV_MULTIPLE_ADDRESSES, '>', $file_ip_frequencies ;
 18+ print "# html pages found: $html_pages_found\n" ;
 19+ print CSV_MULTIPLE_ADDRESSES "# html pages found: $html_pages_found\n" ;
 20+ print CSV_MULTIPLE_ADDRESSES "#\n" ;
 22+ foreach $address (keys %ip_frequencies)
 23+ {
 24+ $ip_distribution {$ip_frequencies {$address}} ++ ;
 25+ }
 27+ $ip_distribution_ge_2 = 0 ;
 28+ $ip_distribution_ge_3 = 0 ;
 29+ $ip_distribution_ge_4 = 0 ;
 30+ $ip_distribution_ge_5 = 0 ;
 31+ $ip_distribution_ge_10 = 0 ;
 32+ $ip_distribution_ge_20 = 0 ;
 33+ $ip_distribution_ge_50 = 0 ;
 34+ $ip_distribution_ge_100 = 0 ;
 35+ $ip_distribution_ge_250 = 0 ;
 36+ $ip_distribution_ge_1000 = 0 ;
 37+ $ip_distribution_ge_2500 = 0 ;
 38+ $ip_distribution_ge_10000 = 0 ;
 40+ foreach $frequency (sort {$a <=> $b} keys %ip_distribution)
 41+ {
 42+ $metafreq = $ip_distribution {$frequency} ;
 43+ if ($frequency >= 2) { $ip_distribution_ge_2 += $metafreq ; }
 44+ if ($frequency >= 3) { $ip_distribution_ge_3 += $metafreq ; }
 45+ if ($frequency >= 4) { $ip_distribution_ge_4 += $metafreq ; }
 46+ if ($frequency >= 5) { $ip_distribution_ge_5 += $metafreq ; }
 47+ if ($frequency >= 10) { $ip_distribution_ge_10 += $metafreq ; }
 48+ if ($frequency >= 20) { $ip_distribution_ge_20 += $metafreq ; }
 49+ if ($frequency >= 50) { $ip_distribution_ge_50 += $metafreq ; }
 50+ if ($frequency >= 100) { $ip_distribution_ge_100 += $metafreq ; }
 51+ if ($frequency >= 250) { $ip_distribution_ge_250 += $metafreq ; }
 52+ if ($frequency >= 1000) { $ip_distribution_ge_1000 += $metafreq ; }
 53+ if ($frequency >= 2500) { $ip_distribution_ge_2500 += $metafreq ; }
 54+ if ($frequency >= 10000) { $ip_distribution_ge_10000 += $metafreq ; }
 55+ if ($frequency > 20) { next ; }
 56+ print "# $metafreq addresses occur $frequency times\n" ;
 57+ print CSV_MULTIPLE_ADDRESSES "# $metafreq addresses occur $frequency times\n" ;
 58+ }
 60+ print CSV_MULTIPLE_ADDRESSES "#\n" ;
 61+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_2 addresses occur 2+ times\n" ;
 62+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_3 addresses occur 3+ times\n" ;
 63+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_4 addresses occur 4+ times\n" ;
 64+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_5 addresses occur 5+ times\n" ;
 65+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_10 addresses occur 10+ times\n" ;
 66+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_20 addresses occur 20+ times\n" ;
 67+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_50 addresses occur 50+ times\n" ;
 68+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_100 addresses occur 100+ times\n" ;
 69+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_250 addresses occur 250+ times\n" ;
 70+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_1000 addresses occur 1000+ times\n" ;
 71+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_2500 addresses occur 2500+ times\n" ;
 72+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_10000 addresses occur 10000+ times\n" ;
 74+ foreach $address (sort {$ip_frequencies {$b} <=> $ip_frequencies {$a}} keys %ip_frequencies)
 75+ {
 76+ $frequency = $ip_frequencies {$address} ;
 77+ # print "$freq,$address\n" ;
 78+ if ($frequency > 1)
 79+ { print CSV_MULTIPLE_ADDRESSES "$frequency,$address\n" ; }
 80+ }
 84+ if ($job_runs_on_production_server)
 85+ {
 86+ $cmd = "bzip2 -f $file_ip_frequencies" ;
 87+ print "\ncmd = '$cmd'\n" ;
 88+ `$cmd` ;
 89+ }
 92+sub WriteOutputSquidSequenceGaps
 94+ trace WriteOutputSquidSequenceGaps ;
 96+ my $path_out = shift ;
 97+ print "\ncd $path_out\n\n" ;
 98+ chdir ($path_out) ;
 100+ my ($tot_events_all_day, $tot_delta_all_day, %all_squids_events, %all_squids_delta, %squids, $tot_squids) ;
 102+ $yyyy = substr ($time_to_start,0,4) ;
 103+ $mm = substr ($time_to_start,5,2) ;
 104+ $dd = substr ($time_to_start,8,2) ;
 105+ $date = substr ($time_to_start,0,10) ;
 106+ $date_excel = "\"=DATE($yyyy,$mm,$dd)\"" ;
 108+ open CSV_SEQNO_PER_SQUIDHOUR, '>', $file_seqno_per_squidhour ;
 109+ print CSV_SEQNO_PER_SQUIDHOUR "squid,hour,events,tot delta,avg delta\n" ;
 111+ $squid_hour = 0 ;
 112+ foreach $squid_hour (sort keys %squid_events)
 113+ {
 114+ $events = $squid_events {$squid_hour} ;
 115+ next if $events == 0 ;
 117+ $delta = $squid_delta {$squid_hour} ;
 118+ $avg_delta = sprintf ("%.0f", $delta / $events) ;
 120+ print CSV_SEQNO_PER_SQUIDHOUR "$squid_hour,$events,$delta,$avg_delta\n" ;
 121+ print "$squid_hour,$events,$delta,$avg_delta\n" ;
 123+ $tot_events_all_day += $events ;
 124+ $tot_delta_all_day += $delta ;
 125+ ($squid,$hour) = split (',', $squid_hour) ;
 126+ $squids {$squid} ++ ;
 128+ $all_squids_events {$hour} += $events ;
 129+ $all_squids_delta {$hour} += $delta ;
 130+ }
 132+ foreach $squid (keys %squids)
 133+ { $tot_squids++ ; }
 136+ if ($tot_events_all_day > 0)
 137+ {
 138+ $avg_delta_all_day = sprintf ("%.0f", $tot_delta_all_day / $tot_events_all_day) ;
 139+ $tot_events_all_day_corrected = sprintf ("%.0f", ($avg_delta_all_day / 1000) * $tot_events_all_day) ;
 141+ print CSV_SEQNO_PER_SQUIDHOUR "# Squids: $tot_squids Events: $tot_events_all_day Avg delta: $avg_delta_all_day\n\n" ;
 142+ print "\nSquids: $tot_squids\nEvents: $tot_events_all_day\nAvg delta: $avg_delta_all_day\n\n" ;
 143+ }
 144+ else
 145+ {
 146+ print CSV_SEQNO_PER_SQUIDHOUR "# Squids: $tot_squids Events: 0\n\n" ;
 147+ print "\nSquids: $tot_squids\nEvents: 0\n\n" ;
 148+ }
 151+ # now same thing for all squids combined, hourly
 153+ undef @csv ;
 155+ open CSV_SEQNO_ALL_SQUIDS_DAY, '>', $file_seqno_all_squids ;
 156+ print CSV_SEQNO_ALL_SQUIDS_DAY "date,time,events,avg delta seqno\n" ;
 158+ open CSV_SEQNO_ALL_SQUIDS_MONTH, '<', "../$file_seqno_all_squids" ;
 159+ while ($line = <CSV_SEQNO_ALL_SQUIDS_MONTH>)
 160+ {
 161+ next if $line =~ /^$date/ ;
 162+ next if $line =~ /^date/ ;
 163+ push @csv, $line ;
 164+ }
 167+ open CSV_SEQNO_ALL_SQUIDS_MONTH, '>', "../$file_seqno_all_squids" ;
 168+ print CSV_SEQNO_ALL_SQUIDS_MONTH "date,time,events (x 1000),avg delta seqno,date excel,events corrected (x 1000)\n" ;
 169+ foreach $line (sort @csv)
 170+ { print CSV_SEQNO_ALL_SQUIDS_MONTH $line ; }
 172+ $hour = '' ;
 173+ foreach $hour (sort keys %all_squids_events)
 174+ {
 175+ $avg_delta = 0 ;
 176+ $events = $all_squids_events {$hour} ;
 177+ $delta = $all_squids_delta {$hour} ;
 178+ if ($events > 0)
 179+ { $avg_delta = sprintf ("%.0f", $delta / $events) ; }
 181+ print CSV_SEQNO_ALL_SQUIDS_DAY "$date,$hour,$events,$avg_delta\n" ;
 182+ print CSV_SEQNO_ALL_SQUIDS_MONTH "$date,$hour,$events,$avg_delta\n" ;
 183+ print "$date,$hour,$events,$avg_delta\n" ;
 184+ }
 186+ print CSV_SEQNO_ALL_SQUIDS_MONTH "$date,*,$tot_events_all_day,$avg_delta_all_day,$date_excel,$tot_events_all_day_corrected\n" ;
 187+ print "$date,*,$tot_events_all_day,$avg_delta_all_day,$tot_events_all_day_corrected\n" ;
 193+sub WriteOutputSquidLogs
 195+ trace WriteOutputSquidLogs ;
 197+ my $path_out = shift ;
 198+ print "\ncd $path_out\n\n" ;
 199+ chdir ($path_out) ;
 201+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
 203+ open CSV_METHODS, '>', $file_csv_methods ;
 204+ open CSV_SKINS, '>', $file_csv_skins ;
 205+ open CSV_SCRIPTS, '>', $file_csv_scripts ;
 206+ open CSV_IMAGES, '>', $file_csv_images ;
 207+ open CSV_BANNERS, '>', $file_csv_banners ;
 208+ open CSV_BINARIES, '>', $file_csv_binaries ;
 209+ open CSV_EXTENSIONS, '>', $file_csv_extensions ;
 210+ open CSV_REQUESTS, '>', $file_csv_requests ;
 211+ open CSV_REQUESTS_WAP, '>', $file_csv_requests_wap ;
 212+ open CSV_REQUESTS_M, '>', $file_csv_requests_m ;
 213+ open CSV_ORIGINS, '>', $file_csv_origins ;
 214+ open CSV_SEARCH, '>', $file_csv_search ;
 215+ open CSV_BOTS, '>', $file_csv_bots ;
 216+ open CSV_GOOGLEBOTS, '>', $file_csv_googlebots ;
 217+ open CSV_OPSYS, '>', $file_csv_opsys ;
 218+ open CSV_CLIENTS, '>', $file_csv_clients ;
 219+ open CSV_LANGUAGES, '>', $file_csv_languages ;
 220+ open CSV_COUNTRIES_VIEWS, '>', $file_csv_countries_views ;
 221+ open CSV_COUNTRIES_SAVES, '>', $file_csv_countries_saves ;
 222+ open CSV_COUNTRIESTIMED, '>', $file_csv_countries_timed ;
 223+ open OUT_REFERERS, '>', $file_out_referers ;
 224+ open CSV_CLIENTS_BY_WIKI, '>', $file_csv_clients_by_wiki ;
 225+ open CSV_AGENTS, '>', $file_csv_agents ;
 227+ print CSV_METHODS $comment ;
 228+ print CSV_SKINS $comment ;
 229+ print CSV_SCRIPTS $comment ;
 230+ print CSV_IMAGES $comment ;
 231+ print CSV_BANNERS $comment ;
 232+ print CSV_BINARIES $comment ;
 233+ print CSV_EXTENSIONS $comment ;
 234+ print CSV_REQUESTS $comment ;
 235+ print CSV_REQUESTS_WAP $comment ;
 236+ print CSV_REQUESTS_M $comment ;
 237+ print CSV_ORIGINS $comment ;
 238+ print CSV_SEARCH $comment ;
 239+ print CSV_BOTS $comment ;
 240+ print CSV_GOOGLEBOTS $comment ;
 241+ print CSV_OPSYS $comment . "# mobile: $tags_mobile ($tags_mobile_upd)\n" .
 242+ "# pos 1: - = non mobile, M = mobile ('-'+'M'=100%), G = aggregated Group\n" ;
 243+ print CSV_CLIENTS $comment ;
 244+ print CSV_LANGUAGES $comment ;
 245+ print CSV_COUNTRIES_VIEWS $comment ;
 246+ print CSV_COUNTRIES_SAVES $comment ;
 247+ print CSV_COUNTRIESTIMED $comment ;
 248+ print OUT_REFERERS $comment ;
 249+ print CSV_CLIENTS_BY_WIKI $comment ;
 250+ print CSV_AGENTS $comment ;
 253+ print OUT "\nMETHODS:\n\n" ;
 254+ print "\nMethods:\n\n" ;
 255+ $method_all = 0 ;
 256+ foreach $key (sort keys %statusses)
 257+ {
 258+ if ($key =~ /:total/)
 259+ {
 260+ $total = $statusses {$key} ;
 261+ $method_all += $total ;
 262+ ($method = $key) =~ s/:.*$// ;
 263+ print OUT sprintf ("%-8s", "$method:") . sprintf ("%6d",$total) . "\n" ;
 264+ print sprintf ("%-8s", "$method:") . sprintf ("%6d",$total) . "\n" ;
 265+ }
 266+ }
 267+ print OUT "TOTAL: " . sprintf ("%6d",$method_all) . "\n" ;
 268+ print "TOTAL: " . sprintf ("%6d",$method_all) . "\n" ;
 270+ print CSV_METHODS ":method,status,count\n" ;
 271+ foreach $key (sort keys %statusses)
 272+ {
 273+ if ($key =~ /:total/)
 274+ {
 275+ $total = $statusses {$key} ;
 276+ ($method = $key) =~ s/:.*$// ;
 277+ }
 278+ else
 279+ {
 280+ $total = $statusses {$key} ;
 282+ print OUT sprintf ("%6d",$total) . " : " . $key . "\n" ;
 283+ $key2 = $key ;
 284+ $key2 =~ s/,/&comma;/g ;
 285+ $key2 =~ s/\:/,/g ;
 286+ print CSV_METHODS "$key2,$total\n" ;
 287+ }
 288+ }
 290+ # CSV_SKINS
 291+ print OUT "\nSKINS:\n\n" ;
 292+ print CSV_SKINS ":scripts,parameters,count\n" ;
 293+ $total_skins = 0 ;
 294+ foreach $key (sort keys %skins)
 295+ {
 296+ $total = $skins{$key} ;
 297+ $total_skins += $total ;
 298+ print OUT sprintf ("%5d",$total) . " : " . $key . "\n" ;
 299+ print CSV_SKINS "$key,$total\n" ;
 300+ }
 301+ print OUT sprintf ("%5d",$total_skins) . " : total\n" ;
 304+ print OUT "\nSCRIPTS:\n\n" ;
 305+ print CSV_SCRIPTS ":scripts,parameters,count\n" ;
 306+ foreach $key (sort keys %scripts)
 307+ {
 308+ print OUT sprintf ("%5d",$scripts{$key}) . " : " . $key . "\n" ;
 309+ print CSV_SCRIPTS "$key,${scripts{$key}}\n" ;
 310+ }
 313+ foreach $key (sort keys %scripts_no_further_processing)
 314+ {
 315+ print OUT sprintf ("%5d",$scripts_no_further_processing{$key}) . " : " . $key . "\n" ;
 316+ }
 318+ # CSV_IMAGES
 319+ print OUT "\nIMAGE SIZES:\n\n" ;
 320+ print CSV_IMAGES ":size range,count\n" ;
 321+ foreach $range (sort keys %imagesizes)
 322+ {
 323+ ($range2 = $range) =~ s/ //g ;
 324+ $count = $imagesizes {$range} ;
 325+ print OUT sprintf ("%5d",$count) . " : $range\n" ;
 326+ print CSV_IMAGES "$range2,$count\n" ;
 327+ }
 330+ print OUT "\nBANNERS:\n\n" ;
 331+ print CSV_BANNERS ":country,url\n" ;
 332+ foreach $key (sort {$banners {$b} <=> $banners {$a}} keys %banners)
 333+ {
 334+ print OUT sprintf ("%5d",$banners{$key}) . " : " . $key . "\n" ;
 335+ print CSV_BANNERS "$key,${banners{$key}}\n" ;
 336+ }
 339+ print OUT "\nBINARIES:\n\n" ;
 340+ print CSV_BINARIES ":file,count\n" ;
 341+ $cnt_binaries = 0 ;
 342+ foreach $key (sort {$binaries {$b} <=> $binaries {$a}} keys %binaries)
 343+ {
 344+ if (++$cnt_binaries <= 500)
 345+ { print OUT sprintf ("%5d",$binaries{$key}) . " : " . $key . "\n" ; }
 347+ print CSV_BINARIES "$key,${binaries{$key}}\n" ;
 348+ }
 349+ # print OUT "\nImages:\n\n" ;
 350+ # print CSV_IMAGES ":project,referer,ext,mime,parms,count\n" ;
 352+ foreach $key (sort keys %images_xref)
 353+ {
 354+ print OUT sprintf ("%5d",$images_xref{$key}) . " : " . $key . "\n" ;
 355+ # $key2 = $key ;
 356+ # $key2 =~ s/,/&comma;/g ;
 357+ # $key2 =~ s/\|/,/g ;
 358+ # push @csv, "$key2,${requests{$key}}" ;
 359+ }
 360+ #@csv =sort @csv ;
 361+ #foreach $line (@csv)
 362+ #{ print CSV_REQUESTS "$line\n" ; }
 365+ print OUT "\nEXTENSIONS:\n\n" ;
 366+ print "\nExtensions:\n\n" ;
 367+ print CSV_EXTENSIONS ":extension,count\n" ;
 368+ $total = 0 ;
 369+ foreach $key (sort {$exts {$b} <=> $exts {$a}} keys %exts)
 370+ {
 371+ $count = $exts {$key} ;
 372+ $total += $count ;
 373+ print OUT sprintf ("%6d",$count) . " : $key\n" ;
 374+ print sprintf ("%6d",$count) . " : $key\n" ;
 375+ print CSV_EXTENSIONS "$key,$count\n" ;
 376+ }
 377+ print OUT sprintf ("%6d",$total) . " : total\n" ;
 378+ print sprintf ("%6d",$total) . " : total\n" ;
 381+ undef @csv ;
 382+ print OUT "\nREQUESTS:\n\n" ;
 383+ print CSV_REQUESTS $legend ;
 384+ print CSV_REQUESTS ":project,referer,ext,mime,parms,count\n" ;
 385+ foreach $key (sort keys %requests)
 386+ {
 387+ print OUT sprintf ("%5d",$requests{$key}) . " : " . $key . "\n" ;
 388+ $key2 = $key ;
 389+ $key2 =~ s/,/&comma;/g ;
 390+ $key2 =~ s/\|/,/g ;
 391+ push @csv, "$key2,${requests{$key}}" ;
 392+ }
 393+ @csv = sort @csv ;
 394+ foreach $line (@csv)
 395+ { print CSV_REQUESTS "$line\n" ; }
 398+ undef @csv ;
 399+ print OUT "\nREQUESTS_WAP:\n\n" ;
 400+ print CSV_REQUESTS_WAP $legend ;
 401+ print CSV_REQUESTS_WAP ":project,ext,mime,parms,country,count\n" ;
 402+ foreach $key (sort keys %requests_wap)
 403+ {
 404+ print OUT sprintf ("%5d",$requests_wap{$key}) . " : " . $key . "\n" ;
 405+ $key2 = $key ;
 406+ $key2 =~ s/,/&comma;/g ;
 407+ $key2 =~ s/\|/,/g ;
 408+ push @csv, "$key2,${requests_wap{$key}}" ;
 409+ }
 410+ @csv = sort @csv ;
 411+ foreach $line (@csv)
 412+ { print CSV_REQUESTS_WAP "$line\n" ; }
 415+ undef @csv ;
 416+ print OUT "\nREQUESTS_M:\n\n" ;
 417+ print CSV_REQUESTS_M $legend ;
 418+ print CSV_REQUESTS_M ":project,ext,mime,parms,country,count\n" ;
 419+ foreach $key (sort keys %requests_m)
 420+ {
 421+ print OUT sprintf ("%5d",$requests_m{$key}) . " : " . $key . "\n" ;
 422+ $key2 = $key ;
 423+ $key2 =~ s/,/&comma;/g ;
 424+ $key2 =~ s/\|/,/g ;
 425+ push @csv, "$key2,${requests_m{$key}}" ;
 426+ }
 427+ @csv = sort @csv ;
 428+ foreach $line (@csv)
 429+ { print CSV_REQUESTS_M "$line\n" ; }
 431+ # CSV_BOTS
 432+ foreach $key (sort {$bots {$b} <=> $bots {$a}} keys %bots)
 433+ { print CSV_BOTS $bots{$key} . ",$key\n" ; }
 436+ print CSV_GOOGLEBOTS "# Hits for googlebot from Google ip address\n" ;
 437+ print CSV_GOOGLEBOTS ":date,:ip range,:hits\n" ;
 438+ foreach $key (sort {$a cmp $b} keys %google_bot_hits)
 439+ {
 440+ my $year = substr ($key,0,4) ;
 441+ my $mon = substr ($key,5,2) ;
 442+ my $mday = substr ($key,8,2) ;
 443+ my $hour = substr ($key,11,2) ;
 444+ my $date = "$year/$mon/$mday $hour:00:00" ;
 445+ my $iprange = $key ;
 446+ $iprange =~ s/^[^,]*,// ;
 448+ print CSV_GOOGLEBOTS "$date,$iprange,${google_bot_hits{$key}}\n" ;
 449+ }
 451+ #print OUT "\nUrls:\n" ;
 452+ #foreach $key (sort keys %urls)
 453+ #{ print OUT sprintf ("%5d",$urls{$key}) . " : " . $key . "\n" ; }
 456+ print OUT "\nINTERWIKI:\n\n" ;
 457+ foreach $key (sort keys %interwiki)
 458+ { print OUT sprintf ("%5d",$interwiki{$key}) . " : " . $key . "\n" ; }
 460+ print OUT "\nREFERER UPLOAD:\n\n" ;
 461+ foreach $key (sort keys %referer_upload)
 462+ { print OUT sprintf ("%5d",$referer_upload{$key}) . " : " . $key . "\n" ; }
 465+ print OUT_REFERERS $legend ;
 466+ print OUT_REFERERS "referer,count\n" ;
 468+ print OUT_REFERERS "# internal\n" ;
 469+ foreach $key (sort keys %referers_internal)
 470+ { print OUT_REFERERS sprintf ("%5d",$referers_internal{$key}) . " : " . $key . "\n" ; }
 472+ print OUT_REFERERS "# external\n" ;
 473+ foreach $key (sort {$origins_external {$b} <=> $origins_external {$a} } keys %origins_external)
 474+ { print OUT_REFERERS sprintf ("%5d",$origins_external{$key}) . " : " . $key . "\n" ; }
 476+ print OUT_REFERERS "# unsimplified\n" ;
 477+ foreach $key (sort keys %origins_unsimplified)
 478+ { print OUT_REFERERS sprintf ("%5d",$origins_unsimplified{$key}) . " : " . $key . "\n" ; }
 480+ print OUT_REFERERS "# simplified\n" ;
 481+ foreach $key (sort keys %origin_simplified)
 482+ { print OUT_REFERERS sprintf ("%5d",$origin_simplified{$key}) . " : " . $key . "\n" ; }
 484+ print "\nLook alikes:\n\n" ;
 485+ print OUT_REFERERS "# look alikes\n" ;
 486+ foreach $key (sort {$wikis {$b} <=> $wikis {$a}} keys %wikis)
 487+ {
 488+ print OUT_REFERERS sprintf ("%5d",$wikis{$key}) . " : " . $key . "\n" ;
 489+ print sprintf ("%5d",$wikis{$key}) . " : " . $key . "\n" ;
 490+ }
 493+ print OUT "\nORIGINS:\n\n" ;
 494+ print CSV_ORIGINS ":toplevel,count\n" ;
 495+ foreach $key (sort keys %origins)
 496+ {
 497+ print OUT sprintf ("%8d",$origins{$key}) . " : " . $key . "\n" ;
 498+ print CSV_ORIGINS "$key,${origins{$key}}\n" ;
 499+ }
 501+ # CSV_SEARCH
 502+ print OUT "\nSEARCHES:\n" ;
 503+ print CSV_SEARCH ":matches (ip range|referer|agent string),site,referer group,bot,agent match,mime group,top level domain,count\n" ;
 504+ foreach $key (sort keys %search)
 505+ {
 506+ print OUT sprintf ("%8d",$search{$key}) . " : " . $key . "\n" ;
 507+ print CSV_SEARCH "$key,${search{$key}}\n" ;
 508+ }
 511+ print OUT "\nLANGUAGES:\n\n" ;
 512+ print CSV_LANGUAGES ":browser,:language,:count\n" ;
 513+ foreach $key (sort keys %languages)
 514+ {
 515+ print OUT sprintf ("%8d",$languages{$key}) . " : " . $key . "\n" ;
 516+ print CSV_LANGUAGES "$key,${languages{$key}}\n" ;
 517+ }
 519+ #print OUT "\nSources:\n\n" ;
 520+ #foreach $key (sort keys %srcs)
 521+ #{ print OUT sprintf ("%5d",$srcs{$key}) . " : " . $key . "\n" ; }
 523+ print OUT "\nGOOGLE BOTS:\n\n" ;
 524+ foreach $key (sort keys %googlebots)
 525+ { print OUT sprintf ("%5d",$googlebots{$key}) . " : " . $key . "\n" ; }
 527+ print OUT "\nGOOGLE BINS:\n\n" ;
 528+ print "\nGoogle bins:\n\n" ;
 529+ foreach $key (sort {$googlebins {$b} <=> $googlebins {$a}} keys %googlebins)
 530+ {
 531+ print OUT sprintf ("%5d",$googlebins{$key}) . " : " . $key . "\n" ;
 532+ print sprintf ("%5d",$googlebins{$key}) . " : " . $key . "\n" ;
 533+ }
 535+ print OUT "\nGOOGLE BINS 2:\n\n" ;
 536+ print "\nGoogle bins 2:\n\n" ;
 537+ foreach $key (sort {$googlebins2 {$b} <=> $googlebins2 {$a}} keys %googlebins2)
 538+ {
 539+ print OUT sprintf ("%5d",$googlebins2{$key}) . " : " . $key . "\n" ;
 540+ print sprintf ("%5d",$googlebins2{$key}) . " : " . $key . "\n" ;
 541+ }
 543+ print OUT "\nDOMAIN ERRORS:\n\n" ;
 544+ foreach $key (sort { $domain_errors {$b} <=> $domain_errors {$a}} keys %domain_errors)
 545+ { print OUT sprintf ("%5d",$domain_errors{$key}) . " : " . $key . "\n" ; }
 548+ foreach $key (sort { $googleagents {$b} <=> $googleagents {$a}} keys %googleagents)
 549+ { print OUT sprintf ("%5d",$googleagents{$key}) . " : " . $key . "\n" ; }
 551+ print OUT "\nGOOGLE LOOK ALIKES:\n\n" ;
 552+ foreach $key (sort { $google_imposters {$b} <=> $google_imposters {$a}} keys %google_imposters)
 553+ { print OUT sprintf ("%5d",$google_imposters{$key}) . " : " . $key . "\n" ; }
 555+ print OUT "\nYAHOO BOTS:\n\n" ;
 556+ foreach $key (sort keys %yahoobots)
 557+ { print OUT sprintf ("%5d",$yahoobots{$key}) . " : " . $key . "\n" ; }
 559+ if ($count_hits_per_ip_range)
 560+ {
 561+ print OUT "\nIP ACTIVITY BY COUNT:\n\n" ;
 562+ foreach $key (sort {$cnt_ip_ranges {$b} <=> $cnt_ip_ranges {$a}}keys %cnt_ip_ranges)
 563+ {
 564+ if ($cnt_ip_ranges {$key} >= 10)
 565+ { print OUT sprintf ("%5d",$cnt_ip_ranges{$key}) . " : " . $key . "\n" ; }
 566+ }
 567+ }
 569+ print OUT "\nIP ACTIVITY BY ADDRESS:\n\n" ;
 570+ foreach $key (sort keys %cnt_ip_ranges)
 571+ {
 572+ if ($cnt_ip_ranges {$key} >= 10)
 573+ { print OUT sprintf ("%5d",$cnt_ip_ranges{$key}) . " : " . $key . "\n" ; }
 574+ }
 576+ print OUT2 "\nOPERATING SYSTEMS:\n\n" ;
 577+ print CSV_OPSYS ":rectype,opsys,count\n" ;
 578+ $total_operating_systems = 0 ;
 580+ foreach $key (keys %operating_systems)
 581+ { $total_operating_systems += $operating_systems{$key} ; }
 583+ print OUT2 "\nTOTAL_OPERATING_SYSTEMS: $total_operating_systems\n\n" ;
 584+ foreach $key (sort keys %operating_systems)
 585+ {
 586+ my $count = $operating_systems {$key} ;
 587+ my $count2 = sprintf ("%5d",$count) ;
 588+ my $perc1 = sprintf ("%6.2f",(100*$count/$total_operating_systems)) . "%" ;
 589+ my $perc2 = sprintf ("%.2f",(100*$count/$total_operating_systems)) . "%" ;
 591+ if ($count >= 1)
 592+ { print OUT2 "$count2 = $perc1: $key \n" ; }
 594+ print CSV_OPSYS "$key,$count,$perc2\n" ;
 595+ }
 596+ print OUT2 "\nOPERATING SYSTEMS GROUPED:\n\n" ;
 597+ $total_operating_systems_printed = 0 ;
 598+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "BlackBerry") ;
 599+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "DoCoMo") ;
 600+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "FreeBSD") ;
 601+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "iPad") ;
 602+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "iPhone") ;
 603+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Linux") ;
 604+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac") ;
 605+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "OpenBSD") ;
 606+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "SunOS") ;
 607+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "SymbianOS") ;
 608+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Windows") ;
 609+ print OUT2 sprintf ("%6d",$total_operating_systems_printed) . "=" . sprintf ("%5.2f",(100*$total_operating_systems_printed/$total_operating_systems)) . "% : Total\n\n" ;
 611+ @LinuxVersions = split (',', 'Android,Xubuntu,Kubuntu,Ubuntu,Gentoo,PCLinuxOS,CentOS,Oracle,Mandriva,Red Hat,Mandriva,openSUSE,SUSE,Fedora,Epiphany,Mint,Mips,Arch,Debian,Slackware,Motor,Other') ;
 613+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac Intel") ;
 614+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac PowerPC") ;
 616+ foreach $LinuxVersion (@LinuxVersions)
 617+ { &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Linux $LinuxVersion") ; }
 620+ print OUT2 "\nCLIENTS:\n\n" ;
 621+ print CSV_CLIENTS ":mobile,engine,client,mime-cat\n" ;
 622+ $total_clients = 0 ;
 623+ foreach $key (keys %clients)
 624+ {
 625+ ($mobile,$version,$mimecat) = split (',', $key) ;
 626+ print OUT2 "\%CLIENTS: '$mobile','$version','$mimecat': " . $clients{$key} . "\n" ;
 627+ $total_clients {$mimecat} += $clients{$key} ;
 628+ $version =~ s/ .*$// ;
 629+ $version =~ s/\/.*$// ;
 630+ $version =~ s/,/&comma;/g ;
 631+ $group = "$mobile,$version,$mimecat" ;
 632+ $grouped_clients {$group} += $clients{$key} ;
 633+ }
 634+ foreach $key (sort keys %clients)
 635+ {
 636+ ($mobile,$version,$mimecat) = split (',', $key) ;
 637+ my $count = $clients {$key} ;
 638+ my $count2 = sprintf ("%5d",$count) ;
 639+ my $perc1 = sprintf ("%6.2f",(100*$count/$total_clients {$mimecat})) . "%" ;
 640+ my $perc2 = sprintf ("%.2f" ,(100*$count/$total_clients {$mimecat})) . "%" ;
 642+ if ($clients {$key} >= 3)
 643+ { print OUT2 "$count2 = $perc1: $key\n" ; }
 645+ print CSV_CLIENTS "$key,$count,$perc2\n" ;
 646+ }
 647+ foreach $key (sort keys %engines)
 648+ {
 649+ my $count = $engines {$key} ;
 650+ print CSV_CLIENTS "E,$key,$count\n" ;
 651+ }
 652+ foreach $key (sort keys %grouped_clients)
 653+ {
 654+ ($group,$version,$mimecat) = split (',', $key) ;
 655+ my $count = $grouped_clients {$key} ;
 656+ my $perc2 = sprintf ("%.2f",(100*$count/$total_clients {$mimecat})) . "%" ;
 657+ print CSV_CLIENTS "G,$key,$count,$perc2\n" ;
 658+ }
 660+ print OUT2 "\nCLIENTS BY WIKI:\n\n" ;
 661+ print CSV_CLIENTS_BY_WIKI ":mobile,client,mime-cat\n" ;
 662+ $total_clients = 0 ;
 663+ foreach $key (keys %clients_by_wiki)
 664+ { $total_clients += $clients_by_wiki{$key} ; }
 666+ foreach $key (sort keys %clients_by_wiki)
 667+ {
 668+ my $count = $clients_by_wiki {$key} ;
 669+ my $count2 = sprintf ("%5d",$count) ;
 670+ my $perc1 = sprintf ("%6.2f",(100*$count/$total_clients)) . "%" ;
 671+ my $perc2 = sprintf ("%.2f",(100*$count/$total_clients)) . "%" ;
 672+ if ($clients_by_wiki {$key} >= 3)
 673+ { print OUT2 "$count2 = $perc1: $key\n" ; }
 674+ ($mobile,$version,$domain,$mimecat) = split (',', $key) ;
 675+ $domain = ExpandAbbreviation ($domain) ;
 676+ $domain =~ s/:/,/ ;
 677+ $domain =~ s/\&nbsp;/--/ ;
 678+ print CSV_CLIENTS_BY_WIKI "$mobile,$version,$domain,$mimecat,$count,$perc2\n" ;
 679+ }
 681+ foreach $key (sort keys %grouped_clients_by_wiki)
 682+ {
 683+ my $count = $grouped_clients_by_wiki {$key} ;
 684+ my $perc2 = sprintf ("%.2f",(100*$count/$total_clients)) . "%" ;
 685+ print CSV_CLIENTS_BY_WIKI "G,$key,$count,$perc2\n" ;
 686+ }
 688+ print OUT2 "\nGOOGLEBOT NOT FROM GOOGLE\n\n" ;
 689+ foreach $key (sort keys %ip_bot_no_google)
 690+ {
 691+ if ($ip_bot_no_google {$key} >= 3)
 692+ { print OUT2 sprintf ("%5d",$ip_bot_no_google{$key}) . " : " . $key . "\n" ; }
 693+ }
 695+ print OUT2 "\nMOBILE OTHER\n\n" ;
 696+ foreach $key (sort keys %mobile_other)
 697+ { print OUT2 sprintf ("%5d",$mobile_other{$key}) . " : " . $key . "\n" ; }
 699+ foreach $key (sort keys %countries_views)
 700+ {
 701+ my $count = $countries_views {$key} ;
 702+ print CSV_COUNTRIES_VIEWS "$key,$count\n" ;
 703+ }
 705+ foreach $key (sort keys %countries_saves)
 706+ {
 707+ my $count = $countries_saves {$key} ;
 708+ print CSV_COUNTRIES_SAVES "$key,$count\n" ;
 709+ print "$key,$count\n" ;
 710+ }
 712+ foreach $key (sort keys %countries_timed)
 713+ {
 714+ my $count = $countries_timed {$key} ;
 715+ print CSV_COUNTRIESTIMED "$key,$count\n" ;
 716+ }
 718+ foreach $key (keys_sorted_by_value_num_desc %agents_raw)
 719+ {
 720+ my $count = $agents_raw {$key} ;
 721+ $key =~ s/,/;/g ;
 722+ next if $count < 5 ;
 723+ print CSV_AGENTS "$key,$count\n" ;
 724+ }
 726+ close CSV_METHODS ;
 727+ close CSV_SKINS ;
 728+ close CSV_SCRIPTS ;
 729+ close CSV_IMAGES ;
 730+ close CSV_BANNERS ;
 731+ close CSV_BINARIES ;
 732+ close CSV_EXTENSIONS ;
 733+ close CSV_REQUESTS ;
 734+ close CSV_ORIGINS ;
 735+ close CSV_SEARCH ;
 736+ close CSV_BOTS ;
 737+ close CSV_GOOGLEBOTS ;
 738+ close CSV_OPSYS ;
 739+ close CSV_LANGUAGES ;
 743+ close CSV_CLIENTS ;
 744+ close CSV_CLIENTS_BY_WIKI ;
 745+ close OUT_REFERERS ;
 746+ close CSV_AGENTS ;
 749+sub WriteOutputEditsSavesFile
 751+ trace WriteOutputEditsSavesFile ;
 753+ my $path_out = shift ;
 754+ print "\ncd $path_out\n\n" ;
 755+ chdir ($path_out) ;
 757+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
 759+# $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
 761+ # only keep edits/submits for ip addresses which occur only once in this hash (stands for avg. 2000 hits)
 762+ foreach $key (keys %client_ip_record_cnt)
 763+ { $client_ip_record_cnt_total {$client_ip_record_cnt {$key}}++ ; }
 765+ print "\n\nEdit submit lines:\n" ;
 766+ foreach $key (sort {$b <=> $a} keys %client_ip_record_cnt_total)
 767+ {
 768+ print sprintf ("%5d", $client_ip_record_cnt_total {$key}) . " ip address(es) occur $key times\n" ;
 769+ $lines_edit_submit_total += $key * $client_ip_record_cnt_total {$key} ;
 770+ }
 771+ print "Total edit submit lines: $lines_edit_submit_total\n\n" ;
 773+ foreach $key (keys %index_php_raw)
 774+ {
 775+ ($client_ip,$key2) = split (',', $key, 2) ;
 776+ if ($client_ip_record_cnt {$client_ip} < 2)
 777+ {
 778+ $index_php {$key2} += $index_php_raw {$key} ;
 779+ $edit_submit_filtered += $index_php_raw {$key} ;
 780+ }
 781+ }
 782+ undef %index_php_raw ;
 784+ open CSV_INDEXPHP, '>', "$path_out/$file_csv_indexphp" ;
 786+ print CSV_INDEXPHP $comment ;
 787+ foreach $key (sort {$index_php {$b} <=> $index_php {$a}} keys %index_php)
 788+ {
 789+ print CSV_INDEXPHP "$key,${index_php {$key}}\n" ;
 790+ $lines_edit_submit_filtered ++ ;
 791+ }
 792+ print "Filtered edits+submits: $edit_submit_filtered in $lines_edit_submit_filtered lines\n\n" ;
 794+ close CSV_INDEXPHP ;
 797+sub WriteOutputCountriesSaves
 799+ my $path_out = shift ;
 801+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
 803+ open CSV_COUNTRIES_SAVES, '>', "$path_out/$file_csv_countries_saves" ;
 804+ print CSV_COUNTRIES_SAVES $comment ;
 806+ foreach $key (sort keys %countries_saves)
 807+ {
 808+ my $count = $countries_saves {$key} ;
 809+ print CSV_COUNTRIES_SAVES "$key,$count\n" ;
 810+ }
 814+sub WriteDiagnostics
 816+ if ($statusses_non_tcp > 0)
 817+ { print ERR "Statusses non 'TCP..' : $statusses_non_tcp\n" ; }
 819+ if ($fields_too_many > 0)
 820+ { print ERR "Too many fields on $fields_too_many records. (space in article name?)\n" ; }
 822+ if ($fields_too_few > 0)
 823+ { print ERR "Too few fields on $fields_too_few records.\n" ; }
 825+ print "\nLines read per date:\n" ;
 826+ print OUT "\nLines read per date:\n" ;
 827+ foreach $key (sort keys %lines_read)
 828+ {
 829+ print OUT "$key: " . sprintf ("%8d",$lines_read{$key}) . "\n" ;
 830+ print "$key: " . sprintf ("%8d",$lines_read{$key}) . "\n" ;
 831+ }
 832+ print OUT "\n" ;
 833+ print "\n" ;
 835+ print "Referers internal $tot_referers_internal\n" ;
 836+ print "Referers external $tot_referers_external\n" ;
 837+ print "Origins counted $tot_origins_external_counted\n" ;
 839+ print ERR "\nUnrecognized domains:\n\n" ;
 840+ foreach $key (sort keys %unrecognized_domains)
 841+ { print ERR sprintf ("%5d",$unrecognized_domains{$key}) . " : " . $key . "\n" ; }
 843+ print "\n$tot_mime_html html requests found.\n" ;
 844+ print "country info stored for $tot_mime_html2 html requests.\n" ;
 845+# # double check that yahoo is much more than 10% of google (even when google uses ip addresses)
 846+# print "\ngoogle string in referer or agent: $googles\n" ;
 849+sub WriteOutputLineToCsvSharePerOs
 851+ my $total_all = shift ;
 852+ my $criteria = shift ;
 853+ (my $criteria2 = $criteria) =~ s/ /.*/g ;
 854+ my @criteria = split (' ', $criteria2) ;
 856+ my $total_operating_systems = 0 ;
 857+ my $trace_count = 0 ;
 859+ print "WriteOutputLineToCsvSharePerOs $criteria\n" ;
 860+ foreach $key (keys %operating_systems)
 861+ {
 862+ my $match = $true ;
 863+ foreach $criterion (@criteria)
 864+ {
 865+ if ($key !~ /$criterion/)
 866+ {
 867+ if (($trace_count++ < 20) && ($criteria =~ /Linux/))
 868+ { print "key $key criterion $criterion FALSE\n" ; }
 869+ $match = $false ;
 870+ last ;
 871+ }
 872+ else
 873+ {
 874+ if (($trace_count++ < 20) && ($criteria =~ /Linux/))
 875+ { print "key $key criterion $criterion TRUE\n" ; }
 876+ }
 877+ }
 878+ if ($match)
 879+ { $total_operating_systems += $operating_systems {$key} ; }
 880+ }
 881+ $perc_operating_systems1 = ".." ;
 882+ $perc_operating_systems2 = ".." ;
 883+ if ($total_all > 0)
 884+ {
 885+ $perc_operating_systems1 = sprintf ("%5.2f",(100*$total_operating_systems/$total_all)) ;
 886+ $perc_operating_systems2 = sprintf ("%.2f",(100*$total_operating_systems/$total_all)) ;
 887+ }
 888+ print OUT2 sprintf ("%6d",$total_operating_systems) . "= $perc_operating_systems1\% : $criteria\n" ;
 889+ print CSV_OPSYS "G,$criteria,$total_operating_systems,$perc_operating_systems2\%\n" ; ;
 890+ $total_operating_systems_printed += $total_operating_systems ;
 893+sub MoveAndCompressFiles
 895+ trace MoveAndCompressFiles ;
 897+ my ($path_out, $path_out_month, $date_collect_files) = @_ ;
 899+ print "\ncd $path_out_month\n" ;
 900+ chdir ($path_out_month) ;
 902+# $cmd = "mv $path_out/private/SquidDataEditsSavesDoNotPublish.txt $path_out/private/SquidDataEditsSavesDoNotPublish$date_collect_files.txt" ;
 903+# print "\ncmd = '$cmd'\n" ;
 904+#`$cmd` ;
 906+ $cmd = "bzip2 -f $path_out/$file_edits_saves" ;
 907+ print "\ncmd = '$cmd'\n" ;
 908+ `$cmd` ;
 910+ $cmd = "bzip2 -f $path_out/$file_csv_agents" ;
 911+ print "\ncmd = '$cmd'\n" ;
 912+ `$cmd` ;
 914+ # $cmd = "tar -cf $date_collect_files\-csv.tar $date_collect_files/*.csv" ;
 915+ # print "\ncmd = '$cmd'\n" ;
 916+ # `$cmd` ;
 918+ # $cmd = "bzip2 -f $date_collect_files\-csv.tar" ;
 919+ # print "\ncmd = '$cmd'\n" ;
 920+ # `$cmd` ;
 923+1 ;
Index: trunk/wikistats/squids/SquidCountArchive.pl
@@ -510,6 +510,7 @@
511511 undef %squid_events ;
512512 undef %squid_seqno ;
513513 undef %statusses ;
 514+ undef %total_clients ;
514515 undef %unrecognized_domains ;
515516 undef %wikis ;
516517 # undef @files ;
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
@@ -4,13 +4,20 @@
55 {
66 my $line = shift ;
8 - my @fields = split (' ', $line) ;
98 $time = $fields [2] ;
109 $date = substr ($time,0,10) ;
1211 $client_ip = $fields [4] ;
1312 $mime = $fields [10] ;
 13+ $url = lc ($fields [8]) ;
 15+ if ($mime eq '-')
 16+ {
 17+ # no mime type on log records from varnish, assume 'page request' on most, until that stream has been fixed
 18+ if (($url =~ /\.m\..*?\/wiki\//) || ($url =~ /\.m\..*?\/w\/index.php/))
 19+ { $mime = "text/html" ; }
 20+ }
1522 if ($scan_ip_frequencies) # phase 1
1623 {
1724 return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ;
@@ -49,8 +56,12 @@
5057 $status = $fields [5] ;
5158 $size = $fields [6] ;
5259 $method = $fields [7] ;
53 - $url = lc ($fields [8]) ;
 61+ $referer = lc ($fields [11]) ;
 62+ $agent = $fields [13] ;
 64+# print "\ntime '$time', client_ip '$client_ip', mime '$mime', squid '$squid', seqno '$seqno', \nstatus '$status', size '$size', method '$method', referer '$referer',\nurl '$url', agent '$agent'\n" ;
5566 if ($url =~ /\.m\.wikipedia.org/)
5667 {
5768 $url_wikipedia_mobile ++ ;
@@ -63,9 +74,6 @@
6475 }
6576 }
67 - $referer = lc ($fields [11]) ;
68 - $agent = $fields [13] ;
69 -
7078 $url =~ s/^http\w?\:\/\///o ;
7179 $url =~ s/\%3A/:/gio ;
7280 $url =~ s/\%3B/;/gio ;
@@ -355,7 +363,11 @@
357365 if ($os =~ /Linux/o)
358366 {
359 - ($osx = $agent2) =~ s/^.*?((?:Android|Ubuntu|Gentoo|PCLinuxOS|CentOS|Red Hat|Mandriva|SUSE|Fedora|Epiphany|Debian|Motor\w+)[^\s;\[\]\(\)]*).*$/ucfirst($1)/ieo ;
 367+ ($cpu = $agent2) =~ s/^.*?(armv\d+|i\d+|x[0-9_]+).*$/$1/o ;
 368+ if ($cpu eq $agent2)
 369+ { $cpu = '' ; }
 371+ ($osx = $agent2) =~ s/^.*?((?:Android|Xubuntu|Kubuntu|Ubuntu|Gentoo|PCLinuxOS|CentOS|Oracle|Mandriva|Red Hat|Mandriva|openSUSE|SUSE|Fedora|Epiphany|Mint|Mips|Arch|Debian|Slackware|Motor\w+)[^\s;\[\]\(\)]*).*$/ucfirst($1)/ieo ;
360372 if ($osx ne $agent2)
361373 {
362374 $osx =~ s/(\d+\_\d+).*$/$1/o ;
@@ -363,8 +375,18 @@
364376 $osx =~ s/_/\./o ;
365377 $osx =~ s/(\d+\.\d+).*$/$1/o ;
366378 $osx =~ s/^(Motor)(\w+).*$/ucfirst(lc($1)).uc($2)/ieo ;
367 - $os = "$os $osx" ;
368379 }
 380+ else
 381+ { $osx = "Other" ; }
 383+ $os = "$os $cpu $osx" ;
 384+ $os =~ s/\s\s+/ /g ;
 386+ # testing:
 387+ # if ($osx eq $agent2)
 388+ # { print "Linux ?? -> $agent2\n" ; }
 389+ # elsif ($osx !~ /(?:Android|Ubuntu)/i)
 390+ # { print "Linux !! $cpu $osx -> $agent2\n" ; }
369391 }
371393 $os =~ s/(Windows NT \d+\.\d+).*$/$1/o ;
@@ -1189,7 +1211,7 @@
11901212 ($path = $url) =~ s/^.*?\.org\///o ;
11911213 ($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path
1193 - $binaries {$file} ++ ;
 1215+ $binaries {$path} ++ ; # Jan 2012 store path, not file only
11951217 if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io)
11961218 {
Index: trunk/wikistats/squids/SquidReportArchive.pl
@@ -6,8 +6,10 @@
77 ez_lib_version (2) ;
99 # set defaults mainly for tests on local machine
10 - default_argv "-m 2011-07 " ;
11 -# default_argv "-c -q 2010Q4" ;
 10+# default_argv "-m 2011-07 " ;
 11+# default_argv "-c -q 2010Q1" ;
 12+# default_argv "-w" ; # refresh country info from Wikipedia (population etc)
 13+ default_argv "-c" ;
1315 # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
1416 # ReportOrigin how to handle '!error <-> other
@@ -30,6 +32,8 @@
3234 getopt ("dmq", \%options) ;
 36+ undef %country_code_not_specified_reported ;
3438 if (-d "/a/squid")
3539 {
3640 print "\n\nJob runs on server $hostname\n\n" ;
@@ -52,11 +56,13 @@
5357 print "Path in = $path_in\n" ;
5458 print "Path out = $path_out\n" ;
 60+ $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
5662 # periodically harvest updated metrics from
5763 # 'http://en.wikipedia.org/wiki/List_of_countries_by_population'
5864 # 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'
5965 if (defined ($options {"w"}))
60 - { &ReadWikipedia ; exit ; }
 66+ { &ReadWikipedia ; print "Ready\n" ; exit ; }
6268 if (defined ($options {"c"}))
6369 { $reportcountries = $true ; }
@@ -77,7 +83,6 @@
7884 &InitProjectNames ;
8086 $file_csv_country_codes = "CountryCodes.csv" ;
81 - $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
8388 &ReadInputCountriesNames ;
@@ -384,7 +389,7 @@
385390 $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Trends" ;
386391 &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,$offset_links+3)) ;
388 - $links =~ s/,.*$// ;
 393+# $links =~ s/,.*$// ;
389394 $title = "$title_main - <font color=#008000>$views_edits Per Wikipedia Language</font> - Breakdown" ;
390395 &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,$offset_links+4)) ;
391396 }
@@ -480,8 +485,23 @@
481486 "<input type='button' value=' Archive ' onclick='window.location=\"http://stats.wikimedia.org/archive/squid_reports\"'> " .
482487 "<input type='button' value=' Wikimedia Statistics ' onclick='window.location=\"http://stats.wikimedia.org\"'>" .
483488 "</td></tr>\n</table><hr>" .
484 - "&nbsp;This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ;
 489+ # "&nbsp;This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ;
 490+ "&nbsp;This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<br>" ;
 492+ if ($reportcountries)
 493+ {
 494+ $header .= "<p>&nbsp;<font color=#900000>WMF traffic logging service suffered from server capacity problems from Nov 2009 till July 2010 and again in Aug/Sep/Oct 2011.<br>" .
 495+ "&nbsp;Data loss only occurred during peak hours. It therefore may have had somewhat different impact for traffic from different parts of the world." ;
 496+ }
 497+ else
 498+ {
 499+ $header .= "<font color=#900000>WMF traffic logging service suffered from server capacity problems in Aug/Sep/Oct 2011.<br>" .
 500+ "Absolute traffic counts for October 2011 are approximatly 7% too low.<br>" .
 501+ "Data loss only occurred during peak hours. It therefore may have had somewhat different impact for traffic from different parts of the world.<br>" .
 502+ "and may have also skewed relative figures like share of traffic per browser or operating system.</font><p>" ;
 503+ $header .= "<font color=#900000>In a an unrelated server outage precisely half of traffic to WMF mobile sites was not counted from Oct 16 - Nov 29 (one of two load-balanced servers did not report traffic).<br>" .
 504+ "WMF has since improved server monitoring, so that similar outages should be detected and fixed much faster from now on.</font><p>" ;
 505+ }
486506 # to be localized some day like any reports
487507 $out_license = "All data and images on this page are in the public domain." ;
488508 $out_generated = "Generated on " ;
@@ -619,7 +639,7 @@
621641 $client =~ s/_/./g ;
622642 $client =~ s/\.\./Other/g ;
623 - if ($client !=~ / \d/)
 643+ if ($client !~ / \d/)
624644 { $client =~ s/\// / ; }
625645 if ($rectype eq "-") { $total_clients_non_mobile += $count ; }
626646 if ($rectype eq "M") { $total_clients_mobile += $count ; }
@@ -1332,6 +1352,7 @@
13331353 {
13341354 # http://en.wikipedia.org/wiki/List_of_countries_by_population
13351355 # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
 1356+ print "Read $path_in/$file_csv_country_meta_info\n" ;
13361357 open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
13371358 binmode COUNTRY_META_INFO ;
13381359 while ($line = <COUNTRY_META_INFO>)
@@ -1342,6 +1363,7 @@
13431364 $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
13451366 ($country,$link,$population,$connected,$icon) = split ',', $line ;
 1367+ print "COUNTRY $country\nLINK $link\nPOPULATION $population\nCONNECTED $connected\n\n" ;
13461368 $country =~ s/&comma;/,/g ;
13481370 # use country names as given by MaxMind
@@ -1533,10 +1555,10 @@
15341556 $months_recently = keys %months_recently ;
15351557 if ($months_recently == 0) { die "\$months_recently == 0\n" ; }
1537 - $requests_recently_start = substr ($requests_recently_start,5,2) . "/" . substr ($requests_recently_start,2,2) ;
1538 - $requests_recently_stop = substr ($requests_recently_stop ,5,2) . "/" . substr ($requests_recently_stop ,2,2) ;
1539 - $requests_start = substr ($requests_start,5,2) . "/" . substr ($requests_start,2,2) ;
1540 - $requests_stop = substr ($requests_stop ,5,2) . "/" . substr ($requests_stop ,2,2) ;
 1559+ $requests_recently_start = substr ($requests_recently_start,0,4) . '/' . substr ($requests_recently_start,5,2);
 1560+ $requests_recently_stop = substr ($requests_recently_stop ,0,4) . '/' . substr ($requests_recently_stop ,5,2) ;
 1561+ $requests_start = substr ($requests_start,0,4) . '/' . substr ($requests_start,5,2) ;
 1562+ $requests_stop = substr ($requests_stop ,0,4) . '/' . substr ($requests_stop ,5,2) ;
15421564 foreach $yyyymm (keys %$yyyymm)
15431565 {
@@ -4644,9 +4666,6 @@
46454667 $html_total .= "<tr><td colspan=99>&nbsp;</td></tr>" ;
4648 - undef @keys_regions ;
4649 -# foreach $key (sort keys %population_per_hemisphere)
4650 -# { push @keys_regions, $key ; }
46514670 $html_regions = '' ;
46524671 foreach $key (qw (N S AF AS AU EU CA NA SA OC))
46534672 {
@@ -5150,19 +5169,19 @@
51515170 my $views_edits_lc = lc $views_edits ;
51535172 if ($show_logcount)
5154 - { $report_version = "<p>This is the extended version of this report, with even small percentages included (> $cutoff_percentage\%) (see also bottom of page). " .
5155 - "Switch to <a href='$file_html_per_country_breakdown'>regular version</a>" ; }
 5173+ { $report_version = "<p>Showing even small percentages (> $cutoff_percentage\%) (read <a href='#more'>more</a>). " .
 5174+ "Switch to <a href='$file_html_per_country_breakdown'>concise version</a>" ; }
51565175 else
5157 - { $report_version = "<p>This is the regular version of this report, with only major percentages (> $cutoff_percentage\%) included." .
5158 - " Switch to <a href='$file_html_per_country_breakdown_huge'>extended version</a>" ; }
 5176+ { $report_version = "<p>Showing only only major percentages (> $cutoff_percentage\%) (read <a href='#more'>more</a>). " .
 5177+ " Switch to <a href='$file_html_per_country_breakdown_huge'>detailed version</a>" ; }
51605179 $html = $header ;
51615180 $html =~ s/TITLE/$title/ ;
51625181 $html =~ s/HEADER/$title/ ;
51635182 $html =~ s/LINKS// ;
5164 - $html =~ s/ALSO/$links/ ;
 5183+ $html =~ s/ALSO/$links$report_version/ ;
51655184 $html =~ s/NOTES// ;
5166 - $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ;
 5185+ $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ;
51675186 $html =~ s/DATE// ;
51695188 $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
@@ -5262,7 +5281,7 @@
52635282 # $html .= "<tr><td colspan=99>&nbsp;</td></tr>\n" ;
52645283 }
52655284 $html .= "</table>" ;
5266 - $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
 5285+ $html .= "<p><a name='more' id='more'></a><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
52675286 "<br>&nbsp;Further percentages show per country share of $views_edits_lc per Wikipedia visited" ;
52685287 $html .= "<p><b>Countries</b> are only included if the number of requests in the period exceeds $cutoff_requests,000 ($cutoff_requests matching records in 1:1000 sampled log)" ;
52695288 $html .= "<p><b>Wikipedia's</b> are only listed for some country if the share of visitors for that particular country exceeds $cutoff_percentage\%." ;
@@ -5309,6 +5328,12 @@
53105329 $html =~ s/X1000/.&nbsp;Period <b>$requests_start - $requests_stop<\/b>/ ;
53115330 $html =~ s/DATE// ;
 5332+ if ($views_edits eq 'Page Views')
 5333+ {
 5334+ $html .= "<p><font color=#800000>Nov 2011: For some countries the share of page views on the English Wikipedia was significantly higher in 2010 than in 2009 and 2011,<br>" .
 5335+ "especially in Q1 and Q2. We don't know yet what caused this, this might be an artifact. Please be cautious to draw conclusions from this.</font>" ;
 5336+ }
53135338 $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
53155340 $html .= "<p><table border=1 width=800>INDEX\n" ;
@@ -5892,10 +5917,15 @@
58945919 sub ReadWikipedia
58955920 {
 5921+ print "ReadWikipedia\n\n" ;
58965923 use LWP::Simple qw($ua get);
58985925 $ua->agent('Wikipedia Wikicounts job');
58995926 $ua->timeout(60);
 5929+ print "Read List_of_countries_by_population\n\n" ;
59005930 my $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_population';
59015931 my $html = get $url || die "Timed out!";
@@ -5955,9 +5985,12 @@
59565986 $link =~ s/,/&comma;/g ;
59575987 $icon =~ s/,/&comma;/g ;
 5989+ print "country: $country\nlink: $link\npopulation: $population\nconnected: $connected\nicon: $icon\n\n" ;
59595990 $countries {$country} = "$country,$link,$population,connected,$icon\n" ;
59605991 }
 5993+ print "List_of_countries_by_number_of_Internet_users\n\n" ;
59625995 $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users';
59635996 $html = get $url || die "Timed out!";
@@ -5995,10 +6028,12 @@
59966029 $country =~ s/Timor Leste/Timor-Leste/ ;
59976030 $country =~ s/UAE/United Arab Emirates/ ;
 6032+ print "country: $country\nconnected: $connected\n\n" ;
59996033 $countries {$country} =~ s/connected/$connected/ ;
60006034 }
6002 - open COUNTRY_META_INFO, '>', "$path_out/SquidReportCountryMetaInfo.csv" ;
 6036+ print "Write $path_in/$file_csv_country_meta_info\n\n" ; # use $path_in, not $path_out so that next step picks up proper file
 6037+ open COUNTRY_META_INFO, '>', "$path_in/$file_csv_country_meta_info" ;
60036038 foreach $country (sort keys %countries)
60046039 { print COUNTRY_META_INFO $countries {$country} ; }
60056040 close COUNTRY_META_INFO ;
@@ -6086,11 +6121,11 @@
60876122 sub UnLink
60886123 {
60896124 my ($links,$index) = @_ ;
6090 -# print "\n\nUnLink $index\n\n" ;
 6125+ # print "\n\nUnLink $index\n\n" ;
60916126 my @segments = split '(?=<a )', $links ;
6092 -# print "SEGMENT 1 $segments[$index]\n" ;
 6127+ # print "SEGMENT 1 $segments[$index]\n" ;
60936128 $segments [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/<font color=#008000><b>$1<\/b><\/font>/ ;
6094 -# print "SEGMENT 2 $segments[$index]\n" ;
 6129+ # print "SEGMENT 2 $segments[$index]\n" ;
60956130 $links = join '', @segments ;
60966131 return ($links) ;
60976132 }
@@ -6139,8 +6174,8 @@
61406175 id: "millions",
61416176 is: function(s) { return false; },
61426177 //failed so far to turn 1.2M into 1200000, so figures with decimal point are sorted out of place
6143 -//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,"").replace(/\\.(\d)M/,$1+"00000").replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
6144 - format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,""). replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
 6178+//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,"").replace(/\\.(\\d)M/,$1+"00000").replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
 6179+ format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,""). replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
61456180 type: "numeric"
61466181 });
@@ -6211,7 +6246,7 @@
62126247 }
62136248 </style>
62146249 __HTML_SORT_TABLE__
6215 -return ($html) ;
 6250+ return ($html) ;
62166251 }
62186253 sub HtmlSortTableColumns

Status & tagging log