r82606 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r82605‎ | r82606 | r82607 >
Date:15:17, 22 February 2011
Author:ezachte
Status:deferred
Tags:
Comment:
Process squid logs for traffic reports http://stats.wikimedia.org/#requests
Modified paths:
  • /trunk/wikistats/squids (added) (history)
  • /trunk/wikistats/squids/SquidCountArchive.pl (added) (history)
  • /trunk/wikistats/squids/SquidCountArchive.sh (added) (history)
  • /trunk/wikistats/squids/SquidCountryScan.pl (added) (history)
  • /trunk/wikistats/squids/SquidCountryScan.sh (added) (history)
  • /trunk/wikistats/squids/SquidReportArchive.pl (added) (history)
  • /trunk/wikistats/squids/SquidReportArchive.sh (added) (history)

Diff [purge]

Index: trunk/wikistats/squids/SquidCountryScan.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# Wrapper for SquidCountryScan.pl: cap virtual memory, then run the scan.

# Upper bound on virtual memory for this shell and the perl job it starts.
max_virtual_memory=4000000
ulimit -v "$max_virtual_memory"

# To start from a later year pass -y, e.g.: perl ./SquidCountryScan.pl -y 2010
# Without -y the Perl script starts in July 2009.
perl ./SquidCountryScan.pl
Property changes on: trunk/wikistats/squids/SquidCountryScan.sh
___________________________________________________________________
Added: svn:eol-style
18 + native
Index: trunk/wikistats/squids/SquidCountryScan.pl
@@ -0,0 +1,488 @@
#!/usr/bin/perl
## Collect page view and save stats by country on Locke
## sub CollectRawData : per-day, per-country csv files -> monthly and daily aggregate csv files
## sub ProcessRawData : not used at the moment (the call below is commented out)

  use lib "/home/ezachte/lib" ;
  use EzLib ;
  $trace_on_exit = $true ;

  use Time::Local ;
  use Getopt::Std ;
  use Cwd;
  $timestart = time ;

  # -y nnnn = first year to process
  # anything that is not a four digit year of 2009 or later silently falls back to 2009
  my %options ;
  getopt ("y", \%options) ;
  $process_year = $options {"y"} ;
  unless (($process_year =~ /^\d\d\d\d$/) && ($process_year >= 2009))
  {
    $process_year = 2009 ;
  # print "Specify year as '-y nnnn'\n\n" ;
  # exit ;
  }

  $path_root = "/a/ezachte/" ;
# $path_root = "w:/! perl/squids/archive/" ;

  # per-day input files (current and legacy name), looked up inside each day folder
  $file_per_country_visits      = "public/SquidDataCountriesViews.csv" ;
  $file_per_country_visits_old  = "SquidDataCountries2.csv" ;
  $file_per_country_saves       = "public/SquidDataCountriesSaves.csv" ;
  $file_per_country_saves_old   = "SquidDataCountriesSaves.csv" ;

  # aggregated output files
  $file_raw_data_monthly_visits = "$path_root/SquidDataVisitsPerCountryMonthly.csv" ;
  $file_raw_data_daily_visits   = "$path_root/SquidDataVisitsPerCountryDaily.csv" ;
  $file_raw_data_monthly_saves  = "$path_root/SquidDataSavesPerCountryMonthly.csv" ;
  $file_raw_data_daily_saves    = "$path_root/SquidDataSavesPerCountryDaily.csv" ;

  # same aggregation twice: first for page views, then for saves
  &CollectRawData ('visits', $file_per_country_visits, $file_per_country_visits_old, $file_raw_data_monthly_visits, $file_raw_data_daily_visits) ;
  &CollectRawData ('saves',  $file_per_country_saves,  $file_per_country_saves_old,  $file_raw_data_monthly_saves,  $file_raw_data_daily_saves) ;
# &ProcessRawData ;

  exit ;
 44+
# Aggregate the per-day, per-country counts for one mode into a monthly and a daily csv file,
# and print a number of breakdowns (per project, country, language, ..) to stdout.
#
# Arguments:
#   $mode                  label, only used in the progress output ('visits' or 'saves' at the call sites)
#   $file_per_country      name of the per-day input file, relative to the day folder
#   $file_per_country_old  legacy name of the per-day input file, tried first
#   $file_raw_data_monthly output csv, lines "yyyy-mm,project,language,country,bot,count"
#   $file_raw_data_daily   output csv, lines "yyyy-mm-dd,project,language,country,bot,count"
#
# Reads globals $process_year and $path_root. Month folders $path_root/yyyy-mm are processed in
# sequence, starting at January of $process_year (July when that year is 2009), until the first
# folder that does not exist.
# Monthly counts are scaled by days_in_month / days_found when daily files are missing,
# daily counts are written uncorrected.
sub CollectRawData
{
  my ($mode, $file_per_country, $file_per_country_old, $file_raw_data_monthly, $file_raw_data_daily) = @_ ;

  # All accumulators are lexical: this sub runs once per mode, and totals from one
  # run must not end up in the percentages printed for the next run.
  my ($visits_total, $visits_wp_total, $visits_total_wp_en) = (0, 0, 0) ;
  my (%visits_monthly, %visits_daily, %visits_wp_yyyymm, %visits_per_project, %visits_per_language, %visits_per_country) ;
  my (%visits_wp_en, %visits_wp_b, %visits_wp_u, %correct_for_missing_days) ;

  print "Collect raw data for $mode\n\n" ;
  print "Input data per country $file_per_country, $file_per_country_old\n" ;
  print "Raw data monthly $file_raw_data_monthly\n" ;
  print "Raw data daily $file_raw_data_daily\n\n" ;

  $year = $process_year ;
  if ($year == 2009)
  { $month = 7 ; }
  else
  { $month = 1 ; }

  while ($true)
  {
    $dir    = "$path_root/" . sprintf ("%04d-%02d", $year, $month) ;
    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;

    if (! -d $dir)
    {
      print "Folder $dir not found. Processing complete.\n" ;
      last ;
    }

    print "Dir: $dir\n" ;
    $days_in_month = &DaysInMonth ($year,$month) ;

    $days_found = 0 ;
    for ($day = 1 ; $day <= $days_in_month ; $day++)
    {
      if (($month == 4) && ($year == 2009) && ($day < 18)) { next ; }

      $yyyymmdd = sprintf ("%04d-%02d-%02d", $year, $month, $day) ;

      # do not combine with SquidDataCountries.csv from earlier months
      # only from 2009-07 anonymous bots (hits > 1 in sampled log) were ignored
      $file = "$dir/$yyyymmdd/$file_per_country_old" ;
      if (! -e $file)
      { $file = "$dir/$yyyymmdd/$file_per_country" ; }

      if (! -e $file)
      {
        print "Miss! $file\n" ;
        next ;
      }

      # an unreadable file is treated like a missing one, so it does not count as a day found
      unless (open (IN, '<', $file))
      {
        print "Miss! $file (could not be opened: $!)\n" ;
        next ;
      }

      $days_found++ ;
      while ($line = <IN>)
      {
        if ($line =~ /^#/) { next ; }

        chomp $line ;
        ($bot,$wiki,$country,$count) = split (',', $line) ;

        # normalize bot flag: B = bot, U = user
        if ($bot =~ /Y/)
        { $bot = 'B' ; }
        else
        { $bot = 'U' ; }

        ($project,$language) = split (':', $wiki) ;
        $project =~ s/\s//g ;

      # if ($project ne "wp") { next ; }
      # if ($yyyymm ne "2009-11") { next ; }
      # if ($language eq "www") { next ; }

        $visits_monthly {"$yyyymm,$project,$language,$country,$bot"}  += $count ;
        $visits_daily   {"$yyyymmdd,$project,$language,$country,$bot"} += $count ;

        # following hashes for specific research, not for regular csv files
        if (($project eq "wp") && ($bot eq 'U') && ($country ne "--"))
        {
          $visits_wp_yyyymm {$yyyymm} += $count ;
          $visits_wp_total += $count ;
        }

        if (($project eq "wp") && ($language eq "en") && ($bot eq 'U') && ($country ne "--"))
        {
          $visits_total_wp_en += $count ;
          $visits_wp_en {$country} += $count ;
        }

        if (($bot eq 'U') && ($country ne "--"))
        {
          $visits_per_project  {$project}  += $count ;
          $visits_per_language {$language} += $count ;
          $visits_per_country  {$country}  += $count ;
        }

        $visits_total += $count ;

        if (($project eq "wp") && ($language =~ /^(?:th|sk)$/))
        {
          if ($bot eq 'U')
          { $visits_wp_u {"$language $yyyymm"} += $count ; }
          else
          { $visits_wp_b {"$language $yyyymm"} += $count ; }
        }
      }
      close IN ;
    }

    # scale factor for months with incomplete data, 1 = no correction
    $correct_for_missing_days {$yyyymm} = 1 ;
    if (($days_found > 0) && ($days_in_month > $days_found))
    {
      $correct_for_missing_days {$yyyymm} = $days_in_month / $days_found ;
      print "Correct for $yyyymm: $days_found -> $days_in_month = * ${correct_for_missing_days {$yyyymm}}\n" ;
    }

    $month++ ;
    if ($month > 12)
    {
      $month = 1 ;
      $year ++ ;
    }
  }

  print "\nVisits per project:\n" ;
  foreach $key (sort {$visits_per_project {$b} <=> $visits_per_project {$a} } keys %visits_per_project)
  {
    print sprintf ("%9d", $visits_per_project {$key}) . " " .sprintf ("%5.2f", 100 * $visits_per_project {$key}/$visits_total) . "% $key\n" ;
  }

  print "\n\n" ;

  print "\nVisits per country:\n" ;
  foreach $key (sort {$visits_per_country {$a} <=> $visits_per_country {$b}} keys %visits_per_country)
  {
    print sprintf ("%9d", $visits_per_country {$key}) . " " .sprintf ("%6.3f", 100 * $visits_per_country {$key}/$visits_total) . "% $key\n" ;
  }

  # keys here are "language yyyy-mm" (th and sk Wikipedia only), values are counts: sort numerically
  print "\nWikipedia visits per country:\n" ;
  foreach $key (sort {$visits_wp_u {$b} <=> $visits_wp_u {$a}} keys %visits_wp_u)
  {
    print sprintf ("%9.1f", ($visits_wp_u {$key} + $visits_wp_b {$key}) /1000) . " - " . sprintf ("%9.1f", $visits_wp_u {$key} /1000) . " - " . sprintf ("%9.1f", $visits_wp_b {$key} /1000) . " $key\n" ; # / 1000 on 1:1000 sampled file is millions
  }

  print "\nVisits per language:\n" ;
  foreach $key (sort {$visits_per_language {$a} <=> $visits_per_language {$b}} keys %visits_per_language)
  {
    print sprintf ("%9d", $visits_per_language {$key}) . " " .sprintf ("%6.3f", 100 * $visits_per_language {$key}/$visits_total) . "% $key\n" ;
  }

  print "\nVisits to English Wikipedia\n" ;
  foreach $key (sort {$visits_wp_en {$a} <=> $visits_wp_en {$b}} keys %visits_wp_en)
  {
    print sprintf ("%9d", $visits_wp_en {$key}) . " " .sprintf ("%6.3f", 100 * $visits_wp_en {$key}/$visits_total_wp_en) . "% $key\n" ;
  }

  print "\n\n" ;

  print "\n\n" ;

  # monthly csv: counts are extrapolated for months where daily files were missing
  open (CSV_MONTHLY, '>', $file_raw_data_monthly) || die "Unable to write '$file_raw_data_monthly': $!\n" ;
  foreach $key (sort keys %visits_monthly)
  {
    my ($key_yyyymm) = split (',', $key) ;
    $correction = $correct_for_missing_days {$key_yyyymm} ;
    $count = $visits_monthly {$key} ;
    if (($correction != 0) && ($correction != 1))
    { $count = sprintf ("%.0f", $count * $correction) ; }
    print CSV_MONTHLY "$key,$count\n" ;
  }
  close CSV_MONTHLY ;

  # note correct for missing days in follow processing, see monthly data above
  open (CSV_DAILY, '>', $file_raw_data_daily) || die "Unable to write '$file_raw_data_daily': $!\n" ;
  foreach $key (sort keys %visits_daily)
  { print CSV_DAILY "$key,${visits_daily{$key}}\n" ; }
  close CSV_DAILY ;

  # share of each month in all Wikipedia user visits, after correction for missing days
  foreach $yyyymm (sort keys %visits_wp_yyyymm)
  {
    $total = $visits_wp_yyyymm {$yyyymm} ;
    $correction = $correct_for_missing_days {$yyyymm} ;
    $total_corrected = $total * $correction ;
    $total_corrected_share = int (100 * $total_corrected / $visits_wp_total) ;
    print "$yyyymm: $total * $correction = $total_corrected / $visits_wp_total = $total_corrected_share\%\n" ;
  }
}
 246+
# Derive edit/save (action=edit / action=submit) statistics from a csv file with one line per
# "date,bot,from,to,status,mime,action,count" combination.
#
# Input and output file names come from globals that are not set anywhere in the visible part of
# this script ($file_raw_data, $file_csv_counts_daily_project, $file_csv_counts_daily,
# $file_txt_summary) - NOTE(review): presumably left over from an earlier version; the only
# call to this sub is commented out.
#
# Output:
#   $file_csv_counts_daily_project  per date one row with edits, saves and edits/saves ratio for each destination wiki
#   $file_csv_counts_daily          raw counts per "date,destination,action"
#   $file_txt_summary               textual summary, also printed to stdout
sub ProcessRawData
{
  print "\nProcessRawData\n\n" ;

  open (IN,  '<', $file_raw_data)                 || die "Unable to read '$file_raw_data': $!\n" ;
  open (OUT, '>', $file_csv_counts_daily_project) || die "Unable to write '$file_csv_counts_daily_project': $!\n" ;

  $date_prev = "" ;

  while ($line = <IN>)
  {
    $lines++ ;
    chomp ($line) ;
  # ($date,$bot,$from,$to,$php,$status,$mime,$action,$agent,$count) = split (',', $line) ;
    ($date,$bot,$from,$to,$status,$mime,$action,$count) = split (',', $line) ;

  # if ($to !~ /wk:lt/) { next ; }

    if ($bot =~ /^#/) { next ; } # fix, should be removed in CollectRawData

  # if ($php ne "php(index.php)") { $lines_unexpected_php {$php}++ ; next ; }

    # $action2 = action without any further url parameters
    $action2 = $action ;
    $action2 =~ s/\&.*$// ;
    $counts_per_action {"$action2"} += $count ;

    $action =~ s/\&amp;/&/g ;

    if ($action =~ /submitlogin/)
    { next ; }

    if (($action !~ /^action=edit\&/) && ($action !~ /^action=submit\&/) )
    {
      $invalid_actions ++ ;
      next ;
    }

    if ($mime ne "text/html")
    {
      $mime_not_text_html {$mime} ++ ;
      next ;
    }

    # only count edit requests with status 200 and submit requests with status 302
    if (! ((($action =~ /action=edit/)   && ($status =~ /200/)) ||
           (($action =~ /action=submit/) && ($status =~ /302/))))
    { next ; }

    $counts_per_relevant_action_and_status1 {"$action2"} += $count ;

    $counts_per_bot_relevant_action_and_status2 {"$bot,$action2,$status"} += $count ;

    if ($action !~ /redlink/)
    {
      $counts_per_relevant_action_and_status_no_redlink {"$action2,$status"} += $count ;

      $counts_per_bot_relevant_action_and_status_no_redlink {"$bot,$status,$action2"} += $count ;

      if ($bot =~ /N/)
      {
      # print "$to,$action2,$count\n" ;
        $counts_no_bot_per_relevant_action_and_status_no_redlink {"$to,$action2"} += $count ;
        $counts_no_bot_no_redlink_per_destination {$to} += $count ;
      }
    }

    if (($action =~ /redlink/) && ($status =~ /(?:200|302)/))
    {
      $counts_per_relevant_status_with_redlink {"$to,action=edit,redlink=..,$status"} += $count ;
      $counts_per_destination {$to} += $count ;
    }

    if ($action =~ /redlink/)
    { next ; }

    if (($to !~ /wp:(?:en|de|ja|es|fr|ru|zh)$/) && ($to !~ /wk:(?:lt)$/) && ($to !~ /wx:(?:mw)$/))
    { next ; }

    if ($bot !~ /N/)
    { next ; }

    $counts {"$date,$to,$action2"} += $count ;
    $dates {$date}++ ;
    $tos {$to}++ ;

    # NOTE(review): the "bot=Y" branch can not be reached, as only records with an N in
    # the bot field pass the filter above; kept for when that filter is relaxed
    if ($bot eq "bot=Y")
    {
      if ($action =~ /action=edit/)
      { $bots_edits += $count ; }
      elsif ($action =~ /action=submit/)
      { $bots_saves += $count ; }
    }
    else
    {
      if ($action =~ /action=edit/)
      { $user_edits += $count ; }
      elsif ($action =~ /action=submit/)
      { $user_saves += $count ; }
    }
  }
  close IN ;

  # header line: three columns per destination wiki
  print OUT "date," ;
  foreach $to (sort keys %tos)
  { print OUT "edits $to,saves $to,ratio $to," ; }
  print OUT "\n" ;

  foreach $date (sort keys %dates)
  {
  # print "DAY $date\n" ;
    # spreadsheet formula, expects $date as yyyymmdd
    $csv_date = "\"=DATE(" . substr ($date,0,4) . "," . substr ($date,4,2) . "," . substr ($date,6,2) . ")\"" ;

    print OUT "$csv_date, " ;

    foreach $to (sort keys %tos)
    {
    # print "TO $to\n" ;

      $edits   = $counts {"$date,$to,action=edit"} ;
      $submits = $counts {"$date,$to,action=submit"} ;
      $ratio   = -1 ; # -1 = no submits, ratio undefined
      if ($submits > 0)
      { $ratio = sprintf ("%.1f", $edits/$submits) ; }
      print OUT "$edits,$submits,$ratio," ;
    }
    print OUT "\n" ;
  }
  close OUT ;

  # Write CSV_COUNT_DAILY

  open (CSV_COUNT_DAILY, '>', $file_csv_counts_daily) || die "Unable to write '$file_csv_counts_daily': $!\n" ;
  foreach $key (sort keys %counts)
  { print CSV_COUNT_DAILY sprintf ("%6d", $counts {$key}) . ",$key\n" ; }
  close CSV_COUNT_DAILY ;

  $text = "" ;
  $text .= "\nInvalid actions: $invalid_actions\n\n" ;

  $text .= "Counts per action:\n" ;
  foreach $key (sort keys %counts_per_action)
  {
    $count = $counts_per_action {$key} ;
    if ($count < 5) { next ; }
    $text .= sprintf ("%6d", $count) . ",$key\n" ;
  }
  $text .= "\n\n" ;

  $text .= "Counts per relevant action and status:\n" ;
  foreach $key (sort keys %counts_per_relevant_action_and_status1)
  {
    $count = $counts_per_relevant_action_and_status1 {$key} ;
  # if ($count < 5) { next ; }
    $text .= sprintf ("%6d", $count) . ",$key\n" ;
  }
  $text .= "\n\n" ;

  $text .= "Counts per bot, relevant action and status:\n" ;
  foreach $key (sort keys %counts_per_bot_relevant_action_and_status2)
  {
    $count = $counts_per_bot_relevant_action_and_status2 {$key} ;
  # if ($count < 5) { next ; }
    $text .= sprintf ("%6d", $count) . ",$key\n" ;
  }
  $text .= "\n\n" ;

  $text .= "Counts per relevant action and status and no redlinks:\n" ;
  foreach $key (sort keys %counts_per_relevant_action_and_status_no_redlink)
  {
    $count = $counts_per_relevant_action_and_status_no_redlink {$key} ;
    if ($count < 5) { next ; }
    $text .= sprintf ("%6d", $count) . ",$key\n" ;
  }
  $text .= "\n\n" ;

  $text .= "Count per bot, relevant action and status and no redlink:\n" ;
  foreach $key (sort keys %counts_per_bot_relevant_action_and_status_no_redlink)
  {
    $count = $counts_per_bot_relevant_action_and_status_no_redlink {$key} ;
  # if ($count < 5) { next ; }
    $text .= sprintf ("%-33s",$key) . sprintf ("%6d", $count) . "\n" ;
  }
  $text .= "\n\n" ;

  # one line per destination wiki with 100 or more non-bot, non-redlink requests
  # each entry is prefixed with "ratio|" so the list can be sorted on ratio, prefix is removed on output
  $text .= "Counts no bot, per relevant action and status no redlink:\n" ;
  foreach $key (sort keys %counts_no_bot_per_relevant_action_and_status_no_redlink)
  {
    ($to = $key) =~ s/,.*$// ;
    if ($to !~ /:/) { next ; }
    if ($counts_no_bot_no_redlink_per_destination {$to} < 100) { next ; }
    if ($key =~ /action=edit/)
    {
      $count_edit   = $counts_no_bot_per_relevant_action_and_status_no_redlink {"$to,action=edit"} ;
      $count_submit = $counts_no_bot_per_relevant_action_and_status_no_redlink {"$to,action=submit"} ;
      $count_edits   += $count_edit ;
      $count_submits += $count_submit ;
      $ratio = '..' ;
      if ($count_submit > 0)
      { $ratio = sprintf ("%5.1f", $count_edit / $count_submit) ; }
      push @ratios, "$ratio|" . sprintf ("%-14s",$to) . "edits " . sprintf ("%6d", $count_edit) . ", submits ". sprintf ("%6d", $count_submit) . ", ratio $ratio\n" ;
    }
  }
  @ratios = sort {$b <=> $a} @ratios ;
  foreach $line (@ratios)
  {
    ($ratio, $line) = split ('\|', $line) ;
    $text .= $line ;
  }

  # no submits at all: show '..' like in the lines per wiki, rather than die on division by zero
  $ratio = '..' ;
  if ($count_submits > 0)
  { $ratio = sprintf ("%5.1f", $count_edits / $count_submits) ; }
  $text .= sprintf ("%-14s",'total') . "edits " . sprintf ("%6d", $count_edits) . ", submits ". sprintf ("%6d", $count_submits) . ", ratio $ratio\n" ;
  $text .= "\n\n" ;

  # (a stray "print $count" without semicolon stood here: together with the next statement
  #  it parsed as a print to a filehandle named by $count, so it never printed anything)

  $text .= "Count per relevant status with redlink:\n" ;
  foreach $key (sort keys %counts_per_relevant_status_with_redlink)
  {
    $count = $counts_per_relevant_status_with_redlink {$key} ;
    ($to = $key) =~ s/,.*$// ;
    if ($counts_per_destination {$to} < 100) { next ; }
    $text .= sprintf ("%6d", $count) . ",$key\n" ;
  }
  $text .= "\n\n" ;

  open (SUMMARY, '>', $file_txt_summary) || die "Unable to write '$file_txt_summary': $!\n" ;
  print SUMMARY $text ;
  close SUMMARY ;

  print $text ;
}
 476+
 477+
# Return the number of days in the given month.
# Arguments: $year (four digits), $month (1..12)
# Computed as the distance in days between the first of this month and the first of the next month (both in GMT).
sub DaysInMonth
{
  my ($year, $month) = @_ ;

  my $start_this_month = timegm (0,0,0,1,$month-1,$year-1900) ;

  # first day of following month, December rolls over into January of next year
  my ($next_year, $next_month) = ($year, $month + 1) ;
  if ($next_month > 12)
  { ($next_year, $next_month) = ($year + 1, 1) ; }
  my $start_next_month = timegm (0,0,0,1,$next_month-1,$next_year-1900) ;

  my $seconds_per_day = 24 * 60 * 60 ;
  return (($start_next_month - $start_this_month) / $seconds_per_day) ;
}
Index: trunk/wikistats/squids/SquidCountArchive.sh
@@ -0,0 +1,13 @@
#!/bin/bash
# Run SquidCountArchive.pl at reduced priority for a fixed range of dates.

# cap virtual memory for this shell and the perl job it starts
ulimit -v 4000000

base_dir="/a/ezachte"
log_file="${base_dir}/SquidCountArchiveLog.txt"
perl_job="${base_dir}/SquidCountArchive.pl"

# start with a log that holds one empty line
echo "" > "${log_file}"

nice perl "${perl_job}" -d 2011/02/07-2011/02/11

# signal completion in the log as well as on stdout
echo "Ready" | tee -a "${log_file}"
Property changes on: trunk/wikistats/squids/SquidCountArchive.sh
___________________________________________________________________
Added: svn:eol-style
115 + native
Index: trunk/wikistats/squids/SquidCountArchive.pl
@@ -0,0 +1,1030 @@
#!/usr/bin/perl

# Main script: for each day in the requested range, collect counts from the squid logs in two phases
# phase 1 = collect IP frequencies, phase 2 = collect all other counts

  use lib "/home/ezachte/lib" ;
  use EzLib ;

  $trace_on_exit = $true ;
  ez_lib_version (13) ;

  use SquidCountArchiveProcessLogRecord ;
  use SquidCountArchiveReadInput ;
  use SquidCountArchiveWriteOutput ;

  # set defaults mainly for tests on local machine
  default_argv "-d 2010/05/10" ;

# http://wikitech.wikimedia.org/view/Squid_log_format
# 1. Hostname
# 2. Sequence number
# 3. Current time in ISO 8601 format (plus milliseconds), according to the squid server's clock
# 4. Request time in ms
# 5. Client IP
# 6. Squid request status, HTTP status code
# 7. Reply size including HTTP headers
# 8. Request method (GET/POST etc)
# 9. URL
# 10. Squid hierarchy status, peer IP
# 11. MIME content type
# 12. Referer header
# 13. X-Forwarded-For header
# 14. User-Agent header

# valid parameters:
# parm -d m[-n] (last m|n days before today) or yyyymmdd[-yyyymmdd] or yyyy/mm/dd[-yyyy/mm/dd]
# parm -f [1|2|12] force phase 1 and/or 2 even when already run successfully earlier
# phase 1 = collect IP frequency counts, this is first pass through data (there is little chance this needs to be redone, hence default is no overwrite)
# phase 2 = collect other counts, this may have to be redone after filtering logic has changed
# parm -t test mode

# todo: parm -e use unsampled file with all edits and saves
# todo: parm -r root folder

  $test = $false ;
  $test_maxlines = 4000000 ;

  if (! $job_runs_on_production_server)
  {
    $test = $true ;
    $file_test = "w:/# Out Locke/sampled-1000-log-20100510b.txt" ;
  # $file_test = getcwd . "/SquidDataFilterFY.txt" ;
    if (! -e $file_test)
    { abort "Test input file '$file_test' not found" ; }
  }

  $time_start = time ;

  if ($job_runs_on_production_server)
  { $path_root = "/a/ezachte" ; }
  else
  { $path_root = "w:/! perl/squids/archive/test" ; }

  # user agent fragments that mark a request as coming from a mobile device
  $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Palm Pre|Playstation|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|Wikiamo|Wikipanion" ;
  $tags_mobile_upd = "May 2010" ;

  # regexp fragments to match what precedes a domain name (subdomains) and what follows it (top level domain)
  $pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ;
  $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ;

  my (%squid_seqno_lo, %squid_seqno_hi) ;

  my ($from_days_ago, $till_days_ago, $from_date, $till_date) = &ParseArguments ;
  &SetFileNames ;

  # process oldest day first
  my ($path_out, $path_out_month) ;
  for ($days_ago = $from_days_ago ; $days_ago >= $till_days_ago ; $days_ago--)
  {
    if ($days_to_process ++ > 0)
    { print "\n" . "=" x 80 . "\n" ; }
    ($path_out, $path_out_month) = &SetPathOut ($days_ago) ;

    # debug output files: report when these can not be opened, but do continue,
    # as without them only the debug output is lost
    open (OUT,  '>', "$path_out/$file_out")  || print "Unable to open '$path_out/$file_out' for writing: $!\n" ;
    open (OUT2, '>', "$path_out/$file_out2") || print "Unable to open '$path_out/$file_out2' for writing: $!\n" ;
    open (ERR,  '>', "$path_out/$file_err")  || print "Unable to open '$path_out/$file_err' for writing: $!\n" ;
  # open FILTER_FY, '>>', "$path_out_month/$file_filter_fy" ;

    my $do_phase1 = &CheckProcessPhase1 ($days_ago, $path_out) ; # Collect IP frequencies
    my $do_phase2 = &CheckProcessPhase2 ($days_ago, $path_out) ; # collect other data

    next if ! $do_phase1 and ! $do_phase2 ;

    &InitGlobals ;
    undef @files ; # keep out of InitGlobals, to allow rerun with same files, see 'test InitGlobals' below

    ($date_collect_files, $time_to_start, $time_to_stop) = &SetTimeRangeToProcess ($days_ago) ;

    $all_files_found = &CollectFilesToProcess ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month) ;
    next if not $all_files_found ;

    if ($do_phase1) # Collect IP frequencies
    { &ProcessPhase1 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, @files) ; }

    if ($do_phase2) # collect other data
    { &ProcessPhase2 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month, @files) ; }

    # test InitGlobals: rebuild files in alternate folder, if InitGlobals did its work, all files are binary equal
    # &InitGlobals ;
    # if ($do_phase2) # collect other data
    # { &ProcessPhase2 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out. 'b', $path_out_month, @files) ; }

    close OUT ;
    close OUT2 ;
    close ERR ;
  # close FILTER_FY ;
  }

# if (defined ($options {"u"})) # all lines with action=edit or action=submit generated in mode scan_squid_archive
# { &ScanEditsSavesFile ; } # also use to build ScanDataCountriesSaves.csv for earlier months from SquidDataEditsSavesyyyy-mm-dd.txt.bz2
# else
# {
# if (defined ($options {"a"})) # scan ip addresses only (find multiple occurrences, store for reuse)
# {
# $scan_ip_frequencies = $true ;
# print "Scan for multiple occurrences of ip addresses\n\n" ;
# }
# elsif (defined ($options {"s"})) # scan squid sequence numbers
# {
# $scan_squid_msg_sequence_numbers = $true ;
# print "Scan for squid sequence numbers\n\n" ;
# }
# else
# {
# $scan_all_fields = $true ;
# print "Scan all fields\n\n" ;
# }

# &ScanSquidArchive ;
# }

# &ProcessSquidSequenceNumbers ;

  print "\n\nReady\n\n" ;
  exit ;
 142+
# Parse and validate command line arguments.
#   -d date range, either as dates  yyyymmdd[-yyyymmdd] or yyyy/mm/dd[-yyyy/mm/dd],
#                  or as days before today  mmm[-nnn]; second value defaults to the first
#   -f phases to force: empty, 1, 2, 12 or 21
#   -t test mode
# Sets globals $date_range, $force_phases, $from_year/$from_month/$from_day and
# $till_year/$till_month/$till_day (only when dates were specified), and $test (only when -t is given).
# Returns ($from_days_ago, $till_days_ago, $from_date, $till_date), where $from_days_ago >= $till_days_ago.
# Aborts on invalid input.
sub ParseArguments
{
  trace ParseArguments ;

  my %options ;

  getopt ("df", \%options) ;

  $date_range   = $options {"d"} ;
  $force_phases = $options {"f"} ;

  if ($force_phases !~ /^(?:|1|2|12|21)$/)
  { abort "Invalid data for -f parameter: specify which phases to force as -f [1|2|12]\nForce = execute phase even when already done succesfully earlier\nPhase1 = collect ip counts\nPhase2 = collect other counts\n" ; }

  if ($date_range eq '')
  { abort "No valid date range specified\n\nSpecify first and last day to process as:\n'-d yyyymmdd[-yyyymmdd]' (yymmdd or yyyy/mm/dd, " .
          "second date defaults to first)\nor\n'-d mmm[-nnn]', where mmm and nnn are days before today (mmm less or equal to nnn), nnn defaults to mmm\n\n" ; }

  if ($date_range =~ m/^\d{4}\/?\d{2}\/?\d{2}(?:\-\d{4}\/?\d{2}\/?\d{2})?$/) # specify daterange as yyyymmdd-yyyymmdd or yyyy/mm/dd-yyyy/mm/dd
  {
    if ($date_range =~ /^\d{4}\/?\d{2}\/?\d{2}$/) # expand shorthand version
    { $date_range =~ s/^(\d{4}\/?\d{2}\/?\d{2})$/$1-$1/ ; }

    ($from_date,$till_date) = split '-', $date_range ;

    # take year, month and day from a copy without slashes,
    # so that the fixed offsets are right for yyyymmdd as well as yyyy/mm/dd
    (my $from_digits = $from_date) =~ s/\///g ;
    (my $till_digits = $till_date) =~ s/\///g ;

    $from_year  = substr ($from_digits,0,4) ;
    $from_month = substr ($from_digits,4,2) ;
    $from_day   = substr ($from_digits,6,2) ;

    $till_year  = substr ($till_digits,0,4) ;
    $till_month = substr ($till_digits,4,2) ;
    $till_day   = substr ($till_digits,6,2) ;

    $from_days_ago = ValidateDateAndCalcDaysAgo ('from date', $from_date) ;
    $till_days_ago = ValidateDateAndCalcDaysAgo ('till date', $till_date) ;

    my $diff_days = ($from_days_ago - $till_days_ago) + 1 ;
    if ($till_days_ago > $from_days_ago)
    { abort "Invalid date range: from date '$from_date' is later than till date '$till_date'\n" ; }

    # show date format as used on the command line
    $yyyymmdd = 'yyyy/mm/dd' ;
    if ($from_date !~ /\//)
    { $yyyymmdd =~ s/\///g ; }
    print "Process following date range:\nFrom '$from_date' till '$till_date' ($yyyymmdd)\nWhich is from $from_days_ago till $till_days_ago days ago = $diff_days days\n" ;
  }
  elsif ($date_range =~ /^\d{1,3}(?:-\d{1,3})?$/) # specify daterange as mmm-nnn (where mmm and nnn are number of days before today), nnn defaults to mmm
  {
    if ($date_range =~ /^\d+$/) # expand shorthand version
    { $date_range =~ s/^(\d+)$/$1-$1/ ; }

    ($from_days_ago,$till_days_ago) = split '-', $date_range ;

    if ($till_days_ago > $from_days_ago) # swap
    # { abort "Invalid date range: from date '$from_date' is later than till date '$till_date'\n" ; }
    { my $temp = $till_days_ago ; $till_days_ago = $from_days_ago ; $from_days_ago = $temp ; }

    # translate days ago into dates, for display and for the return value
    ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;
    ($year,$month,$day) = &ShiftDays ($year+1900, $month+1, $day, - $from_days_ago) ;
    $from_date = sprintf ("%04d/%02d/%02d",$year,$month,$day) ;

    ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;
    ($year,$month,$day) = &ShiftDays ($year+1900, $month+1, $day, - $till_days_ago) ;
    $till_date = sprintf ("%04d/%02d/%02d",$year,$month,$day) ;

    my $diff_days = ($from_days_ago - $till_days_ago) + 1 ;
    print "Process following date range:\nFrom $from_days_ago till $till_days_ago days ago, which is:\nFrom '$from_date' till '$till_date' (yyyy/mm/dd) = $diff_days days\n" ;
  }
  else
  { abort "\nNo valid date range specified!\n\nSpecify first and last day to process as:\n'-d yyyymmdd[-yyyymmdd]' (yyyy/m/dd also valid)\n" .
          "(second date defaults to first)\nor\n'-d mmm[-nnn]', where mmm and nnn are days before today (mmm =< nnn), nnn defaults to mmm\n\n" ; }

  if ($options {"t"})
  {
    $test = $true ;
    print "Run in test mode: process less input\n" ;
  }

  return ($from_days_ago, $till_days_ago, $from_date, $till_date) ;
}
 222+
# Check that $date is a valid calendar date and return how many days before today it is.
# Arguments:
#   $desc  description of the date for use in error messages, e.g. 'from date'
#   $date  date as yyyymmdd or yyyy/mm/dd (year 20xx)
# Returns the whole number of days between $date and today (both taken at midnight local time).
# Aborts when the date is malformed, does not exist, is not before today or is more than 366 days ago.
sub ValidateDateAndCalcDaysAgo
{
  trace ValidateDateAndCalcDaysAgo ;

  my ($desc, $date) = @_ ;

  my ($sec,$min,$hour,$day,$month,$year) ;
  ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;

  # today's date for messages, in the same notation (with or without slashes) as the input
  my $date_today = sprintf ("%4d/%02d/%02d", $year+1900,$month+1,$day) ;
  if ($date !~ /\//)
  { $date_today =~ s/\///g ; }

  if ($date =~ m!^(20\d\d)/?(0[1-9]|1[012])/?(0[1-9]|[12][0-9]|3[01])$!)
  {
    # At this point, $1 holds the year, $2 the month and $3 the day of the date entered
    $year  = $1 ;
    $month = $2 ;
    $day   = $3 ;

    if ($day == 31 and ($month == 4 or $month == 6 or $month == 9 or $month == 11))
    { abort "$desc '$date': 31st of a month with 30 days" ; }
    elsif ($day >= 30 and $month == 2)
    { abort "$desc '$date': February 30th or 31st" ; }
    elsif ($month == 2 and $day == 29 and not ($year % 4 == 0 and ($year % 100 != 0 or $year % 400 == 0)))
    { abort "$desc '$date': February 29th outside a leap year" ; }
    else { ; } # valid date
  }
  else { abort "$date: not valid date format: use yyyymmdd or yyyy/mm/dd" ; }

  # input date: month is 1-based and year has four digits, so convert for timelocal
  my $time_input = timelocal (0,0,0,$day, $month-1, $year-1900) ;

  # today: values straight from localtime are already in the form timelocal expects
  ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;
  my $time_today = timelocal (0,0,0,$day, $month, $year) ;

  # Two local midnights are not a multiple of 24 hours apart when a daylight saving
  # switch falls in between, so round to whole days.
  my $days_ago = sprintf ("%.0f", ($time_today - $time_input) / (24 * 60 * 60)) + 0 ;

  if ($days_ago < 1)
  { abort "$desc '$date' should be before today which is $date_today" ; }

  if ($days_ago > 366)
  { abort "$desc '$date' should be a year or less ago (but before today: '$date_today')" ; }

  return ($days_ago) ;
}
 267+
# Set the (global) names of all output files, relative to the output folder for a day or month.
# Files in private/ are flagged 'DoNotPublish' in their name, csv files in public/ are not.
# Also resets $path_out to an empty string.
sub SetFileNames
{
  trace SetFileNames ;

  # debug output
  $file_out  = "private/DebugSquidDataOutDoNotPublish.txt" ;
  $file_out2 = "private/DebugSquidDataOutDoNotPublish2.txt" ;
  $file_err  = "private/DebugSquidDataErrDoNotPublish.txt" ;

  $file_ip_frequencies     = "private/SquidDataIpFrequenciesDoNotPublish.csv" ;
  $file_ip_frequencies_bz2 = "private/SquidDataIpFrequenciesDoNotPublish.csv.bz2" ;
  $file_out_referers       = "private/SquidDataReferersDoNotPublish.txt" ;
  $file_edits_saves        = "private/SquidDataEditsSavesDoNotPublish.txt" ;

  $file_csv_agents          = "public/SquidDataAgents.csv" ;
  $file_csv_banners         = "public/SquidDataBanners.csv" ;
  $file_csv_binaries        = "public/SquidDataBinaries.csv" ;
  $file_csv_clients         = "public/SquidDataClients.csv" ;
  $file_csv_clients_by_wiki = "public/SquidDataClientsByWiki.csv" ; # request Howie
  $file_csv_countries_views = "public/SquidDataCountriesViews.csv" ; # was SquidDataCountries2.csv
  $file_csv_countries_timed = "public/SquidDataCountriesViewsTimed.csv" ; # was SquidDataCountriesTimed2.csv
  $file_csv_countries_saves = "public/SquidDataCountriesSaves.csv" ;
  $file_csv_bots            = "public/SquidDataCrawlers.csv" ;
  $file_csv_extensions      = "public/SquidDataExtensions.csv" ;
  $file_csv_googlebots      = "public/SquidDataGoogleBots.csv" ;
  $file_csv_images          = "public/SquidDataImages.csv" ;
  $file_csv_indexphp        = "public/SquidDataIndexPhp.csv" ;
  $file_csv_languages       = "public/SquidDataLanguages.csv" ;
  $file_csv_methods         = "public/SquidDataMethods.csv" ;
  $file_csv_opsys           = "public/SquidDataOpSys.csv" ;
  $file_csv_origins         = "public/SquidDataOrigins.csv" ;
  $file_csv_requests        = "public/SquidDataRequests.csv" ;
  $file_csv_requests_wap    = "public/SquidDataRequestsWap.csv" ;
  $file_csv_requests_m      = "public/SquidDataRequestsM.csv" ; # .m. in url, not mobile as derived from agent
  $file_csv_scripts         = "public/SquidDataScripts.csv" ;
  $file_csv_search          = "public/SquidDataSearch.csv" ;
  $file_csv_skins           = "public/SquidDataSkins.csv" ;

  $file_seqno_per_squidhour = "SquidDataSequenceNumbersPerSquidHour.csv" ;
  $file_seqno_all_squids    = "SquidDataSequenceNumbersAllSquids.csv" ;
  # NOTE(review): $file_head_tail used to be set twice, first to "public/SquidDataLogFilesHeadTail.csv",
  # then to the name below. Only the last assignment ever took effect, so that one is kept.
  # If the file was meant to go into public/ this is the line to change.
  $file_head_tail           = "SquidDataLogFilesHeadTail.csv" ;
# $file_filter_fy = "SquidDataFilterFY.txt" ;

  $path_out = "" ;
}
 313+
 314+sub SetPathOut
 315+{
 316+ trace SetPathOut ; # to keep trace tidy , do this at end of routine
 317+
 318+ my $days_ago = shift ;
 319+ my ($path_out, $path_out_month) ;
 320+
 321+ ($sec,$min,$hour,$day,$month,$year) = localtime ($time_start - $days_ago * 24 * 3600) ;
 322+
 323+ $path_out = sprintf ("%04d-%02d", $year+1900, $month+1) ;
 324+
 325+ $path_out = "$path_root/$path_out" ;
 326+ $path_out_month = $path_out ;
 327+
 328+ if (! -d $path_out)
 329+ {
 330+ # print "mkdir $path_out\n" ;
 331+ mkdir ($path_out) || die "Unable to create directory $path_out\n" ;
 332+ }
 333+
 334+ $path_out .= "/" . sprintf ("%04d-%02d-%02d", $year+1900, $month+1, $day) ;
 335+ if (! -d $path_out)
 336+ {
 337+ # print "mkdir $path_out\n" ;
 338+ mkdir ($path_out) || die "Unable to create directory $path_out\n" ;
 339+ # print "mkdir $path_out/private\n" ;
 340+ mkdir ("$path_out/private") || die "Unable to create directory $path_out/private\n" ;
 341+ # print "mkdir $path_out/public\n" ;
 342+ mkdir ("$path_out/public" ) || die "Unable to create directory $path_out/public\n" ;
 343+ }
 344+
 345+ # clean up obsolete signal files
 346+ $file_ready = "$path_out/\^Ready" ;
 347+ unlink $file_ready ;
 348+ $file_ready = "$path_out/\@Ready" ;
 349+ unlink $file_ready ;
 350+
 351+ trace "SetPathOut for $days_ago days ago => path_out = '$path_out'\n" ;
 352+ return ($path_out, $path_out_month) ;
 353+}
 354+
 355+sub SetTimeRangeToProcess
 356+{
 357+ my $days_ago = shift ;
 358+
 359+ my ($sec,$min,$hour,$day,$month,$year) = localtime ($time_start - $days_ago * 24 * 3600) ;
 360+ my $date_collect_files = sprintf ("%4d-%02d-%02d", $year+1900, $month+1, $day) ;
 361+ my $time_to_start = $date_collect_files . "T00:00:00" ;
 362+ my ($sec,$min,$hour,$day,$month,$year) = localtime ($time_start - ($days_ago-1) * 24 * 3600) ;
 363+ my $date_after_collect_files = sprintf ("%4d-%02d-%02d", $year+1900, $month+1, $day) ;
 364+ my $time_to_stop = $date_after_collect_files . "T00:00:00" ;
 365+# my $time_to_stop = $date_collect_files . "T23:30:00" ; # Q&D fix to process last file available
 366+
 367+ # if ($test)
 368+ # { $time_to_stop = $date_collect_files . "T00:30:00" ; }
 369+
 370+ return ($date_collect_files, $time_to_start, $time_to_stop) ;
 371+}
 372+
 373+sub CheckProcessPhase1 # Collect IP frequencies
 374+{
 375+ trace CheckProcessPhase1 ;
 376+
 377+ my ($days_ago, $path_out) = @_ ;
 378+ my $process = $true ;
 379+
 380+ my $file_ready = "$file_ip_frequencies_bz2" ;
 381+ my $path_ready = "$path_out/$file_ready" ;
 382+
 383+ if (-e $path_ready)
 384+ {
 385+ if ($force_phases !~ /1/)
 386+ {
 387+ $process = $false ;
 388+ print "File '[path_out]$file_ready' already exists => skip phase 1 (collecting ip address counts)\n" ;
 389+ }
 390+ else
 391+ { print "File '[path_out]$file_ready' already exists.\nYet force execute phase 1 (collecting ip address counts), as -f 1 has been specified\n" ; }
 392+ }
 393+ else
 394+ { print "File '[path_out]/$file_ready' not found -> process phase 1\n" ; }
 395+
 396+ return ($process) ;
 397+}
 398+
 399+sub CheckProcessPhase2 # collect other data
 400+
 401+{
 402+ trace CheckProcessPhase2 ;
 403+
 404+ my ($days_ago, $path_out) = @_ ;
 405+ my $process = $true ;
 406+
 407+ my $file_ready = "#Ready" ;
 408+ my $path_ready = "$path_out/$file_ready" ;
 409+ if (-e $path_ready)
 410+ {
 411+ if ($force_phases !~ /2/)
 412+ {
 413+ $process = $false ;
 414+ print "File '[path_out]/$file_ready' already exists => skip phase 2 (collecting counts other than ip counts)\n" ;
 415+ }
 416+ else
 417+ { print "File '[path_out]/$file_ready' already exists.\nYet force execute phase 2 (collecting counts other than ip counts), as -f 2 has been specified\n" ; }
 418+ }
 419+ else
 420+ { print "File '[path_out]/$file_ready' not found -> process phase 2\n" ; }
 421+
 422+ return ($process) ;
 423+}
 424+
 425+sub InitGlobals # qqq
 426+{
 427+ trace InitGlobals ;
 428+
 429+ undef $addresses_stored ;
 430+ undef $banner_requests_ignored ;
 431+ undef $date_prev ;
 432+ undef $fields_too_few ;
 433+ undef $fields_too_many ;
 434+ undef $googlebots ;
 435+ undef $googles ;
 436+ undef $html_pages_found ;
 437+ undef $lines_in_file ;
 438+ undef $lines_processed ;
 439+ undef $lines_this_day ;
 440+ undef $newest_time_read ;
 441+ undef $oldest_time_read ;
 442+ undef $statusses_non_tcp ;
 443+ undef $tot_mime_html ;
 444+ undef $tot_mime_html2 ;
 445+ undef $tot_origins_external_counted ;
 446+ undef $tot_referers_external ;
 447+ undef $tot_referers_internal ;
 448+ undef $unrecognized_domains ;
 449+
 450+ undef %google_bot_hits ;
 451+ undef %ip_bot_no_google ;
 452+ undef %agents_raw ;
 453+ undef %binaries ;
 454+ undef %bots ;
 455+ undef %client_ip_record_cnt ;
 456+ undef %client_ip_record_cnt_total ;
 457+ undef %clients ;
 458+ undef %clients_by_wiki ;
 459+ undef %cnt_ip_ranges ;
 460+ undef %countries ;
 461+ undef %countries_saves ;
 462+ undef %countries_timed ;
 463+ undef %countries_views ;
 464+ undef %edit_submit_filtered ;
 465+ undef %engines ;
 466+ undef %exts ;
 467+ undef %google_imposters ;
 468+ undef %googlebins ;
 469+ undef %googlebins2 ;
 470+ undef %grouped_clients ;
 471+ undef %imagesizes ;
 472+ undef %index_php ;
 473+ undef %index_php_raw ;
 474+ undef %ip_distribution ;
 475+ undef %ip_frequencies ;
 476+ undef %languages ;
 477+ undef %languages_unrecognized ;
 478+ undef %lines_read ;
 479+ undef %mobile_other ;
 480+ undef %operating_systems ;
 481+ undef %origin_simplified ;
 482+ undef %origins ;
 483+ undef %origins_external ;
 484+ undef %origins_unsimplified ;
 485+ undef %referers_internal ;
 486+ undef %requests ;
 487+ undef %scripts ;
 488+ undef %search ;
 489+ undef %skins ;
 490+ undef %squid_delta ;
 491+ undef %squid_events ;
 492+ undef %squid_seqno ;
 493+ undef %statusses ;
 494+ undef %unrecognized_domains ;
 495+ undef %wikis ;
 496+# undef @files ;
 497+};
 498+
 499+sub ProcessPhase1 # collect IP frequencies, needed for filtering probable bots in phase 2
 500+
 501+{
 502+ trace "ProcessPhase1: Collect IP frequencies" ;
 503+ my ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, @files) = @_ ;
 504+
 505+ $scan_ip_frequencies = $true ;
 506+ $scan_all_fields = $false ;
 507+
 508+ my $data_read = &ReadSquidLogFiles ($path_out, $time_to_start, $time_to_stop, @files) ;
 509+ return if not $data_read ;
 510+
 511+ &WriteOutputIpFrequencies ($path_out) ;
 512+}
 513+
 514+sub ProcessPhase2 # Collect other data
 515+{
 516+ trace "ProcessPhase2: Collect other data" ;
 517+ my ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month, @files) = @_ ;
 518+
 519+ $scan_ip_frequencies = $false ;
 520+ $scan_all_fields = $true ;
 521+
 522+ my $data_read = &ReadIpFrequencies ($path_out) ;
 523+ return if not $data_read ;
 524+
 525+ my $data_read = &ReadSquidLogFiles ($path_out, $time_to_start, $time_to_stop, @files) ;
 526+ return if not $data_read ;
 527+
 528+ &WriteOutputSquidSequenceGaps ($path_out) ;
 529+ &WriteOutputSquidLogs ($path_out) ;
 530+ &WriteOutputEditsSavesFile ($path_out) ;
 531+ &WriteOutputCountriesSaves ($path_out) ;
 532+
 533+ &WriteDiagnostics ;
 534+
 535+ if ($job_runs_on_production_server)
 536+ { &MoveAndCompressFiles ($path_out, $path_out_month, $date_collect_files) ; }
 537+
 538+
 539+ if ($job_runs_on_production_server)
 540+ {
 541+ $cmd = "echo \"Ready in \"" . ddhhmmss (time - $time_start). " > $path_out/\#Ready" ; # use in next run to test whether this day has been completely processed
 542+ `$cmd` ;
 543+ $cmd = "echo \"\nReady in \"" . ddhhmmss (time - $time_start). " >> /home/ezachte/SquidCountArchiveLog.txt\n\n" ;
 544+ `$cmd` ;
 545+ }
 546+}
 547+
 548+#sub ScanSquidArchive
 549+#{
 550+# trace ScanSquidArchive ;
 551+
 552+# $T00 = "T00:00:00" ;
 553+
 554+# ($time_to_start, $time_to_stop) = &GetSquidLogsToProcess ; # aborts if not all found
 555+
 556+# open OUT, '>', "$path_out/$file_out" ;
 557+# open OUT2, '>', "$path_out/$file_out2" ;
 558+# open ERR, '>', "$path_out/$file_err" ;
 559+
 560+# &CheckSquidLogsAlreadyProcessed ; # aborts if this is the case
 561+
 562+# if ($scan_all_fields)
 563+# { &ReadIpFrequencies ; }
 564+
 565+# &ReadSquidLogFiles ;
 566+
 567+# if (($oldest_time_read gt $time_to_start) || ($newest_time_read lt $time_to_stop))
 568+# { abort ("Log does not contain full range from $time_to_start till $time_to_stop (oldest time read $oldest_time_read, newest time read $newest_time_read)\n") unless $test ; }
 569+
 570+# print "\ncd $path_out\n" ;
 571+# chdir ($path_out) ;
 572+
 573+# &WriteOutputSquidLogs ;
 574+
 575+# if ($scan_all_fields)
 576+# { &WriteDiagnostics ; }
 577+
 578+# close OUT ;
 579+# close OUT2 ;
 580+# close ERR ;
 581+
 582+# if ($job_runs_on_production_server && $scan_all_fields)
 583+# { &MoveAndCompressFiles ($path_out, $time_to_start) ; }
 584+#}
 585+
 586+#sub GetSquidLogsToProcess
 587+#{
 588+# trace GetSquidLogsToProcess ;
 589+
 590+# my ($date_archived, $datestart, $datestop) ;
 591+
 592+# $time = time ;
 593+# my ($sec,$min,$hour,$day,$month,$year) = localtime ($time) ;
 594+
 595+# $day_today = sprintf ("%04d-%02d-%02d",$year+1900,$month+1,$day) ;
 596+# print "Date today is $day_today.\n\n" ;
 597+
 598+# if ($job_runs_on_production_server)
 599+# {
 600+# $dir_in = "/a/squid/archive" ;
 601+
 602+# if ($logdate =~ /^\d{8}$/)
 603+# {
 604+# $year = substr ($logdate,0,4) ;
 605+# $month = substr ($logdate,4,2) ;
 606+# $day = substr ($logdate,6,2) ;
 607+
 608+# $time_to_start = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ;
 609+# ($year,$month,$day) = &ShiftDays ($year, $month, $day, 1) ;
 610+# $time_to_stop = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ;
 611+# }
 612+# elsif ($logdate =~ /^-\d+$/)
 613+# {
 614+# ($sec,$min,$hour,$day,$month,$year) = localtime ($time+$logdate*24*3600) ;
 615+# $year += 1900 ;
 616+# $month += 1 ;
 617+# $time_to_start = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ;
 618+# ($year,$month,$day) = &ShiftDays ($year, $month, $day, 1) ;
 619+# $time_to_stop = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ;
 620+# }
 621+# else
 622+# {
 623+# print "No logdate specified\n" ;
 624+# exit ;
 625+# }
 626+
 627+# print "-d $logdate => Process data from $time_to_start till $time_to_stop\n\n" ;
 628+# }
 629+# else # test
 630+# {
 631+# # $time_to_start = "2009-02-05T00" ;
 632+# # $time_to_stop = "2009-02-05T23:59:59" ;
 633+# # push @files, getcwd . "/sampled-1000-oneday.txt" ;
 634+
 635+# $time_to_start = "2010-05-10T00" ;
 636+# $time_to_stop = "2010-05-10T01" ;
 637+# push @files, getcwd . "/sampled-1000-log-20100510.txt" ;
 638+
 639+# print "Job runs in test env => Process data from $time_to_start till $time_to_stop\n\n" ;
 640+# }
 641+
 642+# $some_files_found = $false ;
 643+# $full_range_found = $false ;
 644+
 645+# ($path_out, $path_out_month) = &GetPathOut ($time_to_start) ;
 646+# $path_head_tail = "$path_out_month/$file_head_tail" ;
 647+
 648+# if ($job_runs_on_production_server)
 649+# {
 650+# # file naming scheme on server: sampled-1000.log-yyyymmdd, does not mean on that day file sampled-1000.log was archived
 651+# # file can contain data for days(s) before and day (days?) after yyyymmdd, see e.g. sampled-10000.log-20090802 (days 0801-0803)
 652+# # this is confusing so start a few days earlier and check for each day:
 653+# # whether a file exists and whether it's 'head' and or 'tail' time (first last record) fall within range
 654+
 655+# # find first and last file to process that comprise all log records within date range
 656+# $year = substr ($time_to_stop,0,4) ;
 657+# $month = substr ($time_to_stop,5,2) ;
 658+# $day = substr ($time_to_stop,8,2) ;
 659+# ($year,$month,$day) = &ShiftDays ($year, $month, $day, +5) ;
 660+# $datestop = sprintf ("%4d%02d%02d", $year, $month, $day) ;
 661+
 662+# $year = substr ($time_to_start,0,4) ;
 663+# $month = substr ($time_to_start,5,2) ;
 664+# $day = substr ($time_to_start,8,2) ;
 665+
 666+# ($year,$month,$day) = &ShiftDays ($year, $month, $day, -5) ;
 667+# $datestart = sprintf ("%4d%02d%02d", $year, $month, $day) ;
 668+
 669+# $date_archived = $datestart ;
 670+# while ($date_archived lt $datestop)
 671+# {
 672+# $date_archived = sprintf ("%4d%02d%02d", $year, $month, $day) ;
 673+# ($year,$month,$day) = &ShiftDays ($year, $month, $day, +1) ;
 674+
 675+# $file = "$dir_in/sampled-1000.log-$date_archived.gz" ;
 676+
 677+# if (-e $file)
 678+# {
 679+# ($timehead,$timetail) = &GetLogRange ($file, $path_head_tail) ;
 680+
 681+# if (($timehead lt $time_to_start) && ($timetail ge $time_to_start))
 682+# {
 683+# $some_files_found = $true ;
 684+# $processfiles = $true ;
 685+# }
 686+
 687+# if ($processfiles)
 688+# {
 689+# print "$file: time range $timehead - $timetail\n" ;
 690+# push @files, $file ;
 691+# }
 692+
 693+# if (($timehead lt $time_to_stop) && ($timetail ge $time_to_stop))
 694+# {
 695+# $full_range_found = $true ;
 696+# last ;
 697+# }
 698+# }
 699+# }
 700+# }
 701+
 702+# if ($job_runs_on_production_server)
 703+# {
 704+# if (! $some_files_found)
 705+# { print "Not any file containing start time. Aborting...\n\n" ; exit ; }
 706+# if (! $full_range_found)
 707+# { print "Not all files were found. Aborting...\n\n" ; exit ; }
 708+# }
 709+
 710+# print "\n" ;
 711+# foreach $file (sort @files)
 712+# { print "Process $file\n" ; }
 713+
 714+# return ($time_to_start, $time_to_stop) ;
 715+#}
 716+
 717+#sub GetPathOut
 718+#{
 719+# my $time_to_start = shift ;
 720+
 721+# $path_out = substr ($time_to_start,0,7) ;
 722+# if ($job_runs_on_production_server)
 723+# {
 724+# $path_out = "$path_root/$path_out" ;
 725+# $path_out_month = $path_out ;
 726+# }
 727+
 728+# if (! -d $path_out)
 729+# {
 730+# mkdir ($path_out) || die "Unable to create directory $path_out\n" ;
 731+# print "mkdir $path_out\n" ;
 732+# }
 733+
 734+# $path_out .= "/" . substr ($time_to_start,0,10) ;
 735+# if (! -d $path_out)
 736+# {
 737+# mkdir ($path_out) || die "Unable to create directory $path_out\n" ;
 738+# print "mkdir $path_out\n" ;
 739+# }
 740+
 741+# # clean up obsolete signal files
 742+# $file_ready = "$path_out/\^Ready" ;
 743+# unlink $file_ready ;
 744+# $file_ready = "$path_out/\@Ready" ;
 745+# unlink $file_ready ;
 746+
 747+# return ($path_out,$path_out_month) ;
 748+#}
 749+
 750+#sub CheckSquidLogsAlreadyProcessed
 751+#{
 752+# trace CheckSquidLogsAlreadyProcessed ;
 753+
 754+# if ($scan_ip_frequencies)
 755+# {
 756+# if (-e $file_ip_frequencies)
 757+# {
 758+# print "File $path_out/$file_ip_frequencies exists -> Day already processed\nExiting ...\n" ;
 759+# exit ;
 760+# }
 761+# }
 762+# elsif ($scan_squid_msg_sequence_numbers)
 763+# {
 764+# if (-e $file_sequence_numbers)
 765+# {
 766+# print "File $path_out/$file_sequence_numbers exists -> Day already processed\nExiting ...\n" ;
 767+# exit ;
 768+# }
 769+# }
 770+# else
 771+# {
 772+# if (-e $file_ready)
 773+# {
 774+# print "File $file_ready exists -> Day already processed\nExiting ...\n" ;
 775+# exit ;
 776+# }
 777+# else
 778+# { print "File $file_ready not found -> process data\n" ; }
 779+# }
 780+#}
 781+
 782+#sub ScanEditsSavesFile
 783+#{
 784+# trace ScanEditsSavesFile ;
 785+
 786+# if ($logdate =~ /^\d{8}$/)
 787+# {
 788+# $year = substr ($logdate,0,4) ;
 789+# $month = substr ($logdate,4,2) ;
 790+# $day = substr ($logdate,6,2) ;
 791+# }
 792+# else
 793+# {
 794+# print "No (valid) logdate specified\n" ;
 795+# if ($job_runs_on_production_server)
 796+# { exit ; }
 797+# else
 798+# {
 799+# $year = 2010 ;
 800+# $month = 4 ;
 801+# $day = 01 ;
 802+# }
 803+# }
 804+
 805+# $time_to_start = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ;
 806+# ($year2,$month2,$day2) = &ShiftDays ($year, $month, $day, 1) ;
 807+# $time_to_stop = sprintf ("%04d-%02d-%02d$T00",$year2,$month2,$day2) ;
 808+
 809+# ($path_out, $path_out_month) = &GetPathOut ($time_to_start) ;
 810+
 811+# if ($job_runs_on_production_server)
 812+# { $path_out = $path_root ; }
 813+# else
 814+# {
 815+# push @files, getcwd . "/sampled-1000.log-20100401" ;
 816+# # return ;
 817+# }
 818+
 819+# $file_txt = "$path_root/" . sprintf ("%4d-%02d", $year, $month) . "/SquidDataEditsSaves" . sprintf ("%4d-%02d-%02d", $year, $month, $day) . ".txt.bz2" ;
 820+# $file_csv = "$path_root/" . sprintf ("%4d-%02d", $year, $month) . "/" . sprintf ("%4d-%02d-%02d", $year, $month, $day) . "/$file_csv_indexphp" ;
 821+# $file_csv_countries_saves = "$path_root/" . sprintf ("%4d-%02d", $year, $month) . "/" . sprintf ("%4d-%02d-%02d", $year, $month, $day) . "/$file_csv_countries_saves" ;
 822+# if (-e $file_txt)
 823+# {
 824+# &ReadInputEditsSavesFile ($file_txt) ;
 825+# &WriteOutputEditsSavesFile ($file_csv) ;
 826+# &WriteOutputCountriesSaves ($file_csv_countries_saves) ;
 827+# }
 828+# else
 829+# { print "ScanEditsSavesFile: File $file_txt not found. Aborting...\n\n" ; exit ; }
 830+#}
 831+
 832+sub ShiftDays
 833+{
 834+ my $year = shift ;
 835+ my $month = shift ;
 836+ my $day = shift ;
 837+ my $delta = shift ;
 838+
 839+ my $time = timelocal (0,0,0,$day, $month-1, $year-1900) ;
 840+ ($sec,$min,$hour,$day,$month,$year) = localtime ($time+$delta*24*3600) ;
 841+
 842+ return ($year+1900,$month+1,$day) ;
 843+}
 844+
 845+sub ExpandAbbreviation
 846+
 847+{
 848+ my $text = shift ;
 849+ # reverse (more or less) abbreviations
 850+ $text =~ s/^[\@\*]//o ;
 851+ $text =~ s/^xx:upload/upload:&nbsp;/o;
 852+ $text =~ s/^wb:/wikibooks:/o;
 853+ $text =~ s/^wk:/wiktionary:/o;
 854+ $text =~ s/^wn:/wikinews:/o;
 855+ $text =~ s/^wp:/wikipedia:/o;
 856+ $text =~ s/^wq:/wikiquote:/o;
 857+ $text =~ s/^ws:/wikisource:/o;
 858+ $text =~ s/^wv:/wikiversity:/o;
 859+ $text =~ s/^wx:/wikispecial:/o;
 860+ $text =~ s/^mw:/wikispecial:/o; # eg bugzilla
 861+ $text =~ s/:!mw/:mediawiki/o;
 862+ $text =~ s/^wm:/wikimedia:/o;
 863+ $text =~ s/:wm$/:wikimedia/o;
 864+ $text =~ s/^wmf:/foundation:/o;
 865+ $text =~ s/:www$/:portal/o;
 866+# $text =~ s/^wikispecial:(.*)$/$1:&nbsp;/o;
 867+ return ($text) ;
 868+}
 869+
 870+sub ProcessSquidSequenceNumbers
 871+{
 872+ # input has been established for tast three months of data in WriteOutputSquidLogs
 873+ # there for each day per squid and hour of day total event and total gap were established
 874+ # avg gap for all squids combined (per hour and per day) was written to this csv file
 875+ open CSV, '<', 'SquidDataSequenceNumbersAllSquids.csv' ;
 876+ while ($line = <CSV>)
 877+ {
 878+ next if $line =~ /\*/o ;
 879+ next if $line !~ /\d\d\d\d\-\d\d\-\d\d,/o ;
 880+ chomp $line ;
 881+ ($date,$hour,$events,$mean_gap) = split (',', $line) ;
 882+ $yyyy = substr ($date,0,4) ;
 883+ $mm = substr ($date,5,2) ;
 884+ $dd = substr ($date,8,2) ;
 885+ $time = timelocal (0,0,0,$dd,$mm-1,$yyyy-1900) ;
 886+ ($ss,$nn,$hh,$day,$month,$year,$wday,$yday,$isdst) = localtime($time);
 887+ $month ++ ;
 888+ $weekno = int ($yday / 7) ;
 889+ if ($weekno_start {$weekno} eq '')
 890+ { $weekno_start {$weekno} = $date ; }
 891+ $weekno_stop {$weekno} = $date ;
 892+ $events {"$weekno,$hour"} += $events ;
 893+ $totgap {"$weekno,$hour"} += $events * $mean_gap ;
 894+ $events_allday {$weekno} += $events ;
 895+ $totgap_allday {$weekno} += $events * $mean_gap ;
 896+
 897+ # to establish correction factor per month igore all days when another anomaly occurred, or after problem was fixed
 898+ # wk 23: from 6/11 till 6/16 unusually many messages got lost due to temporary slowdown of server
 899+ # (unwanted blocking process had been introduced by vector switch)
 900+ # wk 26: on 6/27 and 6/28 22 hours of data were lost after incomplete manual restart of locke
 901+ # wk 26/27: from 7/7 till 7/10 69 hours of data were lost after incomplete restart of locke after power down
 902+ # (week 27 does not stand out in the chart, squids got rebooted? <- counters were reset?)
 903+ # wk 29: 7/22 Mark stopped several secondary processes on locke,
 904+ # around 14.00 hrs GMT message loss vanished almost entirely
 905+ # After that average gap became 1003, meaning only 0.3% of messages is missing.
 906+
 907+
 908+ next if $month == 6 and (($day >= 11 and $day <= 16) or ($day >= 27 and $day <= 28)) ;
 909+ next if $month == 7 and (($day >= 7 and $day <= 10) or ($day >= 22)) ;
 910+ # these dates where data were missing or underreported are already skipped in WikiCountsSummarizeProjectCounts
 911+ # and totals are already extrapolated
 912+
 913+ $events_allmonth {$month} += $events ;
 914+ $totgap_allmonth {$month} += $events * $mean_gap ;
 915+
 916+ $weeks {$weekno} ++ ;
 917+ $months {$month} ++ ;
 918+ }
 919+ close CSV ;
 920+
 921+ open CSV, '>', 'SquidDataSequenceNumbersAllSquidsOut.csv' ;
 922+
 923+ print CSV "hour," ;
 924+ print "hour," ;
 925+ foreach $weekno (sort {$a <=> $b} keys %weeks)
 926+ {
 927+ $start = substr ($weekno_start {$weekno},5) ;
 928+ $start =~ s/-/\//go ;
 929+ $start =~ s/^0//go ;
 930+ # $stop = substr ($weekno_stop {$weekno},5) ;
 931+
 932+
 933+ print CSV "wk $weekno: ($start ..)," ;
 934+ print "wk $weekno: ($start ..)," ;
 935+ }
 936+ print "\n" ;
 937+ print CSV "\n" ;
 938+
 939+ foreach ($hour = 0 ; $hour <= 23 ; $hour++)
 940+ {
 941+ print CSV "$hour," ;
 942+ print "$hour," ;
 943+
 944+ $hour = sprintf ("%02d", $hour) ;
 945+ foreach $weekno (sort {$a <=> $b} keys %weeks)
 946+ {
 947+ $events = $events {"$weekno,$hour"} ;
 948+ $totgap = $totgap {"$weekno,$hour"} ;
 949+ $mean_gap = 0 ;
 950+ if ($events > 0)
 951+ { $mean_gap = sprintf ("%.0f", $totgap / $events ) ; }
 952+ print CSV "$mean_gap," ;
 953+ print "$mean_gap," ;
 954+ }
 955+
 956+ print "\n" ;
 957+ print CSV "\n" ;
 958+ }
 959+ print CSV "all day," ;
 960+ print "all day," ;
 961+ foreach $weekno (sort {$a <=> $b} keys %weeks)
 962+ {
 963+ $events = $events_allday {$weekno} ;
 964+ $totgap = $totgap_allday {$weekno} ;
 965+ $mean_gap = 0 ;
 966+ if ($events > 0)
 967+ { $mean_gap = sprintf ("%.0f", $totgap / $events ) ; }
 968+ print CSV "$mean_gap," ;
 969+ print "$mean_gap," ;
 970+ }
 971+
 972+ # the following yields (month, avg gap)
 973+ # 4: 1241 so assume this factor for full April: 1,000,000 / 1241 gap = x msgs, too short: y msgs = 1000 - x
 974+ # 5: 1310
 975+ # 6: 1328
 976+ # 7: 1470 so assume this factor for 22.5/days for July
 977+
 978+ print "\n\n" ;
 979+ print CSV "\n\n" ;
 980+ foreach $month (sort {$a <=> $b} keys %months)
 981+ {
 982+ print CSV "month $month," ;
 983+ print "month $month," ;
 984+ $events = $events_allmonth {$month} ;
 985+ $totgap = $totgap_allmonth {$month} ;
 986+ $mean_gap = 0 ;
 987+ if ($events > 0)
 988+ { $mean_gap = sprintf ("%.0f", $totgap / $events ) ; }
 989+ print CSV "$mean_gap\n" ;
 990+ print "$mean_gap\n" ;
 991+ }
 992+
 993+ close CSV ;
 994+}
 995+
 996+
 997+# how to detect page saves:
 998+# henbane /a/log/vu.awk: (see also Domasz' webstats collector)
 999+#
 1000+# function savemark(url, code) {
 1001+# if (url ~ /action=submit$/ && code == "TCP_MISS/302")
 1002+# return "save"
 1003+# return "-"
 1004+# }
 1005+
 1006+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/tools/counter/
 1007+# http://leuksman.com/log/2007/06/07/wikimedia-page-views/
 1008+# http://www.iplists.com/
 1009+# WHOIS http://ws.arin.net/whois/?queryinput=N%20.%20GOOGLE
 1010+# WHOIS http://tools.whois.net/index.php?fuseaction=whois.whoisbyipresults
 1011+# http://en.wikipedia.org/wiki/List_of_search_engines
 1012+
 1013+# http://en.wikipedia.org/wiki/User_agent
 1014+# http://www.texsoft.it/index.php?c=software&m=sw.php.useragent&l=it
 1015+# http://www.hyperborea.org/journal/archives/2004/06/19/whats-in-a-user-agent-string/
 1016+
 1017+# Funwebproducts
 1018+# No fun with funwebproducts http://www.networkworld.com/newsletters/web/2003/1208web2.html
 1019+
 1020+# SLCC
 1021+# Nice and easy. SLCC1 stands for Secure Licensing Commerce Client version 1.0. SLCC is the service responsible for the Windows Anytime upgrade process present in Vista and Server 2008 which allows you to upgrade Vista Home Basic to Vista Ultimate Edition, or Server 2008 Standard to Server 2008 Enterprise ad-hoc.
 1022+# SLCC is present in the browser identifier tag, the User Agent, in order to allow Microsoft update servers to offer you the tantalising and irresistible promise of an even more resource heavy version of Vista!
 1023+# J2ME
 1024+# Java 2 Micro Edition
 1025+
 1026+# Chrome Safari
 1027+# http://www.neowin.net/news/main/09/02/01/chrome-masks-as-safari-to-fool-windows-live-mail
 1028+
 1029+# Danger Hiptop
 1030+# http://en.wikipedia.org/wiki/Danger_Hiptop
 1031+
Index: trunk/wikistats/squids/SquidReportArchive.sh
@@ -0,0 +1,10 @@
 2+#! /bin/sh
 3+ulimit -v 4000000
 4+home="/a/ezachte"
 5+# perl $home/SquidReportArchive.pl -m 201007 > SquidReportArchiveLog.txt
 6+# after further automating SquidScanCountries.sh:
 7+perl $home/SquidReportArchive.pl -c 201101 >> SquidReportArchiveLog.txt # -c for per country reports
 8+perl $home/SquidReportArchive.pl -m 201101 >> SquidReportArchiveLog.txt
 9+tar -cf reports.tar /a/ezachte/*.htm
 10+bzip2 reports.tar
 11+mv reports.tar.bz2 /a/ezachte
Property changes on: trunk/wikistats/squids/SquidReportArchive.sh
___________________________________________________________________
Added: svn:eol-style
112 + native
Index: trunk/wikistats/squids/SquidReportArchive.pl
@@ -0,0 +1,6265 @@
 2+#!/usr/bin/perl
 3+
 4+ use lib "/home/ezachte/lib" ;
 5+ use EzLib ;
 6+ $trace_on_exit = $true ;
 7+ ez_lib_version (2) ;
 8+
 9+# $quarter_only = '2010 Q3' ; # if not empty filter process for this quarter only
 10+
 11+ # set defaults mainly for tests on local machine
 12+# default_argv "-m 201009 " ;
 13+ default_argv "-c " ;
 14+
 15+# $html = "<html><body bgcolor=black><table>" ;
 16+# for ($i = 4 ; $i >= 0 ; $i-=0.5)
 17+# {
 18+# ($requests,$ratio,$fill) = RatioAndFillColor1 ('',$i,4, $ratio_sqrt) ;
 19+# print sprintf ("%.1f",$i) . ": $fill\n" ;
 20+# $i2 = sprintf ("%0.1f", $i) ;
 21+# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15>&nbsp;</td><td width=50 style=\"background:$fill\">&nbsp;</td><td width=15>&nbsp;</td><td><font color=grey> $fill</font></td></tr>" ;
 22+# }
 23+# $html .= "<tr><td height=30 colspan=99>&nbsp;</td></tr>" ;
 24+# for ($i = 4 ; $i >= 0 ; $i-=0.5)
 25+# {
 26+# ($requests,$ratio,$fill) = RatioAndFillColor2 ('',$i,4, $ratio_sqrt) ;
 27+# print sprintf ("%.1f",$i) . ": $fill\n" ;
 28+# $i2 = sprintf ("%0.1f", $i) ;
 29+# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15>&nbsp;</td><td width=50 style=\"background:$fill\">&nbsp;</td><td width=15>&nbsp;</td><td><font color=grey> $fill</font></td></tr>" ;
 30+# }
 31+# $html .= "</table><body></html>" ;
 32+# open HTML, '>', 'color_range2.html' ;
 33+# print HTML $html ;
 34+# close HTML ;
 35+# exit ;
 36+
 37+#sub RatioAndFillColor1
 38+#{
 39+# my ($code, $requests,$requests_max) = @_ ;
 40+# my ($ratio,$green,$red,$blue,$fill) ;
 41+
 42+# if ($requests > $requests_max)
 43+# { $requests = $requests_max ; }
 44+
 45+# $ratio = sqrt ($requests / $requests_max) ;
 46+# if ($ratio >= 0.20)
 47+# {
 48+# $green = 180 ;
 49+# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ;
 50+# $blue = int ($green / 3) ;
 51+# }
 52+# else
 53+# {
 54+# $red = 220 ;
 55+# $green = int (0.5 + 220 * 5 * $ratio) ;
 56+# $blue = 0 ; #int ($green / 2) ;
 57+# }
 58+
 59+# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ;
 60+# $fill = lc hsv2rgb($ratio*150,0.67+$ratio*0.33,0.8-0.2*$ratio) ;
 61+
 62+# $fills {lc $code} = $fill ;
 63+# return ($requests,$ratio,$fill) ;
 64+#}
 65+
 66+#sub RatioAndFillColor2
 67+#{
 68+# my ($code, $requests,$requests_max) = @_ ;
 69+# my ($ratio,$green,$red,$blue,$fill) ;
 70+
 71+# if ($requests > $requests_max)
 72+# { $requests = $requests_max ; }
 73+
 74+# $ratio = $requests / $requests_max ;
 75+# if ($ratio >= 0.20)
 76+# {
 77+# $green = 180 ;
 78+# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ;
 79+# $blue = int ($green / 3) ;
 80+# }
 81+# else
 82+# {
 83+# $red = 220 ;
 84+# $green = int (0.5 + 220 * 5 * $ratio) ;
 85+# $blue = 0 ; #int ($green / 2) ;
 86+# }
 87+
 88+# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ;
 89+# $fill = lc hsv2rgb($ratio*150,1-$ratio*0.334,0.6) ;
 90+
 91+# $fills {lc $code} = $fill ;
 92+# return ($requests,$ratio,$fill) ;
 93+#}
 94+
 95+# to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
 96+# ReportOrigin how to handle '!error <-> other
 97+# SquidReportOrigins.htm total count<->alpha are not the same (+ skip total for "google (total)")
 98+# SquidReportOrigins.htm totals google don't match ReportMimeTypes
 99+# SquidReportOrigins.htm show internal like for mime types
 100+
 101+# cater for missing files -> different multiplier
 102+# csv file google bot hits per hour -> Stu
 103+# report for edit/submit
 104+# log.txt s -> date folder
 105+
 106+# http://www.linux.com/community/blogs/Convert-a-.svg-file-to-a-.png-in-Ubuntu.html
 107+
 108+# use CGI::Carp qw(fatalsToBrowser);
 109+# use Getopt::Std ;
 110+ use Time::Local ;
 111+ use Cwd;
 112+
 113+ $ratio_sqrt = $true ;
 114+ $ratio_linear = $false ;
 115+
 116+ getopt ("dm", \%options) ; # NOTE(review): 'use Getopt::Std' is commented out above, getopt presumably comes from EzLib - confirm
 117+
 118+ if (-d "/a/squid") # choose $path_root by environment: folder /a/squid present, host 'bayes', or anything else (local tests)
 119+ {
 120+ print "\n\nJob runs on server $hostname\n\n" ; # NOTE(review): $hostname is not set in the visible code, presumably by EzLib - confirm
 121+ $path_root = "/a/ezachte" ;
 122+ }
 123+ elsif ($hostname eq 'bayes')
 124+ {
 125+ print "\n\nJob runs on server $hostname\n\n" ;
 126+ $path_root = "/home/ezachte/wikistats/animation" ;
 127+ }
 128+ else
 129+ {
 130+ print "Job runs local for tests\n\n" ;
 131+ $path_root = "W:/! Perl/Squids/Archive/test5" ;
 132+ }
 133+ $path_in = $path_root ; # input and output start out with the same root folder
 134+ $path_out = $path_root ;
 135+
 136+ print "Path root = $path_root\n" ;
 137+
 138+ # periodically harvest updated metrics from
 139+ # 'http://en.wikipedia.org/wiki/List_of_countries_by_population'
 140+ # 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'
 141+ if (defined ($options {"w"}))
 142+ { &ReadWikipedia ; exit ; }
 143+
 144+ if (defined ($options {"c"}))
 145+ { $reportcountries = $true ; }
 146+
 147+ # date range used to be read from csv file with ReadDate, now there are daily csv files
 148+ # if the earlier method is still useful it needs to be tweaked
 149+# if (($reportmonth ne "") && ($reportmonth !~ /^\d{6}$/))
 150+
 151+ &InitProjectNames ;
 152+
 153+ if ($reportcountries)
 154+ {
 155+ $project_mode = "wp" ;
 156+
 157+ $file_csv_country_codes = "CountryCodes.csv" ;
 158+ $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
 159+
 160+ &ReadInputCountriesNames ;
 161+ &ReadInputCountriesMeta ;
 162+
 163+ &CollectRegionCounts ;
 164+
 165+ &ReportCountries ('Saves');
 166+ &ReportCountries ('Views');
 167+
 168+ exit ;
 169+ }
 170+
 171+ $reportdaysback = $options {"d"} ; # -d: days back, expected as negative number (e.g. -1)
 172+ $reportmonth = $options {"m"} ; # -m: month as yyyymm
 173+
 174+ if (($reportmonth !~ /^\d{6}$/) && ($reportdaysback !~ /^-\d+/)) # neither option has a valid value -> show usage and stop
 175+ { print "Specify month as -m yyyymm or days back as -d -[days] (e.g. -d -1 for yesterday)" ; exit ; }
 176+
 177+ if ($reportmonth =~ /^\d{6}$/) # a valid -m takes precedence over -d
 178+ { $reportmonth = substr ($reportmonth,0,4) . "-" . substr ($reportmonth,4,2) ; } # yyyymm -> yyyy-mm
 179+ else
 180+ {
 181+ ($sec,$min,$hour,$day,$month,$year) = localtime (time+$reportdaysback*86400) ; # $reportdaysback is negative here, so this is a moment in the past
 182+ $reportmonth = sprintf ("%04d-%02d",$year+1900,$month+1) ; # month that contains that moment, as yyyy-mm
 183+ }
 184+ print "Report month = $reportmonth\n" ;
 185+
 186+ $threshold_mime = 0 ;
 187+ $threshold_project = 10 ;
 188+
 189+ $file_log = "WikiReportsSampledVisitorsLog.log" ;
 190+
 191+ $file_html_crawlers = "SquidReportCrawlers.htm" ;
 192+ $file_html_methods = "SquidReportMethods.htm" ;
 193+ $file_html_origins = "SquidReportOrigins.htm" ;
 194+ $file_html_opsys = "SquidReportOperatingSystems.htm" ;
 195+ $file_html_scripts = "SquidReportScripts.htm" ;
 196+ $file_html_skins = "SquidReportSkins.htm" ;
 197+ $file_html_requests = "SquidReportRequests.htm" ;
 198+ $file_html_google = "SquidReportGoogle.htm" ;
 199+ $file_html_clients = "SquidReportClients.htm" ;
 200+
 201+# names till 2010-07-01
 202+#
 203+# $file_csv_crawlers = "SquidDataCrawlers.csv" ;
 204+# $file_csv_methods = "SquidDataMethods.csv" ;
 205+# $file_csv_origins = "SquidDataOrigins.csv" ;
 206+# $file_csv_opsys = "SquidDataOpSys.csv" ;
 207+# $file_csv_requests = "SquidDataRequests.csv" ;
 208+# $file_csv_scripts = "SquidDataScripts.csv" ;
 209+# $file_csv_google = "SquidDataSearch.csv" ;
 210+# $file_csv_skins = "SquidDataSkins.csv" ;
 211+# $file_csv_clients = "SquidDataClients.csv" ;
 212+# $file_csv_google_bots = "SquidDataGoogleBots.csv" ;
 213+# $file_csv_indexphp = "SquidDataIndexPhp.csv" ;
 214+# $file_csv_countries_languages_visited = "SquidDataCountriesLanguagesVisited.csv" ;
 215+# $file_csv_countries_timed = "SquidDataCountriesTimed.csv" ;
 216+# $file_csv_browser_languages = "SquidDataLanguages.csv" ;
 217+
 218+ $file_csv_crawlers = "public/SquidDataCrawlers.csv" ;
 219+ $file_csv_methods = "public/SquidDataMethods.csv" ;
 220+ $file_csv_origins = "public/SquidDataOrigins.csv" ;
 221+ $file_csv_opsys = "public/SquidDataOpSys.csv" ;
 222+ $file_csv_requests = "public/SquidDataRequests.csv" ;
 223+ $file_csv_scripts = "public/SquidDataScripts.csv" ;
 224+ $file_csv_google = "public/SquidDataSearch.csv" ;
 225+ $file_csv_skins = "public/SquidDataSkins.csv" ;
 226+ $file_csv_clients = "public/SquidDataClients.csv" ;
 227+ $file_csv_google_bots = "public/SquidDataGoogleBots.csv" ;
 228+ $file_csv_indexphp = "public/SquidDataIndexPhp.csv" ;
 229+ $file_csv_countries_languages_visited = "public/SquidDataCountriesViews.csv" ;
 230+ $file_csv_countries_timed = "public/SquidDataCountriesViewsTimed.csv" ;
 231+ $file_csv_browser_languages = "public/SquidDataLanguages.csv" ;
 232+
 233+ print "\n\nJob SquidReportArchive.pl\n\n" ;
 234+
 235+# if (! -d "/a/squid")
 236+# {
 237+# if (! -e $file_csv_requests) { $file_csv_requests =~ s/\./Test./ }
 238+# if (! -e $file_csv_methods) { $file_csv_methods =~ s/\./Test./ }
 239+# if (! -e $file_csv_skins) { $file_csv_skins =~ s/\./Test./ }
 240+# if (! -e $file_csv_scripts) { $file_csv_scripts =~ s/\./Test./ }
 241+# if (! -e $file_csv_opsys) { $file_csv_opsys =~ s/\./Test./ }
 242+# if (! -e $file_csv_origins) { $file_csv_origins =~ s/\./Test./ }
 243+# if (! -e $file_csv_google) { $file_csv_google =~ s/\./Test./ }
 244+# if (! -e $file_csv_crawlers) { $file_csv_crawlers =~ s/\./Test./ }
 245+# }
 246+
 247+ if (! -d "$path_root/$reportmonth")
 248+ { print "Directory not found: $path_root\/$reportmonth\n" ; exit ; }
 249+
 250+# for ($month = 4 ; $month <= 10 ; $month ++)
 251+# {
 252+# $reportmonth = "2009-" . sprintf ("%02d", $month) ;
 253+
 254+ for ($day = 1 ; $day <= 31 ; $day ++)
 255+ {
 256+# last if ($month == 10) && ($day > 24) # temp code stay with DST summer time zone for SV
 257+
 258+ $date = $reportmonth . "-". sprintf ("%02d", $day) ;
 259+ $dir = "$path_root/$reportmonth/$date" ;
 260+
 261+ if (-d $dir)
 262+ {
 263+ if (-e "$dir/#Ready")
 264+ {
 265+ if ($date_first eq "")
 266+ { $date_first = $date ; }
 267+ $date_last = $date ;
 268+ print "Process dir $dir\n" ;
 269+ push @dirs_process, $dir ;
 270+ }
 271+ else
 272+ { print "Empty or incomplete dir $dir!\n" ; }
 273+ }
 274+ else
 275+ { print "Missing dir $dir!\n" ; }
 276+ }
 277+# }
 278+ if ($#dirs_process < 0)
 279+ { print "No valid data to process.\n" ; exit ; }
 280+
 281+ $dir_reports = "$path_root/$reportmonth" ;
 282+
 283+ $google_ip_ranges = "<b>IP ranges:</b> known ip ranges for Google are 64.233.[160.0-191.255], 66.249.[64.0-95.255], 66.102.[0.0-15.255], 72.14.[192.0-255.255], <br>74.125.[0.0-255.255], " .
 284+ "209.085.[128.0-255.255], 216.239.[32.0-63.255] and a few minor other subranges</small><p>\n" ;
 285+
 286+ &OpenLog ;
 287+ &PrepHtml ;
 288+ &SetPeriod ; # now date range derived from which folders found
 289+
 290+# &ReadDate ; date range was read from csv file
 291+
 292+ foreach $dir_process (@dirs_process)
 293+ {
 294+ $days_input_found ++ ;
 295+
 296+ &ReadInputClients ;
 297+ &ReadInputCrawlers ;
 298+ &ReadInputMethods ;
 299+ &ReadInputMimeTypes ;
 300+ &ReadInputOpSys ;
 301+ &ReadInputOrigins ;
 302+ &ReadInputScripts ;
 303+ &ReadInputGoogle ;
 304+ &ReadInputSkins ;
 305+ &ReadInputIndexPhp ;
 306+ &ReadInputBrowserLanguages ;
 307+# &ReadInputCountriesTimed ;
 308+ }
 309+
 310+#&ReadCountryCodes ;
 311+
 312+ print "\nDays input = $days_input_found\n" ;
 313+ $multiplier = 1 / $days_input_found ;
 314+ print "\nMultiplier = " . sprintf ("%.4f", $multiplier) . "\n" ;
 315+
 316+#&WriteCsvCountriesTimed ;
 317+#&WriteCsvCountriesGoTo ;
 318+#exit ;
 319+
 320+ foreach $key (keys_sorted_alpha_desc %edit_submit)
 321+ { print "YYY " . sprintf ("%5d", $edit_submit {$key}) . ": $key\n" ; }
 322+
 323+ foreach $total (keys_sorted_by_value_num_desc %edit_submits)
 324+ { print "total $total: ${edit_submits {$total}} \n" ; }
 325+
 326+ print "\n\n" ;
 327+
 328+
 329+ foreach $domain (keys_sorted_by_value_num_desc %edit_submit_bot_sort)
 330+ {
 331+ $cnt = $edit_submit_bot_sort {$domain} ;
 332+
 333+ last if $cnt < 100 ;
 334+
 335+ print "DOMAIN $domain total $cnt\n" ;
 336+ foreach $key (sort keys %{$edit_submit_bot {$domain}})
 337+ { print sprintf ("%5d", $edit_submit_bot {$domain} {$key}) . ": $key\n" ; }
 338+ # { print "$key: ${edit_submit_bot {$domain} {$key}}, " ; }
 339+ print "\n" ;
 340+ }
 341+ print "\n\n" ;
 342+ foreach $agent (keys_sorted_by_value_num_desc %edit_submit_bot_agent_sort)
 343+ {
 344+ $cnt = $edit_submit_bot_agent_sort {$agent} ;
 345+
 346+ last if $cnt < 25 ;
 347+
 348+ print "AGENT $agent total $cnt\n" ;
 349+ foreach $key (sort keys %{$edit_submit_bot_agent {$agent}})
 350+ { print sprintf ("%5d", $edit_submit_bot_agent {$agent} {$key}) . ": $key\n" ; }
 351+ # { print "$key: ${edit_submit_bot {$domain} {$key}}, " ; }
 352+ print "\n" ;
 353+ }
 354+
 355+
 356+
 357+# foreach $key (keys_sorted_by_value_num_desc %edit_submit_bot_agent)
 358+# { print "AGENT: " .sprintf ("%5d", $edit_submit_bot_agent {$key}) . ": $key\n" ; }
 359+# print "\n\n" ;
 360+# foreach $key (keys_sorted_by_value_num_desc %edit_submit_subparms)
 361+# {
 362+# $count = $edit_submit_subparms {$key} ;
 363+#
 364+# last if $count < 5 ;
 365+#
 366+# ($subparm, $referer) = split (',', $key) ;
 367+# print "ZZZ " . sprintf ("%5d", $count) . ": $referer, $subparm\n" ;
 368+# }
 369+ &CalcPercentages ;
 370+ &NormalizeCounts ;
 371+ &SortCounts ;
 372+
 373+ &WriteReportClients ;
 374+ &WriteReportCrawlers ;
 375+
 376+ &WriteReportMethods ;
 377+ &WriteReportMimeTypes ;
 378+ &WriteReportOpSys ;
 379+ &WriteReportOrigins ;
 380+ &WriteReportScripts ;
 381+ &WriteReportGoogle ;
 382+ &WriteReportSkins ;
 383+ &WriteCsvGoogleBots ;
 384+ &WriteCsvBrowserLanguages ;
 385+
 386+# &WriteCsvCountriesTimed ;
 387+# &WriteCsvCountriesTargets ;
 388+ close "FILE_LOG" ;
 389+ print "\nReady\n\n" ;
 390+
 391+ if (-d "/a/squid")
 392+ {
 393+# $cmd = "tar -cf $dir_reports/$date_last\-csv.tar $dir_reports_in/*.csv | bzip2 $dir_reports/$date_last\-csv.tar" ;
 394+# print "cmd = '$cmd'\n" ;
 395+# `$cmd` ;
 396+ $cmd = "tar -cf $dir_reports/$reportmonth\-html.tar $dir_reports/*.htm | bzip2 $dir_reports/$reportmonth\-html.tar" ;
 397+ print "cmd = '$cmd'\n" ;
 398+ `$cmd` ;
 399+ }
 400+
 401+ exit ;
 402+
# Write all per country / per language reports for one kind of traffic.
# Parameter: 'Views' for page views, anything else (caller uses 'Saves') for page edits.
# Works through globals: sets the file names and titles that the readers and writers called here use.
sub ReportCountries
{
  my $mode_report = shift ;

  # part of html file names, part of csv file names, text used in titles
  ($selection, $selection2, $views_edits) =
    ($mode_report eq 'Views') ? ('PageViews', 'Visits', 'Page Views')
                              : ('PageEdits', 'Saves',  'Page Edits') ;

  # first space removed, result becomes part of a file name
  $quarter_only2 = $quarter_only ;
  $quarter_only2 =~ s/ // ;

  $file_csv_squid_counts_monthly = 'SquidData' . $selection2 . 'PerCountryMonthly.csv' ;
  $file_csv_squid_counts_daily   = 'SquidData' . $selection2 . 'PerCountryDaily.csv' ;

  $file_html_per_country_breakdown      = 'SquidReport' . $selection . 'PerCountryBreakdown.htm' ;
  $file_html_per_country_breakdown_huge = 'SquidReport' . $selection . 'PerCountryBreakdownHuge.htm' ;
  $file_html_per_country_overview       = 'SquidReport' . $selection . 'PerCountryOverview' . $quarter_only2 . '.htm' ;
  $file_html_per_country_trends         = 'SquidReport' . $selection . 'PerCountryTrends.htm' ;
  $file_html_per_language_breakdown     = 'SquidReport' . $selection . 'PerLanguageBreakdown.htm' ;
  $file_csv_per_country_overview        = 'SquidReport' . $selection . 'PerCountryOverview.csv' ;

  # both input files must exist
  $path_csv_squid_counts_monthly = "$path_in/$file_csv_squid_counts_monthly" ;
  abort ("Input file $path_csv_squid_counts_monthly not found!") unless -e $path_csv_squid_counts_monthly ;

  $path_csv_squid_counts_daily = "$path_in/$file_csv_squid_counts_daily" ;
  abort ("Input file $path_csv_squid_counts_daily not found!") unless -e $path_csv_squid_counts_daily ;

  &ReadInputCountriesMonthly ($project_mode) ;
  &ReadInputCountriesDaily ($project_mode) ;

  &PrepHtml ;

  $title_main = "Wikimedia Traffic Analysis Report" ;

  # navigation line with links to all reports of this set
  $links = join ('',
           "<p>&nbsp;Also: <b>$views_edits Per Country</b> - ",
           "<a href='$file_html_per_country_overview'>Overview</a> / ",
           "<a href='$file_html_per_country_breakdown'>Breakdown</a> / ",
           "<a href='$file_html_per_country_trends'>Trends</a>,&nbsp;&nbsp;&nbsp;&nbsp;",
           "<b>$views_edits Per Wikipedia Language - </b> ",
           "<a href='$file_html_per_language_breakdown'>Breakdown</a>") ;

  $title = "$title_main - Wikipedia $views_edits Per Country - Overview" ;
  &WriteReportPerCountryOverview ($title, $views_edits, &UnLink ($links,1)) ;

  # The breakdown report is written twice, with different cutoffs.
  # The assignments inside the argument lists also set the globals of those names.
  $title = "$title_main - Wikipedia $views_edits Per Country - Breakdown" ;
  &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 100, $cutoff_percentage = 1, $show_logcount = $false) ;
  &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 10, $cutoff_percentage = 0.1, $show_logcount = $true) ;

  $title = "$title_main - Wikipedia $views_edits Per Country - Trends" ;
  &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,3)) ;

  # drop everything from the first comma on
  $links =~ s/,.*$// ;
  $title = "$title_main - $views_edits Per Wikipedia Language - Breakdown" ;
  &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,4)) ;
}
 475+
# Derive the sample period from the first line of the crawlers csv file in $dir_process.
# That line must contain two timestamps yyyy-mm-dd or yyyy-mm-ddThh (from, till).
# Sets globals $timefrom, $timetill (epoch seconds, UTC), $timespan (hours), $multiplier
# (factor from sample period to one day), $period (text), and replaces DATE in $header.
# Aborts when the file can not be opened or holds no valid, non-empty date range.
sub ReadDate
{
  my $file_csv = "$dir_process/$file_csv_crawlers" ;
  open my $fh_crawlers, '<', $file_csv
    or abort ("Function ReadDate: file $file_csv could not be opened: $!\n") ;
  $line = <$fh_crawlers> ;
  close $fh_crawlers ;

  $line = "" if ! defined $line ; # empty file
  chomp ($line) ;

  # Take the first two timestamps from the line. Captures are used rather than an in-place
  # substitution: a line without timestamps then yields nothing, instead of being split as is.
  my $timestamp = qr/\d\d\d\d\-\d\d\-\d\d(?:T\d\d)?/ ;
  ($timefrom, $timetill) = ("", "") ;
  if ($line =~ /($timestamp).*?($timestamp)/)
  { ($timefrom, $timetill) = ($1, $2) ; }

  if (($timefrom eq "") || ($timetill eq ""))
  { abort ("$file_csv_crawlers does not contain valid date range on first line\n") ; }

  $yearfrom  = substr ($timefrom,0,4) ;
  $monthfrom = substr ($timefrom,5,2) ;
  $dayfrom   = substr ($timefrom,8,2) ;
  $hourfrom  = (length ($timefrom) > 10) ? substr ($timefrom,11,2) : 0 ; # hour is optional

  $yeartill  = substr ($timetill,0,4) ;
  $monthtill = substr ($timetill,5,2) ;
  $daytill   = substr ($timetill,8,2) ;
  $hourtill  = (length ($timetill) > 10) ? substr ($timetill,11,2) : 0 ; # hour is optional

  $period = sprintf ("%d %s %d %d:00 - %d %s %d %d:00", $dayfrom, month_english_short ($monthfrom-1), $yearfrom, $hourfrom, $daytill, month_english_short ($monthtill-1), $yeartill, $hourtill) ;

  $timefrom = timegm (0,0,$hourfrom,$dayfrom,$monthfrom-1,$yearfrom-1900) ;
  $timetill = timegm (0,0,$hourtill,$daytill,$monthtill-1,$yeartill-1900) ;

  # an empty period would divide by zero below
  if ($timetill <= $timefrom)
  { abort ("$file_csv_crawlers: sample period on first line is empty or reversed\n") ; }

  $timespan   = ($timetill - $timefrom) / 3600 ;
  $multiplier = (24 * 3600) / ($timetill - $timefrom) ;
  print "Multiplier = $multiplier\n" ;
  $header =~ s/DATE/Daily averages, based on sample period: $period (yyyy-mm-dd)/ ;
}
 508+
# Derive the sample period from the first and last day for which data were found
# ($date_first, $date_last, both yyyy-mm-dd).
# Sets globals $timefrom, $timetill (epoch seconds, UTC; $timetill is the end of the last day),
# $timespan (hours), $multiplier (factor from sample period to one day), $period (text),
# and replaces DATE in $header.
sub SetPeriod
{
  ($year_first, $month_first, $day_first) =
    (substr ($date_first,0,4), substr ($date_first,5,2), substr ($date_first,8,2)) ;

  ($year_last, $month_last, $day_last) =
    (substr ($date_last,0,4), substr ($date_last,5,2), substr ($date_last,8,2)) ;

  $timefrom = timegm (0,0,0,$day_first,$month_first-1,$year_first-1900) ;
  $timetill = 86400 + timegm (0,0,0,$day_last,$month_last-1,$year_last-1900) ; # start of last day + 1 day (in seconds)

  my $seconds_sampled = $timetill - $timefrom ;
  $timespan   = $seconds_sampled / 3600 ;
  $multiplier = (24 * 3600) / $seconds_sampled ;

  $period = sprintf ("%d %s %d - %d %s %d",
                     $day_first, month_english_short ($month_first-1), $year_first,
                     $day_last,  month_english_short ($month_last-1),  $year_last) ;

  $header =~ s/DATE/Daily averages, based on sample period: $period/ ;
  print sprintf ("Sample period: %s => for daily averages multiplier = %.2f\n", $period, $multiplier) ;
}
 529+
# Prepare the html fragments shared by all reports. Takes no arguments; sets globals only:
#   $header    - start of every html page. Contains the upper case placeholders TITLE, HEADER,
#                DATE, X1000 and ALSO. DATE is filled in by SetPeriod / ReadDate;
#                presumably the report writers replace the others (not visible here).
#   $colophon  - end of every html page
#   $dummy_xxx - menu item without link, $link_xxx - same menu item with link
# Fixed compared to earlier version:
#   the doctype read 'FILE_HTML' instead of 'HTML' (not a valid declaration),
#   css property 'font-width' does not exist, 'font-weight' makes the th text bold.
sub PrepHtml
{
  $language = "en" ;
  $header = "<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN' 'http://www.w3.org/TR/html4/loose.dtd'>\n" .
            "<html lang='en'>\n" .
            "<head>\n" .
            "<title>TITLE</title>\n" .
            "<meta http-equiv='Content-type' content='text/html; charset=iso-8859-1'>\n" .
            "<meta name='robots' content='index,follow'>\n" .
            "<script language='javascript' type='text/javascript' src='../WikipediaStatistics13.js'></script>\n" .
            "<style type='text/css'>\n" .
            "<!--\n" .
            "body {font-family:arial,sans-serif; font-size:12px }\n" .
            "h2 {margin:0px 0px 3px 0px; font-size:18px}\n" .
            "table {font-size:12px ;}\n" .
            "td {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top}\n" .
            "th {white-space:nowrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top ; font-weight:bold}\n" .
            "th.small {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:11px ; vertical-align:top ; font-weight:bold}\n" .
            "td.hl {text-align:left;}\n" .
            "td.hr {text-align:right;}\n" .
            "td.r {text-align:right; border: inset 1px #FFFFFF}\n" .
            "td.c {text-align:center; border: inset 1px #FFFFFF}\n" .
            "td.l {text-align:left; border: inset 1px #FFFFFF}\n" .
            "th.c {text-align:center; border: inset 1px #FFFFFF}\n" .
            "th.l {text-align:left; border: inset 1px #FFFFFF}\n" .
            "th.lh3 {text-align:left; border: inset 1px #FFFFFF ; font-size:14px}\n" .
            "a:link { color:blue;text-decoration:none;}\n" .
            "a:visited {color:#0000FF;text-decoration:none;}\n" .
            "a:active {color:#0000FF;text-decoration:none;}\n" .
            "a:hover {color:#FF00FF;text-decoration:underline}\n" .
            "-->\n" .
            "</style>\n" .
            "<body bgcolor='\#FFFFDD'>\n<table width=100%>\n<tr><td class=hl>\n<h2>HEADER</h2>\n<b>DATE</b>\n</td>\n<td class=hr>" .
            "<input type='button' value=' Archive ' onclick='window.location=\"http://stats.wikimedia.org/archive/squid_reports\"'> " .
            "<input type='button' value=' Wikimedia Statistics ' onclick='window.location=\"http://stats.wikimedia.org\"'>" .
            "</td></tr>\n</table><hr>" .
            "&nbsp;This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ;

  # to be localized some day like any reports
  $out_license   = "All data and images on this page are in the public domain." ;
  $out_generated = "Generated on " ;
  $out_author    = "Author" ;
  $out_mail      = "Mail" ;
  $out_site      = "Web site" ;
  $out_myname    = "Erik Zachte" ;
  $out_mymail    = "ezachte@### (no spam: ### = wikimedia.org)" ;
  $out_mysite    = "http://infodisiac.com/" ;

  # NOTE(review): </small> is closed here but not opened in $header or $colophon;
  # presumably the report writers open it - confirm.
  $colophon = "<p>\n" .
              $out_generated . date_time_english (time) . "\n<br>" .
              $out_author . ":" . $out_myname .
              " (<a href='" . $out_mysite . "'>" . $out_site . "</a>)\n<br>" .
              "$out_mail: $out_mymail<br>\n" .
              "$out_license" .
              "</small>\n" .
              "</body>\n" .
              "</html>\n" ;

  # menu items without link
  $dummy_requests = "Requests <font color=#808080>by destination</font> or " ;
  $dummy_origins  = "<font color=#000060>by origin</font>" ;
  $dummy_methods  = "<font color=#000060>Methods</font>" ;
  $dummy_scripts  = "<font color=#000060>Scripts</font>" ;
  $dummy_skins    = "<font color=#000060>Skins</font>" ;
  $dummy_crawlers = "<font color=#000060>Crawlers</font>" ;
  $dummy_opsys    = "<font color=#000060>Op.Sys.</font>" ;
  $dummy_browsers = "<font color=#000060>Browsers</font>" ;
  $dummy_google   = "<font color=#000060>Google</font>" ;

  # menu items with link to the report
  $link_requests = "Requests <a href='$file_html_requests'>by destination</a> or " ;
  $link_origins  = "<a href='$file_html_origins'>by origin</a>" ;
  $link_methods  = "<a href='$file_html_methods'>Methods</a>" ;
  $link_scripts  = "<a href='$file_html_scripts'>Scripts</a>" ;
  $link_skins    = "<a href='$file_html_skins'>Skins</a>" ;
  $link_crawlers = "<a href='$file_html_crawlers'>Crawlers</a>" ;
  $link_opsys    = "<a href='$file_html_opsys'>Op.Sys.</a>" ;
  $link_browsers = "<a href='$file_html_clients'>Browsers</a>" ;
  $link_google   = "<a href='$file_html_google'>Google</a>" ;
}
 608+
# Read country codes and names from $path_in/$file_csv_country_codes into %country_codes.
# Only lines that start with an upper case letter are used; these are expected as
# code,region,north_south,name. The name is everything after the third comma,
# so it may itself contain commas.
# Aborts when the file can not be opened (it used to continue silently with an empty table).
sub ReadCountryCodes
{
  my $file_csv = "$path_in/$file_csv_country_codes" ;
  open my $fh_codes, '<', $file_csv
    or abort ("Function ReadCountryCodes: file $file_csv could not be opened: $!") ;

  while ($line = <$fh_codes>)
  {
    next if $line !~ /^[A-Z]/ ; # skip comments, headers, empty lines

    chomp ($line) ;
    ($code,$region,$north_south,$name) = split (',',$line,4) ;
    $country_codes {$code} = $name ;
  }

  close $fh_codes ;
}
 624+
# Read the clients (browsers) csv file of day folder $dir_process and add its counts to the totals.
# Lines starting with '#' (comments) or ':' (csv header) are skipped. Three record types:
#   E,engine,count              -> %engines, %total_engines, %webkit_engines (only Gecko and AppleWebKit)
#   G,mobile,group,count,perc   -> %clientgroups, %total_clientgroups
#   other: rectype,client,count,perc (rectype '-' = non mobile, 'M' = mobile)
#                               -> %clients, $total_clients, $total_clients_non_mobile, $total_clients_mobile
# Scratch variables are package globals, like elsewhere in this script.
# Aborts when the file is missing or can not be opened.
# Fixed compared to earlier version: for clients the test was written as '!=~', which Perl reads as
# a numeric '!=' against a negated match, so the first '/' was replaced by a space for
# practically every client. It is now '!~', same as for engines.
sub ReadInputClients
{
  my $file_csv = "$dir_process/$file_csv_clients" ;
  if (! -e $file_csv)
  { abort ("Function ReadInputClients: file $file_csv not found!!!") ; }
  open my $fh_clients, '<', $file_csv
    or abort ("Function ReadInputClients: file $file_csv could not be opened: $!") ;

  while ($line = <$fh_clients>)
  {
    next if $line =~ /^#/ ; # comments
    next if $line =~ /^:/ ; # csv header (not a comment)

    chomp ($line) ;

    if ($line =~ /^E/)
    {
      ($rectype, $engine, $count) = split (',', $line) ;

      next if ($engine !~ /^Gecko/) && ($engine !~ /^AppleWebKit/) ;

      # 'name/version' -> 'name version', unless a version preceded by a space is already there
      if ($engine !~ / \d/)
      { $engine =~ s/\// / ; }

      if ($engine =~ /AppleWebKit/)
      {
        $engine =~ s/AppleWebKit\//AppleWebKit / ; # fix
        $engine =~ s/Safari\/\d+/Safari/ ; # fix input
        # NOTE(review): the empty first alternative makes the device name optional and the
        # replacement always ends in 'iPod)' - confirm this is the intended input fix
        $engine =~ s/(?:|iPad|iPod|iPhone) Mozilla.*$/iPod)/i ; # fix input
        ($engine2 = $engine) =~ s/\s*\/?\d\d\d// ; # without first three digit (version) number
        $webkit_engines {$engine2} += $count ;
      }

      $engines {$engine} += $count ;

      # totals per engine name only: cut at first slash or space
      $engine =~ s/\/.*$// ;
      $engine =~ s/ .*$// ;
      $total_engines {$engine} += $count ;
    }
    elsif ($line =~ /^G/)
    {
      ($rectype, $mobile, $group, $count, $perc) = split (',', $line) ;
      $total_clientgroups {$mobile} += $count ;

      # collapse device models into one group per brand
      $group =~ s/^KDDI.*$/KDDI/ ;
      $group =~ s/^MOT.*$/MOT/ ;
      $group =~ s/^LG-.*$/LG/i ;
      $group =~ s/^LGE.*$/LGE/i ;
      $group =~ s/^KWC.*$/KWC/i ;
      $group =~ s/^Nokia.*$/Nokia/i ;
      $group =~ s/^Samsung.*$/Samsung/i ;
      $group =~ s/^Motorola.*$/Motorola/i ;
      $group =~ s/^SonyEricsson.*$/SonyEricsson/i ;
      $group =~ s/^PANTECH.*$/PanTech/i ;
      $group =~ s/^Palm_Pre/Palm Pre/i ;
      $clientgroups {"$mobile,$group"} += $count ;
    }
    else
    {
      ($rectype, $client, $count, $perc) = split (',', $line) ;

      $total_clients += $count ;
      $client =~ s/_/./g ;
      $client =~ s/\.\./Other/g ;
      # 'name/version' -> 'name version', unless a version preceded by a space is already there
      if ($client !~ / \d/)
      { $client =~ s/\// / ; }
      if ($rectype eq "-") { $total_clients_non_mobile += $count ; }
      if ($rectype eq "M") { $total_clients_mobile += $count ; }
      $clients {"$rectype,$client"} += $count ;
    }
  }
  close $fh_clients ;
}
 713+
# Read the crawlers csv file of day folder $dir_process: count,mime,agent per line
# (the agent is the rest of the line and may contain commas).
# Adds counts to %crawlers (key "mime|agent", agent with html markup) and,
# for mime type text/html, to $total_page_crawlerrequests.
# Aborts when the file is missing.
sub ReadInputCrawlers
{
  my $file_csv = "$dir_process/$file_csv_crawlers" ;
  abort ("Function ReadInputCrawlers: file $file_csv not found!!!\n") unless -e $file_csv ;

  open CSV_CRAWLERS, '<', $file_csv ;
  while ($line = <CSV_CRAWLERS>)
  {
    # skip comments and csv header (which is not a comment)
    next if ($line =~ /^#/) || ($line =~ /^:/) ;

    chomp ($line) ;
    ($count, $mime2, $agent) = split (',', $line,3) ;

    # $mime2 keeps the full mime type, $mime lumps together all image and all text types
    $mime = $mime2 ;
    $mime =~ s/^image\/.*$/image\/../ ;
    $mime =~ s/^text\/.*$/text\/../ ;

    # url decode
    $agent =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;

    next if $agent =~ /<\s*script\s*>/i ;
    next if $agent =~ /MSIE \d+\.\d+/ ; # most likely false positives

    # marker for request from Google ip address (also spelled 'adress'): remove marker, show GoogleBot in green
    if ($agent =~ s/\|Google ip add?ress//)
    { $agent =~ s/GoogleBot/<b><font color=green>GoogleBot<\/font><\/b>/gi ; }

    # marker for request from other ip address: remove marker, show GoogleBot in red
    if ($agent =~ s/ \|no Google ip address//)
    { $agent =~ s/GoogleBot/<b><font color=red>GoogleBot<\/font><\/b>/gi ; }

    # teesoft agents: replace language code and version numbers by fixed text
    if ($agent =~ /www\.teesoft\.info/)
    {
      $agent =~ s/(\((?:X11|Windows|Macintosh);[^;]*;)[^;]*;[^\)]*\)/$1 [lang code]; rv:[..]\)/ ;
      $agent =~ s/Gecko\/\d+/Gecko\/../ ;
      $agent =~ s/Firefox\/\d+\.\d*\.?\d*/Firefox\/../ ;
      $agent =~ s/(Gecko\/\.\.).*?\(http/$1 etc \(http/ ;
    }

    $agent =~ s/\+//g ;
    # (http://www.google.com/feedfetcher.html; 1 subscribers; feed-id=1894739019218796495) -> url only
    $agent =~ s/\((http:.*?feedfetcher.html)[^\)]*\)/($1)/ ;
    $agent =~ s/FeedFetcher-Google/FeedFetcher-Google/i ; # uniform case

    # agents without url: show bot, spider, crawl(er) in bold
    unless ($agent =~ /http:/)
    { $agent =~ s/(bot|spider|crawl(?:er)?)/<b>$1<\/b>/gi ; }

    $total_page_crawlerrequests += $count if $mime2 eq "text/html" ;

    $crawlers {"$mime|$agent"} += $count ;
  }
  close CSV_CRAWLERS ;
}
 768+
# Read the methods csv file of day folder $dir_process: method,status,count per line.
# Adds counts to %methods (per method) and %statusses (per "method,status").
# Aborts when the file is missing.
sub ReadInputMethods
{
  my $file_csv = "$dir_process/$file_csv_methods" ;
  abort ("Function ReadInputMethods: file $file_csv not found!!!") unless -e $file_csv ;

  open CSV_METHODS, '<', $file_csv ;
  while ($line = <CSV_METHODS>)
  {
    # skip comments and csv header (which is not a comment)
    next if ($line =~ /^#/) || ($line =~ /^:/) ;

    # line is not chomped: the line end stays part of the last field
    ($method, $status, $count) = split (',', $line) ;

    $methods   {$method} += $count ;
    $statusses {join (',', $method, $status)} += $count ;
  }
  close CSV_METHODS ;
}
 786+
# Read the requests csv file of day folder $dir_process: project,origin,ext,mime,parm,count per line.
# Adds counts to %counts_prem ("project,origin,ext,mime", not for project 'upload'),
# %counts_pm ("project,mime"), %counts_dm ("domain,mime"), %mimetypes, %projects, %domains,
# %images_project and %images_domain (png, jpeg and gif only), $total_mimes;
# %mimetypes_found counts records per mime type.
# Records for which the expanded project name still contains a dot are ignored.
# Scratch variables are package globals, like elsewhere in this script.
# Aborts when the file is missing or can not be opened.
# Changed compared to earlier version: open is checked, %counts_prem is updated via a normal
# hash element instead of a one element slice, and assignments that were followed directly
# by 'next' (and so never used) are gone.
sub ReadInputMimeTypes
{
  my $file_csv = "$dir_process/$file_csv_requests" ;
  if (! -e $file_csv)
  { abort ("Function ReadInputMimeTypes: file $file_csv not found!!!") ; }
  open my $fh_requests, '<', $file_csv
    or abort ("Function ReadInputMimeTypes: file $file_csv could not be opened: $!") ;

  while ($line = <$fh_requests>)
  {
    next if $line =~ /^#/ ; # comments
    next if $line =~ /^:/ ; # csv header (not a comment)

    chomp $line ;
    ($project, $origin, $ext, $mime, $parm, $count) = split (',', $line) ;

    $project = &ExpandAbbreviation ($project) ;

    next if $project =~ /\./ ; # invalid project

    # allow long mime types to wrap in the html table
    $mime =~ s/(\w+\.)(\w+\.)(\w+)/$1$2<br>$3/ ;
    $mime =~ s/opensearchdescription/opensearch-<br>description/ ;

    # NOTE(review): tests $parms, while the field read above is $parm - kept as is, confirm intent
    if ($parms eq "")
    { $parms = "&nbsp;" ; }

    $ext =~ s/^([a-z\[\]]*)[^a-z\[\]].*$/$1/g ; # cut at first char that is not a-z or bracket
    $ext =~ s/\((.*)\)/ ($1.php)/ ;

    if ($project eq $origin)
    { $origin = '&lArr;' ; }

    if ($project ne "upload")
    { $counts_prem {"$project,$origin,$ext,$mime"} += $count ; }

    $counts_pm {"$project,$mime"} += $count ;
    ($domain = $project) =~ s/\:.*$// ; # part before the colon
    $counts_dm {"$domain,$mime"} += $count ;
    $mimetypes {$mime} += $count ;
    $projects {$project} += $count ;
    $domains {$domain} += $count ;

    if ($mime =~ /image\/(?:png|jpeg|gif)/)
    {
      $images_project {$project} += $count ;
      $images_domain {$domain} += $count ;
    }
    $mimetypes_found {$mime} ++ ;

    $total_mimes += $count ;
  }
  close $fh_requests ;
}
 873+
# Read the operating systems csv file of day folder $dir_process: rectype,os,count,perc per line
# (rectype '-' = non mobile, 'M' = mobile, 'G' = group).
# Adds counts to %opsys ("rectype,os"), $total_opsys_non_mobile and $total_opsys_mobile.
# Comment line '# mobile: keywords (month)' sets $keywords_mobile and $month_upd_keywords_mobile.
# Scratch variables are package globals, like elsewhere in this script.
# Aborts when the file is missing or can not be opened.
# Changed compared to earlier version: open is checked and the file is closed after reading,
# like in the other ReadInput.. functions.
sub ReadInputOpSys
{
  my $file_csv = "$dir_process/$file_csv_opsys" ;
  if (! -e $file_csv)
  { abort ("Function ReadInputOpSys: file $file_csv not found!!!") ; }
  open my $fh_opsys, '<', $file_csv
    or abort ("Function ReadInputOpSys: file $file_csv could not be opened: $!") ;

  while ($line = <$fh_opsys>)
  {
    if ($line =~ /^#/) # comments
    {
      if ($line =~ /^# mobile:/)
      {
        $line =~ s/^.*?: // ;
        ($month_upd_keywords_mobile = $line) =~ s/^.*?\(([^\)]+)\).*$/$1/ ; # text between parentheses
        ($keywords_mobile = $line) =~ s/ \([^\)]+\).*$// ;                   # text before parentheses
        $keywords_mobile =~ s/\|/, /g ;
        $keywords_mobile =~ s/((?:[^,]+,){10})/$1<br>/g ; # line break after every 10 keywords
      }
      next ;
    }
    next if $line =~ /^:/ ; # csv header (not a comment)

    chomp $line ;
    ($rectype, $os, $count, $perc) = split (',', $line) ;

    next if $count !~ /^\d+$/ ; # -,Linux Gentoo,,2,0.00% (extra comma !)

    $os =~ s/_/./g ;
    $os =~ s/\.\./Other/g ;

    # 'name/version' -> 'name version', unless a version preceded by a space is already there
    if (($rectype ne "G") && ($os !~ / \d/))
    { $os =~ s/\// / ; }

    if ($rectype eq "-") { $total_opsys_non_mobile += $count ; }
    if ($rectype eq "M") { $total_opsys_mobile += $count ; }

    $opsys {"$rectype,$os"} += $count ;
  }
  close $fh_opsys ;
}
 918+
 919+
# Read the origins csv file of day folder $dir_process: source,origin,toplevel,mimecat,count per line.
# source 'internal': origin is expanded with ExpandAbbreviation, counts are added to
#   %origin_int_top, %origin_int_top_split ("mimecat:origin"),
#   %project_int_top, %project_int_top_split ("mimecat:project").
# any other source: counts are added to %origin_ext_top, %origin_ext_top_split ("mimecat:origin"),
#   $total_origins_external_counted and, for mimecat 'page', $total_page_requests_external.
# Aborts when the file is missing.
sub ReadInputOrigins
{
  my $file_csv = "$dir_process/$file_csv_origins" ;
  abort ("Function ReadInputOrigins: file $file_csv not found!!!") unless -e $file_csv ;

  open CSV_ORIGINS, '<', $file_csv ;
  while ($line = <CSV_ORIGINS>)
  {
    # skip comments and csv header (which is not a comment)
    next if ($line =~ /^#/) || ($line =~ /^:/) ;

    chomp $line ;
    ($source, $origin, $toplevel, $mimecat, $count) = split (',', $line) ;

    # test: external origins other than google get the top level domain appended
    $origin .= $toplevel if ($source eq "external") && ($origin !~ /^google/) ;

    if ($source ne "internal")
    {
      $origin = "origin unknown" if $origin eq "unmatched ip address" ;

      $total_page_requests_external += $count if $mimecat eq "page" ;

      $origin_ext_top_split {"$mimecat:$origin"} += $count ;
      $origin_ext_top {$origin} += $count ;
      $total_origins_external_counted += $count ;
      next ;
    }

    # internal: origin is 'project:subproject'
    $origin = &ExpandAbbreviation ($origin) ;
    ($project,$subproject) = split (':', $origin) ;

    $origin_int_top_split  {"$mimecat:$origin"}  += $count ;
    $origin_int_top        {$origin}             += $count ;
    $project_int_top_split {"$mimecat:$project"} += $count ;
    $project_int_top       {$project}            += $count ;
  }

  close CSV_ORIGINS ;
}
 1002+
 1003+sub ReadInputScripts
 1004+{
 1005+ my $file_csv = "$dir_process/$file_csv_scripts" ;
 1006+ if (! -e $file_csv)
 1007+ { abort ("Function ReadInputScripts: file $file_csv not found!!!") ; }
 1008+ open CSV_SCRIPTS, '<', $file_csv ;
 1009+ while ($line = <CSV_SCRIPTS>)
 1010+ {
 1011+ next if $line =~ /^#/ ; # comments
 1012+ next if $line =~ /^:/ ; # csv header (not a comment)
 1013+
 1014+ chomp $line ;
 1015+ $line =~ s/\%3B/;/gi ;
 1016+ $line =~ s/\&amp;/\&/gi ;
 1017+ ($ext, $script, $parm, $count) = split (',', $line) ;
 1018+ if ($script =~ /\%/)
 1019+ { $script = "other" ; }
 1020+ if ($parm =~ /\%/)
 1021+ { $parm = "other" ; }
 1022+
 1023+ if (($ext eq "php") && ($parm =~ /action=/) && ($parm !~ /search=/)) # action can occur as parm after search
 1024+ {
 1025+ @parms = split ('\&', $parm) ;
 1026+ foreach $parm (@parms)
 1027+ {
 1028+ ($keyword,$data) = split ('\=', $parm) ;
 1029+ if ($keyword eq "action")
 1030+ { @actions {"$script,$data"} += $count }
 1031+ }
 1032+ }
 1033+ }
 1034+ close CSV_SCRIPTS ;
 1035+
 1036+# foreach $key (keys_sorted_by_value_num_desc %actions)
 1037+# { print "$key: " . $actions {$key} . "\n" ; }
 1038+
 1039+ open CSV_SCRIPTS, '<', "$dir_process/$file_csv_scripts" ;
 1040+ read_script:
 1041+ while ($line = <CSV_SCRIPTS>)
 1042+ {
 1043+ next if $line =~ /^#/ ; # comments
 1044+ next if $line =~ /^:/ ; # csv header (not a comment)
 1045+
 1046+ chomp $line ;
 1047+ $line =~ s/\%3B/;/gi ;
 1048+ $line =~ s/\%5B/[/gi ;
 1049+ $line =~ s/\%5D/]/gi ;
 1050+ $line =~ s/\&amp;/\&/gi ;
 1051+ ($ext, $script, $parm, $count) = split (',', $line) ;
 1052+
 1053+ # incomplete validation check on valid names, but captures already lot of rubbish
 1054+ if ($script =~ /\%/)
 1055+ { $script = "other" ; }
 1056+ if ($parm =~ /\%/)
 1057+ { $parm = "other" ; }
 1058+
 1059+ if (($parm =~ /amp;amp;/) ||
 1060+ ($parm =~ /feed=.*feed=/))
 1061+ { next read_script ; }
 1062+
 1063+ if (($ext eq "php") && ($parm =~ /action=/))
 1064+ {
 1065+ @parms = split ('\&', $parm) ;
 1066+ foreach $parm (@parms)
 1067+ {
 1068+ ($keyword,$data) = split ('\=', $parm) ;
 1069+ if ($keyword eq "action")
 1070+ {
 1071+ if (@actions {"$script,$data"} < 2)
 1072+ { next read_script ; }
 1073+ }
 1074+ }
 1075+ }
 1076+ if ($ext eq "php")
 1077+ {
 1078+ # generalize ns10 -> ns.. + remove all ns..=.. but one
 1079+ $parm =~ s/\&ns\d+/\&ns../g ;
 1080+ $parm =~ s/\&ns\.\.=\.\./-*^-*^/ ;
 1081+ $parm =~ s/\&ns\.\.=\.\.//g ;
 1082+ $parm =~ s/\-\*\^\-\*\^/\&ns\.\.=\.\./g ;
 1083+
 1084+ # generalize nsargs[]= -> remove all but one
 1085+ $parm =~ s/\&rsargs\[\]=\.\./-*^-*^/ ;
 1086+ $parm =~ s/\&rsargs\[\]=\.\.//g ;
 1087+ $parm =~ s/\-\*\^\-\*\^/\&rsargs\[n\]=\.\./g ;
 1088+
 1089+ if (length ($parm) > 100)
 1090+ { $parm =~ s/(.{100}[^\&]*\&)/$1<br>/g ; }
 1091+
 1092+ $parms {"$script,$parm"} += $count ;
 1093+ $scripts_php {$script} += $count ;
 1094+ }
 1095+ elsif ($ext eq "js")
 1096+ { $scripts_js {$script} += $count ; }
 1097+ elsif ($ext eq "css")
 1098+ { $scripts_css {$script} += $count ; }
 1099+ }
 1100+ close CSV_SCRIPTS ;
 1101+}
 1102+
 1103+sub ReadInputGoogle
 1104+{
 1105+ my $file_csv = "$dir_process/$file_csv_google" ;
 1106+ if (! -e $file_csv)
 1107+ { abort ("Function ReadInputGoogle: file $file_csv not found!!!") ; }
 1108+ open CSV_SEARCH, '<', $file_csv ;
 1109+ while ($line = <CSV_SEARCH>)
 1110+ {
 1111+ next if $line =~ /^#/ ; # comments
 1112+ next if $line =~ /^:/ ; # csv header (not a comment)
 1113+
 1114+ chomp $line ;
 1115+ ($matches, $site, $origin, $service, $agent, $mimecat, $toplevel, $count) = split (',', $line) ;
 1116+
 1117+ if ($service eq "Imposters?")
 1118+ { $service = "GoogleBot?" ; }
 1119+ if ($service eq "GoogleBotNot?")
 1120+ { $service = "GoogleBot?" ; }
 1121+ if ($service eq "Crawler")
 1122+ { $service = "GoogleBot" ; }
 1123+
 1124+ if ($matches =~ /x/)
 1125+ { $googleIp = 'Y' ; }
 1126+ else
 1127+ { $googleIp = 'N' ; }
 1128+
 1129+ next if $site ne "google" ;
 1130+
 1131+ if ($toplevel eq "-")
 1132+ { $toplevel = "undefined" ; }
 1133+ if (length ($toplevel) > 3)
 1134+ { $toplevel = "_$toplevel" ; } # sort on top
 1135+
 1136+ $searches_crawlers {$service} += $count ;
 1137+ $searches_service {"$service,$googleIp"} += $count ;
 1138+ $searches_toplevel {$toplevel} += $count ;
 1139+ $searches_service_mimecat {"$service,$mimecat,$googleIp"} += $count ;
 1140+ $searches_service_mimecat {"$service,total,$googleIp"} += $count ;
 1141+ $searches_service_matches {"$service,$matches"} += $count ;
 1142+
 1143+# if ($origin =~ /search/i)
 1144+ if ($toplevel =~ /^[a-zA-Z0-9-]+$/)
 1145+ { $searches_toplevel_tld_found {$toplevel} += $count ; } # print "$line\n" ;}
 1146+ else
 1147+ {
 1148+ $searches_mimecat_tld_not_found {$mimecat} += $count ;
 1149+ $searches_mimecat_tld_not_found {"total"} += $count ;
 1150+ }
 1151+
 1152+ $searches_toplevel_mimecat {"$toplevel,$mimecat"} += $count ;
 1153+ $searches_toplevel_mimecat {"$toplevel,total"} += $count ;
 1154+
 1155+# if ($toplevel !~ /:/) { print "invalid toplevel $toplevel\n" ; }
 1156+ }
 1157+ close CSV_SEARCH ;
 1158+}
 1159+
 1160+sub ReadInputSkins
 1161+{
 1162+ my $file_csv = "$dir_process/$file_csv_skins" ;
 1163+ if (! -e $file_csv)
 1164+ { abort ("Function ReadInputSkins: file $file_csv not found!!!") ; }
 1165+ open CSV_SKINS, '<', $file_csv ;
 1166+ while ($line = <CSV_SKINS>)
 1167+ {
 1168+ next if $line =~ /^#/ ; # comments
 1169+ next if $line =~ /^:/ ; # csv header (not a comment)
 1170+
 1171+ chomp $line ;
 1172+ ($skins, $count) = split (',', $line) ;
 1173+
 1174+ $skins {$skins} += $count ;
 1175+ ($name,$rest) = split ('\/', $skins, 2) ;
 1176+ $skin_set {$name}+= $count ;
 1177+ }
 1178+ close CSV_SCRIPTS ;
 1179+}
 1180+
 1181+sub ReadInputIndexPhp
 1182+{
 1183+ my $file_csv = "$dir_process/$file_csv_indexphp" ;
 1184+ if (! -e $file_csv)
 1185+ { abort ("Function ReadInputIndexPhp: file $file_csv not found!!!") ; }
 1186+ open CSV_INDEXPHP, '<', $file_csv ;
 1187+ while ($line = <CSV_INDEXPHP>)
 1188+ {
 1189+ next if $line =~ /^#/ ; # comments
 1190+ next if $line =~ /^:/ ; # csv header (not a comment)
 1191+
 1192+ chomp $line ;
 1193+ ($bot,$domain,$referer,$ext,$status,$mime,$parm,$agent) = split (',', $line) ;
 1194+
 1195+ my $action = "" ;
 1196+ if ($parm =~ /action=edit/)
 1197+ { $action = 'edit' ; }
 1198+ if ($parm =~ /action=submit/)
 1199+ { $action = 'submit' ; }
 1200+
 1201+ next if $ext !~ /index.php/ ;
 1202+ next if $parm !~ /action=(?:edit|submit)(?:$|\&)/ ; # submit or submit&.., not submitlogin
 1203+ next if $mime ne "text/html" ; # excludes mime - (undefined), application/x-external-editor on action=edit
 1204+ # and text/plain, text/xml, application/xml on action=submit
 1205+
 1206+ if ($bot =~ /Y/)
 1207+ {
 1208+ $intent = "" ;
 1209+
 1210+ if ($agent =~ /DotNetWikiBot/i)
 1211+ { $agent = "DotNetWikiBot" ; }
 1212+ $agent =~ s/\%27/\'/g ;
 1213+ # $agent =~ s/\(.*?\)//g;
 1214+
 1215+ if ($action eq "edit")
 1216+ {
 1217+ if ($referer =~ /^\w\w:/)
 1218+ { $referer = "int" ; }
 1219+ $edit_submit_bot {$domain} {"edit,$referer"} ++ ;
 1220+ $edit_submit_bot_sort {$domain} ++ ;
 1221+ $edit_submit_bot_agent {$agent} {"$action,$referer"}++ ;
 1222+ $edit_submit_bot_agent_sort {$agent}++ ;
 1223+ }
 1224+
 1225+ if ($action eq "submit")
 1226+ {
 1227+ if ($referer =~ /^\w\w:/)
 1228+ { $referer = "int" ; }
 1229+
 1230+ $intent = 'unknown' ;
 1231+ if ($status eq "TCP_MISS/302") { $intent = 'save' ; }
 1232+ elsif ($status eq "TCP_MISS/200") { $intent = 'preview' ; }
 1233+ # next if $intent ne 'save' ;
 1234+
 1235+ $edit_submit_bot {$domain} {"$intent,$referer"} ++ ;
 1236+ $edit_submit_bot_sort {$domain} ++ ;
 1237+
 1238+ # if ($referer eq "-") { $edit_submit_bot_agent {$agent}++ ; }
 1239+ $edit_submit_bot_agent {$agent} {"$intent,$referer"}++ ;
 1240+ $edit_submit_bot_agent_sort {$agent}++ ;
 1241+ }
 1242+ }
 1243+
 1244+ next if $bot =~ /N/ ; # 2009-05 /N/ -> total oldid: 127, total other: 54, total redlink: 4
 1245+ next if $bot =~ /Y/ ; # 2009-05 /N/ -> total oldid: 127, total other: 54, total redlink: 4
 1246+ next if $domain ne "wp:en" ; # 2009-05 ne -> total other: 26, total redlink: 22
 1247+ # if (($referer ne "-") && ($referer ne "ext") && ($referer ne "wp:en")) { next ; }
 1248+ # if (($referer ne "-") && ($referer !~ /^..:/)) { $referer = "ext" ; }
 1249+ # if ($referer eq "-") { $referer = "- " ; }
 1250+ next if $referer ne "wp:en" ; # 2009-05 eq -> # total other: 2014, total redlink: 1031, total oldid: 47, total undo: 30
 1251+
 1252+ my $filter = '' ;
 1253+ if ($parm =~ /action=edit/)
 1254+ {
 1255+ $filter = 'other' ;
 1256+ if ($parm =~ /redlink/) { $filter = 'redlink' ; }
 1257+ if ($parm =~ /oldid=/) { $filter = 'oldid' ; }
 1258+ if ($parm =~ /undo=/) { $filter = 'undo' ; }
 1259+
 1260+ $edit_submit {"[$bot $referer $action $filter] $parm"}++ ;
 1261+ $edit_submits {"$filter"}++ ;
 1262+ }
 1263+ if ($parm =~ /action=submit/)
 1264+ {
 1265+ $edit_submit {"$bot $referer $action $status"}++ ;
 1266+ }
 1267+
 1268+ # my @subparms = split ('\&', $parm) ;
 1269+ # foreach $subparm (@subparms)
 1270+ # { $edit_submit_subparms {"[$action] [$filter] $subparm"}++ ; }
 1271+ }
 1272+ close CSV_INDEXPHP ;
 1273+
 1274+# next if $bot =~ /N/ ; # + any referrer ->
 1275+# Sample period: 1 May 2009 - 31 May 2009 => for daily averages multiplier = 0.03
 1276+# 9: [bot=Y - edit oldid] action=edit&oldid=&section=&title=..
 1277+# 3: [bot=Y - edit oldid] action=edit&oldid=..&title=..
 1278+# 17: [bot=Y - edit oldid] action=edit&oldid=..&title=..&useskin=..
 1279+# 1: [bot=Y - edit other] _herbs&action=edit&title=..
 1280+# 65: [bot=Y - edit other] action=edit&section=..&title=..
 1281+# 1: [bot=Y - edit other] action=edit&stub&title=..
 1282+# 2: [bot=Y - edit other] action=edit&title=
 1283+# 188: [bot=Y - edit other] action=edit&title=..
 1284+# 31: [bot=Y - edit other] action=edit&title=..&useskin=..
 1285+# 30: [bot=Y - edit redlink] action=edit&redlink=..&title=..
 1286+# 5: [bot=Y - edit undo] action=edit&title=..&undo=..&undoafter=..
 1287+# 14: [bot=Y ext edit other] action=edit&section=..&title=..
 1288+# 5: [bot=Y ext edit other] action=edit&title=..
 1289+# 11: [bot=Y ext edit redlink] action=edit&redlink=..&title=..
 1290+# 2: [bot=Y ext edit undo] action=edit&title=..&undo=..&undoafter=..
 1291+# 107: [bot=Y wp:en edit oldid] action=edit&oldid=&section=&title=..
 1292+# 3: [bot=Y wp:en edit oldid] action=edit&oldid=..&section=&title=..
 1293+# 17: [bot=Y wp:en edit oldid] action=edit&oldid=..&title=..
 1294+# 1: [bot=Y wp:en edit other] action=edit&articleget=..&dykcredittab=..&editintro=..&preload=..&preloadtitle=..&section=..&title=..
 1295+# 5: [bot=Y wp:en edit other] action=edit&section=..&title=..
 1296+# 48: [bot=Y wp:en edit other] action=edit&title=..
 1297+# 4: [bot=Y wp:en edit redlink] action=edit&redlink=..&title=..
 1298+# 9: bot=Y - submit TCP_MISS/200
 1299+# 62: bot=Y - submit TCP_MISS/302
 1300+# 31: bot=Y wp:en submit TCP_MISS/302
 1301+# total other: 361
 1302+# total oldid: 156
 1303+# total redlink: 45
 1304+# total undo: 7
 1305+}
 1306+
 1307+sub ReadInputCountriesTimed
 1308+{
 1309+ my $file_csv = "$dir_process/$file_csv_countries_timed" ;
 1310+ if (! -e $file_csv)
 1311+ { abort ("Function ReadInputSkins: file $file_csv not found!!! ") ; }
 1312+ open CSV_COUNTRIES, '<', $file_csv ;
 1313+ while ($line = <CSV_COUNTRIES>)
 1314+ {
 1315+ next if $line =~ /^#/ ; # comments
 1316+ next if $line =~ /^:/ ; # csv header (not a comment)
 1317+
 1318+ chomp $line ;
 1319+ ($bot,$target,$country,$time,$count) = split (',', $line) ;
 1320+
 1321+ next if $target !~ /^wp/ ; # wikipedia only
 1322+
 1323+ if ($bot =~ /Y/)
 1324+ { $bot = 'Y' }
 1325+ else
 1326+ { $bot = 'N' }
 1327+ $countries {$country} ++ ;
 1328+ $targets {$target} ++ ;
 1329+ $times {$time} ++ ;
 1330+ $countries_timed {"$bot,$target,$country,$time"} += $count ;
 1331+ $countries_totals {"$bot,$target"}{$country} += $count ;
 1332+ $targets_totals {"$bot,$country"}{$target} += $count ;
 1333+ }
 1334+ close CSV_COUNTRIES ;
 1335+}
 1336+
 1337+sub ReadInputCountriesNames
 1338+{
 1339+ $path_csv_country_codes = "$path_in/$file_csv_country_codes" ;
 1340+ if (! -e $path_csv_country_codes) { abort ("Input file $path_csv_country_codes not found!") ; }
 1341+
 1342+ open CSV_COUNTRY_CODES, '<', $path_csv_country_codes ;
 1343+ $country_names {"--"} = "Unknown" ;
 1344+ while ($line = <CSV_COUNTRY_CODES>)
 1345+ {
 1346+ chomp $line ;
 1347+
 1348+ next if $line =~ /^#/ ;
 1349+
 1350+ ($country_code,$region_code,$north_south_code,$country_name) = split (',', $line,4) ;
 1351+ $region_codes {$country_code} = $region_code ;
 1352+ $north_south_codes {$country_code} = $north_south_code ;
 1353+
 1354+ $country_name =~ s/"//g ;
 1355+
 1356+ next if $country_name eq "Anonymous Proxy" ;
 1357+ next if $country_name eq "Satellite Provider" ;
 1358+ next if $country_name eq "Other Country" ;
 1359+ next if $country_name eq "Asia/Pacific Region" ;
 1360+ next if $country_name eq "Europe" ;
 1361+
 1362+# if ($country_meta_info {$country} eq "")
 1363+# {
 1364+# if ($country_meta_info_not_found_reported {$country} ++ == 0)
 1365+# { print "Meta info not found for country '$country'\n" ; }
 1366+# }
 1367+
 1368+ $country_names {$country_code} = $country_name ;
 1369+ $country_codes_all {"$country_name|$country_code"} ++ ;
 1370+ }
 1371+}
 1372+
 1373+sub ReadInputCountriesMeta
 1374+{
 1375+ # http://en.wikipedia.org/wiki/List_of_countries_by_population
 1376+ # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
 1377+ open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
 1378+ while ($line = <COUNTRY_META_INFO>)
 1379+ {
 1380+ chomp $line ;
 1381+ ($country,$link,$population,$connected,$icon) = split ',', $line ;
 1382+print "$line\n" ; # qqq
 1383+ $country =~ s/&comma;/,/g ;
 1384+
 1385+ # use country names as given by MaxMind
 1386+ $country =~ s/Brunei/Brunei Darussalam/ ;
 1387+ $country =~ s/C..?te d'Ivoire/Cote d'Ivoire/ ;
 1388+ $country =~ s/Congo, The Democratic Republic of the/Republic of the Congo/ ;
 1389+ $country =~ s/Dem. Rep. of Congo/Congo - The Democratic Republic of the/ ;
 1390+ $country =~ s/East timor/Timor-Leste/ ;
 1391+ $country =~ s/Guyane/French Guiana/ ;
 1392+ $country =~ s/Iran/Iran, Islamic Republic of/ ;
 1393+ $country =~ s/Laos/Lao People's Democratic Republic/ ;
 1394+ $country =~ s/Libya/Libyan Arab Jamahiriya/ ;
 1395+ $country =~ s/Macau/Macao/ ;
 1396+ $country =~ s/Moldova/Moldova, Republic of/ ;
 1397+ $country =~ s/North Korea/Korea, Republic of/ ;
 1398+ $country =~ s/Palestine/Palestinian Territory/ ;
 1399+ $country =~ s/Republic of the Congo/Congo/ ;
 1400+ $country =~ s/Russia/Russian Federation/ ;
 1401+ $country =~ s/North Korea/Korea, Democratic People's Republic of/ ;
 1402+ $country =~ s/South Korea/Korea, Republic of/ ;
 1403+ $country =~ s/Syria/Syrian Arab Republic/ ;
 1404+ $country =~ s/Tanzania/Tanzania, United Republic of/ ;
 1405+ $country =~ s/U.S. Virgin Islands/Virgin Islands, British/ ;
 1406+ $country =~ s/Vatican City/Holy See (Vatican City State)/ ;
 1407+ $country =~ s/^Korea$/South Korea/ ;
 1408+
 1409+ $connected =~ s/connected/../g ;
 1410+ $country_meta_info {$country} = "$link,$population,$connected,$icon" ;
 1411+print "meta info found for '$country'\n" ; # qqq
 1412+
 1413+ if ($country eq "United States")
 1414+ { ($connected_us = $connected) =~ s/_//g ; }
 1415+ }
 1416+ close COUNTRY_META_INFO ;
 1417+}
 1418+
 1419+sub CollectRegionCounts
 1420+{
 1421+ my ($country_code, $region_code, $north_south_code, $country_name) ;
 1422+
 1423+ foreach $country_code (keys %country_names)
 1424+ {
 1425+ $country_name = $country_names {$country_code} ;
 1426+ $country_meta = $country_meta_info {$country_name} ;
 1427+ my ($link,$population,$connected,$icon) = split (',', $country_meta) ;
 1428+
 1429+ $region_code = $region_codes {$country_code} ;
 1430+ $north_south_code = $north_south_codes {$country_code} ;
 1431+
 1432+ $population =~ s/_//g ;
 1433+ $connected =~ s/_//g ;
 1434+
 1435+ $population_tot += $population ;
 1436+ $connected_tot += $connected ;
 1437+
 1438+ $population_per_region {$region_code} += $population ;
 1439+ $connected_per_region {$region_code} += $connected ;
 1440+
 1441+ $population_per_region {$north_south_code} += $population ;
 1442+ $connected_per_region {$north_south_code} += $connected ;
 1443+
 1444+ # print "CODE $country_code NAME $country_name POP $population, $CONN $connected REGION $region_code NS $north_south_code PPR ${population_per_region {$region_code}}\n" ;
 1445+ }
 1446+}
 1447+
 1448+sub ReadInputCountriesMonthly
 1449+{
 1450+ my $project_mode = shift ;
 1451+
 1452+ undef %yyyymm_ ;
 1453+ undef %quarters ;
 1454+ undef %requests_unknown_per_quarter ;
 1455+ undef %country_codes ;
 1456+ undef %requests_all ;
 1457+ undef %requests_all_per_period ;
 1458+ undef %requests_per_quarter ;
 1459+ undef %requests_per_country ;
 1460+ undef %requests_per_quarter_per_country ;
 1461+ undef %requests_per_country_per_language ;
 1462+ undef %requests_per_language_per_country ;
 1463+ undef %requests_per_quarter_per_country_per_language ;
 1464+ undef %requests_per_month_per_country_code ;
 1465+ undef %requests_per_month_us ;
 1466+ undef %descriptions_per_period ;
 1467+ undef %requests_recently_all ;
 1468+ undef %requests_recently_per_country_code ;
 1469+ undef %requests_recently_per_country ;
 1470+ undef %requests_recently_per_country_per_language ;
 1471+ undef %requests_recently_per_language_per_country ;
 1472+ undef %requests_recently_per_language ;
 1473+ undef %months_recently ;
 1474+
 1475+ $requests_recently_start = "999999" ;
 1476+ $requests_recently_stop = "000000" ;
 1477+ $requests_start = "999999" ;
 1478+ $requests_stop = "000000" ;
 1479+
 1480+ $requests_all = 0 ;
 1481+ $requests_recently_all = 0 ;
 1482+
 1483+ my ($sec,$min,$hour,$day,$report_month,$report_year) = localtime (time) ;
 1484+ $report_year += 1900 ;
 1485+ $report_month ++ ;
 1486+
 1487+ print "Process project $project_mode\n\n" ;
 1488+
 1489+ open CSV_SQUID_COUNTS_MONTHLY, '<', $path_csv_squid_counts_monthly ;
 1490+ while ($line = <CSV_SQUID_COUNTS_MONTHLY>)
 1491+ {
 1492+ chomp $line ;
 1493+ $line =~ s/,\s+/,/g ;
 1494+ $line =~ s/\s+,/,/g ;
 1495+ ($yyyymm,$project,$language,$code,$bot,$count) = split (',', $line) ;
 1496+
 1497+ ($code,$language) = &NormalizeSquidInput ($code,$language) ;
 1498+ $country = &GetCountryName ($code) ;
 1499+
 1500+ next if &DiscardSquidInput ($bot,$project,$project_mode,$code,$language) ;
 1501+
 1502+ # $yyyymm = "2009-12" ;
 1503+ $yyyymm_ {$yyyymm} ++ ;
 1504+
 1505+ $year = substr ($yyyymm,0,4) ;
 1506+ $month = substr ($yyyymm,5,2) ;
 1507+ # print "year $year report_year month $month $report_year $report_month\n" ;
 1508+
 1509+ $recently = $false ;
 1510+
 1511+ if (($year == $report_year) or (($year == $report_year - 1) && ($month >= $report_month))) # last 12 months
 1512+ { $recently = $true ; }
 1513+
 1514+ if ($month <= 3) { $quarter = $year . ' Q1' ; }
 1515+ elsif ($month <= 6) { $quarter = $year . ' Q2' ; }
 1516+ elsif ($month <= 9) { $quarter = $year . ' Q3' ; }
 1517+ else { $quarter = $year . ' Q4' ; }
 1518+
 1519+ if ($quarter_only ne '')
 1520+ { next if $quarter ne $quarter_only ; }
 1521+
 1522+ # if ($views_edits eq 'Page Edits')
 1523+
 1524+ $quarters {$quarter} ++ ;
 1525+
 1526+ if (($country =~ /\?/) || ($country =~ /unknown/i))
 1527+ { $requests_unknown_per_quarter {$quarter} += $count ; next ; }
 1528+ $country_codes {"$country|$code"}++ ;
 1529+ $requests_all += $count ;
 1530+ $requests_all_per_period {$yyyymm} += $count ;
 1531+ $requests_per_quarter {$quarter} += $count ;
 1532+ $requests_per_country {$country} += $count ;
 1533+
 1534+ $requests_per_quarter_per_country {$quarter} {$country} += $count ;
 1535+ $requests_per_country_per_language {$country} {$language} += $count ;
 1536+ $requests_per_language_per_country {$language} {$country} += $count ;
 1537+ $requests_per_quarter_per_country_per_language {$quarter} {$country} {$language} += $count ;
 1538+ $requests_per_month_per_country_code {$yyyymm} {"$country|$code"} += $count ;
 1539+
 1540+ if ($code eq "US")
 1541+ {$requests_per_month_us {$yyyymm} += $count ; }
 1542+
 1543+ $descriptions_per_period {$yyyymm} = $yyyymm ;
 1544+ if ($yyyymm lt $requests_start) { $requests_start = $yyyymm ; }
 1545+ if ($yyyymm gt $requests_stop) { $requests_stop = $yyyymm ; }
 1546+
 1547+ if ($recently)
 1548+ {
 1549+ if ($yyyymm lt $requests_recently_start) { $requests_recently_start = $yyyymm ; }
 1550+ if ($yyyymm gt $requests_recently_stop) { $requests_recently_stop = $yyyymm ; }
 1551+
 1552+ $months_recently {$yyyymm}++ ;
 1553+ $requests_recently_all += $count ;
 1554+ $requests_recently_per_country_code {"$country|$code"} += $count ;
 1555+ $requests_recently_per_country {$country} += $count ;
 1556+ $requests_recently_per_country_per_language {$country} {$language} += $count ;
 1557+ $requests_recently_per_language_per_country {$language} {$country} += $count ;
 1558+ $requests_recently_per_language {$language} += $count ;
 1559+ }
 1560+ }
 1561+
 1562+ print "\n" ;
 1563+ @quarters = keys_sorted_alpha_desc %quarters ;
 1564+ foreach $quarter (@quarters)
 1565+ {
 1566+ print "Quarter $quarter: requests: " . (0+$requests_per_quarter {$quarter}) . "\n" ;
 1567+ if ($requests_per_quarter {$quarter} == 0)
 1568+ { abort ("No known requests found for quarter $quarter") ; }
 1569+ }
 1570+ print "\n" ;
 1571+
 1572+ $months_recently = keys %months_recently ;
 1573+ if ($months_recently == 0) { die "\$months_recently == 0\n" ; }
 1574+
 1575+ $requests_recently_start = substr ($requests_recently_start,5,2) . "/" . substr ($requests_recently_start,2,2) ;
 1576+ $requests_recently_stop = substr ($requests_recently_stop ,5,2) . "/" . substr ($requests_recently_stop ,2,2) ;
 1577+ $requests_start = substr ($requests_start,5,2) . "/" . substr ($requests_start,2,2) ;
 1578+ $requests_stop = substr ($requests_stop ,5,2) . "/" . substr ($requests_stop ,2,2) ;
 1579+
 1580+ foreach $yyyymm (keys %$yyyymm)
 1581+ {
 1582+ if ($requests_per_month_us {$week} > $max_requests_per_month_us)
 1583+ { $max_requests_per_month_us = $requests_per_month_us {$week} ; }
 1584+ }
 1585+
 1586+ # die "\$connected_us == 0" if $connected_us == 0 ;
 1587+ if ($connected_us > 0)
 1588+ { $max_requests_per_connected_us_month = sprintf ("%.1f", $max_requests_per_month_us / $connected_us) ; }
 1589+
 1590+# foreach $country_code (sort keys %country_codes_all)
 1591+# {
 1592+# $200907 = ${$requests_per_month_per_country_code {"200907"}} {$country_code} ;
 1593+# $200908 = ${$requests_per_month_per_country_code {"200908"}} {$country_code} ;
 1594+# $200909 = ${$requests_per_month_per_country_code {"200909"}} {$country_code} ;
 1595+# $200910 = ${$requests_per_month_per_country_code {"200910"}} {$country_code} ;
 1596+# $200911 = ${$requests_per_month_per_country_code {"200911"}} {$country_code} ;
 1597+# $200912 = ${$requests_per_month_per_country_code {"200912"}} {$country_code} ;
 1598+# print "$country_code, $200907, $200908, $200909, $200910, $200911, $200912\n" ;
 1599+# }
 1600+# exit ;
 1601+}
 1602+
 1603+sub ReadInputCountriesDaily
 1604+{
 1605+ # http://en.wikipedia.org/wiki/List_of_countries_by_population
 1606+ # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
 1607+
 1608+ my $project_mode = shift ;
 1609+
 1610+ undef %country_codes_found ;
 1611+ undef %weeknum_this_years ;
 1612+ undef %descriptions_per_period ;
 1613+ undef %days_in_input_for_week ;
 1614+ undef %requests_all_per_period ;
 1615+ undef %requests_per_week_per_country_code ;
 1616+ undef %requests_per_week_us ;
 1617+ undef %missing_days ;
 1618+ undef %correct_for_missing_days ;
 1619+ undef %changes_per_week_per_country_code ;
 1620+
 1621+# $requests_recently_start = "999999" ;
 1622+# $requests_recently_stop = "000000" ;
 1623+
 1624+# $time_2000_01_01 = timegm(0,0,0,1,1-1,2000-1900) ;
 1625+ $sec_per_day = 24 * 60 * 60 ;
 1626+
 1627+ my ($sec,$min,$hour,$day,$report_month,$report_year) = localtime (time) ;
 1628+ $report_year += 1900 ;
 1629+ $report_month ++ ;
 1630+
 1631+ print "Process project $project_mode\n\n" ;
 1632+
 1633+ $yyyymmdd_prev = "" ;
 1634+ open CSV_SQUID_COUNTS_DAILY, '<', $path_csv_squid_counts_daily ;
 1635+ while ($line = <CSV_SQUID_COUNTS_DAILY>)
 1636+ {
 1637+ chomp $line ;
 1638+ ($yyyymmdd,$project,$language,$code,$bot,$count) = split (',', $line) ;
 1639+
 1640+ die "\$yyyymmdd $yyyymmdd lt \$yyyymmdd_prev $yyyymmdd_prev" if $yyyymmdd lt $yyyymmdd_prev ;
 1641+ $yyyymmdd_prev = $yyyymmdd ;
 1642+
 1643+ ($code,$language) = &NormalizeSquidInput ($code,$language) ;
 1644+ $country = &GetCountryName ($code) ;
 1645+
 1646+ $country_codes_found {"$country|$code"} ++ ;
 1647+
 1648+ next if &DiscardSquidInput ($bot,$project,$project_mode,$code,$language) ;
 1649+
 1650+ # $yyyymmdd = "2009-12-01" ;
 1651+ $yyyymmdd_ {$yyyymmdd} ++ ;
 1652+
 1653+ $year = substr ($yyyymmdd,0,4) ;
 1654+ $month = substr ($yyyymmdd,5,2) ;
 1655+ $day = substr ($yyyymmdd,8,2) ;
 1656+
 1657+ $time = timegm(0,0,0,$day,$month-1,$year-1900) ;
 1658+ # $days_since_2000 = int (($time - $time_2000_01_01) / $sec_per_day) ;
 1659+ $days_this_year = (gmtime $time) [7] ;
 1660+ $weeknum_this_year = int ($days_this_year / 7) + 1 ;
 1661+ $weeknum_since_2000 = $year . sprintf ("%02d",$weeknum_this_year) ; # * int ($days_since_2000 / 7) + 1 ;
 1662+
 1663+ $weeknum_this_years {"$weeknum_this_year - $weeknum_since_2000"}++ ;
 1664+
 1665+ $descriptions_per_period {$weeknum_since_2000} = "week $weeknum_this_year - " . month_english_short ($month-1) . " $year" ;
 1666+ $days_in_input_for_week {$weeknum_since_2000} {$yyyymmdd} ++ ;
 1667+
 1668+ $requests_all_per_period {$weeknum_since_2000} += $count ;
 1669+ $requests_per_week_per_country_code {$weeknum_since_2000} {"$country|$code"} += $count ;
 1670+
 1671+ if ($code eq "US")
 1672+ {$requests_per_week_us {$weeknum_since_2000} += $count ; }
 1673+
 1674+ # last if ($weeknum_since_2000 == 501) ; # test
 1675+ }
 1676+
 1677+ foreach $week (sort keys %weeknum_this_years)
 1678+ { print "week $week " . $weeknum_this_years {$week} . "\n" ; }
 1679+
 1680+ foreach $week (sort {$a <=> $b} keys %days_in_input_for_week)
 1681+ {
 1682+ @keys = keys %{$requests_per_week_per_country_code {$week-1}} ;
 1683+ if (@keys == 0)
 1684+ {
 1685+ # print "skip week $week: no data for previous week available.\n" ;
 1686+ next ;
 1687+ }
 1688+
 1689+ if ($requests_per_week_us {$week} > $max_requests_per_week_us)
 1690+ { $max_requests_per_week_us = $requests_per_week_us {$week} ; }
 1691+
 1692+ $desc= $week_descriptions {$week} ;
 1693+ @days = keys %{$days_in_input_for_week {$week}} ;
 1694+ $daycount = @days ;
 1695+ $missing_days {$week} = 7 - $daycount ;
 1696+ $correct_for_missing_days {$week} = 7 / $daycount ;
 1697+ # print "Week $week: $desc: $daycount " . (join ' - ', @days) . " ${correct_for_missing_days {$week}}\n" ;
 1698+ # foreach $country_code (keys %{$requests_per_week_per_country_code {$week}})
 1699+
 1700+ foreach $country_code (keys %country_codes_all)
 1701+ {
 1702+ $new = &CorrectForMissingDays ($week , ${$requests_per_week_per_country_code {$week }} {$country_code}) ;
 1703+ $old = &CorrectForMissingDays ($week-1, ${$requests_per_week_per_country_code {$week-1}} {$country_code}) ;
 1704+
 1705+ # print "country_code $country_code\n" ;
 1706+ if ($old == 0)
 1707+ {
 1708+ if ($new > 0)
 1709+ {
 1710+ # print "$country_code: no data for prev week\n" ;
 1711+ $changes_per_week_per_country_code {$week} {$country_code} = 100 ;
 1712+ }
 1713+ }
 1714+ else
 1715+ {
 1716+ $delta = sprintf ("%.1f", 100 * sqrt ($new / $old)) ;
 1717+ if ($delta < 0) { $delta = 0 ; }
 1718+ if ($delta > 200) { $delta = 200 ; }
 1719+ $changes_per_week_per_country_code {$week} {$country_code} = $delta ;
 1720+ $country_code =~ s/,/;/g ;
 1721+ push @trace, "$country_code, $week, $old, $new, $delta\n" ;
 1722+ }
 1723+
 1724+ }
 1725+ }
 1726+ open TRACE, '>', "svg/SquidReportPageViewsPerCountryTrend.csv" ;
 1727+ print TRACE sort @trace ;
 1728+ close TRACE ;
 1729+
 1730+ # die "\$connected_us == 0" if $connected_us == 0 ;
 1731+ if ($connected_us > 0)
 1732+ { $max_requests_per_connected_us_week = sprintf ("%.1f", (($max_requests_per_week_us * 1000) / $connected_us)) ; }
 1733+}
 1734+
 1735+sub NormalizeSquidInput
 1736+{
 1737+ my ($code,$language) = @_ ;
 1738+
 1739+ if ($language eq "jp") { $language = "ja" ; }
 1740+ if ($language eq "cz") { $language = "cs" ; }
 1741+
 1742+ # following are part of France, according to Wikipedia, List_of_countries_by_population
 1743+ if ($code eq 'BL') { $code = 'FR' ; } # Saint Barth�lemy
 1744+ if ($code eq 'MF') { $code = 'FR' ; } # Saint Martin
 1745+ if ($code eq 'MQ') { $code = 'FR' ; } # Martinique
 1746+ if ($code eq 'NC') { $code = 'FR' ; } # New Caledonia
 1747+ if ($code eq 'PF') { $code = 'FR' ; } # French Polynesia
 1748+ if ($code eq 'PM') { $code = 'FR' ; } # Saint Pierre and Miquelon
 1749+ if ($code eq 'WF') { $code = 'FR' ; } # Wallis and Futuna
 1750+ if ($code eq 'YT') { $code = 'FR' ; } # Mayotte
 1751+
 1752+ return ($code,$language) ;
 1753+}
 1754+
 1755+sub DiscardSquidInput
 1756+{
 1757+ ($bot,$project,$project_mode,$code,$language) = @_ ;
 1758+ if ($bot ne "U" or # user
 1759+ $project ne $project_mode or # eg 'wp'
 1760+ $language eq "upload" or
 1761+ $language =~ /mobile/i or
 1762+ $code eq "A1" or # Anonymous Proxy
 1763+ $code eq "A2" or # Satellite Provider
 1764+ $code eq "AP" or # Asia/Pacific Region
 1765+ $code eq "EU") # Europe
 1766+ {
 1767+ # print "bot $bot project '$project' project_mode $project_mode code $code language $language\n" ;
 1768+ return ($true) ;
 1769+ }
 1770+
 1771+ return ($false) ;
 1772+}
 1773+
 1774+sub GetCountryName
 1775+{
 1776+ my $code = shift ;
 1777+ if ($country_names {$code} eq "")
 1778+ {
 1779+ $country = "$code (?)" ;
 1780+ if ($country_code_not_specified_reported {$code}++ == 0)
 1781+ { print "Country name not specified for $code\n" ; }
 1782+ }
 1783+ else
 1784+ { $country = $country_names {$code} ; }
 1785+ return ($country) ;
 1786+}
 1787+
 1788+sub ReadInputBrowserLanguages
 1789+{
 1790+ my $file_csv = "$dir_process/$file_csv_browser_languages" ;
 1791+ if (! -e $file_csv)
 1792+ { abort ("Function ReadInputBrowserLanguages: file $file_csv not found!!! ") ; }
 1793+ open CSV_BROWSER_LANGUAGES, '<', $file_csv ;
 1794+ while ($line = <CSV_BROWSER_LANGUAGES>)
 1795+ {
 1796+ next if $line =~ /^#/ ; # comments
 1797+ next if $line =~ /^:/ ; # csv header (not a comment)
 1798+
 1799+ chomp $line ;
 1800+ ($browser,$language,$count) = split (',', $line) ;
 1801+
 1802+ $browser_languages {"$browser,$language"} += $count ;
 1803+ }
 1804+ close CSV_BROWSER_LANGUAGES ;
 1805+}
 1806+
 1807+sub CalcPercentages
 1808+{
 1809+ my $total_opsys = $total_opsys_mobile + $total_opsys_non_mobile ;
 1810+ foreach $key (keys %opsys)
 1811+ { $opsys_perc {$key} = sprintf ("%.2f",(100*$opsys {$key}/$total_opsys)) . "%" ; }
 1812+
 1813+ foreach $key (keys %clients)
 1814+ { $clients_perc {$key} = sprintf ("%.2f",(100*$clients {$key}/$total_clients)) . "%" ; }
 1815+
 1816+ foreach $key (keys %clientgroups)
 1817+ {
 1818+ $perc = 100*$clientgroups {$key}/$total_clients ;
 1819+ if ($key =~ /^M/)
 1820+ { $perc_threshold = 0.005 ; }
 1821+ else
 1822+ { $perc_threshold = 0.02 ; }
 1823+
 1824+ if ($perc > $perc_threshold)
 1825+ { $clientgroups_perc {$key} = sprintf ("%.2f",$perc) . "%" ; }
 1826+ else
 1827+ {
 1828+ ($mobile,$group) = split (',', $key) ;
 1829+ $clientgroups_other {$mobile} += $clientgroups {$key} ;
 1830+ $clientgroups {$key} = 0 ;
 1831+ }
 1832+ }
 1833+}
 1834+
 1835+sub NormalizeCounts
 1836+{
 1837+# ReadInputClients
 1838+ foreach $key (keys %engines)
 1839+ { $engines {$key} = &Normalize ($engines {$key}) ; }
 1840+
 1841+ foreach $key (keys %clientgroups)
 1842+ { $clientgroups {$key} = &Normalize ($clientgroups {$key}) ; }
 1843+
 1844+ foreach $key (keys %clients)
 1845+ { $clients {$key} = &Normalize ($clients {$key}) ; }
 1846+
 1847+ foreach $key (keys %clientgroups_other)
 1848+ { $clientgroups_other {$key} = &Normalize ($clientgroups_other {$key}) ; }
 1849+
 1850+ foreach $key (keys %total_clientgroups)
 1851+ { $total_clientgroups {$key} = &Normalize ($total_clientgroups {$key}) ; }
 1852+
 1853+ foreach $key (keys %total_engines)
 1854+ { $total_engines {$key} = &Normalize ($total_engines {$key}) ; }
 1855+
 1856+ foreach $key (keys %webkit_engines)
 1857+ { $webkit_engines {$key} = &Normalize ($webkit_engines {$key}) ; }
 1858+
 1859+ $total_clients = &Normalize ($total_clients) ;
 1860+ $total_clients_mobile = &Normalize ($total_clients_mobile) ;
 1861+ $total_clients_non_mobile = &Normalize ($total_clients_non_mobile) ;
 1862+
 1863+# ReadInputCrawlers
 1864+ foreach $key (keys %crawlers)
 1865+ { $crawlers {$key} = &Normalize ($crawlers {$key}) ; }
 1866+
 1867+ $total_page_crawlerrequests = &Normalize ($total_page_crawlerrequests) ;
 1868+
 1869+# ReadInputMethods
 1870+ foreach $key (keys %statusses)
 1871+ { $statusses {$key} = &Normalize ($statusses {$key}) ; }
 1872+ foreach $key (keys %methods)
 1873+ { $methods {$key} = &Normalize ($methods {$key}) ; }
 1874+
 1875+# ReadInputMimeTypes
 1876+ foreach $key (keys %mimetypes)
 1877+ { $mimetypes {$key} = &Normalize ($mimetypes {$key}) ; }
 1878+ foreach $key (keys %projects)
 1879+ { $projects {$key} = &Normalize ($projects {$key}) ; }
 1880+ foreach $key (keys %domains)
 1881+ { $domains {$key} = &Normalize ($domains {$key}) ; }
 1882+ foreach $key (keys %images_project)
 1883+ { $images_project {$key} = &Normalize ($images_project {$key}) ; }
 1884+ foreach $key (keys %images_domain)
 1885+ { $images_domain {$key} = &Normalize ($images_domain {$key}) ; }
 1886+ foreach $key (keys %mimetypes_found)
 1887+ { $mimetypes_found {$key} = &Normalize ($mimetypes_found {$key}) ; }
 1888+ foreach $key (keys %counts_pm)
 1889+ { $counts_pm {$key} = &Normalize ($counts_pm {$key}) ; }
 1890+ foreach $key (keys %counts_dm)
 1891+ { $counts_dm {$key} = &Normalize ($counts_dm {$key}) ; }
 1892+ foreach $key (keys %counts_prem)
 1893+ { $counts_prem {$key} = &Normalize ($counts_prem {$key}) ; }
 1894+
 1895+ $total_mimes = &Normalize ($total_mimes) ;
 1896+
 1897+# ReadInputOpSys
 1898+ foreach $key (keys %opsys)
 1899+ { $opsys {$key} = &Normalize ($opsys {$key}) ; }
 1900+
 1901+ $total_opsys_non_mobile = &Normalize ($total_opsys_non_mobile) ;
 1902+ $total_opsys_mobile = &Normalize ($total_opsys_mobile) ;
 1903+
 1904+# ReadInputOrigins
 1905+ foreach $key (keys %origin_int_top)
 1906+ { $origin_int_top {$key} = &Normalize ($origin_int_top {$key}) ; }
 1907+ foreach $key (keys %origin_int_top_split)
 1908+ { $origin_int_top_split {$key} = &Normalize ($origin_int_top_split {$key}) ; }
 1909+ foreach $key (keys %origin_ext_top)
 1910+ { $origin_ext_top {$key} = &Normalize ($origin_ext_top {$key}) ; }
 1911+ foreach $key (keys %origin_ext_top_split)
 1912+ { $origin_ext_top_split {$key} = &Normalize ($origin_ext_top_split {$key}) ; }
 1913+ foreach $key (keys %origin_ext_page_top)
 1914+ { $origin_ext_page_top {$key} = &Normalize ($origin_ext_page_top {$key}) ; }
 1915+ foreach $key (keys %project_int_top)
 1916+ { $project_int_top {$key} = &Normalize ($project_int_top {$key}) ; }
 1917+ foreach $key (keys %project_int_top_split)
 1918+ { $project_int_top_split {$key} = &Normalize ($project_int_top_split {$key}) ; }
 1919+
 1920+ $total_page_requests_external = &Normalize ($total_page_requests_external) ;
 1921+ $total_origins_external_counted = &Normalize ($total_origins_external_counted) ;
 1922+
 1923+# ReadInputScripts
 1924+ foreach $key (keys %actions)
 1925+ { $actions {$key} = &Normalize ($actions {$key}) ; }
 1926+ foreach $key (keys %parms)
 1927+ { $parms {$key} = &Normalize ($parms {$key}) ; }
 1928+ foreach $key (keys %scripts_php)
 1929+ { $scripts_php {$key} = &Normalize ($scripts_php {$key}) ; }
 1930+ foreach $key (keys %scripts_js)
 1931+ { $scripts_js {$key} = &Normalize ($scripts_js {$key}) ; }
 1932+ foreach $key (keys %scripts_css)
 1933+ { $scripts_css {$key} = &Normalize ($scripts_css {$key}) ; }
 1934+
 1935+# ReadInputGoogle
 1936+ foreach $key (keys %searches_service)
 1937+ { $searches_service {$key} = &Normalize ($searches_service {$key}) ; }
 1938+ foreach $key (keys %searches_crawlers)
 1939+ { $searches_crawlers {$key} = &Normalize ($searches_crawlers {$key}) ; }
 1940+ foreach $key (keys %searches_toplevel)
 1941+ { $searches_toplevel {$key} = &Normalize ($searches_toplevel {$key}) ; }
 1942+ foreach $key (keys %searches_toplevel_tld_found)
 1943+ { $searches_toplevel_tld_found {$key} = &Normalize ($searches_toplevel_tld_found {$key}) ; }
 1944+ foreach $key (keys %searches_service_mimecat)
 1945+ { $searches_service_mimecat {$key} = &Normalize ($searches_service_mimecat {$key}) ; }
 1946+ foreach $key (keys %searches_service_matches)
 1947+ { $searches_service_matches {$key} = &Normalize ($searches_service_matches {$key}) ; }
 1948+ foreach $key (keys %searches_toplevel_mimecat)
 1949+ { $searches_toplevel_mimecat {$key} = &Normalize ($searches_toplevel_mimecat {$key}) ; }
 1950+ foreach $key (keys %searches_mimecat_tld_not_found)
 1951+ { $searches_mimecat_tld_not_found {$key} = &Normalize ($searches_mimecat_tld_not_found {$key}) ; }
 1952+
 1953+# ReadInputSkins
 1954+ foreach $key (keys %skins)
 1955+ { $skins {$key} = &Normalize ($skins {$key}) ; }
 1956+ foreach $key (keys %skin_set)
 1957+ { $skin_set {$key} = &Normalize ($skin_set {$key}) ; }
 1958+
 1959+# ReadInputBrowserLanguages
 1960+ foreach $key (keys %browser_languages)
 1961+ { $browser_languages {$key} = &Normalize ($browser_languages {$key}) ; }
 1962+}
 1963+
 1964+sub SortCounts
 1965+{
 1966+# ReadInputClients
 1967+# @engines_sorted_count = keys_sorted_by_value_num_desc %engines ;
 1968+ @engines_sorted_alpha = keys_sorted_alpha_asc %engines ;
 1969+ @webkit_engines_sorted_alpha = keys_sorted_alpha_asc %webkit_engines ;
 1970+ @clientgroups_sorted_count = keys_sorted_by_value_num_desc %clientgroups ;
 1971+ @clientgroups_sorted_alpha = keys_sorted_alpha_asc %clientgroups ;
 1972+ @clients_sorted_count = keys_sorted_by_value_num_desc %clients ;
 1973+ @clients_sorted_alpha = keys_sorted_alpha_asc %clients ;
 1974+
 1975+# ReadInputCrawlers
 1976+# @crawlers_sorted_count = keys_sorted_by_value_num_desc %crawlers ;
 1977+# @crawlers_sorted_alpha = keys_sorted_alpha_asc %crawlers ;
 1978+
 1979+# ReadInputMethods
 1980+ @statusses_sorted_count = keys_sorted_by_value_num_desc %statusses ;
 1981+ @statusses_sorted_method = keys_sorted_alpha_desc %statusses ;
 1982+ @methods_sorted_count = keys_sorted_by_value_num_desc %methods ;
 1983+ @methods_sorted_method = keys_sorted_alpha_desc %methods ;
 1984+
 1985+# ReadInputMimeTypes
 1986+ @mimetypes_sorted = sort {&SortMime ($b) <=> &SortMime ($a)} keys %mimetypes ;
 1987+ @projects_sorted = keys_sorted_by_value_num_desc %projects ;
 1988+ @domains_sorted = keys_sorted_by_value_num_desc %domains ;
 1989+
 1990+# ReadInputOpSys
 1991+ @opsys_sorted_alpha = sort {lc($a) cmp lc($b)} keys %opsys ;
 1992+ @opsys_sorted_count = keys_sorted_by_value_num_desc %opsys ;
 1993+
 1994+# ReadInputOrigins
 1995+ @origin_int_top_sorted_alpha = keys_sorted_alpha_desc %origin_int_top ;
 1996+ @origin_ext_top_sorted_alpha = keys_sorted_alpha_desc %origin_ext_top ;
 1997+ @origin_ext_page_top_sorted_alpha = keys_sorted_alpha_desc %origin_ext_page_top ;
 1998+ @origin_int_top_sorted_count = keys_sorted_by_value_num_desc %origin_int_top ;
 1999+ @origin_ext_top_sorted_count = keys_sorted_by_value_num_desc %origin_ext_top ;
 2000+ @origin_ext_page_top_sorted_count = keys_sorted_by_value_num_desc %origin_ext_page_top ;
 2001+
 2002+ @project_int_top_sorted_alpha = keys_sorted_alpha_desc %project_int_top ;
 2003+ @project_int_top_sorted_count = keys_sorted_by_value_num_desc %project_int_top ;
 2004+
 2005+# ReadInputScripts
 2006+ @parms_sorted_count = keys_sorted_by_value_num_desc %parms ;
 2007+ @parms_sorted_script = keys_sorted_alpha_desc %parms ;
 2008+
 2009+ @scripts_php_sorted_count = keys_sorted_by_value_num_desc %scripts_php ;
 2010+ @scripts_php_sorted_script = keys_sorted_alpha_asc %scripts_php ;
 2011+ @scripts_js_sorted_count = keys_sorted_by_value_num_desc %scripts_js ;
 2012+ @scripts_js_sorted_script = keys_sorted_alpha_asc %scripts_js ;
 2013+ @scripts_css_sorted_count = keys_sorted_by_value_num_desc %scripts_css ;
 2014+ @scripts_css_sorted_script = keys_sorted_alpha_asc %scripts_css ;
 2015+
 2016+# ReadInputGoogle
 2017+ @searches_service_count = keys_sorted_by_value_num_desc %searches_service ;
 2018+ @searches_service_alpha = keys_sorted_alpha_desc %searches_service ;
 2019+ @searches_toplevel_count = keys_sorted_by_value_num_desc %searches_toplevel_tld_found ;
 2020+ @searches_toplevel_alpha = keys_sorted_alpha_asc %searches_toplevel_tld_found ;
 2021+ @searches_service_matches_alpha = keys_sorted_alpha_asc %searches_service_matches ;
 2022+
 2023+# ReadInputSkins
 2024+ @skins_sorted_skin = keys_sorted_alpha_asc %skins ;
 2025+}
 2026+
 2027+sub WriteReportClients
 2028+{
 2029+ open FILE_HTML_CLIENTS, '>', "$dir_reports/$file_html_clients" ;
 2030+
 2031+ $html = $header ;
 2032+ $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Browsers e.a./ ;
 2033+ $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Browsers e.a./ ;
 2034+ $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
 2035+ $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $dummy_browsers \/ $link_google/ ;
 2036+ $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;
 2037+
 2038+ $html .= "<table border=1>\n" ;
 2039+ $html .= "<tr><td class=l colspan=99 wrap>The following overview of page requests per client (~browser) application is based on the <a href='http://en.wikipedia.org/wiki/User_agent'>user agent</a> information that accompanies most server requests.<br>" .
 2040+ "Please note that agent information does not follow strict guidelines and some programs may provide wrong information on purpose.<br>" .
 2041+ "This report ignores all requests where agent information is missing, or contains any of the following: bot, crawl(er) or spider.<p>" .
 2042+ "<b>Recommended reading:</b> <a href='http://en.wikipedia.org/wiki/Usage_share_of_web_browsers'>Wikipedia article</a> on usage share of web browsers and measurement methodology." .
 2043+ "</td></tr>\n" ;
 2044+
 2045+ # CLIENTS SORTED BY FREQUENCY
 2046+ $html .= "<tr><td width=50% valign=top>" ;
 2047+ $html .= "<table border=1 width=100%>\n" ;
 2048+ $html .= "<tr><th colspan=99 class=l><h3>In order of popularity</h3></th></tr>\n" ;
 2049+
 2050+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browsers, non mobile</th></tr>\n" ;
 2051+ $perc_total = 0 ;
 2052+ foreach $key (@clientgroups_sorted_count)
 2053+ {
 2054+ $count = $clientgroups {$key} ;
 2055+
 2056+ next if $count == 0 ;
 2057+
 2058+ $perc = $clientgroups_perc {$key} ;
 2059+ ($mobile,$group) = split (',', $key) ;
 2060+
 2061+ next if $mobile ne '-' ;
 2062+
 2063+ $count = &FormatCount ($count) ;
 2064+ $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2065+ $perc =~ s/\%// ;
 2066+ $perc_total += $perc ;
 2067+ }
 2068+
 2069+ $perc = ".." ;
 2070+ $count = $clientgroups_other {'-'} ;
 2071+ if ($total_clientgroups {'-'} + $total_clientgroups {'M'} > 0)
 2072+ {
 2073+ $perc = sprintf ("%.2f", 100 * $clientgroups_other {'-'} / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ;
 2074+ $perc_total += $perc ;
 2075+ }
 2076+ $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ;
 2077+
 2078+ $total = &FormatCount ($total_clientgroups {'-'}) ;
 2079+ $perc_total = sprintf ("%.1f", $perc_total) ;
 2080+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ;
 2081+
 2082+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browsers, mobile</th></tr>\n" ;
 2083+ foreach $key (@clientgroups_sorted_count)
 2084+ {
 2085+ $count = $clientgroups {$key} ;
 2086+
 2087+ next if $count == 0 ;
 2088+
 2089+ $perc = $clientgroups_perc {$key} ;
 2090+ ($mobile,$group) = split (',', $key) ;
 2091+
 2092+ next if $mobile ne 'M' ;
 2093+
 2094+ $count = &FormatCount ($count) ;
 2095+ $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2096+ $perc =~ s/\%// ;
 2097+ }
 2098+ $count = $clientgroups_other {'M'} ;
 2099+
 2100+ $perc = ".." ;
 2101+ if ($total_clientgroups {'-'} + $total_clientgroups {'M'} > 0)
 2102+ { $perc = sprintf ("%.2f", 100 * $count / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ; }
 2103+
 2104+ $perc_total = sprintf ("%.1f", (100 - $perc_total)) ;
 2105+ $total = &FormatCount ($total_clientgroups {'M'}) ;
 2106+ $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ;
 2107+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ;
 2108+
 2109+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browser versions, non mobile</th></tr>\n" ;
 2110+
 2111+ foreach $key (@clients_sorted_count)
 2112+ {
 2113+ $count = $clients {$key} ;
 2114+ ($rectype, $client) = split (',', $key,2) ;
 2115+
 2116+ next if $rectype ne '-' ; # group
 2117+
 2118+ $perc = $clients_perc {$key} ;
 2119+
 2120+ next if $perc lt "0.02%" ;
 2121+
 2122+ $count = &FormatCount ($count) ;
 2123+ $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2124+ $perc =~ s/\%// ;
 2125+ }
 2126+ $total = &FormatCount ($total_clients_non_mobile) ;
 2127+
 2128+ $perc_total = sprintf ("%.1f", (100 - $perc_total)) ;
 2129+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ;
 2130+
 2131+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browser versions, mobile</th></tr>\n" ;
 2132+ foreach $key (@clients_sorted_count)
 2133+ {
 2134+ $count = $clients {$key} ;
 2135+ ($rectype, $client) = split (',', $key,2) ;
 2136+
 2137+ next if $rectype ne 'M' ; # group
 2138+
 2139+ $perc = $clients_perc {$key} ;
 2140+
 2141+ next if $perc lt "0.02%" ;
 2142+
 2143+ $count = &FormatCount ($count) ;
 2144+ $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2145+ }
 2146+ $total = &FormatCount ($total_clients_mobile) ;
 2147+ $perc = sprintf ("%.1f", (100 - $perc_total)) ;
 2148+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc\%</th></tr>\n" ;
 2149+
 2150+ $html .= "</table>\n" ;
 2151+
 2152+ # CLIENTS In alphabetical order
 2153+ $html .= "</td><td width=50% valign=top>" ;
 2154+ $html .= "<table border=1 width=100%>\n" ;
 2155+ $html .= "<tr><th colspan=99 class=l><h3>In alphabetical order</h3></th></tr>\n" ;
 2156+
 2157+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browsers, non mobile</th></tr>\n" ;
 2158+ $perc_total = 0 ;
 2159+ foreach $key (@clientgroups_sorted_alpha)
 2160+ {
 2161+ $count = $clientgroups {$key} ;
 2162+
 2163+ next if $count == 0 ;
 2164+
 2165+ $perc = $clientgroups_perc {$key} ;
 2166+ ($mobile,$group) = split (',', $key) ;
 2167+
 2168+ next if $mobile ne '-' ;
 2169+
 2170+ $count = &FormatCount ($count) ;
 2171+ $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2172+ $perc =~ s/\%// ;
 2173+ $perc_total += $perc ;
 2174+ }
 2175+
 2176+ $count = $clientgroups_other {'-'} ;
 2177+ $total = &FormatCount ($total_clientgroups {'-'}) ;
 2178+ $perc = ".." ;
 2179+ if ($total_clientgroups {'-'} + $total_clientgroups {'M'} > 0)
 2180+ { $perc = sprintf ("%.2f", 100 * $count / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ; }
 2181+ $perc_total += $perc ;
 2182+ $perc_total = sprintf ("%.1f", $perc_total) ;
 2183+ $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ;
 2184+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ;
 2185+
 2186+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browsers, mobile</th></tr>\n" ;
 2187+ foreach $key (@clientgroups_sorted_alpha)
 2188+ {
 2189+ $count = $clientgroups {$key} ;
 2190+
 2191+ next if $count == 0 ;
 2192+
 2193+ $perc = $clientgroups_perc {$key} ;
 2194+ ($mobile,$group) = split (',', $key) ;
 2195+
 2196+ next if $mobile ne 'M' ;
 2197+
 2198+ $count = &FormatCount ($count) ;
 2199+ $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2200+ $perc =~ s/\%// ;
 2201+ }
 2202+ $count = $clientgroups_other {'M'} ;
 2203+ $total = &FormatCount ($total_clientgroups {'M'}) ;
 2204+ $perc = sprintf ("%.2f", 100 * $count / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ;
 2205+ $perc_total = sprintf ("%.1f", (100 - $perc_total)) ;
 2206+ $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ;
 2207+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ;
 2208+
 2209+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browser versions, non mobile</th></tr>\n" ;
 2210+
 2211+ foreach $key (@clients_sorted_alpha)
 2212+ {
 2213+ $count = $clients {$key} ;
 2214+ ($rectype, $client) = split (',', $key,2) ;
 2215+
 2216+ next if $rectype ne '-' ; # group
 2217+
 2218+ $perc = $clients_perc {$key} ;
 2219+
 2220+ next if $perc lt "0.02%" ;
 2221+
 2222+ $count = &FormatCount ($count) ;
 2223+ $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2224+ }
 2225+ $total = &FormatCount ($total_clients_non_mobile) ;
 2226+ $perc = sprintf ("%.1f",100*$total_clients_non_mobile / ($total_clients_mobile + $total_clients_non_mobile)) ;
 2227+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc\%</th></tr>\n" ;
 2228+
 2229+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browser versions, mobile</th></tr>\n" ;
 2230+ foreach $key (@clients_sorted_alpha)
 2231+ {
 2232+ $count = $clients {$key} ;
 2233+ ($rectype, $client) = split (',', $key,2) ;
 2234+
 2235+ next if $rectype ne 'M' ; # group
 2236+
 2237+ $perc = $clients_perc {$key} ;
 2238+
 2239+ next if $perc lt "0.02%" ;
 2240+
 2241+ $count = &FormatCount ($count) ;
 2242+ $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
 2243+ }
 2244+ $total = &FormatCount ($total_clients_mobile) ;
 2245+ $perc = sprintf ("%.1f",100*$total_clients_mobile / ($total_clients_mobile + $total_clients_non_mobile)) ;
 2246+ $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc\%</th></tr>\n" ;
 2247+
 2248+ $html .= "<tr><th colspan=99 class=l>&nbsp;<br>Browser engines</th></tr>\n" ;
 2249+
 2250+ $engine_prev = "" ;
 2251+ foreach $engine (@webkit_engines_sorted_alpha)
 2252+ {
 2253+ $total = $webkit_engines {$engine} ;
 2254+
 2255+ next if $total < 5 ;
 2256+
 2257+ $engine2 = $engine ;
 2258+ $engine2 =~ s/\/.*$// ;
 2259+ $engine2 =~ s/ .*$// ;
 2260+ if (($engine2 ne $engine_prev) && ($engine_prev ne ""))
 2261+ {
 2262+ $total_engine = $total_engines {$engine_prev} ;
 2263+ $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ;
 2264+ $total_engine = &FormatCount ($total_engine) ;
 2265+ $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ;
 2266+ }
 2267+ $engine_prev = $engine2 ;
 2268+ $total = &FormatCount ($total) ;
 2269+ $html .= "<tr><td class=l>$engine</td><td class=r>$total</td><td class=r>&nbsp;</td></tr>\n" ;
 2270+ }
 2271+ $total_engine = $total_engines {$engine_prev} ;
 2272+ $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ;
 2273+ $total_engine = &FormatCount ($total_engine) ;
 2274+ $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ;
 2275+
 2276+ $engine_prev = "" ;
 2277+ foreach $engine (@engines_sorted_alpha)
 2278+ {
 2279+ $total = $engines {$engine} ;
 2280+
 2281+ next if $total < 5 ;
 2282+
 2283+ $engine2 = $engine ;
 2284+ $engine2 =~ s/\/.*$// ;
 2285+ $engine2 =~ s/ .*$// ;
 2286+ if (($engine2 ne $engine_prev) && ($engine_prev ne ""))
 2287+ {
 2288+ $total_engine = $total_engines {$engine_prev} ;
 2289+ $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ;
 2290+ $total_engine = &FormatCount ($total_engine) ;
 2291+ $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ;
 2292+ }
 2293+ $engine_prev = $engine2 ;
 2294+ $total = &FormatCount ($total) ;
 2295+ $html .= "<tr><td class=l>$engine</td><td class=r>$total</td><td class=r>&nbsp;</td></tr>\n" ;
 2296+ }
 2297+ $total_engine = $total_engines {$engine_prev} ;
 2298+ $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ;
 2299+ $total_engine = &FormatCount ($total_engine) ;
 2300+ $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ;
 2301+
 2302+ $html .= "</table>\n" ;
 2303+ $html .= "</td></tr>\n" ;
 2304+
 2305+ $html .= "<tr><td colspan=99 class=l wrap>Requests from mobile devices are recognized as follows:<br>" .
 2306+ "Agent string contains any of the following terms (last upd: $month_upd_keywords_mobile):<br>" .
 2307+ "<i>$keywords_mobile</i></td></tr>" ;
 2308+
 2309+ $html .= "</table>\n" ;
 2310+
 2311+# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ;
 2312+ $html .= $colophon ;
 2313+
 2314+ print FILE_HTML_CLIENTS $html ;
 2315+ close FILE_HTML_CLIENTS ;
 2316+}
 2317+
 2318+sub WriteReportCrawlers
 2319+{
 2320+ open FILE_HTML_CRAWLERS, '>', "$dir_reports/$file_html_crawlers" ;
 2321+
 2322+ $html = $header ;
 2323+ $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Crawler requests/ ;
 2324+ $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Crawler requests/ ;
 2325+ $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
 2326+ $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $dummy_crawlers \/ $link_opsys \/ $dummy_browsers \/ $link_google/ ;
 2327+ $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;
 2328+
 2329+ $html .= "<table border=1>\n" ;
 2330+ $html .= "<tr><td class=l colspan=99>The following overview of crawler (aka bot) page requests is based on the <a href='http://en.wikipedia.org/wiki/User_agent'>user agent</a> information that accompanies most server requests." .
 2331+ " Unfortunately this user agent information follows rather loosely defined guidelines." .
 2332+ "<br>Also please bear in mind than the most popular crawler names may be somewhat overrepresented." .
 2333+ " This is the result of so called <i>user agent spoofing</i> (where a requester supplies false credentials, e.g. to bypass web servers filters)." .
 2334+ "<br>GoogleBot seems to be a favorite for spoofing. Therefore requests from an ip address registered by Google (see below) are color coded <b><font color=green>GoogleBot</font></b>, others <b><font color=red>GoogleBot</font></b>" .
 2335+ "<p>For this report page requests are considered to be issued by a crawler in two cases:" .
 2336+ "<br>1 The user agent string contains a web address (only crawlers should have that, but there a some false positives, " .
 2337+ " where a browser sends a user agent string with a web address (ill behaved plug-in, main offenders have been eliminated)" .
 2338+ "<br>2 The user agent string contains the term bot, spider or crawl[er]'" .
 2339+ "PERC_GOOGLE\n" .
 2340+ "</td></tr>\n" ;
 2341+
 2342+ $total_crawlers = 0 ;
 2343+# $html .= "<tr><th class=l>Count<br><small>x 1000</small></th><th class=l>Secondary domain<br>(~site) name</th><th class=l>Mime type</th><th class=l>User agent</th></tr>\n" ;
 2344+ foreach $mime_agent (keys_sorted_by_value_num_desc %crawlers)
 2345+ {
 2346+ $count = $crawlers {$mime_agent} ;
 2347+ ($mime, $agent) = split ('\|', $mime_agent,2) ;
 2348+ $agent =~ s/([^,;\(\)\s]+?\@[^,;\(\)\s]+)/ <font color=#808080>mail address<\/font> /g ;
 2349+ $agent =~ s/([\w-]+\s*.?at.?\s*[\w-]+\s*.?dot.?\s*[\w-]+)/ <font color=#808080>mail address<\/font> /gi ;
 2350+ $site = "-" ;
 2351+ if ($agent =~ /http:/)
 2352+ {
 2353+ $site = $agent ;
 2354+ $site =~ s/^.*?http:/http:/ ;
 2355+ $site =~ s/&gt;/>/gi ;
 2356+ $site =~ s/&lt;/</gi ;
 2357+ $site =~ s/^(.*?)[,;\)\<\>\s)].*$/$1/ ;
 2358+ }
 2359+ $agent =~ s/\Q$site\E/<b>$site<\/b>/ ;
 2360+ # $agent =~ s/\Q$site\E// ;
 2361+
 2362+ $secondary_domain = &GetSecondaryDomain ($site) ;
 2363+ if (($secondary_domain eq "google") and ($agent =~ /color=red>GoogleBot</))
 2364+ { $secondary_domain .= "?" ; }
 2365+
 2366+ $secondary_domains {$secondary_domain} += $count ;
 2367+
 2368+ if ($secondary_domain ne "-")
 2369+ { $crawlers_per_domain {$secondary_domain} {$mime_agent} += $count ; }
 2370+ else
 2371+ {
 2372+ $crawlers_no_url {$agent} {$mime} += $count ;
 2373+ $crawlers_no_url_agent {$agent} += $count ;
 2374+ }
 2375+
 2376+ $total_crawlers += $count ;
 2377+
 2378+ next if $count <= 2 ;
 2379+
 2380+ # $count = &FormatCount ($count) ;
 2381+ # $html .= "<tr><td class=r>$count</td><td class=l><a href='$site'>$secondary_domain</a></td><td class=l>$mime</td><td class=l>$agent</td></tr>\n" ;
 2382+ # $rows++ ;
 2383+ }
 2384+
 2385+ $perc_crawlers = ".." ;
 2386+ if ($total_page_requests_external > 0)
 2387+ { $perc_crawlers = sprintf ("%.1f",100 * $total_page_crawlerrequests/$total_page_requests_external) ; }
 2388+
 2389+ $total_page_requests_external2 = &FormatCount ($total_page_requests_external*1000) ;
 2390+ $total_page_crawlerrequests2 = &FormatCount ($total_page_crawlerrequests*1000) ;
 2391+ $html =~ s/PERC_GOOGLE/<p>In total $total_page_crawlerrequests2 page requests (mime type <a href='SquidReportRequests.htm'>text\/html<\/a> only!) per day are considered crawler requests, out of $total_page_requests_external2 external requests, which is $perc_crawlers%/ ;
 2392+
 2393+ $total_crawlers = &FormatCount ($total_crawlers) ;
 2394+# $html .= "<tr><th class=l>$total_crawlers</th><th class=l colspan=2>total</th></tr>\n" ;
 2395+# $html .= "</table><p>\n" ;
 2396+
 2397+# $html .= "<table border=1>\n" ;
 2398+# $html .= "<tr><th class=l colspan=99>Top 25 secondary domains<br>(~ sites) mentioned</th></tr>\n" ;
 2399+# foreach $secondary_domain (keys_sorted_by_value_num_desc %secondary_domains)
 2400+# {
 2401+# next if $secondary_domain eq ".." ;
 2402+# last if ++$secondary_domains_listed > 25 ;
 2403+#
 2404+# $count = $secondary_domains {$secondary_domain} ;
 2405+# $count = &FormatCount ($count) ;
 2406+# $html .= "<tr><td class=r>$count</td><td class=l colspan=2>$secondary_domain</td></tr>\n" ;
 2407+# }
 2408+# $html .= "</table>\n" ;
 2409+
 2410+ $html .= "<tr><th class=lh3 colspan=99>Page requests for crawlers that specify a url in the agent string</th></tr>\n" ;
 2411+ $html .= "<tr><th class=l>Count<br><small>x 1000</small></th><th class=l>Secondary domain<br>(~site) name</th><th class=l>URL</th><th class=l>Mime type</th><th class=l>User agent</th></tr>\n" ;
 2412+ foreach $secondary_domain (keys_sorted_by_value_num_desc %secondary_domains)
 2413+ {
 2414+ next if $secondary_domain eq "-" ;
 2415+
 2416+ $total = $secondary_domains {$secondary_domain} ;
 2417+ $total_crawlers_url += $total ;
 2418+
 2419+ last if $total < 10 ;
 2420+
 2421+ $total = &FormatCount ($total) ;
 2422+ $html .= "<tr><th class=r>$total</th><th class=l colspan=99>$secondary_domain</th></tr>\n" ;
 2423+ foreach $mime_agent (sort {$crawlers_per_domain {$secondary_domain} {$b} <=> $crawlers_per_domain {$secondary_domain} {$a}} keys %{$crawlers_per_domain {$secondary_domain}})
 2424+ {
 2425+ ($mime, $agent) = split ('\|', $mime_agent,2) ;
 2426+ $agent =~ s/([^,;\(\)\s]+?\@[^,;\(\)\s]+)/ <font color=#808080>mail address<\/font> /g ;
 2427+ $agent =~ s/([\w-]+\s*.?at.?\s*[\w-]+\s*.?dot.?\s*[\w-]+)/ <font color=#808080>mail address<\/font> /gi ;
 2428+ $site = "-" ;
 2429+ if ($agent =~ /http:/)
 2430+ {
 2431+ $site = $agent ;
 2432+ $site =~ s/^.*?http:/http:/ ;
 2433+ $site =~ s/&gt;/>/gi ;
 2434+ $site =~ s/&lt;/</gi ;
 2435+ $site =~ s/^(.*?)[,;\)\<\>\s)].*$/$1/ ;
 2436+ }
 2437+ # $agent =~ s/\Q$site\E/<b>$site<\/b> <a href='$site'>x<\/a>/ ;
 2438+ if ($site ne "-")
 2439+ { $agent =~ s/\Q$site\E/<b>url<\/b>/ ; }
 2440+ $count = $crawlers_per_domain {$secondary_domain} {$mime_agent} ;
 2441+
 2442+ next if $count <= 2 ;
 2443+
 2444+ # print "[$secondary_domain] [$mime_agent] : $count\n" ;
 2445+ $count = &FormatCount ($count) ;
 2446+ ($site2 = $site) =~ s/^http:\/\/// ;
 2447+ $html .= "<tr><td class=r>$count</td><td class=l>&nbsp;</td><td class=l><a href='$site' ref='nofollow'>$site2<\/a></td><td class=l>$mime</td><td class=l>$agent</td></tr>\n" ;
 2448+ $rows++ ;
 2449+ }
 2450+ }
 2451+ $total_crawlers_url = &FormatCount ($total_crawlers_url) ;
 2452+ $html .= "<tr><th class=l>$total_crawlers_url</th><th class=l colspan=99>total</th></tr>\n" ;
 2453+ $html .= "</table><p>\n" ;
 2454+
 2455+ $total_crawlers_no_url = 0 ;
 2456+ $html .= "<table border=1>\n" ;
 2457+ $html .= "<tr><th class=lh3 colspan=99>Page requests for probable crawlers, recognized by keyword</th></tr>\n" ;
 2458+ $html .= "<tr><th class=l width=40>Count<br><small>x 1000</small></th><th class=l colspan=99>Agent string</th></tr>\n" ;
 2459+ $html .= "<tr><th class=l width=40>&nbsp;</td><th class=l width=40>&nbsp;</td><th class=l>Mime type (count &ge; 3)</th></tr>\n" ;
 2460+ foreach $agent (keys_sorted_by_value_num_desc %crawlers_no_url_agent)
 2461+ {
 2462+ $total = $crawlers_no_url_agent {$agent} ;
 2463+ $total_crawlers_no_url += $total ;
 2464+
 2465+ last if $total < 3 ;
 2466+
 2467+ $total = &FormatCount ($total) ;
 2468+ $html .= "<tr><th class=r>$total</th><td class=l colspan=99>$agent</td></tr>\n" ;
 2469+ foreach $mime (sort {$crawlers_no_url {$agent} {$b} <=> $crawlers_no_url {$agent} {$a}} keys %{$crawlers_no_url {$agent}})
 2470+ {
 2471+ $agent =~ s/([^,;\(\)\s]+?\@[^,;\(\)\s]+)/ <font color=#808080>mail address<\/font> /g ;
 2472+ $agent =~ s/([\w-]+\s*.?at.?\s*[\w-]+\s*.?dot.?\s*[\w-]+)/ <font color=#808080>mail address<\/font> /gi ;
 2473+ $count = $crawlers_no_url {$agent} {$mime} ;
 2474+ $count = &FormatCount ($count) ;
 2475+ ($site2 = $site) =~ s/^http:\/\/// ;
 2476+ $html .= "<tr><td class=r>$count</td><td>&nbsp;</td><td class=l colspan=99>$mime</td></tr>\n" ;
 2477+ $rows++ ;
 2478+ }
 2479+ }
 2480+
 2481+ $total_crawlers_no_url = &FormatCount ($total_crawlers_no_url) ;
 2482+ $html .= "<tr><th class=l>$total_crawlers_no_url</th><th class=l colspan=99>total</th></tr>\n" ;
 2483+ $html .= "</table><p>\n" ;
 2484+
 2485+ $html .= "<p>$google_ip_ranges" ;
 2486+ $html .= $colophon ;
 2487+
 2488+ print FILE_HTML_CRAWLERS $html ;
 2489+ close FILE_HTML_CRAWLERS ;
 2490+}
 2491+
# Writes the "Request Methods" report to $dir_reports/$file_html_methods.
#
# The page holds two side-by-side tables with the same content:
#   left  - methods (@methods_sorted_count) and method+result combinations
#           (@statusses_sorted_count) in order of request volume
#   right - the same data via @methods_sorted_method / @statusses_sorted_method
# Counts come from %methods and %statusses and are rendered with FormatCount.
# Keys of %statusses are split on the first comma into method and result.
#
# Takes no arguments. All input is read from package globals, and the scratch
# variables used here ($html, $rows, $total, $method, $result, ...) are package
# globals too, like elsewhere in this file.
sub WriteReportMethods
{
  my $path_report = "$dir_reports/$file_html_methods" ;
  if (! open (FILE_HTML_METHODS, '>', $path_report))
  {
    # Without this check a failed open would go unnoticed and no report would be written.
    warn "WriteReportMethods: cannot open '$path_report' for writing: $!\n" ;
    return ;
  }

  # Fill in the placeholders of the shared page header.
  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Request Methods/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Request Methods/ ;
  $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $dummy_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  # Outer layout table: one cell per inner table.
  $html .= "<table border=0>\n" ;
  $html .= "<tr><td>" ;

  # LEFT TABLE: IN ORDER OF REQUEST VOLUME
  $html .= "<table border=1>\n" ;
  $html .= "<tr><th colspan=99 class=l><h3>In order of request volume</h3></th></tr>\n" ;
  $html .= "<tr><th colspan=2 class=l>Method</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ;
  $total_methods = 0 ;
  foreach $method (@methods_sorted_count)
  {
    $total = $methods {$method} ;
    $total_methods += $total ;        # accumulate the raw value before formatting
    $total = &FormatCount ($total) ;
    $html .= "<tr><td colspan=2 class=l>$method</td><td class=r>$total</td></tr>\n" ;
  }
  $total_methods = &FormatCount ($total_methods) ; # formatted once, reused by the right-hand table
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_methods</th></tr>\n" ;
  $html .= "<tr><td colspan=99>&nbsp;</td></tr>\n" ;
  # Fixed: the first cell used to open with <td> but close with </th>.
  $html .= "<tr><th class=l>Method</th><th class=l>Result</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $total_statusses = 0 ;
  foreach $status (@statusses_sorted_count)
  {
    $total = $statusses {$status} ;
    $total_statusses += $total ;
    $total = &FormatCount ($total) ;
    ($method,$result) = split (',', $status, 2) ;

    $html .= "<tr><td class=l>$method</td><td class=l>$result</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;
  }
  $total_statusses = &FormatCount ($total_statusses) ; # formatted once, reused by the right-hand table
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_statusses</th></tr>\n" ;
  $html .= "</table>\n" ;

  $html .= "</td><td>&nbsp;&nbsp;&nbsp;</td><td>" ;

  # RIGHT TABLE: IN ALPHABETICAL ORDER
  $html .= "<table border=1>\n" ;
  $html .= "<tr><th colspan=99 class=l><h3>In alphabetical order: method+result</h3></th></tr>\n" ;
  $html .= "<tr><th colspan=2 class=l>Method</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ; # restart: the footer reports the method+result rows of one table only
  foreach $method (@methods_sorted_method)
  {
    $total = &FormatCount ($methods {$method}) ;
    $html .= "<tr><td colspan=2 class=l>$method</td><td class=r>$total</td></tr>\n" ;
  }
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_methods</th></tr>\n" ;
  $html .= "<tr><td colspan=99>&nbsp;</td></tr>\n" ;
  $html .= "<tr><th class=l>Method</th><th class=l>Result</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  foreach $status (@statusses_sorted_method)
  {
    $total = &FormatCount ($statusses {$status}) ;
    ($method,$result) = split (',', $status, 2) ;

    $html .= "<tr><td class=l>$method</td><td class=l>$result</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;
  }
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_statusses</th></tr>\n" ;
  $html .= "</table>\n" ;

  $html .= "</td></tr></table>\n" ;
  $html .= "&nbsp;<small>$rows rows written</small><p>" ;

# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ;
  $html .= $colophon ;

  print FILE_HTML_METHODS $html ;
  close FILE_HTML_METHODS ;
}
 2571+
# Writes the "Requests by destination" report to $dir_reports/$file_html_requests.
#
# One table with a column per mime type from @mimetypes_sorted (mime types
# whose %mimetypes_found value is below $threshold_mime get no column):
#   - one row per domain              (@domains_sorted,  %domains,  %counts_dm)
#   - a totals row                    (%mimetypes, $total_mimes)
#   - one row per project/language    (@projects_sorted, %projects, %counts_pm),
#     limited to the first 50 entries
# followed by a plain list of the mime types that fell below the threshold.
#
# Takes no arguments. All input is read from package globals, and the scratch
# variables used here ($html, $rows, $columns, $count, ...) are package globals
# too, like elsewhere in this file.
sub WriteReportMimeTypes
{
  my $path_report = "$dir_reports/$file_html_requests" ;
  if (! open (FILE_HTML_REQUESTS, '>', $path_report))
  {
    # Without this check a failed open would go unnoticed and no report would be written.
    warn "WriteReportMimeTypes: cannot open '$path_report' for writing: $!\n" ;
    return ;
  }

  # Colours a cell by its position in @mimetypes_sorted, matching the header
  # groups in $header1: position 1 green ("Pages"), positions 2-4 red ("Images").
  # NOTE(review): this presumes the sort order puts text/html first and three
  # image types next - confirm where @mimetypes_sorted is built.
  # Fixed: the closing tag used to be emitted as "</font" without the final ">".
  my $colorize = sub
  {
    my ($text, $column) = @_ ;
    return "<font color=#008000>$text</font>" if $column == 1 ;
    return "<font color=#900000>$text</font>" if ($column >= 2) && ($column <= 4) ;
    return $text ;
  } ;

  # Fill in the placeholders of the shared page header.
  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by destination/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Requests by destination/ ;
  $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
  $html =~ s/NOTES/<br>&nbsp;This report shows where requests are sent to. Report 'Requests by origin' shows where requests come from.<br>&nbsp;Those numbers bear no direct relation.<br>/ ;
  $html =~ s/LINKS/$dummy_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html .= "<table border=1>\n" ;

  # COLUMN HEADERS
  $header1 = "<tr><th colspan=2 class=l><small>x 1000</small></th><th colspan=2 class=c>Totals</th><th class=c><font color=#008000>Pages</font></th><th colspan=3 class=c><font color=#900000>Images</font></th><th colspan=99 class=c>Other</th></tr>\n" ;
  $header2 = "<tr><th colspan=2 class=l>&nbsp;</th><th class=c>total<br>all</th><th class=c><font color=#900000>total<br>images</font></th>\n" ;
  $columns = 0 ;
  foreach $mimetype (@mimetypes_sorted)
  {
    $columns++ ; # counted before the threshold test, so skipped mime types still occupy a position

    next if $mimetypes_found {$mimetype} < $threshold_mime ;

    $mimetype2 = $mimetype ;
    if ($mimetype2 eq "text/html")
    { $mimetype2 .= "<br><small>(page)</small> " ; }
    if ($mimetype2 =~ /image\/(?:png|jpeg|gif)/)
    { $mimetype2 .= "<br><small>(img)</small> " ; }
    $mimetype2 = $colorize->($mimetype2, $columns) ;

    # Split on the first slash so type and subtype are shown on separate lines.
    ($mime1,$mime2) = split ('\/', $mimetype2, 2) ;
    $header2 .= "<th class=c>$mime1<br>$mime2</th>\n" ;
  }
  $header2 .= "</tr>\n" ;
  $html .= $header1 . $header2 ;

  # ROWS PER DOMAIN
  $rows = 0 ;
  $total_mimes2 = 0 ;
  $total_images1 = 0 ;
  foreach $domain (@domains_sorted)
  {
    $html .= "<tr><td colspan=2 class=l>" . ucfirst($domain) . "</td>\n" ;
    $total = $domains {$domain} ;
    $total_mimes2 += $total ;
    $total = &FormatCount ($total) ;
    $total_images = $images_domain {$domain} ;
    $total_images1 += $total_images ;
    # Fixed: the value used to be passed through FormatCount twice;
    # it is now formatted once, as in the per-project rows below.
    $total_images = "<font color=#900000>" . &FormatCount ($total_images) . "</font>" ;

    $html .= "<th class=r>$total</th><th class=r>$total_images</th>\n" ;
    $columns = 0 ;
    foreach $mimetype (@mimetypes_sorted)
    {
      $columns++ ;

      next if $mimetypes_found {$mimetype} < $threshold_mime ;

      $count = &FormatCount ($counts_dm {"$domain,$mimetype"}) ;
      $count = $colorize->($count, $columns) ;
      # NOTE(review): this test runs after colouring, so it can only match in
      # the uncoloured columns - confirm whether that is intended.
      if ($count eq "")
      { $count = "&nbsp;" ; }
      $html .= "<td class=r>$count</td>\n" ;
    }
    $html .= "</tr>\n" ;
    $rows++ ;
  }

  # Consistency check: the per-domain totals should add up to the overall total.
  if ($total_mimes != $total_mimes2)
  {
    print ERR "total_mimes $total_mimes != total_mimes2 $total_mimes2\n" ;
    print "total_mimes $total_mimes != total_mimes2 $total_mimes2\n" ;
  }

  # TOTALS ROW
  $total_mimes1 = &FormatCount ($total_mimes) ;
  # Fixed: formatted once instead of twice (see the per-domain rows above).
  $total_images1 = "<font color=#900000>" . &FormatCount ($total_images1) . "</font>" ;
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=c>$total_mimes1</th><th class=c>$total_images1</th>\n" ;
  $columns = 0 ;
  foreach $mimetype (@mimetypes_sorted)
  {
    $columns++ ;

    next if $mimetypes_found {$mimetype} < $threshold_mime ;

    $count = &FormatCount ($mimetypes {$mimetype}) ;
    $count = $colorize->($count, $columns) ;
    $html .= "<th class=r>$count</th>\n" ;
  }
  $html .= "</tr>\n" ;

  # ROWS PER PROJECT / LANGUAGE
  $html .= "<tr><th colspan=99>&nbsp;</th></tr>\n" ;
  $html .= "<tr><td colspan=99 class=l><b>Per project / language subproject</b> (top 50)</td></tr>\n" ;
  $total_mimes3 = 0 ; # sum over the projects visited by the loop
  $total_mimes4 = 0 ; # same, restricted to projects at or above $threshold_project
  $cnt_projects = 0 ;
  foreach $project (@projects_sorted)
  {
    last if ++ $cnt_projects > 50 ;

    $total = $projects {$project} ;
    $total_mimes3 += $total ;

    next if $total < $threshold_project ;

    $total_mimes4 += $total ;
    ($domain,$language) = split ('\:', $project,2) ;
    $html .= "<tr><td class=l>" . ucfirst($domain) . "</td><td class=l>$language</td>\n" ;

    $total = &FormatCount ($total) ;
    $total_images = $images_project {$project} ;
    $total_images = "<font color=#900000>" . &FormatCount ($total_images) . "</font>" ;
    $html .= "<th class=r>$total</th><th class=r>$total_images</th>\n" ;

    $columns = 0 ;
    foreach $mimetype (@mimetypes_sorted)
    {
      $columns++ ;

      next if $mimetypes_found {$mimetype} < $threshold_mime ;

      $count = &FormatCount ($counts_pm {"$project,$mimetype"}) ;
      $count = $colorize->($count, $columns) ;
#     if ($count eq "")
#     { $count = "&nbsp;" ; }
      $html .= "<td class=r>$count</td>\n" ;
    }
    $html .= "</tr>\n" ;
    $rows++ ;
  }
  # Repeat the column headers at the bottom of the table, in reverse order.
  $html .= $header2 . $header1 ;
  $html .= "</table>\n" ;
  $html .= "&nbsp;<small>$rows rows written</small><p>" ;

  # NOTE(review): the loop above stops after 50 projects, so this check can
  # report a mismatch whenever @projects_sorted is longer than that.
  if ($total_mimes != $total_mimes3)
  {
    print ERR "total_mimes $total_mimes != total_mimes3 $total_mimes3\n" ;
    print "total_mimes $total_mimes != total_mimes3 $total_mimes3\n" ;
  }

  # MIME TYPES THAT DID NOT GET A COLUMN
  if ($threshold_mime > 0)
  {
    $html .= "<b>Mime types that are found on less than $threshold_mime projects:</b> (again 1 = 1000)<p>" ;
    foreach $mimetype (@mimetypes_sorted)
    {
      next if $mimetypes_found {$mimetype} >= $threshold_mime ;

      $count = $mimetypes {$mimetype} ;
      # Insert thousands separators into 4-6 and 7-9 digit numbers.
      $count =~ s/^(\d{1,3})(\d\d\d)$/$1,$2/ ;
      $count =~ s/^(\d{1,3})(\d\d\d)(\d\d\d)$/$1,$2,$3/ ;
      $html .= "<b>$mimetype</b> $count total<br>" ;
    }
  }

# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ;
  $html .= $colophon ;

  print FILE_HTML_REQUESTS $html ;
  close FILE_HTML_REQUESTS ;
}
 2741+
# Writes the "Operating Systems" report to $dir_reports/$file_html_opsys.
#
# The page shows the same four sections twice, side by side: on the left in
# order of popularity (@opsys_sorted_count), on the right in alphabetical
# order (@opsys_sorted_alpha). Keys of %opsys / %opsys_perc have the form
# "<rectype>,<name>"; the record types used here are
#   'G' - group (keys containing a space are listed as platform subgroups)
#   '-' - OS version, non mobile
#   'M' - OS version, mobile
#
# Takes no arguments. All input is read from package globals, and the scratch
# variables used here ($html, $key, $count, $perc, $rectype, $os, ...) are
# package globals too, like elsewhere in this file.
sub WriteReportOpSys
{
  my $path_report = "$dir_reports/$file_html_opsys" ;
  if (! open (FILE_HTML_OPSYS, '>', $path_report))
  {
    # Without this check a failed open would go unnoticed and no report would be written.
    warn "WriteReportOpSys: cannot open '$path_report' for writing: $!\n" ;
    return ;
  }

  # Fill in the placeholders of the shared page header.
  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Operating Systems/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Operating Systems/ ;
  $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $dummy_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  # Totals lines, built once and reused in both columns.
  my $total_opsys = $total_opsys_mobile + $total_opsys_non_mobile ;
  $total_all2 = &FormatCount ($total_opsys) ;
  $total_opsys_mobile2 = &FormatCount ($total_opsys_mobile) ;
  $total_opsys_non_mobile2 = &FormatCount ($total_opsys_non_mobile) ;
  if ($total_opsys > 0)
  {
    $total_perc_mobile = sprintf ("%.1f", 100 * $total_opsys_mobile / $total_opsys) ;
    # Formatted as well, so that e.g. 100 - 4.0 is shown as 96.0 rather than 96.
    $total_perc_non_mobile = sprintf ("%.1f", 100 - $total_perc_mobile) ;
  }
  else
  {
    # No requests counted at all: avoid dividing by zero.
    $total_perc_mobile = "0.0" ;
    $total_perc_non_mobile = "0.0" ;
  }
  $line_total_all = "<tr><th class=l>Total</th><th class=r>$total_all2</th><th class=r>100\%</th></tr>\n" ;
  $line_total_mobile = "<tr><th class=l>Total</th><th class=r>$total_opsys_mobile2</th><th class=r>$total_perc_mobile\%</th></tr>\n" ;
  $line_total_non_mobile = "<tr><th class=l>Total</th><th class=r>$total_opsys_non_mobile2</th><th class=r>$total_perc_non_mobile\%</th></tr>\n" ;

  # Appends one table row per matching key to $html.
  #   $keys_ref       - reference to the sorted list of %opsys keys
  #   $wanted_rectype - record type to list: 'G', '-' or 'M'
  #   $mode           - 'group'    : keys without a space only
  #                     'subgroup' : keys with a space only
  #                     'version'  : all keys, minus those below the 0.02% cutoff
  # Fixed: rows used to contain a stray "</a>" that had no opening "<a>".
  my $append_rows = sub
  {
    my ($keys_ref, $wanted_rectype, $mode) = @_ ;

    foreach $key (@$keys_ref)
    {
      $count = $opsys {$key} ;
      $perc = $opsys_perc {$key} ;

      # NOTE(review): 'lt' compares the percentage as a string, which relies on
      # the exact text format of %opsys_perc values - confirm where it is built.
      next if ($mode eq 'version') && ($perc lt "0.02%") ;

      ($rectype, $os) = split (',', $key,2) ;

      next if $rectype ne $wanted_rectype ;
      next if ($mode eq 'group') && ($key =~ / /) ;
      next if ($mode eq 'subgroup') && ($key !~ / /) ;

      $count = &FormatCount ($count) ;
      $html .= "<tr><td class=l>$os</td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
      # $rows++ ;
    }
  } ;

  # Appends one complete column (inner table with all four sections) to $html.
  #   $keys_ref - reference to the sorted list of %opsys keys
  #   $caption  - heading shown at the top of the column
  my $append_column = sub
  {
    my ($keys_ref, $caption) = @_ ;

    $html .= "<table border=1 width=100%>\n" ;
    $html .= "<tr><td colspan=99 class=l><h3>$caption</h3></td></tr>" ;
    $html .= "<tr><th class=l>Operating System</th><th class=r>Requests</th><th class=r>Percentage</th></tr>\n" ;
    $append_rows->($keys_ref, 'G', 'group') ;
    $html .= $line_total_all ;

    $html .= "<tr><th class=l colspan=99>&nbsp;<br>Breakdown per platform for Mac and Linux</th></tr>\n" ;
    $append_rows->($keys_ref, 'G', 'subgroup') ;

    $html .= "<tr><th class=l colspan=99>&nbsp;<br>Breakdown per OS version, non mobile</th></tr>\n" ;
    $append_rows->($keys_ref, '-', 'version') ;
    $html .= $line_total_non_mobile ;

    $html .= "<tr><th class=l colspan=99>&nbsp;<br>Breakdown per OS version, mobile</th></tr>\n" ;
    $append_rows->($keys_ref, 'M', 'version') ;
    $html .= $line_total_mobile ;
    $html .= "</table>\n" ;
  } ;

  $html .= "<table border=1>\n" ;
  # Fixed: Ubuntu 9.10 is "Karmic Koala" (the text used to say "Karma Koala").
  $html .= "<tr><td class=l colspan=99>The following overview of page requests by operating system is based on the <a href='http://en.wikipedia.org/wiki/User_agent'>user agent</a> information that accompanies most server requests.<br>" .
           "Please note that agent information does not follow strict guidelines and some programs may provide wrong information on purpose.<br>" .
           "This report ignores all requests where agent information is missing, or contains any of the following: bot, crawl(er) or spider.<p>" .
           "<a href='http://en.wikipedia.org/wiki/Windows_NT#Releases'>Wikipedia</a>: NT 5.0 = Windows 2000, NT 5.1/5.2 = XP + Server 2003, NT 6.0 = VISTA + Server 2008, NT 6.1 = Windows 7.<br> " .
           "<a href='http://en.wikipedia.org/wiki/Mac_OS_X#Versions'>Wikipedia</a>: OS X 10.4 = Tiger, 10.5 = Leopard, 10.6 = Snow Leopard.<br> " .
           "<a href='http://en.wikipedia.org/wiki/Ubuntu#Releases'>Wikipedia</a>: Ubuntu 7.10 = Gutsy Gibbon, 8.04 = Hardy Heron, 8.10 = Intrepid Ibex, 9.04 = Jaunty Jackalope, 9.10 = Karmic Koala." .
           "</td></tr>\n" ;

# $html .= "<tr><th class=l>Count<br><small>x 1000</small></th><th class=l>Secondary domain<br>(~site) name</th><th class=l>Mime type</th><th class=l>User agent</th></tr>\n" ;

  $html .= "<tr><td width=50% valign=top>" ;

  # OS SORTED BY FREQUENCY
  $append_column->(\@opsys_sorted_count, "In order of popularity") ;

  $html .= "</td><td width=50% valign=top>" ;

  # IN ALPHABETICAL ORDER
  $append_column->(\@opsys_sorted_alpha, "In alphabetical order") ;

  $html .= "</td></tr>" ;

  $html .= "<tr><td colspan=99 class=l wrap>Requests from mobile devices are recognized as follows:<br>" .
           "Agent string contains any of the following terms (last upd: $month_upd_keywords_mobile):<br>" .
           "<i>$keywords_mobile</i></td></tr>" ;

  $html .= "</table><p>" ;

# $perc_crawlers = sprintf ("%.1f",100 * $total_page_crawlerrequests/$total_page_requests_external) ;
# $total_page_requests_external2 = &FormatCount ($total_page_requests_external*1000) ;
# $total_page_crawlerrequests2 = &FormatCount ($total_page_crawlerrequests*1000) ;
# $html =~ s/PERC_GOOGLE/<p>In total $total_page_crawlerrequests2 page requests (mime type <a href='SquidReportRequests.htm'>text\/html<\/a> only!) per day are considered crawler requests, out of $total_page_requests_external2 external requests, which is $perc_crawlers%/ ;

# $total_crawlers = &FormatCount ($total_crawlers) ;

# $html .= "<tr><th class=l>$total_crawlers</th><th class=l colspan=2>total</th></tr>\n" ;
# $html .= "</table><p>\n" ;

# $html .= "<table border=1>\n" ;
# $html .= "<tr><th class=l colspan=99>Top 25 secondary domains<br>(~ sites) mentioned</th></tr>\n" ;
# foreach $secondary_domain (keys_sorted_by_value_num_desc %secondary_domains)
# {
# next if $secondary_domain eq ".." ;
# last if ++$secondary_domains_listed > 25 ;
#
# $count = $secondary_domains {$secondary_domain} ;
# $count = &FormatCount ($count) ;
# $html .= "<tr><td class=r>$count</td><td class=l colspan=2>$secondary_domain</td></tr>\n" ;
# }
# $html .= "</table>\n" ;

  $html .= $colophon ;

  print FILE_HTML_OPSYS $html ;
  close FILE_HTML_OPSYS ;
}
 2955+
 2956+# http://en.wikipedia.org/wiki/Domain_name
 2957+sub WriteReportOrigins
 2958+{
 2959+ open FILE_HTML_ORIGINS, '>', "$dir_reports/$file_html_origins" ;
 2960+
 2961+ $html = $header ;
 2962+ $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by origin/ ;
 2963+ $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Requests by origin/ ;
 2964+ $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
 2965+ $html =~ s/LINKS/$link_requests $dummy_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
 2966+ $html =~ s/NOTES/<br>&nbsp;This report shows where requests come from. Report 'Requests by destination' shows where requests are serviced.<br>&nbsp;Those numbers bear no direct relation.<br>/ ;
 2967+
 2968+ $html .= "<table border=1>\n" ;
 2969+ $html .= "<tr><td colspan=99>" ;
 2970+
 2971+
 2972+ $html .= "<table border=0 width=100%>\n" ;
 2973+# $html .= "<tr><td colspan=99 class=c>traffic from yahoo is allocated as if yahoo used same domain naming scheme as google: <b>search.yahoo.ca</b> instead of <b>ca.search.yahoo.com</b></td></tr>\n" ;
 2974+# $html .= "<tr><td colspan=99 class=c><small>All counts x 1000</small></td></tr>\n" ;
 2975+
 2976+ # INTERNAL ORIGINS
 2977+
 2978+ $html .= "<tr><td colspan=99 class=c><h3>Requests with internal origins</h3></td></tr>\n" ;
 2979+ $html .= "<table border=1 width=100%>\n" ;
 2980+
 2981+ $html .= "<tr><td width=50% valign=top>" ;
 2982+ $html .= "<table border=1 width=100%>\n" ;
 2983+ $html .= "<tr><td colspan=2 class=l><b>Internal origins<br>sorted by<br>frequency</b></td><th class=r>&nbsp;Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 2984+
 2985+ $total_total = 0 ;
 2986+ $total_page = 0 ;
 2987+ $total_image = 0 ;
 2988+ $total_rest = 0 ;
 2989+ foreach $project (@project_int_top_sorted_count)
 2990+ {
 2991+ $total = $project_int_top {$project} ;
 2992+ $page = $project_int_top_split {"page:$project"} ;
 2993+ $image = $project_int_top_split {"image:$project"} ;
 2994+ $rest = $project_int_top_split {"other:$project"} ;
 2995+ $total_total += $total ;
 2996+ $total_page += $page ;
 2997+ $total_image += $image ;
 2998+ $total_rest += $rest ;
 2999+ $total = &FormatCount ($total) ;
 3000+ $page = &FormatCount ($page) ;
 3001+ $image = &FormatCount ($image) ;
 3002+ $rest = &FormatCount ($rest) ;
 3003+ $html .= "<tr><td colspan=2 class=l>" . ucfirst($project) . "</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3004+ }
 3005+ $total_total = &FormatCount ($total_total) ;
 3006+ $total_page = &FormatCount ($total_page) ;
 3007+ $total_image = &FormatCount ($total_image) ;
 3008+ $total_rest = &FormatCount ($total_rest) ;
 3009+ $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ;
 3010+
 3011+ $html .= "<tr><td colspan=99>&nbsp;</td></tr>\n" ;
 3012+ $html .= "<tr><td colspan=99 class=l><b>Per project language / subproject</b> (top 50)</td></tr>\n" ;
 3013+ $projects = 0 ;
 3014+ $total_total = 0 ;
 3015+ $total_page = 0 ;
 3016+ $total_image = 0 ;
 3017+ $total_rest = 0 ;
 3018+ foreach $origin (@origin_int_top_sorted_count)
 3019+ {
 3020+ if (++$projects > 50)
 3021+ {
 3022+ $origin_int_top_other {"all"} += $origin_int_top {$origin} ; ;
 3023+ $origin_int_top_other {"page"} += $origin_int_top_split {"page:$origin"} ;
 3024+ $origin_int_top_other {"image"} += $origin_int_top_split {"image:$origin"} ;
 3025+ $origin_int_top_other {"other"} += $origin_int_top_split {"other:$origin"} ;
 3026+ next ;
 3027+ }
 3028+ $top100_internal_origins {$origin} ++ ;
 3029+ $total = $origin_int_top {$origin} ;
 3030+ $page = $origin_int_top_split {"page:$origin"} ;
 3031+ $image = $origin_int_top_split {"image:$origin"} ;
 3032+ $rest = $origin_int_top_split {"other:$origin"} ;
 3033+ $total_total += $total ;
 3034+ $total_page += $page ;
 3035+ $total_image += $image ;
 3036+ $total_rest += $rest ;
 3037+ $total = &FormatCount ($total) ;
 3038+ $page = &FormatCount ($page) ;
 3039+ $image = &FormatCount ($image) ;
 3040+ $rest = &FormatCount ($rest) ;
 3041+ ($project,$subproject) = split (':', $origin) ;
 3042+ $html .= "<tr><td class=l>" . ucfirst($project) . "</td><td class=l>$subproject</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3043+
 3044+ }
 3045+ $total = $origin_int_top_other {"all"} ;
 3046+ $page = $origin_int_top_other {"page"} ;
 3047+ $image = $origin_int_top_other {"image"} ;
 3048+ $rest = $origin_int_top_other {"other"} ;
 3049+ $total_total += $total ;
 3050+ $total_page += $page ;
 3051+ $total_image += $image ;
 3052+ $total_rest += $rest ;
 3053+ $total = &FormatCount ($total) ;
 3054+ $page = &FormatCount ($page) ;
 3055+ $image = &FormatCount ($image) ;
 3056+ $rest = &FormatCount ($rest) ;
 3057+ $html .= "<tr><td colspan=2 class=l>Other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3058+ $grand_grand_total = $total_total ;
 3059+ $total_total = &FormatCount ($total_total) ;
 3060+ $total_page = &FormatCount ($total_page) ;
 3061+ $total_image = &FormatCount ($total_image) ;
 3062+ $total_rest = &FormatCount ($total_rest) ;
 3063+ $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ;
 3064+ $html .= "</table>" ;
 3065+
 3066+ # BY ALPHABET
 3067+ $html .= "</td><td width=50% valign=top>" ;
 3068+
 3069+ $html .= "<table border=1 width=100%>\n" ;
 3070+ $html .= "<tr><td colspan=2 class=l><b>Internal origins<br>sorted by<br>alphabet</b></td><th class=r>&nbsp;Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3071+
 3072+ $total_total = 0 ;
 3073+ $total_page = 0 ;
 3074+ $total_image = 0 ;
 3075+ $total_rest = 0 ;
 3076+ foreach $project (@project_int_top_sorted_alpha)
 3077+ {
 3078+ $total = $project_int_top {$project} ;
 3079+ $page = $project_int_top_split {"page:$project"} ;
 3080+ $image = $project_int_top_split {"image:$project"} ;
 3081+ $rest = $project_int_top_split {"other:$project"} ;
 3082+ $total_total += $total ;
 3083+ $total_page += $page ;
 3084+ $total_image += $image ;
 3085+ $total_rest += $rest ;
 3086+ $total = &FormatCount ($total) ;
 3087+ $page = &FormatCount ($page) ;
 3088+ $image = &FormatCount ($image) ;
 3089+ $rest = &FormatCount ($rest) ;
 3090+ $html .= "<tr><td colspan=2 class=l>$project</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3091+ }
 3092+ $total_total = &FormatCount ($total_total) ;
 3093+ $total_page = &FormatCount ($total_page) ;
 3094+ $total_image = &FormatCount ($total_image) ;
 3095+ $total_rest = &FormatCount ($total_rest) ;
 3096+ $html .= "<tr><th colspan=2 class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ;
 3097+
 3098+ $html .= "<tr><td colspan=99>&nbsp;</td></tr>\n" ;
 3099+ $html .= "<tr><td colspan=99 class=l><b>Per project language / subproject</b> (top 50)</td></tr>\n" ;
 3100+ $projects = 0 ;
 3101+ $total_total = 0 ;
 3102+ $total_page = 0 ;
 3103+ $total_image = 0 ;
 3104+ $total_rest = 0 ;
 3105+ foreach $origin (@origin_int_top_sorted_alpha)
 3106+ {
 3107+ next if $top100_internal_origins {$origin} == 0 ;
 3108+
 3109+ $total = $origin_int_top {$origin} ;
 3110+ $page = $origin_int_top_split {"page:$origin"} ;
 3111+ $image = $origin_int_top_split {"image:$origin"} ;
 3112+ $rest = $origin_int_top_split {"other:$origin"} ;
 3113+ $total_total += $total ;
 3114+ $total_page += $page ;
 3115+ $total_image += $image ;
 3116+ $total_rest += $rest ;
 3117+ $total = &FormatCount ($total) ;
 3118+ $page = &FormatCount ($page) ;
 3119+ $image = &FormatCount ($image) ;
 3120+ $rest = &FormatCount ($rest) ;
 3121+ ($project,$subproject) = split (':', $origin) ;
 3122+ $html .= "<tr><td class=l>$project</td><td class=l>$subproject</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3123+
 3124+ }
 3125+ $total = $origin_int_top_other {"all"} ;
 3126+ $page = $origin_int_top_other {"page"} ;
 3127+ $image = $origin_int_top_other {"image"} ;
 3128+ $rest = $origin_int_top_other {"other"} ;
 3129+ $total_total += $total ;
 3130+ $total_page += $page ;
 3131+ $total_image += $image ;
 3132+ $total_rest += $rest ;
 3133+ $total = &FormatCount ($total) ;
 3134+ $page = &FormatCount ($page) ;
 3135+ $image = &FormatCount ($image) ;
 3136+ $rest = &FormatCount ($rest) ;
 3137+ $html .= "<tr><td colspan=2 class=l>other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3138+ $total_total = &FormatCount ($total_total) ;
 3139+ $total_page = &FormatCount ($total_page) ;
 3140+ $total_image = &FormatCount ($total_image) ;
 3141+ $total_rest = &FormatCount ($total_rest) ;
 3142+ $html .= "<tr><th colspan=2 class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ;
 3143+ $html .= "</table>" ;
 3144+
 3145+ $html .= "</td></tr>" ;
 3146+ $html .= "</table>" ;
 3147+
 3148+ # REQUESTS WITH EXTERNAL ORIGINS
 3149+
 3150+ $html .= "<table border=1 width=100%>\n" ;
 3151+ $html .= "<tr><td colspan=99 class=c>&nbsp;</td></tr>\n" ;
 3152+ $html .= "<tr><td colspan=99 class=c><h3>Requests with external origins</h3></td></tr>\n" ;
 3153+ $html .= "<table border=1 width=100%>\n" ;
 3154+
 3155+ $html .= "<tr><td width=50% valign=top>" ;
 3156+ $html .= "<table border=1 width=100%>\n" ;
 3157+# $html .= "<tr><td class=l><b><a href='http://..'>External origins</a><br>sorted by<br>frequency</b><br>top 100</td><th class=r>&nbsp;Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3158+ $html .= "<tr><td class=l><b>External origins<br>sorted by<br>frequency</b><br>top 100</td><th class=r>&nbsp;Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3159+
 3160+ $projects = 0 ;
 3161+ $total_total = 0 ;
 3162+ $total_page = 0 ;
 3163+ $total_image = 0 ;
 3164+ $total_rest = 0 ;
 3165+ foreach $origin (@origin_ext_top_sorted_count)
 3166+ {
 3167+ $total = $origin_ext_top {$origin} ;
 3168+ $page = $origin_ext_top_split {"page:$origin"} ;
 3169+ $image = $origin_ext_top_split {"image:$origin"} ;
 3170+ $rest = $origin_ext_top_split {"other:$origin"} ;
 3171+ $total_total += $total ;
 3172+ $total_page += $page ;
 3173+ $total_image += $image ;
 3174+ $total_rest += $rest ;
 3175+ $total = &FormatCount ($total) ;
 3176+ $page = &FormatCount ($page) ;
 3177+ $image = &FormatCount ($image) ;
 3178+ $rest = &FormatCount ($rest) ;
 3179+
 3180+ if (++$projects > 100)
 3181+ {
 3182+ $origin_ext_top_other {"all"} += $origin_ext_top {$origin} ; ;
 3183+ $origin_ext_top_other {"page"} += $origin_ext_top_split {"page:$origin"} ;
 3184+ $origin_ext_top_other {"image"} += $origin_ext_top_split {"image:$origin"} ;
 3185+ $origin_ext_top_other {"other"} += $origin_ext_top_split {"other:$origin"} ;
 3186+ next ;
 3187+ }
 3188+ $top100_internal_origins {$origin} ++ ;
 3189+
 3190+ if ($origin =~ /\./)
 3191+ { $link_origin = "<a href='http://$origin' ref='nofollow'>$origin</a>" ; }
 3192+ else
 3193+ { $link_origin = $origin ; }
 3194+ $html .= "<tr><td class=l>$link_origin</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3195+ }
 3196+ $total = $origin_ext_top_other {"all"} ;
 3197+ $page = $origin_ext_top_other {"page"} ;
 3198+ $image = $origin_ext_top_other {"image"} ;
 3199+ $rest = $origin_ext_top_other {"other"} ;
 3200+ $total = &FormatCount ($total) ;
 3201+ $page = &FormatCount ($page) ;
 3202+ $image = &FormatCount ($image) ;
 3203+ $rest = &FormatCount ($rest) ;
 3204+ $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3205+ $grand_grand_total = $total_total ;
 3206+ $total_total = &FormatCount ($total_total) ;
 3207+ $total_page = &FormatCount ($total_page) ;
 3208+ $total_image = &FormatCount ($total_image) ;
 3209+ $total_rest = &FormatCount ($total_rest) ;
 3210+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ;
 3211+ $html .= "</table>" ;
 3212+
 3213+ # BY ALPHABET
 3214+ $html .= "</td><td width=50% valign=top>" ;
 3215+
 3216+ $html .= "<table border=1 width=100%>\n" ;
 3217+# $html .= "<tr><td class=l><b><a href='http://..'>External origins</a><br>sorted by<br>alphabet</b><br>top 100</td><th class=r>&nbsp;Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3218+ $html .= "<tr><td class=l><b>External origins<br>sorted by<br>alphabet</b><br>top 100</td><th class=r>&nbsp;Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3219+
 3220+ $projects = 0 ;
 3221+ $total_total = 0 ;
 3222+ $total_page = 0 ;
 3223+ $total_image = 0 ;
 3224+ $total_rest = 0 ;
 3225+ foreach $origin (@origin_ext_top_sorted_alpha)
 3226+ {
 3227+
 3228+ $total = $origin_ext_top {$origin} ;
 3229+ $page = $origin_ext_top_split {"page:$origin"} ;
 3230+ $image = $origin_ext_top_split {"image:$origin"} ;
 3231+ $rest = $origin_ext_top_split {"other:$origin"} ;
 3232+ $total_total += $total ;
 3233+ $total_page += $page ;
 3234+ $total_image += $image ;
 3235+ $total_rest += $rest ;
 3236+ $total = &FormatCount ($total) ;
 3237+ $page = &FormatCount ($page) ;
 3238+ $image = &FormatCount ($image) ;
 3239+ $rest = &FormatCount ($rest) ;
 3240+
 3241+ next if $top100_internal_origins {$origin} == 0 ;
 3242+
 3243+ $html .= "<tr><td class=l>$origin</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3244+
 3245+ }
 3246+ $total = $origin_ext_top_other {"all"} ;
 3247+ $page = $origin_ext_top_other {"page"} ;
 3248+ $image = $origin_ext_top_other {"image"} ;
 3249+ $rest = $origin_ext_top_other {"other"} ;
 3250+ $total = &FormatCount ($total) ;
 3251+ $page = &FormatCount ($page) ;
 3252+ $image = &FormatCount ($image) ;
 3253+ $rest = &FormatCount ($rest) ;
 3254+ $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3255+ $total_total = &FormatCount ($total_total) ;
 3256+ $total_page = &FormatCount ($total_page) ;
 3257+ $total_image = &FormatCount ($total_image) ;
 3258+ $total_rest = &FormatCount ($total_rest) ;
 3259+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ;
 3260+ $html .= "</table>" ;
 3261+
 3262+ $html .= "</td></tr>" ;
 3263+# $html .= "<tr><td colspan=99 class=c>For presentation conciseness the top level domain (.org, .com, ..) is ignored here. There is a theoretical<br> possibility that figures for two unrelated sites which are both popular are presented as one here.<p>" .
 3264+# "'Unmatched ip address': all requests without explicit referer url that were not allocated <br>to a site based on known ip range, e.g. google (by ip) or agent string, e.g. google (by agent)</td></tr>" ;
 3265+ $html .= "<tr><td colspan=99 class=c>'Origin unknown': all requests without explicit referer url, without known ip range and without identity clue in the agent string.<br>Note that right now only ip ranges for Google and Yahoo are recognized by the script (manual input Feb 2009)</td></tr>" ;
 3266+ $html .= "</table>" ;
 3267+
 3268+ # EXTERNAL ORIGINS
 3269+if (0)
 3270+{
 3271+ $html .= "<tr><td colspan=99 class=c>&nbsp;</td></tr>\n" ;
 3272+ $html .= "<tr><td colspan=99 class=c><h3>External origins</h3></td></tr>\n" ;
 3273+ $html .= "<tr><td width=50% valign=top>" ;
 3274+
 3275+
 3276+ $html .= "<table border=1 width=100%>\n" ;
 3277+ $html .= "<tr><td class=l><b><a href='http://en.wikipedia.org/wiki/Top-level_domain'>Top level domains</a> (tld)<br>sorted by<br>frequency</b></td><th class=r>&nbsp;Total</th><th class=r>Google</th><th class=r>Yahoo</th><th class=r>Other</th></tr>\n" ;
 3278+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b><a href='http://en.wikipedia.org/wiki/Generic_top-level_domain'>Generic</a> and <a href='http://en.wikipedia.org/wiki/Sponsored_top-level_domains'>Sponsored</a> tld's</a></b></td></tr>\n" ;
 3279+ foreach $toplevel (@origin_ext_page_top_sorted_count)
 3280+ {
 3281+ next if (length ($toplevel) <= 2) || ($toplevel =~ /^(?:address|local|rest|unspecified)$/) ;
 3282+
 3283+ $total = $origin_ext_page_top {$toplevel} ;
 3284+ $google = $origin_ext_page_top_split {"google:$toplevel"} ;
 3285+ $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ;
 3286+ $rest = $origin_ext_page_top_split {"other:$toplevel"} ;
 3287+ $total_total += $total ;
 3288+ $total_google += $google ;
 3289+ $total_yahoo += $yahoo ;
 3290+ $total_rest += $rest ;
 3291+ $total = &FormatCount ($total) ;
 3292+ $google = &FormatCount ($google) ;
 3293+ $yahoo = &FormatCount ($yahoo) ;
 3294+ $rest = &FormatCount ($rest) ;
 3295+ $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3296+ }
 3297+ $grand_total += $total_total ;
 3298+ $grand_google += $total_google ;
 3299+ $grand_yahoo += $total_yahoo ;
 3300+ $grand_rest += $total_rest ;
 3301+ $total_total = &FormatCount ($total_total) ;
 3302+ $total_google = &FormatCount ($total_google) ;
 3303+ $total_yahoo = &FormatCount ($total_yahoo) ;
 3304+ $total_rest = &FormatCount ($total_rest) ;
 3305+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ;
 3306+
 3307+ $total_total = 0 ;
 3308+ $total_google = 0 ;
 3309+ $total_yahoo = 0 ;
 3310+ $total_rest = 0 ;
 3311+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b><a href='http://en.wikipedia.org/wiki/Country_code_top-level_domain'>Country code tld's</a></b></td></tr>\n" ;
 3312+ foreach $toplevel (@origin_ext_page_top_sorted_count)
 3313+ {
 3314+ next if length ($toplevel) != 2 ;
 3315+
 3316+ $total = $origin_ext_page_top {$toplevel} ;
 3317+ $google = $origin_ext_page_top_split {"google:$toplevel"} ;
 3318+ $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ;
 3319+ $rest = $origin_ext_page_top_split {"other:$toplevel"} ;
 3320+ $total_total += $total ;
 3321+ $total_google += $google ;
 3322+ $total_yahoo += $yahoo ;
 3323+ $total_rest += $rest ;
 3324+ $total = &FormatCount ($total) ;
 3325+ $google = &FormatCount ($google) ;
 3326+ $yahoo = &FormatCount ($yahoo) ;
 3327+ $rest = &FormatCount ($rest) ;
 3328+ $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3329+ }
 3330+ $grand_total += $total_total ;
 3331+ $grand_google += $total_google ;
 3332+ $grand_yahoo += $total_yahoo ;
 3333+ $grand_rest += $total_rest ;
 3334+ $total_total = &FormatCount ($total_total) ;
 3335+ $total_google = &FormatCount ($total_google) ;
 3336+ $total_yahoo = &FormatCount ($total_yahoo) ;
 3337+ $total_rest = &FormatCount ($total_rest) ;
 3338+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ;
 3339+
 3340+ $total_total = 0 ;
 3341+ $total_google = 0 ;
 3342+ $total_yahoo = 0 ;
 3343+ $total_rest = 0 ;
 3344+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b>Remainder</th></tr>\n" ;
 3345+ $total = $origin_ext_page_top {"local"} ;
 3346+ $google = $origin_ext_page_top_split {"google:local"} ; # always zero
 3347+ $yahoo = $origin_ext_page_top_split {"yahoo:local"} ; # always zero
 3348+ $rest = $origin_ext_page_top_split {"other:local"} ;
 3349+ $total_total += $total ;
 3350+ $total_google += $google ;
 3351+ $total_yahoo += $yahoo ;
 3352+ $total_rest += $rest ;
 3353+ $total = &FormatCount ($total) ;
 3354+ $google = &FormatCount ($google) ;
 3355+ $yahoo = &FormatCount ($yahoo) ;
 3356+ $rest = &FormatCount ($rest) ;
 3357+ $html .= "<tr><td class=l>localhost</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3358+
 3359+ $total = $origin_ext_page_top {"address"} ;
 3360+ $google = $origin_ext_page_top_split {"google:address"} ;
 3361+ $yahoo = $origin_ext_page_top_split {"yahoo:address"} ;
 3362+ $rest = $origin_ext_page_top_split {"other:address"} ;
 3363+ $total_total += $total ;
 3364+ $total_google += $google ;
 3365+ $total_yahoo += $yahoo ;
 3366+ $total_rest += $rest ;
 3367+ $total = &FormatCount ($total) ;
 3368+ $google = &FormatCount ($google) ;
 3369+ $yahoo = &FormatCount ($yahoo) ;
 3370+ $rest = &FormatCount ($rest) ;
 3371+ $html .= "<tr><td class=l>ip address</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3372+
 3373+ $total = $origin_ext_page_top {"rest"} ;
 3374+ $google = $origin_ext_page_top_split {"google:rest"} ;
 3375+ $yahoo = $origin_ext_page_top_split {"yahoo:rest"} ;
 3376+ $rest = $origin_ext_page_top_split {"other:rest"} ;
 3377+ $total_total += $total ;
 3378+ $total_google += $google ;
 3379+ $total_yahoo += $yahoo ;
 3380+ $total_rest += $rest ;
 3381+ $total = &FormatCount ($total) ;
 3382+ $google = &FormatCount ($google) ;
 3383+ $yahoo = &FormatCount ($yahoo) ;
 3384+ $rest = &FormatCount ($rest) ;
 3385+ $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3386+
 3387+ $total = $origin_ext_page_top {"unspecified"} ;
 3388+ $google = $origin_ext_page_top_split {"google:unspecified"} ;
 3389+ $yahoo = $origin_ext_page_top_split {"yahoo:unspecified"} ;
 3390+ $rest = $origin_ext_page_top_split {"other:unspecified"} ;
 3391+ $total_total += $total ;
 3392+ $total_google += $google ;
 3393+ $total_yahoo += $yahoo ;
 3394+ $total_rest += $rest ;
 3395+ $total = &FormatCount ($total) ;
 3396+ $google = &FormatCount ($google) ;
 3397+ $yahoo = &FormatCount ($yahoo) ;
 3398+ $rest = &FormatCount ($rest) ;
 3399+ $html .= "<tr><td class=l>anonymous</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3400+
 3401+ $grand_total += $total_total ;
 3402+ $grand_google += $total_google ;
 3403+ $grand_yahoo += $total_yahoo ;
 3404+ $grand_rest += $total_rest ;
 3405+ $total_total = &FormatCount ($total_total) ;
 3406+ $total_google = &FormatCount ($total_google) ;
 3407+ $total_yahoo = &FormatCount ($total_yahoo) ;
 3408+ $total_rest = &FormatCount ($total_rest) ;
 3409+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ;
 3410+
 3411+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b>Grand total external</th></tr>\n" ;
 3412+ $grand_total = &FormatCount ($grand_total) ;
 3413+ $grand_google = &FormatCount ($grand_google) ;
 3414+ $grand_yahoo = &FormatCount ($grand_yahoo) ;
 3415+ $grand_rest = &FormatCount ($grand_rest) ;
 3416+ $html .= "<tr><th class=l>total</th><th class=r>$grand_total</th><td class=r>$grand_google</td><td class=r>$grand_yahoo</td><td class=r>$grand_rest</td></tr>\n" ;
 3417+ $html .= "</table>" ;
 3418+
 3419+ $html .= "</td><td width=50% valign=top>" ;
 3420+
 3421+ $html .= "<table border=1 width=100%>\n" ;
 3422+
 3423+ $html .= "<tr><th class=l>Top level domains<br>sorted by<br>alphabet</th><th class=r>Total<th class=r>Google<th class=r>Yahoo<th class=r>Other</th></tr>\n" ;
 3424+# $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b><a href='http://en.wikipedia.org/wiki/Top-level_domain'>generic/sponsored tld's</a></b></th></tr>\n" ;
 3425+ $total_total = 0 ;
 3426+ $total_google = 0 ;
 3427+ $total_yahoo = 0 ;
 3428+ $total_rest = 0 ;
 3429+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b>Generic and sponsored tld's</b></td></tr>\n" ;
 3430+
 3431+ foreach $toplevel (@origin_ext_page_top_sorted_alpha)
 3432+ {
 3433+ next if (length ($toplevel) <= 2) || ($toplevel =~ /^(?:address|local|rest|unspecified)$/) ;
 3434+
 3435+ $total = $origin_ext_page_top {$toplevel} ;
 3436+ $google = $origin_ext_page_top_split {"google:$toplevel"} ;
 3437+ $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ;
 3438+ $rest = $origin_ext_page_top_split {"other:$toplevel"} ;
 3439+ $total_total += $total ;
 3440+ $total_google += $google ;
 3441+ $total_yahoo += $yahoo ;
 3442+ $total_rest += $rest ;
 3443+ $total = &FormatCount ($total) ;
 3444+ $google = &FormatCount ($google) ;
 3445+ $yahoo = &FormatCount ($yahoo) ;
 3446+ $rest = &FormatCount ($rest) ;
 3447+ $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3448+ }
 3449+ $total_total = &FormatCount ($total_total) ;
 3450+ $total_google = &FormatCount ($total_google) ;
 3451+ $total_yahoo = &FormatCount ($total_yahoo) ;
 3452+ $total_rest = &FormatCount ($total_rest) ;
 3453+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ;
 3454+
 3455+ $total_total = 0 ;
 3456+ $total_google = 0 ;
 3457+ $total_yahoo = 0 ;
 3458+ $total_rest = 0 ;
 3459+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b><a href='http://en.wikipedia.org/wiki/Country_code_top-level_domain'>Country code tld's</a></b></td></tr>\n" ;
 3460+ foreach $toplevel (@origin_ext_page_top_sorted_alpha)
 3461+ {
 3462+ next if length ($toplevel) != 2 ;
 3463+
 3464+ $total = $origin_ext_page_top {$toplevel} ;
 3465+ $google = $origin_ext_page_top_split {"google:$toplevel"} ;
 3466+ $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ;
 3467+ $rest = $origin_ext_page_top_split {"other:$toplevel"} ;
 3468+ $total_total += $total ;
 3469+ $total_google += $google ;
 3470+ $total_yahoo += $yahoo ;
 3471+ $total_rest += $rest ;
 3472+ $total = &FormatCount ($total) ;
 3473+ $google = &FormatCount ($google) ;
 3474+ $yahoo = &FormatCount ($yahoo) ;
 3475+ $rest = &FormatCount ($rest) ;
 3476+ $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3477+ }
 3478+ $total_total = &FormatCount ($total_total) ;
 3479+ $total_google = &FormatCount ($total_google) ;
 3480+ $total_yahoo = &FormatCount ($total_yahoo) ;
 3481+ $total_rest = &FormatCount ($total_rest) ;
 3482+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ;
 3483+
 3484+ $total_total = 0 ;
 3485+ $total_google = 0 ;
 3486+ $total_yahoo = 0 ;
 3487+ $total_rest = 0 ;
 3488+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b>Remainder</th></tr>\n" ;
 3489+ $total = $origin_ext_page_top {"local"} ;
 3490+ $google = $origin_ext_page_top_split {"google:local"} ; # always zero
 3491+ $yahoo = $origin_ext_page_top_split {"yahoo:local"} ; # always zero
 3492+ $rest = $origin_ext_page_top_split {"other:local"} ;
 3493+ $total_total += $total ;
 3494+ $total_google += $google ;
 3495+ $total_yahoo += $yahoo ;
 3496+ $total_rest += $rest ;
 3497+ $total = &FormatCount ($total) ;
 3498+ $google = &FormatCount ($google) ;
 3499+ $yahoo = &FormatCount ($yahoo) ;
 3500+ $rest = &FormatCount ($rest) ;
 3501+ $html .= "<tr><td class=l>localhost</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3502+
 3503+ $total = $origin_ext_page_top {"address"} ;
 3504+ $google = $origin_ext_page_top_split {"google:address"} ;
 3505+ $yahoo = $origin_ext_page_top_split {"yahoo:address"} ;
 3506+ $rest = $origin_ext_page_top_split {"other:address"} ;
 3507+ $total_total += $total ;
 3508+ $total_google += $google ;
 3509+ $total_yahoo += $yahoo ;
 3510+ $total_rest += $rest ;
 3511+ $total = &FormatCount ($total) ;
 3512+ $google = &FormatCount ($google) ;
 3513+ $yahoo = &FormatCount ($yahoo) ;
 3514+ $rest = &FormatCount ($rest) ;
 3515+ $html .= "<tr><td class=l>ip address</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3516+
 3517+ $total = $origin_ext_page_top {"rest"} ;
 3518+ $google = $origin_ext_page_top_split {"google:rest"} ;
 3519+ $yahoo = $origin_ext_page_top_split {"yahoo:rest"} ;
 3520+ $rest = $origin_ext_page_top_split {"other:rest"} ;
 3521+ $total_total += $total ;
 3522+ $total_google += $google ;
 3523+ $total_yahoo += $yahoo ;
 3524+ $total_rest += $rest ;
 3525+ $total = &FormatCount ($total) ;
 3526+ $google = &FormatCount ($google) ;
 3527+ $yahoo = &FormatCount ($yahoo) ;
 3528+ $rest = &FormatCount ($rest) ;
 3529+ $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3530+
 3531+ $total = $origin_ext_page_top {"unspecified"} ;
 3532+ $google = $origin_ext_page_top_split {"google:unspecified"} ;
 3533+ $yahoo = $origin_ext_page_top_split {"yahoo:unspecified"} ;
 3534+ $rest = $origin_ext_page_top_split {"other:unspecified"} ;
 3535+ $total_total += $total ;
 3536+ $total_google += $google ;
 3537+ $total_yahoo += $yahoo ;
 3538+ $total_rest += $rest ;
 3539+ $total = &FormatCount ($total) ;
 3540+ $google = &FormatCount ($google) ;
 3541+ $yahoo = &FormatCount ($yahoo) ;
 3542+ $rest = &FormatCount ($rest) ;
 3543+ $html .= "<tr><td class=l>anonymous</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ;
 3544+
 3545+ $total_total = &FormatCount ($total_total) ;
 3546+ $total_google = &FormatCount ($total_google) ;
 3547+ $total_yahoo = &FormatCount ($total_yahoo) ;
 3548+ $total_rest = &FormatCount ($total_rest) ;
 3549+ $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ;
 3550+
 3551+ $html .= "<tr><td colspan=99 class=l>&nbsp;<br><b>Grand total external</th></tr>\n" ;
 3552+ $html .= "<tr><th class=l>total</th><th class=r>$grand_total</th><td class=r>$grand_google</td><td class=r>$grand_yahoo</td><td class=r>$grand_rest</td></tr>\n" ;
 3553+ $html .= "</table>" ;
 3554+
 3555+ $html .= "</td></tr>" ;
 3556+ $html .= "</table>" ;
 3557+ $html .= "</td></tr>" ;
 3558+
 3559+ $html .= "</table>\n" ;
 3560+}
 3561+
# WriteReportScripts
#
# Builds the 'Scripts' traffic report page and writes it to
# "$dir_reports/$file_html_scripts".
#
# Page layout:
#   1. css / js / php request counts, shown twice side by side: once in the
#      order of the @scripts_*_sorted_count lists and once in the order of the
#      @scripts_*_sorted_script lists. Below each php script the entries of
#      %actions for that script are listed.
#   2. The first 25 entries of @parms_sorted_count (script + parameters).
#   3. All entries of @parms_sorted_script, grouped under a header row per script.
#
# Takes no arguments. Returns nothing meaningful; when the output file can not
# be opened a warning is issued and the report is skipped.
#
# File-level variables read:
#   $dir_reports, $file_html_scripts, $header, $colophon,
#   $link_requests, $link_origins, $link_methods, $dummy_scripts, $link_skins,
#   $link_crawlers, $link_opsys, $link_browsers, $link_google,
#   %scripts_css, %scripts_js, %scripts_php,
#   %actions (keys 'script,action'), %parms (keys 'script,parameters'),
#   @scripts_{css,js,php}_sorted_count, @scripts_{css,js,php}_sorted_script,
#   @parms_sorted_count, @parms_sorted_script
# File-level variables written: $html (the page), plus scratch variables
#   $total, $name, $parms, $rows, $nameprev.
sub WriteReportScripts
{
  my $min_count = 3 ; # scripts (and api.php/index.php parameter rows) below this count are not listed

  my $path_html = "$dir_reports/$file_html_scripts" ;
  if (! open (FILE_HTML_SCRIPTS, '>', $path_html))
  {
    warn "WriteReportScripts: can not open '$path_html' for writing: $!\n" ;
    return ;
  }

  # The two orderings of the action keys do not depend on the script being
  # listed, so determine them once instead of once per php script.
  my @actions_by_count = (keys_sorted_by_value_num_desc %actions) ;
  my @actions_by_name  = (sort keys %actions) ;

  # Returns the html rows for a flat list of scripts (used for css and js).
  #   arg 1: ref to list of script names, in display order
  #   arg 2: ref to hash script name -> count
  my $simple_rows = sub
  {
    my ($scripts_sorted, $counts) = @_ ;
    my $rows_html = "" ;
    foreach my $script_name (@$scripts_sorted)
    {
      my $count = $counts->{$script_name} ;

      next if $count < $min_count ;

      $rows_html .= "<tr><td class=l>$script_name</td><td class=r>" . &FormatCount ($count) . "</td></tr>\n" ;
    }
    return ($rows_html) ;
  } ;

  # Returns the html rows for the php scripts, each followed by its actions,
  # and closed by a 'total php' row.
  #   arg 1: ref to list of php script names, in display order
  #   arg 2: ref to list of %actions keys, in display order
  # The sum is kept as a plain number local to each call and only formatted
  # for display, so both tables show the same, correct total.
  my $php_rows = sub
  {
    my ($scripts_sorted, $action_keys) = @_ ;
    my $rows_html = "" ;
    my $sum       = 0 ;
    foreach my $script_name (@$scripts_sorted)
    {
      my $count = $scripts_php {$script_name} ;

      next if $count < $min_count ;

      $sum += $count ;
      $rows_html .= "<tr><td class=l>$script_name</td><td class=r>" . &FormatCount ($count) . "</td></tr>\n" ;

      foreach my $action_key (@$action_keys)
      {
        my ($action_script, $action) = split (',', $action_key) ;

        next if $action_script ne $script_name ;
        # an action that accounts for all requests of the script adds no detail
        next if $actions {$action_key} >= $count ;

        $rows_html .= "<tr><td class=l>&nbsp;&nbsp;&nbsp;<small>$action</small></td><td class=r><small>" . &FormatCount ($actions {$action_key}) . "</small></td></tr>\n" ;
      }
    }
    $rows_html .= "<tr><th class=l>total php</th><th class=r>" . &FormatCount ($sum) . "</th></tr>\n" ;
    return ($rows_html) ;
  } ;

  # Page header: fill in the placeholders of the shared template
  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Scripts/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Scripts/ ;
  $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $dummy_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  $html .= "<table border=1>\n" ;
  $html .= "<tr><td colspan=99>" ;

  # LEFT COLUMN: IN ORDER OF REQUEST VOLUME
  $html .= "<table border=0 width=100%>\n" ;
  $html .= "<tr><td width=50% valign=top>" ;
  $html .= "<table border=1 width=100%>\n" ;

  $html .= "<tr><td class=l><h3>In order of request volume</h3></td><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>css</b></th></tr>\n" ;
  $html .= $simple_rows->(\@scripts_css_sorted_count, \%scripts_css) ;
  $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>js</b></th></tr>\n" ;
  $html .= $simple_rows->(\@scripts_js_sorted_count, \%scripts_js) ;
  $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>php</b></th></tr>\n" ;
  $html .= $php_rows->(\@scripts_php_sorted_count, \@actions_by_count) ;
  $html .= "</table>" ;

  # RIGHT COLUMN: IN ALPHABETICAL ORDER
  $html .= "</td><td width=50% valign=top>" ;

  $html .= "<table border=1 width=100%>\n" ;

  $html .= "<tr><td class=l><h3>In alphabetical order</h3></td><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>css</b></th></tr>\n" ;
  $html .= $simple_rows->(\@scripts_css_sorted_script, \%scripts_css) ;
  $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>js</b></th></tr>\n" ;
  $html .= $simple_rows->(\@scripts_js_sorted_script, \%scripts_js) ;
  $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>php</b></th></tr>\n" ;
  $html .= $php_rows->(\@scripts_php_sorted_script, \@actions_by_name) ;
  $html .= "</table>" ;

  $html .= "</td></tr>" ;
  $html .= "</table>" ;
  $html .= "</td></tr>" ;

  # PHP SCRIPTS + PARAMETERS, FIRST 25 OF @parms_sorted_count
  $html .= "<tr><td colspan=99>&nbsp;</td></tr>\n" ;
  $html .= "<tr><th colspan=99 class=l><h3>PHP scripts and generalized arguments, sorted by frequency, top 25</h3></th></tr>\n" ;
  $html .= "<tr><th class=l>Script</th><th class=l>Parameters</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ;
  foreach $parm (@parms_sorted_count)
  {
    $total = &FormatCount ($parms {$parm}) ;
    # limit 2: only the first comma separates script from parameters,
    # same as in the alphabetical listing below
    ($name,$parms) = split (',', $parm, 2) ;
    if ($parms eq "")
    { $parms = "-" ; }
    $html .= "<tr><td class=l>$name</td><td class=l>$parms</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;

    last if $rows == 25 ;
  }
  $html .= "<tr><th colspan=99 class=l>&nbsp;</th></tr>\n" ;

  # PHP SCRIPTS + PARAMETERS, GROUPED PER SCRIPT
  $html .= "<tr><th colspan=99 class=l><h3>PHP scripts and generalized arguments, in alphabetical order <small>(&ge; 3)</small></h3></small></th></tr>\n" ;

  $html .= "<tr><td colspan=2 class=l><b>Script</b><br>Parameters</td><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ; # from here on counts the parameter rows, reported at the bottom of the page
  $nameprev = "" ;
  foreach $parm (@parms_sorted_script)
  {
    ($name,$parms) = split (',', $parm, 2) ;

    if ($name ne $nameprev)
    {
      # Test the raw count: the result of FormatCount is display text and
      # is not guaranteed to evaluate to the same number.
      # $nameprev is left untouched here, so every row of a script that is
      # below the threshold ends up at this 'next'.
      my $script_total = $scripts_php {$name} ;

      next if $script_total < $min_count ;

      $total = &FormatCount ($script_total) ;
      if ($nameprev ne "")
      { $html .= "<tr><th colspan=99 class=l>&nbsp;</th></tr>\n" ; }
      if (($name eq "api.php") || ($name eq "index.php"))
      { $html .= "<tr><td colspan=2 class=l><b>$name</b> <small>(&ge; 3)</small></td><th class=r>$total</th></tr>\n" ; }
      else
      { $html .= "<tr><td colspan=2 class=l><b>$name</b></td><th class=r>$total</th></tr>\n" ; }

      # Remember the script as soon as its header is written, so the header
      # is not repeated when the parameter row below is skipped.
      $nameprev = $name ;
    }

    $total = $parms {$parm} ;

    # api.php and index.php have many rarely used parameter combinations
    next if (($name eq "api.php") || ($name eq "index.php")) && ($total < $min_count) ;

    $total = &FormatCount ($total) ;
    if ($parms eq "")
    { $parms = "-" ; }
    $html .= "<tr><td colspan=2 class=l>$parms</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;
  }
  $html .= "</table>\n" ;

  # NOTE(review): the outer table is already closed by the line above, the
  # closing tags below have no matching open tags. Output kept as it was.
  $html .= "</td></tr></table>\n" ;
  $html .= "&nbsp;<small>$rows rows written</small><p>" ;

  $html .= $colophon ;

  print FILE_HTML_SCRIPTS $html ;
  close FILE_HTML_SCRIPTS or warn "WriteReportScripts: error on closing '$path_html': $!\n" ;
}
 3738+
 3739+sub WriteReportGoogle
 3740+{
 3741+ open FILE_HTML_SEARCH, '>', "$dir_reports/$file_html_google" ;
 3742+
 3743+ $html = $header ;
 3744+ $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Google requests/ ;
 3745+ $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Google requests/ ;
 3746+ $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
 3747+ $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $dummy_google/ ;
 3748+ $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;
 3749+
 3750+ $html .= "<table border=1 width=500 wrap>\n" ;
 3751+# $html .= "<tr><td colspan=99 class=l>&nbsp;<br>This report shows <b>all requests to Wikimedia servers where a Google server of service was involved in any way</b>,<br> " .
 3752+# "be it the <a href='http://en.wikipedia.org/wiki/Googlebot'>GoogleBot</a> crawler or <a href='http://www.google.com/feedfetcher.html'>FeedFetcher</a> collector scripts that run on Google servers,<br> " .
 3753+# "or a user that follows a link from a Google Web or Google Desktop search results page, or " .
 3754+# "from Google Maps or Google Earth etcetera. <p>Technically speaking three fields in the <a href='http://wikitech.wikimedia.org/view/Squid_log_format'>squid log records</a> are checked for this: " .
 3755+# "client ip address, referer header and user agent header.<br>A request can originate from an ip address which has been registered by Google and/or it can carry a referer tag that tells us<br>a user clicked a link " .
 3756+# "on a Google results page and/or it can carry an agent string that mentions a Google application which<br>can reasonably be assumed to be genuinely Google's. See bottom of page for <a href='#details'>further details</a>." .
 3757+# "PERC_GOOGLE\n" ;
 3758+ $html .= "<tr><td colspan=99 class=l wrap>&nbsp;<br>This report shows <b>all requests to Wikimedia servers where a Google server of service was involved in any way</b>, " .
 3759+ "be it the <a href='http://en.wikipedia.org/wiki/Googlebot'>GoogleBot</a> crawler or <a href='http://www.google.com/feedfetcher.html'>FeedFetcher</a> collector scripts that run on Google servers, " .
 3760+ "or a user that follows a link from a Google Web or Google Desktop search results page, or " .
 3761+ "from Google Maps or Google Earth etcetera. <p>Technically speaking three fields in the <a href='http://wikitech.wikimedia.org/view/Squid_log_format'>squid log records</a> are checked for this: " .
 3762+ "client ip address, referer header and user agent header. A request can originate from an ip address which has been registered by Google and/or it can carry a referer tag that tells us a user clicked a link " .
 3763+ "on a Google results page and/or it can carry an agent string that mentions a Google application which can reasonably be assumed to be genuinely Google's. See bottom of page for <a href='#details'>further details</a>." .
 3764+ "PERC_GOOGLE\n" ;
 3765+
 3766+ $html .= "<tr><td width=50%>\n" ;
 3767+
 3768+ # SORTED BY FREQUENCY
 3769+ $html .= "<table border=1>\n" ;
 3770+ $html .= "<tr><th colspan=99 class=l><h3>In order of request volume</h3></th></tr>\n" ;
 3771+ $html .= "<tr><th colspan=99 class=l>Requests originating from a Google ip address</th></tr>\n" ;
 3772+# $html .= "<tr><th colspan=99 class=l><small>x 1000</small></th>\n" ;
 3773+ my $total_total_direct ;
 3774+ my $total_page_direct ;
 3775+ my $total_image_direct ;
 3776+ my $total_rest_direct ;
 3777+ $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3778+ foreach $key (@searches_service_count)
 3779+ {
 3780+ next if $key !~ /Y$/ ; # googleIp
 3781+
 3782+ ($key2 = $key) =~ s/,[YN]$// ;
 3783+ $total = $searches_service_mimecat {"$key2,total,Y"} ;
 3784+ $page = $searches_service_mimecat {"$key2,page,Y"} ;
 3785+ $image = $searches_service_mimecat {"$key2,image,Y"} ;
 3786+ $rest = $searches_service_mimecat {"$key2,other,Y"} ;
 3787+ $total_total_direct += $total ;
 3788+ $total_page_direct += $page ;
 3789+ $total_image_direct += $image ;
 3790+ $total_rest_direct += $rest ;
 3791+ $total = &FormatCount ($total) ;
 3792+ $page = &FormatCount ($page) ;
 3793+ $image = &FormatCount ($image) ;
 3794+ $rest = &FormatCount ($rest) ;
 3795+ $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3796+ }
 3797+ $total_page_all = $total_page_direct ;
 3798+
 3799+# $total_page_requests_external_fmt = &FormatCount ($total_page_requests_external*1000) ;
 3800+
 3801+ $perc_google_direct = ".." ;
 3802+ if ($total_page_requests_external > 0)
 3803+ { $perc_google_direct = sprintf ("%.1f",100 * $total_page_direct/$total_page_requests_external) ; }
 3804+ $total_page_direct_fmt = &FormatCount ($total_page_direct*1000) ;
 3805+ $perc_google_msg_direct = "<p>Including all of its different search crawlers and services hosted on its servers, Google itself requested another $total_page_direct_fmt page pages per day, representing $perc_google_direct% of our external page requests.\n" ;
 3806+
 3807+ $total_total_direct = &FormatCount ($total_total_direct) ;
 3808+ $total_page_direct = &FormatCount ($total_page_direct) ;
 3809+ $total_image_direct = &FormatCount ($total_image_direct) ;
 3810+ $total_rest_direct = &FormatCount ($total_rest_direct) ;
 3811+
 3812+ $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_direct</th><th class=r>$total_page_direct</th><th class=r>$total_image_direct</th><th class=r>$total_rest_direct</th></tr>\n" ;
 3813+
 3814+ my $total_total_indirect ;
 3815+ my $total_page_indirect ;
 3816+ my $total_image_indirect ;
 3817+ my $total_rest_indirect ;
 3818+
 3819+ $html .= "<tr><th colspan=99 class=l>&nbsp;</th></tr>\n" ;
 3820+ $html .= "<tr><th colspan=99 class=l>Requests originating from elsewhere</th></tr>\n" ;
 3821+ $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3822+ foreach $key (@searches_service_count)
 3823+ {
 3824+ next if $key =~ /Y$/ ; # googleIp
 3825+
 3826+ ($key2 = $key) =~ s/,[YN]$// ;
 3827+ $total = $searches_service_mimecat {"$key2,total,N"} ;
 3828+ $page = $searches_service_mimecat {"$key2,page,N"} ;
 3829+ $image = $searches_service_mimecat {"$key2,image,N"} ;
 3830+ $rest = $searches_service_mimecat {"$key2,other,N"} ;
 3831+ $total_total_indirect += $total ;
 3832+ $total_page_indirect += $page ;
 3833+ $total_image_indirect += $image ;
 3834+ $total_rest_indirect += $rest ;
 3835+ $total = &FormatCount ($total) ;
 3836+ $page = &FormatCount ($page) ;
 3837+ $image = &FormatCount ($image) ;
 3838+ $rest = &FormatCount ($rest) ;
 3839+ $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3840+ }
 3841+ $total_page_all += $total_page_indirect ;
 3842+
 3843+ $perc_google_indirect = ".." ;
 3844+ if ($total_page_requests_external > 0)
 3845+ { $perc_google_indirect = sprintf ("%.1f",100 * $total_page_indirect/$total_page_requests_external) ; }
 3846+ $total_page_indirect_fmt = &FormatCount ($total_page_indirect*1000) ;
 3847+ $perc_google_msg_indirect = "<p>Google referred to our sites, through its services including search, maps, and Google Earth, $total_page_indirect_fmt page views per day, representing $perc_google_indirect% of our external page requests.\n" ;
 3848+
 3849+ $total_total_indirect = &FormatCount ($total_total_indirect) ;
 3850+ $total_page_indirect = &FormatCount ($total_page_indirect) ;
 3851+ $total_image_indirect = &FormatCount ($total_image_indirect) ;
 3852+ $total_rest_indirect = &FormatCount ($total_rest_indirect) ;
 3853+
 3854+ $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_indirect</th><th class=r>$total_page_indirect</th><th class=r>$total_image_indirect</th><th class=r>$total_rest_indirect</th></tr>\n" ;
 3855+ $html .= "<tr><th class=l colspan=99>&nbsp;</td></tr>\n" ;
 3856+ $html .= "<tr><th colspan=99 class=l><a href='http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains'>Top level domains</a></th></tr>\n" ;
 3857+
 3858+# $total_page_all_fmt = &FormatCount ($total_page_all*1000) ;
 3859+
 3860+ $perc_google = ".." ;
 3861+ if ($total_page_requests_external > 0)
 3862+ { $perc_google = sprintf ("%.1f",100 * $total_page_all/$total_page_requests_external) ; }
 3863+
 3864+ $perc_google_msg_all = "<p>In total Google was somehow involved in $perc_google\% of daily external page<sup>*<\/sup> requests \n" ;
 3865+ $html =~ s/PERC_GOOGLE/<hr width=90%>$perc_google_msg_all $perc_google_msg_indirect $perc_google_msg_direct<p><small>* = mime type <a href='SquidReportRequests.htm'>text\/html<\/a> only<\/small>/ ;
 3866+
 3867+ $total_total = 0 ;
 3868+ $total_page = 0 ;
 3869+ $total_image = 0 ;
 3870+ $total_rest = 0 ;
 3871+ foreach $key (@searches_toplevel_count)
 3872+ {
 3873+ $total = $searches_toplevel_mimecat {"$key,total"} ;
 3874+ $page = $searches_toplevel_mimecat {"$key,page"} ;
 3875+ $image = $searches_toplevel_mimecat {"$key,image"} ;
 3876+ $rest = $searches_toplevel_mimecat {"$key,other"} ;
 3877+ $total_total += $total ;
 3878+ $total_page += $page ;
 3879+ $total_image += $image ;
 3880+ $total_rest += $rest ;
 3881+ $total = &FormatCount ($total) ;
 3882+ $page = &FormatCount ($page) ;
 3883+ $image = &FormatCount ($image) ;
 3884+ $rest = &FormatCount ($rest) ;
 3885+ if ($key !~ /^[\_\.]/)
 3886+ { $key = ".$key" ; }
 3887+# else
 3888+# { $key =~ s/^[\.]// ; }
 3889+ if ($key =~ /^\_/)
 3890+ { $key = "<i>" . substr ($key,1) . "</i>" ; }
 3891+ $html .= "<tr><td class=l>$key</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3892+ }
 3893+ $total_no_tld = $searches_mimecat_tld_not_found {"total"} ;
 3894+ $page_no_tld = $searches_mimecat_tld_not_found {"page"} ;
 3895+ $image_no_tld = $searches_mimecat_tld_not_found {"image"} ;
 3896+ $other_no_tld = $searches_mimecat_tld_not_found {"other"} ;
 3897+
 3898+ $total_total += $total_no_tld ;
 3899+ $total_page += $page_no_tld ;
 3900+ $total_image += $image_no_tld ;
 3901+ $total_rest += $other_no_tld ;
 3902+
 3903+ $total_no_tld = &FormatCount ($total_no_tld) ;
 3904+ $page_no_tld = &FormatCount ($page_no_tld) ;
 3905+ $image_no_tld = &FormatCount ($image_no_tld) ;
 3906+ $other_no_tld = &FormatCount ($other_no_tld) ;
 3907+ $html .= "<tr><td class=l>undefined</a></td><td class=r>$total_no_tld</td><td class=r>$page_no_tld</td><td class=r>$image_no_tld</td><td class=r>$other_no_tld</td></tr>\n" ;
 3908+
 3909+ $total_total = &FormatCount ($total_total) ;
 3910+ $total_page = &FormatCount ($total_page) ;
 3911+ $total_image = &FormatCount ($total_image) ;
 3912+ $total_rest = &FormatCount ($total_rest) ;
 3913+ $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total</th><th class=r>$total_page</th><th class=r>$total_image</th><th class=r>$total_rest</th></tr>\n" ;
 3914+
 3915+ $html .= "</table>\n" ;
 3916+
 3917+ $html .= "</td><td width=50%>\n" ;
 3918+
 3919+ # SORTED BY ALPHABETICALLY
 3920+ $html .= "<table border=1>\n" ;
 3921+ $html .= "<tr><th colspan=99 class=l><h3>In alphabetical order</h3></th></tr>\n" ;
 3922+ $html .= "<tr><th colspan=99 class=l>Requests originating from a Google ip address</th></tr>\n" ;
 3923+# $html .= "<tr><th colspan=99 class=l><small>x 1000</small></th>\n" ;
 3924+ $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3925+ foreach $key (@searches_service_alpha)
 3926+ {
 3927+ next if $key !~ /Y$/ ; # googleIp
 3928+
 3929+ ($key2 = $key) =~ s/,[YN]$// ;
 3930+ $total = $searches_service_mimecat {"$key2,total,Y"} ;
 3931+ $page = $searches_service_mimecat {"$key2,page,Y"} ;
 3932+ $image = $searches_service_mimecat {"$key2,image,Y"} ;
 3933+ $rest = $searches_service_mimecat {"$key2,other,Y"} ;
 3934+ $total = &FormatCount ($total) ;
 3935+ $page = &FormatCount ($page) ;
 3936+ $image = &FormatCount ($image) ;
 3937+ $rest = &FormatCount ($rest) ;
 3938+ if ($key !~ /(?:undefined|unspecified|crawler|feedfetcher|wireless transcoder)/)
 3939+ { $key = ucfirst ($key) ; }
 3940+ else
 3941+ { $key = "<i>$key</i>" ; }
 3942+ $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3943+ }
 3944+ $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_direct</th><th class=r>$total_page_direct</th><th class=r>$total_image_direct</th><th class=r>$total_rest_direct</th></tr>\n" ;
 3945+
 3946+ $html .= "<tr><th colspan=99 class=l>&nbsp;</th></tr>\n" ;
 3947+ $html .= "<tr><th colspan=99 class=l>Requests originating from elsewhere</th></tr>\n" ;
 3948+ $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ;
 3949+ foreach $key (@searches_service_alpha)
 3950+ {
 3951+ next if $key =~ /Y$/ ; # googleIp
 3952+
 3953+ ($key2 = $key) =~ s/,[YN]$// ;
 3954+ $total = $searches_service_mimecat {"$key2,total,N"} ;
 3955+ $page = $searches_service_mimecat {"$key2,page,N"} ;
 3956+ $image = $searches_service_mimecat {"$key2,image,N"} ;
 3957+ $rest = $searches_service_mimecat {"$key2,other,N"} ;
 3958+ $total = &FormatCount ($total) ;
 3959+ $page = &FormatCount ($page) ;
 3960+ $image = &FormatCount ($image) ;
 3961+ $rest = &FormatCount ($rest) ;
 3962+ if ($key !~ /(?:undefined|unspecified|crawler|feedfetcher|wireless transcoder)/)
 3963+ { $key = ucfirst ($key) ; }
 3964+ else
 3965+ { $key = "<i>$key</i>" ; }
 3966+ $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3967+ }
 3968+ $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_indirect</th><th class=r>$total_page_indirect</th><th class=r>$total_image_indirect</th><th class=r>$total_rest_indirect</th></tr>\n" ;
 3969+ $html .= "<tr><th class=l colspan=99>&nbsp;</td></tr>\n" ;
 3970+ $html .= "<tr><th colspan=99 class=l>Top level domains</th></tr>\n" ;
 3971+
 3972+ $total_total = 0 ;
 3973+ $total_page = 0 ;
 3974+ $total_image = 0 ;
 3975+ $total_rest = 0 ;
 3976+ foreach $key (@searches_toplevel_alpha)
 3977+ {
 3978+ $total = $searches_toplevel_mimecat {"$key,total"} ;
 3979+ $page = $searches_toplevel_mimecat {"$key,page"} ;
 3980+ $image = $searches_toplevel_mimecat {"$key,image"} ;
 3981+ $rest = $searches_toplevel_mimecat {"$key,other"} ;
 3982+ $total_total += $total ;
 3983+ $total_page += $page ;
 3984+ $total_image += $image ;
 3985+ $total_rest += $rest ;
 3986+ $total = &FormatCount ($total) ;
 3987+ $page = &FormatCount ($page) ;
 3988+ $image = &FormatCount ($image) ;
 3989+ $rest = &FormatCount ($rest) ;
 3990+ if ($key !~ /^[\_\.]/)
 3991+ { $key = ".$key" ; }
 3992+ if ($key =~ /^\_/)
 3993+ { $key = "<i>" . substr ($key,1) . "</i>" ; }
 3994+ $html .= "<tr><td class=l>$key</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ;
 3995+ }
 3996+ $total_no_tld = $searches_mimecat_tld_not_found {"total"} ;
 3997+ $page_no_tld = $searches_mimecat_tld_not_found {"page"} ;
 3998+ $image_no_tld = $searches_mimecat_tld_not_found {"image"} ;
 3999+ $other_no_tld = $searches_mimecat_tld_not_found {"other"} ;
 4000+
 4001+ $total_total += $total_no_tld ;
 4002+ $total_page += $page_no_tld ;
 4003+ $total_image += $image_no_tld ;
 4004+ $total_rest += $other_no_tld ;
 4005+
 4006+ $total_no_tld = &FormatCount ($total_no_tld) ;
 4007+ $page_no_tld = &FormatCount ($page_no_tld) ;
 4008+ $image_no_tld = &FormatCount ($image_no_tld) ;
 4009+ $other_no_tld = &FormatCount ($other_no_tld) ;
 4010+ $html .= "<tr><td class=l>undefined</a></td><td class=r>$total_no_tld</td><td class=r>$page_no_tld</td><td class=r>$image_no_tld</td><td class=r>$other_no_tld</td></tr>\n" ;
 4011+
 4012+ $total_total = &FormatCount ($total_total) ;
 4013+ $total_page = &FormatCount ($total_page) ;
 4014+ $total_image = &FormatCount ($total_image) ;
 4015+ $total_rest = &FormatCount ($total_rest) ;
 4016+ $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total</th><th class=r>$total_page</th><th class=r>$total_image</th><th class=r>$total_rest</th></tr>\n" ;
 4017+
 4018+ $html .= "</table>\n" ;
 4019+ $html .= "</td></tr>\n" ;
 4020+
 4021+
 4022+ $breakdown = "Here is detailed breakdown per service of indicators that pointed to Google <small>(total &ge; 3)</small><br>&nbsp;<br>" .
 4023+ "<table width=100%><tr><th class=l>Service</th><th class=c>Total</th><th class=c>Originating from<br>Google ip address</th><th class=c>Referer mentions<br>Google url</th><th class=c>Agent mentions<br>Google service</th></tr>\n" ;
 4024+ foreach $key (@searches_service_matches_alpha)
 4025+ {
 4026+ $count = $searches_service_matches {$key} ;
 4027+
 4028+ next if $count <= 2 ;
 4029+
 4030+ $count = &FormatCount ($count) ;
 4031+ ($service,$matches) = split (',', $key) ;
 4032+ if ($matches =~ /x/) { $x = 'Y' } else { $x = '-' } ;
 4033+ if ($matches =~ /y/) { $y = 'Y' } else { $y = '-' } ;
 4034+ if ($matches =~ /z/) { $z = 'Y' } else { $z = '-' } ;
 4035+ $breakdown .= "<tr><td class=l>$service</td><td class=r>$count</td><td class=c>$x</td><td class=c>$y</td><td class=c>$z</td></tr>" ;
 4036+ }
 4037+ $breakdown .= "</table><br.&bsp;<br>\n" ;
 4038+
 4039+
 4040+ $html .= "<tr><td class=l colspan=99><a name='details' id='details'></a>&nbsp;<p>" .
 4041+ $google_ip_ranges .
 4042+ "<b>Agents</b>: as for genuine agent strings: too many crawlers indentify themselves as 'GoogleBot' to take this at face value. " .
 4043+ "They are accepted as genuine Google crawler requests only when the ip address matches a known range (see above). " .
 4044+ "Other records that mention GoogleBot are counted as GoogleBot? (question mark, as this may include partners, like DoCoMo). " .
 4045+ "However when the agent string mentions Google Desktop or Google Earth this is always accepted" .
 4046+ "<p><b>Service</b>: the service name is based on the agent string (plus for GoogleBot check for ip address, see above), if this is inconclusive it is based on the referer string." .
 4047+ "<p>$breakdown" .
 4048+ "<p><b>Top Level Domain 'undefined'</b>: requests with top level domain 'undefined' are nearly all requests from anonymous ip addresses (crawler and other services)" .
 4049+ "<p><b>Note</b>: averages below 1 are always rounded up to 1\n" .
 4050+ "</small></td></tr>\n";
 4051+
 4052+ $html .= "</table>\n" ;
 4053+
 4054+ $html .= $colophon ;
 4055+
 4056+ print FILE_HTML_SEARCH $html ;
 4057+ close FILE_HTML_SEARCH ;
 4058+}
 4059+
# Write the 'Skins' traffic report: one table row per skin file, grouped under
# a heading row per skin name.
#
# Reads globals : $header, $colophon, $dir_reports, $file_html_skins,
#                 @skins_sorted_skin, %skins, %skin_set and the $link_* /
#                 $dummy_skins navigation snippets.
# Writes globals: $html (the page being built).
# Output        : "$dir_reports/$file_html_skins".
sub WriteReportSkins
{
  my $path_report = "$dir_reports/$file_html_skins" ;
  my $fh_report ;
  unless (open $fh_report, '>', $path_report)
  {
    warn "WriteReportSkins: cannot open '$path_report' for writing: $!\n" ;
    return ;
  }

  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Skins/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Skins/ ;
  $html =~ s/ALSO/&nbsp;See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $dummy_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/&rArr; <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  $html .= "<table border=1>\n" ;
  $html .= "<tr><td colspan=99 class=l><b>Skin</b><br>Files (&ge; 3)</td></tr>\n" ;

  my $rows     = 0 ;
  my $nameprev = "" ;
  foreach my $skin_key (@skins_sorted_skin)
  {
    my $count = &FormatCount ($skins {$skin_key}) ;

    # NOTE(review): the threshold is applied to the *formatted* count; if
    # FormatCount adds separators or units this comparison may not mean what
    # the "Files (>= 3)" caption says - confirm against FormatCount.
    next if $count < 3 ;

    # Strip the directory prefix on a copy: the loop variable aliases the
    # elements of @skins_sorted_skin, and editing it in place would leave the
    # array without the keys needed to look up %skins on a later call.
    (my $skin = $skin_key) =~ s/^skins\/// ;
    my ($name) = split ('\/', $skin, 2) ;

    next if $skin_set {$name} < 3 ;

    # Emit a heading row whenever a new skin name starts.
    if ($name ne $nameprev)
    { $html .= "<tr><th colspan=99 class=l>&nbsp;<br><b>" . ucfirst ($name) . "</b></th></tr>\n" ; }
    $nameprev = $name ;

    $html .= "<tr><td class=l>$skin</td><td class=r>$count</td></tr>\n" ;
    $rows++ ;
  }
  $html .= "</table>\n" ;

  $html .= "&nbsp;<small>$rows rows written</small><p>" ;
  $html .= $colophon ;

  print {$fh_report} $html ;
  close $fh_report
    or warn "WriteReportSkins: error closing '$path_report': $!\n" ;
}
 4103+
 4104+ $html .= "</td></tr></table>\n" ;
 4105+# $html .= "&nbsp;<small>$rows rows written</small><p>" ;
 4106+
 4107+# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ;
 4108+ $html .= $colophon ;
 4109+
 4110+ print FILE_HTML_ORIGINS $html ;
 4111+ close FILE_HTML_ORIGINS ;
 4112+}
 4113+
# Merge the per-directory Google bot csv files into one csv in the reports
# directory.
#
# Input lines ("date time,ip range,hits", date as y/m/d) are rewritten with the
# timestamp as a spreadsheet formula truncated to the hour, and written as
# "datetime,hits,range". After all input has been copied, one extra row per
# timestamp is appended with the summed hits and '*' in the range column.
#
# Reads globals : @dirs_process, $dir_reports, $file_csv_google_bots.
# Writes globals: %googlebots (hits summed per formatted timestamp).
sub WriteCsvGoogleBots
{
  my $path_csv_out = "$dir_reports/$file_csv_google_bots" ;
  my $fh_out ;
  unless (open $fh_out, '>', $path_csv_out)
  {
    warn "WriteCsvGoogleBots: cannot open '$path_csv_out' for writing: $!\n" ;
    return ;
  }

  # Column titles follow the order in which the data rows are written below.
  print {$fh_out} "Date Time,Hits,Ip Range\n" ;

  foreach my $dir_process (@dirs_process)
  {
    my $path_csv_in = "$dir_process/$file_csv_google_bots" ;
    my $fh_in ;
    unless (open $fh_in, '<', $path_csv_in)
    {
      warn "WriteCsvGoogleBots: cannot open '$path_csv_in' for reading: $!\n" ;
      next ;
    }

    while (my $line = <$fh_in>)
    {
      next if $line =~ /^#/ ; # comments
      next if $line =~ /^:/ ; # csv header (not a comment)

      chomp $line ;
      my ($datetime,$range,$hits) = split (',', $line) ;
      my ($date,$time)            = split (' ', $datetime) ;
      my ($year,$month,$day)      = split ('\/', $date) ;
      my $hour = substr ($time,0,2) ;

      my $cell = "\"=DATE($year,$month,$day)+TIME($hour,0,0)\"" ;
      print {$fh_out} "$cell,$hits,$range\n" ;
      $googlebots {$cell} += $hits ;
    }
    close $fh_in ;
  }

  # Totals over all ip ranges, one row per timestamp.
  foreach my $cell (sort keys %googlebots)
  { print {$fh_out} "$cell," . $googlebots {$cell} . ",*\n" ; }

  close $fh_out
    or warn "WriteCsvGoogleBots: error closing '$path_csv_out': $!\n" ;
}
 4141+
# Dump %browser_languages as csv, one "key,value" row per entry in the order
# returned by keys_sorted_alpha_asc.
# NOTE(review): the header names three columns, so the hash keys presumably
# hold "browser,languages" - confirm where %browser_languages is filled.
#
# Reads globals: %browser_languages, $dir_reports, $file_csv_browser_languages.
sub WriteCsvBrowserLanguages
{
  my $path_csv = "$dir_reports/$file_csv_browser_languages" ;
  my $fh_csv ;
  unless (open $fh_csv, '>', $path_csv)
  {
    warn "WriteCsvBrowserLanguages: cannot open '$path_csv' for writing: $!\n" ;
    return ;
  }

  print {$fh_csv} "Browser,Languages,Hits\n" ;
  foreach my $key (keys_sorted_alpha_asc %browser_languages)
  { print {$fh_csv} "$key," . $browser_languages {$key} . "\n" ; }

  close $fh_csv
    or warn "WriteCsvBrowserLanguages: error closing '$path_csv': $!\n" ;
}
 4150+
# Write a csv with requests per country per time slot.
#
# For every target (keys of %targets) the countries are ranked by their
# non-bot ("N") total, and only the first 26 are used as columns. Then, for
# each bot flag ("N", "Y"), the sub writes:
#   - a row with the scaled total per country,
#   - a row with the country names,
#   - one row per time slot (keys of %times, treated as a number of minutes
#     and rendered as a spreadsheet =Time(h,m,0) formula).
# All counts are multiplied by 1000 * $multiplier and rounded.
#
# Reads globals : %targets, %times, %countries_totals, %countries_timed,
#                 %country_codes, $multiplier, $file_csv_countries_timed.
# Writes globals: $multiplier_1000.
sub WriteCsvCountriesTimed
{
  $multiplier_1000 = 1000 * $multiplier ;

# my $path_csv = "$dir_reports/$file_csv_countries_timed" ;
  my $path_csv = "/home/ezachte/$file_csv_countries_timed" ;
  my $fh_csv ;
  unless (open $fh_csv, '>', $path_csv)
  {
    warn "WriteCsvCountriesTimed: cannot open '$path_csv' for writing: $!\n" ;
    return ;
  }

  foreach my $target (sort keys %targets)
  {
    my @countries_ranked = sort {$countries_totals {"N,$target"}{$b} <=> $countries_totals {"N,$target"}{$a}}
                           keys %{$countries_totals {"N,$target"}} ;

    # Same column set for every row of this target: the 26 top ranked countries.
    my $last_column = $#countries_ranked < 25 ? $#countries_ranked : 25 ;
    my @columns     = @countries_ranked [0 .. $last_column] ;

    foreach my $bot ("N","Y")
    {
      my $line = "\nBot,Wiki,Time," ;
      foreach my $country (@columns)
      { $line .= sprintf ("%.0f", $multiplier_1000 * $countries_totals {"$bot,$target"}{$country}) . "," ; }
      print {$fh_csv} "$line\n" ;

      $line = "\nBot,Wiki,Time," ;
      foreach my $country (@columns)
      { $line .= $country_codes {$country} . "," ; }
      print {$fh_csv} "$line\n" ;

      foreach my $time (sort {$a <=> $b} keys %times)
      {
        # Whole hours only: plain '/' is floating point division in Perl.
        my $hrs = int ($time / 60) ;
        my $min = $time % 60 ;

        $line = "$bot,$target,\"=Time($hrs,$min,0)\"," ;
        foreach my $country (@columns)
        { $line .= sprintf ("%.0f", $multiplier_1000 * $countries_timed {"$bot,$target,$country,$time"}) . "," ; }
        print {$fh_csv} "$line\n" ;
      }
    }
  }

  close $fh_csv
    or warn "WriteCsvCountriesTimed: error closing '$path_csv': $!\n" ;
}
 4203+
 4204+# http://www.maxmind.com/app/iso3166 country codes
# Write a csv showing, per country, which targets its requests went to.
#
# For every country (keys of %countries) the targets are ranked by their
# non-bot ("N") total and only the first 26 are used as columns. Per country
# the sub writes a header row with a label per target, and then for each bot
# flag ("N", "Y") a row with absolute counts and, when the total over all
# targets is above zero, a row with percentages of that total.
#
# Reads globals: %countries, %targets_totals, %out_languages, %country_codes,
#                $file_csv_countries_languages_visited.
sub WriteCsvCountriesGoTo
{
  my $path_csv = "/home/ezachte/$file_csv_countries_languages_visited" ;
  my $fh_csv ;
  unless (open $fh_csv, '>', $path_csv)
  {
    warn "WriteCsvCountriesGoTo: cannot open '$path_csv' for writing: $!\n" ;
    return ;
  }

  foreach my $country (sort keys %countries)
  {
    my @targets_ranked = sort {$targets_totals {"N,$country"}{$b} <=> $targets_totals {"N,$country"}{$a}}
                         keys %{$targets_totals {"N,$country"}} ;

    # Same column set for every row of this country: the 26 top ranked targets.
    my $last_column = $#targets_ranked < 25 ? $#targets_ranked : 25 ;
    my @columns     = @targets_ranked [0 .. $last_column] ;

    # Header row: label from %out_languages, or the bare code in brackets.
    my $line = "\nBot,Country," ;
    foreach my $target (@columns)
    {
      (my $code = $target) =~ s/^.*?:// ; # drop everything up to the first colon
      my $label = $out_languages {$code} ;
      if ($label eq "")
      { $label = "[$code]" ; }
      $line .= "$label," ;
    }
    print {$fh_csv} "$line\n" ;

    foreach my $bot ("N","Y")
    {
      # Remove control characters (newlines included) that would break the csv.
      # The class must use \x escapes: [0x00-0x1F] is the literal characters
      # '0', 'x', '1' and 'F' and would eat those from the name.
      my $country_name = $country_codes {$country} ;
      $country_name =~ s/[\x00-\x1F]//g ;

      # Percentages are relative to all targets, not only the listed columns.
      my $tot_targets = 0 ;
      foreach my $target (@targets_ranked)
      { $tot_targets += $targets_totals {"$bot,$country"}{$target} ; }

      $line = "$bot,$country_name," ;
      foreach my $target (@columns)
      { $line .= $targets_totals {"$bot,$country"}{$target} . "," ; }
      print {$fh_csv} "$line\n" ;

      next if $tot_targets <= 0 ;

      $line = "$bot,$country_name," ;
      foreach my $target (@columns)
      { $line .= sprintf ("%.1f%%", 100 * $targets_totals {"$bot,$country"}{$target} / $tot_targets) . "," ; }
      print {$fh_csv} "$line\n" ;
    }
  }

  close $fh_csv
    or warn "WriteCsvCountriesGoTo: error closing '$path_csv': $!\n" ;
}
 4268+
# Write the per-language breakdown report: for each language with at least 100
# recent requests, a block of rows showing which countries those requests came
# from, each with a percentage and a bar, followed by an 'Other' row.
#
# Parameters:
#   $title       - page title, substituted for TITLE and HEADER in $header
#   $views_edits - accepted for symmetry with the sibling report writers;
#                  not used in this sub
#   $links       - html substituted for the ALSO placeholder
#
# Reads globals : $header, $colophon, $path_out, $file_html_per_language_breakdown,
#                 %requests_recently_per_language, $requests_recently_all,
#                 %requests_recently_per_language_per_country,
#                 $requests_recently_start, $requests_recently_stop, $true.
# Writes globals: $html, @countries, $language, $language_name,
#                 $anchor_language, $anomaly_found.
sub WriteReportPerLanguageBreakDown
{
  print "\nWriteReportPerLanguageBreakDown\n" ;

  my ($title,$views_edits,$links) = @_ ;
  my ($bars,$bar_width,$perc,$perc_tot,$perc_global) ;

  # Lexical, so that the index does not accumulate entries across calls.
  my @index_languages ;

  $html = $header ;
  $html =~ s/TITLE/$title/ ;
  $html =~ s/HEADER/$title/ ;
  $html =~ s/ALSO/$links/ ;
  $html =~ s/LINKS// ;
  $html =~ s/NOTES// ;
  $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ;
  $html =~ s/DATE// ;

  $html .= "<p><table border=1 width=800>INDEX\n" ;

  my $languages_reported = 0 ;

  foreach $language (keys_sorted_by_value_num_desc %requests_recently_per_language)
  {
    my $requests_this_language = $requests_recently_per_language {$language} ;
    next if $requests_this_language < 100 ;

    ($language_name,$anchor_language) = &GetLanguageInfo ($language) ;

    my %requests_per_country = %{$requests_recently_per_language_per_country {$language}} ;
    @countries = keys_sorted_by_value_num_desc %requests_per_country ;

    $perc_global = '..' ;
    if ($requests_recently_all > 0)
    { $perc_global = &Percentage ($requests_this_language / $requests_recently_all) ; }

    $html .= "<tr><th colspan=99 class=lh3><a id='$anchor_language' name='$anchor_language'></a><br>$language_name ($language) <small>($perc_global share of global total)</small></th></tr>\n" ;

    # Alternate the bar colour between consecutive languages.
    my $gif = ($languages_reported % 2 == 0) ? "bluebar_hor2.gif" : "greenbar_hor2.gif" ;

    $perc_tot = 0 ;
    foreach my $rank (0 .. 49) # at most 50 countries per language
    {
      last if $rank > $#countries ;

      my $requests_this_country = $requests_recently_per_language_per_country {$language} {$countries [$rank]} ;

      $perc = 0 ;
      if ($requests_this_language > 0)
      {
        $perc = &Percentage ($requests_this_country / $requests_this_language) ;

        # Stop at the first country whose share is too small to list; the
        # smaller the language's global share, the higher the cut-off.
        last if ($perc < 0.5)
             || (($perc_global < 0.1)   && ($perc < 1))
             || (($perc_global < 0.01)  && ($perc < 3))
             || (($perc_global < 0.001) && ($perc < 5)) ;

        $perc_tot += $perc ;
      }

      my $country = $countries [$rank] ;
      $country =~ s/ .*$// if length ($country) > 20 ; # long names: keep first word only
      $bar_width = int ($perc * 6) ;

      # Only the very first bar of the report is padded to a 600 pixel width.
      my $bar_100 = "" ;
      if ($bars++ == 0)
      {
        my $bar_width_100 = 600 - $bar_width ;
        $bar_100 = "<img src='background.gif' width=$bar_width_100 height=15>" ;
      }

      if (($country =~ /Australia/) && ($language_name =~ /Japanese/) && ($perc > 5))
      {
        $perc .= " <b><a href='#anomaly' onclick='alert(\"Probably incorrectly assigned to this country.\\nOutdated Regional Internet Registry (RIR) administration may have caused this.\")';><font color='#FF0000'>(*)</font></a></b>" ;
        $anomaly_found = $true ;
      }

      # NOTE(review): "[$requests_this_country ]" looks like leftover debug
      # output in the percentage cell - confirm before removing.
      $html .= "<tr><th class=l class=small nowrap>$country</th>" .
               "<td class=c>[$requests_this_country ]$perc</td>" .
               "<td class=l><img src='$gif' width=$bar_width height=15>$bar_100</td></tr>\n" ;
    }

    if ($perc_tot > 100) { $perc_tot = 100 ; }

    my $perc_other = sprintf '%.1f', 100 - $perc_tot ;
    if ($perc_other > 0)
    {
      $bar_width = $perc_other * 6 ;
      $html .= "<tr><th class=l class=small nowrap>Other</th>" .
               "<td class=c>$perc_other%</td>" .
               "<td class=l><img src='$gif' width=$bar_width height=15></td></tr>\n" ;
    }

    push @index_languages, "<a href='#$anchor_language'>$language_name</a> " ;
    $languages_reported++ ;
  }

  $html .= "</table>" ;
  $html .= "<p><b>Share</b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
           "<br>&nbsp;Further percentages show per country share of requests per Wikipedia visited" ;
  $html .= "<p>Countries are only included if the number of requests in the period exceeds 100,000 (100 matching records in 1:1000 sampled log)" ;
  $html .= "<br>Page requests by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ;
  $html .= "<br> A few false negatives are taken for granted. " ;
  $html .= $colophon ;

  my $index = &HtmlIndex (join '/ ', sort (@index_languages)) ;
  $html =~ s/INDEX/$index/ ;

  &PrintHtml ($html, "$path_out/$file_html_per_language_breakdown") ;
}
 4374+
 4375+sub WriteReportPerCountryOverview
 4376+{
 4377+ print "\nWriteReportPerCountryOverview\n" ;
 4378+
 4379+ my ($title,$views_edits,$links) = @_ ;
 4380+ my ($link_country,$population,$icon,$bar,$bars,$bar_width,$perc,$perc_tot,$perc_global,$requests_tot) ;
 4381+ my (@index_countries,@csv_countries) ;
 4382+ my $views_edits_lc = lc $views_edits ;
 4383+ my $views_edits_lcf = ucfirst $views_edits_lc ;
 4384+ ($views_edits2 = $views_edits) =~ s/ /\<br\>/ ;
 4385+ if ($views_edits =~ /edit/i)
 4386+ { $MPVE = 'MPE' ; } # monthly page edits
 4387+ else
 4388+ { $MPVE = 'MPV' ; } # monthly page views
 4389+
 4390+ $html = $header ;
 4391+ $html =~ s/TITLE/$title/ ;
 4392+ $html =~ s/HEADER/$title/ ;
 4393+ $html =~ s/LINKS// ;
 4394+ $html =~ s/ALSO/$links/ ;
 4395+ $html =~ s/NOTES// ;
 4396+ $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ;
 4397+ $html =~ s/DATE// ;
 4398+
 4399+ $html .= &HtmlSortTable ;
 4400+
 4401+ $html .= "<p><table border=1 width=800 class=tablesorter id=table1>\n" ;
 4402+ $html .= "<thead>\n" ;
 4403+ $html .= "INDEX\n" ;
 4404+
 4405+ $html .= &HtmlWorldMaps ;
 4406+
 4407+ $html .= "<tr><td class=rh5 colspan=3 rowspan=1><b>Country</b></td><td class=c rowspan=2><b>Monthly<br>$views_edits2</b></td>" .
 4408+ "<td class=r rowspan=2><b>Population</b></td>" . # <td class=c rowspan=2><b>$MPVE's<br>Per<br>Person</b></td>" .
 4409+ "<td class=c colspan=2><b>Internet<br>Users</b></td><td class=c><b>${MPVE}'s<br>Per<br>I U</b></td>" .
 4410+ "<td colspan=99 class=l rowspan=2><b>Share in Global Monthly $views_edits</b><br><small><font color=#808080>red and blue bars have different scale</font></small></td></tr>\n" ;
 4411+ $html .= "<tr><td class=c><b>Name</b></td><td class=c><b>Region</b><br><img src='http://stats.wikimedia.org/Location_of_Continents2.gif'></td><td class=c><b>N/S</b></td><td class=c><b>Total</b></td><td class=c><b>/Pop.</b></td></tr>\n" ;
 4412+ $html .= "<tr><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th colspan=2>&nbsp;</th></tr>\n" ;
 4413+ $html .= "</thead><tbody>\nTOTAL\nREGIONS\n" ;
 4414+
 4415+ push @csv_countries, "# Wikimedia Traffic Analysis Report - Wikipedia $views_edits Per Country - Overview\n" .
 4416+ "# Report based on data from $requests_recently_start - $requests_recently_stop\n" .
 4417+ "country name, country code, monthly $views_edits_lc,population,internet users,internet penetration,monthly $views_edits_lc per internet user,share of global $views_edits_lc\n" ;
 4418+
 4419+ $requests_tot = 0 ;
 4420+
 4421+ undef %requests_per_region ;
 4422+
 4423+ foreach $country_code (keys_sorted_by_value_num_desc %requests_recently_per_country_code)
 4424+ {
 4425+ my ($country,$code) = split ('\|', $country_code) ;
 4426+
 4427+ my $region_code = $region_codes {$code} ;
 4428+ my $north_south_code = $north_south_codes {$code} ;
 4429+
 4430+ $region_name = $region_code ;
 4431+ $region_name =~ s/^AF$/<font color=#028702><b>Africa<\/b><\/font>/ ;
 4432+ $region_name =~ s/^CA$/<font color=#249CA0><b>Central-America<\/b><\/font>/ ;
 4433+ $region_name =~ s/^SA$/<font color=#FCAA03><b>South-America<\/b><\/font>/ ;
 4434+ $region_name =~ s/^NA$/<font color=#C802CA><b>North-America<\/b><\/font>/ ;
 4435+ $region_name =~ s/^AU$/<font color=#02AAD4><b>Australia<\/b><\/font>/ ;
 4436+ $region_name =~ s/^EU$/<font color=#0100CA><b>Europe<\/b><\/font>/ ;
 4437+ $region_name =~ s/^AS$/<font color=#E10202><b>Asia<\/b><\/font>/ ;
 4438+ $region_name =~ s/^OC$/<font color=#02AAD4><b>Oceania<\/b><\/font>/ ;
 4439+
 4440+ $north_south_name = $north_south_code ;
 4441+ $north_south_name =~ s/^N$/<font color=#000BF7><b>N<\/b><\/font>/ ;
 4442+ $north_south_name =~ s/^S$/<font color=#FE0B0D><b>S<\/b><\/font>/ ;
 4443+
 4444+print "\n" ; # qqq
 4445+ ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
 4446+
 4447+ my $requests_this_country = $requests_recently_per_country {$country} ;
 4448+ my $requests_this_country2 = int ($requests_this_country * 1000 / $months_recently) ;
 4449+ $requests_tot += $requests_this_country2 ;
 4450+
 4451+ $requests_per_region {$region_code} += $requests_this_country ;
 4452+ $requests_per_region {$north_south_code} += $requests_this_country ;
 4453+ $requests_per_region2 {$region_code} += $requests_this_country2 ;
 4454+ $requests_per_region2 {$north_south_code} += $requests_this_country2 ;
 4455+
 4456+ $requests_per_person = ".." ;
 4457+ if ($population > 0)
 4458+ { $requests_per_person = sprintf ("%.0f", $requests_this_country2 / $population) ; }
 4459+
 4460+ $requests_per_connected_person = ".." ;
 4461+ if ($connected > 0)
 4462+ {
 4463+ if ($views_edits =~ /edit/i)
 4464+ { $requests_per_connected_person = sprintf ("%.4f", $requests_this_country2 / $connected) ; }
 4465+ else
 4466+ {
 4467+ if ($requests_this_country2 / $connected >= 1.95)
 4468+ { $requests_per_connected_person = sprintf ("%.0f", $requests_this_country2 / $connected) ; }
 4469+ else
 4470+ { $requests_per_connected_person = sprintf ("%.1f", $requests_this_country2 / $connected) ; }
 4471+ }
 4472+ }
 4473+
 4474+ $perc_share_total = '..' ;
 4475+ if ($requests_recently_all > 0)
 4476+ { $perc_share_total = &Percentage ($requests_this_country / $requests_recently_all) ; }
 4477+ $perc_tot += $perc_share_total ;
 4478+
 4479+ $bar = "&nbsp;" ;
 4480+ if ($perc_share_total > 0)
 4481+ { $bar = "<img src='redbar_hor.gif' width=" . (int ($perc_share_total * 10)) . " height=15>" ; }
 4482+
 4483+ $perc_connected = ".." ;
 4484+ if ($population > 0)
 4485+ { $perc_connected = sprintf ("%.0f", 100 * $connected / $population) .'%' ; }
 4486+
 4487+ # now use country names that are suitable for http://gunn.co.nz/map/
 4488+ $country2 = $country ;
 4489+ $country2 =~ s/Moldova, Republic of/Moldova/ ;
 4490+ $country2 =~ s/Korea, Republic of/South Korea/ ;
 4491+ $country2 =~ s/Korea, Democratic People's Republic of/North Korea/ ;
 4492+ $country2 =~ s/Iran, Islamic Republic of/Iran/ ;
 4493+ $country2 =~ s/UAE/United Arab Emirates/ ;
 4494+ $country2 =~ s/Congo - The Democratic Republic of the/Democratic Republic of the Congo/ ;
 4495+ $country2 =~ s/^Congo$/Republic of the Congo/ ;
 4496+ $country2 =~ s/Syrian Arab Republic/Syria/ ;
 4497+ $country2 =~ s/Tanzania, United Republic of/Tanzania/ ;
 4498+ $country2 =~ s/Libyan Arab Jamahiriya/Libya/ ;
 4499+ $country2 =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ;
 4500+ $country2 =~ s/Serbia/republic of serbia/ ;
 4501+ $country2 =~ s/Lao People's Democratic Republic/Laos/ ;
 4502+
 4503+
 4504+ push @csv_countries, "$country2,$code,$requests_this_country2,$population,$connected,$perc_connected,$requests_per_connected_person,$perc\n" ;
 4505+
 4506+ $population2 = &i2KM2 ($population) ;
 4507+ $connected2 = &i2KM2 ($connected) ;
 4508+ $requests_this_country2 = &i2KM2 ($requests_this_country2) ;
 4509+ $html .= "<tr><th class=rh3><a id='$country' name='$country'></a>$link_country $icon</td>" .
 4510+ "<td>$region_name</td>" .
 4511+ "<td>$north_south_name</td>" .
 4512+ "<td>$requests_this_country2</td>" .
 4513+ "<td>$population2</td>" . # <td>$requests_per_person</td>" .
 4514+ "<td>$connected2</td>" .
 4515+ "<td>$perc_connected</td>" .
 4516+ "<td>$requests_per_connected_person</td>" .
 4517+ "<td>$perc_share_total</td>" .
 4518+ "<td class=l>$bar</td></tr>\n" ;
 4519+
 4520+ if ($verbose)
 4521+ { push @index_countries, "<a href=#$country>$country ($perc)</a>\n " ; }
 4522+ else
 4523+ { push @index_countries, "<a href=#$country>$country</a>\n " ; }
 4524+ }
 4525+
 4526+
 4527+ $requests_per_person_tot = '..' ;
 4528+
 4529+ if ($population_tot > 0)
 4530+ { $requests_per_person_tot = sprintf ("%.0f", $requests_tot / $population_tot) ; }
 4531+
 4532+ if ($connected_tot > 0)
 4533+ {
 4534+ if ($views_edits =~ /edit/i)
 4535+ { $requests_per_connected_person_tot = sprintf ("%.4f", $requests_tot / $connected_tot) ; }
 4536+ else
 4537+ { $requests_per_connected_person_tot = sprintf ("%.0f", $requests_tot / $connected_tot) ; }
 4538+ }
 4539+
 4540+ $perc_connected_tot = ".." ;
 4541+ if ($population_tot > 0)
 4542+ { $perc_connected_tot = sprintf ("%.0f", 100 * $connected_tot / $population_tot) .'%' ; }
 4543+
 4544+ push @csv_countries, "world,*,$requests_tot,$population_tot,$connected_tot,$perc_connected_tot,$requests_per_connected_person_tot,100%\n" ;
 4545+
 4546+ $requests_tot2 = &i2KM2 ($requests_tot) ;
 4547+ $population_tot2 = &i2KM2 ($population_tot) ;
 4548+ $connected_tot2 = &i2KM2 ($connected_tot) ;
 4549+
 4550+ $html_total = "<tr><th class=rh3>All countries in</td>" .
 4551+ "<td><b>World</b></td>" .
 4552+ "<td>&nbsp;</td>" .
 4553+ "<td>$requests_tot2</td>" .
 4554+ "<td>$population_tot2</td>" .
 4555+ "<td>$connected_tot2</td>" .
 4556+ "<td>$perc_connected_tot</td>" .
 4557+ "<td>$requests_per_connected_person_tot</td>" .
 4558+ "<td>100%</th>" .
 4559+ "<td class=l>&nbsp;</td></tr>\n" ;
 4560+ $html_total .= "<tr><td colspan=99>&nbsp;</td></tr>" ;
 4561+
 4562+
 4563+ undef @keys_regions ;
 4564+# foreach $key (sort keys %population_per_hemisphere)
 4565+# { push @keys_regions, $key ; }
 4566+ $html_regions = '' ;
 4567+ foreach $key (qw (N S AF AS AU EU CA NA SA OC))
 4568+ {
 4569+ $region = $key ;
 4570+
 4571+ $region =~ s/^N$/<font color=#000BF7><b>Global North<\/b><\/font>/ ;
 4572+ $region =~ s/^S$/<font color=#FE0B0D><b>Global South<\/b><\/font>/ ;
 4573+
 4574+ $region =~ s/^AF$/<font color=#028702><b>Africa<\/b><\/font>/ ;
 4575+ $region =~ s/^CA$/<font color=#249CA0><b>Central-America<\/b><\/font>/ ;
 4576+ $region =~ s/^SA$/<font color=#FCAA03><b>South-America<\/b><\/font>/ ;
 4577+ $region =~ s/^NA$/<font color=#C802CA><b>North-America<\/b><\/font>/ ;
 4578+ $region =~ s/^AU$/<font color=#02AAD4><b>Australia<\/b><\/font>/ ;
 4579+ $region =~ s/^EU$/<font color=#0100CA><b>Europe<\/b><\/font>/ ;
 4580+ $region =~ s/^AS$/<font color=#E10202><b>Asia<\/b><\/font>/ ;
 4581+ $region =~ s/^OC$/<font color=#02AAD4><b>Oceania<\/b><\/font>/ ;
 4582+
 4583+ $population_region = $population_per_region {$key} ;
 4584+ $connected_region = $connected_per_region {$key} ;
 4585+ $requests_region = $requests_per_region {$key} ;
 4586+ $requests_region2 = $requests_per_region2 {$key} ;
 4587+
 4588+ $perc_connected_region = ".." ;
 4589+ if ($population_region > 0)
 4590+ { $perc_connected_region = sprintf ("%.0f", 100 * $connected_region / $population_region) .'%' ; }
 4591+
 4592+ $perc_share_total = '..' ;
 4593+ if ($requests_recently_all > 0)
 4594+ { $perc_share_total = &Percentage ($requests_region / $requests_recently_all) ; }
 4595+
 4596+ $perc_connected_region = ".." ;
 4597+ if ($population_region > 0)
 4598+ { $perc_connected_region = sprintf ("%.0f", 100 * $connected_region / $population_region) .'%' ; }
 4599+
 4600+ # $requests_region2 = int ($requests_region * 1000 / $months_recently) ;
 4601+
 4602+ $requests_per_connected_person = '..' ;
 4603+ if ($connected_region > 0)
 4604+ {
 4605+ if ($views_edits =~ /edit/i)
 4606+ { $requests_per_connected_person = sprintf ("%.4f", $requests_region2 / $connected_region) ; }
 4607+ else
 4608+ { $requests_per_connected_person = sprintf ("%.0f", $requests_region2 / $connected_region) ; }
 4609+ }
 4610+
 4611+ $population_region = &i2KM2 ($population_region) ;
 4612+ $connected_region = &i2KM2 ($connected_region) ;
 4613+ $requests_region = &i2KM2 ($requests_region) ;
 4614+ $requests_region2 = &i2KM2 ($requests_region2) ;
 4615+
 4616+ $bar = "&nbsp;" ;
 4617+ if ($perc_share_total > 0)
 4618+ { $bar = "<img src='bluebar_hor.gif' width=" . (int ($perc_share_total * 3)) . " height=15>" ; }
 4619+
 4620+ # $html_regions .= &WriteReportPerCountryOverviewLine ("All countries in", $region, '', $requests, $population) ;
 4621+ $html_regions .= "<tr><th>All countries in</th>" .
 4622+ "</td><td>$region</td>" .
 4623+ "<td>&nbsp;</td>" .
 4624+ "<td>$requests_region2</td>" .
 4625+ "<td>$population_region</td>" .
 4626+ "<td>$connected_region</td>" .
 4627+ "<td>$perc_connected_region</td>" .
 4628+ "<td>$requests_per_connected_person</td>" .
 4629+ "<td>$perc_share_total</th>" .
 4630+ "<td class=l>$bar</td></tr>\n" ;
 4631+
 4632+ if (($key eq 'S') || (($key eq 'OC')))
 4633+ { $html_regions .= "<tr><td colspan=99>&nbsp;</td></tr>" ; }
 4634+ }
 4635+
 4636+
 4637+ $html .= "</tbody>\n</table>" ;
 4638+ $html .= "<p>Countries are only included if the number of $views_edits_lc in the period exceeds 100,000 (100 matching records in 1:1000 sampled log)" ;
 4639+ $html .= "<br>$views_edits_lcf by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ;
 4640+ $html .= "<br> A few false negatives are taken for granted. " ;
 4641+ $html .= "Country meta data collected from English Wikipedia (<a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>population</a>, <a href='http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'>internet users</a>)). " ;
 4642+# $html .= "<br>Monthly $views_edits_lc per person is calculated over total population, regardless of age and internet connectivity" ; # how come, misplaced here ?!
 4643+
 4644+ $html .= &HtmlSortTableColumns; ;
 4645+ $html .= $colophon ;
 4646+
 4647+ $index = &HtmlIndex (join '/ ', sort (@index_countries)) ;
 4648+ $html =~ s/INDEX/$index/ ;
 4649+ $html =~ s/TOTAL/$html_total/ ;
 4650+ $html =~ s/REGIONS/$html_regions/ ;
 4651+
 4652+ &PrintHtml ($html, "$path_out/$file_html_per_country_overview") ;
 4653+}
 4654+
 4655+#sub WriteReportPerCountryOverviewLine
 4656+#{
 4657+# my ($name,$region,$hemisphere,$population,$connected,$requests) = @_ ;
 4658+# my ($perc_requests, $perc_connected, $requests_per_connected_person) ;
 4659+# my $html ;
 4660+# $html = "<tr><th>$name</th></td><td>$region</td><td>$hemisphere</td><td>$requests</td>" .
 4661+# "<td>$population</td>" . # <td>$requests_per_person_tot</td>" .
 4662+# "<td>$connected</td><td>$perc_connected</td><td>$requests_per_connected_person</td>" .
 4663+# "<td>$perc_requests</th><td class=l>&nbsp;</td></tr>\n" ;
 4664+# return ($html) ;
 4665+#}
 4666+
# Renders five world map frames for one period and writes a CSV overview file.
#
# Frames 1-4 blend this period's per-country counts with the counts of the
# period handled by the previous call (remembered in global $period_prev), in
# steps of one fifth; frame 5 uses this period's counts unblended. For every
# frame the colours are stored in global %fills by RatioAndFillColor2 and the
# map is written by WriteWorldMapSvg as "<period>-<frame>".
#
# Arguments:
#   $views_edits    label inserted in the CSV header line
#   $period         key into the per-period hashes, also part of map file names
#   $ref_requests_per_period_per_country_code
#                   hash ref: period -> { "country|code" => count }
#   $max_requests_per_connected_us
#                   not referenced by live code in this sub
#   $desc_animation not referenced in this sub
#
# Side effects: updates globals $period_prev, $header_csv_countries, %fills,
# $requests_per_person_tot, $requests_per_connected_person_tot,
# $perc_connected_tot and $file_csv_per_country_overview2; prints progress.
sub WriteCsvSvgFilePerCountryOverview
{
  my ($views_edits, $period, $ref_requests_per_period_per_country_code, $max_requests_per_connected_us, $desc_animation) = @_ ;

  my %requests_per_country_code      = %{$ref_requests_per_period_per_country_code -> {$period}} ;
  # on the first call $period_prev is still undefined: blend with an empty set then
  my %requests_per_country_code_prev = %{$ref_requests_per_period_per_country_code -> {$period_prev} || {}} ;
  $period_prev = $period ;

  my $description = $descriptions_per_period {$period} ;
  my $postfix     = $descriptions_per_period {$period} ;

  print "\nWriteCsvSvgFilePerCountryOverview\n" ;

  my ($link_country,$country,$code,$population,$connected,$icon,$perc,$perc_tot,$requests_tot,$requests_this_country) ;
  my (@csv_countries,%percentage_of_total_pageviews,%requests_per_connected_persons) ;

  $header_csv_countries = "# Wikimedia Traffic Analysis Report - Wikipedia $views_edits Per Country - Overview\n" .
                          "# Report based on data from $description\n" .
                          "country,code,views,population,internet users,%connected,views per user,%global views\n" ;

  $requests_tot = 0 ;
  undef %fills ;

  # intermediate animation frames: weight shifts from previous to current period
  foreach my $frame (1 .. 4)
  {
    my $weight_now  = $frame ;
    my $weight_prev = 5 - $frame ;

    foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code)
    {
      ($country,$code) = split ('\|', $country_code) ;
      # result is not used for the map frames; call kept as in the original code
      ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;

      $requests_this_country = ($weight_now  * $requests_per_country_code      {$country_code} +
                                $weight_prev * $requests_per_country_code_prev {$country_code}) / 5 ;
      ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ;
    }
    &WriteWorldMapSvg ("$period-$frame", $description) ;
  }

  # final frame: current period only
  foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code)
  {
    ($country,$code) = split ('\|', $country_code) ;
    ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;

    $requests_this_country = $requests_per_country_code {$country_code} ;
    ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ;

    # NOTE(review): this 'next' makes the remainder of the loop body unreachable,
    # so no per-country csv rows are produced and $requests_tot stays 0.
    # Left in place to keep current output unchanged - confirm whether intended.
    next ;

    $requests_this_country = &CorrectForMissingDays ($period, $requests_per_country_code {$country_code} * 1000, $code, "\$requests_this_country") ;

    $requests_tot += $requests_this_country ;

    $requests_per_person = ".." ;
    if ($population > 0)
    { $requests_per_person = sprintf ("%.1f", $requests_this_country / $population) ; }

    $requests_per_connected_person = ".." ;
    if ($connected > 0)
    { $requests_per_connected_person = sprintf ("%.1f", $requests_this_country / $connected) ; }

    $perc = '..' ;
    $requests_all = &CorrectForMissingDays ($period, $requests_all_per_period {$period} * 1000, $code, "\$requests_all") ;
    if ($requests_all > 0)
    { $perc = &Percentage ($requests_this_country / $requests_all) ; }
    $perc_tot += $perc ;

    $perc_connected = ".." ;
    if ($population > 0)
    { $perc_connected = sprintf ("%.1f", 100 * $connected / $population) .'%' ; }

    # now use country names that are suitable for http://gunn.co.nz/map/
    $country =~ s/Moldova, Republic of/Moldova/ ;
    $country =~ s/Korea, Republic of/South Korea/ ;
    $country =~ s/Korea, Democratic People's Republic of/North Korea/ ;
    $country =~ s/Iran, Islamic Republic of/Iran/ ;
    $country =~ s/UAE/United Arab Emirates/ ;
    $country =~ s/Congo - The Democratic Republic of the/Democratic Republic of the Congo/ ;
    $country =~ s/^Congo$/Republic of the Congo/ ;
    $country =~ s/Syrian Arab Republic/Syria/ ;
    $country =~ s/Tanzania, United Republic of/Tanzania/ ;
    $country =~ s/Libyan Arab Jamahiriya/Libya/ ;
    $country =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ;
    $country =~ s/Serbia/republic of serbia/ ;
    $country =~ s/Lao People's Democratic Republic/Laos/ ;

    ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor ($code, $requests_per_person, 3, $ratio_sqrt) ;
    $ratio_svg = sprintf ("%.1f", $ratio_svg) ;
    push @csv_countries, "\"$country\",$code,$requests_this_country,$population,$connected,$perc_connected,$requests_per_connected_person,$perc,$requests_svg,$ratio_svg,$fill_svg\n" ;

    $requests_per_connected_persons {lc $code} = $requests_per_connected_person ;
    $requests_per_persons           {lc $code} = $requests_per_person ;
    $percentage_of_total_pageviews  {lc $code} = $perc ;
  }
  &WriteWorldMapSvg ("$period-5", $description) ;

  # world totals; $population_tot and $connected_tot are globals set elsewhere
  $requests_per_person_tot           = '..' ;
  $requests_per_connected_person_tot = '..' ; # was left at whatever an earlier report stored
  $perc_connected_tot                = '..' ;

  if ($population_tot > 0)
  {
    $requests_per_person_tot = sprintf ("%.1f", $requests_tot / $population_tot) ;
    $perc_connected_tot      = sprintf ("%.1f", 100 * $connected_tot / $population_tot) .'%' ;
  }

  if ($connected_tot > 0)
  { $requests_per_connected_person_tot = sprintf ("%.1f", $requests_tot / $connected_tot) ; }

  push @csv_countries, "world,*,$requests_tot,$population_tot,$connected_tot,$perc_connected_tot,$requests_per_connected_person_tot,100%\n" ;
  print "$period $requests_tot\n" ;

  $file_csv_per_country_overview2 = $file_csv_per_country_overview ;
  $file_csv_per_country_overview2 =~ s/\.csv/-$postfix.csv/ ;
  &PrintCsv ($header_csv_countries . join ('', sort @csv_countries), "$path_out/svg/$file_csv_per_country_overview2") ;
}
 4894+
# Colours the blank world map template and renders it for one animation frame.
#
# Reads world_map_blank_plain2.svg from the current directory, replaces the
# placeholders Xxxx/Yyyy (the two halves of $description, split on ' - ') and
# Zzzz ($desc_animation), applies the colours in global %fills to the country
# groups and paths, writes svg/world_map_<period>.svg and finally runs
# svg/convert.exe to produce a png of the same name.
#
# Arguments:
#   $period       used in the output file names
#   $description  text of the form "<text1> - <text2>"
#
# Notes:
# - All working variables, including $period and $description, are package
#   globals, as elsewhere in this file.
# - $desc_animation is read as a package global here; a caller that only holds
#   it in a lexical ('my') variable will not be seen - TODO confirm it is set.
# - If the template or the output file can not be opened a message is printed
#   and nothing further is done (previously an empty map was converted).
sub WriteWorldMapSvg
{
  ($period, $description) = @_ ;

  my ($svg_in, $svg_out) ;
  my $file_template = "world_map_blank_plain2.svg" ;
  my $file_svg      = "svg/world_map_$period.svg" ;
  my $style_rest    = "fill-opacity:1;stroke:#000000;stroke-width:2.5" ;

  if (! open ($svg_in, '<', $file_template))
  {
    print "WriteWorldMapSvg: could not open '$file_template': $!\n" ;
    return ;
  }
  @lines = <$svg_in> ;
  close $svg_in ;

  ($text1,$text2) = split ' - ', $description ;
  print "Animation description: $description -> $text1 | $text2\n" ;

  $lines = join '', @lines ;
  $lines =~ s/<circle[^>]*?>//gs ;
  $lines =~ s/Yyyy/$text2/ ;
  $lines =~ s/Xxxx/$text1/ ;
  $lines =~ s/Zzzz/$desc_animation/ ;

  # pass 1: per <g> group, style the group element itself and its <path> children
  $linenum = 0 ;
  @lines = split '<g', $lines ;
  foreach $line (@lines)
  {
    @lines2 = split '<path', $line ;

    # code = first two characters of the first id="..." in the group header
    # (when there is no id the substitution leaves the text as is)
    ($code = $lines2 [0]) =~ s/^.*?id=\"(\w+)\".*$/$1/s ;
    $code = substr ($code,0,2) ;

    if (defined $fills {$code})
    {
      $fill = $fills {$code} ;
      # ${code}[x-] : braces force [x-] to be read as character class, not as array subscript
      $lines2 [0] =~ s/(id="${code}[x-]?")(?:\s*\n\s*style="[^"]*")?/$1\n style="fill:$fill;$style_rest"/s ;
    }

    $linenum = 0 ;
    foreach $line2 (@lines2)
    {
      ($code = $line2) =~ s/^.*?id=\"(\w+)\".*$/$1/s ;
      $code = substr ($code,0,2) ;

      next if ! defined $fills {$code} ;
      $fill = $fills {$code} ;

      # the first fragment with a known code is left alone here
      next if $linenum ++ == 0 ;
      $line2 =~ s/style="[^"]*"/style="fill:$fill;$style_rest"/s ;
    }
    $line = join '<path', @lines2 ;
  }
  $lines = join '<g', @lines ;

  # pass 2: paths outside groups which still have the default grey fill
  @lines = split '<path', $lines ;
  foreach $line (@lines)
  {
    ($code = $line) =~ s/^.*?id=\"([\w-]+)\".*$/$1/s ;
    next if ! defined $requests_per_persons {$code} ;
    next if ! defined $fills {$code} ; # no colour known: keep default instead of writing 'fill:;'
    $fill = $fills {$code} ;
    $line =~ s/(id="${code}[x-]?")\s*\n\s*style="fill:#b9b9b9[^"]*"/$1\n style="fill:$fill;$style_rest"/sg ;
  }
  $lines = join '<path', @lines ;

  # whatever is still default grey gets a darker grey and a black outline
  $lines =~ s/fill:#b9b9b9;stroke:#ffffff;stroke-width:[\d\.]*/fill:#606060;stroke:#000000;stroke-width:2.5/g ;

  @lines = split ("\n", $lines) ;
  if (! open ($svg_out, '>', $file_svg))
  {
    print "WriteWorldMapSvg: could not write '$file_svg': $!\n" ;
    return ;
  }
  foreach $line (@lines)
  {
    chomp $line ;
    print $svg_out "$line\n" ;
  }
  close $svg_out ;

  print "Convert world_map_$period.svg to png\n" ;
  `svg/convert.exe svg/world_map_$period.svg png:svg/world_map_$period.png` ;
}
 4990+
# Derives a map colour for one country from a value relative to a maximum and
# stores it in global %fills under the lower-cased country code.
#
# Arguments:
#   $code          country code
#   $requests      value to colour by; values above $requests_max are capped
#   $requests_max  value that maps to ratio 1
#   $ratio_sqrt    when true (and the ratio is positive) the square root of the
#                  ratio is used instead of the ratio itself
#
# Returns ($requests, $ratio, $fill): the capped value, the ratio that was used
# and the lower-cased result of hsv2rgb ($ratio*120,1,1)
# (presumably hue, saturation, value - hsv2rgb is defined elsewhere).
sub RatioAndFillColor
{
  my ($code, $requests, $requests_max, $ratio_sqrt) = @_ ;
  my ($ratio, $fill) ;

  if ($requests > $requests_max)
  { $requests = $requests_max ; }

  # a maximum of 0 used to abort the script with 'Illegal division by zero'
  $ratio = 0 ;
  if ($requests_max != 0)
  { $ratio = $requests / $requests_max ; }

  if ($ratio_sqrt && ($ratio > 0))
  { $ratio = sqrt ($ratio) ; }

  $fill = lc (hsv2rgb ($ratio*120,1,1)) ;

  $fills {lc $code} = $fill ;
  return ($requests,$ratio,$fill) ;
}
 5024+
# Variant of RatioAndFillColor with a two-sided colour scale: ratios from 0.5
# upwards use hsv2rgb first argument 120, ratios below 0.5 use 0. In both cases
# the second and third argument grow with the distance of the ratio from 0.5
# (presumably hue, saturation, value - hsv2rgb is defined elsewhere).
# The colour is stored in global %fills under the lower-cased country code.
#
# Arguments:
#   $code          country code
#   $requests      value to colour by; values above $requests_max are capped
#   $requests_max  value that maps to ratio 1
#   $ratio_sqrt    accepted for call compatibility with RatioAndFillColor, not used
#
# Returns ($requests, $ratio, $fill).
sub RatioAndFillColor2
{
  my ($code, $requests, $requests_max, $ratio_sqrt) = @_ ;
  my ($ratio, $fill, $value, $hue) ;

  if ($requests > $requests_max)
  { $requests = $requests_max ; }

  # a maximum of 0 used to abort the script with 'Illegal division by zero'
  $ratio = 0 ;
  if ($requests_max != 0)
  { $ratio = $requests / $requests_max ; }

  if ($ratio >= 0.5)
  {
    $value = $ratio * 2 - 1 ; # 0.5 - 1 -> 0 - 1
    $hue   = 120 ;
  }
  else
  {
    $value = 1 - $ratio * 2 ; # 0 - 0.5 -> 1 - 0
    $hue   = 0 ;
  }

  # an earlier colour formula was still being evaluated here and then overwritten;
  # only the call whose result was actually used is kept
  $fill = lc (hsv2rgb ($hue, 0+$value, 0.5+$value/2)) ;

  $fills {lc $code} = $fill ;
  return ($requests,$ratio,$fill) ;
}
 5055+
 5056+
# Writes the html report with, per country, the share of each Wikipedia
# language in that country's requests (regular or extended version).
#
# Arguments:
#   $title              page title and header
#   $views_edits        label, used lower-cased in the explanatory text
#   $links              html that replaces the ALSO placeholder of the page header
#   $cutoff_requests    countries with fewer recent requests are skipped
#   $cutoff_percentage  languages are listed until the first one below this share
#   $show_logcount      true: extended version, with a column of raw record
#                       counts, written to $file_html_per_country_breakdown_huge;
#                       false: regular version, $file_html_per_country_breakdown
#
# Input comes from globals %requests_recently_per_country,
# %requests_recently_per_country_per_language, $requests_recently_all,
# %out_languages, $months_recently, $header, $colophon and $verbose.
# The page is built in global $html and written with PrintHtml.
sub WriteReportPerCountryBreakdown
{
  print "\nWriteReportPerCountryBreakDown\n" ;

  my ($title,$views_edits,$links,$cutoff_requests, $cutoff_percentage, $show_logcount) = @_ ;
  my ($link_country,$population,$icon,$bars,$bar_width,$perc,$perc_country,$perc_tot) ;
  my ($requests_this_language, $requests_all_languages, $requests_used, $requests_other) ;
  my @index_countries ;
  my $views_edits_lc = lc $views_edits ;

  if ($show_logcount)
  { $report_version = "<p>This is the extended version of this report, with even small percentages included (> $cutoff_percentage\%) (see also bottom of page). " .
                      "Switch to <a href='$file_html_per_country_breakdown'>regular version</a>" ; }
  else
  { $report_version = "<p>This is the regular version of this report, with only major percentages (> $cutoff_percentage\%) included." .
                      " Switch to <a href='$file_html_per_country_breakdown_huge'>extended version</a>" ; }

  $html = $header ;
  $html =~ s/TITLE/$title/ ;
  $html =~ s/HEADER/$title/ ;
  $html =~ s/LINKS// ;
  $html =~ s/ALSO/$links/ ;
  $html =~ s/NOTES// ;
  $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ;
  $html =~ s/DATE// ;

  $html .= "<p><table border=1 width=800>INDEX\n" ;

  # called with '&' and without parentheses on purpose: keeps passing this sub's @_ as before
  $html .= &HtmlWorldMaps ;

  my $anomaly_found ; # only needed by the (disabled) footnote on anomalies

  foreach $country (keys_sorted_by_value_num_desc %requests_recently_per_country)
  {
    next if $requests_recently_per_country {$country} < $cutoff_requests ;

    %requests_per_language = %{$requests_recently_per_country_per_language {$country}} ;
    @languages = keys_sorted_by_value_num_desc %requests_per_language ;

    $requests_this_country = $requests_recently_per_country {$country} ;

    # share of this country in global total; kept apart from $perc, which is reused per language below
    $perc_country = 'n.a.' ;
    if ($requests_recently_all > 0)
    { $perc_country = &Percentage ($requests_this_country / $requests_recently_all) ; }

    ($link_country,$icon,$population) = &CountryMetaInfo ($country) ;

    # country names may contain an apostrophe, which would end the quoted attribute value
    (my $anchor = $country) =~ s/'/&#39;/g ;

    $html .= "<tr><th colspan=99 class=lh3><a id='$anchor' name='$anchor'></a><br>$icon $link_country <small>($perc_country share of global total)</small></th></tr>\n" ;

    $perc_tot = 0;
    $requests_used = 0 ;
    for (my $rank = 0 ; $rank < 50 ; $rank++)
    {
      $requests_this_language = $requests_recently_per_country_per_language {$country} {$languages [$rank]} ;
      $requests_all_languages = $requests_recently_per_country {$country} ;

      last if $requests_this_language == 0 ;

      $perc = 0 ;
      if ($requests_all_languages > 0) # test the actual divisor
      {
        $perc = &Percentage ($requests_this_language / $requests_all_languages) ;

        last if $perc < $cutoff_percentage ;

        $perc_tot += $perc ;
      }

      # count only languages that are really listed, so that count and percentage for 'Other' agree
      $requests_used += $requests_this_language ;

      $language = $languages [$rank] ;
      if ($out_languages {$language} ne "")
      { $language = $out_languages {$language} ; }
      if (length ($language) > 20)
      { $language =~ s/ .*$// ; }
      $bar_width = int ($perc * 6) ;

      if (($country eq "Australia") && ($language eq "Japanese") && ($perc > 5))
      {
        $language .= " <b><a href='#anomaly' onclick='alert(\"Probably incorrectly assigned to this country.\\nOutdated Regional Internet Registry (RIR) administration may have caused this.\");'><font color='#FF0000'>(*)</font></a></b>" ;
        $anomaly_found = $true ;
      }

      # only the very first row of the report gets a filler image, 600 pixels wide together with its bar
      $bar_100 = "" ;
      if ($bars++ == 0)
      {
        $bar_width_100 = 600 - $bar_width ;
        $bar_100 = "<img src='background.gif' width=$bar_width_100 height=15>" ;
      }

      if ($language !~ /Portal/)
      { $language .= " Wp" ; }

      $perc =~ s/(\.\d)0/$1/ ; # 0.10% -> 0.1%
      if ($show_logcount && ($requests_this_language < 5 * $months_recently)) # flag low counts, to discuss threshold on foundation-l
      { $perc = "<font color=#800000>$perc</font>" ; }

      $html .= "<tr><th class=l class=small nowrap>$language</th>" .
               ($show_logcount ? "<td class=r>$requests_this_language</td>" : "") .
               "<td class=c>$perc</td>" .
               "<td class=l><img src='yellowbar_hor.gif' width=$bar_width height=15>$bar_100</td></tr>\n" ;
    }

    if ($perc_tot > 100) { $perc_tot = 100 ; }
    $requests_other = $requests_all_languages - $requests_used ;
    $perc_other = sprintf '%.1f', 100 - $perc_tot ;
    if (($requests_other > 0) && ($perc_other > 0))
    {
      $bar_width = $perc_other * 6 ;
      $html .= "<tr><th class=l class=small nowrap>Other</th>" .
               ($show_logcount ? "<td class=r>$requests_other</td>" : "") .
               "<td class=c>$perc_other%</td>" .
               "<td class=l><img src='yellowbar_hor.gif' width=$bar_width height=15></td></tr>\n" ;
    }

    if ($verbose)
    { push @index_countries, "<a href='#$anchor'>$country ($perc_country)</a> " ; }
    else
    { push @index_countries, "<a href='#$anchor'>$country</a> " ; }
  }
  $html .= "</table>" ;
  $html .= "<p><b>Share</b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
           "<br>&nbsp;Further percentages show per country share of $views_edits_lc per Wikipedia visited" ;
  $html .= "<p><b>Countries</b> are only included if the number of requests in the period exceeds $cutoff_requests,000 ($cutoff_requests matching records in 1:1000 sampled log)" ;
  $html .= "<p><b>Wikipedia's</b> are only listed for some country if the share of visitors for that particular country exceeds $cutoff_percentage\%." ;
  if ($show_logcount)
  {
    $html .= "<p>The second column displays the actual <b>numbers of records</b> found in the 1:1000 sampled log on which the percentage is based." .
             "<br>Multiply by 1000 for actual $views_edits_lc over the whole period of $months_recently months." ;
    $html .= "<br>If the number of records in the sampled log does not reach the (arbitrary) number of 5 per sampled month, the percentage is flagged dark red to extra emphasize high inaccuracy." ;
  }

  $html .= "<p>Page requests by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ;
  $html .= "<br> A few false negatives are taken for granted. " .
           "Country meta data collected from <a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>English Wikipedia</a>. " .
           "Portal = <a href='http://www.wikipedia.org'>www.wikipedia.org</a>" ;
# if ($anomaly_found)
# { $html .= "<p><a id='anomaly' name='anomaly'>Probably anomaly caused by outdated <a href='http://en.wikipedia.org/wiki/Regional_Internet_Registry'>Regional Internet Registry</a> administration.\n" ; }

  $html .= $colophon ;

  $index = &HtmlIndex (join '/ ', sort (@index_countries)) ;
  $html =~ s/INDEX/$index/ ;

  if (! $show_logcount)
  { &PrintHtml ($html, "$path_out/$file_html_per_country_breakdown") ; }
  else
  { &PrintHtml ($html, "$path_out/$file_html_per_country_breakdown_huge") ; }
}
 5206+
# Write the per-country trends report: for each country with enough requests, a table
# section with one row per quarter showing that country's share of all requests in the
# quarter, followed by the share of its ten most requested languages.
#
# Parameters:
#   $title       - text substituted for the TITLE and HEADER placeholders in $header
#   $views_edits - 'Page Edits' selects the edits variant (2009 Q3 is skipped and the spacer
#                  column gets a smaller rowspan); the lower-cased value is used in the footer
#   $links       - html substituted for the ALSO placeholder in $header
#
# Reads globals $header, $colophon, @quarters, $requests_start, $requests_stop, $path_out,
# $file_html_per_country_trends, $verbose, %out_languages and the %requests_per_... hashes.
# Writes globals $html, @index_countries, $anomaly_found and a number of loop scratch variables.
# The finished page is handed to PrintHtml, which strips the '[...]' verbose-only fragments.
sub WriteReportPerCountryTrends
{
  print "\nWriteReportPerCountryTrends\n" ;

  my ($title,$views_edits,$links) = @_ ;
  my ($link_country,$population,$icon,$perc,$perc_tot) ;
  my $views_edits_lc = lc $views_edits ;

  $html = $header ;
  $html =~ s/TITLE/$title/ ;
  $html =~ s/HEADER/$title/ ;
  $html =~ s/LINKS// ;
  $html =~ s/ALSO/$links/ ;
  $html =~ s/NOTES// ;
  $html =~ s/X1000/.&nbsp;Period <b>$requests_start - $requests_stop<\/b>/ ;
  $html =~ s/DATE// ;

  $html .= "<p><table border=1 width=800>INDEX\n" ;

  $html .= &HtmlWorldMaps ;

  foreach $country (keys_sorted_by_value_num_desc %requests_per_country)
  {
    # skip countries with on average fewer than 50 counted requests per quarter
    next if $requests_per_country {$country} < 50 * ($#quarters + 1) ;

    %requests_per_language = %{$requests_per_country_per_language {$country}} ;
    @languages = keys_sorted_by_value_num_desc %requests_per_language ;

    ($link_country,$icon,$population) = &CountryMetaInfo ($country) ;

    $html .= "<tr><th colspan=99 class=lh3><a id='$country' name='$country'></a><br>$icon $link_country</th></tr>\n" ;

    # the spacer column spans the header row plus all data rows of this country
    if ($views_edits eq 'Page Edits')
    { $rowspan = $#quarters+2 ; }
    else
    { $rowspan = $#quarters+3 ; }

    $html .= "<tr><th class=small>Quarter</th>[<th class=small>Total</th>]<th class=small>Share</th><th rowspan=$rowspan>&nbsp;</th>\n" ;
    for ($l = 0 ; $l < 10 ; $l++)
    {
      $language = $languages [$l] ;
      if ($out_languages {$language} ne "")
      { $language = $out_languages {$language} ; }
      if (length ($language) > 20)
      { $language =~ s/ .*$// ; } # keep only the first word of long names
      $html .= "<th class=c class=small>$language</th>\n" ;
    }
    $html .= "<th>other</th>\n" ;
    $html .= "</tr>\n" ;

    $lines = 0 ;
    foreach $quarter (reverse @quarters)
    {
      next if $views_edits eq 'Page Edits' and $quarter =~ /2009.*?Q3/ ; # strange results, to be researched

      $line1 = "<tr>\n" ; # row with percentages as text
      $line2 = "<tr>\n" ; # row with percentages as bars

      my $requests_this_country  = $requests_per_quarter_per_country {$quarter} {$country} ;
      my $requests_all_countries = $requests_per_quarter {$quarter} ;

      $perc = 'n.a.' ;
      if ($requests_all_countries > 0)
      { $perc = &Percentage ($requests_this_country / $requests_all_countries) ; }

      # Always emit the leading cells, so a quarter without any data shows 'n.a.'
      # instead of producing a row whose columns are shifted to the left.
      $line1 .= "<th class=c nowrap>&nbsp;$quarter&nbsp;</th>[<td align=right>$requests_this_country</td>]<td align=center>$perc</td>" ;
      $line2 .= "<th nowrap>&nbsp;$quarter&nbsp;</th>[<td align=right>$requests_this_country</td>]<td align=center>$perc</td>" ;

      $perc_tot = 0;
      for ($l = 0 ; $l < 10 ; $l++)
      {
        my $requests_this_language = $requests_per_quarter_per_country_per_language {$quarter} {$country} {$languages [$l]} ;
        my $requests_all_languages = $requests_per_quarter_per_country {$quarter} {$country} ;
        $perc = 0 ;
        if ($requests_all_languages > 0)
        {
          $perc = &Percentage ($requests_this_language / $requests_all_languages) ;
          $perc_tot += $perc ; # Percentage returns text like '12.3%'; only its numeric prefix is added
        }

        if ($perc != 0)
        { $line2 .= "<td class=c><img src='yellowbar_hor.gif' width=$perc height=15></td>" ; }
        else
        { $line2 .= "<td class=l>&nbsp;</td>" ; }

        # flag suspicious mid-sized shares for Australia
        if (($country eq "Australia") && (($perc < 50) && ($perc > 5)))
        { $perc .= " <b><a href='#anomaly' onclick='alert(\"Probably incorrectly assigned to this country.\\nOutdated Regional Internet Registry (RIR) administration may have caused this.\")';><font color='#FF0000'>(*)</font></a></b>" ; $anomaly_found = $true ;}
        $line1 .= "<td class=c>[$requests_this_language]$perc</td>" ;
      }
      if ($perc_tot > 100) { $perc_tot = 100 ; }
      $perc_other = sprintf '%.1f', 100 - $perc_tot ;
      $line1 .= "<td class=c>$perc_other%</td>" ;

      $line1 .= "</tr>\n" ;
      $line2 .= "</tr>\n" ;
      $html .= $line1 ;
      if ($lines++ == $#quarters)
      { $html .= $line2 ; } # only for last quarter
    }

    # NOTE(review): @index_countries is a package global that is only appended to here;
    # confirm that the caller resets it between reports.
    if ($verbose)
    { push @index_countries, "<a href='#$country'>$country ($perc)</a> " ; }
    else
    { push @index_countries, "<a href='#$country'>$country</a> " ; }
  }
  $html .= "</table>" ;
  $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
           "<br>&nbsp;Further percentages show per country per quarter share of $views_edits_lc per Wikipedia visited" ;
  $html .= "<p>Countries are only included if the number of requests in the period exceeds 100,000 (100 matching records in 1:1000 sampled log)" ;
  $html .= "<br>Page requests by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ;
  $html .= "<br> A few false negatives are taken for granted. " .
           "Country meta data collected from <a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>English Wikipedia</a>. " .
           "Portal = <a href='http://www.wikipedia.org'>www.wikipedia.org</a>" ;
  $html .= $colophon ;

  # the INDEX placeholder was put right after the opening <table> tag
  $index = &HtmlIndex (join '/ ', sort (@index_countries)) ;
  $html =~ s/INDEX/$index/ ;

  &PrintHtml ($html, "$path_out/$file_html_per_country_trends") ;
}
 5335+
# Scale a count up for a period in which some days are missing from the logs.
#
# Parameters:
#   $period - key into %missing_days and %correct_for_missing_days
#   $count  - the raw count
#   $code   - code of the item counted; when it matches /us/i a trace line is printed
#   $var    - label, only used in the trace line
# Returns the count unchanged when %missing_days has no positive value for $period,
# otherwise the count times the correction factor, rounded to the nearest integer.
sub CorrectForMissingDays
{
  my ($period, $count, $code, $var) = @_ ;

  # complete period: nothing to correct
  return ($count) unless $missing_days {$period} > 0 ;

  my $factor    = $correct_for_missing_days {$period} ;
  my $corrected = int (0.5 + $count * $factor) ;

  if ($code =~ /us/i)
  {
    print "\nperiod $period: correct for " . $missing_days {$period} . " missing days = * $factor, " .
          " e.g. for $code: $var $count -> $corrected\n\n" ;
  }

  return ($corrected) ;
}
 5350+
# Format a count for display in html.
#
# Returns '&nbsp;' for an empty string and '1' for any value below 1 (so a rounded-down
# count never shows as zero). Counts that consist of digits only get a comma as thousands
# separator, for any number of digits. Anything else is returned unchanged.
sub FormatCount
{
  my $count = shift ;
  if ($count eq "")
  { return ("&nbsp;") ; }
  if ($count < 1)
  { return ("1") ; }

  if ($count =~ /^\d+$/)
  {
    # repeatedly split off the last group of three digits that is not yet preceded by a comma
    1 while $count =~ s/^(\d+)(\d\d\d)(?=,|$)/$1,$2/ ;
  }
  return ($count) ;
}
 5363+
# Sort key for a mime type: the count in %mimetypes, raised so that 'text/html' sorts above
# everything else and png/jpeg/gif images sort above the remaining types.
sub SortMime
{
  my $mime = shift ;

  my $boost = 0 ;
  if ($mime eq "text/html")
  { $boost = 2000000000 ; }
  elsif ($mime =~ /image\/(?:png|jpeg|gif)/)
  { $boost = 1000000000 ; }

  # other mime types: the plain count, whatever it is
  return ($mimetypes {$mime}) unless $boost ;

  return ($boost + $mimetypes {$mime}) ;
}
 5374+
# Reverse (more or less) the project abbreviations: strip a leading '@' or '*' marker and
# replace short project codes by readable names. Returns the expanded text.
sub ExpandAbbreviation
{
  my $text = shift ;

  # [pattern, replacement] pairs, applied one after another in this order
  my @expansions = (
    [ qr/^[\@\*]/,     ''               ],
    [ qr/^xx:upload/,  'upload:&nbsp;'  ],
    [ qr/^wb:/,        'wikibooks:'     ],
    [ qr/^wk:/,        'wiktionary:'    ],
    [ qr/^wn:/,        'wikinews:'      ],
    [ qr/^wp:/,        'wikipedia:'     ],
    [ qr/^wq:/,        'wikiquote:'     ],
    [ qr/^ws:/,        'wikisource:'    ],
    [ qr/^wv:/,        'wikiversity:'   ],
    [ qr/^wx:/,        'wikispecial:'   ],
    [ qr/^mw:/,        'wikispecial:'   ], # eg bugzilla
    [ qr/:!mw/,        ':mediawiki'     ],
    [ qr/^wm:/,        'wikimedia:'     ],
    [ qr/:wm$/,        ':wikimedia'     ],
    [ qr/^wmf:/,       'foundation:'    ],
    [ qr/:www$/,       ':portal'        ],
  ) ;

  foreach my $expansion (@expansions)
  {
    my ($pattern, $replacement) = @$expansion ;
    $text =~ s/$pattern/$replacement/ ;
  }

  return ($text) ;
}
 5398+
# Reduce a url or host name to its secondary domain, e.g. 'http://www.google.com/search'
# -> 'google' and 'http://news.bbc.co.uk/' -> 'bbc'.
#
# The scheme (http or https), the path and a trailing ':port' are removed first. A host
# without any dot is returned as is. Otherwise the top level domain (or a two-level suffix
# like '.co.uk') is cut off and the last remaining label is returned.
# Side effect: (re)assigns the global $pattern_url_post.
sub GetSecondaryDomain
{
  $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ;

  my $domain = shift ;
  $domain =~ s/https?:\/\///i ; # without this an https url would be cut down to 'https:' below
  $domain =~ s/\/.*$// ;        # drop path and query
  $domain =~ s/:\d+$// ;        # drop port number

  if ($domain !~ /\./)
  { return ($domain) ; }

  $domain =~ s/$pattern_url_post// ;
  $domain =~ s/^.*?\.([^\.]+)$/$1/ ; # keep only the last label
  return ($domain) ;
}
 5414+
# Open the log file "$dir_reports/$file_log" for appending on the global handle FILE_LOG
# (the handle Log prints to) and write a banner line with the current date and time.
# Calls abort when the file can not be opened.
sub OpenLog
{
# only shrink log when same log file is appended daily, is no longer the case
# $fileage = -M "$dir_reports/$file_log" ;
# if ($fileage > 5)
# {
# open "FILE_LOG", "<", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
# @log = <FILE_LOG> ;
# close "FILE_LOG" ;
# $lines = 0 ;
# open "FILE_LOG", ">", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
# foreach $line (@log)
# {
# if (++$lines >= $#log - 5000)
# { print FILE_LOG $line ; }
# }
# close "FILE_LOG" ;
# }

  # 'or' instead of '||': with '||' the abort call is bound to the (always true) file name
  # and is never reached, so a failed open went unnoticed
  open (FILE_LOG, ">>", "$dir_reports/$file_log") or abort ("Log file '$file_log' could not be opened.") ;
  &Log ("\n\n===== Wikimedia Sampled Visitors Log Report / " . date_time_english (time) . " =====\n\n") ;
}
 5437+
# Scale a count with the global $multiplier and round it to a whole number.
# Values that round to zero are not raised to 1 here; FormatCount takes care of that.
sub Normalize
{
  my ($count) = @_ ;
  return (sprintf ("%.0f", $count * $multiplier)) ;
}
 5445+
# Print a message to the currently selected output handle and to the log file handle
# FILE_LOG (opened by OpenLog).
sub Log
{
  my $msg = shift ; # lexical, so a package variable $msg used elsewhere is not overwritten
  print $msg ;
  print FILE_LOG $msg ;
}
 5452+
# Fill the global lookup tables for project / language codes (copied from WikiReports.pl):
#   %wikipedias    - code => "url name" (url and name separated by the first white space)
#   %out_urls      - code => url
#   %out_languages - code => language or project name (plus 'www' => 'Portal')
#   %out_article   - code => url of the English Wikipedia article "<name>_language"
# Codes with a hyphen are listed twice, once with hyphen and once with underscore.
sub InitProjectNames
{
  # copied from WikiReports.pl

  %wikipedias = (
# mediawiki=>"http://wikimediafoundation.org Wikimedia",
  nostalgia=>"http://nostalgia.wikipedia.org Nostalgia",
  sources=>"http://wikisource.org Old&nbsp;Wikisource",
  meta=>"http://meta.wikimedia.org Meta-Wiki",
  beta=>"http://beta.wikiversity.org Beta",
  species=>"http://species.wikipedia.org WikiSpecies",
  commons=>"http://commons.wikimedia.org Commons",
  foundation=>"http://wikimediafoundation.org Wikimedia&nbsp;Foundation",
  sep11=>"http://sep11.wikipedia.org In&nbsp;Memoriam",
  nlwikimedia=>"http://nl.wikimedia.org Wikimedia&nbsp;Nederland",
  plwikimedia=>"http://pl.wikimedia.org Wikimedia&nbsp;Polska",
  mediawiki=>"http://www.mediawiki.org MediaWiki",
  dewikiversity=>"http://de.wikiversity.org Wikiversit&auml;t",
  frwikiversity=>"http://fr.wikiversity.org Wikiversit&eacute;", # was German spelling
  wikimania2005=>"http://wikimania2005.wikimedia.org Wikimania 2005",
  wikimania2006=>"http://wikimania2006.wikimedia.org Wikimania 2006",
  aa=>"http://aa.wikipedia.org Afar",
  ab=>"http://ab.wikipedia.org Abkhazian",
  ace=>"http://ace.wikipedia.org Acehnese",
  af=>"http://af.wikipedia.org Afrikaans",
  ak=>"http://ak.wikipedia.org Akan", # was Akana
  als=>"http://als.wikipedia.org Alemannic", # was Elsatian
  am=>"http://am.wikipedia.org Amharic",
  an=>"http://an.wikipedia.org Aragonese",
  ang=>"http://ang.wikipedia.org Anglo-Saxon",
  ar=>"http://ar.wikipedia.org Arabic",
  arc=>"http://arc.wikipedia.org Aramaic",
  arz=>"http://arz.wikipedia.org Egyptian Arabic",
  as=>"http://as.wikipedia.org Assamese",
  ast=>"http://ast.wikipedia.org Asturian",
  av=>"http://av.wikipedia.org Avar", # was Avienan
  ay=>"http://ay.wikipedia.org Aymara",
  az=>"http://az.wikipedia.org Azeri", # was Azerbaijani
  ba=>"http://ba.wikipedia.org Bashkir",
  bar=>"http://bar.wikipedia.org Bavarian",
  bat_smg=>"http://bat-smg.wikipedia.org Samogitian",
  "bat-smg"=>"http://bat-smg.wikipedia.org Samogitian",
  bcl=>"http://bcl.wikipedia.org Central Bicolano",
  be=>"http://be.wikipedia.org Belarusian",
  "be-x-old"=>"http://be-x-old.wikipedia.org Belarusian (Tarashkevitsa)", # url pointed to be.wikipedia.org
  be_x_old=>"http://be-x-old.wikipedia.org Belarusian (Tarashkevitsa)",   # url pointed to be.wikipedia.org
  bg=>"http://bg.wikipedia.org Bulgarian",
  bh=>"http://bh.wikipedia.org Bihari",
  bi=>"http://bi.wikipedia.org Bislama",
  bm=>"http://bm.wikipedia.org Bambara",
  bn=>"http://bn.wikipedia.org Bengali",
  bo=>"http://bo.wikipedia.org Tibetan",
  bpy=>"http://bpy.wikipedia.org Bishnupriya Manipuri",
  br=>"http://br.wikipedia.org Breton",
  bs=>"http://bs.wikipedia.org Bosnian",
  bug=>"http://bug.wikipedia.org Buginese",
  bxr=>"http://bxr.wikipedia.org Buryat",
  ca=>"http://ca.wikipedia.org Catalan",
  cbk_zam=>"http://cbk-zam.wikipedia.org Chavacano",
  "cbk-zam"=>"http://cbk-zam.wikipedia.org Chavacano",
  cdo=>"http://cdo.wikipedia.org Min Dong",
  ce=>"http://ce.wikipedia.org Chechen",
  ceb=>"http://ceb.wikipedia.org Cebuano",
  ch=>"http://ch.wikipedia.org Chamorro", # was Chamoru
  ckb=>"http://ckb.wikipedia.org Sorani",
  cho=>"http://cho.wikipedia.org Choctaw", # was Chotaw
  chr=>"http://chr.wikipedia.org Cherokee",
  chy=>"http://chy.wikipedia.org Cheyenne", # was Sets&ecirc;hest&acirc;hese
  co=>"http://co.wikipedia.org Corsican",
  cr=>"http://cr.wikipedia.org Cree",
  crh=>"http://crh.wikipedia.org Crimean Tatar",
  cs=>"http://cs.wikipedia.org Czech",
  csb=>"http://csb.wikipedia.org Cashubian", # was Kashubian
  cu=>"http://cu.wikipedia.org Old Church Slavonic", # url pointed to cv.wikipedia.org
  cv=>"http://cv.wikipedia.org Chuvash", # was Cavas
  cy=>"http://cy.wikipedia.org Welsh",
  da=>"http://da.wikipedia.org Danish",
  de=>"http://de.wikipedia.org German",
  diq=>"http://diq.wikipedia.org Zazaki",
  dk=>"http://dk.wikipedia.org Danish",
  dsb=>"http://dsb.wikipedia.org Lower Sorbian",
  dv=>"http://dv.wikipedia.org Divehi",
  dz=>"http://dz.wikipedia.org Dzongkha",
  ee=>"http://ee.wikipedia.org Ewe",
  el=>"http://el.wikipedia.org Greek",
  eml=>"http://eml.wikipedia.org Emilian-Romagnol",
  en=>"http://en.wikipedia.org English",
  eo=>"http://eo.wikipedia.org Esperanto",
  es=>"http://es.wikipedia.org Spanish",
  et=>"http://et.wikipedia.org Estonian",
  eu=>"http://eu.wikipedia.org Basque",
  ext=>"http://ext.wikipedia.org Extremaduran",
  fa=>"http://fa.wikipedia.org Persian",
  ff=>"http://ff.wikipedia.org Fulfulde",
  fi=>"http://fi.wikipedia.org Finnish",
  "fiu-vro"=>"http://fiu-vro.wikipedia.org Voro",
  fiu_vro=>"http://fiu-vro.wikipedia.org Voro",
  fj=>"http://fj.wikipedia.org Fijian",
  fo=>"http://fo.wikipedia.org Faroese", # was Faeroese
  fr=>"http://fr.wikipedia.org French",
  frp=>"http://frp.wikipedia.org Arpitan",
  fur=>"http://fur.wikipedia.org Friulian",
  fy=>"http://fy.wikipedia.org Frisian",
  ga=>"http://ga.wikipedia.org Irish",
  gan=>"http://gan.wikipedia.org Gan",
  gay=>"http://gay.wikipedia.org Gayo",
  gd=>"http://gd.wikipedia.org Scots Gaelic", # was Scottish Gaelic
  gl=>"http://gl.wikipedia.org Galician", # was Galego
  glk=>"http://glk.wikipedia.org Gilaki",
  gn=>"http://gn.wikipedia.org Guarani",
  got=>"http://got.wikipedia.org Gothic",
  gu=>"http://gu.wikipedia.org Gujarati",
  gv=>"http://gv.wikipedia.org Manx", # was Manx Gaelic
  ha=>"http://ha.wikipedia.org Hausa",
  hak=>"http://hak.wikipedia.org Hakka",
  haw=>"http://haw.wikipedia.org Hawai'ian", # was Hawaiian
  he=>"http://he.wikipedia.org Hebrew",
  hi=>"http://hi.wikipedia.org Hindi",
  hif=>"http://hif.wikipedia.org Fiji Hindi",
  ho=>"http://ho.wikipedia.org Hiri Motu",
  hr=>"http://hr.wikipedia.org Croatian",
  hsb=>"http://hsb.wikipedia.org Upper Sorbian",
  ht=>"http://ht.wikipedia.org Haitian",
  hu=>"http://hu.wikipedia.org Hungarian",
  hy=>"http://hy.wikipedia.org Armenian",
  hz=>"http://hz.wikipedia.org Herero",
  ia=>"http://ia.wikipedia.org Interlingua",
  iba=>"http://iba.wikipedia.org Iban",
  id=>"http://id.wikipedia.org Indonesian",
  ie=>"http://ie.wikipedia.org Interlingue",
  ig=>"http://ig.wikipedia.org Igbo",
  ii=>"http://ii.wikipedia.org Yi",
  ik=>"http://ik.wikipedia.org Inupiak",
  ilo=>"http://ilo.wikipedia.org Ilokano",
  io=>"http://io.wikipedia.org Ido",
  is=>"http://is.wikipedia.org Icelandic",
  it=>"http://it.wikipedia.org Italian",
  iu=>"http://iu.wikipedia.org Inuktitut",
  ja=>"http://ja.wikipedia.org Japanese",
  jbo=>"http://jbo.wikipedia.org Lojban",
  jv=>"http://jv.wikipedia.org Javanese",
  ka=>"http://ka.wikipedia.org Georgian",
  kaa=>"http://kaa.wikipedia.org Karakalpak",
  kab=>"http://kab.wikipedia.org Kabyle", # url pointed to ka.wikipedia.org
  kaw=>"http://kaw.wikipedia.org Kawi",
  kg=>"http://kg.wikipedia.org Kongo",
  ki=>"http://ki.wikipedia.org Kikuyu",
  kj=>"http://kj.wikipedia.org Kuanyama", # was Otjiwambo
  kk=>"http://kk.wikipedia.org Kazakh",
  kl=>"http://kl.wikipedia.org Greenlandic",
  km=>"http://km.wikipedia.org Khmer", # was Cambodian
  kn=>"http://kn.wikipedia.org Kannada",
  ko=>"http://ko.wikipedia.org Korean",
  kr=>"http://kr.wikipedia.org Kanuri",
  ks=>"http://ks.wikipedia.org Kashmiri",
  ksh=>"http://ksh.wikipedia.org Ripuarian",
  ku=>"http://ku.wikipedia.org Kurdish",
  kv=>"http://kv.wikipedia.org Komi",
  kw=>"http://kw.wikipedia.org Cornish", # was Kornish
  ky=>"http://ky.wikipedia.org Kirghiz",
  la=>"http://la.wikipedia.org Latin",
  lad=>"http://lad.wikipedia.org Ladino",
  lb=>"http://lb.wikipedia.org Luxembourgish", # was Letzeburgesch
  lbe=>"http://lbe.wikipedia.org Lak",
  lg=>"http://lg.wikipedia.org Ganda",
  li=>"http://li.wikipedia.org Limburgish",
  lij=>"http://lij.wikipedia.org Ligurian",
  lmo=>"http://lmo.wikipedia.org Lombard",
  ln=>"http://ln.wikipedia.org Lingala",
  lo=>"http://lo.wikipedia.org Laotian",
  ls=>"http://ls.wikipedia.org Latino Sine Flexione",
  lt=>"http://lt.wikipedia.org Lithuanian",
  lv=>"http://lv.wikipedia.org Latvian",
  mad=>"http://mad.wikipedia.org Madurese",
  mak=>"http://mak.wikipedia.org Makasar",
  map_bms=>"http://map-bms.wikipedia.org Banyumasan",
  "map-bms"=>"http://map-bms.wikipedia.org Banyumasan",
  mdf=>"http://mdf.wikipedia.org Moksha",
  mg=>"http://mg.wikipedia.org Malagasy",
  mh=>"http://mh.wikipedia.org Marshallese",
  mhr=>"http://mhr.wikipedia.org Eastern Mari",
  mi=>"http://mi.wikipedia.org Maori",
  min=>"http://min.wikipedia.org Minangkabau",
  minnan=>"http://minnan.wikipedia.org Minnan",
  mk=>"http://mk.wikipedia.org Macedonian",
  ml=>"http://ml.wikipedia.org Malayalam",
  mn=>"http://mn.wikipedia.org Mongolian",
  mo=>"http://mo.wikipedia.org Moldavian",
  mr=>"http://mr.wikipedia.org Marathi",
  ms=>"http://ms.wikipedia.org Malay",
  mt=>"http://mt.wikipedia.org Maltese",
  mus=>"http://mus.wikipedia.org Muskogee",
  mwl=>"http://mwl.wikipedia.org Mirandese",
  my=>"http://my.wikipedia.org Burmese",
  myv=>"http://myv.wikipedia.org Erzya",
  mzn=>"http://mzn.wikipedia.org Mazandarani",
  na=>"http://na.wikipedia.org Nauruan", # was Nauru
  nah=>"http://nah.wikipedia.org Nahuatl",
  nap=>"http://nap.wikipedia.org Neapolitan",
  nds=>"http://nds.wikipedia.org Low Saxon",
  nds_nl=>"http://nds-nl.wikipedia.org Dutch Low Saxon",
  "nds-nl"=>"http://nds-nl.wikipedia.org Dutch Low Saxon",
  ne=>"http://ne.wikipedia.org Nepali",
  new=>"http://new.wikipedia.org Nepal Bhasa",
  ng=>"http://ng.wikipedia.org Ndonga",
  nl=>"http://nl.wikipedia.org Dutch",
  nov=>"http://nov.wikipedia.org Novial",
  nrm=>"http://nrm.wikipedia.org Norman",
  nn=>"http://nn.wikipedia.org Nynorsk", # was Neo-Norwegian
  no=>"http://no.wikipedia.org Norwegian",
  nv=>"http://nv.wikipedia.org Navajo", # was Avayo
  ny=>"http://ny.wikipedia.org Chichewa",
  oc=>"http://oc.wikipedia.org Occitan",
  om=>"http://om.wikipedia.org Oromo",
  or=>"http://or.wikipedia.org Oriya",
  os=>"http://os.wikipedia.org Ossetic",
  pa=>"http://pa.wikipedia.org Punjabi",
  pag=>"http://pag.wikipedia.org Pangasinan",
  pam=>"http://pam.wikipedia.org Kapampangan",
  pap=>"http://pap.wikipedia.org Papiamentu",
  pdc=>"http://pdc.wikipedia.org Pennsylvania German",
  pi=>"http://pi.wikipedia.org Pali",
  pih=>"http://pih.wikipedia.org Norfolk",
  pl=>"http://pl.wikipedia.org Polish",
  pms=>"http://pms.wikipedia.org Piedmontese",
  pnb=>"http://pnb.wikipedia.org Western Panjabi",
  pnt=>"http://pnt.wikipedia.org Pontic",
  ps=>"http://ps.wikipedia.org Pashto",
  pt=>"http://pt.wikipedia.org Portuguese",
  qu=>"http://qu.wikipedia.org Quechua",
  rm=>"http://rm.wikipedia.org Romansh", # was Rhaeto-Romance
  rmy=>"http://rmy.wikipedia.org Romani",
  rn=>"http://rn.wikipedia.org Kirundi",
  ro=>"http://ro.wikipedia.org Romanian",
  roa_rup=>"http://roa-rup.wikipedia.org Aromanian",
  "roa-rup"=>"http://roa-rup.wikipedia.org Aromanian",
  roa_tara=>"http://roa-tara.wikipedia.org Tarantino",
  "roa-tara"=>"http://roa-tara.wikipedia.org Tarantino",
  ru=>"http://ru.wikipedia.org Russian",
  ru_sib=>"http://ru-sib.wikipedia.org Siberian",
  "ru-sib"=>"http://ru-sib.wikipedia.org Siberian",
  rw=>"http://rw.wikipedia.org Kinyarwanda",
  sa=>"http://sa.wikipedia.org Sanskrit",
  sah=>"http://sah.wikipedia.org Sakha",
  sc=>"http://sc.wikipedia.org Sardinian",
  scn=>"http://scn.wikipedia.org Sicilian",
  sco=>"http://sco.wikipedia.org Scots",
  sd=>"http://sd.wikipedia.org Sindhi",
  se=>"http://se.wikipedia.org Northern Sami",
  sg=>"http://sg.wikipedia.org Sangro",
  sh=>"http://sh.wikipedia.org Serbo-Croatian",
  si=>"http://si.wikipedia.org Sinhala", # was Singhalese
  simple=>"http://simple.wikipedia.org Simple English",
  sk=>"http://sk.wikipedia.org Slovak",
  sl=>"http://sl.wikipedia.org Slovene",
  sm=>"http://sm.wikipedia.org Samoan",
  sn=>"http://sn.wikipedia.org Shona",
  so=>"http://so.wikipedia.org Somali", # was Somalian
  sq=>"http://sq.wikipedia.org Albanian",
  sr=>"http://sr.wikipedia.org Serbian",
  srn=>"http://srn.wikipedia.org Sranan",
  ss=>"http://ss.wikipedia.org Siswati",
  st=>"http://st.wikipedia.org Sesotho",
  stq=>"http://stq.wikipedia.org Saterland Frisian",
  su=>"http://su.wikipedia.org Sundanese",
  sv=>"http://sv.wikipedia.org Swedish",
  sw=>"http://sw.wikipedia.org Swahili",
  szl=>"http://szl.wikipedia.org Silesian",
  ta=>"http://ta.wikipedia.org Tamil",
  te=>"http://te.wikipedia.org Telugu",
  test=>"http://test.wikipedia.org Test",
  tet=>"http://tet.wikipedia.org Tetum",
  tg=>"http://tg.wikipedia.org Tajik",
  th=>"http://th.wikipedia.org Thai",
  ti=>"http://ti.wikipedia.org Tigrinya",
  tk=>"http://tk.wikipedia.org Turkmen",
  tl=>"http://tl.wikipedia.org Tagalog",
  tlh=>"http://tlh.wikipedia.org Klingon", # was Klignon
  tn=>"http://tn.wikipedia.org Setswana",
  to=>"http://to.wikipedia.org Tongan",
  tokipona=>"http://tokipona.wikipedia.org Tokipona",
  tpi=>"http://tpi.wikipedia.org Tok Pisin",
  tr=>"http://tr.wikipedia.org Turkish",
  ts=>"http://ts.wikipedia.org Tsonga",
  tt=>"http://tt.wikipedia.org Tatar",
  tum=>"http://tum.wikipedia.org Tumbuka",
  turn=>"http://turn.wikipedia.org Turnbuka",
  tw=>"http://tw.wikipedia.org Twi",
  ty=>"http://ty.wikipedia.org Tahitian",
  udm=>"http://udm.wikipedia.org Udmurt",
  ug=>"http://ug.wikipedia.org Uighur",
  uk=>"http://uk.wikipedia.org Ukrainian",
  ur=>"http://ur.wikipedia.org Urdu",
  uz=>"http://uz.wikipedia.org Uzbek",
  ve=>"http://ve.wikipedia.org Venda", # was Lushaka
  vec=>"http://vec.wikipedia.org Venetian",
  vi=>"http://vi.wikipedia.org Vietnamese",
  vls=>"http://vls.wikipedia.org West Flemish",
  vo=>"http://vo.wikipedia.org Volap&uuml;k",
  wa=>"http://wa.wikipedia.org Walloon",
  war=>"http://war.wikipedia.org Waray-Waray",
  wo=>"http://wo.wikipedia.org Wolof",
  wuu=>"http://wuu.wikipedia.org Wu",
  xal=>"http://xal.wikipedia.org Kalmyk",
  xh=>"http://xh.wikipedia.org Xhosa",
  yi=>"http://yi.wikipedia.org Yiddish",
  yo=>"http://yo.wikipedia.org Yoruba",
  za=>"http://za.wikipedia.org Zhuang",
  zea=>"http://zea.wikipedia.org Zealandic",
  zh=>"http://zh.wikipedia.org Chinese",
  zh_min_nan=>"http://zh-min-nan.wikipedia.org Min Nan",
  "zh-min-nan"=>"http://zh-min-nan.wikipedia.org Min Nan",
  zh_classical=>"http://zh-classical.wikipedia.org Classical Chinese",
  "zh-classical"=>"http://zh-classical.wikipedia.org Classical Chinese",
  zh_yue=>"http://zh-yue.wikipedia.org Cantonese",
  "zh-yue"=>"http://zh-yue.wikipedia.org Cantonese",
  zu=>"http://zu.wikipedia.org Zulu",
  zz=>"&nbsp; All&nbsp;languages",
  zzz=>"&nbsp; All&nbsp;languages except English"
  );

  foreach my $key (keys %wikipedias)
  {
    # every value is "<url> <name>": the url runs up to the first white space
    my ($url, $name) = $wikipedias {$key} =~ /^(\S+)\s+(.*)$/ ;

    $out_urls      {$key} = $url ;
    $out_languages {$key} = $name ;

    # article title on the English Wikipedia, with spaces replaced by underscores
    ($out_article {$key} = "http://en.wikipedia.org/wiki/" . $name . "_language") =~ s/ /_/g ;
  }
  $out_languages {"www"} = "Portal" ;
}
 5787+
 5788+
# Format a fraction (1 = 100%) as percentage text for html output.
#
# Returns '100%' for exactly 100%, '&nbsp;' for zero, '0.00001%' for anything below that
# value, and otherwise the percentage followed by '%', with more decimals the smaller the
# value (one decimal from 0.1% upwards, up to five decimals below 0.0001%).
sub Percentage
{
  my $perc = shift ;
  $perc = 100 * $perc ;

  # one single if/elsif chain: with a separate 'if' for the 100% case the text '100%'
  # fell through to the last branch and came out as '100.0%'
  if    ($perc == 100)     { $perc = '100%' ; }
  elsif ($perc == 0)       { $perc = '&nbsp;' ; }
  elsif ($perc < 0.00001)  { $perc = '0.00001%' ; }
  elsif ($perc < 0.0001)   { $perc = sprintf ("%.5f%%", $perc) ; } # '%%' is a literal percent sign
  elsif ($perc < 0.001)    { $perc = sprintf ("%.4f%%", $perc) ; }
  elsif ($perc < 0.01)     { $perc = sprintf ("%.3f%%", $perc) ; }
  elsif ($perc < 0.1)      { $perc = sprintf ("%.2f%%", $perc) ; }
  else                     { $perc = sprintf ("%.1f%%", $perc) ; }
  return ($perc) ;
}
 5803+
# Collect country meta data from two English Wikipedia pages and write it to
# "$path_out/SquidReportCountryMetaInfo.csv".
#
# From 'List of countries by population' each table row with a flag icon yields the country
# name, a link to its article, its population and the flag image. From 'List of countries by
# number of Internet users' the number of connected people is added. Result per country, kept
# in the global %countries: "country,link,population,connected,icon\n". Commas inside
# fields are written as '&comma;', commas inside numbers as '_'.
# Dies when a page can not be fetched or the csv file can not be written.
sub ReadWikipedia
{
  use LWP::Simple qw($ua get);

  $ua->agent('Wikipedia Wikicounts job');
  $ua->timeout(60);
  my $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_population';
  # check the result explicitly: 'get $url || die ...' applies '||' to $url and never dies
  my $html = get ($url) ;
  die "Timed out!" if ! defined $html ;

# open TEST, '<', 'List_of_countries_by_population.html' ;
# @lines = <TEST> ;
# $html = join "\n", @lines ;
# close TEST ;

  # split file on <tr>'s, remove all behind </tr>
  $html =~ s/\n/\\n/gs ; # newlines become a literal '\n', so each row is one line
  foreach $line (split "(?=<tr)", $html)
  {
    next if $line !~ /^<tr/ ;
    next if $line !~ /class=\"flagicon\"/ ;

    $line =~ s/(?<=<\/tr>).*$// ;

    @cells = split "(?=<td)", $line ;

    if ($cells [2] =~ /<img /)
    {
      $icon = $cells [2] ;
      $icon =~ s/^.*?(<img[^>]*>).*$/$1/ ;
      $icon =~ s/class=\"[^\"]*\"// ;
      $icon =~ s/\s*\/>/>/ ;
    }
    else
    { $icon = "n.a." ; }

    if ($cells [2] =~ /title/)
    {
      $country = $cells [2] ;
      $country =~ s/^.*?<a [^>]*>([^<]*)<.*$/$1/ ;
    }
    else
    { $country = "n.a." ; } # was assigned to $title, which left the country of the previous row in place

    if ($cells [2] =~ /<a /)
    {
      $link = $cells [2] ;
      $link =~ s/^.*?(<a [^>]*>.*?<\/a>).*$/$1/ ;
      $link =~ s/\/wiki/http:\/\/en.wikipedia.org\/wiki/ ;
    }
    else
    { $link = "n.a." ; } # was assigned to $title, which left the link of the previous row in place

    ($population = $cells [3]) =~ s/<td[^>]*>(.*?)<.*$/$1/ ;
    $population =~ s/,/_/g ;

    $country =~ s/,/&comma;/g ;
    $link    =~ s/,/&comma;/g ;
    $icon    =~ s/,/&comma;/g ;

    # 'connected' is a placeholder, filled in from the second page below
    $countries {$country} = "$country,$link,$population,connected,$icon\n" ;
  }

  $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users';
  $html = get ($url) ;
  die "Timed out!" if ! defined $html ;

  # split file on <tr>'s, remove all behind </tr>
  $html =~ s/\n/\\n/gs ;
  foreach $line (split "(?=<tr)", $html)
  {
    next if $line !~ /^<tr/ ;
    next if $line !~ /class=\"flagicon\"/ ;

    $line =~ s/(?<=<\/tr>).*$// ;

    @cells = split "(?=<td)", $line ;

    if ($cells [2] =~ /title/)
    {
      $country = $cells [2] ;
      $country =~ s/^.*?<a [^>]*>([^<]*)<.*$/$1/ ;
    }
    else
    { $country = "n.a." ; }

    ($connected = $cells [3]) =~ s/<td[^>]*>(.*?)<.*$/$1/ ;
    $connected =~ s/,/_/g ;

    # make country names agree with the names on the population page
    $country =~ s/,/&comma;/g ;
    $country =~ s/Bosnia-Herzegovina/Bosnia and Herzegovina/ ;
    # \xF4 = o with circumflex; the character was garbled in the source text
    # NOTE(review): assumes the fetched page text is decoded (or Latin-1) - confirm
    $country =~ s/Cote d'Ivoire/C\xF4te d'Ivoire/ ;
    $country =~ s/Macao/Macau/ ; # will be changed back later
    # NOTE(review): this also turns 'American Samoa' into 'American American Samoa' - confirm intent
    $country =~ s/Samoa/American Samoa/ ;
    $country =~ s/Timor Leste/Timor-Leste/ ;
    $country =~ s/UAE/United Arab Emirates/ ;

    # only complete countries known from the population page, do not create empty entries
    if (exists $countries {$country})
    { $countries {$country} =~ s/connected/$connected/ ; }
  }

  open (COUNTRY_META_INFO, '>', "$path_out/SquidReportCountryMetaInfo.csv")
    or die "Could not write '$path_out/SquidReportCountryMetaInfo.csv': $!\n" ;
  foreach $country (sort keys %countries)
  { print COUNTRY_META_INFO $countries {$country} ; }
  close COUNTRY_META_INFO ;
}
 5917+
# Look up the display name for a language code.
# Returns (name, anchor): the name from %out_languages, or "<code> (?)" when the code is
# unknown, and the same text with spaces replaced by underscores for use as html anchor.
sub GetLanguageInfo
{
  my $language = shift ;

  my $language_name = ($out_languages {$language} ne "")
                    ? $out_languages {$language}
                    : "$language (?)" ;

  my $anchor_language = $language_name ;
  $anchor_language =~ tr/ /_/ ;

  return ($language_name,$anchor_language) ;
}
 5928+
# Return the meta info for a country as (link, icon, population, connected).
#
# The info comes from %country_meta_info, where each value holds the comma separated fields
# link, population, connected, icon ('_' inside numbers, '&comma;' for commas inside text).
# For a country without meta info the result is (country name, '', '..', '..') and the
# omission is reported once per country.
sub CountryMetaInfo
{
  my $country = shift ;
print "Country '$country'\n" ; # qqq

  if ($country_meta_info {$country} eq "")
  {
    if ($country_meta_info_not_found_reported {$country} ++ == 0)
    { print "_Meta info not found for country '$country'\n" ; }
    return ($country,'','..','..') ;
  }

  # all four fields are lexical: $connected used to overwrite the package variable of that name
  my ($link_country,$population,$connected,$icon) = split ',', $country_meta_info {$country} ;
  $population   =~ s/_//g ;
  $connected    =~ s/_//g ;
  $link_country =~ s/&comma;/,/g ;
  $icon         =~ s/&comma;/,/g ;
  $icon         =~ s/>/ border=1>/ ; # show the flag with a border
  return ($link_country,$icon,$population,$connected) ;
}
 5952+
# Abbreviate a number for display: thousands as 'K', millions as 'M'.
#   0                 -> '&nbsp;'
#   below 1000        -> unchanged
#   1000 - 9999       -> one decimal,  e.g. '1.5&nbsp;K'
#   10000 - 999999    -> no decimals,  e.g. '25&nbsp;K'
#   1 - 99.9 million  -> one decimal,  e.g. '2.5&nbsp;M'
#   from 100 million  -> no decimals, with thousands separator, e.g. '1,234&nbsp;M'
# Side effect: sets the globals $out_million and $out_thousand.
sub i2KM
{
  $out_million  = 'M' ;
  $out_thousand = 'K' ;

  my $v = shift ;

  return ("&nbsp;") if $v == 0 ;
  return ($v)       if $v < 1000 ;

  my $huge = ($v >= 100000000) ;

  my ($format, $divisor, $unit) ;
  if    ($huge)          { ($format, $divisor, $unit) = ("%.0f", 1000000, $out_million)  ; }
  elsif ($v >= 1000000)  { ($format, $divisor, $unit) = ("%.1f", 1000000, $out_million)  ; }
  elsif ($v >= 10000)    { ($format, $divisor, $unit) = ("%.0f", 1000,    $out_thousand) ; }
  else                   { ($format, $divisor, $unit) = ("%.1f", 1000,    $out_thousand) ; }

  my $text = sprintf ($format, ($v / $divisor)) . "&nbsp;" . $unit ;

  # thousands separator, only needed for the largest class
  $text =~ s/(\d+?)(\d\d\d[^\d])/$1,$2/ if $huge ;

  return ($text) ;
}
 5975+
# Abbreviate a whole number for display, coarser than i2KM:
#   not all digits     -> returned unchanged
#   0 or empty         -> '&nbsp;'
#   below 1000         -> unchanged
#   1000 - 999999      -> whole thousands,      e.g. '25&nbsp;K'
#   1 - 9.9 million    -> millions, one decimal, e.g. '2.5&nbsp;M'
#   from 10 million    -> whole millions,        e.g. '25&nbsp;M'
# Side effect: sets the globals $out_million and $out_thousand.
sub i2KM2
{
  $out_million  = 'M' ;
  $out_thousand = 'K' ;

  my $v = shift ;
  return $v unless $v =~ /^\d*$/ ;

# return (sprintf ("%.1f",$v/1000000)) ;
  return ("&nbsp;") if $v == 0 ;
  return ($v)       if $v < 1000 ;

  if ($v < 1000000)
  { return (sprintf ("%.0f",($v / 1000)) . "&nbsp;" . $out_thousand) ; }

  my $format = ($v < 10000000) ? "%.1f" : "%.0f" ;
  return (sprintf ($format,($v / 1000000)) . "&nbsp;" . $out_million) ;
}
 5995+
 5996+# format: function(s) { return $.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/\\&nbsp\\;/g,"").replace(/M/i,"000000").replace(/&#1052;/,"000000").replace(/K/i,"000").replace(/&#1050;/i,"000")); },
 5997+
# Turn one link in a piece of html with several links into plain text.
#   $links - html text containing '<a ...>...</a>' elements
#   $index - zero-based number of the link to strip
# The text is cut in segments that each start at '<a '; in segment $index everything up to
# and including the link is replaced by the link text. Returns the reassembled html.
sub UnLink
{
  my ($links,$index) = @_ ;

  my @anchors = split /(?=<a )/, $links ;

  my $target = \$anchors [$index] ;
  $$target =~ s/^.*?<a .*?>([^<]*)<\/a>/$1/ ;

  return (join ('', @anchors)) ;
}
 6009+
# Write a finished html page to file.
#   $html - page content; fragments between square brackets are verbose-only
#   $path - output file
# With $verbose false (as set here) every '[...]' fragment is removed, otherwise only the
# brackets are removed. The first occurrence of 'and images' is removed as well.
# Side effects: assigns the globals $html, $path and $verbose.
# When the file can not be opened a message is printed on STDERR and nothing is written.
sub PrintHtml
{
  ($html, $path) = @_ ;

  $verbose = $false ;
  if ($verbose)
  { $html =~ s/\[([^\]]*)\]/$1/g ; }
  else
  { $html =~ s/\[([^\]]*)\]//g ; }

  $html =~ s/and images// ; # all data [and images] onthis page are in the public domain

  # report a failed open instead of silently printing to an unopened handle
  my $file_out ;
  if (! open ($file_out, '>', $path))
  {
    print STDERR "PrintHtml: could not open '$path' for writing: $!\n" ;
    return ;
  }
  print $file_out $html ;
  close $file_out ;
}
 6025+
 6026+sub PrintCsv
 6027+{
 6028+ ($csv, $path) = @_ ;
 6029+
 6030+ open HTML_CSV, '>', $path ;
 6031+ print HTML_CSV $csv ;
 6032+ close HTML_CSV ;
 6033+}
 6034+
 6035+sub HtmlSortTable
 6036+{
 6037+ my $html = <<__HTML_SORT_TABLE__ ;
 6038+
 6039+<script src="jquery-1.3.2.min.js" type="text/javascript"></script>
 6040+<script src="jquery.tablesorter.js" type="text/javascript"></script>
 6041+
 6042+<script type="text/javascript">
 6043+\$.tablesorter.addParser({
 6044+ id: "nohtml",
 6045+ is: function(s) { return false; },
 6046+ format: function(s) { return s.replace(/<.*?>/g,"").replace(/&nbsp;/g,""); },
 6047+ type: "text"
 6048+});
 6049+
 6050+\$.tablesorter.addParser({
 6051+ id: "millions",
 6052+ is: function(s) { return false; },
 6053+ format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,"").replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
 6054+ type: "numeric"
 6055+});
 6056+
 6057+
 6058+\$.tablesorter.addParser({
 6059+ id: "digitsonly",
 6060+ is: function(s) { return false; },
 6061+ format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<.*?>/g,"").replace(/&nbsp;/g,"").replace(/,/g,"").replace(/-/,"-1")); },
 6062+ type: "numeric"
 6063+});
 6064+</script>
 6065+
 6066+<style type="text/css">
 6067+table.tablesorter
 6068+{
 6069+/*
 6070+ font-family:arial;
 6071+ background-color: #CDCDCD;
 6072+ margin:10px 0pt 15px;
 6073+ font-size: 7pt;
 6074+ width: 80%;
 6075+ text-align: left;
 6076+*/
 6077+}
 6078+table.tablesorter thead tr th, table.tablesorter tfoot tr th
 6079+{
 6080+/*
 6081+ background-color: #99D;
 6082+ border: 1px solid #FFF;
 6083+ font-size: 8pt;
 6084+ padding: 4px;
 6085+*/
 6086+}
 6087+table.tablesorter thead tr .header
 6088+{
 6089+ background-color: #ffffdd;
 6090+ background-image: url(bg.gif);
 6091+ background-repeat: no-repeat;
 6092+ background-position: center right;
 6093+ cursor: pointer;
 6094+}
 6095+table.tablesorter tbody th
 6096+{
 6097+/*
 6098+ color: #3D3D3D;
 6099+ padding: 4px;
 6100+ background-color: #CCF;
 6101+ vertical-align: top;
 6102+*/
 6103+}
 6104+table.tablesorter tbody tr.odd th
 6105+{
 6106+ background-color:#eeeeaa;
 6107+ background-image:url(asc.gif);
 6108+}
 6109+table.tablesorter thead tr .headerSortUp
 6110+{
 6111+ background-color:#eeeeaa;
 6112+ background-image:url(asc.gif);
 6113+}
 6114+table.tablesorter thead tr .headerSortDown
 6115+{
 6116+ background-color:#eeeeaa;
 6117+ background-image:url(desc.gif);
 6118+}
 6119+table.tablesorter thead tr .headerSorthown, table.tablesorter thead tr .headerSortUp
 6120+{
 6121+ background-color: #eeeeaa;
 6122+}
 6123+</style>
 6124+__HTML_SORT_TABLE__
 6125+return ($html) ;
 6126+}
 6127+
 6128+sub HtmlSortTableColumns
 6129+{
 6130+ my $html = <<__HTML_SORT_TABLE_COLUMNS__ ;
 6131+
 6132+<script type='text/javascript'>
 6133+\$('#table1').tablesorter({
 6134+ // debug:true,
 6135+ headers:{0:{sorter:'nohtml'},1:{sorter:'nohtml'},2:{sorter:'nohtml'},3:{sorter:'millions'},4:{sorter:'millions'},5:{sorter:'millions'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'}}
 6136+});
 6137+</script>
 6138+__HTML_SORT_TABLE_COLUMNS__
 6139+return ($html) ;
 6140+}
 6141+
 6142+sub HtmlIndex
 6143+{
 6144+ $index = shift ;
 6145+
 6146+ my $html = <<__HTML_INDEX__ ;
 6147+
 6148+<script type="text/javascript">
 6149+<!--
 6150+function toggle_visibility_index()
 6151+{
 6152+ var index = document.getElementById('index');
 6153+ var toggle = document.getElementById('toggle');
 6154+ if (index.style.display == 'block')
 6155+ {
 6156+ index.style.display = 'none';
 6157+ toggle.innerHTML = 'Show index';
 6158+ }
 6159+ else
 6160+ {
 6161+ index.style.display = 'block';
 6162+ toggle.innerHTML = 'Hide index';
 6163+ }
 6164+}
 6165+//-->
 6166+</script>
 6167+
 6168+<tr><td class=r colspan=99><a href="#" id='toggle' onclick="toggle_visibility_index();">Show index</a></td></tr>
 6169+<tr><td class=l colspan=99><span id='index' style="display:none">\n$index\n</span></td></tr>
 6170+__HTML_INDEX__
 6171+
 6172+return ($html) ;
 6173+}
 6174+
 6175+sub hsv_to_rgb {
 6176+
 6177+ my $h = shift;
 6178+ my $s = shift;
 6179+ my $v = shift;
 6180+
 6181+ # limit this to h values between 0 and 360 and s/v values
 6182+ # between 0 and 1
 6183+
 6184+ unless (defined($h) && defined($s) && defined($v) &&
 6185+ $h >= 0 && $s >= 0 && $v >= 0 &&
 6186+ $h <= 360 && $s <= 1 && $v <= 1) {
 6187+ return (undef, undef, undef);
 6188+ }
 6189+
 6190+ my $r;
 6191+ my $g;
 6192+ my $b;
 6193+
 6194+ # 0.003 is less than 1/255; use this to make the floating point
 6195+ # approximation of zero, since the resulting rgb values will
 6196+ # normally be used as integers between 0 and 255. Feel free to
 6197+ # change this approximation of zero to something else, if this
 6198+ # suits you.
 6199+
 6200+ if ($s < 0.003) {
 6201+ $r = $g = $b = $v;
 6202+ }
 6203+ else {
 6204+
 6205+ $h /= 60;
 6206+ my $sector = int($h);
 6207+ my $fraction = $h - $sector;
 6208+
 6209+ my $p = $v * (1 - $s);
 6210+ my $q = $v * (1 - ($s * $fraction));
 6211+ my $t = $v * (1 - ($s * (1 - $fraction)));
 6212+
 6213+ if ($sector == 0) { $r = $v; $g = $t; $b = $p; }
 6214+ elsif ($sector == 1) { $r = $q; $g = $v; $b = $p; }
 6215+ elsif ($sector == 2) { $r = $p; $g = $v; $b = $t; }
 6216+ elsif ($sector == 3) { $r = $p; $g = $q; $b = $v; }
 6217+ elsif ($sector == 4) { $r = $t; $g = $p; $b = $v; }
 6218+ else { $r = $v; $g = $p; $b = $q; }
 6219+ }
 6220+
 6221+ # Convert the r/g/b values to all be between 0 and 255; use the
 6222+ # ol' 0.003 approximation again, with the same comment as above.
 6223+
 6224+ $r = ($r < 0.003 ? 0.0 : $r * 255);
 6225+ $g = ($g < 0.003 ? 0.0 : $g * 255);
 6226+ $b = ($b < 0.003 ? 0.0 : $b * 255);
 6227+
 6228+ return ($r, $g, $b);
 6229+ }
 6230+
 6231+sub hsv2rgb
 6232+{
 6233+ my ($h,$s,$v) = @_;
 6234+ my ($p,$q) ;
 6235+ ($v,$p,$q) = hsv_to_rgb ($h,$s,$v) ;
 6236+ my $color = "\#" . sprintf ("%02X", int($v)) . sprintf ("%02X", int($p)) . sprintf ("%02X", int($q)) ;
 6237+ return ($color) ;
 6238+}
 6239+
 6240+sub HtmlWorldMaps
 6241+{
 6242+my $html_worldmaps = <<__HTML_WORLD_MAPS__ ;
 6243+<tr><td colspan=99 align=center>
 6244+<table width='100%' align=center><td align=left>
 6245+<small>
 6246+<img src='http://upload.wikimedia.org/wikipedia/commons/thumb/b/b1/World_population.PNG/400px-World_population.PNG' border='1'>
 6247+<br><a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>Countries by population</a> - English Wikipedia
 6248+</small>
 6249+</td><td>
 6250+<small>
 6251+<img src='http://upload.wikimedia.org/wikipedia/commons/thumb/a/af/Internet_Penetration.png/400px-Internet_Penetration.png' border='1'>
 6252+<br><a href='http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'>Internet penetration</a> (% of population) - English Wikipedia
 6253+</small>
 6254+</td></tr>
 6255+<tr><td>
 6256+<small>
 6257+<img src='http://upload.wikimedia.org/wikipedia/commons/thumb/4/46/North_South_divide.svg/400px-North_South_divide.svg.png' border='1'>
 6258+<br><a href='http://en.wikipedia.org/wiki/North-South_divide'>Global North South</a> - English Wikipedia
 6259+</small>
 6260+</td></tr>
 6261+</table>
 6262+</td></tr>
 6263+__HTML_WORLD_MAPS__
 6264+
 6265+return $html_worldmaps ;
 6266+}

Status & tagging log