Index: trunk/wikistats/squids/SquidCountryScan.sh |
— | — | @@ -0,0 +1,6 @@ |
#!/bin/bash
# Wrapper for SquidCountryScan.pl: caps the virtual memory of this shell
# (and therefore of the perl child) before starting the scan.

# bash counts 'ulimit -v' in units of 1024 bytes
ulimit -v 4000000

scan_script="./SquidCountryScan.pl"   # relative: run this wrapper from the script folder

# To start at a later year instead: perl "$scan_script" -y 2010
perl "$scan_script"                   # no -y given: start in July 2009
Property changes on: trunk/wikistats/squids/SquidCountryScan.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 8 | + native |
Index: trunk/wikistats/squids/SquidCountryScan.pl |
— | — | @@ -0,0 +1,488 @@ |
#!/usr/bin/perl
## Per country statistics (page views and saves), collected on Locke
## CollectRawData : reads the per day csv files, writes one monthly and one daily csv per mode
## ProcessRawData : not invoked at the moment (call is commented out below)

  use lib "/home/ezachte/lib" ;
  use EzLib ;
  $trace_on_exit = $true ;

  use Time::Local ;
  use Getopt::Std ;
  use Cwd;
  $timestart = time ;

  # -y nnnn = first year to process
  # a missing or malformed year, or a year before 2009, silently becomes 2009
  my %options ;
  getopt ("y", \%options) ;
  $process_year = $options {"y"} ;
  unless (($process_year =~ /^\d\d\d\d$/) && ($process_year >= 2009))
  {
    $process_year = 2009 ;
    # print "Specify year as '-y nnnn'\n\n" ;
    # exit ;
  }

  $path_root = "/a/ezachte/" ;
# $path_root = "w:/! perl/squids/archive/" ;

  # input: file names relative to each day folder (the '_old' name is tried first by CollectRawData)
  $file_per_country_visits     = "public/SquidDataCountriesViews.csv" ;
  $file_per_country_visits_old = "SquidDataCountries2.csv" ;
  $file_per_country_saves      = "public/SquidDataCountriesSaves.csv" ;
  $file_per_country_saves_old  = "SquidDataCountriesSaves.csv" ;

  # output: aggregated csv files in the root folder
  $file_raw_data_monthly_visits = $path_root . "/SquidDataVisitsPerCountryMonthly.csv" ;
  $file_raw_data_daily_visits   = $path_root . "/SquidDataVisitsPerCountryDaily.csv" ;
  $file_raw_data_monthly_saves  = $path_root . "/SquidDataSavesPerCountryMonthly.csv" ;
  $file_raw_data_daily_saves    = $path_root . "/SquidDataSavesPerCountryDaily.csv" ;

  # one collection pass per mode, visits first
  my @jobs = (
    ['visits', $file_per_country_visits, $file_per_country_visits_old, $file_raw_data_monthly_visits, $file_raw_data_daily_visits],
    ['saves',  $file_per_country_saves,  $file_per_country_saves_old,  $file_raw_data_monthly_saves,  $file_raw_data_daily_saves],
  ) ;
  foreach my $job (@jobs)
  { CollectRawData (@$job) ; }
# &ProcessRawData ;

  exit ;
| 44 | + |
# Aggregate the per day, per country csv files for one mode into a monthly and a daily csv file,
# and print some research breakdowns (per project, country, language, English Wikipedia).
#
# Parameters:
#   $mode                  - label used in progress output only ('visits' or 'saves')
#   $file_per_country      - input file name relative to a day folder (current layout)
#   $file_per_country_old  - input file name relative to a day folder (older layout, tried first)
#   $file_raw_data_monthly - output csv: "yyyy-mm,project,language,country,bot,count"
#   $file_raw_data_daily   - output csv: "yyyy-mm-dd,project,language,country,bot,count"
#
# Reads globals $process_year, $path_root and $true. Walks month folders "$path_root/yyyy-mm"
# from the start month until the first folder that does not exist.
# Monthly counts are scaled up when some days of a month have no input file; daily counts are not.
# Dies when an output file can not be opened.
sub CollectRawData
{
  my ($mode, $file_per_country, $file_per_country_old, $file_raw_data_monthly, $file_raw_data_daily) = @_ ;

  # every accumulator is lexical: this sub runs once per mode and totals of the
  # first run must not leak into the percentages of the second run
  my ($visits_total, $visits_wp_total, $visits_total_wp_en) = (0, 0, 0) ;
  my (%visits_monthly, %visits_daily, %visits_wp_yyyymm, %visits_per_project, %visits_per_language,
      %visits_per_country, %visits_wp_en, %visits_wp_b, %visits_wp_u, %correct_for_missing_days) ;

  # percentage of a total, 0 when there is no total (avoids division by zero)
  my $share = sub { my ($part, $total) = @_ ; return ($total ? 100 * $part / $total : 0) ; } ;

  print "Collect raw data for $mode\n\n" ;
  print "Input data per country $file_per_country, $file_per_country_old\n" ;
  print "Raw data monthly $file_raw_data_monthly\n" ;
  print "Raw data daily $file_raw_data_daily\n\n" ;

  my $year  = $process_year ;
  my $month = ($year == 2009) ? 7 : 1 ; # for 2009 start in July

  while ($true)
  {
    my $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
    my $dir    = "$path_root/$yyyymm" ;

    if (! -d $dir)
    {
      print "Folder $dir not found. Processing complete.\n" ;
      last ;
    }

    print "Dir: $dir\n" ;
    my $days_in_month = &DaysInMonth ($year,$month) ;
    my $days_found    = 0 ;

    for (my $day = 1 ; $day <= $days_in_month ; $day++)
    {
      if (($month == 4) && ($year == 2009) && ($day < 18)) { next ; }

      my $yyyymmdd = sprintf ("%04d-%02d-%02d", $year, $month, $day) ;

      # do not combine with SquidDataCountries.csv from earlier months
      # only from 2009-07 anonymous bots (hits > 1 in sampled log) were ignored
      my $file = "$dir/$yyyymmdd/$file_per_country_old" ;
      if (! -e $file)
      { $file = "$dir/$yyyymmdd/$file_per_country" ; }

      if (! -e $file)
      {
        print "Miss! $file\n" ;
        next ;
      }

      # an unreadable file is treated like a missing day, so the monthly correction accounts for it
      if (! open IN, '<', $file)
      {
        print "Miss! $file (can not open: $!)\n" ;
        next ;
      }
      $days_found++ ;

      while (my $line = <IN>)
      {
        if ($line =~ /^#/)    { next ; } # comment line
        if ($line =~ /^\s*$/) { next ; } # blank line would otherwise create an empty key

        chomp $line ;
        my ($bot,$wiki,$country,$count) = split (',', $line) ;

        $bot = ($bot =~ /Y/) ? 'B' : 'U' ; # B = bot, U = user

        my ($project,$language) = split (':', $wiki) ;
        $project =~ s/\s//g ;

        # if ($project ne "wp") { next ; }
        # if ($yyyymm ne "2009-11") { next ; }
        # if ($language eq "www") { next ; }

        $visits_monthly {"$yyyymm,$project,$language,$country,$bot"}   += $count ;
        $visits_daily   {"$yyyymmdd,$project,$language,$country,$bot"} += $count ;
        $visits_total += $count ;

        # following hashes for specific research, not for regular csv files
        if (($bot eq 'U') && ($country ne "--"))
        {
          $visits_per_project  {$project}  += $count ;
          $visits_per_language {$language} += $count ;
          $visits_per_country  {$country}  += $count ;

          if ($project eq "wp")
          {
            $visits_wp_yyyymm {$yyyymm} += $count ;
            $visits_wp_total += $count ;

            if ($language eq "en")
            {
              $visits_total_wp_en += $count ;
              $visits_wp_en {$country} += $count ;
            }
          }
        }

        if (($project eq "wp") && ($language =~ /^(?:th|sk)$/))
        {
          if ($bot eq 'U')
          { $visits_wp_u {"$language $yyyymm"} += $count ; }
          else
          { $visits_wp_b {"$language $yyyymm"} += $count ; }
        }
      }
      close IN ;
    }

    # scale monthly totals up when only part of the days of this month were found
    $correct_for_missing_days {$yyyymm} = 1 ;
    if (($days_found > 0) && ($days_in_month > $days_found))
    {
      my $factor = $days_in_month / $days_found ;
      $correct_for_missing_days {$yyyymm} = $factor ;
      print "Correct for $yyyymm: $days_found -> $days_in_month = * $factor\n" ;
    }

    $month++ ;
    if ($month > 12)
    {
      $month = 1 ;
      $year ++ ;
    }
  }

  print "\nVisits per project:\n" ;
  foreach my $key (sort {$visits_per_project {$b} <=> $visits_per_project {$a}} keys %visits_per_project)
  { printf "%9d %5.2f%% %s\n", $visits_per_project {$key}, $share->($visits_per_project {$key}, $visits_total), $key ; }

  print "\n\n" ;

  print "\nVisits per country:\n" ;
  foreach my $key (sort {$visits_per_country {$a} <=> $visits_per_country {$b}} keys %visits_per_country)
  { printf "%9d %6.3f%% %s\n", $visits_per_country {$key}, $share->($visits_per_country {$key}, $visits_total), $key ; }

  # keys here are "language yyyy-mm" (th and sk only); values are numbers, so sort numerically
  print "\nWikipedia visits per country:\n" ;
  foreach my $key (sort {$visits_wp_u {$b} <=> $visits_wp_u {$a}} keys %visits_wp_u)
  {
    my $users = $visits_wp_u {$key} ;
    my $bots  = $visits_wp_b {$key} || 0 ;
    printf "%9.1f - %9.1f - %9.1f %s\n", ($users + $bots) / 1000, $users / 1000, $bots / 1000, $key ; # / 1000 on 1:1000 sampled file is millions
  }

  print "\nVisits per language:\n" ;
  foreach my $key (sort {$visits_per_language {$a} <=> $visits_per_language {$b}} keys %visits_per_language)
  { printf "%9d %6.3f%% %s\n", $visits_per_language {$key}, $share->($visits_per_language {$key}, $visits_total), $key ; }

  print "\nVisits to English Wikipedia\n" ;
  foreach my $key (sort {$visits_wp_en {$a} <=> $visits_wp_en {$b}} keys %visits_wp_en)
  { printf "%9d %6.3f%% %s\n", $visits_wp_en {$key}, $share->($visits_wp_en {$key}, $visits_total_wp_en), $key ; }

  print "\n\n" ;

  print "\n\n" ;

  # monthly csv: counts are corrected for days without input
  open CSV_MONTHLY, '>', $file_raw_data_monthly or die "Could not open '$file_raw_data_monthly' for writing: $!\n" ;
  foreach my $key (sort keys %visits_monthly)
  {
    my ($key_yyyymm) = split (',', $key) ;
    my $correction   = $correct_for_missing_days {$key_yyyymm} ;
    my $count        = $visits_monthly {$key} ;
    if (($correction != 0) && ($correction != 1))
    { $count = sprintf ("%.0f", $count * $correction) ; }
    print CSV_MONTHLY "$key,$count\n" ;
  }
  close CSV_MONTHLY ;

  # note correct for missing days in follow processing, see monthly data above
  open CSV_DAILY, '>', $file_raw_data_daily or die "Could not open '$file_raw_data_daily' for writing: $!\n" ;
  foreach my $key (sort keys %visits_daily)
  { print CSV_DAILY "$key,$visits_daily{$key}\n" ; }
  close CSV_DAILY ;

  # share of each month in all Wikipedia user visits, after correction for missing days
  foreach my $yyyymm (sort keys %visits_wp_yyyymm)
  {
    my $total                 = $visits_wp_yyyymm {$yyyymm} ;
    my $correction            = $correct_for_missing_days {$yyyymm} ;
    my $total_corrected       = $total * $correction ;
    my $total_corrected_share = int ($share->($total_corrected, $visits_wp_total)) ;
    print "$yyyymm: $total * $correction = $total_corrected / $visits_wp_total = $total_corrected_share\%\n" ;
  }
}
| 246 | + |
# Derive edit/save (submit) statistics from a raw data csv file and write
#   - a per day, per destination table of edits, saves and edits/saves ratio
#   - a per day count file
#   - a plain text summary (also printed to stdout)
#
# Takes no parameters. File names come from the globals $file_raw_data (input),
# $file_csv_counts_daily_project, $file_csv_counts_daily and $file_txt_summary (output).
# None of these globals is assigned in this file and the only call to this sub is
# commented out, so the sub returns early when a name is missing.
# Dies when an output file can not be opened.
sub ProcessRawData
{
  print "\nProcessRawData\n\n" ;

  foreach my $file_name ($file_raw_data, $file_csv_counts_daily_project, $file_csv_counts_daily, $file_txt_summary)
  {
    if ((! defined $file_name) || ($file_name eq ''))
    {
      print "ProcessRawData: input/output file names are not set, nothing done\n" ;
      return ;
    }
  }

  # all tallies are lexical, so a second call starts from scratch
  my (%counts, %dates, %tos) ;
  my (%counts_per_action, %mime_not_text_html) ;
  my (%counts_per_relevant_action_and_status1, %counts_per_bot_relevant_action_and_status2) ;
  my (%counts_per_relevant_action_and_status_no_redlink, %counts_per_bot_relevant_action_and_status_no_redlink) ;
  my (%counts_no_bot_per_relevant_action_and_status_no_redlink, %counts_no_bot_no_redlink_per_destination) ;
  my (%counts_per_relevant_status_with_redlink, %counts_per_destination) ;
  my ($invalid_actions, $bots_edits, $bots_saves, $user_edits, $user_saves) = (0, 0, 0, 0, 0) ;

  if (! open IN, '<', $file_raw_data)
  {
    print "ProcessRawData: could not open '$file_raw_data': $!\n" ;
    return ;
  }
  open OUT, '>', $file_csv_counts_daily_project or die "Could not open '$file_csv_counts_daily_project' for writing: $!\n" ;

  while (my $line = <IN>)
  {
    chomp ($line) ;
    # ($date,$bot,$from,$to,$php,$status,$mime,$action,$agent,$count) = split (',', $line) ;
    my ($date,$bot,$from,$to,$status,$mime,$action,$count) = split (',', $line) ;

#   if ($to !~ /wk:lt/) { next ; }

    if ($bot =~ /^#/) { next ; } # fix, should be removed in CollectRawData

    # if ($php ne "php(index.php)") { $lines_unexpected_php {$php}++ ; next ; }

    # $action2 = action without further url parameters
    (my $action2 = $action) =~ s/\&.*$// ;
    $counts_per_action {$action2} += $count ;

    # NOTE(review): the source showed s/\&/&/g here, which does nothing; this looks like an
    # html entity decode that was mangled on its way into the repository viewer - confirm
    $action =~ s/\&amp;/&/g ;

    if ($action =~ /submitlogin/)
    { next ; }

    if (($action !~ /^action=edit\&/) && ($action !~ /^action=submit\&/))
    {
      $invalid_actions ++ ;
      next ;
    }

    if ($mime ne "text/html")
    {
      $mime_not_text_html {$mime} ++ ; # tallied, not reported
      next ;
    }

    # relevant = edit form served (200) or save accepted and redirected (302)
    if (! ((($action =~ /action=edit/)   && ($status =~ /200/)) ||
           (($action =~ /action=submit/) && ($status =~ /302/))))
    { next ; }

    $counts_per_relevant_action_and_status1 {$action2} += $count ;

    $counts_per_bot_relevant_action_and_status2 {"$bot,$action2,$status"} += $count ;

    if ($action !~ /redlink/)
    {
      $counts_per_relevant_action_and_status_no_redlink {"$action2,$status"} += $count ;

      $counts_per_bot_relevant_action_and_status_no_redlink {"$bot,$status,$action2"} += $count ;

      if ($bot =~ /N/)
      {
        # print "$to,$action2,$count\n" ;
        $counts_no_bot_per_relevant_action_and_status_no_redlink {"$to,$action2"} += $count ;
        $counts_no_bot_no_redlink_per_destination {$to} += $count ;
      }
    }

    if (($action =~ /redlink/) && ($status =~ /(?:200|302)/))
    {
      $counts_per_relevant_status_with_redlink {"$to,action=edit,redlink=..,$status"} += $count ;
      $counts_per_destination {$to} += $count ;
    }

    if ($action =~ /redlink/)
    { next ; }

    if (($to !~ /wp:(?:en|de|ja|es|fr|ru|zh)$/) && ($to !~ /wk:(?:lt)$/) && ($to !~ /wx:(?:mw)$/))
    { next ; }

    if ($bot !~ /N/)
    { next ; }

    $counts {"$date,$to,$action2"} += $count ;
    $dates {$date}++ ;
    $tos {$to}++ ;

    # NOTE(review): only records with N in the bot field get here, so the "bot=Y" branch
    # can never be taken; none of these four tallies is reported anywhere in this sub
    if ($bot eq "bot=Y")
    {
      if ($action =~ /action=edit/)
      { $bots_edits += $count ; }
      elsif ($action =~ /action=submit/)
      { $bots_saves += $count ; }
    }
    else
    {
      if ($action =~ /action=edit/)
      { $user_edits += $count ; }
      elsif ($action =~ /action=submit/)
      { $user_saves += $count ; }
    }
  }
  close IN ;

  # per day table: one row per date, three columns per destination
  print OUT "date," ;
  foreach my $to (sort keys %tos)
  { print OUT "edits $to,saves $to,ratio $to," ; }
  print OUT "\n" ;

  foreach my $date (sort keys %dates)
  {
    # print "DAY $date\n" ;
    # assumes $date is formatted yyyymmdd - TODO confirm against the producer of the raw data file
    my $csv_date = "\"=DATE(" . substr ($date,0,4) . "," . substr ($date,4,2) . "," . substr ($date,6,2) . ")\"" ;

    print OUT "$csv_date, " ;

    foreach my $to (sort keys %tos)
    {
      # print "TO $to\n" ;

      my $edits   = $counts {"$date,$to,action=edit"} ;
      my $submits = $counts {"$date,$to,action=submit"} ;
      my $ratio   = -1 ; # -1 = no saves on this day
      if ($submits > 0)
      { $ratio = sprintf ("%.1f", $edits/$submits) ; }
      print OUT "$edits,$submits,$ratio," ;
    }
    print OUT "\n" ;
  }
  close OUT ;

  # Write CSV_COUNT_DAILY

  open CSV_COUNT_DAILY, '>', $file_csv_counts_daily or die "Could not open '$file_csv_counts_daily' for writing: $!\n" ;
  foreach my $key (sort keys %counts)
  { print CSV_COUNT_DAILY sprintf ("%6d", $counts {$key}) . ",$key\n" ; }
  close CSV_COUNT_DAILY ;

  my $text = "" ;
  $text .= "\nInvalid actions: $invalid_actions\n\n" ;

  # append one "count,key" listing to the summary, skipping counts below $min_count
  my $add_section = sub
  {
    my ($title, $tally, $min_count) = @_ ;
    $text .= $title ;
    foreach my $key (sort keys %$tally)
    {
      my $count = $tally->{$key} ;
      if ($count < $min_count) { next ; }
      $text .= sprintf ("%6d", $count) . ",$key\n" ;
    }
    $text .= "\n\n" ;
  } ;

  $add_section->("Counts per action:\n",                                     \%counts_per_action,                                5) ;
  $add_section->("Counts per relevant action and status:\n",                 \%counts_per_relevant_action_and_status1,           0) ;
  $add_section->("Counts per bot, relevant action and status:\n",            \%counts_per_bot_relevant_action_and_status2,       0) ;
  $add_section->("Counts per relevant action and status and no redlinks:\n", \%counts_per_relevant_action_and_status_no_redlink, 5) ;

  $text .= "Count per bot, relevant action and status and no redlink:\n" ;
  foreach my $key (sort keys %counts_per_bot_relevant_action_and_status_no_redlink)
  {
    my $count = $counts_per_bot_relevant_action_and_status_no_redlink {$key} ;
    # if ($count < 5) { next ; }
    $text .= sprintf ("%-33s",$key) . sprintf ("%6d", $count) . "\n" ;
  }
  $text .= "\n\n" ;

  # edits/saves ratio per destination (non bot, no redlink), destinations with 100+ requests only
  $text .= "Counts no bot, per relevant action and status no redlink:\n" ;
  my ($count_edits, $count_submits) = (0, 0) ;
  my @ratios ;
  foreach my $key (sort keys %counts_no_bot_per_relevant_action_and_status_no_redlink)
  {
    if ($key !~ /action=edit/) { next ; } # one row per destination, driven by its edit key

    (my $to = $key) =~ s/,.*$// ;
    if ($to !~ /:/) { next ; }
    if ($counts_no_bot_no_redlink_per_destination {$to} < 100) { next ; }

    my $count_edit   = $counts_no_bot_per_relevant_action_and_status_no_redlink {"$to,action=edit"} ;
    my $count_submit = $counts_no_bot_per_relevant_action_and_status_no_redlink {"$to,action=submit"} ;
    $count_edits   += $count_edit ;
    $count_submits += $count_submit ;

    my $ratio = '..' ;
    if ($count_submit > 0)
    { $ratio = sprintf ("%5.1f", $count_edit / $count_submit) ; }

    # ratio goes in front as sort key, it is cut off again below
    push @ratios, "$ratio|" . sprintf ("%-14s",$to) . "edits " . sprintf ("%6d", $count_edit) . ", submits ". sprintf ("%6d", $count_submit) . ", ratio $ratio\n" ;
  }

  foreach my $entry (sort {$b <=> $a} @ratios)
  {
    my ($sort_key, $row) = split ('\|', $entry, 2) ;
    $text .= $row ;
  }

  my $ratio_total = '..' ; # no saves at all: same placeholder as in the rows above
  if ($count_submits > 0)
  { $ratio_total = sprintf ("%5.1f", $count_edits / $count_submits) ; }
  $text .= sprintf ("%-14s",'total') . "edits " . sprintf ("%6d", $count_edits) . ", submits ". sprintf ("%6d", $count_submits) . ", ratio $ratio_total\n" ;
  $text .= "\n\n" ;

  # (a stray 'print $count' without semicolon stood here; it swallowed the next
  #  statement as its argument list and has been removed)

  $text .= "Count per relevant status with redlink:\n" ;
  foreach my $key (sort keys %counts_per_relevant_status_with_redlink)
  {
    my $count = $counts_per_relevant_status_with_redlink {$key} ;
    (my $to = $key) =~ s/,.*$// ;
    if ($counts_per_destination {$to} < 100) { next ; }
    $text .= sprintf ("%6d", $count) . ",$key\n" ;
  }
  $text .= "\n\n" ;

  open SUMMARY, '>', $file_txt_summary or die "Could not open '$file_txt_summary' for writing: $!\n" ;
  print SUMMARY $text ;
  close SUMMARY ;

  print $text ;
}
| 476 | + |
| 477 | + |
# Number of days in the given month.
#   $yyyy - four digit year
#   $mm   - month, 1 .. 12
# Computed as the distance between midnight UTC on the first of this month
# and midnight UTC on the first of the next month.
sub DaysInMonth
{
  my ($yyyy, $mm) = @_ ;

  my ($next_yyyy, $next_mm) = ($mm >= 12) ? ($yyyy + 1, 1) : ($yyyy, $mm + 1) ; # December rolls over into January

  my $seconds_per_day = 24 * 60 * 60 ;
  my $month_start     = timegm (0, 0, 0, 1, $mm - 1,      $yyyy - 1900) ;
  my $next_start      = timegm (0, 0, 0, 1, $next_mm - 1, $next_yyyy - 1900) ;

  return (($next_start - $month_start) / $seconds_per_day) ;
}
Index: trunk/wikistats/squids/SquidCountArchive.sh |
— | — | @@ -0,0 +1,13 @@ |
#!/bin/bash
# Wrapper for SquidCountArchive.pl: caps virtual memory, resets the log file,
# runs the archive scan at reduced priority for a fixed date range and
# reports "Ready" both in the log file and on stdout.

# bash counts 'ulimit -v' in units of 1024 bytes
ulimit -v 4000000

home="/a/ezachte"
log="${home}/SquidCountArchiveLog.txt"
script="${home}/SquidCountArchive.pl"

# start the log with a single empty line
printf '\n' > "$log"

nice perl "$script" -d 2011/02/07-2011/02/11

# always reached, whether or not the perl job succeeded
echo "Ready" | tee -a "$log"
Property changes on: trunk/wikistats/squids/SquidCountArchive.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 15 | + native |
Index: trunk/wikistats/squids/SquidCountArchive.pl |
— | — | @@ -0,0 +1,1030 @@ |
#!/usr/bin/perl

  use lib "/home/ezachte/lib" ;
  use EzLib ;

  # timelocal and getopt are called by subs in this file; load their modules here
  # explicitly, so the file does not depend on another module having loaded them
  use Time::Local ;
  use Getopt::Std ;

  $trace_on_exit = $true ;
  ez_lib_version (13) ;

  use SquidCountArchiveProcessLogRecord ;
  use SquidCountArchiveReadInput ;
  use SquidCountArchiveWriteOutput ;

  # set defaults mainly for tests on local machine
  default_argv "-d 2010/05/10" ;

# http://wikitech.wikimedia.org/view/Squid_log_format
# 1. Hostname
# 2. Sequence number
# 3. Current time in ISO 8601 format (plus milliseconds), according to the squid server's clock
# 4. Request time in ms
# 5. Client IP
# 6. Squid request status, HTTP status code
# 7. Reply size including HTTP headers
# 8. Request method (GET/POST etc)
# 9. URL
# 10. Squid hierarchy status, peer IP
# 11. MIME content type
# 12. Referer header
# 13. X-Forwarded-For header
# 14. User-Agent header

# valid parameters:
# parm -d m[-n] (last m|n days before today) or yyyymmdd[-yyyymmdd] or yyyy/mm/dd[-yyyy/mm/dd]
# parm -f [1|2|12] force phase 1 and or 2 even when already ran successfully earlier
# phase 1 = collect IP frequency counts, this is first pass through data (there is little chance this needs to be redone, hence default is no overwrite)
# phase 2 = collect other counts, this may have to be redone after filtering logic has changed
# parm -t test mode

# todo: parm -e use unsampled file with all edits and saves
# todo: parm -r root folder

  $test = $false ;
  $test_maxlines = 4000000 ;

  # anywhere but on the production server: force test mode and require the local sample file
  if (! $job_runs_on_production_server)
  {
    $test = $true ;
    $file_test = "w:/# Out Locke/sampled-1000-log-20100510b.txt" ;
    # $file_test = getcwd . "/SquidDataFilterFY.txt" ;
    if (! -e $file_test)
    { abort "Test input file '$file_test' not found" ; }
  }

  $time_start = time ;

  if ($job_runs_on_production_server)
  { $path_root = "/a/ezachte" ; }
  else
  { $path_root = "w:/! perl/squids/archive/test" ; }

  # alternation of user agent fragments (regexp syntax)
  $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Palm Pre|Playstation|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|Wikiamo|Wikipanion" ;
  $tags_mobile_upd = "May 2010" ;

  # regexp fragments: optional sub domains in front of, top level domain behind a host name
  $pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ;
  $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ;

  my (%squid_seqno_lo, %squid_seqno_hi) ;

  my ($from_days_ago, $till_days_ago, $from_date, $till_date) = &ParseArguments ;
  &SetFileNames ;

  # process one day per iteration, oldest day first
  my ($path_out, $path_out_month) ;
  for ($days_ago = $from_days_ago ; $days_ago >= $till_days_ago ; $days_ago--)
  {
    if ($days_to_process ++ > 0)
    { print "\n" . "=" x 80 . "\n" ; }
    ($path_out, $path_out_month) = &SetPathOut ($days_ago) ;

    # report (but do not stop on) output files that can not be opened;
    # before this check such a failure went by unnoticed and the output was lost
    open OUT,  '>', "$path_out/$file_out"  or print STDERR "Could not open '$path_out/$file_out' for writing: $!\n" ;
    open OUT2, '>', "$path_out/$file_out2" or print STDERR "Could not open '$path_out/$file_out2' for writing: $!\n" ;
    open ERR,  '>', "$path_out/$file_err"  or print STDERR "Could not open '$path_out/$file_err' for writing: $!\n" ;
    # open FILTER_FY, '>>', "$path_out_month/$file_filter_fy" ;

    my $do_phase1 = &CheckProcessPhase1 ($days_ago, $path_out) ; # Collect IP frequencies
    my $do_phase2 = &CheckProcessPhase2 ($days_ago, $path_out) ; # collect other data

    next if ! $do_phase1 and ! $do_phase2 ;

    &InitGlobals ;
    undef @files ; # keep out of InitGlobals, to allow rerun with same files, see 'test InitGlobals' below

    ($date_collect_files, $time_to_start, $time_to_stop) = &SetTimeRangeToProcess ($days_ago) ;

    $all_files_found = &CollectFilesToProcess ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month) ;
    next if not $all_files_found ;

    if ($do_phase1) # Collect IP frequencies
    { &ProcessPhase1 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, @files) ; }

    if ($do_phase2) # collect other data
    { &ProcessPhase2 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month, @files) ; }

    # test InitGlobals: rebuild files in alternate folder, if InitGlobals did its work, all files are binary equal
    # &InitGlobals ;
    # if ($do_phase2) # collect other data
    # { &ProcessPhase2 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out. 'b', $path_out_month, @files) ; }

    close OUT ;
    close OUT2 ;
    close ERR ;
    # close FILTER_FY ;
  }

# if (defined ($options {"u"})) # all lines with action=edit or action=submit generated in mode scan_squid_archive
# { &ScanEditsSavesFile ; } # also use to build ScanDataCountriesSaves.csv for earlier months from SquidDataEditsSavesyyyy-mm-dd.txt.bz2
# else
# {
# if (defined ($options {"a"})) # scan ip addresses only (find multiple occurrences, store for reuse)
# {
# $scan_ip_frequencies = $true ;
# print "Scan for multiple occurrences of ip addresses\n\n" ;
# }
# elsif (defined ($options {"s"})) # scan squid sequence numbers
# {
# $scan_squid_msg_sequence_numbers = $true ;
# print "Scan for squid sequence numbers\n\n" ;
# }
# else
# {
# $scan_all_fields = $true ;
# print "Scan all fields\n\n" ;
# }

# &ScanSquidArchive ;
# }

# &ProcessSquidSequenceNumbers ;

  print "\n\nReady\n\n" ;
  exit ;
| 142 | + |
# Parse the command line.
#   -d  date range, either as dates (yyyymmdd[-yyyymmdd] or yyyy/mm/dd[-yyyy/mm/dd])
#       or as days before today (mmm[-nnn]); a single value stands for a one day range
#   -f  phases to force: 1, 2, 12 or 21 (or empty)
#   -t  test mode (sets global $test)
# Returns ($from_days_ago, $till_days_ago, $from_date, $till_date), with
# $from_days_ago >= $till_days_ago. Aborts on a missing or invalid date range or -f value.
# Also sets globals $date_range, $force_phases and, for the date form, the
# $from_/$till_ year, month and day parts.
sub ParseArguments
{
  trace ParseArguments ;

  my %options ;

  getopt ("df", \%options) ; # -d and -f take a value; other switches (like -t) end up in %options as flags

  $date_range   = $options {"d"} ;
  $force_phases = $options {"f"} ;

  if ($force_phases !~ /^(?:|1|2|12|21)$/)
  { abort "Invalid data for -f parameter: specify which phases to force as -f [1|2|12]\nForce = execute phase even when already done succesfully earlier\nPhase1 = collect ip counts\nPhase2 = collect other counts\n" ; }

  if ($date_range eq '')
  { abort "No valid date range specified\n\nSpecify first and last day to process as:\n'-d yyyymmdd[-yyyymmdd]' (yyyymmdd or yyyy/mm/dd, " .
          "second date defaults to first)\nor\n'-d mmm[-nnn]', where mmm and nnn are days before today (mmm less or equal to nnn), nnn defaults to mmm\n\n" ; }

  if ($date_range =~ m/^\d{4}\/?\d{2}\/?\d{2}(?:\-\d{4}\/?\d{2}\/?\d{2})?$/) # specify daterange as yyyymmdd-yyyymmdd or yyyy/mm/dd-yyyy/mm/dd
  {
    if ($date_range =~ /^\d{4}\/?\d{2}\/?\d{2}$/) # expand shorthand version
    { $date_range =~ s/^(\d{4}\/?\d{2}\/?\d{2})$/$1-$1/ ; }

    ($from_date,$till_date) = split '-', $date_range ;

    # take the date parts from a copy without slashes: the fixed offsets
    # below are only right for yyyymmdd, not for yyyy/mm/dd
    (my $from_compact = $from_date) =~ s/\///g ;
    (my $till_compact = $till_date) =~ s/\///g ;

    $from_year  = substr ($from_compact,0,4) ;
    $from_month = substr ($from_compact,4,2) ;
    $from_day   = substr ($from_compact,6,2) ;

    $till_year  = substr ($till_compact,0,4) ;
    $till_month = substr ($till_compact,4,2) ;
    $till_day   = substr ($till_compact,6,2) ; # was substr (..,6.2): a decimal point instead of a comma

    $from_days_ago = ValidateDateAndCalcDaysAgo ('from date', $from_date) ;
    $till_days_ago = ValidateDateAndCalcDaysAgo ('till date', $till_date) ;

    my $diff_days = ($from_days_ago - $till_days_ago) + 1 ;
    if ($till_days_ago > $from_days_ago)
    { abort "Invalid date range: from date '$from_date' is later than till date '$till_date'\n" ; }

    # show the date format that was actually used on the command line
    $yyyymmdd = 'yyyy/mm/dd' ;
    if ($from_date !~ /\//)
    { $yyyymmdd =~ s/\///g ; }
    print "Process following date range:\nFrom '$from_date' till '$till_date' ($yyyymmdd)\nWhich is from $from_days_ago till $till_days_ago days ago = $diff_days days\n" ;
  }
  elsif ($date_range =~ /^\d{1,3}(?:-\d{1,3})?$/) # specify daterange as mmm-nnn (where mmm and nnn are number of days before today), nnn defaults to mmm
  {
    if ($date_range =~ /^\d+$/) # expand shorthand version
    { $date_range =~ s/^(\d+)$/$1-$1/ ; }

    ($from_days_ago,$till_days_ago) = split '-', $date_range ;

    # range given in reverse order: swap instead of abort
    if ($till_days_ago > $from_days_ago)
    { ($from_days_ago, $till_days_ago) = ($till_days_ago, $from_days_ago) ; }

    ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;
    ($year,$month,$day) = &ShiftDays ($year+1900, $month+1, $day, - $from_days_ago) ;
    $from_date = sprintf ("%04d/%02d/%02d",$year,$month,$day) ;

    ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;
    ($year,$month,$day) = &ShiftDays ($year+1900, $month+1, $day, - $till_days_ago) ;
    $till_date = sprintf ("%04d/%02d/%02d",$year,$month,$day) ;

    my $diff_days = ($from_days_ago - $till_days_ago) + 1 ;
    print "Process following date range:\nFrom $from_days_ago till $till_days_ago days ago, which is:\nFrom '$from_date' till '$till_date' (yyyy/mm/dd) = $diff_days days\n" ;
  }
  else
  { abort "\nNo valid date range specified!\n\nSpecify first and last day to process as:\n'-d yyyymmdd[-yyyymmdd]' (yyyy/mm/dd also valid)\n" .
          "(second date defaults to first)\nor\n'-d mmm[-nnn]', where mmm and nnn are days before today (mmm =< nnn), nnn defaults to mmm\n\n" ; }

  if ($options {"t"})
  {
    $test = $true ;
    print "Run in test mode: process less input\n" ;
  }

  return ($from_days_ago, $till_days_ago, $from_date, $till_date) ;
}
| 222 | + |
# Validate a date and tell how many days before today it lies.
#   $desc - description of the date for error messages ('from date', 'till date')
#   $date - yyyymmdd or yyyy/mm/dd, years 2000-2099 only
# Returns a whole number of days between 1 and 366.
# Aborts when the date is malformed, does not exist on the calendar,
# is today or later, or is more than 366 days ago.
sub ValidateDateAndCalcDaysAgo
{
  trace ValidateDateAndCalcDaysAgo ;

  my ($desc, $date) = @_ ;

  # today: localtime gives a zero based month and the year minus 1900, which is what timelocal accepts
  my ($sec,$min,$hour,$day,$month,$year) = localtime (time) ;
  my $time_today = timelocal (0,0,0,$day, $month, $year) ;

  # today in the same notation as the input, for error messages
  my $date_today = sprintf ("%4d/%02d/%02d", $year+1900,$month+1,$day) ;
  if ($date !~ /\//)
  { $date_today =~ s/\///g ; }

  if ($date =~ m!^(20\d\d)/?(0[1-9]|1[012])/?(0[1-9]|[12][0-9]|3[01])$!)
  {
    # At this point, $1 holds the year, $2 the month and $3 the day of the date entered
    $year  = $1 ;
    $month = $2 ;
    $day   = $3 ;

    if ($day == 31 and ($month == 4 or $month == 6 or $month == 9 or $month == 11))
    { abort "$desc '$date': 31st of a month with 30 days" ; }
    elsif ($day >= 30 and $month == 2)
    { abort "$desc '$date': February 30th or 31st" ; }
    elsif ($month == 2 and $day == 29 and not ($year % 4 == 0 and ($year % 100 != 0 or $year % 400 == 0)))
    { abort "$desc '$date': February 29th outside a leap year" ; }
    else { ; } # valid date
  }
  else { abort "$desc '$date': not valid date format: use yyyymmdd or yyyy/mm/dd" ; }

  my $time_input = timelocal (0,0,0,$day, $month-1, $year-1900) ;

  # round to whole days: with a daylight saving switch in between, two local midnights
  # are not a multiple of 24 hours apart, and callers use the result as a loop counter
  my $days_ago = int (($time_today - $time_input) / (24 * 60 * 60) + 0.5) ;

  if ($days_ago < 1)
  { abort "$desc '$date' should be before today which is $date_today" ; }

  if ($days_ago > 366)
  { abort "$desc '$date' should be a year or less ago (but before today: '$date_today')" ; }

  return ($days_ago) ;
}
| 267 | + |
# Set the (package global) names of all files this script reads or writes.
# Names are relative paths; nearly all carry a 'private/' or 'public/' folder prefix.
# Also resets global $path_out to an empty string.
# Takes no parameters.
sub SetFileNames
{
  trace SetFileNames ;

  # debug output
  $file_out                 = "private/DebugSquidDataOutDoNotPublish.txt" ;
  $file_out2                = "private/DebugSquidDataOutDoNotPublish2.txt" ;
  $file_err                 = "private/DebugSquidDataErrDoNotPublish.txt" ;

  # files in folder 'private' (all names contain 'DoNotPublish')
  $file_ip_frequencies      = "private/SquidDataIpFrequenciesDoNotPublish.csv" ;
  $file_ip_frequencies_bz2  = "private/SquidDataIpFrequenciesDoNotPublish.csv.bz2" ;
  $file_out_referers        = "private/SquidDataReferersDoNotPublish.txt" ;
  $file_edits_saves         = "private/SquidDataEditsSavesDoNotPublish.txt" ;

  # csv files in folder 'public'
  $file_csv_agents          = "public/SquidDataAgents.csv" ;
  $file_csv_banners         = "public/SquidDataBanners.csv" ;
  $file_csv_binaries        = "public/SquidDataBinaries.csv" ;
  $file_csv_clients         = "public/SquidDataClients.csv" ;
  $file_csv_clients_by_wiki = "public/SquidDataClientsByWiki.csv" ; # request Howie
  $file_csv_countries_views = "public/SquidDataCountriesViews.csv" ; # was SquidDataCountries2.csv
  $file_csv_countries_timed = "public/SquidDataCountriesViewsTimed.csv" ; # was SquidDataCountriesTimed2.csv
  $file_csv_countries_saves = "public/SquidDataCountriesSaves.csv" ;
  $file_csv_bots            = "public/SquidDataCrawlers.csv" ;
  $file_csv_extensions      = "public/SquidDataExtensions.csv" ;
  $file_csv_googlebots      = "public/SquidDataGoogleBots.csv" ;
  $file_csv_images          = "public/SquidDataImages.csv" ;
  $file_csv_indexphp        = "public/SquidDataIndexPhp.csv" ;
  $file_csv_languages       = "public/SquidDataLanguages.csv" ;
  $file_csv_methods         = "public/SquidDataMethods.csv" ;
  $file_csv_opsys           = "public/SquidDataOpSys.csv" ;
  $file_csv_origins         = "public/SquidDataOrigins.csv" ;
  $file_csv_requests        = "public/SquidDataRequests.csv" ;
  $file_csv_requests_wap    = "public/SquidDataRequestsWap.csv" ;
  $file_csv_requests_m      = "public/SquidDataRequestsM.csv" ; # .m. in url, not mobile as derived from agent
  $file_csv_scripts         = "public/SquidDataScripts.csv" ;
  $file_csv_search          = "public/SquidDataSearch.csv" ;
  $file_csv_skins           = "public/SquidDataSkins.csv" ;

  # files without folder prefix
  # ($file_head_tail used to be set twice; only this value, the last one assigned, ever took effect)
  $file_seqno_per_squidhour = "SquidDataSequenceNumbersPerSquidHour.csv" ;
  $file_seqno_all_squids    = "SquidDataSequenceNumbersAllSquids.csv" ;
  $file_head_tail           = "SquidDataLogFilesHeadTail.csv" ;
# $file_filter_fy           = "SquidDataFilterFY.txt" ;

  $path_out = "" ;
}
| 313 | + |
# Determine the output folders for the day that lies $days_ago days before $time_start,
# and create them when needed.
#
# Parameter:
#   $days_ago - number of days before global $time_start
# Returns:
#   ($path_out, $path_out_month), where
#   $path_out_month = "$path_root/yyyy-mm" and $path_out = "$path_out_month/yyyy-mm-dd"
# Side effects:
#   - creates the month folder, the day folder and its subfolders 'private' and 'public'
#   - removes obsolete signal files '^Ready' and '@Ready' from the day folder
#   - sets globals $sec, $min, $hour, $day, $month, $year and $file_ready
# Dies when a folder can not be created.
sub SetPathOut
{
  trace SetPathOut ;

  my $days_ago = shift ;
  my ($path_out, $path_out_month) ;

  # not declared with 'my': these package variables have always been set here
  ($sec,$min,$hour,$day,$month,$year) = localtime ($time_start - $days_ago * 24 * 3600) ;

  $path_out_month = "$path_root/" . sprintf ("%04d-%02d", $year+1900, $month+1) ;
  $path_out       = "$path_out_month/" . sprintf ("%04d-%02d-%02d", $year+1900, $month+1, $day) ;

  # Test each folder separately: formerly 'private' and 'public' were only created
  # together with the day folder, so a run that got interrupted in between
  # left a day folder that stayed incomplete forever.
  foreach my $dir ($path_out_month, $path_out, "$path_out/private", "$path_out/public")
  {
    next if -d $dir ;
    # print "mkdir $dir\n" ;
    mkdir ($dir) || die "Unable to create directory $dir: $!\n" ;
  }

  # clean up obsolete signal files
  $file_ready = "$path_out/\^Ready" ;
  unlink $file_ready ;
  $file_ready = "$path_out/\@Ready" ;
  unlink $file_ready ;

  trace "SetPathOut for $days_ago days ago => path_out = '$path_out'\n" ;
  return ($path_out, $path_out_month) ;
}
| 354 | + |
# Determine the time range that covers one full day of log data.
#
# Parameter:
#   $days_ago - the day to process, counted in days before global $time_start
# Returns:
#   ($date_collect_files, $time_to_start, $time_to_stop), where
#   $date_collect_files is formatted as yyyy-mm-dd,
#   $time_to_start is that date followed by 'T00:00:00' and
#   $time_to_stop is the date one day later followed by 'T00:00:00'
sub SetTimeRangeToProcess
{
  my $days_ago = shift ;

  # each variable is declared only once
  # (a second 'my' for the same names in this scope yields a 'masks earlier declaration' warning)
  my ($day, $month, $year) = (localtime ($time_start - $days_ago * 24 * 3600)) [3,4,5] ;
  my $date_collect_files = sprintf ("%4d-%02d-%02d", $year+1900, $month+1, $day) ;
  my $time_to_start      = $date_collect_files . "T00:00:00" ;

  my ($day_after, $month_after, $year_after) = (localtime ($time_start - ($days_ago-1) * 24 * 3600)) [3,4,5] ;
  my $date_after_collect_files = sprintf ("%4d-%02d-%02d", $year_after+1900, $month_after+1, $day_after) ;
  my $time_to_stop             = $date_after_collect_files . "T00:00:00" ;
# my $time_to_stop = $date_collect_files . "T23:30:00" ; # Q&D fix to process last file available

  # if ($test)
  # { $time_to_stop = $date_collect_files . "T00:30:00" ; }

  return ($date_collect_files, $time_to_start, $time_to_stop) ;
}
| 372 | + |
# Decide whether phase 1 (collecting ip address counts) needs to run for a given day.
#
# Parameters:
#   $days_ago - not used here
#   $path_out - output folder for the day
# Returns:
#   $true  when the compressed ip frequencies file does not exist in $path_out,
#          or when it does exist but global $force_phases contains '1'
#   $false when the file exists and phase 1 is not forced
# Prints the decision taken.
sub CheckProcessPhase1 # Collect IP frequencies
{
  trace CheckProcessPhase1 ;

  my ($days_ago, $path_out) = @_ ;
  my $process = $true ;

  my $file_ready = $file_ip_frequencies_bz2 ;
  my $path_ready = "$path_out/$file_ready" ;

  # all three messages now show the '/' between folder and file name (two of them lacked it)
  if (-e $path_ready)
  {
    if ($force_phases !~ /1/)
    {
      $process = $false ;
      print "File '[path_out]/$file_ready' already exists => skip phase 1 (collecting ip address counts)\n" ;
    }
    else
    { print "File '[path_out]/$file_ready' already exists.\nYet force execute phase 1 (collecting ip address counts), as -f 1 has been specified\n" ; }
  }
  else
  { print "File '[path_out]/$file_ready' not found -> process phase 1\n" ; }

  return ($process) ;
}
| 398 | + |
# Decide whether phase 2 (collecting counts other than ip counts) needs to run for a given day.
#
# Parameters:
#   $days_ago - not used here
#   $path_out - output folder for the day
# Returns:
#   $true  when signal file '#Ready' does not exist in $path_out,
#          or when it does exist but global $force_phases contains '2'
#   $false when the signal file exists and phase 2 is not forced
# Prints the decision taken.
sub CheckProcessPhase2 # collect other data
{
  trace CheckProcessPhase2 ;

  my ($days_ago, $path_out) = @_ ;

  my $file_ready = "#Ready" ;

  if (! -e "$path_out/$file_ready")
  {
    print "File '[path_out]/$file_ready' not found -> process phase 2\n" ;
    return ($true) ;
  }

  if ($force_phases =~ /2/)
  {
    print "File '[path_out]/$file_ready' already exists.\nYet force execute phase 2 (collecting counts other than ip counts), as -f 2 has been specified\n" ;
    return ($true) ;
  }

  print "File '[path_out]/$file_ready' already exists => skip phase 2 (collecting counts other than ip counts)\n" ;
  return ($false) ;
}
| 424 | + |
# Reset all global counters and tally hashes listed below to undefined / empty.
# Takes no parameters.
sub InitGlobals # qqq
{
  trace InitGlobals ;

  # scalars: 'foreach' aliases each variable in turn, so 'undef $_' resets the variable itself
  undef $_ foreach
  (
    $addresses_stored,
    $banner_requests_ignored,
    $date_prev,
    $fields_too_few,
    $fields_too_many,
    $googlebots,
    $googles,
    $html_pages_found,
    $lines_in_file,
    $lines_processed,
    $lines_this_day,
    $newest_time_read,
    $oldest_time_read,
    $statusses_non_tcp,
    $tot_mime_html,
    $tot_mime_html2,
    $tot_origins_external_counted,
    $tot_referers_external,
    $tot_referers_internal,
    $unrecognized_domains,
  ) ;

  # hashes: reset through a reference to each hash
  undef %$_ foreach
  (
    \%google_bot_hits,
    \%ip_bot_no_google,
    \%agents_raw,
    \%binaries,
    \%bots,
    \%client_ip_record_cnt,
    \%client_ip_record_cnt_total,
    \%clients,
    \%clients_by_wiki,
    \%cnt_ip_ranges,
    \%countries,
    \%countries_saves,
    \%countries_timed,
    \%countries_views,
    \%edit_submit_filtered,
    \%engines,
    \%exts,
    \%google_imposters,
    \%googlebins,
    \%googlebins2,
    \%grouped_clients,
    \%imagesizes,
    \%index_php,
    \%index_php_raw,
    \%ip_distribution,
    \%ip_frequencies,
    \%languages,
    \%languages_unrecognized,
    \%lines_read,
    \%mobile_other,
    \%operating_systems,
    \%origin_simplified,
    \%origins,
    \%origins_external,
    \%origins_unsimplified,
    \%referers_internal,
    \%requests,
    \%scripts,
    \%search,
    \%skins,
    \%squid_delta,
    \%squid_events,
    \%squid_seqno,
    \%statusses,
    \%unrecognized_domains,
    \%wikis,
  ) ;

# undef @files ;
}
| 498 | + |
# Phase 1: collect IP frequencies, needed for filtering probable bots in phase 2.
#
# Parameters:
#   $days_ago, $date_collect_files - not used here
#   $time_to_start, $time_to_stop  - time range, passed on to ReadSquidLogFiles
#   $path_out                      - output folder for the day
#   @files                         - files, passed on to ReadSquidLogFiles
# Sets globals $scan_ip_frequencies (to $true) and $scan_all_fields (to $false).
# Returns early, without writing output, when ReadSquidLogFiles returns a false value.
sub ProcessPhase1
{
  trace "ProcessPhase1: Collect IP frequencies" ;

  my ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, @files) = @_ ;

  ($scan_ip_frequencies, $scan_all_fields) = ($true, $false) ;

  my $read_ok = &ReadSquidLogFiles ($path_out, $time_to_start, $time_to_stop, @files) ;
  unless ($read_ok)
  { return ; }

  &WriteOutputIpFrequencies ($path_out) ;
}
| 513 | + |
# Phase 2: collect all data other than the ip frequencies, and write the output files.
#
# Parameters:
#   $days_ago                     - not used here
#   $date_collect_files           - passed on to MoveAndCompressFiles
#   $time_to_start, $time_to_stop - time range, passed on to ReadSquidLogFiles
#   $path_out                     - output folder for the day
#   $path_out_month               - output folder for the month, passed on to MoveAndCompressFiles
#   @files                        - files, passed on to ReadSquidLogFiles
# Sets globals $scan_ip_frequencies (to $false) and $scan_all_fields (to $true).
# Returns early, without writing output, when ReadIpFrequencies or ReadSquidLogFiles
# returns a false value.
sub ProcessPhase2 # Collect other data
{
  trace "ProcessPhase2: Collect other data" ;
  my ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month, @files) = @_ ;

  $scan_ip_frequencies = $false ;
  $scan_all_fields     = $true ;

  # $data_read is declared once and reused
  # (a second 'my' in the same scope yields a 'masks earlier declaration' warning)
  my $data_read = &ReadIpFrequencies ($path_out) ;
  return if not $data_read ;

  $data_read = &ReadSquidLogFiles ($path_out, $time_to_start, $time_to_stop, @files) ;
  return if not $data_read ;

  &WriteOutputSquidSequenceGaps ($path_out) ;
  &WriteOutputSquidLogs         ($path_out) ;
  &WriteOutputEditsSavesFile    ($path_out) ;
  &WriteOutputCountriesSaves    ($path_out) ;

  &WriteDiagnostics ; # called with '&' and without parentheses: receives the current @_ as its arguments

  # two identical tests in a row have been merged into one
  if ($job_runs_on_production_server)
  {
    &MoveAndCompressFiles ($path_out, $path_out_month, $date_collect_files) ;

    $cmd = "echo \"Ready in \"" . ddhhmmss (time - $time_start). " > $path_out/\#Ready" ; # use in next run to test whether this day has been completely processed
    `$cmd` ;
    $cmd = "echo \"\nReady in \"" . ddhhmmss (time - $time_start). " >> /home/ezachte/SquidCountArchiveLog.txt\n\n" ;
    `$cmd` ;
  }
}
| 547 | + |
| 548 | +#sub ScanSquidArchive |
| 549 | +#{ |
| 550 | +# trace ScanSquidArchive ; |
| 551 | + |
| 552 | +# $T00 = "T00:00:00" ; |
| 553 | + |
| 554 | +# ($time_to_start, $time_to_stop) = &GetSquidLogsToProcess ; # aborts if not all found |
| 555 | + |
| 556 | +# open OUT, '>', "$path_out/$file_out" ; |
| 557 | +# open OUT2, '>', "$path_out/$file_out2" ; |
| 558 | +# open ERR, '>', "$path_out/$file_err" ; |
| 559 | + |
| 560 | +# &CheckSquidLogsAlreadyProcessed ; # aborts if this is the case |
| 561 | + |
| 562 | +# if ($scan_all_fields) |
| 563 | +# { &ReadIpFrequencies ; } |
| 564 | + |
| 565 | +# &ReadSquidLogFiles ; |
| 566 | + |
| 567 | +# if (($oldest_time_read gt $time_to_start) || ($newest_time_read lt $time_to_stop)) |
| 568 | +# { abort ("Log does not contain full range from $time_to_start till $time_to_stop (oldest time read $oldest_time_read, newest time read $newest_time_read)\n") unless $test ; } |
| 569 | + |
| 570 | +# print "\ncd $path_out\n" ; |
| 571 | +# chdir ($path_out) ; |
| 572 | + |
| 573 | +# &WriteOutputSquidLogs ; |
| 574 | + |
| 575 | +# if ($scan_all_fields) |
| 576 | +# { &WriteDiagnostics ; } |
| 577 | + |
| 578 | +# close OUT ; |
| 579 | +# close OUT2 ; |
| 580 | +# close ERR ; |
| 581 | + |
| 582 | +# if ($job_runs_on_production_server && $scan_all_fields) |
| 583 | +# { &MoveAndCompressFiles ($path_out, $time_to_start) ; } |
| 584 | +#} |
| 585 | + |
| 586 | +#sub GetSquidLogsToProcess |
| 587 | +#{ |
| 588 | +# trace GetSquidLogsToProcess ; |
| 589 | + |
| 590 | +# my ($date_archived, $datestart, $datestop) ; |
| 591 | + |
| 592 | +# $time = time ; |
| 593 | +# my ($sec,$min,$hour,$day,$month,$year) = localtime ($time) ; |
| 594 | + |
| 595 | +# $day_today = sprintf ("%04d-%02d-%02d",$year+1900,$month+1,$day) ; |
| 596 | +# print "Date today is $day_today.\n\n" ; |
| 597 | + |
| 598 | +# if ($job_runs_on_production_server) |
| 599 | +# { |
| 600 | +# $dir_in = "/a/squid/archive" ; |
| 601 | + |
| 602 | +# if ($logdate =~ /^\d{8}$/) |
| 603 | +# { |
| 604 | +# $year = substr ($logdate,0,4) ; |
| 605 | +# $month = substr ($logdate,4,2) ; |
| 606 | +# $day = substr ($logdate,6,2) ; |
| 607 | + |
| 608 | +# $time_to_start = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ; |
| 609 | +# ($year,$month,$day) = &ShiftDays ($year, $month, $day, 1) ; |
| 610 | +# $time_to_stop = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ; |
| 611 | +# } |
| 612 | +# elsif ($logdate =~ /^-\d+$/) |
| 613 | +# { |
| 614 | +# ($sec,$min,$hour,$day,$month,$year) = localtime ($time+$logdate*24*3600) ; |
| 615 | +# $year += 1900 ; |
| 616 | +# $month += 1 ; |
| 617 | +# $time_to_start = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ; |
| 618 | +# ($year,$month,$day) = &ShiftDays ($year, $month, $day, 1) ; |
| 619 | +# $time_to_stop = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ; |
| 620 | +# } |
| 621 | +# else |
| 622 | +# { |
| 623 | +# print "No logdate specified\n" ; |
| 624 | +# exit ; |
| 625 | +# } |
| 626 | + |
| 627 | +# print "-d $logdate => Process data from $time_to_start till $time_to_stop\n\n" ; |
| 628 | +# } |
| 629 | +# else # test |
| 630 | +# { |
| 631 | +# # $time_to_start = "2009-02-05T00" ; |
| 632 | +# # $time_to_stop = "2009-02-05T23:59:59" ; |
| 633 | +# # push @files, getcwd . "/sampled-1000-oneday.txt" ; |
| 634 | + |
| 635 | +# $time_to_start = "2010-05-10T00" ; |
| 636 | +# $time_to_stop = "2010-05-10T01" ; |
| 637 | +# push @files, getcwd . "/sampled-1000-log-20100510.txt" ; |
| 638 | + |
| 639 | +# print "Job runs in test env => Process data from $time_to_start till $time_to_stop\n\n" ; |
| 640 | +# } |
| 641 | + |
| 642 | +# $some_files_found = $false ; |
| 643 | +# $full_range_found = $false ; |
| 644 | + |
| 645 | +# ($path_out, $path_out_month) = &GetPathOut ($time_to_start) ; |
| 646 | +# $path_head_tail = "$path_out_month/$file_head_tail" ; |
| 647 | + |
| 648 | +# if ($job_runs_on_production_server) |
| 649 | +# { |
| 650 | +# # file naming scheme on server: sampled-1000.log-yyyymmdd, does not mean on that day file sampled-1000.log was archived |
| 651 | +# # file can contain data for days(s) before and day (days?) after yyyymmdd, see e.g. sampled-10000.log-20090802 (days 0801-0803) |
| 652 | +# # this is confusing so start a few days earlier and check for each day: |
| 653 | +# # whether a file exists and whether it's 'head' and or 'tail' time (first last record) fall within range |
| 654 | + |
| 655 | +# # find first and last file to process that comprise all log records within date range |
| 656 | +# $year = substr ($time_to_stop,0,4) ; |
| 657 | +# $month = substr ($time_to_stop,5,2) ; |
| 658 | +# $day = substr ($time_to_stop,8,2) ; |
| 659 | +# ($year,$month,$day) = &ShiftDays ($year, $month, $day, +5) ; |
| 660 | +# $datestop = sprintf ("%4d%02d%02d", $year, $month, $day) ; |
| 661 | + |
| 662 | +# $year = substr ($time_to_start,0,4) ; |
| 663 | +# $month = substr ($time_to_start,5,2) ; |
| 664 | +# $day = substr ($time_to_start,8,2) ; |
| 665 | + |
| 666 | +# ($year,$month,$day) = &ShiftDays ($year, $month, $day, -5) ; |
| 667 | +# $datestart = sprintf ("%4d%02d%02d", $year, $month, $day) ; |
| 668 | + |
| 669 | +# $date_archived = $datestart ; |
| 670 | +# while ($date_archived lt $datestop) |
| 671 | +# { |
| 672 | +# $date_archived = sprintf ("%4d%02d%02d", $year, $month, $day) ; |
| 673 | +# ($year,$month,$day) = &ShiftDays ($year, $month, $day, +1) ; |
| 674 | + |
| 675 | +# $file = "$dir_in/sampled-1000.log-$date_archived.gz" ; |
| 676 | + |
| 677 | +# if (-e $file) |
| 678 | +# { |
| 679 | +# ($timehead,$timetail) = &GetLogRange ($file, $path_head_tail) ; |
| 680 | + |
| 681 | +# if (($timehead lt $time_to_start) && ($timetail ge $time_to_start)) |
| 682 | +# { |
| 683 | +# $some_files_found = $true ; |
| 684 | +# $processfiles = $true ; |
| 685 | +# } |
| 686 | + |
| 687 | +# if ($processfiles) |
| 688 | +# { |
| 689 | +# print "$file: time range $timehead - $timetail\n" ; |
| 690 | +# push @files, $file ; |
| 691 | +# } |
| 692 | + |
| 693 | +# if (($timehead lt $time_to_stop) && ($timetail ge $time_to_stop)) |
| 694 | +# { |
| 695 | +# $full_range_found = $true ; |
| 696 | +# last ; |
| 697 | +# } |
| 698 | +# } |
| 699 | +# } |
| 700 | +# } |
| 701 | + |
| 702 | +# if ($job_runs_on_production_server) |
| 703 | +# { |
| 704 | +# if (! $some_files_found) |
| 705 | +# { print "Not any file containing start time. Aborting...\n\n" ; exit ; } |
| 706 | +# if (! $full_range_found) |
| 707 | +# { print "Not all files were found. Aborting...\n\n" ; exit ; } |
| 708 | +# } |
| 709 | + |
| 710 | +# print "\n" ; |
| 711 | +# foreach $file (sort @files) |
| 712 | +# { print "Process $file\n" ; } |
| 713 | + |
| 714 | +# return ($time_to_start, $time_to_stop) ; |
| 715 | +#} |
| 716 | + |
| 717 | +#sub GetPathOut |
| 718 | +#{ |
| 719 | +# my $time_to_start = shift ; |
| 720 | + |
| 721 | +# $path_out = substr ($time_to_start,0,7) ; |
| 722 | +# if ($job_runs_on_production_server) |
| 723 | +# { |
| 724 | +# $path_out = "$path_root/$path_out" ; |
| 725 | +# $path_out_month = $path_out ; |
| 726 | +# } |
| 727 | + |
| 728 | +# if (! -d $path_out) |
| 729 | +# { |
| 730 | +# mkdir ($path_out) || die "Unable to create directory $path_out\n" ; |
| 731 | +# print "mkdir $path_out\n" ; |
| 732 | +# } |
| 733 | + |
| 734 | +# $path_out .= "/" . substr ($time_to_start,0,10) ; |
| 735 | +# if (! -d $path_out) |
| 736 | +# { |
| 737 | +# mkdir ($path_out) || die "Unable to create directory $path_out\n" ; |
| 738 | +# print "mkdir $path_out\n" ; |
| 739 | +# } |
| 740 | + |
| 741 | +# # clean up obsolete signal files |
| 742 | +# $file_ready = "$path_out/\^Ready" ; |
| 743 | +# unlink $file_ready ; |
| 744 | +# $file_ready = "$path_out/\@Ready" ; |
| 745 | +# unlink $file_ready ; |
| 746 | + |
| 747 | +# return ($path_out,$path_out_month) ; |
| 748 | +#} |
| 749 | + |
| 750 | +#sub CheckSquidLogsAlreadyProcessed |
| 751 | +#{ |
| 752 | +# trace CheckSquidLogsAlreadyProcessed ; |
| 753 | + |
| 754 | +# if ($scan_ip_frequencies) |
| 755 | +# { |
| 756 | +# if (-e $file_ip_frequencies) |
| 757 | +# { |
| 758 | +# print "File $path_out/$file_ip_frequencies exists -> Day already processed\nExiting ...\n" ; |
| 759 | +# exit ; |
| 760 | +# } |
| 761 | +# } |
| 762 | +# elsif ($scan_squid_msg_sequence_numbers) |
| 763 | +# { |
| 764 | +# if (-e $file_sequence_numbers) |
| 765 | +# { |
| 766 | +# print "File $path_out/$file_sequence_numbers exists -> Day already processed\nExiting ...\n" ; |
| 767 | +# exit ; |
| 768 | +# } |
| 769 | +# } |
| 770 | +# else |
| 771 | +# { |
| 772 | +# if (-e $file_ready) |
| 773 | +# { |
| 774 | +# print "File $file_ready exists -> Day already processed\nExiting ...\n" ; |
| 775 | +# exit ; |
| 776 | +# } |
| 777 | +# else |
| 778 | +# { print "File $file_ready not found -> process data\n" ; } |
| 779 | +# } |
| 780 | +#} |
| 781 | + |
| 782 | +#sub ScanEditsSavesFile |
| 783 | +#{ |
| 784 | +# trace ScanEditsSavesFile ; |
| 785 | + |
| 786 | +# if ($logdate =~ /^\d{8}$/) |
| 787 | +# { |
| 788 | +# $year = substr ($logdate,0,4) ; |
| 789 | +# $month = substr ($logdate,4,2) ; |
| 790 | +# $day = substr ($logdate,6,2) ; |
| 791 | +# } |
| 792 | +# else |
| 793 | +# { |
| 794 | +# print "No (valid) logdate specified\n" ; |
| 795 | +# if ($job_runs_on_production_server) |
| 796 | +# { exit ; } |
| 797 | +# else |
| 798 | +# { |
| 799 | +# $year = 2010 ; |
| 800 | +# $month = 4 ; |
| 801 | +# $day = 01 ; |
| 802 | +# } |
| 803 | +# } |
| 804 | + |
| 805 | +# $time_to_start = sprintf ("%04d-%02d-%02d$T00",$year,$month,$day) ; |
| 806 | +# ($year2,$month2,$day2) = &ShiftDays ($year, $month, $day, 1) ; |
| 807 | +# $time_to_stop = sprintf ("%04d-%02d-%02d$T00",$year2,$month2,$day2) ; |
| 808 | + |
| 809 | +# ($path_out, $path_out_month) = &GetPathOut ($time_to_start) ; |
| 810 | + |
| 811 | +# if ($job_runs_on_production_server) |
| 812 | +# { $path_out = $path_root ; } |
| 813 | +# else |
| 814 | +# { |
| 815 | +# push @files, getcwd . "/sampled-1000.log-20100401" ; |
| 816 | +# # return ; |
| 817 | +# } |
| 818 | + |
| 819 | +# $file_txt = "$path_root/" . sprintf ("%4d-%02d", $year, $month) . "/SquidDataEditsSaves" . sprintf ("%4d-%02d-%02d", $year, $month, $day) . ".txt.bz2" ; |
| 820 | +# $file_csv = "$path_root/" . sprintf ("%4d-%02d", $year, $month) . "/" . sprintf ("%4d-%02d-%02d", $year, $month, $day) . "/$file_csv_indexphp" ; |
| 821 | +# $file_csv_countries_saves = "$path_root/" . sprintf ("%4d-%02d", $year, $month) . "/" . sprintf ("%4d-%02d-%02d", $year, $month, $day) . "/$file_csv_countries_saves" ; |
| 822 | +# if (-e $file_txt) |
| 823 | +# { |
| 824 | +# &ReadInputEditsSavesFile ($file_txt) ; |
| 825 | +# &WriteOutputEditsSavesFile ($file_csv) ; |
| 826 | +# &WriteOutputCountriesSaves ($file_csv_countries_saves) ; |
| 827 | +# } |
| 828 | +# else |
| 829 | +# { print "ScanEditsSavesFile: File $file_txt not found. Aborting...\n\n" ; exit ; } |
| 830 | +#} |
| 831 | + |
# Shift a calendar date by a number of days.
#
# Parameters:
#   $year, $month, $day - date to start from (full year, month 1-12)
#   $delta              - number of days to shift (negative = back in time)
# Returns:
#   ($year, $month, $day) of the shifted date (full year, month 1-12)
sub ShiftDays
{
  my ($year, $month, $day, $delta) = @_ ;

  # Anchor at noon rather than midnight: in a time zone with daylight saving time
  # 'midnight plus n x 24 hours' can be 23:00 on the day before the intended day,
  # while noon shifted by an hour either way still falls on the intended day.
  my $time = timelocal (0,0,12,$day, $month-1, $year-1900) ;

  # only day, month and year are taken, so globals $sec, $min and $hour are no longer overwritten
  ($day, $month, $year) = (localtime ($time + $delta * 24 * 3600)) [3,4,5] ;

  return ($year+1900,$month+1,$day) ;
}
| 844 | + |
# Reverse (more or less) the abbreviations used for project codes.
#
# Parameter:
#   text to expand, e.g. 'wp:en'
# Returns:
#   the text with one leading '@' or '*' removed and known abbreviations expanded,
#   e.g. 'wikipedia:en'; text without known abbreviations is returned as is
sub ExpandAbbreviation
{
  my ($expanded) = @_ ;

  $expanded =~ s/^[\@\*]// ;

  # Rules are applied in this order, each rule once at most.
  # Later rules see the result of earlier ones, e.g. 'wx:wm' -> 'wikispecial:wm' -> 'wikispecial:wikimedia'.
  my @rules =
  (
    [qr/^xx:upload/, 'upload: '     ],
    [qr/^wb:/,       'wikibooks:'   ],
    [qr/^wk:/,       'wiktionary:'  ],
    [qr/^wn:/,       'wikinews:'    ],
    [qr/^wp:/,       'wikipedia:'   ],
    [qr/^wq:/,       'wikiquote:'   ],
    [qr/^ws:/,       'wikisource:'  ],
    [qr/^wv:/,       'wikiversity:' ],
    [qr/^wx:/,       'wikispecial:' ],
    [qr/^mw:/,       'wikispecial:' ], # eg bugzilla
    [qr/:!mw/,       ':mediawiki'   ],
    [qr/^wm:/,       'wikimedia:'   ],
    [qr/:wm$/,       ':wikimedia'   ],
    [qr/^wmf:/,      'foundation:'  ],
    [qr/:www$/,      ':portal'      ],
  ) ;

  foreach my $rule (@rules)
  {
    my ($pattern, $replacement) = @$rule ;
    $expanded =~ s/$pattern/$replacement/ ;
  }

  return ($expanded) ;
}
| 869 | + |
# Summarize the average gap between squid sequence numbers per week and per month.
#
# Reads 'SquidDataSequenceNumbersAllSquids.csv' from the current folder, with lines formatted as
#   yyyy-mm-dd,hour,events,mean gap
# (lines that contain '*' or no such date are skipped), and writes
# 'SquidDataSequenceNumbersAllSquidsOut.csv' to the current folder:
#   - a table with one row per hour of day (0-23) and one column per week: weighted mean gap
#   - a row 'all day' with the weighted mean gap per week
#   - one line per month with the weighted mean gap for that month
# Everything written to the output file is echoed to STDOUT.
#
# Totals are kept in package globals (%events, %totgap, %events_allday, %totgap_allday,
# %events_allmonth, %totgap_allmonth, %weeks, %months, %weekno_start, %weekno_stop).
# These are not reset here.
#
# Takes no parameters. Warns and returns without producing output when a file can not be opened
# (formerly a missing input file went unnoticed and the output file was replaced by an empty table).
sub ProcessSquidSequenceNumbers
{
  my $file_csv_in  = 'SquidDataSequenceNumbersAllSquids.csv' ;
  my $file_csv_out = 'SquidDataSequenceNumbersAllSquidsOut.csv' ;

  # input has been established for last three months of data in WriteOutputSquidLogs
  # there for each day per squid and hour of day total event and total gap were established
  # avg gap for all squids combined (per hour and per day) was written to this csv file
  my $csv_in ;
  if (! open ($csv_in, '<', $file_csv_in))
  {
    warn "ProcessSquidSequenceNumbers: unable to read '$file_csv_in': $!\n" ;
    return ;
  }

  while (my $line = <$csv_in>)
  {
    next if $line =~ /\*/ ;
    next if $line !~ /\d\d\d\d\-\d\d\-\d\d,/ ;
    chomp $line ;

    my ($date, $hour, $event_count, $mean_gap) = split (',', $line) ;

    # round trip via epoch time yields day of year, from which a week number is derived
    my $time = timelocal (0,0,0, substr ($date,8,2), substr ($date,5,2) - 1, substr ($date,0,4) - 1900) ;
    my ($day, $month, $yday) = (localtime ($time)) [3,4,7] ;
    $month ++ ;
    my $weekno = int ($yday / 7) ;

    if ($weekno_start {$weekno} eq '')
    { $weekno_start {$weekno} = $date ; }
    $weekno_stop {$weekno} = $date ;

    $events        {"$weekno,$hour"} += $event_count ;
    $totgap        {"$weekno,$hour"} += $event_count * $mean_gap ;
    $events_allday {$weekno}         += $event_count ;
    $totgap_allday {$weekno}         += $event_count * $mean_gap ;

    # to establish correction factor per month ignore all days when another anomaly occurred, or after problem was fixed
    # wk 23: from 6/11 till 6/16 unusually many messages got lost due to temporary slowdown of server
    #        (unwanted blocking process had been introduced by vector switch)
    # wk 26: on 6/27 and 6/28 22 hours of data were lost after incomplete manual restart of locke
    # wk 26/27: from 7/7 till 7/10 69 hours of data were lost after incomplete restart of locke after power down
    #        (week 27 does not stand out in the chart, squids got rebooted? <- counters were reset?)
    # wk 29: 7/22 Mark stopped several secondary processes on locke,
    #        around 14.00 hrs GMT message loss vanished almost entirely
    #        After that average gap became 1003, meaning only 0.3% of messages is missing.
    next if $month == 6 and (($day >= 11 and $day <= 16) or ($day >= 27 and $day <= 28)) ;
    next if $month == 7 and (($day >=  7 and $day <= 10) or ($day >= 22)) ;
    # these dates where data were missing or underreported are already skipped in WikiCountsSummarizeProjectCounts
    # and totals are already extrapolated

    $events_allmonth {$month} += $event_count ;
    $totgap_allmonth {$month} += $event_count * $mean_gap ;

    # only days that were not skipped above make a week or month appear in the output
    $weeks  {$weekno} ++ ;
    $months {$month}  ++ ;
  }
  close ($csv_in) ;

  my $csv_out ;
  if (! open ($csv_out, '>', $file_csv_out))
  {
    warn "ProcessSquidSequenceNumbers: unable to write '$file_csv_out': $!\n" ;
    return ;
  }

  # write same text to output file and to STDOUT
  my $emit = sub
  {
    my $text = shift ;
    print $csv_out $text ;
    print $text ;
  } ;

  # weighted mean gap, rounded to whole number; 0 when there were no events
  my $calc_mean_gap = sub
  {
    my ($total_gap, $total_events) = @_ ;
    return ($total_events > 0 ? sprintf ("%.0f", $total_gap / $total_events) : 0) ;
  } ;

  my @weeknos = sort {$a <=> $b} keys %weeks ;

  # header: per week the first date seen, as m/dd
  $emit->("hour,") ;
  foreach my $weekno (@weeknos)
  {
    my $start = substr ($weekno_start {$weekno},5) ;
    $start =~ s/-/\//g ;
    $start =~ s/^0//g ;
    $emit->("wk $weekno: ($start ..),") ;
  }
  $emit->("\n") ;

  # Row label is the plain hour (0-23), hash key is the two digit hour as found in the input.
  # Formerly the loop counter itself was overwritten with the two digit form,
  # which made the labels come out as 0, 01, 02 .. 23.
  foreach my $hour (0 .. 23)
  {
    $emit->("$hour,") ;

    my $hour_key = sprintf ("%02d", $hour) ;
    foreach my $weekno (@weeknos)
    { $emit->($calc_mean_gap->($totgap {"$weekno,$hour_key"}, $events {"$weekno,$hour_key"}) . ",") ; }

    $emit->("\n") ;
  }

  $emit->("all day,") ;
  foreach my $weekno (@weeknos)
  { $emit->($calc_mean_gap->($totgap_allday {$weekno}, $events_allday {$weekno}) . ",") ; }

  # the following yields (month, avg gap)
  # 4: 1241 so assume this factor for full April: 1,000,000 / 1241 gap = x msgs, too short: y msgs = 1000 - x
  # 5: 1310
  # 6: 1328
  # 7: 1470 so assume this factor for 22.5/days for July
  $emit->("\n\n") ;
  foreach my $month (sort {$a <=> $b} keys %months)
  {
    $emit->("month $month,") ;
    $emit->($calc_mean_gap->($totgap_allmonth {$month}, $events_allmonth {$month}) . "\n") ;
  }

  close ($csv_out) || warn "ProcessSquidSequenceNumbers: unable to complete '$file_csv_out': $!\n" ;
}
| 995 | + |
| 996 | + |
| 997 | +# how to detect page saves: |
| 998 | +# henbane /a/log/vu.awk: (see also Domasz' webstats collector) |
| 999 | +# |
| 1000 | +# function savemark(url, code) { |
| 1001 | +# if (url ~ /action=submit$/ && code == "TCP_MISS/302") |
| 1002 | +# return "save" |
| 1003 | +# return "-" |
| 1004 | +# } |
| 1005 | + |
| 1006 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/tools/counter/ |
| 1007 | +# http://leuksman.com/log/2007/06/07/wikimedia-page-views/ |
| 1008 | +# http://www.iplists.com/ |
| 1009 | +# WHOIS http://ws.arin.net/whois/?queryinput=N%20.%20GOOGLE |
| 1010 | +# WHOIS http://tools.whois.net/index.php?fuseaction=whois.whoisbyipresults |
| 1011 | +# http://en.wikipedia.org/wiki/List_of_search_engines |
| 1012 | + |
| 1013 | +# http://en.wikipedia.org/wiki/User_agent |
| 1014 | +# http://www.texsoft.it/index.php?c=software&m=sw.php.useragent&l=it |
| 1015 | +# http://www.hyperborea.org/journal/archives/2004/06/19/whats-in-a-user-agent-string/ |
| 1016 | + |
| 1017 | +# Funwebproducts |
| 1018 | +# No fun with funwebproducts http://www.networkworld.com/newsletters/web/2003/1208web2.html |
| 1019 | + |
| 1020 | +# SLCC |
| 1021 | +# Nice and easy. SLCC1 stands for Secure Licensing Commerce Client version 1.0. SLCC is the service responsible for the Windows Anytime upgrade process present in Vista and Server 2008 which allows you to upgrade Vista Home Basic to Vista Ultimate Edition, or Server 2008 Standard to Server 2008 Enterprise ad-hoc. |
| 1022 | +# SLCC is present in the browser identifier tag, the User Agent, in order to allow Microsoft update servers to offer you the tantalising and irresistible promise of an even more resource heavy version of Vista! |
| 1023 | +# J2ME |
| 1024 | +# Java 2 Micro Edition |
| 1025 | + |
| 1026 | +# Chrome Safari |
| 1027 | +# http://www.neowin.net/news/main/09/02/01/chrome-masks-as-safari-to-fool-windows-live-mail |
| 1028 | + |
| 1029 | +# Danger Hiptop |
| 1030 | +# http://en.wikipedia.org/wiki/Danger_Hiptop |
| 1031 | + |
Index: trunk/wikistats/squids/SquidReportArchive.sh |
— | — | @@ -0,0 +1,10 @@ |
| 2 | +#! /bin/sh
| 3 | +ulimit -v 4000000 # cap the virtual memory available to the jobs started below
| 4 | +home="/a/ezachte" # scripts live here; reports are read from and archived to this folder
| 5 | +# perl $home/SquidReportArchive.pl -m 201007 > SquidReportArchiveLog.txt
| 6 | +# after further automating SquidScanCountries.sh:
| 7 | +perl "$home/SquidReportArchive.pl" -c 201101 >> SquidReportArchiveLog.txt # -c for per country reports
| 8 | +perl "$home/SquidReportArchive.pl" -m 201101 >> SquidReportArchiveLog.txt # -m for the monthly reports
| 9 | +tar -cf reports.tar "$home"/*.htm &&
| 10 | +bzip2 -f reports.tar &&
| 11 | +mv reports.tar.bz2 "$home"
Property changes on: trunk/wikistats/squids/SquidReportArchive.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 12 | + native |
Index: trunk/wikistats/squids/SquidReportArchive.pl |
— | — | @@ -0,0 +1,6265 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | + use lib "/home/ezachte/lib" ; |
| 5 | + use EzLib ; |
| 6 | + $trace_on_exit = $true ; |
| 7 | + ez_lib_version (2) ; |
| 8 | + |
| 9 | +# $quarter_only = '2010 Q3' ; # if not empty filter process for this quarter only |
| 10 | + |
| 11 | + # set defaults mainly for tests on local machine |
| 12 | +# default_argv "-m 201009 " ; |
| 13 | + default_argv "-c " ; |
| 14 | + |
| 15 | +# $html = "<html><body bgcolor=black><table>" ; |
| 16 | +# for ($i = 4 ; $i >= 0 ; $i-=0.5) |
| 17 | +# { |
| 18 | +# ($requests,$ratio,$fill) = RatioAndFillColor1 ('',$i,4, $ratio_sqrt) ; |
| 19 | +# print sprintf ("%.1f",$i) . ": $fill\n" ; |
| 20 | +# $i2 = sprintf ("%0.1f", $i) ; |
| 21 | +# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15> </td><td width=50 style=\"background:$fill\"> </td><td width=15> </td><td><font color=grey> $fill</font></td></tr>" ; |
| 22 | +# } |
| 23 | +# $html .= "<tr><td height=30 colspan=99> </td></tr>" ; |
| 24 | +# for ($i = 4 ; $i >= 0 ; $i-=0.5) |
| 25 | +# { |
| 26 | +# ($requests,$ratio,$fill) = RatioAndFillColor2 ('',$i,4, $ratio_sqrt) ; |
| 27 | +# print sprintf ("%.1f",$i) . ": $fill\n" ; |
| 28 | +# $i2 = sprintf ("%0.1f", $i) ; |
| 29 | +# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15> </td><td width=50 style=\"background:$fill\"> </td><td width=15> </td><td><font color=grey> $fill</font></td></tr>" ; |
| 30 | +# } |
| 31 | +# $html .= "</table><body></html>" ; |
| 32 | +# open HTML, '>', 'color_range2.html' ; |
| 33 | +# print HTML $html ; |
| 34 | +# close HTML ; |
| 35 | +# exit ; |
| 36 | + |
| 37 | +#sub RatioAndFillColor1 |
| 38 | +#{ |
| 39 | +# my ($code, $requests,$requests_max) = @_ ; |
| 40 | +# my ($ratio,$green,$red,$blue,$fill) ; |
| 41 | + |
| 42 | +# if ($requests > $requests_max) |
| 43 | +# { $requests = $requests_max ; } |
| 44 | + |
| 45 | +# $ratio = sqrt ($requests / $requests_max) ; |
| 46 | +# if ($ratio >= 0.20) |
| 47 | +# { |
| 48 | +# $green = 180 ; |
| 49 | +# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ; |
| 50 | +# $blue = int ($green / 3) ; |
| 51 | +# } |
| 52 | +# else |
| 53 | +# { |
| 54 | +# $red = 220 ; |
| 55 | +# $green = int (0.5 + 220 * 5 * $ratio) ; |
| 56 | +# $blue = 0 ; #int ($green / 2) ; |
| 57 | +# } |
| 58 | + |
| 59 | +# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ; |
| 60 | +# $fill = lc hsv2rgb($ratio*150,0.67+$ratio*0.33,0.8-0.2*$ratio) ; |
| 61 | + |
| 62 | +# $fills {lc $code} = $fill ; |
| 63 | +# return ($requests,$ratio,$fill) ; |
| 64 | +#} |
| 65 | + |
| 66 | +#sub RatioAndFillColor2 |
| 67 | +#{ |
| 68 | +# my ($code, $requests,$requests_max) = @_ ; |
| 69 | +# my ($ratio,$green,$red,$blue,$fill) ; |
| 70 | + |
| 71 | +# if ($requests > $requests_max) |
| 72 | +# { $requests = $requests_max ; } |
| 73 | + |
| 74 | +# $ratio = $requests / $requests_max ; |
| 75 | +# if ($ratio >= 0.20) |
| 76 | +# { |
| 77 | +# $green = 180 ; |
| 78 | +# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ; |
| 79 | +# $blue = int ($green / 3) ; |
| 80 | +# } |
| 81 | +# else |
| 82 | +# { |
| 83 | +# $red = 220 ; |
| 84 | +# $green = int (0.5 + 220 * 5 * $ratio) ; |
| 85 | +# $blue = 0 ; #int ($green / 2) ; |
| 86 | +# } |
| 87 | + |
| 88 | +# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ; |
| 89 | +# $fill = lc hsv2rgb($ratio*150,1-$ratio*0.334,0.6) ; |
| 90 | + |
| 91 | +# $fills {lc $code} = $fill ; |
| 92 | +# return ($requests,$ratio,$fill) ; |
| 93 | +#} |
| 94 | + |
| 95 | +# to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs |
| 96 | +# ReportOrigin how to handle '!error <-> other |
| 97 | +# SquidReportOrigins.htm total count<->alpha are not the same (+ skip total for "google (total)") |
| 98 | +# SquidReportOrigins.htm totals google don't match ReportMimeTypes |
| 99 | +# SquidReportOrigins.htm show 'internal' the same way as is done for mime types
| 100 | + |
| 101 | +# cater for missing files -> different multiplier |
| 102 | +# csv file google bot hits per hour -> Stu |
| 103 | +# report for edit/submit |
| 104 | +# log.txt s -> date folder |
| 105 | + |
| 106 | +# http://www.linux.com/community/blogs/Convert-a-.svg-file-to-a-.png-in-Ubuntu.html |
| 107 | + |
| 108 | +# use CGI::Carp qw(fatalsToBrowser); |
| 109 | +# use Getopt::Std ; |
| 110 | + use Time::Local ; |
| 111 | + use Cwd; |
| 112 | + |
| 113 | + $ratio_sqrt = $true ;
| 114 | + $ratio_linear = $false ;
| 115 | + # "dm": -d and -m take a value; -w and -c are only tested with defined () below (getopt presumably comes from Getopt::Std via EzLib - TODO confirm)
| 116 | + getopt ("dm", \%options) ;
| 117 | + # choose the data root: presence of /a/squid selects /a/ezachte, host 'bayes' its own folder, anything else a local test folder
| 118 | + if (-d "/a/squid")
| 119 | + {
| 120 | + print "\n\nJob runs on server $hostname\n\n" ;
| 121 | + $path_root = "/a/ezachte" ;
| 122 | + }
| 123 | + elsif ($hostname eq 'bayes')
| 124 | + {
| 125 | + print "\n\nJob runs on server $hostname\n\n" ;
| 126 | + $path_root = "/home/ezachte/wikistats/animation" ;
| 127 | + }
| 128 | + else
| 129 | + {
| 130 | + print "Job runs local for tests\n\n" ;
| 131 | + $path_root = "W:/! Perl/Squids/Archive/test5" ;
| 132 | + }
| 133 | + $path_in = $path_root ; # input and output currently share the same root
| 134 | + $path_out = $path_root ;
| 135 | +
| 136 | + print "Path root = $path_root\n" ;
| 137 | +
| 138 | + # periodically harvest updated metrics from
| 139 | + # 'http://en.wikipedia.org/wiki/List_of_countries_by_population'
| 140 | + # 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'
| 141 | + if (defined ($options {"w"}))
| 142 | + { &ReadWikipedia ; exit ; }
| 143 | + # -c switches the job to the per country reports (handled further down, before the monthly reports)
| 144 | + if (defined ($options {"c"}))
| 145 | + { $reportcountries = $true ; }
| 146 | + |
| 147 | + # date range used to be read from csv file with ReadDate, now there are daily csv files |
| 148 | + # if earlier methods still is useful it needs to be tweaked |
| 149 | +# if (($reportmonth ne "") && ($reportmonth !~ /^\d{6}$/)) |
| 150 | + |
| 151 | + &InitProjectNames ;
| 152 | + # per country mode: write the 'Saves' and 'Views' country reports, then stop (the monthly reports below are not produced in this mode)
| 153 | + if ($reportcountries)
| 154 | + {
| 155 | + $project_mode = "wp" ;
| 156 | +
| 157 | + $file_csv_country_codes = "CountryCodes.csv" ;
| 158 | + $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
| 159 | +
| 160 | + &ReadInputCountriesNames ;
| 161 | + &ReadInputCountriesMeta ;
| 162 | +
| 163 | + &CollectRegionCounts ;
| 164 | +
| 165 | + &ReportCountries ('Saves');
| 166 | + &ReportCountries ('Views');
| 167 | +
| 168 | + exit ;
| 169 | + }
| 170 | + # monthly mode: the month comes from -m yyyymm, or is derived from -d -[days] relative to now
| 171 | + $reportdaysback = $options {"d"} ;
| 172 | + $reportmonth = $options {"m"} ;
| 173 | + # neither option valid: print usage and stop
| 174 | + if (($reportmonth !~ /^\d{6}$/) && ($reportdaysback !~ /^-\d+/))
| 175 | + { print "Specify month as -m yyyymm or days back as -d -[days] (e.g. -d -1 for yesterday)" ; exit ; }
| 176 | + # normalize to 'yyyy-mm'; -m takes precedence over -d when both are valid
| 177 | + if ($reportmonth =~ /^\d{6}$/)
| 178 | + { $reportmonth = substr ($reportmonth,0,4) . "-" . substr ($reportmonth,4,2) ; }
| 179 | + else
| 180 | + {
| 181 | + ($sec,$min,$hour,$day,$month,$year) = localtime (time+$reportdaysback*86400) ; # $reportdaysback is negative, so this moves back in time
| 182 | + $reportmonth = sprintf ("%04d-%02d",$year+1900,$month+1) ;
| 183 | + }
| 184 | + print "Report month = $reportmonth\n" ;
| 185 | + |
| 186 | + $threshold_mime = 0 ;
| 187 | + $threshold_project = 10 ;
| 188 | +
| 189 | + $file_log = "WikiReportsSampledVisitorsLog.log" ;
| 190 | + # names of the html reports; PrepHtml uses most of them to build the navigation links
| 191 | + $file_html_crawlers = "SquidReportCrawlers.htm" ;
| 192 | + $file_html_methods = "SquidReportMethods.htm" ;
| 193 | + $file_html_origins = "SquidReportOrigins.htm" ;
| 194 | + $file_html_opsys = "SquidReportOperatingSystems.htm" ;
| 195 | + $file_html_scripts = "SquidReportScripts.htm" ;
| 196 | + $file_html_skins = "SquidReportSkins.htm" ;
| 197 | + $file_html_requests = "SquidReportRequests.htm" ;
| 198 | + $file_html_google = "SquidReportGoogle.htm" ;
| 199 | + $file_html_clients = "SquidReportClients.htm" ;
| 200 | + |
| 201 | +# names till 2010-07-01 |
| 202 | +# |
| 203 | +# $file_csv_crawlers = "SquidDataCrawlers.csv" ; |
| 204 | +# $file_csv_methods = "SquidDataMethods.csv" ; |
| 205 | +# $file_csv_origins = "SquidDataOrigins.csv" ; |
| 206 | +# $file_csv_opsys = "SquidDataOpSys.csv" ; |
| 207 | +# $file_csv_requests = "SquidDataRequests.csv" ; |
| 208 | +# $file_csv_scripts = "SquidDataScripts.csv" ; |
| 209 | +# $file_csv_google = "SquidDataSearch.csv" ; |
| 210 | +# $file_csv_skins = "SquidDataSkins.csv" ; |
| 211 | +# $file_csv_clients = "SquidDataClients.csv" ; |
| 212 | +# $file_csv_google_bots = "SquidDataGoogleBots.csv" ; |
| 213 | +# $file_csv_indexphp = "SquidDataIndexPhp.csv" ; |
| 214 | +# $file_csv_countries_languages_visited = "SquidDataCountriesLanguagesVisited.csv" ; |
| 215 | +# $file_csv_countries_timed = "SquidDataCountriesTimed.csv" ; |
| 216 | +# $file_csv_browser_languages = "SquidDataLanguages.csv" ; |
| 217 | + |
| 218 | + $file_csv_crawlers = "public/SquidDataCrawlers.csv" ; # csv names are relative to a daily folder (they are opened as "$dir_process/$file_csv_...")
| 219 | + $file_csv_methods = "public/SquidDataMethods.csv" ;
| 220 | + $file_csv_origins = "public/SquidDataOrigins.csv" ;
| 221 | + $file_csv_opsys = "public/SquidDataOpSys.csv" ;
| 222 | + $file_csv_requests = "public/SquidDataRequests.csv" ;
| 223 | + $file_csv_scripts = "public/SquidDataScripts.csv" ;
| 224 | + $file_csv_google = "public/SquidDataSearch.csv" ;
| 225 | + $file_csv_skins = "public/SquidDataSkins.csv" ;
| 226 | + $file_csv_clients = "public/SquidDataClients.csv" ;
| 227 | + $file_csv_google_bots = "public/SquidDataGoogleBots.csv" ;
| 228 | + $file_csv_indexphp = "public/SquidDataIndexPhp.csv" ;
| 229 | + $file_csv_countries_languages_visited = "public/SquidDataCountriesViews.csv" ;
| 230 | + $file_csv_countries_timed = "public/SquidDataCountriesViewsTimed.csv" ;
| 231 | + $file_csv_browser_languages = "public/SquidDataLanguages.csv" ;
| 232 | +
| 233 | + print "\n\nJob SquidReportArchive.pl\n\n" ;
| 234 | + |
| 235 | +# if (! -d "/a/squid") |
| 236 | +# { |
| 237 | +# if (! -e $file_csv_requests) { $file_csv_requests =~ s/\./Test./ } |
| 238 | +# if (! -e $file_csv_methods) { $file_csv_methods =~ s/\./Test./ } |
| 239 | +# if (! -e $file_csv_skins) { $file_csv_skins =~ s/\./Test./ } |
| 240 | +# if (! -e $file_csv_scripts) { $file_csv_scripts =~ s/\./Test./ } |
| 241 | +# if (! -e $file_csv_opsys) { $file_csv_opsys =~ s/\./Test./ } |
| 242 | +# if (! -e $file_csv_origins) { $file_csv_origins =~ s/\./Test./ } |
| 243 | +# if (! -e $file_csv_google) { $file_csv_google =~ s/\./Test./ } |
| 244 | +# if (! -e $file_csv_crawlers) { $file_csv_crawlers =~ s/\./Test./ } |
| 245 | +# } |
| 246 | + |
| 247 | + if (! -d "$path_root/$reportmonth")
| 248 | + { print "Directory not found: $path_root\/$reportmonth\n" ; exit ; }
| 249 | + # collect the daily folders of the report month that are complete (contain a '#Ready' marker file)
| 250 | +# for ($month = 4 ; $month <= 10 ; $month ++)
| 251 | +# {
| 252 | +# $reportmonth = "2009-" . sprintf ("%02d", $month) ;
| 253 | + # note: days 1..31 are tried for every month, so dates that do not exist are reported as 'Missing dir'
| 254 | + for ($day = 1 ; $day <= 31 ; $day ++)
| 255 | + {
| 256 | +# last if ($month == 10) && ($day > 24) # temp code stay with DST summer time zone for SV
| 257 | +
| 258 | + $date = $reportmonth . "-". sprintf ("%02d", $day) ;
| 259 | + $dir = "$path_root/$reportmonth/$date" ;
| 260 | +
| 261 | + if (-d $dir)
| 262 | + {
| 263 | + if (-e "$dir/#Ready")
| 264 | + {
| 265 | + if ($date_first eq "")
| 266 | + { $date_first = $date ; }
| 267 | + $date_last = $date ; # $date_first / $date_last end up as the first and last complete day; SetPeriod reads both
| 268 | + print "Process dir $dir\n" ;
| 269 | + push @dirs_process, $dir ;
| 270 | + }
| 271 | + else
| 272 | + { print "Empty or incomplete dir $dir!\n" ; }
| 273 | + }
| 274 | + else
| 275 | + { print "Missing dir $dir!\n" ; }
| 276 | + }
| 277 | +# }
| 278 | + if ($#dirs_process < 0)
| 279 | + { print "No valid data to process.\n" ; exit ; }
| 280 | + |
| 281 | + $dir_reports = "$path_root/$reportmonth" ;
| 282 | +
| 283 | + $google_ip_ranges = "<b>IP ranges:</b> known ip ranges for Google are 64.233.[160.0-191.255], 66.249.[64.0-95.255], 66.102.[0.0-15.255], 72.14.[192.0-255.255], <br>74.125.[0.0-255.255], " .
| 284 | + "209.085.[128.0-255.255], 216.239.[32.0-63.255] and a few minor other subranges</small><p>\n" ;
| 285 | + # PrepHtml must run before SetPeriod: SetPeriod substitutes the DATE placeholder in the $header that PrepHtml builds
| 286 | + &OpenLog ;
| 287 | + &PrepHtml ;
| 288 | + &SetPeriod ; # now date range derived from which folders found
| 289 | +
| 290 | +# &ReadDate ; date range was read from csv file
| 291 | + # read all input files once per complete daily folder; the readers take the folder from global $dir_process
| 292 | + foreach $dir_process (@dirs_process)
| 293 | + {
| 294 | + $days_input_found ++ ;
| 295 | +
| 296 | + &ReadInputClients ;
| 297 | + &ReadInputCrawlers ;
| 298 | + &ReadInputMethods ;
| 299 | + &ReadInputMimeTypes ;
| 300 | + &ReadInputOpSys ;
| 301 | + &ReadInputOrigins ;
| 302 | + &ReadInputScripts ;
| 303 | + &ReadInputGoogle ;
| 304 | + &ReadInputSkins ;
| 305 | + &ReadInputIndexPhp ;
| 306 | + &ReadInputBrowserLanguages ;
| 307 | +# &ReadInputCountriesTimed ;
| 308 | + }
| 309 | +
| 310 | +#&ReadCountryCodes ;
| 311 | + # this replaces the $multiplier set by SetPeriod: daily average = total / number of days actually found
| 312 | + print "\nDays input = $days_input_found\n" ;
| 313 | + $multiplier = 1 / $days_input_found ;
| 314 | + print "\nMultiplier = " . sprintf ("%.4f", $multiplier) . "\n" ;
| 315 | + |
| 316 | +#&WriteCsvCountriesTimed ; |
| 317 | +#&WriteCsvCountriesGoTo ; |
| 318 | +#exit ; |
| 319 | + |
| 320 | + foreach $key (keys_sorted_alpha_desc %edit_submit) # diagnostic dump of the edit/submit counters to stdout
| 321 | + { print "YYY " . sprintf ("%5d", $edit_submit {$key}) . ": $key\n" ; }
| 322 | +
| 323 | + foreach $total (keys_sorted_by_value_num_desc %edit_submits)
| 324 | + { print "total $total: ${edit_submits {$total}} \n" ; }
| 325 | +
| 326 | + print "\n\n" ;
| 327 | +
| 328 | + # per domain breakdown, highest totals first; stops at the first domain with a total below 100
| 329 | + foreach $domain (keys_sorted_by_value_num_desc %edit_submit_bot_sort)
| 330 | + {
| 331 | + $cnt = $edit_submit_bot_sort {$domain} ;
| 332 | +
| 333 | + last if $cnt < 100 ;
| 334 | +
| 335 | + print "DOMAIN $domain total $cnt\n" ;
| 336 | + foreach $key (sort keys %{$edit_submit_bot {$domain}})
| 337 | + { print sprintf ("%5d", $edit_submit_bot {$domain} {$key}) . ": $key\n" ; }
| 338 | + # { print "$key: ${edit_submit_bot {$domain} {$key}}, " ; }
| 339 | + print "\n" ;
| 340 | + }
| 341 | + print "\n\n" ;
| 342 | + foreach $agent (keys_sorted_by_value_num_desc %edit_submit_bot_agent_sort) # same breakdown per agent, with a cut-off at a total of 25
| 343 | + {
| 344 | + $cnt = $edit_submit_bot_agent_sort {$agent} ;
| 345 | +
| 346 | + last if $cnt < 25 ;
| 347 | +
| 348 | + print "AGENT $agent total $cnt\n" ;
| 349 | + foreach $key (sort keys %{$edit_submit_bot_agent {$agent}})
| 350 | + { print sprintf ("%5d", $edit_submit_bot_agent {$agent} {$key}) . ": $key\n" ; }
| 351 | + # { print "$key: ${edit_submit_bot {$domain} {$key}}, " ; }
| 352 | + print "\n" ;
| 353 | + }
| 354 | + |
| 355 | + |
| 356 | + |
| 357 | +# foreach $key (keys_sorted_by_value_num_desc %edit_submit_bot_agent) |
| 358 | +# { print "AGENT: " .sprintf ("%5d", $edit_submit_bot_agent {$key}) . ": $key\n" ; } |
| 359 | +# print "\n\n" ; |
| 360 | +# foreach $key (keys_sorted_by_value_num_desc %edit_submit_subparms) |
| 361 | +# { |
| 362 | +# $count = $edit_submit_subparms {$key} ; |
| 363 | +# |
| 364 | +# last if $count < 5 ; |
| 365 | +# |
| 366 | +# ($subparm, $referer) = split (',', $key) ; |
| 367 | +# print "ZZZ " . sprintf ("%5d", $count) . ": $referer, $subparm\n" ; |
| 368 | +# } |
| 369 | + &CalcPercentages ; # post-process the collected counts before any report is written
| 370 | + &NormalizeCounts ;
| 371 | + &SortCounts ;
| 372 | + # one html report per subject
| 373 | + &WriteReportClients ;
| 374 | + &WriteReportCrawlers ;
| 375 | +
| 376 | + &WriteReportMethods ;
| 377 | + &WriteReportMimeTypes ;
| 378 | + &WriteReportOpSys ;
| 379 | + &WriteReportOrigins ;
| 380 | + &WriteReportScripts ;
| 381 | + &WriteReportGoogle ;
| 382 | + &WriteReportSkins ;
| 383 | + &WriteCsvGoogleBots ;
| 384 | + &WriteCsvBrowserLanguages ;
| 385 | +
| 386 | +# &WriteCsvCountriesTimed ;
| 387 | +# &WriteCsvCountriesTargets ;
| 388 | + close "FILE_LOG" ; # handle is named by string here; presumably the log opened by OpenLog - TODO confirm
| 389 | + print "\nReady\n\n" ;
| 390 | + |
| 391 | + if (-d "/a/squid") # only when /a/squid exists: bundle the html reports of this month into a bzip2-ed tar
| 392 | + {
| 393 | +# $cmd = "tar -cf $dir_reports/$date_last\-csv.tar $dir_reports_in/*.csv | bzip2 $dir_reports/$date_last\-csv.tar" ;
| 394 | +# print "cmd = '$cmd'\n" ;
| 395 | +# `$cmd` ;
| 396 | + $cmd = "tar -cf $dir_reports/$reportmonth\-html.tar $dir_reports/*.htm && bzip2 -f $dir_reports/$reportmonth\-html.tar" ; # '&&' (not a pipe): compress only after tar has finished successfully; -f replaces the archive left by an earlier run
| 397 | + print "cmd = '$cmd'\n" ;
| 398 | + `$cmd` ;
| 399 | + }
| 400 | + |
| 401 | + exit ; |
| 402 | + |
| 403 | +sub ReportCountries # writes the per country / per language reports for one mode: 'Views' = page views, any other value = page edits (saves)
| 404 | +{
| 405 | + my $mode_report = shift ;
| 406 | + # $selection is used in report file names, $selection2 in input file names, $views_edits in titles and links
| 407 | + if ($mode_report eq 'Views')
| 408 | + {
| 409 | + $selection = 'PageViews' ;
| 410 | + $selection2 = 'Visits' ;
| 411 | + $views_edits = 'Page Views' ;
| 412 | + }
| 413 | + else
| 414 | + {
| 415 | + $selection = 'PageEdits' ;
| 416 | + $selection2 = 'Saves' ;
| 417 | + $views_edits = 'Page Edits' ;
| 418 | + }
| 419 | +
| 420 | + ($quarter_only2 = $quarter_only) =~ s/ // ; # copy of $quarter_only without its first space, used as suffix in the overview file name
| 421 | +
| 422 | + $file_csv_squid_counts_monthly = "SquidData${selection2}PerCountryMonthly.csv" ; # LockePrev.csv" ;
| 423 | + $file_csv_squid_counts_daily = "SquidData${selection2}PerCountryDaily.csv" ;
| 424 | +
| 425 | + $file_html_per_country_breakdown = "SquidReport${selection}PerCountryBreakdown.htm" ;
| 426 | + $file_html_per_country_breakdown_huge = "SquidReport${selection}PerCountryBreakdownHuge.htm" ;
| 427 | + $file_html_per_country_overview = "SquidReport${selection}PerCountryOverview$quarter_only2.htm" ;
| 428 | + $file_html_per_country_trends = "SquidReport${selection}PerCountryTrends.htm" ;
| 429 | + $file_html_per_language_breakdown = "SquidReport${selection}PerLanguageBreakdown.htm" ;
| 430 | + $file_csv_per_country_overview = "SquidReport${selection}PerCountryOverview.csv" ;
| 431 | + # both input files must exist in $path_in, otherwise the job aborts
| 432 | + $path_csv_squid_counts_monthly = "$path_in/$file_csv_squid_counts_monthly" ;
| 433 | + if (! -e $path_csv_squid_counts_monthly) { abort ("Input file $path_csv_squid_counts_monthly not found!") ; }
| 434 | + $path_csv_squid_counts_daily = "$path_in/$file_csv_squid_counts_daily" ;
| 435 | + if (! -e $path_csv_squid_counts_daily) { abort ("Input file $path_csv_squid_counts_daily not found!") ; }
| 436 | +
| 437 | + &ReadInputCountriesMonthly ($project_mode) ;
| 438 | + &ReadInputCountriesDaily ($project_mode) ;
| 439 | +
| 440 | +# foreach $week (sort {$a <=> $b} keys %changes_per_week_per_country_code)
| 441 | +# { &WriteCsvSvgFilePerCountryOverview ($views_edits, $week, \%changes_per_week_per_country_code, 200, "Wikipedia " . lc $views_edits . ", weekly trend") } ;
| 442 | +
| 443 | +# foreach $week (sort {$a <=> $b} keys %requests_per_week_per_country_code)
| 444 | +# { &WriteCsvSvgFilePerCountryOverview ($views_edits, $week, \%requests_per_week_per_country_code, $max_requests_per_connected_us_week, "Wikipedia " . lc $views_edits . " per person") } ;
| 445 | +# foreach $yyyymm (sort keys %yyyymm_)
| 446 | +# { &WriteCsvSvgFilePerCountryOverview ($views_edits, $yyyymm, \%requests_per_month_per_country_code, $max_requests_per_connected_us_month, "Wikipedia " . lc $views_edits . " per person") } ;
| 447 | +
| 448 | + &PrepHtml ;
| 449 | +
| 450 | +# $comment = "<p> See also: <a href='SquidReportTrafficPerCountry.htm'>Wikipedia $views_edits per Country</a> / <a href='SquidReportLanguagesVisitedDetailed.htm'>Breakdown per Country of Wikipedia's Visited (detailed)</a> / <a href='SquidReportTrafficPerWikipediaOverview.htm'>Breakdown per Wikipedia of Requesting Countries</a>" ;
| 451 | +
| 452 | + $title_main = "Wikimedia Traffic Analysis Report" ;
| 453 | + # cross links between the reports; each writer gets them via UnLink with its own index (presumably so a page does not link to itself - TODO confirm in UnLink)
| 454 | + $links = "<p> Also: <b>$views_edits Per Country</b> - " .
| 455 | + "<a href='$file_html_per_country_overview'>Overview</a> / " .
| 456 | + "<a href='$file_html_per_country_breakdown'>Breakdown</a> / " .
| 457 | + "<a href='$file_html_per_country_trends'>Trends</a>, " .
| 458 | + "<b>$views_edits Per Wikipedia Language - </b> " .
| 459 | + "<a href='$file_html_per_language_breakdown'>Breakdown</a>" ;
| 460 | +
| 461 | + $title = "$title_main - Wikipedia $views_edits Per Country - Overview" ;
| 462 | + &WriteReportPerCountryOverview ($title, $views_edits, &UnLink ($links,1)) ; ;
| 463 | + # the breakdown is written twice with different cut-offs; the assignments inside the argument lists also set the globals of those names
| 464 | + $title = "$title_main - Wikipedia $views_edits Per Country - Breakdown" ;
| 465 | + &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 100, $cutoff_percentage = 1, $show_logcount = $false) ;
| 466 | + &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 10, $cutoff_percentage = 0.1, $show_logcount = $true) ;
| 467 | +
| 468 | + $title = "$title_main - Wikipedia $views_edits Per Country - Trends" ;
| 469 | + &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,3)) ;
| 470 | +
| 471 | + $links =~ s/,.*$// ; # drop everything from the first comma on, i.e. the 'Per Wikipedia Language' part of the links
| 472 | + $title = "$title_main - $views_edits Per Wikipedia Language - Breakdown" ;
| 473 | + &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,4)) ;
| 474 | +}
| 475 | + |
| 476 | +sub ReadDate # legacy: derives sample period, $timespan and daily $multiplier from the date range on line 1 of "$dir_process/$file_csv_crawlers"
| 477 | +{
| 478 | + open CSV_CRAWLERS, '<', "$dir_process/$file_csv_crawlers" or abort ("Function ReadDate: file $dir_process/$file_csv_crawlers could not be opened: $!\n") ;
| 479 | + $line = <CSV_CRAWLERS> ;
| 480 | + close CSV_CRAWLERS ;
| 481 | +# print "DATE LINE $line\n" ;
| 482 | + chomp ($line) ;
| 483 | + # take the first two timestamps (yyyy-mm-dd, optionally followed by Thh) straight from the match, so a line without them can never be mistaken for a date range
| 484 | + ($timefrom,$timetill) = ($line =~ /^.*?(\d\d\d\d\-\d\d\-\d\d(?:T\d\d)?).*?(\d\d\d\d\-\d\d\-\d\d(?:T\d\d)?).*$/) ;
| 485 | + if (($timefrom eq "") || ($timetill eq ""))
| 486 | + { abort ("$file_csv_crawlers does not contain valid date range on first line\n") ; }
| 487 | +
| 488 | + $yearfrom = substr ($timefrom,0,4) ;
| 489 | + $monthfrom = substr ($timefrom,5,2) ;
| 490 | + $dayfrom = substr ($timefrom,8,2) ;
| 491 | + $hourfrom = substr ($timefrom,11,2) ; # empty when the timestamp has no Thh part
| 492 | +
| 493 | + $yeartill = substr ($timetill,0,4) ;
| 494 | + $monthtill = substr ($timetill,5,2) ;
| 495 | + $daytill = substr ($timetill,8,2) ;
| 496 | + $hourtill = substr ($timetill,11,2) ;
| 497 | +
| 498 | + $period = sprintf ("%d %s %d %d:00 - %d %s %d %d:00", $dayfrom, month_english_short ($monthfrom-1), $yearfrom, $hourfrom, $daytill, month_english_short ($monthtill-1), $yeartill, $hourtill) ;
| 499 | +
| 500 | + $timefrom = timegm (0,0,$hourfrom,$dayfrom,$monthfrom-1,$yearfrom-1900) ;
| 501 | + $timetill = timegm (0,0,$hourtill,$daytill,$monthtill-1,$yeartill-1900) ;
| 502 | + if ($timetill <= $timefrom) { abort ("$file_csv_crawlers contains an empty or reversed date range: $period\n") ; } # would otherwise divide by zero or yield a negative multiplier
| 503 | + $timespan = ($timetill - $timefrom) / 3600 ; # sample length in hours
| 504 | + $multiplier = (24 * 3600) / ($timetill - $timefrom) ; # factor that scales counts for the sample period to counts per day
| 505 | + print "Multiplier = $multiplier\n" ;
| 506 | + $header =~ s/DATE/Daily averages, based on sample period: $period (yyyy-mm-dd)/ ;
| 507 | +}
| 508 | + |
| 509 | +sub SetPeriod # derives sample period, $timespan (hours) and daily $multiplier from $date_first / $date_last, and fills the DATE placeholder in $header
| 510 | +{
| 511 | + # [offset,length] of year, month and day within a 'yyyy-mm-dd' date
| 512 | + my @date_fields = ([0,4],[5,2],[8,2]) ;
| 513 | + ($year_first,$month_first,$day_first) = map { substr ($date_first,$_->[0],$_->[1]) } @date_fields ;
| 514 | + ($year_last, $month_last, $day_last) = map { substr ($date_last, $_->[0],$_->[1]) } @date_fields ;
| 515 | +
| 516 | + # the period runs from the start of the first day till the end of the last day
| 517 | + my $seconds_per_day = 24 * 3600 ;
| 518 | + $timefrom = timegm (0,0,0,$day_first,$month_first-1,$year_first-1900) ;
| 519 | + $timetill = $seconds_per_day + timegm (0,0,0,$day_last,$month_last-1,$year_last-1900) ; # date_last + 1 day (in seconds)
| 520 | + my $seconds_sampled = $timetill - $timefrom ;
| 521 | +
| 522 | + $timespan = $seconds_sampled / 3600 ;
| 523 | + $multiplier = $seconds_per_day / $seconds_sampled ;
| 524 | +
| 525 | + $period = join (' - ', sprintf ("%d %s %d", $day_first, month_english_short ($month_first-1), $year_first), sprintf ("%d %s %d", $day_last, month_english_short ($month_last-1), $year_last)) ;
| 526 | + $header =~ s/DATE/Daily averages, based on sample period: $period/ ;
| 527 | + print "Sample period: $period => for daily averages multiplier = " . sprintf ("%.2f",$multiplier) . "\n" ;
| 528 | +}
| 529 | + |
| 530 | +sub PrepHtml # (re)builds the shared html fragments: page $header (with placeholders TITLE, HEADER, DATE, X1000, ALSO), $colophon and the navigation $link_... / $dummy_... strings
| 531 | +{
| 532 | + $language = "en" ;
| 533 | + $header = "<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN' 'http://www.w3.org/TR/html4/loose.dtd'>\n" .
| 534 | + "<html lang='en'>\n" .
| 535 | + "<head>\n" .
| 536 | + "<title>TITLE</title>\n" .
| 537 | + "<meta http-equiv='Content-type' content='text/html; charset=iso-8859-1'>\n" .
| 538 | + "<meta name='robots' content='index,follow'>\n" .
| 539 | + "<script language='javascript' type='text/javascript' src='../WikipediaStatistics13.js'></script>\n" .
| 540 | + "<style type='text/css'>\n" .
| 541 | + "<!--\n" .
| 542 | + "body {font-family:arial,sans-serif; font-size:12px }\n" .
| 543 | + "h2 {margin:0px 0px 3px 0px; font-size:18px}\n" .
| 544 | + "table {font-size:12px ;}\n" .
| 545 | + "td {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top}\n" .
| 546 | + "th {white-space:nowrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top ; font-weight:bold}\n" .
| 547 | + "th.small {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:11px ; vertical-align:top ; font-weight:bold}\n" .
| 548 | + "td.hl {text-align:left;}\n" .
| 549 | + "td.hr {text-align:right;}\n" .
| 550 | + "td.r {text-align:right; border: inset 1px #FFFFFF}\n" .
| 551 | + "td.c {text-align:center; border: inset 1px #FFFFFF}\n" .
| 552 | + "td.l {text-align:left; border: inset 1px #FFFFFF}\n" .
| 553 | + "th.c {text-align:center; border: inset 1px #FFFFFF}\n" .
| 554 | + "th.l {text-align:left; border: inset 1px #FFFFFF}\n" .
| 555 | + "th.lh3 {text-align:left; border: inset 1px #FFFFFF ; font-size:14px}\n" .
| 556 | + "a:link { color:blue;text-decoration:none;}\n" .
| 557 | + "a:visited {color:#0000FF;text-decoration:none;}\n" .
| 558 | + "a:active {color:#0000FF;text-decoration:none;}\n" .
| 559 | + "a:hover {color:#FF00FF;text-decoration:underline}\n" .
| 560 | + "-->\n" .
| 561 | + "</style>\n" .
| 562 | + "<body bgcolor='\#FFFFDD'>\n<table width=100%>\n<tr><td class=hl>\n<h2>HEADER</h2>\n<b>DATE</b>\n</td>\n<td class=hr>" .
| 563 | + "<input type='button' value=' Archive ' onclick='window.location=\"http://stats.wikimedia.org/archive/squid_reports\"'> " .
| 564 | + "<input type='button' value=' Wikimedia Statistics ' onclick='window.location=\"http://stats.wikimedia.org\"'>" .
| 565 | + "</td></tr>\n</table><hr>" .
| 566 | + " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ;
| 567 | +
| 568 | + # to be localized some day like any reports
| 569 | + $out_license = "All data and images on this page are in the public domain." ;
| 570 | + $out_generated = "Generated on " ;
| 571 | + $out_author = "Author" ;
| 572 | + $out_mail = "Mail" ;
| 573 | + $out_site = "Web site" ;
| 574 | + $out_myname = "Erik Zachte" ;
| 575 | + $out_mymail = "ezachte@### (no spam: ### = wikimedia.org)" ;
| 576 | + $out_mysite = "http://infodisiac.com/" ;
| 577 | + # page footer; it also closes body and html
| 578 | + $colophon = "<p>\n" .
| 579 | + $out_generated . date_time_english (time) . "\n<br>" .
| 580 | + $out_author . ":" . $out_myname .
| 581 | + " (<a href='" . $out_mysite . "'>" . $out_site . "</a>)\n<br>" .
| 582 | + "$out_mail: $out_mymail<br>\n" .
| 583 | + "$out_license" .
| 584 | + "</small>\n" .
| 585 | + "</body>\n" .
| 586 | + "</html>\n" ;
| 587 | + # navigation entries come in pairs: $dummy_... is plain coloured text, $link_... is the same entry as a hyperlink to the report file
| 588 | + $dummy_requests = "Requests <font color=#808080>by destination</font> or " ;
| 589 | + $dummy_origins = "<font color=#000060>by origin</font>" ;
| 590 | + $dummy_methods = "<font color=#000060>Methods</font>" ;
| 591 | + $dummy_scripts = "<font color=#000060>Scripts</font>" ;
| 592 | + $dummy_skins = "<font color=#000060>Skins</font>" ;
| 593 | + $dummy_crawlers = "<font color=#000060>Crawlers</font>" ;
| 594 | + $dummy_opsys = "<font color=#000060>Op.Sys.</font>" ;
| 595 | + $dummy_browsers = "<font color=#000060>Browsers</font>" ;
| 596 | + $dummy_google = "<font color=#000060>Google</font>" ;
| 597 | +
| 598 | + $link_requests = "Requests <a href='$file_html_requests'>by destination</a> or " ;
| 599 | + $link_origins = "<a href='$file_html_origins'>by origin</a>" ;
| 600 | + $link_methods = "<a href='$file_html_methods'>Methods</a>" ;
| 601 | + $link_scripts = "<a href='$file_html_scripts'>Scripts</a>" ;
| 602 | + $link_skins = "<a href='$file_html_skins'>Skins</a>" ;
| 603 | + $link_crawlers = "<a href='$file_html_crawlers'>Crawlers</a>" ;
| 604 | + $link_opsys = "<a href='$file_html_opsys'>Op.Sys.</a>" ;
| 605 | + $link_browsers = "<a href='$file_html_clients'>Browsers</a>" ;
| 606 | + $link_google = "<a href='$file_html_google'>Google</a>" ;
| 607 | +}
| 608 | + |
| 609 | +sub ReadCountryCodes # fills %country_codes (country code => country name) from "$path_in/$file_csv_country_codes"
| 610 | +{
| 611 | + my $file_csv = "$path_in/$file_csv_country_codes" ;
| 612 | + # abort instead of silently leaving %country_codes empty when the file cannot be read
| 613 | + open CODES, '<', $file_csv or abort ("Function ReadCountryCodes: file $file_csv could not be opened: $!\n") ;
| 614 | + while ($line = <CODES>)
| 615 | + {
| 616 | + next if $line !~ /^[A-Z]/ ; # only lines that start with a capital letter are treated as records
| 617 | + chomp ($line) ;
| 618 | + ($code,$region,$north_south,$name) = split (',',$line,4) ; # limit 4: everything after the third comma belongs to the name
| 619 | + $country_codes {$code} = $name ;
| 620 | + # print "$code => $name\n" ;
| 621 | + }
| 622 | + close CODES ;
| 623 | +}
| 624 | + |
| 625 | +sub ReadInputClients # reads $dir_process/$file_csv_clients: E records -> engines, G records -> client groups, all other records -> clients |
| 626 | +{ |
| 627 | + my $file_csv = "$dir_process/$file_csv_clients" ; |
| 628 | + if (! -e $file_csv) |
| 629 | + { abort ("Function ReadInputClients: file $file_csv not found!!!") ; } |
| 630 | + open CSV_CLIENTS, '<', $file_csv or abort ("Function ReadInputClients: file $file_csv could not be opened: $!") ; |
| 631 | + |
| 632 | + while ($line = <CSV_CLIENTS>) |
| 633 | + { |
| 634 | + next if $line =~ /^#/ ; # comments |
| 635 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 636 | + |
| 637 | + chomp ($line) ; |
| 638 | + |
| 639 | + if ($line =~ /^E/) # engine record: rectype,engine,count |
| 640 | + { |
| 641 | + ($rectype, $engine, $count) = split (',', $line) ; |
| 642 | + |
| 643 | + next if ($engine !~ /^Gecko/) && ($engine !~ /^AppleWebKit/) ; # only Gecko and AppleWebKit engines are tallied |
| 644 | + |
| 645 | + if ($engine !~ / \d/) # no ' <digit>' yet: turn first '/' into a space |
| 646 | + { $engine =~ s/\// / ; } |
| 647 | + |
| 648 | + if ($engine =~ /AppleWebKit/) |
| 649 | + { |
| 650 | + $engine =~ s/AppleWebKit\//AppleWebKit / ; # fix |
| 651 | + $engine =~ s/Safari\/\d+/Safari/ ; # fix input |
| 652 | + $engine =~ s/(?:|iPad|iPod|iPhone) Mozilla.*$/iPod)/i ; # fix input |
| 653 | + ($engine2 = $engine) =~ s/\s*\/?\d\d\d// ; # copy with the first three digit run (and preceding space or slash) removed |
| 654 | + $webkit_engines {$engine2} += $count ; |
| 655 | + |
| 656 | + # $webkit_total_engines {$engine} += $count ; |
| 657 | + } |
| 658 | + |
| 659 | + $engines {$engine} += $count ; |
| 660 | + |
| 661 | + $engine =~ s/\/.*$// ; # reduce to the leading word before totalling |
| 662 | + $engine =~ s/ .*$// ; |
| 663 | + $total_engines {$engine} += $count ; |
| 664 | + } |
| 665 | + elsif ($line =~ /^G/) # group record: rectype,mobile,group,count,perc |
| 666 | + { |
| 667 | + ($rectype, $mobile, $group, $count, $perc) = split (',', $line) ; |
| 668 | + $total_clientgroups {$mobile} += $count ; |
| 669 | + |
| 670 | + $group =~ s/^KDDI.*$/KDDI/ ; # collapse model specific names into one name per prefix |
| 671 | + $group =~ s/^MOT.*$/MOT/ ; |
| 672 | + $group =~ s/^LG-.*$/LG/i ; |
| 673 | + $group =~ s/^LGE.*$/LGE/i ; |
| 674 | + $group =~ s/^KWC.*$/KWC/i ; |
| 675 | + $group =~ s/^Nokia.*$/Nokia/i ; |
| 676 | + $group =~ s/^Samsung.*$/Samsung/i ; |
| 677 | + $group =~ s/^Motorola.*$/Motorola/i ; |
| 678 | + $group =~ s/^SonyEricsson.*$/SonyEricsson/i ; |
| 679 | + $group =~ s/^PANTECH.*$/PanTech/i ; |
| 680 | + $group =~ s/^Palm_Pre/Palm Pre/i ; |
| 681 | + $clientgroups {"$mobile,$group"} += $count ; |
| 682 | + } |
| 683 | + else # any other record: rectype,client,count,perc |
| 684 | + { |
| 685 | + ($rectype, $client, $count, $perc) = split (',', $line) ; |
| 686 | + |
| 687 | + $total_clients += $count ; |
| 688 | + $client =~ s/_/./g ; |
| 689 | + $client =~ s/\.\./Other/g ; |
| 690 | + if ($client !~ / \d/) # was '!=~', which Perl parses as '!= ~(match)', a numeric compare instead of a negated match |
| 691 | + { $client =~ s/\// / ; } |
| 692 | + if ($rectype eq "-") { $total_clients_non_mobile += $count ; } |
| 693 | + if ($rectype eq "M") { $total_clients_mobile += $count ; } |
| 694 | + $clients {"$rectype,$client"} += $count ; |
| 695 | + } |
| 696 | + } |
| 697 | + close CSV_CLIENTS ; |
| 698 | + |
| 699 | +# foreach $key (sort keys %clientgroups) |
| 700 | +# { |
| 701 | +# next if $clientgroups {$key} < 50000 ; } |
| 702 | +# next if $key =~ /^M/ ; } |
| 703 | + |
| 704 | +# print "$key:" . $clientgroups {$key} . "\n" ; |
| 705 | +# } |
| 706 | +# print "\n" ; |
| 707 | +# foreach $key (sort keys %total_clientgroups) |
| 708 | +# { |
| 709 | +# print "$key:" . $total_clientgroups {$key} . "\n" ; |
| 710 | +# } |
| 711 | +# print "\n" ; |
| 712 | +} |
| 713 | + |
| 714 | +sub ReadInputCrawlers # reads $dir_process/$file_csv_crawlers (count,mime,agent) into %crawlers, keyed "mime|agent" |
| 715 | +{ |
| 716 | + my $file_csv = "$dir_process/$file_csv_crawlers" ; |
| 717 | + if (! -e $file_csv) |
| 718 | + { abort ("Function ReadInputCrawlers: file $file_csv not found!!!\n") ; } |
| 719 | + open CSV_CRAWLERS, '<', $file_csv or abort ("Function ReadInputCrawlers: file $file_csv could not be opened: $!") ; |
| 720 | + while ($line = <CSV_CRAWLERS>) |
| 721 | + { |
| 722 | + next if $line =~ /^#/ ; # comments |
| 723 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 724 | + |
| 725 | + chomp ($line) ; |
| 726 | + ($count, $mime, $agent) = split (',', $line,3) ; # limit 3: commas inside the agent string stay in $agent |
| 727 | + |
| 728 | + # $mime2 keeps the exact mime type; $mime is generalized to image/.. or text/.. for the hash key |
| 729 | + $mime2 = $mime ; |
| 730 | + $mime =~ s/^image\/.*$/image\/../ ; |
| 731 | + $mime =~ s/^text\/.*$/text\/../ ; |
| 732 | + $agent =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg; # decode %xx escapes |
| 733 | + |
| 734 | + next if $agent =~ /<\s*script\s*>/i ; |
| 735 | + next if $agent =~ /MSIE \d+\.\d+/ ; # most likely false positives |
| 736 | + |
| 737 | + if ($agent =~ /\|Google ip add?ress/) # typo |
| 738 | + { |
| 739 | + $agent =~ s/\|Google ip add?ress// ; |
| 740 | + $agent =~ s/GoogleBot/<b><font color=green>GoogleBot<\/font><\/b>/gi ; |
| 741 | + } |
| 742 | + if ($agent =~ / \|no Google ip address/) |
| 743 | + { |
| 744 | + $agent =~ s/ \|no Google ip address// ; |
| 745 | + $agent =~ s/GoogleBot/<b><font color=red>GoogleBot<\/font><\/b>/gi ; |
| 746 | + } |
| 747 | + if ($agent =~ /www\.teesoft\.info/) # generalize version and language details so these agents collapse into fewer keys |
| 748 | + { |
| 749 | + $agent =~ s/(\((?:X11|Windows|Macintosh);[^;]*;)[^;]*;[^\)]*\)/$1 [lang code]; rv:[..]\)/ ; |
| 750 | + $agent =~ s/Gecko\/\d+/Gecko\/../ ; |
| 751 | + $agent =~ s/Firefox\/\d+\.\d*\.?\d*/Firefox\/../ ; |
| 752 | + $agent =~ s/(Gecko\/\.\.).*?\(http/$1 etc \(http/ ; |
| 753 | + } |
| 754 | + |
| 755 | + $agent =~ s/\+//g ; |
| 756 | +# $agent =~ s/^Mozilla\/\d+\.\d+\s*\(compatible\s*;\s*([^\)]*)\)\s*/$1/ ; # Mozilla/5.0 (compatible; xxx) -> xxx |
| 757 | +# $agent =~ s/^Mozilla\/\d+\.\d+\s*\(\s*([^\)]*)\)\s*/$1/ ; # Mozilla/5.0 (xxx) -> xxx |
| 758 | + $agent =~ s/\((http:.*?feedfetcher.html)[^\)]*\)/($1)/ ; # (http://www.google.com/feedfetcher.html; 1 subscribers; feed-id=1894739019218796495) |
| 759 | + $agent =~ s/FeedFetcher-Google/FeedFetcher-Google/i ; # normalizes the capitalization |
| 760 | + if ($agent !~ /http:/) |
| 761 | + { $agent =~ s/(bot|spider|crawl(?:er)?)/<b>$1<\/b>/gi ; } |
| 762 | + if ($mime2 eq "text/html") |
| 763 | + { $total_page_crawlerrequests += $count ; } |
| 764 | + $crawlers {"$mime|$agent"} += $count ; |
| 765 | + } |
| 766 | + close CSV_CRAWLERS ; |
| 767 | +} |
| 768 | + |
| 769 | +sub ReadInputMethods # reads $dir_process/$file_csv_methods (method,status,count) into %methods and %statusses |
| 770 | +{ |
| 771 | + my $file_csv = "$dir_process/$file_csv_methods" ; |
| 772 | + if (! -e $file_csv) |
| 773 | + { abort ("Function ReadInputMethods: file $file_csv not found!!!") ; } |
| 774 | + open CSV_METHODS, '<', $file_csv or abort ("Function ReadInputMethods: file $file_csv could not be opened: $!") ; |
| 775 | + while ($line = <CSV_METHODS>) |
| 776 | + { |
| 777 | + next if $line =~ /^#/ ; # comments |
| 778 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 779 | + chomp ($line) ; # as in the other readers: keep the newline out of the last field |
| 780 | + ($method, $status, $count) = split (',', $line) ; |
| 781 | + $statusses {"$method,$status"} += $count ; |
| 782 | + $methods {$method} += $count ; |
| 783 | + } |
| 784 | + close CSV_METHODS ; |
| 785 | +} |
| 786 | + |
| 787 | +sub ReadInputMimeTypes # reads $dir_process/$file_csv_requests and tallies request counts per project, domain and mime type |
| 788 | +{ |
| 789 | + my $file_csv = "$dir_process/$file_csv_requests" ; |
| 790 | + if (! -e $file_csv) |
| 791 | + { abort ("Function ReadInputMimeTypes: file $file_csv not found!!!") ; } |
| 792 | + open CSV_REQUESTS, '<', $file_csv or abort ("Function ReadInputMimeTypes: file $file_csv could not be opened: $!") ; |
| 793 | + while ($line = <CSV_REQUESTS>) |
| 794 | + { |
| 795 | + next if $line =~ /^#/ ; # comments |
| 796 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 797 | + |
| 798 | + chomp $line ; |
| 799 | + ($project, $origin, $ext, $mime, $parm, $count) = split (',', $line) ; |
| 800 | + |
| 801 | + $project = &ExpandAbbreviation ($project) ; |
| 802 | + |
| 803 | + $mime =~ s/(\w+\.)(\w+\.)(\w+)/$1$2<br>$3/ ; # insert html line breaks into long mime type names |
| 804 | + $mime =~ s/opensearchdescription/opensearch-<br>description/ ; |
| 805 | + if ($project =~ /\./) # project name with a dot: record is skipped (the assignments before 'next' have no lasting effect) |
| 806 | + { |
| 807 | + $project = '!invalid!' ; |
| 808 | + if ($origin ne "external") |
| 809 | + { $origin = 'internal' ; } |
| 810 | + $ext = ".." ; |
| 811 | + $mime = ".." ; |
| 812 | + next ; |
| 813 | + } |
| 814 | + |
| 815 | + if ($parms eq "") # NOTE(review): tests $parms although the field was read into $parm; neither is used below - confirm intent |
| 816 | + { $parms = " " ; } |
| 817 | + $ext =~ s/^([a-z\[\]]*)[^a-z\[\]].*$/$1/g ; # keep only the leading run of lower case letters and square brackets |
| 818 | + $ext =~ s/\((.*)\)/ ($1.php)/ ; |
| 819 | + if ($project eq $origin) |
| 820 | + { $origin = '⇐' ; } |
| 821 | + |
| 822 | + if ($project ne "upload") |
| 823 | + { $counts_prem {"$project,$origin,$ext,$mime"} += $count ; } # scalar element instead of the one-element slice @counts_prem {..} |
| 824 | + # if ($project ne "upload") |
| 825 | + # { @counts_pm {"$project,$mime"} += $count ; } |
| 826 | + |
| 827 | + $counts_pm {"$project,$mime"} += $count ; |
| 828 | + ($domain = $project) =~ s/\:.*$// ; # domain = project name up to the first colon |
| 829 | + $counts_dm {"$domain,$mime"} += $count ; |
| 830 | + $mimetypes {$mime} += $count ; |
| 831 | + $projects {$project} += $count ; |
| 832 | + $domains {$domain} += $count ; |
| 833 | + |
| 834 | + if ($mime =~ /image\/(?:png|jpeg|gif)/) |
| 835 | + { |
| 836 | + $images_project {$project} += $count ; |
| 837 | + $images_domain {$domain} += $count ; |
| 838 | + } |
| 839 | + $mimetypes_found {$mime} ++ ; |
| 840 | + # @counts_prem {"$project,$origin,$ext,$mime"} += $count ; |
| 841 | + |
| 842 | + $total_mimes += $count ; |
| 843 | + } |
| 844 | + close CSV_REQUESTS ; |
| 845 | + |
| 846 | +# $html .= "<tr><th class=c>counts</th><th class=l>project</th><th class=l>origin</th><th class=l>extension</th><th class=l>mime</th></tr>\n" ; |
| 847 | +# $rows = 0 ; |
| 848 | +# foreach $key (sort keys %counts_prem) |
| 849 | +# { |
| 850 | +# ($project, $origin, $ext, $mime) = split (',', $key) ; |
| 851 | +# $count = $counts_prem {$key} ; |
| 852 | +# $count =~ s/^(\d+?)(\d\d\d)$/$1,$2/ ; |
| 853 | +# $html .= "<tr><td class=r>${count},000</td><td class=l>$project</td><td class=l>$origin</td><td class=l>$ext</td><td class=l>$mime</td></tr>\n" ; |
| 854 | +# $rows++ ; |
| 855 | +# } |
| 856 | +# $html .= "</table>\n" ; |
| 857 | +# $html .= "<small>$rows rows written</small><p>" ; |
| 858 | + |
| 859 | +# $html .= "<table border=1>\n" ; |
| 860 | +# $html .= "<tr><th class=c>counts</th><th class=l>project</th><th class=l>mime</th></tr>\n" ; |
| 861 | +# $rows = 0 ; |
| 862 | +# foreach $key (sort keys %counts_pm) |
| 863 | +# { |
| 864 | +# ($project, $mime) = split (',', $key) ; |
| 865 | +# $count = $counts_pm {$key} ; |
| 866 | +# $count =~ s/^(\d+?)(\d\d\d)$/$1,$2/ ; |
| 867 | +# $html .= "<tr><td class=r>${count},000</td><td class=l>$project</td><td class=l>$mime</td></tr>\n" ; |
| 868 | +# $rows++ ; |
| 869 | +# } |
| 870 | +# $html .= "</table>\n" ; |
| 871 | +# $html .= "<small>$rows rows written</small><p>" ; |
| 872 | +} |
| 873 | + |
| 874 | +sub ReadInputOpSys # reads $dir_process/$file_csv_opsys into %opsys and the mobile/non-mobile totals; also extracts the mobile keyword list |
| 875 | +{ |
| 876 | + my $file_csv = "$dir_process/$file_csv_opsys" ; |
| 877 | + if (! -e $file_csv) |
| 878 | + { abort ("Function ReadInputOpSys: file $file_csv not found!!!") ; } |
| 879 | + open CSV_OPSYS, '<', $file_csv or abort ("Function ReadInputOpSys: file $file_csv could not be opened: $!") ; |
| 880 | + while ($line = <CSV_OPSYS>) |
| 881 | + { |
| 882 | + if ($line =~ /^#/) # comments |
| 883 | + { |
| 884 | + if ($line =~ /^# mobile:/) # the only comment line that carries data: keyword list plus a parenthesized part |
| 885 | + { |
| 886 | + $line =~ s/^.*?: // ; |
| 887 | + ($month_upd_keywords_mobile = $line) =~ s/^.*?\(([^\)]+)\).*$/$1/ ; # text between the first pair of parentheses |
| 888 | + ($keywords_mobile = $line) =~ s/ \([^\)]+\).*$// ; # everything before that parenthesized part |
| 889 | + $keywords_mobile =~ s/\|/, /g ; |
| 890 | + $keywords_mobile =~ s/((?:[^,]+,){10})/$1<br>/g ; # html line break after every 10 keywords |
| 891 | + next ; |
| 892 | + } |
| 893 | + next ; |
| 894 | + } |
| 895 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 896 | + |
| 897 | + chomp $line ; |
| 898 | + ($rectype, $os, $count, $perc) = split (',', $line) ; |
| 899 | + |
| 900 | + next if $count !~ /^\d+$/ ; # -,Linux Gentoo,,2,0.00% (extra comma !) |
| 901 | + |
| 902 | + $os =~ s/_/./g ; |
| 903 | + $os =~ s/\.\./Other/g ; |
| 904 | + if ($rectype ne "G") |
| 905 | + { |
| 906 | + if ($os =~ / \d/) |
| 907 | + { ; } |
| 908 | + else |
| 909 | + { $os =~ s/\// / ; } |
| 910 | + } |
| 911 | + |
| 912 | + if ($rectype eq "-") { $total_opsys_non_mobile += $count ; } |
| 913 | + if ($rectype eq "M") { $total_opsys_mobile += $count ; } |
| 914 | + $opsys {"$rectype,$os"} += $count ; |
| 915 | + } |
| 916 | + close CSV_OPSYS ; |
| 917 | +} |
| 918 | + |
| 919 | + |
| 920 | +sub ReadInputOrigins # reads $dir_process/$file_csv_origins and tallies request counts per internal and external origin |
| 921 | +{ |
| 922 | + my $file_csv = "$dir_process/$file_csv_origins" ; |
| 923 | + if (! -e $file_csv) |
| 924 | + { abort ("Function ReadInputOrigins: file $file_csv not found!!!") ; } |
| 925 | + open CSV_ORIGINS, '<', $file_csv or abort ("Function ReadInputOrigins: file $file_csv could not be opened: $!") ; |
| 926 | + while ($line = <CSV_ORIGINS>) |
| 927 | + { |
| 928 | + next if $line =~ /^#/ ; # comments |
| 929 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 930 | + |
| 931 | + chomp $line ; |
| 932 | + ($source, $origin, $toplevel, $mimecat, $count) = split (',', $line) ; |
| 933 | + |
| 934 | +# test: |
| 935 | + if (($source eq "external") && ($origin !~ /^google/)) |
| 936 | + { $origin .= $toplevel ; } |
| 937 | + |
| 938 | +# ~ s/xx:upload/upload (~css)/; |
| 939 | +# $origin =~ s/wb:/wikibooks:/; |
| 940 | +# $origin =~ s/wk:/wiktionary:/; |
| 941 | +# $origin =~ s/wn:/wikinews:/; |
| 942 | +# $origin =~ s/wp:/wikipedia:/; |
| 943 | +# $origin =~ s/wq:/wikiquote:/; |
| 944 | +# $origin =~ s/ws:/wikisource:/; |
| 945 | +# $origin =~ s/wv:/wikiversity:/; |
| 946 | +# $origin =~ s/wx://; |
| 947 | +# $origin =~ s/mw:/mediawiki:/; |
| 948 | +# $origin =~ s/wm:/wikimedia:/; |
| 949 | +# $origin =~ s/wmf:/foundation:/; |
| 950 | +# $origin =~ s/:www$/:portal/; |
| 951 | +# $origin =~ s/:mw$/:mediawiki/; |
| 952 | + |
| 953 | + if ($source eq "internal") # internal origins are counted per origin and per project (part before ':'), overall and per mime category |
| 954 | + { |
| 955 | + $origin = &ExpandAbbreviation ($origin) ; |
| 956 | + ($project,$subproject) = split (':', $origin) ; |
| 957 | + $origin_int_top_split {"$mimecat:$origin"} += $count ; |
| 958 | + $origin_int_top {$origin} += $count ; |
| 959 | + $project_int_top_split {"$mimecat:$project"} += $count ; |
| 960 | + $project_int_top {$project} += $count ; |
| 961 | + } |
| 962 | + else # every other source is counted as external |
| 963 | + { |
| 964 | +# $origin2 = $origin ; |
| 965 | +# $origin2 =~ s/^google.*?\|/google:ext|/ ; |
| 966 | +# $origin2 =~ s/^yahoo.*\|/yahoo:ext|/ ; |
| 967 | +# if (($origin2 !~ /^google/) && ($origin2 !~ /^yahoo/)) |
| 968 | +# { $origin2 =~ s/^.*?\|/other:ext|/ ; } |
| 969 | +# ($prefix,$code) = split ('\:', $origin2) ; |
| 970 | +# print "$origin -> $origin2\n" ; |
| 971 | +# $origin_ext_top_split {$origin} += $count ; |
| 972 | +# $origin_ext_top {$code} += $count ; |
| 973 | + |
| 974 | +# if ($origin =~ /\|page/) |
| 975 | +# { |
| 976 | +# ($prefix,$code) = split ('\:', $origin) ; |
| 977 | +# $code =~ s/\|.*$// ; |
| 978 | +# $origin =~ s/\|.*$// ; |
| 979 | +# $origin_ext_page_top_split {$origin} += $count ; |
| 980 | +# $origin_ext_page_top {$code} += $count ; |
| 981 | +# } |
| 982 | + if ($origin eq "unmatched ip address") |
| 983 | + { $origin = "origin unknown" ; } |
| 984 | + |
| 985 | + if ($mimecat eq "page") |
| 986 | + { $total_page_requests_external += $count ; } |
| 987 | + |
| 988 | + $origin_ext_top_split {"$mimecat:$origin"} += $count ; |
| 989 | + $origin_ext_top {$origin} += $count ; |
| 990 | + $total_origins_external_counted += $count ; |
| 991 | + # if ($origin =~ /^google/) |
| 992 | + # { |
| 993 | + # $origin = "google (total)" ; |
| 994 | + # $origin_ext_top_split {"$mimecat:$origin"} += $count ; |
| 995 | + # $origin_ext_top {$origin} += $count ; |
| 996 | + # } |
| 997 | + } |
| 998 | + } |
| 999 | + |
| 1000 | + close CSV_ORIGINS ; |
| 1001 | +} |
| 1002 | + |
| 1003 | +sub ReadInputScripts # two passes over $dir_process/$file_csv_scripts: 1) count php actions, 2) tally scripts/parameters, skipping php actions counted < 2 |
| 1004 | +{ |
| 1005 | + my $file_csv = "$dir_process/$file_csv_scripts" ; |
| 1006 | + if (! -e $file_csv) |
| 1007 | + { abort ("Function ReadInputScripts: file $file_csv not found!!!") ; } |
| 1008 | + open CSV_SCRIPTS, '<', $file_csv or abort ("Function ReadInputScripts: file $file_csv could not be opened: $!") ; |
| 1009 | + while ($line = <CSV_SCRIPTS>) |
| 1010 | + { |
| 1011 | + next if $line =~ /^#/ ; # comments |
| 1012 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 1013 | + |
| 1014 | + chomp $line ; |
| 1015 | + $line =~ s/\%3B/;/gi ; |
| 1016 | + $line =~ s/\&amp;/\&/gi ; # decode html entity; the pattern had degraded to the no-op s/\&/\&/ |
| 1017 | + ($ext, $script, $parm, $count) = split (',', $line) ; |
| 1018 | + if ($script =~ /\%/) |
| 1019 | + { $script = "other" ; } |
| 1020 | + if ($parm =~ /\%/) |
| 1021 | + { $parm = "other" ; } |
| 1022 | + |
| 1023 | + if (($ext eq "php") && ($parm =~ /action=/) && ($parm !~ /search=/)) # action can occur as parm after search |
| 1024 | + { |
| 1025 | + @parms = split ('\&', $parm) ; |
| 1026 | + foreach $parm (@parms) |
| 1027 | + { |
| 1028 | + ($keyword,$data) = split ('\=', $parm) ; |
| 1029 | + if ($keyword eq "action") |
| 1030 | + { $actions {"$script,$data"} += $count } # scalar element instead of the one-element slice @actions {..} |
| 1031 | + } |
| 1032 | + } |
| 1033 | + } |
| 1034 | + close CSV_SCRIPTS ; |
| 1035 | + |
| 1036 | +# foreach $key (keys_sorted_by_value_num_desc %actions) |
| 1037 | +# { print "$key: " . $actions {$key} . "\n" ; } |
| 1038 | + |
| 1039 | + open CSV_SCRIPTS, '<', "$dir_process/$file_csv_scripts" or abort ("Function ReadInputScripts: file $file_csv could not be reopened: $!") ; |
| 1040 | + read_script: |
| 1041 | + while ($line = <CSV_SCRIPTS>) |
| 1042 | + { |
| 1043 | + next if $line =~ /^#/ ; # comments |
| 1044 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 1045 | + |
| 1046 | + chomp $line ; |
| 1047 | + $line =~ s/\%3B/;/gi ; |
| 1048 | + $line =~ s/\%5B/[/gi ; |
| 1049 | + $line =~ s/\%5D/]/gi ; |
| 1050 | + $line =~ s/\&amp;/\&/gi ; # decode html entity; the pattern had degraded to the no-op s/\&/\&/ |
| 1051 | + ($ext, $script, $parm, $count) = split (',', $line) ; |
| 1052 | + |
| 1053 | + # incomplete validation check on valid names, but captures already lot of rubbish |
| 1054 | + if ($script =~ /\%/) |
| 1055 | + { $script = "other" ; } |
| 1056 | + if ($parm =~ /\%/) |
| 1057 | + { $parm = "other" ; } |
| 1058 | + |
| 1059 | + if (($parm =~ /amp;amp;/) || |
| 1060 | + ($parm =~ /feed=.*feed=/)) |
| 1061 | + { next read_script ; } |
| 1062 | + |
| 1063 | + if (($ext eq "php") && ($parm =~ /action=/)) |
| 1064 | + { |
| 1065 | + @parms = split ('\&', $parm) ; |
| 1066 | + foreach $parm (@parms) |
| 1067 | + { |
| 1068 | + ($keyword,$data) = split ('\=', $parm) ; |
| 1069 | + if ($keyword eq "action") |
| 1070 | + { |
| 1071 | + if ($actions {"$script,$data"} < 2) # skip records whose script/action pair was counted less than 2 in the first pass |
| 1072 | + { next read_script ; } |
| 1073 | + } |
| 1074 | + } |
| 1075 | + } |
| 1076 | + if ($ext eq "php") |
| 1077 | + { |
| 1078 | + # generalize ns10 -> ns.. + remove all ns..=.. but one |
| 1079 | + $parm =~ s/\&ns\d+/\&ns../g ; |
| 1080 | + $parm =~ s/\&ns\.\.=\.\./-*^-*^/ ; |
| 1081 | + $parm =~ s/\&ns\.\.=\.\.//g ; |
| 1082 | + $parm =~ s/\-\*\^\-\*\^/\&ns\.\.=\.\./g ; |
| 1083 | + |
| 1084 | + # generalize nsargs[]= -> remove all but one |
| 1085 | + $parm =~ s/\&rsargs\[\]=\.\./-*^-*^/ ; |
| 1086 | + $parm =~ s/\&rsargs\[\]=\.\.//g ; |
| 1087 | + $parm =~ s/\-\*\^\-\*\^/\&rsargs\[n\]=\.\./g ; |
| 1088 | + |
| 1089 | + if (length ($parm) > 100) # long parameter strings get an html line break after roughly every 100 characters, at a '&' |
| 1090 | + { $parm =~ s/(.{100}[^\&]*\&)/$1<br>/g ; } |
| 1091 | + |
| 1092 | + $parms {"$script,$parm"} += $count ; |
| 1093 | + $scripts_php {$script} += $count ; |
| 1094 | + } |
| 1095 | + elsif ($ext eq "js") |
| 1096 | + { $scripts_js {$script} += $count ; } |
| 1097 | + elsif ($ext eq "css") |
| 1098 | + { $scripts_css {$script} += $count ; } |
| 1099 | + } |
| 1100 | + close CSV_SCRIPTS ; |
| 1101 | +} |
| 1102 | + |
| 1103 | +sub ReadInputGoogle # reads $dir_process/$file_csv_google and tallies Google requests per service, top level domain and mime category |
| 1104 | +{ |
| 1105 | + my $file_csv = "$dir_process/$file_csv_google" ; |
| 1106 | + if (! -e $file_csv) |
| 1107 | + { abort ("Function ReadInputGoogle: file $file_csv not found!!!") ; } |
| 1108 | + open CSV_SEARCH, '<', $file_csv or abort ("Function ReadInputGoogle: file $file_csv could not be opened: $!") ; |
| 1109 | + while ($line = <CSV_SEARCH>) |
| 1110 | + { |
| 1111 | + next if $line =~ /^#/ ; # comments |
| 1112 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 1113 | + |
| 1114 | + chomp $line ; |
| 1115 | + ($matches, $site, $origin, $service, $agent, $mimecat, $toplevel, $count) = split (',', $line) ; |
| 1116 | + |
| 1117 | + if ($service eq "Imposters?") # fold service name variants together |
| 1118 | + { $service = "GoogleBot?" ; } |
| 1119 | + if ($service eq "GoogleBotNot?") |
| 1120 | + { $service = "GoogleBot?" ; } |
| 1121 | + if ($service eq "Crawler") |
| 1122 | + { $service = "GoogleBot" ; } |
| 1123 | + |
| 1124 | + if ($matches =~ /x/) # an 'x' in the first field sets the Google ip flag to 'Y' |
| 1125 | + { $googleIp = 'Y' ; } |
| 1126 | + else |
| 1127 | + { $googleIp = 'N' ; } |
| 1128 | + |
| 1129 | + next if $site ne "google" ; |
| 1130 | + |
| 1131 | + if ($toplevel eq "-") |
| 1132 | + { $toplevel = "undefined" ; } |
| 1133 | + if (length ($toplevel) > 3) |
| 1134 | + { $toplevel = "_$toplevel" ; } # sort on top |
| 1135 | + |
| 1136 | + $searches_crawlers {$service} += $count ; |
| 1137 | + $searches_service {"$service,$googleIp"} += $count ; |
| 1138 | + $searches_toplevel {$toplevel} += $count ; |
| 1139 | + $searches_service_mimecat {"$service,$mimecat,$googleIp"} += $count ; |
| 1140 | + $searches_service_mimecat {"$service,total,$googleIp"} += $count ; |
| 1141 | + $searches_service_matches {"$service,$matches"} += $count ; |
| 1142 | + |
| 1143 | +# if ($origin =~ /search/i) |
| 1144 | + if ($toplevel =~ /^[a-zA-Z0-9-]+$/) # note: names prefixed with '_' above do not match this pattern |
| 1145 | + { $searches_toplevel_tld_found {$toplevel} += $count ; } # print "$line\n" ;} |
| 1146 | + else |
| 1147 | + { |
| 1148 | + $searches_mimecat_tld_not_found {$mimecat} += $count ; |
| 1149 | + $searches_mimecat_tld_not_found {"total"} += $count ; |
| 1150 | + } |
| 1151 | + |
| 1152 | + $searches_toplevel_mimecat {"$toplevel,$mimecat"} += $count ; |
| 1153 | + $searches_toplevel_mimecat {"$toplevel,total"} += $count ; |
| 1154 | + |
| 1155 | +# if ($toplevel !~ /:/) { print "invalid toplevel $toplevel\n" ; } |
| 1156 | + } |
| 1157 | + close CSV_SEARCH ; |
| 1158 | +} |
| 1159 | + |
| 1160 | +sub ReadInputSkins # reads $dir_process/$file_csv_skins (skin path,count) into %skins and %skin_set |
| 1161 | +{ |
| 1162 | + my $file_csv = "$dir_process/$file_csv_skins" ; |
| 1163 | + if (! -e $file_csv) |
| 1164 | + { abort ("Function ReadInputSkins: file $file_csv not found!!!") ; } |
| 1165 | + open CSV_SKINS, '<', $file_csv or abort ("Function ReadInputSkins: file $file_csv could not be opened: $!") ; |
| 1166 | + while ($line = <CSV_SKINS>) |
| 1167 | + { |
| 1168 | + next if $line =~ /^#/ ; # comments |
| 1169 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 1170 | + |
| 1171 | + chomp $line ; |
| 1172 | + ($skins, $count) = split (',', $line) ; |
| 1173 | + |
| 1174 | + $skins {$skins} += $count ; |
| 1175 | + ($name,$rest) = split ('\/', $skins, 2) ; # part before the first '/' is used as the skin set name |
| 1176 | + $skin_set {$name}+= $count ; |
| 1177 | + } |
| 1178 | + close CSV_SKINS ; # was 'close CSV_SCRIPTS', which left this handle open |
| 1179 | +} |
| 1180 | + |
| 1181 | +sub ReadInputIndexPhp # reads $dir_process/$file_csv_indexphp and tallies index.php edit/submit requests (text/html only), per domain and per bot agent |
| 1182 | +{ |
| 1183 | + my $file_csv = "$dir_process/$file_csv_indexphp" ; |
| 1184 | + if (! -e $file_csv) |
| 1185 | + { abort ("Function ReadInputIndexPhp: file $file_csv not found!!!") ; } |
| 1186 | + open CSV_INDEXPHP, '<', $file_csv or abort ("Function ReadInputIndexPhp: file $file_csv could not be opened: $!") ; |
| 1187 | + while ($line = <CSV_INDEXPHP>) |
| 1188 | + { |
| 1189 | + next if $line =~ /^#/ ; # comments |
| 1190 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 1191 | + |
| 1192 | + chomp $line ; |
| 1193 | + ($bot,$domain,$referer,$ext,$status,$mime,$parm,$agent) = split (',', $line) ; |
| 1194 | + |
| 1195 | + my $action = "" ; |
| 1196 | + if ($parm =~ /action=edit/) |
| 1197 | + { $action = 'edit' ; } |
| 1198 | + if ($parm =~ /action=submit/) |
| 1199 | + { $action = 'submit' ; } |
| 1200 | + |
| 1201 | + next if $ext !~ /index\.php/ ; # dot escaped so that only a literal 'index.php' matches |
| 1202 | + next if $parm !~ /action=(?:edit|submit)(?:$|\&)/ ; # submit or submit&.., not submitlogin |
| 1203 | + next if $mime ne "text/html" ; # excludes mime - (undefined), application/x-external-editor on action=edit |
| 1204 | + # and text/plain, text/xml, application/xml on action=submit |
| 1205 | + |
| 1206 | + if ($bot =~ /Y/) |
| 1207 | + { |
| 1208 | + $intent = "" ; |
| 1209 | + |
| 1210 | + if ($agent =~ /DotNetWikiBot/i) |
| 1211 | + { $agent = "DotNetWikiBot" ; } |
| 1212 | + $agent =~ s/\%27/\'/g ; |
| 1213 | + # $agent =~ s/\(.*?\)//g; |
| 1214 | + |
| 1215 | + if ($action eq "edit") |
| 1216 | + { |
| 1217 | + if ($referer =~ /^\w\w:/) # two character project prefix: count as internal referer |
| 1218 | + { $referer = "int" ; } |
| 1219 | + $edit_submit_bot {$domain} {"edit,$referer"} ++ ; |
| 1220 | + $edit_submit_bot_sort {$domain} ++ ; |
| 1221 | + $edit_submit_bot_agent {$agent} {"$action,$referer"}++ ; |
| 1222 | + $edit_submit_bot_agent_sort {$agent}++ ; |
| 1223 | + } |
| 1224 | + |
| 1225 | + if ($action eq "submit") |
| 1226 | + { |
| 1227 | + if ($referer =~ /^\w\w:/) |
| 1228 | + { $referer = "int" ; } |
| 1229 | + |
| 1230 | + $intent = 'unknown' ; |
| 1231 | + if ($status eq "TCP_MISS/302") { $intent = 'save' ; } |
| 1232 | + elsif ($status eq "TCP_MISS/200") { $intent = 'preview' ; } |
| 1233 | + # next if $intent ne 'save' ; |
| 1234 | + |
| 1235 | + $edit_submit_bot {$domain} {"$intent,$referer"} ++ ; |
| 1236 | + $edit_submit_bot_sort {$domain} ++ ; |
| 1237 | + |
| 1238 | + # if ($referer eq "-") { $edit_submit_bot_agent {$agent}++ ; } |
| 1239 | + $edit_submit_bot_agent {$agent} {"$intent,$referer"}++ ; |
| 1240 | + $edit_submit_bot_agent_sort {$agent}++ ; |
| 1241 | + } |
| 1242 | + } |
| 1243 | + |
| 1244 | + next if $bot =~ /N/ ; # 2009-05 /N/ -> total oldid: 127, total other: 54, total redlink: 4 |
| 1245 | + next if $bot =~ /Y/ ; # 2009-05 /N/ -> total oldid: 127, total other: 54, total redlink: 4 |
| 1246 | + next if $domain ne "wp:en" ; # 2009-05 ne -> total other: 26, total redlink: 22 |
| 1247 | + # if (($referer ne "-") && ($referer ne "ext") && ($referer ne "wp:en")) { next ; } |
| 1248 | + # if (($referer ne "-") && ($referer !~ /^..:/)) { $referer = "ext" ; } |
| 1249 | + # if ($referer eq "-") { $referer = "- " ; } |
| 1250 | + next if $referer ne "wp:en" ; # 2009-05 eq -> # total other: 2014, total redlink: 1031, total oldid: 47, total undo: 30 |
| 1251 | + # the code below is only reached for records whose bot field contains neither 'N' nor 'Y' |
| 1252 | + my $filter = '' ; |
| 1253 | + if ($parm =~ /action=edit/) |
| 1254 | + { |
| 1255 | + $filter = 'other' ; |
| 1256 | + if ($parm =~ /redlink/) { $filter = 'redlink' ; } |
| 1257 | + if ($parm =~ /oldid=/) { $filter = 'oldid' ; } |
| 1258 | + if ($parm =~ /undo=/) { $filter = 'undo' ; } |
| 1259 | + |
| 1260 | + $edit_submit {"[$bot $referer $action $filter] $parm"}++ ; |
| 1261 | + $edit_submits {"$filter"}++ ; |
| 1262 | + } |
| 1263 | + if ($parm =~ /action=submit/) |
| 1264 | + { |
| 1265 | + $edit_submit {"$bot $referer $action $status"}++ ; |
| 1266 | + } |
| 1267 | + |
| 1268 | + # my @subparms = split ('\&', $parm) ; |
| 1269 | + # foreach $subparm (@subparms) |
| 1270 | + # { $edit_submit_subparms {"[$action] [$filter] $subparm"}++ ; } |
| 1271 | + } |
| 1272 | + close CSV_INDEXPHP ; |
| 1273 | + |
| 1274 | +# next if $bot =~ /N/ ; # + any referrer -> |
| 1275 | +# Sample period: 1 May 2009 - 31 May 2009 => for daily averages multiplier = 0.03 |
| 1276 | +# 9: [bot=Y - edit oldid] action=edit&oldid=&section=&title=.. |
| 1277 | +# 3: [bot=Y - edit oldid] action=edit&oldid=..&title=.. |
| 1278 | +# 17: [bot=Y - edit oldid] action=edit&oldid=..&title=..&useskin=.. |
| 1279 | +# 1: [bot=Y - edit other] _herbs&action=edit&title=.. |
| 1280 | +# 65: [bot=Y - edit other] action=edit&section=..&title=.. |
| 1281 | +# 1: [bot=Y - edit other] action=edit&stub&title=.. |
| 1282 | +# 2: [bot=Y - edit other] action=edit&title= |
| 1283 | +# 188: [bot=Y - edit other] action=edit&title=.. |
| 1284 | +# 31: [bot=Y - edit other] action=edit&title=..&useskin=.. |
| 1285 | +# 30: [bot=Y - edit redlink] action=edit&redlink=..&title=.. |
| 1286 | +# 5: [bot=Y - edit undo] action=edit&title=..&undo=..&undoafter=.. |
| 1287 | +# 14: [bot=Y ext edit other] action=edit&section=..&title=.. |
| 1288 | +# 5: [bot=Y ext edit other] action=edit&title=.. |
| 1289 | +# 11: [bot=Y ext edit redlink] action=edit&redlink=..&title=.. |
| 1290 | +# 2: [bot=Y ext edit undo] action=edit&title=..&undo=..&undoafter=.. |
| 1291 | +# 107: [bot=Y wp:en edit oldid] action=edit&oldid=&section=&title=.. |
| 1292 | +# 3: [bot=Y wp:en edit oldid] action=edit&oldid=..&section=&title=.. |
| 1293 | +# 17: [bot=Y wp:en edit oldid] action=edit&oldid=..&title=.. |
| 1294 | +# 1: [bot=Y wp:en edit other] action=edit&articleget=..&dykcredittab=..&editintro=..&preload=..&preloadtitle=..&section=..&title=.. |
| 1295 | +# 5: [bot=Y wp:en edit other] action=edit&section=..&title=.. |
| 1296 | +# 48: [bot=Y wp:en edit other] action=edit&title=.. |
| 1297 | +# 4: [bot=Y wp:en edit redlink] action=edit&redlink=..&title=.. |
| 1298 | +# 9: bot=Y - submit TCP_MISS/200 |
| 1299 | +# 62: bot=Y - submit TCP_MISS/302 |
| 1300 | +# 31: bot=Y wp:en submit TCP_MISS/302 |
| 1301 | +# total other: 361 |
| 1302 | +# total oldid: 156 |
| 1303 | +# total redlink: 45 |
| 1304 | +# total undo: 7 |
| 1305 | +} |
| 1306 | + |
| 1307 | +sub ReadInputCountriesTimed # reads $dir_process/$file_csv_countries_timed (bot,target,country,time,count); wikipedia targets only |
| 1308 | +{ |
| 1309 | + my $file_csv = "$dir_process/$file_csv_countries_timed" ; |
| 1310 | + if (! -e $file_csv) |
| 1311 | + { abort ("Function ReadInputCountriesTimed: file $file_csv not found!!! ") ; } |
| 1312 | + open CSV_COUNTRIES, '<', $file_csv or abort ("Function ReadInputCountriesTimed: file $file_csv could not be opened: $!") ; |
| 1313 | + while ($line = <CSV_COUNTRIES>) |
| 1314 | + { |
| 1315 | + next if $line =~ /^#/ ; # comments |
| 1316 | + next if $line =~ /^:/ ; # csv header (not a comment) |
| 1317 | + |
| 1318 | + chomp $line ; |
| 1319 | + ($bot,$target,$country,$time,$count) = split (',', $line) ; |
| 1320 | + |
| 1321 | + next if $target !~ /^wp/ ; # wikipedia only |
| 1322 | + |
| 1323 | + if ($bot =~ /Y/) # normalize the bot field to exactly 'Y' or 'N' |
| 1324 | + { $bot = 'Y' } |
| 1325 | + else |
| 1326 | + { $bot = 'N' } |
| 1327 | + $countries {$country} ++ ; # these three hashes count input records, not requests |
| 1328 | + $targets {$target} ++ ; |
| 1329 | + $times {$time} ++ ; |
| 1330 | + $countries_timed {"$bot,$target,$country,$time"} += $count ; |
| 1331 | + $countries_totals {"$bot,$target"}{$country} += $count ; |
| 1332 | + $targets_totals {"$bot,$country"}{$target} += $count ; |
| 1333 | + } |
| 1334 | + close CSV_COUNTRIES ; |
| 1335 | +} |
| 1336 | + |
| 1337 | +sub ReadInputCountriesNames # reads $path_in/$file_csv_country_codes into %country_names, %region_codes, %north_south_codes and %country_codes_all |
| 1338 | +{ |
| 1339 | + $path_csv_country_codes = "$path_in/$file_csv_country_codes" ; |
| 1340 | + if (! -e $path_csv_country_codes) { abort ("Input file $path_csv_country_codes not found!") ; } |
| 1341 | + |
| 1342 | + open CSV_COUNTRY_CODES, '<', $path_csv_country_codes or abort ("Input file $path_csv_country_codes could not be opened: $!") ; |
| 1343 | + $country_names {"--"} = "Unknown" ; |
| 1344 | + while ($line = <CSV_COUNTRY_CODES>) |
| 1345 | + { |
| 1346 | + chomp $line ; |
| 1347 | + |
| 1348 | + next if $line =~ /^#/ ; |
| 1349 | + |
| 1350 | + ($country_code,$region_code,$north_south_code,$country_name) = split (',', $line,4) ; # limit 4: commas inside the name stay in $country_name |
| 1351 | + $region_codes {$country_code} = $region_code ; |
| 1352 | + $north_south_codes {$country_code} = $north_south_code ; |
| 1353 | + |
| 1354 | + $country_name =~ s/"//g ; |
| 1355 | + |
| 1356 | + next if $country_name eq "Anonymous Proxy" ; # these entries get region codes above, but no name entry |
| 1357 | + next if $country_name eq "Satellite Provider" ; |
| 1358 | + next if $country_name eq "Other Country" ; |
| 1359 | + next if $country_name eq "Asia/Pacific Region" ; |
| 1360 | + next if $country_name eq "Europe" ; |
| 1361 | + |
| 1362 | +# if ($country_meta_info {$country} eq "") |
| 1363 | +# { |
| 1364 | +# if ($country_meta_info_not_found_reported {$country} ++ == 0) |
| 1365 | +# { print "Meta info not found for country '$country'\n" ; } |
| 1366 | +# } |
| 1367 | + $country_names {$country_code} = $country_name ; |
| 1368 | + $country_codes_all {"$country_name|$country_code"} ++ ; |
| 1369 | + } |
| 1370 | + close CSV_COUNTRY_CODES ; |
| 1371 | +} |
| 1372 | + |
| 1373 | +sub ReadInputCountriesMeta # reads $path_in/$file_csv_country_meta_info into %country_meta_info, keyed by country name |
| 1374 | +{ |
| 1375 | + # http://en.wikipedia.org/wiki/List_of_countries_by_population |
| 1376 | + # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users |
| 1377 | + open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" or abort ("Function ReadInputCountriesMeta: file $path_in/$file_csv_country_meta_info could not be opened: $!") ; |
| 1378 | + while ($line = <COUNTRY_META_INFO>) |
| 1379 | + { |
| 1380 | + chomp $line ; |
| 1381 | + ($country,$link,$population,$connected,$icon) = split ',', $line ; |
| 1382 | +print "$line\n" ; # qqq |
| 1383 | + $country =~ s/,/,/g ; # NOTE(review): no-op as written; presumably decoded an escaped comma (e.g. an html entity) - confirm |
| 1384 | + |
| 1385 | + # use country names as given by MaxMind |
| 1386 | + $country =~ s/Brunei/Brunei Darussalam/ ; |
| 1387 | + $country =~ s/C..?te d'Ivoire/Cote d'Ivoire/ ; |
| 1388 | + $country =~ s/Congo, The Democratic Republic of the/Republic of the Congo/ ; |
| 1389 | + $country =~ s/Dem. Rep. of Congo/Congo - The Democratic Republic of the/ ; |
| 1390 | + $country =~ s/East timor/Timor-Leste/ ; |
| 1391 | + $country =~ s/Guyane/French Guiana/ ; |
| 1392 | + $country =~ s/Iran/Iran, Islamic Republic of/ ; |
| 1393 | + $country =~ s/Laos/Lao People's Democratic Republic/ ; |
| 1394 | + $country =~ s/Libya/Libyan Arab Jamahiriya/ ; |
| 1395 | + $country =~ s/Macau/Macao/ ; |
| 1396 | + $country =~ s/Moldova/Moldova, Republic of/ ; |
| 1397 | + # removed: s/North Korea/Korea, Republic of/ gave North Korea the South Korean name and made the North Korea rule below dead |
| 1398 | + $country =~ s/Palestine/Palestinian Territory/ ; |
| 1399 | + $country =~ s/Republic of the Congo/Congo/ ; |
| 1400 | + $country =~ s/Russia/Russian Federation/ ; |
| 1401 | + $country =~ s/North Korea/Korea, Democratic People's Republic of/ ; |
| 1402 | + $country =~ s/South Korea/Korea, Republic of/ ; |
| 1403 | + $country =~ s/Syria/Syrian Arab Republic/ ; |
| 1404 | + $country =~ s/Tanzania/Tanzania, United Republic of/ ; |
| 1405 | + $country =~ s/U.S. Virgin Islands/Virgin Islands, U.S./ ; # was mapped to 'Virgin Islands, British', a different territory |
| 1406 | + $country =~ s/Vatican City/Holy See (Vatican City State)/ ; |
| 1407 | + $country =~ s/^Korea$/South Korea/ ; |
| 1408 | + |
| 1409 | + $connected =~ s/connected/../g ; |
| 1410 | + $country_meta_info {$country} = "$link,$population,$connected,$icon" ; |
| 1411 | +print "meta info found for '$country'\n" ; # qqq |
| 1412 | + |
| 1413 | + if ($country eq "United States") |
| 1414 | + { ($connected_us = $connected) =~ s/_//g ; } # keep the US figure separately, without '_' characters |
| 1415 | + } |
| 1416 | + close COUNTRY_META_INFO ; |
| 1417 | +} |
| 1418 | + |
# Sum population and connected-user figures over all countries in %country_names.
#
# For every country code the meta record is looked up via the country name in
# %country_meta_info ("link,population,connected,icon"); '_' separators are
# stripped from both figures, which are then added to
#   - the grand totals $population_tot / $connected_tot
#   - %population_per_region / %connected_per_region under the country's region code
#   - the same two hashes under the country's north/south code
# Countries without a meta record contribute 0. Totals are added to, not reset.
sub CollectRegionCounts
{
  for my $code (keys %country_names)
  {
    # $country_meta is a package variable in the original code; keep writing it
    $country_meta = $country_meta_info {$country_names {$code}} ;

    my (undef, $people, $online) = split (',', $country_meta) ;
    $people =~ s/_//g ;
    $online =~ s/_//g ;

    $population_tot += $people ;
    $connected_tot  += $online ;

    # region code and north/south code share the same pair of hashes
    my @buckets = ($region_codes {$code}, $north_south_codes {$code}) ;
    for my $bucket (@buckets)
    {
      $population_per_region {$bucket} += $people ;
      $connected_per_region  {$bucket} += $online ;
    }
  }
}
| 1447 | + |
# Read the monthly squid counts csv ($path_csv_squid_counts_monthly) and rebuild
# all monthly / quarterly request totals for one project.
#
# Parameter:
#   $project_mode - project code to keep (eg 'wp'); other records are dropped by DiscardSquidInput
#
# Input lines: "yyyy-mm,project,language,country code,bot flag,count".
# All result hashes and counters are cleared first, so repeated calls do not accumulate.
# Records whose country name contains '?' or 'unknown' are only counted in
# %requests_unknown_per_quarter. Records from the current year, or from the previous
# year with month >= current month, are also counted in the '..._recently_...' totals.
#
# Dies when the input file cannot be opened or when no 'recent' month was found;
# aborts when a quarter has no known requests.
sub ReadInputCountriesMonthly
{
  my $project_mode = shift ;

  undef %yyyymm_ ;
  undef %quarters ;
  undef %requests_unknown_per_quarter ;
  undef %country_codes ;
  undef %requests_all ;
  undef %requests_all_per_period ;
  undef %requests_per_quarter ;
  undef %requests_per_country ;
  undef %requests_per_quarter_per_country ;
  undef %requests_per_country_per_language ;
  undef %requests_per_language_per_country ;
  undef %requests_per_quarter_per_country_per_language ;
  undef %requests_per_month_per_country_code ;
  undef %requests_per_month_us ;
  undef %descriptions_per_period ;
  undef %requests_recently_all ;
  undef %requests_recently_per_country_code ;
  undef %requests_recently_per_country ;
  undef %requests_recently_per_country_per_language ;
  undef %requests_recently_per_language_per_country ;
  undef %requests_recently_per_language ;
  undef %months_recently ;

  # sentinels: any real 'yyyy-mm' string sorts between these two
  $requests_recently_start = "999999" ;
  $requests_recently_stop  = "000000" ;
  $requests_start          = "999999" ;
  $requests_stop           = "000000" ;

  $requests_all = 0 ;
  $requests_recently_all = 0 ;

  my ($report_month,$report_year) = (localtime (time)) [4,5] ;
  $report_year += 1900 ;
  $report_month ++ ;

  print "Process project $project_mode\n\n" ;

  # without input the sub died further down on '$months_recently == 0'; fail here with the real cause
  open (my $csv_monthly, '<', $path_csv_squid_counts_monthly)
    or die "ReadInputCountriesMonthly: could not open '$path_csv_squid_counts_monthly': $!\n" ;
  while ($line = <$csv_monthly>)
  {
    chomp $line ;
    $line =~ s/,\s+/,/g ; # remove white space around field separators
    $line =~ s/\s+,/,/g ;
    ($yyyymm,$project,$language,$code,$bot,$count) = split (',', $line) ;

    ($code,$language) = &NormalizeSquidInput ($code,$language) ;
    $country = &GetCountryName ($code) ;

    next if &DiscardSquidInput ($bot,$project,$project_mode,$code,$language) ;

    # $yyyymm = "2009-12" ;
    $yyyymm_ {$yyyymm} ++ ;

    $year  = substr ($yyyymm,0,4) ;
    $month = substr ($yyyymm,5,2) ;
    # print "year $year report_year month $month $report_year $report_month\n" ;

    $recently = $false ;

    if (($year == $report_year) or (($year == $report_year - 1) && ($month >= $report_month))) # last 12 months
    { $recently = $true ; }

    if    ($month <= 3) { $quarter = $year . ' Q1' ; }
    elsif ($month <= 6) { $quarter = $year . ' Q2' ; }
    elsif ($month <= 9) { $quarter = $year . ' Q3' ; }
    else                { $quarter = $year . ' Q4' ; }

    if ($quarter_only ne '')
    { next if $quarter ne $quarter_only ; }

    # if ($views_edits eq 'Page Edits')

    # the quarter is registered before the 'unknown country' test, so a quarter with only
    # unknown countries triggers the abort below
    $quarters {$quarter} ++ ;

    if (($country =~ /\?/) || ($country =~ /unknown/i))
    { $requests_unknown_per_quarter {$quarter} += $count ; next ; }
    $country_codes {"$country|$code"}++ ;
    $requests_all += $count ;
    $requests_all_per_period {$yyyymm} += $count ;
    $requests_per_quarter {$quarter} += $count ;
    $requests_per_country {$country} += $count ;

    $requests_per_quarter_per_country  {$quarter}  {$country}  += $count ;
    $requests_per_country_per_language {$country}  {$language} += $count ;
    $requests_per_language_per_country {$language} {$country}  += $count ;
    $requests_per_quarter_per_country_per_language {$quarter} {$country} {$language} += $count ;
    $requests_per_month_per_country_code {$yyyymm} {"$country|$code"} += $count ;

    if ($code eq "US")
    {$requests_per_month_us {$yyyymm} += $count ; }

    $descriptions_per_period {$yyyymm} = $yyyymm ;
    if ($yyyymm lt $requests_start) { $requests_start = $yyyymm ; }
    if ($yyyymm gt $requests_stop)  { $requests_stop  = $yyyymm ; }

    if ($recently)
    {
      if ($yyyymm lt $requests_recently_start) { $requests_recently_start = $yyyymm ; }
      if ($yyyymm gt $requests_recently_stop)  { $requests_recently_stop  = $yyyymm ; }

      $months_recently {$yyyymm}++ ;
      $requests_recently_all += $count ;
      $requests_recently_per_country_code {"$country|$code"} += $count ;
      $requests_recently_per_country {$country} += $count ;
      $requests_recently_per_country_per_language {$country} {$language} += $count ;
      $requests_recently_per_language_per_country {$language} {$country} += $count ;
      $requests_recently_per_language {$language} += $count ;
    }
  }
  close $csv_monthly ;

  print "\n" ;
  @quarters = keys_sorted_alpha_desc %quarters ;
  foreach $quarter (@quarters)
  {
    print "Quarter $quarter: requests: " . (0+$requests_per_quarter {$quarter}) . "\n" ;
    if ($requests_per_quarter {$quarter} == 0)
    { abort ("No known requests found for quarter $quarter") ; }
  }
  print "\n" ;

  $months_recently = keys %months_recently ; # number of distinct recent months
  if ($months_recently == 0) { die "\$months_recently == 0\n" ; }

  # 'yyyy-mm' -> 'mm/yy'
  $requests_recently_start = substr ($requests_recently_start,5,2) . "/" . substr ($requests_recently_start,2,2) ;
  $requests_recently_stop  = substr ($requests_recently_stop ,5,2) . "/" . substr ($requests_recently_stop ,2,2) ;
  $requests_start          = substr ($requests_start,5,2) . "/" . substr ($requests_start,2,2) ;
  $requests_stop           = substr ($requests_stop ,5,2) . "/" . substr ($requests_stop ,2,2) ;

  # fix: this loop used to iterate 'keys %$yyyymm' (a symbolic dereference of the scalar,
  # always empty) and to index with $week, so the maximum was never determined.
  # It is recomputed from scratch here, like the other per-call totals.
  $max_requests_per_month_us = 0 ;
  foreach $yyyymm (keys %requests_per_month_us)
  {
    if ($requests_per_month_us {$yyyymm} > $max_requests_per_month_us)
    { $max_requests_per_month_us = $requests_per_month_us {$yyyymm} ; }
  }

  # die "\$connected_us == 0" if $connected_us == 0 ;
  if ($connected_us > 0)
  { $max_requests_per_connected_us_month = sprintf ("%.1f", $max_requests_per_month_us / $connected_us) ; }

# foreach $country_code (sort keys %country_codes_all)
# {
#   $200907 = ${$requests_per_month_per_country_code {"200907"}} {$country_code} ;
#   $200908 = ${$requests_per_month_per_country_code {"200908"}} {$country_code} ;
#   $200909 = ${$requests_per_month_per_country_code {"200909"}} {$country_code} ;
#   $200910 = ${$requests_per_month_per_country_code {"200910"}} {$country_code} ;
#   $200911 = ${$requests_per_month_per_country_code {"200911"}} {$country_code} ;
#   $200912 = ${$requests_per_month_per_country_code {"200912"}} {$country_code} ;
#   print "$country_code, $200907, $200908, $200909, $200910, $200911, $200912\n" ;
# }
# exit ;
}
| 1602 | + |
# Read the daily squid counts csv ($path_csv_squid_counts_daily), aggregate the
# requests per week and per country for one project, and derive week-over-week
# change figures.
#
# Parameter:
#   $project_mode - project code to keep (eg 'wp'); other records are dropped by DiscardSquidInput
#
# Input lines: "yyyy-mm-dd,project,language,country code,bot flag,count", in ascending
# date order (the sub dies on a date that sorts before its predecessor).
# A week id is the year followed by a two digit week number, where the week number is
# int (day of year / 7) + 1.
# For every week that has data for week id - 1, and every key of %country_codes_all,
#   $changes_per_week_per_country_code {$week} {$country_code}
# is set to 100 * sqrt (new / old) clipped to 0..200 (or 100 when old is 0 and new > 0),
# both counts first passed through CorrectForMissingDays.
# The trace lines are written, sorted, to svg/SquidReportPageViewsPerCountryTrend.csv.
#
# Sources for the country meta data used alongside these counts:
#   http://en.wikipedia.org/wiki/List_of_countries_by_population
#   http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
sub ReadInputCountriesDaily
{
  my $project_mode = shift ;

  undef %country_codes_found ;
  undef %weeknum_this_years ;
  undef %descriptions_per_period ;
  undef %days_in_input_for_week ;
  undef %requests_all_per_period ;
  undef %requests_per_week_per_country_code ;
  undef %requests_per_week_us ;
  undef %missing_days ;
  undef %correct_for_missing_days ;
  undef %changes_per_week_per_country_code ;
  # NOTE(review): @trace and %yyyymmdd_ are not cleared here, so they keep growing
  # over repeated calls - confirm whether that is intended

# $requests_recently_start = "999999" ;
# $requests_recently_stop  = "000000" ;

# $time_2000_01_01 = timegm(0,0,0,1,1-1,2000-1900) ;
  $sec_per_day = 24 * 60 * 60 ;

  my $day ; # lexical, as in the original code, so the loop below does not touch a global $day

  print "Process project $project_mode\n\n" ;

  $yyyymmdd_prev = "" ;
  # an unreadable input file used to be skipped silently; it still yields empty
  # results, but the failure is now reported
  if (open (my $csv_daily, '<', $path_csv_squid_counts_daily))
  {
    while ($line = <$csv_daily>)
    {
      chomp $line ;
      ($yyyymmdd,$project,$language,$code,$bot,$count) = split (',', $line) ;

      die "\$yyyymmdd $yyyymmdd lt \$yyyymmdd_prev $yyyymmdd_prev" if $yyyymmdd lt $yyyymmdd_prev ;
      $yyyymmdd_prev = $yyyymmdd ;

      ($code,$language) = &NormalizeSquidInput ($code,$language) ;
      $country = &GetCountryName ($code) ;

      # registered before filtering: also counts records that are discarded below
      $country_codes_found {"$country|$code"} ++ ;

      next if &DiscardSquidInput ($bot,$project,$project_mode,$code,$language) ;

      # $yyyymmdd = "2009-12-01" ;
      $yyyymmdd_ {$yyyymmdd} ++ ;

      $year  = substr ($yyyymmdd,0,4) ;
      $month = substr ($yyyymmdd,5,2) ;
      $day   = substr ($yyyymmdd,8,2) ;

      $time = timegm(0,0,0,$day,$month-1,$year-1900) ;
      # $days_since_2000 = int (($time - $time_2000_01_01) / $sec_per_day) ;
      $days_this_year = (gmtime $time) [7] ; # day of year, 0 based
      $weeknum_this_year = int ($days_this_year / 7) + 1 ;
      $weeknum_since_2000 = $year . sprintf ("%02d",$weeknum_this_year) ; # * int ($days_since_2000 / 7) + 1 ;

      $weeknum_this_years {"$weeknum_this_year - $weeknum_since_2000"}++ ;

      $descriptions_per_period {$weeknum_since_2000} = "week $weeknum_this_year - " . month_english_short ($month-1) . " $year" ;
      $days_in_input_for_week {$weeknum_since_2000} {$yyyymmdd} ++ ;

      $requests_all_per_period {$weeknum_since_2000} += $count ;
      $requests_per_week_per_country_code {$weeknum_since_2000} {"$country|$code"} += $count ;

      if ($code eq "US")
      {$requests_per_week_us {$weeknum_since_2000} += $count ; }

      # last if ($weeknum_since_2000 == 501) ; # test
    }
    close $csv_daily ;
  }
  else
  { warn "ReadInputCountriesDaily: could not open '$path_csv_squid_counts_daily': $!\n" ; }

  foreach $week (sort keys %weeknum_this_years)
  { print "week $week " . $weeknum_this_years {$week} . "\n" ; }

  foreach $week (sort {$a <=> $b} keys %days_in_input_for_week)
  {
    # week ids are yyyyww, so the first week of a year has no predecessor ($week-1 = yyyy00)
    @keys = keys %{$requests_per_week_per_country_code {$week-1}} ;
    if (@keys == 0)
    {
      # print "skip week $week: no data for previous week available.\n" ;
      next ;
    }

    if ($requests_per_week_us {$week} > $max_requests_per_week_us)
    { $max_requests_per_week_us = $requests_per_week_us {$week} ; }

    # NOTE(review): %week_descriptions is not filled in this sub (descriptions go to
    # %descriptions_per_period); $desc is only used by the commented-out trace below
    $desc= $week_descriptions {$week} ;
    @days = keys %{$days_in_input_for_week {$week}} ;
    $daycount = @days ; # >= 1: every week id here comes from at least one input day
    $missing_days {$week} = 7 - $daycount ;
    $correct_for_missing_days {$week} = 7 / $daycount ;
    # print "Week $week: $desc: $daycount " . (join ' - ', @days) . " ${correct_for_missing_days {$week}}\n" ;
    # foreach $country_code (keys %{$requests_per_week_per_country_code {$week}})

    foreach $country_code (keys %country_codes_all)
    {
      $new = &CorrectForMissingDays ($week  , ${$requests_per_week_per_country_code {$week  }} {$country_code}) ;
      $old = &CorrectForMissingDays ($week-1, ${$requests_per_week_per_country_code {$week-1}} {$country_code}) ;

      # print "country_code $country_code\n" ;
      if ($old == 0)
      {
        if ($new > 0)
        {
          # print "$country_code: no data for prev week\n" ;
          $changes_per_week_per_country_code {$week} {$country_code} = 100 ;
        }
      }
      else
      {
        $delta = sprintf ("%.1f", 100 * sqrt ($new / $old)) ;
        if ($delta <   0) { $delta =   0 ; }
        if ($delta > 200) { $delta = 200 ; }
        $changes_per_week_per_country_code {$week} {$country_code} = $delta ;
        $country_code =~ s/,/;/g ; # keep the trace line a valid csv record
        push @trace, "$country_code, $week, $old, $new, $delta\n" ;
      }

    }
  }

  # the trace file is a diagnostic aid: report a failure to write it, but carry on
  my $file_trace = "svg/SquidReportPageViewsPerCountryTrend.csv" ;
  if (open (my $trace_out, '>', $file_trace))
  {
    print $trace_out sort @trace ;
    close $trace_out ;
  }
  else
  { warn "ReadInputCountriesDaily: could not write '$file_trace': $!\n" ; }

  # die "\$connected_us == 0" if $connected_us == 0 ;
  if ($connected_us > 0)
  { $max_requests_per_connected_us_week = sprintf ("%.1f", (($max_requests_per_week_us * 1000) / $connected_us)) ; }
}
| 1734 | + |
# Map non-standard language codes and French overseas territories to canonical values.
#
# Parameters: country code, language code (as found in the squid csv files)
# Returns:    (country code, language code) after normalization
#   language 'jp' -> 'ja', 'cz' -> 'cs'
#   country codes folded into 'FR' (part of France according to Wikipedia,
#   List_of_countries_by_population):
#     BL Saint Barthélemy, MF Saint Martin, MQ Martinique, NC New Caledonia,
#     PF French Polynesia, PM Saint Pierre and Miquelon, WF Wallis and Futuna, YT Mayotte
sub NormalizeSquidInput
{
  my ($country_code, $lang) = @_ ;

  $lang = "ja" if $lang eq "jp" ;
  $lang = "cs" if $lang eq "cz" ;

  # \A..\z: exact match only, same as the string comparisons this replaces
  $country_code = 'FR' if $country_code =~ /\A(?:BL|MF|MQ|NC|PF|PM|WF|YT)\z/ ;

  return ($country_code, $lang) ;
}
| 1754 | + |
# Decide whether a squid csv record must be left out of the per-country counts.
#
# Parameters: bot flag, project code, project code being processed, country code, language code
# Returns:    ($true) when the record is to be discarded, ($false) otherwise
#
# A record is kept only when it is a user request ('U') for the project being
# processed, its language is neither 'upload' nor contains 'mobile', and its
# country code is not one of the non-country codes A1, A2, AP, EU.
sub DiscardSquidInput
{
  # NOTE(review): the arguments are unpacked into package variables (no 'my'), as in the original code
  ($bot,$project,$project_mode,$code,$language) = @_ ;

  return ($true) unless $bot eq "U" ;                 # user
  return ($true) unless $project eq $project_mode ;   # eg 'wp'
  return ($true) if $language eq "upload" ;
  return ($true) if $language =~ /mobile/i ;

  # A1 = Anonymous Proxy, A2 = Satellite Provider, AP = Asia/Pacific Region, EU = Europe
  return ($true) if $code =~ /\A(?:A1|A2|AP|EU)\z/ ;

  # print "bot $bot project '$project' project_mode $project_mode code $code language $language\n" ;
  return ($false) ;
}
| 1773 | + |
# Translate a country code into a country name via %country_names.
#
# Parameter: country code
# Returns:   the name from %country_names, or "<code> (?)" when no (non-empty) name is known.
# The first time a code without name is seen, a message is printed; later
# occurrences are only counted in %country_code_not_specified_reported.
# The result is also left in the package variable $country, as in the original code.
sub GetCountryName
{
  my ($iso) = @_ ;

  my $known = $country_names {$iso} ;
  if ($known ne "")
  {
    $country = $known ;
    return ($country) ;
  }

  $country = "$iso (?)" ;
  print "Country name not specified for $iso\n"
    unless $country_code_not_specified_reported {$iso}++ ;

  return ($country) ;
}
| 1787 | + |
# Read "$dir_process/$file_csv_browser_languages" and add its counts to %browser_languages.
#
# Input lines: "browser,language,count". Lines starting with '#' (comments) or
# ':' (csv header) are skipped. Counts are accumulated under the key "$browser,$language".
#
# Aborts when the file does not exist or cannot be opened (the open itself used
# to be unchecked, so an unreadable file silently produced no data).
sub ReadInputBrowserLanguages
{
  my $file_csv = "$dir_process/$file_csv_browser_languages" ;
  if (! -e $file_csv)
  { abort ("Function ReadInputBrowserLanguages: file $file_csv not found!!! ") ; }

  open (my $csv_browser_languages, '<', $file_csv)
    or abort ("Function ReadInputBrowserLanguages: file $file_csv could not be opened: $!") ;

  while ($line = <$csv_browser_languages>)
  {
    next if $line =~ /^#/ ; # comments
    next if $line =~ /^:/ ; # csv header (not a comment)

    chomp $line ;
    ($browser,$language,$count) = split (',', $line) ;

    $browser_languages {"$browser,$language"} += $count ;
  }
  close $csv_browser_languages ;
}
| 1806 | + |
# Derive percentage strings from the operating system and client counts.
#
#   $opsys_perc   {$key} = share of $opsys {$key} in $total_opsys_mobile + $total_opsys_non_mobile
#   $clients_perc {$key} = share of $clients {$key} in $total_clients
#   $clientgroups_perc {$key} = share of $clientgroups {$key} in $total_clients, but only for
#     groups above the threshold (0.005% for keys starting with 'M', else 0.02%)
# All are formatted as "n.nn%".
# Client groups at or below the threshold are added to $clientgroups_other {<part of the
# key before the first ','>} and their own count is set to 0.
#
# A total of 0 used to end the program with 'Illegal division by zero'; a share of
# a zero total is now taken to be 0.
sub CalcPercentages
{
  # percentage of $part in $whole, same arithmetic as before for a non-zero $whole
  my $share = sub
  {
    my ($part, $whole) = @_ ;
    return 0 if $whole == 0 ;
    return 100 * $part / $whole ;
  } ;

  my $total_opsys = $total_opsys_mobile + $total_opsys_non_mobile ;
  foreach $key (keys %opsys)
  { $opsys_perc {$key} = sprintf ("%.2f", $share -> ($opsys {$key}, $total_opsys)) . "%" ; }

  foreach $key (keys %clients)
  { $clients_perc {$key} = sprintf ("%.2f", $share -> ($clients {$key}, $total_clients)) . "%" ; }

  foreach $key (keys %clientgroups)
  {
    $perc = $share -> ($clientgroups {$key}, $total_clients) ;
    if ($key =~ /^M/)
    { $perc_threshold = 0.005 ; }
    else
    { $perc_threshold = 0.02 ; }

    if ($perc > $perc_threshold)
    { $clientgroups_perc {$key} = sprintf ("%.2f",$perc) . "%" ; }
    else
    {
      # too small to list separately: fold into 'other' for this mobile / non mobile class
      ($mobile,$group) = split (',', $key) ;
      $clientgroups_other {$mobile} += $clientgroups {$key} ;
      $clientgroups {$key} = 0 ;
    }
  }
}
| 1834 | + |
# Pass every collected count through Normalize and store the result in place.
#
# The targets are listed per input reader, in the order in which they have always
# been processed: a hash reference means 'normalize every value of that hash',
# a scalar reference means 'normalize that single total'.
sub NormalizeCounts
{
  my @targets =
  (
    # ReadInputClients
    \%engines, \%clientgroups, \%clients, \%clientgroups_other,
    \%total_clientgroups, \%total_engines, \%webkit_engines,
    \$total_clients, \$total_clients_mobile, \$total_clients_non_mobile,

    # ReadInputCrawlers
    \%crawlers,
    \$total_page_crawlerrequests,

    # ReadInputMethods
    \%statusses, \%methods,

    # ReadInputMimeTypes
    \%mimetypes, \%projects, \%domains, \%images_project, \%images_domain,
    \%mimetypes_found, \%counts_pm, \%counts_dm, \%counts_prem,
    \$total_mimes,

    # ReadInputOpSys
    \%opsys,
    \$total_opsys_non_mobile, \$total_opsys_mobile,

    # ReadInputOrigins
    \%origin_int_top, \%origin_int_top_split,
    \%origin_ext_top, \%origin_ext_top_split,
    \%origin_ext_page_top,
    \%project_int_top, \%project_int_top_split,
    \$total_page_requests_external, \$total_origins_external_counted,

    # ReadInputScripts
    \%actions, \%parms, \%scripts_php, \%scripts_js, \%scripts_css,

    # ReadInputGoogle
    \%searches_service, \%searches_crawlers,
    \%searches_toplevel, \%searches_toplevel_tld_found,
    \%searches_service_mimecat, \%searches_service_matches,
    \%searches_toplevel_mimecat, \%searches_mimecat_tld_not_found,

    # ReadInputSkins
    \%skins, \%skin_set,

    # ReadInputBrowserLanguages
    \%browser_languages,
  ) ;

  foreach my $target (@targets)
  {
    if (ref ($target) eq 'HASH')
    {
      # $key stays the (package level) loop variable it was in the original loops
      foreach $key (keys %$target)
      { $target -> {$key} = &Normalize ($target -> {$key}) ; }
    }
    else
    { $$target = &Normalize ($$target) ; }
  }
}
| 1963 | + |
# Build the sorted key lists that the report writers iterate over.
#
# Naming: '..._count' lists come from keys_sorted_by_value_num_desc, the other lists
# from keys_sorted_alpha_asc / keys_sorted_alpha_desc as written per line.
# Exceptions: mime types are ordered by descending SortMime value, and operating
# systems alphabetically without regard to case.
sub SortCounts
{
  # --- ReadInputClients ---
# @engines_sorted_count        = keys_sorted_by_value_num_desc (%engines) ;
  @engines_sorted_alpha        = keys_sorted_alpha_asc (%engines) ;
  @webkit_engines_sorted_alpha = keys_sorted_alpha_asc (%webkit_engines) ;
  @clientgroups_sorted_count   = keys_sorted_by_value_num_desc (%clientgroups) ;
  @clientgroups_sorted_alpha   = keys_sorted_alpha_asc (%clientgroups) ;
  @clients_sorted_count        = keys_sorted_by_value_num_desc (%clients) ;
  @clients_sorted_alpha        = keys_sorted_alpha_asc (%clients) ;

  # --- ReadInputCrawlers ---
# @crawlers_sorted_count = keys_sorted_by_value_num_desc (%crawlers) ;
# @crawlers_sorted_alpha = keys_sorted_alpha_asc (%crawlers) ;

  # --- ReadInputMethods ---
  @statusses_sorted_count  = keys_sorted_by_value_num_desc (%statusses) ;
  @statusses_sorted_method = keys_sorted_alpha_desc (%statusses) ;
  @methods_sorted_count    = keys_sorted_by_value_num_desc (%methods) ;
  @methods_sorted_method   = keys_sorted_alpha_desc (%methods) ;

  # --- ReadInputMimeTypes ---
  @mimetypes_sorted = sort {&SortMime ($b) <=> &SortMime ($a)} keys %mimetypes ;
  @projects_sorted  = keys_sorted_by_value_num_desc (%projects) ;
  @domains_sorted   = keys_sorted_by_value_num_desc (%domains) ;

  # --- ReadInputOpSys ---
  @opsys_sorted_alpha = sort {lc($a) cmp lc($b)} keys %opsys ;
  @opsys_sorted_count = keys_sorted_by_value_num_desc (%opsys) ;

  # --- ReadInputOrigins ---
  @origin_int_top_sorted_alpha      = keys_sorted_alpha_desc (%origin_int_top) ;
  @origin_ext_top_sorted_alpha      = keys_sorted_alpha_desc (%origin_ext_top) ;
  @origin_ext_page_top_sorted_alpha = keys_sorted_alpha_desc (%origin_ext_page_top) ;
  @origin_int_top_sorted_count      = keys_sorted_by_value_num_desc (%origin_int_top) ;
  @origin_ext_top_sorted_count      = keys_sorted_by_value_num_desc (%origin_ext_top) ;
  @origin_ext_page_top_sorted_count = keys_sorted_by_value_num_desc (%origin_ext_page_top) ;

  @project_int_top_sorted_alpha = keys_sorted_alpha_desc (%project_int_top) ;
  @project_int_top_sorted_count = keys_sorted_by_value_num_desc (%project_int_top) ;

  # --- ReadInputScripts ---
  @parms_sorted_count  = keys_sorted_by_value_num_desc (%parms) ;
  @parms_sorted_script = keys_sorted_alpha_desc (%parms) ;

  @scripts_php_sorted_count  = keys_sorted_by_value_num_desc (%scripts_php) ;
  @scripts_php_sorted_script = keys_sorted_alpha_asc (%scripts_php) ;
  @scripts_js_sorted_count   = keys_sorted_by_value_num_desc (%scripts_js) ;
  @scripts_js_sorted_script  = keys_sorted_alpha_asc (%scripts_js) ;
  @scripts_css_sorted_count  = keys_sorted_by_value_num_desc (%scripts_css) ;
  @scripts_css_sorted_script = keys_sorted_alpha_asc (%scripts_css) ;

  # --- ReadInputGoogle ---
  @searches_service_count         = keys_sorted_by_value_num_desc (%searches_service) ;
  @searches_service_alpha         = keys_sorted_alpha_desc (%searches_service) ;
  @searches_toplevel_count        = keys_sorted_by_value_num_desc (%searches_toplevel_tld_found) ;
  @searches_toplevel_alpha        = keys_sorted_alpha_asc (%searches_toplevel_tld_found) ;
  @searches_service_matches_alpha = keys_sorted_alpha_asc (%searches_service_matches) ;

  # --- ReadInputSkins ---
  @skins_sorted_skin = keys_sorted_alpha_asc (%skins) ;
}
| 2026 | + |
| 2027 | +sub WriteReportClients |
| 2028 | +{ |
| 2029 | + open FILE_HTML_CLIENTS, '>', "$dir_reports/$file_html_clients" ; |
| 2030 | + |
| 2031 | + $html = $header ; |
| 2032 | + $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Browsers e.a./ ; |
| 2033 | + $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Browsers e.a./ ; |
| 2034 | + $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ; |
| 2035 | + $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $dummy_browsers \/ $link_google/ ; |
| 2036 | + $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ; |
| 2037 | + |
| 2038 | + $html .= "<table border=1>\n" ; |
| 2039 | + $html .= "<tr><td class=l colspan=99 wrap>The following overview of page requests per client (~browser) application is based on the <a href='http://en.wikipedia.org/wiki/User_agent'>user agent</a> information that accompanies most server requests.<br>" . |
| 2040 | + "Please note that agent information does not follow strict guidelines and some programs may provide wrong information on purpose.<br>" . |
| 2041 | + "This report ignores all requests where agent information is missing, or contains any of the following: bot, crawl(er) or spider.<p>" . |
| 2042 | + "<b>Recommended reading:</b> <a href='http://en.wikipedia.org/wiki/Usage_share_of_web_browsers'>Wikipedia article</a> on usage share of web browsers and measurement methodology." . |
| 2043 | + "</td></tr>\n" ; |
| 2044 | + |
| 2045 | + # CLIENTS SORTED BY FREQUENCY |
| 2046 | + $html .= "<tr><td width=50% valign=top>" ; |
| 2047 | + $html .= "<table border=1 width=100%>\n" ; |
| 2048 | + $html .= "<tr><th colspan=99 class=l><h3>In order of popularity</h3></th></tr>\n" ; |
| 2049 | + |
| 2050 | + $html .= "<tr><th colspan=99 class=l> <br>Browsers, non mobile</th></tr>\n" ; |
| 2051 | + $perc_total = 0 ; |
| 2052 | + foreach $key (@clientgroups_sorted_count) |
| 2053 | + { |
| 2054 | + $count = $clientgroups {$key} ; |
| 2055 | + |
| 2056 | + next if $count == 0 ; |
| 2057 | + |
| 2058 | + $perc = $clientgroups_perc {$key} ; |
| 2059 | + ($mobile,$group) = split (',', $key) ; |
| 2060 | + |
| 2061 | + next if $mobile ne '-' ; |
| 2062 | + |
| 2063 | + $count = &FormatCount ($count) ; |
| 2064 | + $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2065 | + $perc =~ s/\%// ; |
| 2066 | + $perc_total += $perc ; |
| 2067 | + } |
| 2068 | + |
| 2069 | + $perc = ".." ; |
| 2070 | + $count = $clientgroups_other {'-'} ; |
| 2071 | + if ($total_clientgroups {'-'} + $total_clientgroups {'M'} > 0) |
| 2072 | + { |
| 2073 | + $perc = sprintf ("%.2f", 100 * $clientgroups_other {'-'} / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ; |
| 2074 | + $perc_total += $perc ; |
| 2075 | + } |
| 2076 | + $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ; |
| 2077 | + |
| 2078 | + $total = &FormatCount ($total_clientgroups {'-'}) ; |
| 2079 | + $perc_total = sprintf ("%.1f", $perc_total) ; |
| 2080 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ; |
| 2081 | + |
| 2082 | + $html .= "<tr><th colspan=99 class=l> <br>Browsers, mobile</th></tr>\n" ; |
| 2083 | + foreach $key (@clientgroups_sorted_count) |
| 2084 | + { |
| 2085 | + $count = $clientgroups {$key} ; |
| 2086 | + |
| 2087 | + next if $count == 0 ; |
| 2088 | + |
| 2089 | + $perc = $clientgroups_perc {$key} ; |
| 2090 | + ($mobile,$group) = split (',', $key) ; |
| 2091 | + |
| 2092 | + next if $mobile ne 'M' ; |
| 2093 | + |
| 2094 | + $count = &FormatCount ($count) ; |
| 2095 | + $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2096 | + $perc =~ s/\%// ; |
| 2097 | + } |
| 2098 | + $count = $clientgroups_other {'M'} ; |
| 2099 | + |
| 2100 | + $perc = ".." ; |
| 2101 | + if ($total_clientgroups {'-'} + $total_clientgroups {'M'} > 0) |
| 2102 | + { $perc = sprintf ("%.2f", 100 * $count / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ; } |
| 2103 | + |
| 2104 | + $perc_total = sprintf ("%.1f", (100 - $perc_total)) ; |
| 2105 | + $total = &FormatCount ($total_clientgroups {'M'}) ; |
| 2106 | + $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ; |
| 2107 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ; |
| 2108 | + |
| 2109 | + $html .= "<tr><th colspan=99 class=l> <br>Browser versions, non mobile</th></tr>\n" ; |
| 2110 | + |
| 2111 | + foreach $key (@clients_sorted_count) |
| 2112 | + { |
| 2113 | + $count = $clients {$key} ; |
| 2114 | + ($rectype, $client) = split (',', $key,2) ; |
| 2115 | + |
| 2116 | + next if $rectype ne '-' ; # group |
| 2117 | + |
| 2118 | + $perc = $clients_perc {$key} ; |
| 2119 | + |
| 2120 | + next if $perc lt "0.02%" ; |
| 2121 | + |
| 2122 | + $count = &FormatCount ($count) ; |
| 2123 | + $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2124 | + $perc =~ s/\%// ; |
| 2125 | + } |
| 2126 | + $total = &FormatCount ($total_clients_non_mobile) ; |
| 2127 | + |
| 2128 | + $perc_total = sprintf ("%.1f", (100 - $perc_total)) ; |
| 2129 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ; |
| 2130 | + |
| 2131 | + $html .= "<tr><th colspan=99 class=l> <br>Browser versions, mobile</th></tr>\n" ; |
| 2132 | + foreach $key (@clients_sorted_count) |
| 2133 | + { |
| 2134 | + $count = $clients {$key} ; |
| 2135 | + ($rectype, $client) = split (',', $key,2) ; |
| 2136 | + |
| 2137 | + next if $rectype ne 'M' ; # group |
| 2138 | + |
| 2139 | + $perc = $clients_perc {$key} ; |
| 2140 | + |
| 2141 | + next if $perc lt "0.02%" ; |
| 2142 | + |
| 2143 | + $count = &FormatCount ($count) ; |
| 2144 | + $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2145 | + } |
| 2146 | + $total = &FormatCount ($total_clients_mobile) ; |
| 2147 | + $perc = sprintf ("%.1f", (100 - $perc_total)) ; |
| 2148 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc\%</th></tr>\n" ; |
| 2149 | + |
| 2150 | + $html .= "</table>\n" ; |
| 2151 | + |
| 2152 | + # CLIENTS In alphabetical order |
| 2153 | + $html .= "</td><td width=50% valign=top>" ; |
| 2154 | + $html .= "<table border=1 width=100%>\n" ; |
| 2155 | + $html .= "<tr><th colspan=99 class=l><h3>In alphabetical order</h3></th></tr>\n" ; |
| 2156 | + |
| 2157 | + $html .= "<tr><th colspan=99 class=l> <br>Browsers, non mobile</th></tr>\n" ; |
| 2158 | + $perc_total = 0 ; |
| 2159 | + foreach $key (@clientgroups_sorted_alpha) |
| 2160 | + { |
| 2161 | + $count = $clientgroups {$key} ; |
| 2162 | + |
| 2163 | + next if $count == 0 ; |
| 2164 | + |
| 2165 | + $perc = $clientgroups_perc {$key} ; |
| 2166 | + ($mobile,$group) = split (',', $key) ; |
| 2167 | + |
| 2168 | + next if $mobile ne '-' ; |
| 2169 | + |
| 2170 | + $count = &FormatCount ($count) ; |
| 2171 | + $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2172 | + $perc =~ s/\%// ; |
| 2173 | + $perc_total += $perc ; |
| 2174 | + } |
| 2175 | + |
| 2176 | + $count = $clientgroups_other {'-'} ; |
| 2177 | + $total = &FormatCount ($total_clientgroups {'-'}) ; |
| 2178 | + $perc = ".." ; |
| 2179 | + if ($total_clientgroups {'-'} + $total_clientgroups {'M'} > 0) |
| 2180 | + { $perc = sprintf ("%.2f", 100 * $count / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ; } |
| 2181 | + $perc_total += $perc ; |
| 2182 | + $perc_total = sprintf ("%.1f", $perc_total) ; |
| 2183 | + $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ; |
| 2184 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ; |
| 2185 | + |
| 2186 | + $html .= "<tr><th colspan=99 class=l> <br>Browsers, mobile</th></tr>\n" ; |
| 2187 | + foreach $key (@clientgroups_sorted_alpha) |
| 2188 | + { |
| 2189 | + $count = $clientgroups {$key} ; |
| 2190 | + |
| 2191 | + next if $count == 0 ; |
| 2192 | + |
| 2193 | + $perc = $clientgroups_perc {$key} ; |
| 2194 | + ($mobile,$group) = split (',', $key) ; |
| 2195 | + |
| 2196 | + next if $mobile ne 'M' ; |
| 2197 | + |
| 2198 | + $count = &FormatCount ($count) ; |
| 2199 | + $html .= "<tr><td class=l>$group</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2200 | + $perc =~ s/\%// ; |
| 2201 | + } |
| 2202 | + $count = $clientgroups_other {'M'} ; |
| 2203 | + $total = &FormatCount ($total_clientgroups {'M'}) ; |
| 2204 | + $perc = sprintf ("%.2f", 100 * $count / ($total_clientgroups {'-'} + $total_clientgroups {'M'})) ; |
| 2205 | + $perc_total = sprintf ("%.1f", (100 - $perc_total)) ; |
| 2206 | + $html .= "<tr><td class=l>Other</th><td class=r>$count</td><td class=r>$perc\%</td></tr>\n" ; |
| 2207 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc_total\%</th></tr>\n" ; |
| 2208 | + |
| 2209 | + $html .= "<tr><th colspan=99 class=l> <br>Browser versions, non mobile</th></tr>\n" ; |
| 2210 | + |
| 2211 | + foreach $key (@clients_sorted_alpha) |
| 2212 | + { |
| 2213 | + $count = $clients {$key} ; |
| 2214 | + ($rectype, $client) = split (',', $key,2) ; |
| 2215 | + |
| 2216 | + next if $rectype ne '-' ; # group |
| 2217 | + |
| 2218 | + $perc = $clients_perc {$key} ; |
| 2219 | + |
| 2220 | + next if $perc lt "0.02%" ; |
| 2221 | + |
| 2222 | + $count = &FormatCount ($count) ; |
| 2223 | + $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2224 | + } |
| 2225 | + $total = &FormatCount ($total_clients_non_mobile) ; |
| 2226 | + $perc = sprintf ("%.1f",100*$total_clients_non_mobile / ($total_clients_mobile + $total_clients_non_mobile)) ; |
| 2227 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc\%</th></tr>\n" ; |
| 2228 | + |
| 2229 | + $html .= "<tr><th colspan=99 class=l> <br>Browser versions, mobile</th></tr>\n" ; |
| 2230 | + foreach $key (@clients_sorted_alpha) |
| 2231 | + { |
| 2232 | + $count = $clients {$key} ; |
| 2233 | + ($rectype, $client) = split (',', $key,2) ; |
| 2234 | + |
| 2235 | + next if $rectype ne 'M' ; # group |
| 2236 | + |
| 2237 | + $perc = $clients_perc {$key} ; |
| 2238 | + |
| 2239 | + next if $perc lt "0.02%" ; |
| 2240 | + |
| 2241 | + $count = &FormatCount ($count) ; |
| 2242 | + $html .= "<tr><td class=l>$client</a></td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ; |
| 2243 | + } |
| 2244 | + $total = &FormatCount ($total_clients_mobile) ; |
| 2245 | + $perc = sprintf ("%.1f",100*$total_clients_mobile / ($total_clients_mobile + $total_clients_non_mobile)) ; |
| 2246 | + $html .= "<tr><th class=l>Total</th><th class=r>$total</th><th class=r>$perc\%</th></tr>\n" ; |
| 2247 | + |
| 2248 | + $html .= "<tr><th colspan=99 class=l> <br>Browser engines</th></tr>\n" ; |
| 2249 | + |
| 2250 | + $engine_prev = "" ; |
| 2251 | + foreach $engine (@webkit_engines_sorted_alpha) |
| 2252 | + { |
| 2253 | + $total = $webkit_engines {$engine} ; |
| 2254 | + |
| 2255 | + next if $total < 5 ; |
| 2256 | + |
| 2257 | + $engine2 = $engine ; |
| 2258 | + $engine2 =~ s/\/.*$// ; |
| 2259 | + $engine2 =~ s/ .*$// ; |
| 2260 | + if (($engine2 ne $engine_prev) && ($engine_prev ne "")) |
| 2261 | + { |
| 2262 | + $total_engine = $total_engines {$engine_prev} ; |
| 2263 | + $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ; |
| 2264 | + $total_engine = &FormatCount ($total_engine) ; |
| 2265 | + $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ; |
| 2266 | + } |
| 2267 | + $engine_prev = $engine2 ; |
| 2268 | + $total = &FormatCount ($total) ; |
| 2269 | + $html .= "<tr><td class=l>$engine</td><td class=r>$total</td><td class=r> </td></tr>\n" ; |
| 2270 | + } |
| 2271 | + $total_engine = $total_engines {$engine_prev} ; |
| 2272 | + $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ; |
| 2273 | + $total_engine = &FormatCount ($total_engine) ; |
| 2274 | + $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ; |
| 2275 | + |
| 2276 | + $engine_prev = "" ; |
| 2277 | + foreach $engine (@engines_sorted_alpha) |
| 2278 | + { |
| 2279 | + $total = $engines {$engine} ; |
| 2280 | + |
| 2281 | + next if $total < 5 ; |
| 2282 | + |
| 2283 | + $engine2 = $engine ; |
| 2284 | + $engine2 =~ s/\/.*$// ; |
| 2285 | + $engine2 =~ s/ .*$// ; |
| 2286 | + if (($engine2 ne $engine_prev) && ($engine_prev ne "")) |
| 2287 | + { |
| 2288 | + $total_engine = $total_engines {$engine_prev} ; |
| 2289 | + $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ; |
| 2290 | + $total_engine = &FormatCount ($total_engine) ; |
| 2291 | + $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ; |
| 2292 | + } |
| 2293 | + $engine_prev = $engine2 ; |
| 2294 | + $total = &FormatCount ($total) ; |
| 2295 | + $html .= "<tr><td class=l>$engine</td><td class=r>$total</td><td class=r> </td></tr>\n" ; |
| 2296 | + } |
| 2297 | + $total_engine = $total_engines {$engine_prev} ; |
| 2298 | + $perc_engine = sprintf ("%.1f", 100 * $total_engine / ($total_clients_mobile + $total_clients_non_mobile)) ; |
| 2299 | + $total_engine = &FormatCount ($total_engine) ; |
| 2300 | + $html .= "<tr><th class=l>Total</th><th class=r>$total_engine</th><th class=r>$perc_engine\%</th></tr>\n" ; |
| 2301 | + |
| 2302 | + $html .= "</table>\n" ; |
| 2303 | + $html .= "</td></tr>\n" ; |
| 2304 | + |
| 2305 | + $html .= "<tr><td colspan=99 class=l wrap>Requests from mobile devices are recognized as follows:<br>" . |
| 2306 | + "Agent string contains any of the following terms (last upd: $month_upd_keywords_mobile):<br>" . |
| 2307 | + "<i>$keywords_mobile</i></td></tr>" ; |
| 2308 | + |
| 2309 | + $html .= "</table>\n" ; |
| 2310 | + |
| 2311 | +# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ; |
| 2312 | + $html .= $colophon ; |
| 2313 | + |
| 2314 | + print FILE_HTML_CLIENTS $html ; |
| 2315 | + close FILE_HTML_CLIENTS ; |
| 2316 | +} |
| 2317 | + |
# Helper for WriteReportCrawlers.
# Returns a copy of the given user agent string in which everything that looks
# like a mail address is replaced by a grey 'mail address' placeholder.
# Two notations are masked: 'name@host' and the spelled out 'name at host dot tld'.
sub CrawlerReportMaskMailAddresses
{
  my ($agent) = @_ ;

  $agent =~ s/([^,;\(\)\s]+?\@[^,;\(\)\s]+)/ <font color=#808080>mail address<\/font> /g ;
  $agent =~ s/([\w-]+\s*.?at.?\s*[\w-]+\s*.?dot.?\s*[\w-]+)/ <font color=#808080>mail address<\/font> /gi ;

  return $agent ;
}

# Helper for WriteReportCrawlers.
# Returns the first 'http:...' url found in the given user agent string,
# cut off at the first delimiter (comma, semicolon, parenthesis, angle bracket or white space).
# Returns "-" when the agent string contains no 'http:' at all.
sub CrawlerReportExtractSite
{
  my ($agent) = @_ ;

  return "-" if $agent !~ /http:/ ;

  my $site = $agent ;
  $site =~ s/^.*?http:/http:/ ;
  # NOTE(review): as written the next two substitutions replace a character by itself (no-ops).
  # Presumably they once decoded html entities before the delimiter cut below - confirm against the original file.
  $site =~ s/>/>/gi ;
  $site =~ s/</</gi ;
  $site =~ s/^(.*?)[,;\)\<\>\s)].*$/$1/ ;

  return $site ;
}

# Writes the html report on crawler (bot) requests to "$dir_reports/$file_html_crawlers".
#
# Input  (globals): %crawlers, keyed by 'mime type|user agent', value = request count;
#                   $header, $colophon, $google_ip_ranges and the $link_.. / $dummy_.. navigation snippets;
#                   $total_page_requests_external, $total_page_crawlerrequests.
# Output (globals): %secondary_domains, %crawlers_per_domain, %crawlers_no_url, %crawlers_no_url_agent
#                   are added to (not reset here), $total_crawlers, $total_crawlers_url,
#                   $total_crawlers_no_url end up as formatted strings, $rows is incremented per detail row.
#
# The report has two tables:
# 1 crawlers that specify a url in their agent string, grouped by secondary domain
# 2 crawlers without url, grouped by agent string, with counts per mime type
sub WriteReportCrawlers
{
  # Without an output file there is nothing useful to do: say so instead of silently printing to a closed handle
  if (! open FILE_HTML_CRAWLERS, '>', "$dir_reports/$file_html_crawlers")
  {
    warn "WriteReportCrawlers: could not open '$dir_reports/$file_html_crawlers' for writing: $!\n" ;
    return ;
  }

  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Crawler requests/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Crawler requests/ ;
  $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $dummy_crawlers \/ $link_opsys \/ $dummy_browsers \/ $link_google/ ;
  $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  # Introduction; placeholder PERC_GOOGLE is filled in below, once the totals are known
  $html .= "<table border=1>\n" ;
  $html .= "<tr><td class=l colspan=99>The following overview of crawler (aka bot) page requests is based on the <a href='http://en.wikipedia.org/wiki/User_agent'>user agent</a> information that accompanies most server requests." .
           " Unfortunately this user agent information follows rather loosely defined guidelines." .
           "<br>Also please bear in mind than the most popular crawler names may be somewhat overrepresented." .
           " This is the result of so called <i>user agent spoofing</i> (where a requester supplies false credentials, e.g. to bypass web servers filters)." .
           "<br>GoogleBot seems to be a favorite for spoofing. Therefore requests from an ip address registered by Google (see below) are color coded <b><font color=green>GoogleBot</font></b>, others <b><font color=red>GoogleBot</font></b>" .
           "<p>For this report page requests are considered to be issued by a crawler in two cases:" .
           "<br>1 The user agent string contains a web address (only crawlers should have that, but there a some false positives, " .
           " where a browser sends a user agent string with a web address (ill behaved plug-in, main offenders have been eliminated)" .
           "<br>2 The user agent string contains the term bot, spider or crawl[er]'" .
           "PERC_GOOGLE\n" .
           "</td></tr>\n" ;

  # PASS 1: distribute all counts over secondary domains (url found) or bare agent strings (no url found)
  # keys_sorted_by_value_num_desc is defined outside this sub, presumably it yields the keys in descending order of count
  $total_crawlers = 0 ;
  foreach $mime_agent (keys_sorted_by_value_num_desc (%crawlers))
  {
    $count = $crawlers {$mime_agent} ;
    ($mime, $agent) = split ('\|', $mime_agent, 2) ;
    $agent = &CrawlerReportMaskMailAddresses ($agent) ;
    $site  = &CrawlerReportExtractSite ($agent) ;

    # Emphasize the url, but only when there is one:
    # unguarded, the placeholder "-" would match (and embolden) the first hyphen in every url-less agent string
    if ($site ne "-")
    { $agent =~ s/\Q$site\E/<b>$site<\/b>/ ; }

    $secondary_domain = &GetSecondaryDomain ($site) ;
    # Agent claims to be GoogleBot but is marked red (per the intro text: not from a Google ip address): list separately as 'google?'
    if (($secondary_domain eq "google") and ($agent =~ /color=red>GoogleBot</))
    { $secondary_domain .= "?" ; }

    $secondary_domains {$secondary_domain} += $count ;

    if ($secondary_domain ne "-")
    { $crawlers_per_domain {$secondary_domain} {$mime_agent} += $count ; }
    else
    {
      $crawlers_no_url {$agent} {$mime} += $count ;
      $crawlers_no_url_agent {$agent} += $count ;
    }

    $total_crawlers += $count ;
  }

  $perc_crawlers = ".." ;
  if ($total_page_requests_external > 0)
  { $perc_crawlers = sprintf ("%.1f",100 * $total_page_crawlerrequests/$total_page_requests_external) ; }

  $total_page_requests_external2 = &FormatCount ($total_page_requests_external*1000) ;
  $total_page_crawlerrequests2   = &FormatCount ($total_page_crawlerrequests*1000) ;
  $html =~ s/PERC_GOOGLE/<p>In total $total_page_crawlerrequests2 page requests (mime type <a href='SquidReportRequests.htm'>text\/html<\/a> only!) per day are considered crawler requests, out of $total_page_requests_external2 external requests, which is $perc_crawlers%/ ;

  $total_crawlers = &FormatCount ($total_crawlers) ;

  # TABLE 1: CRAWLERS THAT SPECIFY A URL, PER SECONDARY DOMAIN
  $total_crawlers_url = 0 ; # start from zero, like $total_crawlers_no_url below
  $html .= "<tr><th class=lh3 colspan=99>Page requests for crawlers that specify a url in the agent string</th></tr>\n" ;
  $html .= "<tr><th class=l>Count<br><small>x 1000</small></th><th class=l>Secondary domain<br>(~site) name</th><th class=l>URL</th><th class=l>Mime type</th><th class=l>User agent</th></tr>\n" ;
  foreach $secondary_domain (keys_sorted_by_value_num_desc (%secondary_domains))
  {
    next if $secondary_domain eq "-" ;

    $total = $secondary_domains {$secondary_domain} ;
    $total_crawlers_url += $total ;

    # NOTE(review): the running total is updated before this cut-off, so the 'total' row
    # covers the listed domains plus the first domain below the threshold - confirm this is intended
    last if $total < 10 ;

    $total = &FormatCount ($total) ;
    $html .= "<tr><th class=r>$total</th><th class=l colspan=99>$secondary_domain</th></tr>\n" ;
    foreach $mime_agent (sort {$crawlers_per_domain {$secondary_domain} {$b} <=> $crawlers_per_domain {$secondary_domain} {$a}} keys %{$crawlers_per_domain {$secondary_domain}})
    {
      $count = $crawlers_per_domain {$secondary_domain} {$mime_agent} ;

      next if $count <= 2 ;

      ($mime, $agent) = split ('\|', $mime_agent, 2) ;
      $agent = &CrawlerReportMaskMailAddresses ($agent) ;
      $site  = &CrawlerReportExtractSite ($agent) ;

      # The url gets a column of its own, in the agent string a short marker suffices
      if ($site ne "-")
      { $agent =~ s/\Q$site\E/<b>url<\/b>/ ; }

      $count = &FormatCount ($count) ;
      ($site2 = $site) =~ s/^http:\/\/// ;
      # The url stems from a user supplied agent string: a single quote would end the href attribute prematurely
      (my $href = $site) =~ s/'/%27/g ;
      $html .= "<tr><td class=r>$count</td><td class=l> </td><td class=l><a href='$href' rel='nofollow'>$site2</a></td><td class=l>$mime</td><td class=l>$agent</td></tr>\n" ;
      $rows++ ;
    }
  }
  $total_crawlers_url = &FormatCount ($total_crawlers_url) ;
  $html .= "<tr><th class=l>$total_crawlers_url</th><th class=l colspan=99>total</th></tr>\n" ;
  $html .= "</table><p>\n" ;

  # TABLE 2: CRAWLERS WITHOUT URL, PER AGENT STRING
  $total_crawlers_no_url = 0 ;
  $html .= "<table border=1>\n" ;
  $html .= "<tr><th class=lh3 colspan=99>Page requests for probable crawlers, recognized by keyword</th></tr>\n" ;
  $html .= "<tr><th class=l width=40>Count<br><small>x 1000</small></th><th class=l colspan=99>Agent string</th></tr>\n" ;
  $html .= "<tr><th class=l width=40> </th><th class=l width=40> </th><th class=l>Mime type (count ≥ 3)</th></tr>\n" ;
  foreach $agent (keys_sorted_by_value_num_desc (%crawlers_no_url_agent))
  {
    $total = $crawlers_no_url_agent {$agent} ;
    $total_crawlers_no_url += $total ;

    # NOTE(review): same cut-off behaviour as in table 1, the first agent below the threshold is still counted
    last if $total < 3 ;

    $total = &FormatCount ($total) ;
    $html .= "<tr><th class=r>$total</th><td class=l colspan=99>$agent</td></tr>\n" ;

    # Mail addresses were already masked in pass 1, before $agent became a hash key:
    # $agent must stay untouched here or the lookups below could miss
    foreach $mime (sort {$crawlers_no_url {$agent} {$b} <=> $crawlers_no_url {$agent} {$a}} keys %{$crawlers_no_url {$agent}})
    {
      $count = &FormatCount ($crawlers_no_url {$agent} {$mime}) ;
      $html .= "<tr><td class=r>$count</td><td> </td><td class=l colspan=99>$mime</td></tr>\n" ;
      $rows++ ;
    }
  }

  $total_crawlers_no_url = &FormatCount ($total_crawlers_no_url) ;
  $html .= "<tr><th class=l>$total_crawlers_no_url</th><th class=l colspan=99>total</th></tr>\n" ;
  $html .= "</table><p>\n" ;

  $html .= "<p>$google_ip_ranges" ;
  $html .= $colophon ;

  print FILE_HTML_CRAWLERS $html ;
  close FILE_HTML_CRAWLERS ;
}
| 2491 | + |
# Writes the html report on request methods to "$dir_reports/$file_html_methods".
#
# Input (globals): %methods   (request count per method) with key lists @methods_sorted_count / @methods_sorted_method,
#                  %statusses (request count per 'method,result') with key lists @statusses_sorted_count / @statusses_sorted_method,
#                  $header, $colophon and the $link_.. / $dummy_.. navigation snippets.
#
# The same data are shown twice, side by side: left in order of request volume, right in alphabetical order.
# Each side lists totals per method first, then totals per method + result.
sub WriteReportMethods
{
  # Without an output file there is nothing useful to do: say so instead of silently printing to a closed handle
  if (! open FILE_HTML_METHODS, '>', "$dir_reports/$file_html_methods")
  {
    warn "WriteReportMethods: could not open '$dir_reports/$file_html_methods' for writing: $!\n" ;
    return ;
  }

  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Request Methods/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Request Methods/ ;
  $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $dummy_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  # Borderless outer table, only used to position the two data tables next to each other
  $html .= "<table border=0>\n" ;
  $html .= "<tr><td>" ;

  # LEFT: IN ORDER OF REQUEST VOLUME
  $html .= "<table border=1>\n" ;
  $html .= "<tr><th colspan=99 class=l><h3>In order of request volume</h3></th></tr>\n" ;
  $html .= "<tr><th colspan=2 class=l>Method</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ;
  $total_methods = 0 ;
  foreach $method (@methods_sorted_count)
  {
    $total = $methods {$method} ;
    $total_methods += $total ;
    $total = &FormatCount ($total) ;
    $html .= "<tr><td colspan=2 class=l>$method</td><td class=r>$total</td></tr>\n" ;
  }
  # From here on $total_methods holds the formatted grand total, it is reused as is in the right hand table
  $total_methods = &FormatCount ($total_methods) ;
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_methods</th></tr>\n" ;
  $html .= "<tr><td colspan=99> </td></tr>\n" ;
  # First cell was opened as <td> but closed as </th>: now a proper header cell, like the same row in the right hand table
  $html .= "<tr><th class=l>Method</th><th class=l>Result</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $total_statusses = 0 ;
  foreach $status (@statusses_sorted_count)
  {
    $total = $statusses {$status} ;
    $total_statusses += $total ;
    $total = &FormatCount ($total) ;
    ($method,$result) = split (',', $status, 2) ;

    $html .= "<tr><td class=l>$method</td><td class=l>$result</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;
  }
  # Likewise $total_statusses is kept in formatted form for the right hand table
  $total_statusses = &FormatCount ($total_statusses) ;
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_statusses</th></tr>\n" ;
  $html .= "</table>\n" ;

  $html .= "</td><td> </td><td>" ;

  # RIGHT: IN ALPHABETICAL ORDER
  $html .= "<table border=1>\n" ;
  $html .= "<tr><th colspan=99 class=l><h3>In alphabetical order: method+result</h3></th></tr>\n" ;
  $html .= "<tr><th colspan=2 class=l>Method</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  # Counter restarts here, so 'rows written' below only reflects the method+result rows of this right hand table
  $rows = 0 ;
  foreach $method (@methods_sorted_method)
  {
    $total = &FormatCount ($methods {$method}) ;
    $html .= "<tr><td colspan=2 class=l>$method</td><td class=r>$total</td></tr>\n" ;
  }
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_methods</th></tr>\n" ;
  $html .= "<tr><td colspan=99> </td></tr>\n" ;
  $html .= "<tr><th class=l>Method</th><th class=l>Result</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  foreach $status (@statusses_sorted_method)
  {
    $total = &FormatCount ($statusses {$status}) ;
    ($method,$result) = split (',', $status, 2) ;

    $html .= "<tr><td class=l>$method</td><td class=l>$result</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;
  }
  $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_statusses</th></tr>\n" ;
  $html .= "</table>\n" ;

  $html .= "</td></tr></table>\n" ;
  $html .= " <small>$rows rows written</small><p>" ;

  $html .= $colophon ;

  print FILE_HTML_METHODS $html ;
  close FILE_HTML_METHODS ;
}
| 2571 | + |
| 2572 | +sub WriteReportMimeTypes |
| 2573 | +{ |
| 2574 | + open FILE_HTML_REQUESTS, '>', "$dir_reports/$file_html_requests" ; |
| 2575 | + |
| 2576 | + $html = $header ; |
| 2577 | + $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by destination/ ; |
| 2578 | + $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Requests by destination/ ; |
| 2579 | + $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ; |
| 2580 | + $html =~ s/NOTES/<br> This report shows where requests are sent to. Report 'Requests by origin' shows where requests come from.<br> Those numbers bear no direct relation.<br>/ ; |
| 2581 | + $html =~ s/LINKS/$dummy_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ; |
| 2582 | + $html .= "<table border=1>\n" ; |
| 2583 | + |
| 2584 | + $header1 = "<tr><th colspan=2 class=l><small>x 1000</small></th><th colspan=2 class=c>Totals</th><th class=c><font color=#008000>Pages</font></th><th colspan=3 class=c><font color=#900000>Images</font></th><th colspan=99 class=c>Other</th></tr>\n" ; |
| 2585 | + $header2 = "<tr><th colspan=2 class=l> </th><th class=c>total<br>all</th><th class=c><font color=#900000>total<br>images</font></th>\n" ; |
| 2586 | + $columns = 0 ; |
| 2587 | + foreach $mimetype (@mimetypes_sorted) |
| 2588 | + { |
| 2589 | + $columns++ ; |
| 2590 | + |
| 2591 | + next if $mimetypes_found {$mimetype} < $threshold_mime ; |
| 2592 | + |
| 2593 | + $mimetype2 = $mimetype ; |
| 2594 | + if ($mimetype2 eq "text/html") |
| 2595 | + { $mimetype2 .= "<br><small>(page)</small> " ; } |
| 2596 | + if ($mimetype2 =~ /image\/(?:png|jpeg|gif)/) |
| 2597 | + { $mimetype2 .= "<br><small>(img)</small> " ; } |
| 2598 | + if ($columns == 1) |
| 2599 | + { $mimetype2 = "<font color=#008000>$mimetype2</font" ; } |
| 2600 | + if (($columns >= 2) && ($columns <= 4)) |
| 2601 | + { $mimetype2 = "<font color=#900000>$mimetype2</font" ; } |
| 2602 | + ($mime1,$mime2) = split ('\/', $mimetype2, 2) ; |
| 2603 | + $header2 .= "<th class=c>$mime1<br>$mime2</th>\n" ; |
| 2604 | + } |
| 2605 | + $header2 .= "</tr>\n" ; |
| 2606 | + $html .= $header1 . $header2 ; |
| 2607 | + |
| 2608 | + $rows = 0 ; |
| 2609 | + $total_mimes2 = 0 ; |
| 2610 | + $total_images1 = 0 ; |
| 2611 | + foreach $domain (@domains_sorted) |
| 2612 | + { |
| 2613 | + $html .= "<tr><td colspan=2 class=l>" . ucfirst($domain) . "</td>\n" ; |
| 2614 | + $total = $domains {$domain} ; |
| 2615 | + $total_mimes2 += $total ; |
| 2616 | + $total = &FormatCount ($total) ; |
| 2617 | + $total_images = $images_domain {$domain} ; |
| 2618 | + $total_images1 += $total_images ; |
| 2619 | + $total_images = &FormatCount ($total_images) ; |
| 2620 | + $total_images = "<font color=#900000>" . &FormatCount ($total_images) . "</font>" ; |
| 2621 | + |
| 2622 | + $html .= "<th class=r>$total</th><th class=r>$total_images</th>\n" ; |
| 2623 | + $columns = 0 ; |
| 2624 | + foreach $mimetype (@mimetypes_sorted) |
| 2625 | + { |
| 2626 | + $columns++ ; |
| 2627 | + |
| 2628 | + next if $mimetypes_found {$mimetype} < $threshold_mime ; |
| 2629 | + |
| 2630 | + $count = &FormatCount ($counts_dm {"$domain,$mimetype"}) ; |
| 2631 | + if ($columns == 1) |
| 2632 | + { $count = "<font color=#008000>$count</font" ; } |
| 2633 | + if (($columns >= 2) && ($columns <= 4)) |
| 2634 | + { $count = "<font color=#900000>$count</font" ; } |
| 2635 | + if ($count eq "") |
| 2636 | + { $count = " " ; } |
| 2637 | + $html .= "<td class=r>$count</td>\n" ; |
| 2638 | + } |
| 2639 | + $html .= "</tr>\n" ; |
| 2640 | + $rows++ ; |
| 2641 | + } |
| 2642 | + |
| 2643 | + if ($total_mimes != $total_mimes2) |
| 2644 | + { |
| 2645 | + print ERR "total_mimes $total_mimes != total_mimes2 $total_mimes2\n" ; |
| 2646 | + print "total_mimes $total_mimes != total_mimes2 $total_mimes2\n" ; |
| 2647 | + } |
| 2648 | + |
| 2649 | + $total_mimes1 = &FormatCount ($total_mimes) ; |
| 2650 | + $total_images1 = &FormatCount ($total_images1) ; |
| 2651 | + $total_images1 = "<font color=#900000>" . &FormatCount ($total_images1) . "</font>" ; |
| 2652 | + $html .= "<tr><th colspan=2 class=l>Total</th><th class=c>$total_mimes1</th><th class=c>$total_images1</th>\n" ; |
| 2653 | + $columns = 0 ; |
| 2654 | + foreach $mimetype (@mimetypes_sorted) |
| 2655 | + { |
| 2656 | + $columns++ ; |
| 2657 | + |
| 2658 | + next if $mimetypes_found {$mimetype} < $threshold_mime ; |
| 2659 | + |
| 2660 | + $count = &FormatCount ($mimetypes {$mimetype}) ; |
| 2661 | + if ($columns == 1) |
| 2662 | + { $count = "<font color=#008000>$count</font" ; } |
| 2663 | + if (($columns >= 2) && ($columns <= 4)) |
| 2664 | + { $count = "<font color=#900000>$count</font" ; } |
| 2665 | + $html .= "<th class=r>$count</th>\n" ; |
| 2666 | + } |
| 2667 | + $html .= "</tr>\n" ; |
| 2668 | + |
| 2669 | + $html .= "<tr><th colspan=99> </th></tr>\n" ; |
| 2670 | + $html .= "<tr><td colspan=99 class=l><b>Per project / language subproject</b> (top 50)</td></tr>\n" ; |
| 2671 | + $total_mimes3 = 0 ; |
| 2672 | + $total_mimes4 = 0 ; |
| 2673 | + $cnt_projects = 0 ; |
| 2674 | + foreach $project (@projects_sorted) |
| 2675 | + { |
| 2676 | + last if ++ $cnt_projects > 50 ; |
| 2677 | + |
| 2678 | + $total = $projects {$project} ; |
| 2679 | + $total_mimes3 += $total ; |
| 2680 | + |
| 2681 | + next if $total < $threshold_project ; |
| 2682 | + |
| 2683 | + $total_mimes4 += $total ; |
| 2684 | + ($domain,$language) = split ('\:', $project,2) ; |
| 2685 | + $html .= "<tr><td class=l>" . ucfirst($domain) . "</td><td class=l>$language</td>\n" ; |
| 2686 | + |
| 2687 | + $total = &FormatCount ($total) ; |
| 2688 | + $total_images = $images_project {$project} ; |
| 2689 | + $total_images = "<font color=#900000>" . &FormatCount ($total_images) . "</font>" ; |
| 2690 | + $html .= "<th class=r>$total</th><th class=r>$total_images</th>\n" ; |
| 2691 | + |
| 2692 | + $columns = 0 ; |
| 2693 | + foreach $mimetype (@mimetypes_sorted) |
| 2694 | + { |
| 2695 | + $columns++ ; |
| 2696 | + |
| 2697 | + next if $mimetypes_found {$mimetype} < $threshold_mime ; |
| 2698 | + |
| 2699 | + $count = &FormatCount ($counts_pm {"$project,$mimetype"}) ; |
| 2700 | + if ($columns == 1) |
| 2701 | + { $count = "<font color=#008000>$count</font" ; } |
| 2702 | + if (($columns >= 2) && ($columns <= 4)) |
| 2703 | + { $count = "<font color=#900000>$count</font" ; } |
| 2704 | +# if ($count eq "") |
| 2705 | +# { $count = " " ; } |
| 2706 | + $html .= "<td class=r>$count</td>\n" ; |
| 2707 | + } |
| 2708 | + $html .= "</tr>\n" ; |
| 2709 | + $rows++ ; |
| 2710 | + } |
| 2711 | + $html .= $header2 . $header1 ; |
| 2712 | + $html .= "</table>\n" ; |
| 2713 | + $html .= " <small>$rows rows written</small><p>" ; |
| 2714 | + |
| 2715 | + if ($total_mimes != $total_mimes3) |
| 2716 | + { |
| 2717 | + print ERR "total_mimes $total_mimes != total_mimes3 $total_mimes3\n" ; |
| 2718 | + print "total_mimes $total_mimes != total_mimes3 $total_mimes3\n" ; |
| 2719 | + } |
| 2720 | + |
| 2721 | + if ($threshold_mime > 0) |
| 2722 | + { |
| 2723 | + $html .= "<b>Mime types that are found on less than $threshold_mime projects:</b> (again 1 = 1000)<p>" ; |
| 2724 | + foreach $mimetype (@mimetypes_sorted) |
| 2725 | + { |
| 2726 | + next if $mimetypes_found {$mimetype} >= $threshold_mime ; |
| 2727 | + |
| 2728 | + $count = $mimetypes {$mimetype} ; |
| 2729 | + $count =~ s/^(\d{1,3})(\d\d\d)$/$1,$2/ ; |
| 2730 | + $count =~ s/^(\d{1,3})(\d\d\d)(\d\d\d)$/$1,$2,$3/ ; |
| 2731 | + $html .= "<b>$mimetype</b> $count total<br>" ; |
| 2732 | + } |
| 2733 | + } |
| 2734 | + |
| 2735 | +# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ; |
| 2736 | + $html .= $colophon ; |
| 2737 | + |
| 2738 | + print FILE_HTML_REQUESTS $html ; |
| 2739 | + close FILE_HTML_REQUESTS ; |
| 2740 | +} |
| 2741 | + |
# Write the 'Operating Systems' traffic report to $dir_reports/$file_html_opsys.
#
# Input comes from package globals that are filled outside this sub:
#   $header, $colophon                     page template (placeholders TITLE, HEADER, ALSO, LINKS, X1000) and footer
#   $link_* / $dummy_opsys                 navigation snippets for the 'See also' line
#   %opsys, %opsys_perc                    request count and percentage text per key
#   @opsys_sorted_count/_alpha             the keys of %opsys in the two presentation orders
#   $total_opsys_mobile/_non_mobile        request totals used for the 'Total' lines
#   $keywords_mobile, $month_upd_keywords_mobile   shown in the explanatory footer row
#
# Keys of %opsys look like "<rectype>,<name>": rectype 'G' rows are shown as groups
# (names containing a space are listed separately as per-platform breakdown),
# '-' rows as non mobile OS versions and 'M' rows as mobile OS versions.
#
# Page layout: one outer table with an intro row, then a row with two nested tables
# (left: sorted by popularity, right: alphabetical), then a row explaining mobile detection.
#
# Returns nothing. If the report file cannot be opened a warning is issued and no report is written.
sub WriteReportOpSys
{
  if (! open FILE_HTML_OPSYS, '>', "$dir_reports/$file_html_opsys")
  {
    warn "Could not open '$dir_reports/$file_html_opsys' for writing: $!\n" ;
    return ;
  }

  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Operating Systems/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Operating Systems/ ;
  $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $dummy_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  my $total_opsys_all = $total_opsys_mobile + $total_opsys_non_mobile ;

  $total_all2              = &FormatCount ($total_opsys_all) ;
  $total_opsys_mobile2     = &FormatCount ($total_opsys_mobile) ;
  $total_opsys_non_mobile2 = &FormatCount ($total_opsys_non_mobile) ;

  # Guard against division by zero when no requests were counted at all.
  if ($total_opsys_all > 0)
  {
    $total_perc_mobile     = sprintf ("%.1f", 100 * $total_opsys_mobile / $total_opsys_all) ;
    # Derived from the rounded mobile share so that both shares always add up to 100.
    $total_perc_non_mobile = sprintf ("%.1f", 100 - $total_perc_mobile) ;
  }
  else
  {
    $total_perc_mobile     = "0.0" ;
    $total_perc_non_mobile = "0.0" ;
  }

  $line_total_all        = "<tr><th class=l>Total</th><th class=r>$total_all2</th><th class=r>100\%</th></tr>\n" ;
  $line_total_mobile     = "<tr><th class=l>Total</th><th class=r>$total_opsys_mobile2</th><th class=r>$total_perc_mobile\%</th></tr>\n" ;
  $line_total_non_mobile = "<tr><th class=l>Total</th><th class=r>$total_opsys_non_mobile2</th><th class=r>$total_perc_non_mobile\%</th></tr>\n" ;

  $html .= "<table border=1>\n" ;
  $html .= "<tr><td class=l colspan=99>The following overview of page requests by operating system is based on the <a href='http://en.wikipedia.org/wiki/User_agent'>user agent</a> information that accompanies most server requests.<br>" .
           "Please note that agent information does not follow strict guidelines and some programs may provide wrong information on purpose.<br>" .
           "This report ignores all requests where agent information is missing, or contains any of the following: bot, crawl(er) or spider.<p>" .
           "<a href='http://en.wikipedia.org/wiki/Windows_NT#Releases'>Wikipedia</a>: NT 5.0 = Windows 2000, NT 5.1/5.2 = XP + Server 2003, NT 6.0 = VISTA + Server 2008, NT 6.1 = Windows 7.<br> " .
           "<a href='http://en.wikipedia.org/wiki/Mac_OS_X#Versions'>Wikipedia</a>: OS X 10.4 = Tiger, 10.5 = Leopard, 10.6 = Snow Leopard.<br> " .
           "<a href='http://en.wikipedia.org/wiki/Ubuntu#Releases'>Wikipedia</a>: Ubuntu 7.10 = Gutsy Gibbon, 8.04 = Hardy Heron, 8.10 = Intrepid Ibex, 9.04 = Jaunty Jackalope, 9.10 = Karmic Koala." .
           "</td></tr>\n" ;

# $html .= "<tr><th class=l>Count<br><small>x 1000</small></th><th class=l>Secondary domain<br>(~site) name</th><th class=l>Mime type</th><th class=l>User agent</th></tr>\n" ;

  # Left cell: sorted by frequency, right cell: in alphabetical order.
  $html .= "<tr><td width=50% valign=top>" ;
  $html .= &OpSysReportTableHtml ("In order of popularity", \@opsys_sorted_count, $line_total_all, $line_total_non_mobile, $line_total_mobile) ;
  $html .= "</td><td width=50% valign=top>" ;
  $html .= &OpSysReportTableHtml ("In alphabetical order", \@opsys_sorted_alpha, $line_total_all, $line_total_non_mobile, $line_total_mobile) ;
  $html .= "</td></tr>" ;

  $html .= "<tr><td colspan=99 class=l wrap>Requests from mobile devices are recognized as follows:<br>" .
           "Agent string contains any of the following terms (last upd: $month_upd_keywords_mobile):<br>" .
           "<i>$keywords_mobile</i></td></tr>" ;

  $html .= "</table><p>" ;

# Disabled code, kept as found. NOTE(review): it refers to crawler / secondary domain data and looks like a leftover from another report - confirm before removing.
# $perc_crawlers = sprintf ("%.1f",100 * $total_page_crawlerrequests/$total_page_requests_external) ;
# $total_page_requests_external2 = &FormatCount ($total_page_requests_external*1000) ;
# $total_page_crawlerrequests2 = &FormatCount ($total_page_crawlerrequests*1000) ;
# $html =~ s/PERC_GOOGLE/<p>In total $total_page_crawlerrequests2 page requests (mime type <a href='SquidReportRequests.htm'>text\/html<\/a> only!) per day are considered crawler requests, out of $total_page_requests_external2 external requests, which is $perc_crawlers%/ ;

# $total_crawlers = &FormatCount ($total_crawlers) ;

# $html .= "<tr><th class=l>$total_crawlers</th><th class=l colspan=2>total</th></tr>\n" ;
# $html .= "</table><p>\n" ;

# $html .= "<table border=1>\n" ;
# $html .= "<tr><th class=l colspan=99>Top 25 secondary domains<br>(~ sites) mentioned</th></tr>\n" ;
# foreach $secondary_domain (keys_sorted_by_value_num_desc %secondary_domains)
# {
#   next if $secondary_domain eq ".." ;
#   last if ++$secondary_domains_listed > 25 ;
#
#   $count = $secondary_domains {$secondary_domain} ;
#   $count = &FormatCount ($count) ;
#   $html .= "<tr><td class=r>$count</td><td class=l colspan=2>$secondary_domain</td></tr>\n" ;
# }
# $html .= "</table>\n" ;

  $html .= $colophon ;

  print FILE_HTML_OPSYS $html ;
  # Buffered write errors only surface on close, so report them.
  close FILE_HTML_OPSYS or warn "Could not close '$dir_reports/$file_html_opsys': $!\n" ;
}

# Private helper for WriteReportOpSys: build one complete nested table.
#   $title            text for the <h3> heading of the table
#   $keys_ref         reference to the list of %opsys keys, already in presentation order
#   $line_all         ready-made 'Total' row shown below the group rows
#   $line_non_mobile  ready-made 'Total' row shown below the non mobile OS versions
#   $line_mobile      ready-made 'Total' row shown below the mobile OS versions
# Returns the table as an HTML string.
sub OpSysReportTableHtml
{
  my ($title, $keys_ref, $line_all, $line_non_mobile, $line_mobile) = @_ ;

  # OS versions below this share are left out of the per version breakdowns.
  my $min_perc = "0.02%" ;

  my $table = "<table border=1 width=100%>\n" ;
  $table .= "<tr><td colspan=99 class=l><h3>$title</h3></td></tr>" ;
  $table .= "<tr><th class=l>Operating System</th><th class=r>Requests</th><th class=r>Percentage</th></tr>\n" ;
  $table .= &OpSysReportRowsHtml ($keys_ref, 'G', 'main', undef) ;
  $table .= $line_all ;

  $table .= "<tr><th class=l colspan=99> <br>Breakdown per platform for Mac and Linux</th></tr>\n" ;
  $table .= &OpSysReportRowsHtml ($keys_ref, 'G', 'sub', undef) ;

  $table .= "<tr><th class=l colspan=99> <br>Breakdown per OS version, non mobile</th></tr>\n" ;
  $table .= &OpSysReportRowsHtml ($keys_ref, '-', '', $min_perc) ;
  $table .= $line_non_mobile ;

  $table .= "<tr><th class=l colspan=99> <br>Breakdown per OS version, mobile</th></tr>\n" ;
  $table .= &OpSysReportRowsHtml ($keys_ref, 'M', '', $min_perc) ;
  $table .= $line_mobile ;
  $table .= "</table>\n" ;

  return $table ;
}

# Private helper for WriteReportOpSys: build the data rows for one section of a table.
#   $keys_ref      reference to the list of %opsys keys to walk, in presentation order
#   $want_rectype  only keys whose part before the first comma equals this value are shown
#   $subgroup      'main' = skip keys that contain a space, 'sub' = show only keys that contain a space,
#                  anything else = no filtering on spaces
#   $min_perc      when defined: skip keys whose percentage text sorts before this value
# Returns the rows as an HTML string (empty when nothing qualifies).
sub OpSysReportRowsHtml
{
  my ($keys_ref, $want_rectype, $subgroup, $min_perc) = @_ ;

  my $rows_html = "" ;
  foreach my $key (@$keys_ref)
  {
    my $perc = $opsys_perc {$key} ;

    # String comparison on purpose, as before. It orders like a numeric one as long as the values
    # are plain non negative decimals without leading blanks - presumably like '1.23%', TODO confirm.
    next if defined ($min_perc) && ($perc lt $min_perc) ;

    my ($rectype, $os) = split (',', $key, 2) ;
    next if $rectype ne $want_rectype ;
    next if ($subgroup eq 'main') && ($key =~ / /) ;
    next if ($subgroup eq 'sub')  && ($key !~ / /) ;

    my $count = &FormatCount ($opsys {$key}) ;
    $rows_html .= "<tr><td class=l>$os</td><td class=r>$count</td><td class=r>$perc</td></tr>\n" ;
  }

  return $rows_html ;
}
| 2955 | + |
| 2956 | +# http://en.wikipedia.org/wiki/Domain_name |
| 2957 | +sub WriteReportOrigins |
| 2958 | +{ |
| 2959 | + open FILE_HTML_ORIGINS, '>', "$dir_reports/$file_html_origins" ; |
| 2960 | + |
| 2961 | + $html = $header ; |
| 2962 | + $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by origin/ ; |
| 2963 | + $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Requests by origin/ ; |
| 2964 | + $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ; |
| 2965 | + $html =~ s/LINKS/$link_requests $dummy_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ; |
| 2966 | + $html =~ s/NOTES/<br> This report shows where requests come from. Report 'Requests by destination' shows where requests are serviced.<br> Those numbers bear no direct relation.<br>/ ; |
| 2967 | + |
| 2968 | + $html .= "<table border=1>\n" ; |
| 2969 | + $html .= "<tr><td colspan=99>" ; |
| 2970 | + |
| 2971 | + |
| 2972 | + $html .= "<table border=0 width=100%>\n" ; |
| 2973 | +# $html .= "<tr><td colspan=99 class=c>traffic from yahoo is allocated as if yahoo used same domain naming scheme as google: <b>search.yahoo.ca</b> instead of <b>ca.search.yahoo.com</b></td></tr>\n" ; |
| 2974 | +# $html .= "<tr><td colspan=99 class=c><small>All counts x 1000</small></td></tr>\n" ; |
| 2975 | + |
| 2976 | + # INTERNAL ORIGINS |
| 2977 | + |
| 2978 | + $html .= "<tr><td colspan=99 class=c><h3>Requests with internal origins</h3></td></tr>\n" ; |
| 2979 | + $html .= "<table border=1 width=100%>\n" ; |
| 2980 | + |
| 2981 | + $html .= "<tr><td width=50% valign=top>" ; |
| 2982 | + $html .= "<table border=1 width=100%>\n" ; |
| 2983 | + $html .= "<tr><td colspan=2 class=l><b>Internal origins<br>sorted by<br>frequency</b></td><th class=r> Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 2984 | + |
| 2985 | + $total_total = 0 ; |
| 2986 | + $total_page = 0 ; |
| 2987 | + $total_image = 0 ; |
| 2988 | + $total_rest = 0 ; |
| 2989 | + foreach $project (@project_int_top_sorted_count) |
| 2990 | + { |
| 2991 | + $total = $project_int_top {$project} ; |
| 2992 | + $page = $project_int_top_split {"page:$project"} ; |
| 2993 | + $image = $project_int_top_split {"image:$project"} ; |
| 2994 | + $rest = $project_int_top_split {"other:$project"} ; |
| 2995 | + $total_total += $total ; |
| 2996 | + $total_page += $page ; |
| 2997 | + $total_image += $image ; |
| 2998 | + $total_rest += $rest ; |
| 2999 | + $total = &FormatCount ($total) ; |
| 3000 | + $page = &FormatCount ($page) ; |
| 3001 | + $image = &FormatCount ($image) ; |
| 3002 | + $rest = &FormatCount ($rest) ; |
| 3003 | + $html .= "<tr><td colspan=2 class=l>" . ucfirst($project) . "</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3004 | + } |
| 3005 | + $total_total = &FormatCount ($total_total) ; |
| 3006 | + $total_page = &FormatCount ($total_page) ; |
| 3007 | + $total_image = &FormatCount ($total_image) ; |
| 3008 | + $total_rest = &FormatCount ($total_rest) ; |
| 3009 | + $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ; |
| 3010 | + |
| 3011 | + $html .= "<tr><td colspan=99> </td></tr>\n" ; |
| 3012 | + $html .= "<tr><td colspan=99 class=l><b>Per project language / subproject</b> (top 50)</td></tr>\n" ; |
| 3013 | + $projects = 0 ; |
| 3014 | + $total_total = 0 ; |
| 3015 | + $total_page = 0 ; |
| 3016 | + $total_image = 0 ; |
| 3017 | + $total_rest = 0 ; |
| 3018 | + foreach $origin (@origin_int_top_sorted_count) |
| 3019 | + { |
| 3020 | + if (++$projects > 50) |
| 3021 | + { |
| 3022 | + $origin_int_top_other {"all"} += $origin_int_top {$origin} ; ; |
| 3023 | + $origin_int_top_other {"page"} += $origin_int_top_split {"page:$origin"} ; |
| 3024 | + $origin_int_top_other {"image"} += $origin_int_top_split {"image:$origin"} ; |
| 3025 | + $origin_int_top_other {"other"} += $origin_int_top_split {"other:$origin"} ; |
| 3026 | + next ; |
| 3027 | + } |
| 3028 | + $top100_internal_origins {$origin} ++ ; |
| 3029 | + $total = $origin_int_top {$origin} ; |
| 3030 | + $page = $origin_int_top_split {"page:$origin"} ; |
| 3031 | + $image = $origin_int_top_split {"image:$origin"} ; |
| 3032 | + $rest = $origin_int_top_split {"other:$origin"} ; |
| 3033 | + $total_total += $total ; |
| 3034 | + $total_page += $page ; |
| 3035 | + $total_image += $image ; |
| 3036 | + $total_rest += $rest ; |
| 3037 | + $total = &FormatCount ($total) ; |
| 3038 | + $page = &FormatCount ($page) ; |
| 3039 | + $image = &FormatCount ($image) ; |
| 3040 | + $rest = &FormatCount ($rest) ; |
| 3041 | + ($project,$subproject) = split (':', $origin) ; |
| 3042 | + $html .= "<tr><td class=l>" . ucfirst($project) . "</td><td class=l>$subproject</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3043 | + |
| 3044 | + } |
| 3045 | + $total = $origin_int_top_other {"all"} ; |
| 3046 | + $page = $origin_int_top_other {"page"} ; |
| 3047 | + $image = $origin_int_top_other {"image"} ; |
| 3048 | + $rest = $origin_int_top_other {"other"} ; |
| 3049 | + $total_total += $total ; |
| 3050 | + $total_page += $page ; |
| 3051 | + $total_image += $image ; |
| 3052 | + $total_rest += $rest ; |
| 3053 | + $total = &FormatCount ($total) ; |
| 3054 | + $page = &FormatCount ($page) ; |
| 3055 | + $image = &FormatCount ($image) ; |
| 3056 | + $rest = &FormatCount ($rest) ; |
| 3057 | + $html .= "<tr><td colspan=2 class=l>Other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3058 | + $grand_grand_total = $total_total ; |
| 3059 | + $total_total = &FormatCount ($total_total) ; |
| 3060 | + $total_page = &FormatCount ($total_page) ; |
| 3061 | + $total_image = &FormatCount ($total_image) ; |
| 3062 | + $total_rest = &FormatCount ($total_rest) ; |
| 3063 | + $html .= "<tr><th colspan=2 class=l>Total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ; |
| 3064 | + $html .= "</table>" ; |
| 3065 | + |
| 3066 | + # BY ALPHABET |
| 3067 | + $html .= "</td><td width=50% valign=top>" ; |
| 3068 | + |
| 3069 | + $html .= "<table border=1 width=100%>\n" ; |
| 3070 | + $html .= "<tr><td colspan=2 class=l><b>Internal origins<br>sorted by<br>alphabet</b></td><th class=r> Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3071 | + |
| 3072 | + $total_total = 0 ; |
| 3073 | + $total_page = 0 ; |
| 3074 | + $total_image = 0 ; |
| 3075 | + $total_rest = 0 ; |
| 3076 | + foreach $project (@project_int_top_sorted_alpha) |
| 3077 | + { |
| 3078 | + $total = $project_int_top {$project} ; |
| 3079 | + $page = $project_int_top_split {"page:$project"} ; |
| 3080 | + $image = $project_int_top_split {"image:$project"} ; |
| 3081 | + $rest = $project_int_top_split {"other:$project"} ; |
| 3082 | + $total_total += $total ; |
| 3083 | + $total_page += $page ; |
| 3084 | + $total_image += $image ; |
| 3085 | + $total_rest += $rest ; |
| 3086 | + $total = &FormatCount ($total) ; |
| 3087 | + $page = &FormatCount ($page) ; |
| 3088 | + $image = &FormatCount ($image) ; |
| 3089 | + $rest = &FormatCount ($rest) ; |
| 3090 | + $html .= "<tr><td colspan=2 class=l>$project</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3091 | + } |
| 3092 | + $total_total = &FormatCount ($total_total) ; |
| 3093 | + $total_page = &FormatCount ($total_page) ; |
| 3094 | + $total_image = &FormatCount ($total_image) ; |
| 3095 | + $total_rest = &FormatCount ($total_rest) ; |
| 3096 | + $html .= "<tr><th colspan=2 class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ; |
| 3097 | + |
| 3098 | + $html .= "<tr><td colspan=99> </td></tr>\n" ; |
| 3099 | + $html .= "<tr><td colspan=99 class=l><b>Per project language / subproject</b> (top 50)</td></tr>\n" ; |
| 3100 | + $projects = 0 ; |
| 3101 | + $total_total = 0 ; |
| 3102 | + $total_page = 0 ; |
| 3103 | + $total_image = 0 ; |
| 3104 | + $total_rest = 0 ; |
| 3105 | + foreach $origin (@origin_int_top_sorted_alpha) |
| 3106 | + { |
| 3107 | + next if $top100_internal_origins {$origin} == 0 ; |
| 3108 | + |
| 3109 | + $total = $origin_int_top {$origin} ; |
| 3110 | + $page = $origin_int_top_split {"page:$origin"} ; |
| 3111 | + $image = $origin_int_top_split {"image:$origin"} ; |
| 3112 | + $rest = $origin_int_top_split {"other:$origin"} ; |
| 3113 | + $total_total += $total ; |
| 3114 | + $total_page += $page ; |
| 3115 | + $total_image += $image ; |
| 3116 | + $total_rest += $rest ; |
| 3117 | + $total = &FormatCount ($total) ; |
| 3118 | + $page = &FormatCount ($page) ; |
| 3119 | + $image = &FormatCount ($image) ; |
| 3120 | + $rest = &FormatCount ($rest) ; |
| 3121 | + ($project,$subproject) = split (':', $origin) ; |
| 3122 | + $html .= "<tr><td class=l>$project</td><td class=l>$subproject</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3123 | + |
| 3124 | + } |
| 3125 | + $total = $origin_int_top_other {"all"} ; |
| 3126 | + $page = $origin_int_top_other {"page"} ; |
| 3127 | + $image = $origin_int_top_other {"image"} ; |
| 3128 | + $rest = $origin_int_top_other {"other"} ; |
| 3129 | + $total_total += $total ; |
| 3130 | + $total_page += $page ; |
| 3131 | + $total_image += $image ; |
| 3132 | + $total_rest += $rest ; |
| 3133 | + $total = &FormatCount ($total) ; |
| 3134 | + $page = &FormatCount ($page) ; |
| 3135 | + $image = &FormatCount ($image) ; |
| 3136 | + $rest = &FormatCount ($rest) ; |
| 3137 | + $html .= "<tr><td colspan=2 class=l>other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3138 | + $total_total = &FormatCount ($total_total) ; |
| 3139 | + $total_page = &FormatCount ($total_page) ; |
| 3140 | + $total_image = &FormatCount ($total_image) ; |
| 3141 | + $total_rest = &FormatCount ($total_rest) ; |
| 3142 | + $html .= "<tr><th colspan=2 class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ; |
| 3143 | + $html .= "</table>" ; |
| 3144 | + |
| 3145 | + $html .= "</td></tr>" ; |
| 3146 | + $html .= "</table>" ; |
| 3147 | + |
| 3148 | + # REQUESTS WITH EXTERNAL ORIGINS |
| 3149 | + |
| 3150 | + $html .= "<table border=1 width=100%>\n" ; |
| 3151 | + $html .= "<tr><td colspan=99 class=c> </td></tr>\n" ; |
| 3152 | + $html .= "<tr><td colspan=99 class=c><h3>Requests with external origins</h3></td></tr>\n" ; |
| 3153 | + $html .= "<table border=1 width=100%>\n" ; |
| 3154 | + |
| 3155 | + $html .= "<tr><td width=50% valign=top>" ; |
| 3156 | + $html .= "<table border=1 width=100%>\n" ; |
| 3157 | +# $html .= "<tr><td class=l><b><a href='http://..'>External origins</a><br>sorted by<br>frequency</b><br>top 100</td><th class=r> Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3158 | + $html .= "<tr><td class=l><b>External origins<br>sorted by<br>frequency</b><br>top 100</td><th class=r> Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3159 | + |
| 3160 | + $projects = 0 ; |
| 3161 | + $total_total = 0 ; |
| 3162 | + $total_page = 0 ; |
| 3163 | + $total_image = 0 ; |
| 3164 | + $total_rest = 0 ; |
| 3165 | + foreach $origin (@origin_ext_top_sorted_count) |
| 3166 | + { |
| 3167 | + $total = $origin_ext_top {$origin} ; |
| 3168 | + $page = $origin_ext_top_split {"page:$origin"} ; |
| 3169 | + $image = $origin_ext_top_split {"image:$origin"} ; |
| 3170 | + $rest = $origin_ext_top_split {"other:$origin"} ; |
| 3171 | + $total_total += $total ; |
| 3172 | + $total_page += $page ; |
| 3173 | + $total_image += $image ; |
| 3174 | + $total_rest += $rest ; |
| 3175 | + $total = &FormatCount ($total) ; |
| 3176 | + $page = &FormatCount ($page) ; |
| 3177 | + $image = &FormatCount ($image) ; |
| 3178 | + $rest = &FormatCount ($rest) ; |
| 3179 | + |
| 3180 | + if (++$projects > 100) |
| 3181 | + { |
| 3182 | + $origin_ext_top_other {"all"} += $origin_ext_top {$origin} ; ; |
| 3183 | + $origin_ext_top_other {"page"} += $origin_ext_top_split {"page:$origin"} ; |
| 3184 | + $origin_ext_top_other {"image"} += $origin_ext_top_split {"image:$origin"} ; |
| 3185 | + $origin_ext_top_other {"other"} += $origin_ext_top_split {"other:$origin"} ; |
| 3186 | + next ; |
| 3187 | + } |
| 3188 | + $top100_internal_origins {$origin} ++ ; |
| 3189 | + |
| 3190 | + if ($origin =~ /\./) |
| 3191 | + { $link_origin = "<a href='http://$origin' ref='nofollow'>$origin</a>" ; } |
| 3192 | + else |
| 3193 | + { $link_origin = $origin ; } |
| 3194 | + $html .= "<tr><td class=l>$link_origin</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3195 | + } |
| 3196 | + $total = $origin_ext_top_other {"all"} ; |
| 3197 | + $page = $origin_ext_top_other {"page"} ; |
| 3198 | + $image = $origin_ext_top_other {"image"} ; |
| 3199 | + $rest = $origin_ext_top_other {"other"} ; |
| 3200 | + $total = &FormatCount ($total) ; |
| 3201 | + $page = &FormatCount ($page) ; |
| 3202 | + $image = &FormatCount ($image) ; |
| 3203 | + $rest = &FormatCount ($rest) ; |
| 3204 | + $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3205 | + $grand_grand_total = $total_total ; |
| 3206 | + $total_total = &FormatCount ($total_total) ; |
| 3207 | + $total_page = &FormatCount ($total_page) ; |
| 3208 | + $total_image = &FormatCount ($total_image) ; |
| 3209 | + $total_rest = &FormatCount ($total_rest) ; |
| 3210 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ; |
| 3211 | + $html .= "</table>" ; |
| 3212 | + |
| 3213 | + # BY ALPHABET |
| 3214 | + $html .= "</td><td width=50% valign=top>" ; |
| 3215 | + |
| 3216 | + $html .= "<table border=1 width=100%>\n" ; |
| 3217 | +# $html .= "<tr><td class=l><b><a href='http://..'>External origins</a><br>sorted by<br>alphabet</b><br>top 100</td><th class=r> Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3218 | + $html .= "<tr><td class=l><b>External origins<br>sorted by<br>alphabet</b><br>top 100</td><th class=r> Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3219 | + |
| 3220 | + $projects = 0 ; |
| 3221 | + $total_total = 0 ; |
| 3222 | + $total_page = 0 ; |
| 3223 | + $total_image = 0 ; |
| 3224 | + $total_rest = 0 ; |
| 3225 | + foreach $origin (@origin_ext_top_sorted_alpha) |
| 3226 | + { |
| 3227 | + |
| 3228 | + $total = $origin_ext_top {$origin} ; |
| 3229 | + $page = $origin_ext_top_split {"page:$origin"} ; |
| 3230 | + $image = $origin_ext_top_split {"image:$origin"} ; |
| 3231 | + $rest = $origin_ext_top_split {"other:$origin"} ; |
| 3232 | + $total_total += $total ; |
| 3233 | + $total_page += $page ; |
| 3234 | + $total_image += $image ; |
| 3235 | + $total_rest += $rest ; |
| 3236 | + $total = &FormatCount ($total) ; |
| 3237 | + $page = &FormatCount ($page) ; |
| 3238 | + $image = &FormatCount ($image) ; |
| 3239 | + $rest = &FormatCount ($rest) ; |
| 3240 | + |
| 3241 | + next if $top100_internal_origins {$origin} == 0 ; |
| 3242 | + |
| 3243 | + $html .= "<tr><td class=l>$origin</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3244 | + |
| 3245 | + } |
| 3246 | + $total = $origin_ext_top_other {"all"} ; |
| 3247 | + $page = $origin_ext_top_other {"page"} ; |
| 3248 | + $image = $origin_ext_top_other {"image"} ; |
| 3249 | + $rest = $origin_ext_top_other {"other"} ; |
| 3250 | + $total = &FormatCount ($total) ; |
| 3251 | + $page = &FormatCount ($page) ; |
| 3252 | + $image = &FormatCount ($image) ; |
| 3253 | + $rest = &FormatCount ($rest) ; |
| 3254 | + $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3255 | + $total_total = &FormatCount ($total_total) ; |
| 3256 | + $total_page = &FormatCount ($total_page) ; |
| 3257 | + $total_image = &FormatCount ($total_image) ; |
| 3258 | + $total_rest = &FormatCount ($total_rest) ; |
| 3259 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_page</td><td class=r>$total_image</td><td class=r>$total_rest</td></tr>\n" ; |
| 3260 | + $html .= "</table>" ; |
| 3261 | + |
| 3262 | + $html .= "</td></tr>" ; |
| 3263 | +# $html .= "<tr><td colspan=99 class=c>For presentation conciseness the top level domain (.org, .com, ..) is ignored here. There is a theoretical<br> possibility that figures for two unrelated sites which are both popular are presented as one here.<p>" . |
| 3264 | +# "'Unmatched ip address': all requests without explicit referer url that were not allocated <br>to a site based on known ip range, e.g. google (by ip) or agent string, e.g. google (by agent)</td></tr>" ; |
| 3265 | + $html .= "<tr><td colspan=99 class=c>'Origin unknown': all requests without explicit referer url, without known ip range and without identity clue in the agent string.<br>Note that right now only ip ranges for Google and Yahoo are recognized by the script (manual input Feb 2009)</td></tr>" ; |
| 3266 | + $html .= "</table>" ; |
| 3267 | + |
| 3268 | + # EXTERNAL ORIGINS |
| 3269 | +if (0) |
| 3270 | +{ |
| 3271 | + $html .= "<tr><td colspan=99 class=c> </td></tr>\n" ; |
| 3272 | + $html .= "<tr><td colspan=99 class=c><h3>External origins</h3></td></tr>\n" ; |
| 3273 | + $html .= "<tr><td width=50% valign=top>" ; |
| 3274 | + |
| 3275 | + |
| 3276 | + $html .= "<table border=1 width=100%>\n" ; |
| 3277 | + $html .= "<tr><td class=l><b><a href='http://en.wikipedia.org/wiki/Top-level_domain'>Top level domains</a> (tld)<br>sorted by<br>frequency</b></td><th class=r> Total</th><th class=r>Google</th><th class=r>Yahoo</th><th class=r>Other</th></tr>\n" ; |
| 3278 | + $html .= "<tr><td colspan=99 class=l> <br><b><a href='http://en.wikipedia.org/wiki/Generic_top-level_domain'>Generic</a> and <a href='http://en.wikipedia.org/wiki/Sponsored_top-level_domains'>Sponsored</a> tld's</a></b></td></tr>\n" ; |
| 3279 | + foreach $toplevel (@origin_ext_page_top_sorted_count) |
| 3280 | + { |
| 3281 | + next if (length ($toplevel) <= 2) || ($toplevel =~ /^(?:address|local|rest|unspecified)$/) ; |
| 3282 | + |
| 3283 | + $total = $origin_ext_page_top {$toplevel} ; |
| 3284 | + $google = $origin_ext_page_top_split {"google:$toplevel"} ; |
| 3285 | + $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ; |
| 3286 | + $rest = $origin_ext_page_top_split {"other:$toplevel"} ; |
| 3287 | + $total_total += $total ; |
| 3288 | + $total_google += $google ; |
| 3289 | + $total_yahoo += $yahoo ; |
| 3290 | + $total_rest += $rest ; |
| 3291 | + $total = &FormatCount ($total) ; |
| 3292 | + $google = &FormatCount ($google) ; |
| 3293 | + $yahoo = &FormatCount ($yahoo) ; |
| 3294 | + $rest = &FormatCount ($rest) ; |
| 3295 | + $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3296 | + } |
| 3297 | + $grand_total += $total_total ; |
| 3298 | + $grand_google += $total_google ; |
| 3299 | + $grand_yahoo += $total_yahoo ; |
| 3300 | + $grand_rest += $total_rest ; |
| 3301 | + $total_total = &FormatCount ($total_total) ; |
| 3302 | + $total_google = &FormatCount ($total_google) ; |
| 3303 | + $total_yahoo = &FormatCount ($total_yahoo) ; |
| 3304 | + $total_rest = &FormatCount ($total_rest) ; |
| 3305 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ; |
| 3306 | + |
| 3307 | + $total_total = 0 ; |
| 3308 | + $total_google = 0 ; |
| 3309 | + $total_yahoo = 0 ; |
| 3310 | + $total_rest = 0 ; |
| 3311 | + $html .= "<tr><td colspan=99 class=l> <br><b><a href='http://en.wikipedia.org/wiki/Country_code_top-level_domain'>Country code tld's</a></b></td></tr>\n" ; |
| 3312 | + foreach $toplevel (@origin_ext_page_top_sorted_count) |
| 3313 | + { |
| 3314 | + next if length ($toplevel) != 2 ; |
| 3315 | + |
| 3316 | + $total = $origin_ext_page_top {$toplevel} ; |
| 3317 | + $google = $origin_ext_page_top_split {"google:$toplevel"} ; |
| 3318 | + $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ; |
| 3319 | + $rest = $origin_ext_page_top_split {"other:$toplevel"} ; |
| 3320 | + $total_total += $total ; |
| 3321 | + $total_google += $google ; |
| 3322 | + $total_yahoo += $yahoo ; |
| 3323 | + $total_rest += $rest ; |
| 3324 | + $total = &FormatCount ($total) ; |
| 3325 | + $google = &FormatCount ($google) ; |
| 3326 | + $yahoo = &FormatCount ($yahoo) ; |
| 3327 | + $rest = &FormatCount ($rest) ; |
| 3328 | + $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3329 | + } |
| 3330 | + $grand_total += $total_total ; |
| 3331 | + $grand_google += $total_google ; |
| 3332 | + $grand_yahoo += $total_yahoo ; |
| 3333 | + $grand_rest += $total_rest ; |
| 3334 | + $total_total = &FormatCount ($total_total) ; |
| 3335 | + $total_google = &FormatCount ($total_google) ; |
| 3336 | + $total_yahoo = &FormatCount ($total_yahoo) ; |
| 3337 | + $total_rest = &FormatCount ($total_rest) ; |
| 3338 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ; |
| 3339 | + |
| 3340 | + $total_total = 0 ; |
| 3341 | + $total_google = 0 ; |
| 3342 | + $total_yahoo = 0 ; |
| 3343 | + $total_rest = 0 ; |
| 3344 | + $html .= "<tr><td colspan=99 class=l> <br><b>Remainder</th></tr>\n" ; |
| 3345 | + $total = $origin_ext_page_top {"local"} ; |
| 3346 | + $google = $origin_ext_page_top_split {"google:local"} ; # always zero |
| 3347 | + $yahoo = $origin_ext_page_top_split {"yahoo:local"} ; # always zero |
| 3348 | + $rest = $origin_ext_page_top_split {"other:local"} ; |
| 3349 | + $total_total += $total ; |
| 3350 | + $total_google += $google ; |
| 3351 | + $total_yahoo += $yahoo ; |
| 3352 | + $total_rest += $rest ; |
| 3353 | + $total = &FormatCount ($total) ; |
| 3354 | + $google = &FormatCount ($google) ; |
| 3355 | + $yahoo = &FormatCount ($yahoo) ; |
| 3356 | + $rest = &FormatCount ($rest) ; |
| 3357 | + $html .= "<tr><td class=l>localhost</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3358 | + |
| 3359 | + $total = $origin_ext_page_top {"address"} ; |
| 3360 | + $google = $origin_ext_page_top_split {"google:address"} ; |
| 3361 | + $yahoo = $origin_ext_page_top_split {"yahoo:address"} ; |
| 3362 | + $rest = $origin_ext_page_top_split {"other:address"} ; |
| 3363 | + $total_total += $total ; |
| 3364 | + $total_google += $google ; |
| 3365 | + $total_yahoo += $yahoo ; |
| 3366 | + $total_rest += $rest ; |
| 3367 | + $total = &FormatCount ($total) ; |
| 3368 | + $google = &FormatCount ($google) ; |
| 3369 | + $yahoo = &FormatCount ($yahoo) ; |
| 3370 | + $rest = &FormatCount ($rest) ; |
| 3371 | + $html .= "<tr><td class=l>ip address</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3372 | + |
| 3373 | + $total = $origin_ext_page_top {"rest"} ; |
| 3374 | + $google = $origin_ext_page_top_split {"google:rest"} ; |
| 3375 | + $yahoo = $origin_ext_page_top_split {"yahoo:rest"} ; |
| 3376 | + $rest = $origin_ext_page_top_split {"other:rest"} ; |
| 3377 | + $total_total += $total ; |
| 3378 | + $total_google += $google ; |
| 3379 | + $total_yahoo += $yahoo ; |
| 3380 | + $total_rest += $rest ; |
| 3381 | + $total = &FormatCount ($total) ; |
| 3382 | + $google = &FormatCount ($google) ; |
| 3383 | + $yahoo = &FormatCount ($yahoo) ; |
| 3384 | + $rest = &FormatCount ($rest) ; |
| 3385 | + $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3386 | + |
| 3387 | + $total = $origin_ext_page_top {"unspecified"} ; |
| 3388 | + $google = $origin_ext_page_top_split {"google:unspecified"} ; |
| 3389 | + $yahoo = $origin_ext_page_top_split {"yahoo:unspecified"} ; |
| 3390 | + $rest = $origin_ext_page_top_split {"other:unspecified"} ; |
| 3391 | + $total_total += $total ; |
| 3392 | + $total_google += $google ; |
| 3393 | + $total_yahoo += $yahoo ; |
| 3394 | + $total_rest += $rest ; |
| 3395 | + $total = &FormatCount ($total) ; |
| 3396 | + $google = &FormatCount ($google) ; |
| 3397 | + $yahoo = &FormatCount ($yahoo) ; |
| 3398 | + $rest = &FormatCount ($rest) ; |
| 3399 | + $html .= "<tr><td class=l>anonymous</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3400 | + |
| 3401 | + $grand_total += $total_total ; |
| 3402 | + $grand_google += $total_google ; |
| 3403 | + $grand_yahoo += $total_yahoo ; |
| 3404 | + $grand_rest += $total_rest ; |
| 3405 | + $total_total = &FormatCount ($total_total) ; |
| 3406 | + $total_google = &FormatCount ($total_google) ; |
| 3407 | + $total_yahoo = &FormatCount ($total_yahoo) ; |
| 3408 | + $total_rest = &FormatCount ($total_rest) ; |
| 3409 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ; |
| 3410 | + |
| 3411 | + $html .= "<tr><td colspan=99 class=l> <br><b>Grand total external</th></tr>\n" ; |
| 3412 | + $grand_total = &FormatCount ($grand_total) ; |
| 3413 | + $grand_google = &FormatCount ($grand_google) ; |
| 3414 | + $grand_yahoo = &FormatCount ($grand_yahoo) ; |
| 3415 | + $grand_rest = &FormatCount ($grand_rest) ; |
| 3416 | + $html .= "<tr><th class=l>total</th><th class=r>$grand_total</th><td class=r>$grand_google</td><td class=r>$grand_yahoo</td><td class=r>$grand_rest</td></tr>\n" ; |
| 3417 | + $html .= "</table>" ; |
| 3418 | + |
| 3419 | + $html .= "</td><td width=50% valign=top>" ; |
| 3420 | + |
| 3421 | + $html .= "<table border=1 width=100%>\n" ; |
| 3422 | + |
| 3423 | + $html .= "<tr><th class=l>Top level domains<br>sorted by<br>alphabet</th><th class=r>Total<th class=r>Google<th class=r>Yahoo<th class=r>Other</th></tr>\n" ; |
| 3424 | +# $html .= "<tr><th colspan=99 class=l> <br><b><a href='http://en.wikipedia.org/wiki/Top-level_domain'>generic/sponsored tld's</a></b></th></tr>\n" ; |
| 3425 | + $total_total = 0 ; |
| 3426 | + $total_google = 0 ; |
| 3427 | + $total_yahoo = 0 ; |
| 3428 | + $total_rest = 0 ; |
| 3429 | + $html .= "<tr><td colspan=99 class=l> <br><b>Generic and sponsored tld's</b></td></tr>\n" ; |
| 3430 | + |
| 3431 | + foreach $toplevel (@origin_ext_page_top_sorted_alpha) |
| 3432 | + { |
| 3433 | + next if (length ($toplevel) <= 2) || ($toplevel =~ /^(?:address|local|rest|unspecified)$/) ; |
| 3434 | + |
| 3435 | + $total = $origin_ext_page_top {$toplevel} ; |
| 3436 | + $google = $origin_ext_page_top_split {"google:$toplevel"} ; |
| 3437 | + $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ; |
| 3438 | + $rest = $origin_ext_page_top_split {"other:$toplevel"} ; |
| 3439 | + $total_total += $total ; |
| 3440 | + $total_google += $google ; |
| 3441 | + $total_yahoo += $yahoo ; |
| 3442 | + $total_rest += $rest ; |
| 3443 | + $total = &FormatCount ($total) ; |
| 3444 | + $google = &FormatCount ($google) ; |
| 3445 | + $yahoo = &FormatCount ($yahoo) ; |
| 3446 | + $rest = &FormatCount ($rest) ; |
| 3447 | + $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3448 | + } |
| 3449 | + $total_total = &FormatCount ($total_total) ; |
| 3450 | + $total_google = &FormatCount ($total_google) ; |
| 3451 | + $total_yahoo = &FormatCount ($total_yahoo) ; |
| 3452 | + $total_rest = &FormatCount ($total_rest) ; |
| 3453 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ; |
| 3454 | + |
| 3455 | + $total_total = 0 ; |
| 3456 | + $total_google = 0 ; |
| 3457 | + $total_yahoo = 0 ; |
| 3458 | + $total_rest = 0 ; |
| 3459 | + $html .= "<tr><td colspan=99 class=l> <br><b><a href='http://en.wikipedia.org/wiki/Country_code_top-level_domain'>Country code tld's</a></b></td></tr>\n" ; |
| 3460 | + foreach $toplevel (@origin_ext_page_top_sorted_alpha) |
| 3461 | + { |
| 3462 | + next if length ($toplevel) != 2 ; |
| 3463 | + |
| 3464 | + $total = $origin_ext_page_top {$toplevel} ; |
| 3465 | + $google = $origin_ext_page_top_split {"google:$toplevel"} ; |
| 3466 | + $yahoo = $origin_ext_page_top_split {"yahoo:$toplevel"} ; |
| 3467 | + $rest = $origin_ext_page_top_split {"other:$toplevel"} ; |
| 3468 | + $total_total += $total ; |
| 3469 | + $total_google += $google ; |
| 3470 | + $total_yahoo += $yahoo ; |
| 3471 | + $total_rest += $rest ; |
| 3472 | + $total = &FormatCount ($total) ; |
| 3473 | + $google = &FormatCount ($google) ; |
| 3474 | + $yahoo = &FormatCount ($yahoo) ; |
| 3475 | + $rest = &FormatCount ($rest) ; |
| 3476 | + $html .= "<tr><td class=l>$toplevel</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3477 | + } |
| 3478 | + $total_total = &FormatCount ($total_total) ; |
| 3479 | + $total_google = &FormatCount ($total_google) ; |
| 3480 | + $total_yahoo = &FormatCount ($total_yahoo) ; |
| 3481 | + $total_rest = &FormatCount ($total_rest) ; |
| 3482 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ; |
| 3483 | + |
| 3484 | + $total_total = 0 ; |
| 3485 | + $total_google = 0 ; |
| 3486 | + $total_yahoo = 0 ; |
| 3487 | + $total_rest = 0 ; |
| 3488 | + $html .= "<tr><td colspan=99 class=l> <br><b>Remainder</th></tr>\n" ; |
| 3489 | + $total = $origin_ext_page_top {"local"} ; |
| 3490 | + $google = $origin_ext_page_top_split {"google:local"} ; # always zero |
| 3491 | + $yahoo = $origin_ext_page_top_split {"yahoo:local"} ; # always zero |
| 3492 | + $rest = $origin_ext_page_top_split {"other:local"} ; |
| 3493 | + $total_total += $total ; |
| 3494 | + $total_google += $google ; |
| 3495 | + $total_yahoo += $yahoo ; |
| 3496 | + $total_rest += $rest ; |
| 3497 | + $total = &FormatCount ($total) ; |
| 3498 | + $google = &FormatCount ($google) ; |
| 3499 | + $yahoo = &FormatCount ($yahoo) ; |
| 3500 | + $rest = &FormatCount ($rest) ; |
| 3501 | + $html .= "<tr><td class=l>localhost</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3502 | + |
| 3503 | + $total = $origin_ext_page_top {"address"} ; |
| 3504 | + $google = $origin_ext_page_top_split {"google:address"} ; |
| 3505 | + $yahoo = $origin_ext_page_top_split {"yahoo:address"} ; |
| 3506 | + $rest = $origin_ext_page_top_split {"other:address"} ; |
| 3507 | + $total_total += $total ; |
| 3508 | + $total_google += $google ; |
| 3509 | + $total_yahoo += $yahoo ; |
| 3510 | + $total_rest += $rest ; |
| 3511 | + $total = &FormatCount ($total) ; |
| 3512 | + $google = &FormatCount ($google) ; |
| 3513 | + $yahoo = &FormatCount ($yahoo) ; |
| 3514 | + $rest = &FormatCount ($rest) ; |
| 3515 | + $html .= "<tr><td class=l>ip address</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3516 | + |
| 3517 | + $total = $origin_ext_page_top {"rest"} ; |
| 3518 | + $google = $origin_ext_page_top_split {"google:rest"} ; |
| 3519 | + $yahoo = $origin_ext_page_top_split {"yahoo:rest"} ; |
| 3520 | + $rest = $origin_ext_page_top_split {"other:rest"} ; |
| 3521 | + $total_total += $total ; |
| 3522 | + $total_google += $google ; |
| 3523 | + $total_yahoo += $yahoo ; |
| 3524 | + $total_rest += $rest ; |
| 3525 | + $total = &FormatCount ($total) ; |
| 3526 | + $google = &FormatCount ($google) ; |
| 3527 | + $yahoo = &FormatCount ($yahoo) ; |
| 3528 | + $rest = &FormatCount ($rest) ; |
| 3529 | + $html .= "<tr><td class=l>other</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3530 | + |
| 3531 | + $total = $origin_ext_page_top {"unspecified"} ; |
| 3532 | + $google = $origin_ext_page_top_split {"google:unspecified"} ; |
| 3533 | + $yahoo = $origin_ext_page_top_split {"yahoo:unspecified"} ; |
| 3534 | + $rest = $origin_ext_page_top_split {"other:unspecified"} ; |
| 3535 | + $total_total += $total ; |
| 3536 | + $total_google += $google ; |
| 3537 | + $total_yahoo += $yahoo ; |
| 3538 | + $total_rest += $rest ; |
| 3539 | + $total = &FormatCount ($total) ; |
| 3540 | + $google = &FormatCount ($google) ; |
| 3541 | + $yahoo = &FormatCount ($yahoo) ; |
| 3542 | + $rest = &FormatCount ($rest) ; |
| 3543 | + $html .= "<tr><td class=l>anonymous</td><th class=r>$total</th><td class=r>$google</td><td class=r>$yahoo</td><td class=r>$rest</td></tr>\n" ; |
| 3544 | + |
| 3545 | + $total_total = &FormatCount ($total_total) ; |
| 3546 | + $total_google = &FormatCount ($total_google) ; |
| 3547 | + $total_yahoo = &FormatCount ($total_yahoo) ; |
| 3548 | + $total_rest = &FormatCount ($total_rest) ; |
| 3549 | + $html .= "<tr><th class=l>total</th><th class=r>$total_total</th><td class=r>$total_google</td><td class=r>$total_yahoo</td><td class=r>$total_rest</td></tr>\n" ; |
| 3550 | + |
| 3551 | + $html .= "<tr><td colspan=99 class=l> <br><b>Grand total external</th></tr>\n" ; |
| 3552 | + $html .= "<tr><th class=l>total</th><th class=r>$grand_total</th><td class=r>$grand_google</td><td class=r>$grand_yahoo</td><td class=r>$grand_rest</td></tr>\n" ; |
| 3553 | + $html .= "</table>" ; |
| 3554 | + |
| 3555 | + $html .= "</td></tr>" ; |
| 3556 | + $html .= "</table>" ; |
| 3557 | + $html .= "</td></tr>" ; |
| 3558 | + |
| 3559 | + $html .= "</table>\n" ; |
| 3560 | +} |
| 3561 | + |
# Write the 'Scripts' traffic analysis report to $dir_reports/$file_html_scripts.
#
# The page is assembled in the global $html, starting from the $header template
# (placeholders TITLE, HEADER, ALSO, LINKS and X1000 are substituted), and contains:
# - css, js and php scripts side by side: in order of request volume (left) and
#   in alphabetical order (right); each php script is followed by its actions
# - top 25 of php scripts with generalized arguments, in order of frequency
# - all php scripts with generalized arguments, in alphabetical order
# Scripts with a count below 3 are left out; for api.php and index.php parameter
# combinations with a count of 2 or less are left out as well.
#
# Input (globals, filled elsewhere): %scripts_css, %scripts_js, %scripts_php,
# %actions (key 'script,action'), %parms (key 'script,parameters') and the key lists
# @scripts_.._sorted_count, @scripts_.._sorted_script, @parms_sorted_count, @parms_sorted_script
# (expected to be presorted by the caller - not verified here).
# Takes no arguments, return value is not meaningful.
sub WriteReportScripts
{
  my $path_html = "$dir_reports/$file_html_scripts" ;
  if (! open (FILE_HTML_SCRIPTS, '>', $path_html))
  {
    # report the failure instead of silently printing to an unopened handle
    print STDERR "WriteReportScripts: could not open '$path_html' for writing: $!\n" ;
    return ;
  }

  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Scripts/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Scripts/ ;
  $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $dummy_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  $html .= "<table border=1>\n" ;
  $html .= "<tr><td colspan=99>" ;

  $html .= "<table border=0 width=100%>\n" ;
  $html .= "<tr><td width=50% valign=top>" ;

  # key lists for the action sub rows do not change per script: sort once, not once per script
  my @actions_sorted_count  = keys_sorted_by_value_num_desc %actions ;
  my @actions_sorted_action = sort keys %actions ;

  # LEFT COLUMN: IN ORDER OF REQUEST VOLUME
  $html .= "<table border=1 width=100%>\n" ;

  $html .= "<tr><td class=l><h3>In order of request volume</h3></td><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $html .= "<tr><th colspan=99 class=l> <br><b>css</b></th></tr>\n" ;
  foreach $script (@scripts_css_sorted_count)
  {
    $total = $scripts_css {$script} ;

    next if $total < 3 ;

    $total = &FormatCount ($total) ;
    $html .= "<tr><td class=l>$script</td><td class=r>$total</td></tr>\n" ;
  }
  $html .= "<tr><th colspan=99 class=l> <br><b>js</b></th></tr>\n" ;
  foreach $script (@scripts_js_sorted_count)
  {
    $total = $scripts_js {$script} ;

    next if $total < 3 ;

    $total = &FormatCount ($total) ;
    $html .= "<tr><td class=l>$script</td><td class=r>$total</td></tr>\n" ;
  }
  $html .= "<tr><th colspan=99 class=l> <br><b>php</b></th></tr>\n" ;
  $total_php = 0 ;
  foreach $script (@scripts_php_sorted_count)
  {
    $total = $scripts_php {$script} ;

    next if $total < 3 ;

    $total_php += $total ;
    $total = &FormatCount ($total) ;
    $html .= "<tr><td class=l>$script</td><td class=r>$total</td></tr>\n" ;
    foreach $key (@actions_sorted_count)
    {
      ($script2,$action) = split (',', $key) ;
      # an action that accounts for all requests of the script adds no information: skip it
      if (($script eq $script2) && ($actions {$key} < $scripts_php {$script}))
      { $html .= "<tr><td class=l> <small>$action</small></td><td class=r><small>" . &FormatCount ($actions {$key}) . "</small></td></tr>\n" ; }
    }
  }
  $total_php = &FormatCount ($total_php) ;
  $html .= "<tr><th class=l>total php</th><th class=r>$total_php</th></tr>\n" ;
  $html .= "</table>" ;

  $html .= "</td><td width=50% valign=top>" ;

  # RIGHT COLUMN: IN ALPHABETICAL ORDER
  $html .= "<table border=1 width=100%>\n" ;

  $html .= "<tr><td class=l><h3>In alphabetical order</h3></td><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $html .= "<tr><th colspan=99 class=l> <br><b>css</b></th></tr>\n" ;
  foreach $script (@scripts_css_sorted_script)
  {
    $total = $scripts_css {$script} ;

    next if $total < 3 ;

    $total = &FormatCount ($total) ;
    $html .= "<tr><td class=l>$script</td><td class=r>$total</td></tr>\n" ;
  }
  $html .= "<tr><th colspan=99 class=l> <br><b>js</b></th></tr>\n" ;
  foreach $script (@scripts_js_sorted_script)
  {
    $total = $scripts_js {$script} ;

    next if $total < 3 ;

    $total = &FormatCount ($total) ;
    $html .= "<tr><td class=l>$script</td><td class=r>$total</td></tr>\n" ;
  }
  $html .= "<tr><th colspan=99 class=l> <br><b>php</b></th></tr>\n" ;
  # $total_php holds a formatted string by now, so this table gets its own numeric accumulator
  my $total_php_alpha = 0 ;
  foreach $script (@scripts_php_sorted_script)
  {
    $total = $scripts_php {$script} ;

    next if $total < 3 ;

    $total_php_alpha += $total ;
    $total = &FormatCount ($total) ;
    $html .= "<tr><td class=l>$script</td><td class=r>$total</td></tr>\n" ;
    foreach $key (@actions_sorted_action)
    {
      ($script2,$action) = split (',', $key) ;
      if (($script eq $script2) && ($actions {$key} < $scripts_php {$script}))
      { $html .= "<tr><td class=l> <small>$action</small></td><td class=r><small>" . &FormatCount ($actions {$key}) . "</small></td></tr>\n" ; }
    }
  }
  $total_php_alpha = &FormatCount ($total_php_alpha) ;
  $html .= "<tr><th class=l>total php</th><th class=r>$total_php_alpha</th></tr>\n" ;
  $html .= "</table>" ;

  $html .= "</td></tr>" ;
  $html .= "</table>" ;
  $html .= "</td></tr>" ;

  # PHP SCRIPTS AND ARGUMENTS, TOP 25 BY FREQUENCY
  $html .= "<tr><td colspan=99> </td></tr>\n" ;
  $html .= "<tr><th colspan=99 class=l><h3>PHP scripts and generalized arguments, sorted by frequency, top 25</h3></th></tr>\n" ;
  $html .= "<tr><th class=l>Script</th><th class=l>Parameters</th><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ;
  foreach $parm (@parms_sorted_count)
  {
    $total = &FormatCount ($parms {$parm}) ;
    # limit 2: only the first comma separates script from parameters (same as in alphabetical list below)
    ($name,$parms) = split (',', $parm, 2) ;
    if ($parms eq "")
    { $parms = "-" ; }
    $html .= "<tr><td class=l>$name</td><td class=l>$parms</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;

    last if $rows == 25 ;
  }
# $html .= "</table>\n" ;
# $html .= "</td><td> </td><td>" ;
# $html .= "<table border=1>\n" ;
  $html .= "<tr><th colspan=99 class=l> </th></tr>\n" ;

  # PHP SCRIPTS AND ARGUMENTS, IN ALPHABETICAL ORDER
  $html .= "<tr><th colspan=99 class=l><h3>PHP scripts and generalized arguments, in alphabetical order <small>(≥ 3)</small></h3></th></tr>\n" ;

  $html .= "<tr><td colspan=2 class=l><b>Script</b><br>Parameters</td><th class=r>Count<br><small>x 1000</small></th></tr>\n" ;
  $rows = 0 ;
  $nameprev = "" ;
  foreach $parm (@parms_sorted_script)
  {
    ($name,$parms) = split (',', $parm, 2) ;

    if ($name ne $nameprev)
    {
      # new script: test threshold on raw count, not on formatted text
      my $script_total = $scripts_php {$name} ;

      next if $script_total < 3 ;

      $total = &FormatCount ($script_total) ;
      if ($nameprev ne "")
      { $html .= "<tr><th colspan=99 class=l> </th></tr>\n" ; }
      if (($name eq "api.php") || ($name eq "index.php"))
      { $html .= "<tr><td colspan=2 class=l><b>$name</b> <small>(≥ 3)</small></td><th class=r>$total</th></tr>\n" ; }
      else
      { $html .= "<tr><td colspan=2 class=l><b>$name</b></td><th class=r>$total</th></tr>\n" ; }

      # remember script as soon as its header is written, so the header is not
      # repeated when the first parameter rows of this script are filtered out below
      $nameprev = $name ;
    }
    $total = $parms {$parm} ;

    next if (($name eq "api.php") || ($name eq "index.php")) && ($total <= 2) ;

    $total = &FormatCount ($total) ;
    if ($parms eq "")
    { $parms = "-" ; }
    $html .= "<tr><td colspan=2 class=l>$parms</td><td class=r>$total</td></tr>\n" ;
    $rows++ ;
  }
  $html .= "</table>\n" ;

  $html .= "</td></tr></table>\n" ;
  $html .= " <small>$rows rows written</small><p>" ;

# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ;
  $html .= $colophon ;

  print FILE_HTML_SCRIPTS $html ;
  close FILE_HTML_SCRIPTS ;
}
| 3738 | + |
| 3739 | +sub WriteReportGoogle |
| 3740 | +{ |
| 3741 | + open FILE_HTML_SEARCH, '>', "$dir_reports/$file_html_google" ; |
| 3742 | + |
| 3743 | + $html = $header ; |
| 3744 | + $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Google requests/ ; |
| 3745 | + $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Google requests/ ; |
| 3746 | + $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ; |
| 3747 | + $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $link_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $dummy_google/ ; |
| 3748 | + $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ; |
| 3749 | + |
| 3750 | + $html .= "<table border=1 width=500 wrap>\n" ; |
| 3751 | +# $html .= "<tr><td colspan=99 class=l> <br>This report shows <b>all requests to Wikimedia servers where a Google server of service was involved in any way</b>,<br> " . |
| 3752 | +# "be it the <a href='http://en.wikipedia.org/wiki/Googlebot'>GoogleBot</a> crawler or <a href='http://www.google.com/feedfetcher.html'>FeedFetcher</a> collector scripts that run on Google servers,<br> " . |
| 3753 | +# "or a user that follows a link from a Google Web or Google Desktop search results page, or " . |
| 3754 | +# "from Google Maps or Google Earth etcetera. <p>Technically speaking three fields in the <a href='http://wikitech.wikimedia.org/view/Squid_log_format'>squid log records</a> are checked for this: " . |
| 3755 | +# "client ip address, referer header and user agent header.<br>A request can originate from an ip address which has been registered by Google and/or it can carry a referer tag that tells us<br>a user clicked a link " . |
| 3756 | +# "on a Google results page and/or it can carry an agent string that mentions a Google application which<br>can reasonably be assumed to be genuinely Google's. See bottom of page for <a href='#details'>further details</a>." . |
| 3757 | +# "PERC_GOOGLE\n" ; |
| 3758 | + $html .= "<tr><td colspan=99 class=l wrap> <br>This report shows <b>all requests to Wikimedia servers where a Google server of service was involved in any way</b>, " . |
| 3759 | + "be it the <a href='http://en.wikipedia.org/wiki/Googlebot'>GoogleBot</a> crawler or <a href='http://www.google.com/feedfetcher.html'>FeedFetcher</a> collector scripts that run on Google servers, " . |
| 3760 | + "or a user that follows a link from a Google Web or Google Desktop search results page, or " . |
| 3761 | + "from Google Maps or Google Earth etcetera. <p>Technically speaking three fields in the <a href='http://wikitech.wikimedia.org/view/Squid_log_format'>squid log records</a> are checked for this: " . |
| 3762 | + "client ip address, referer header and user agent header. A request can originate from an ip address which has been registered by Google and/or it can carry a referer tag that tells us a user clicked a link " . |
| 3763 | + "on a Google results page and/or it can carry an agent string that mentions a Google application which can reasonably be assumed to be genuinely Google's. See bottom of page for <a href='#details'>further details</a>." . |
| 3764 | + "PERC_GOOGLE\n" ; |
| 3765 | + |
| 3766 | + $html .= "<tr><td width=50%>\n" ; |
| 3767 | + |
| 3768 | + # SORTED BY FREQUENCY |
| 3769 | + $html .= "<table border=1>\n" ; |
| 3770 | + $html .= "<tr><th colspan=99 class=l><h3>In order of request volume</h3></th></tr>\n" ; |
| 3771 | + $html .= "<tr><th colspan=99 class=l>Requests originating from a Google ip address</th></tr>\n" ; |
| 3772 | +# $html .= "<tr><th colspan=99 class=l><small>x 1000</small></th>\n" ; |
| 3773 | + my $total_total_direct ; |
| 3774 | + my $total_page_direct ; |
| 3775 | + my $total_image_direct ; |
| 3776 | + my $total_rest_direct ; |
| 3777 | + $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3778 | + foreach $key (@searches_service_count) |
| 3779 | + { |
| 3780 | + next if $key !~ /Y$/ ; # googleIp |
| 3781 | + |
| 3782 | + ($key2 = $key) =~ s/,[YN]$// ; |
| 3783 | + $total = $searches_service_mimecat {"$key2,total,Y"} ; |
| 3784 | + $page = $searches_service_mimecat {"$key2,page,Y"} ; |
| 3785 | + $image = $searches_service_mimecat {"$key2,image,Y"} ; |
| 3786 | + $rest = $searches_service_mimecat {"$key2,other,Y"} ; |
| 3787 | + $total_total_direct += $total ; |
| 3788 | + $total_page_direct += $page ; |
| 3789 | + $total_image_direct += $image ; |
| 3790 | + $total_rest_direct += $rest ; |
| 3791 | + $total = &FormatCount ($total) ; |
| 3792 | + $page = &FormatCount ($page) ; |
| 3793 | + $image = &FormatCount ($image) ; |
| 3794 | + $rest = &FormatCount ($rest) ; |
| 3795 | + $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3796 | + } |
| 3797 | + $total_page_all = $total_page_direct ; |
| 3798 | + |
| 3799 | +# $total_page_requests_external_fmt = &FormatCount ($total_page_requests_external*1000) ; |
| 3800 | + |
| 3801 | + $perc_google_direct = ".." ; |
| 3802 | + if ($total_page_requests_external > 0) |
| 3803 | + { $perc_google_direct = sprintf ("%.1f",100 * $total_page_direct/$total_page_requests_external) ; } |
| 3804 | + $total_page_direct_fmt = &FormatCount ($total_page_direct*1000) ; |
| 3805 | + $perc_google_msg_direct = "<p>Including all of its different search crawlers and services hosted on its servers, Google itself requested another $total_page_direct_fmt page pages per day, representing $perc_google_direct% of our external page requests.\n" ; |
| 3806 | + |
| 3807 | + $total_total_direct = &FormatCount ($total_total_direct) ; |
| 3808 | + $total_page_direct = &FormatCount ($total_page_direct) ; |
| 3809 | + $total_image_direct = &FormatCount ($total_image_direct) ; |
| 3810 | + $total_rest_direct = &FormatCount ($total_rest_direct) ; |
| 3811 | + |
| 3812 | + $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_direct</th><th class=r>$total_page_direct</th><th class=r>$total_image_direct</th><th class=r>$total_rest_direct</th></tr>\n" ; |
| 3813 | + |
| 3814 | + my $total_total_indirect ; |
| 3815 | + my $total_page_indirect ; |
| 3816 | + my $total_image_indirect ; |
| 3817 | + my $total_rest_indirect ; |
| 3818 | + |
| 3819 | + $html .= "<tr><th colspan=99 class=l> </th></tr>\n" ; |
| 3820 | + $html .= "<tr><th colspan=99 class=l>Requests originating from elsewhere</th></tr>\n" ; |
| 3821 | + $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3822 | + foreach $key (@searches_service_count) |
| 3823 | + { |
| 3824 | + next if $key =~ /Y$/ ; # googleIp |
| 3825 | + |
| 3826 | + ($key2 = $key) =~ s/,[YN]$// ; |
| 3827 | + $total = $searches_service_mimecat {"$key2,total,N"} ; |
| 3828 | + $page = $searches_service_mimecat {"$key2,page,N"} ; |
| 3829 | + $image = $searches_service_mimecat {"$key2,image,N"} ; |
| 3830 | + $rest = $searches_service_mimecat {"$key2,other,N"} ; |
| 3831 | + $total_total_indirect += $total ; |
| 3832 | + $total_page_indirect += $page ; |
| 3833 | + $total_image_indirect += $image ; |
| 3834 | + $total_rest_indirect += $rest ; |
| 3835 | + $total = &FormatCount ($total) ; |
| 3836 | + $page = &FormatCount ($page) ; |
| 3837 | + $image = &FormatCount ($image) ; |
| 3838 | + $rest = &FormatCount ($rest) ; |
| 3839 | + $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3840 | + } |
| 3841 | + $total_page_all += $total_page_indirect ; |
| 3842 | + |
| 3843 | + $perc_google_indirect = ".." ; |
| 3844 | + if ($total_page_requests_external > 0) |
| 3845 | + { $perc_google_indirect = sprintf ("%.1f",100 * $total_page_indirect/$total_page_requests_external) ; } |
| 3846 | + $total_page_indirect_fmt = &FormatCount ($total_page_indirect*1000) ; |
| 3847 | + $perc_google_msg_indirect = "<p>Google referred to our sites, through its services including search, maps, and Google Earth, $total_page_indirect_fmt page views per day, representing $perc_google_indirect% of our external page requests.\n" ; |
| 3848 | + |
| 3849 | + $total_total_indirect = &FormatCount ($total_total_indirect) ; |
| 3850 | + $total_page_indirect = &FormatCount ($total_page_indirect) ; |
| 3851 | + $total_image_indirect = &FormatCount ($total_image_indirect) ; |
| 3852 | + $total_rest_indirect = &FormatCount ($total_rest_indirect) ; |
| 3853 | + |
| 3854 | + $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_indirect</th><th class=r>$total_page_indirect</th><th class=r>$total_image_indirect</th><th class=r>$total_rest_indirect</th></tr>\n" ; |
| 3855 | + $html .= "<tr><th class=l colspan=99> </td></tr>\n" ; |
| 3856 | + $html .= "<tr><th colspan=99 class=l><a href='http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains'>Top level domains</a></th></tr>\n" ; |
| 3857 | + |
| 3858 | +# $total_page_all_fmt = &FormatCount ($total_page_all*1000) ; |
| 3859 | + |
| 3860 | + $perc_google = ".." ; |
| 3861 | + if ($total_page_requests_external > 0) |
| 3862 | + { $perc_google = sprintf ("%.1f",100 * $total_page_all/$total_page_requests_external) ; } |
| 3863 | + |
| 3864 | + $perc_google_msg_all = "<p>In total Google was somehow involved in $perc_google\% of daily external page<sup>*<\/sup> requests \n" ; |
| 3865 | + $html =~ s/PERC_GOOGLE/<hr width=90%>$perc_google_msg_all $perc_google_msg_indirect $perc_google_msg_direct<p><small>* = mime type <a href='SquidReportRequests.htm'>text\/html<\/a> only<\/small>/ ; |
| 3866 | + |
| 3867 | + $total_total = 0 ; |
| 3868 | + $total_page = 0 ; |
| 3869 | + $total_image = 0 ; |
| 3870 | + $total_rest = 0 ; |
| 3871 | + foreach $key (@searches_toplevel_count) |
| 3872 | + { |
| 3873 | + $total = $searches_toplevel_mimecat {"$key,total"} ; |
| 3874 | + $page = $searches_toplevel_mimecat {"$key,page"} ; |
| 3875 | + $image = $searches_toplevel_mimecat {"$key,image"} ; |
| 3876 | + $rest = $searches_toplevel_mimecat {"$key,other"} ; |
| 3877 | + $total_total += $total ; |
| 3878 | + $total_page += $page ; |
| 3879 | + $total_image += $image ; |
| 3880 | + $total_rest += $rest ; |
| 3881 | + $total = &FormatCount ($total) ; |
| 3882 | + $page = &FormatCount ($page) ; |
| 3883 | + $image = &FormatCount ($image) ; |
| 3884 | + $rest = &FormatCount ($rest) ; |
| 3885 | + if ($key !~ /^[\_\.]/) |
| 3886 | + { $key = ".$key" ; } |
| 3887 | +# else |
| 3888 | +# { $key =~ s/^[\.]// ; } |
| 3889 | + if ($key =~ /^\_/) |
| 3890 | + { $key = "<i>" . substr ($key,1) . "</i>" ; } |
| 3891 | + $html .= "<tr><td class=l>$key</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3892 | + } |
| 3893 | + $total_no_tld = $searches_mimecat_tld_not_found {"total"} ; |
| 3894 | + $page_no_tld = $searches_mimecat_tld_not_found {"page"} ; |
| 3895 | + $image_no_tld = $searches_mimecat_tld_not_found {"image"} ; |
| 3896 | + $other_no_tld = $searches_mimecat_tld_not_found {"other"} ; |
| 3897 | + |
| 3898 | + $total_total += $total_no_tld ; |
| 3899 | + $total_page += $page_no_tld ; |
| 3900 | + $total_image += $image_no_tld ; |
| 3901 | + $total_rest += $other_no_tld ; |
| 3902 | + |
| 3903 | + $total_no_tld = &FormatCount ($total_no_tld) ; |
| 3904 | + $page_no_tld = &FormatCount ($page_no_tld) ; |
| 3905 | + $image_no_tld = &FormatCount ($image_no_tld) ; |
| 3906 | + $other_no_tld = &FormatCount ($other_no_tld) ; |
| 3907 | + $html .= "<tr><td class=l>undefined</a></td><td class=r>$total_no_tld</td><td class=r>$page_no_tld</td><td class=r>$image_no_tld</td><td class=r>$other_no_tld</td></tr>\n" ; |
| 3908 | + |
| 3909 | + $total_total = &FormatCount ($total_total) ; |
| 3910 | + $total_page = &FormatCount ($total_page) ; |
| 3911 | + $total_image = &FormatCount ($total_image) ; |
| 3912 | + $total_rest = &FormatCount ($total_rest) ; |
| 3913 | + $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total</th><th class=r>$total_page</th><th class=r>$total_image</th><th class=r>$total_rest</th></tr>\n" ; |
| 3914 | + |
| 3915 | + $html .= "</table>\n" ; |
| 3916 | + |
| 3917 | + $html .= "</td><td width=50%>\n" ; |
| 3918 | + |
| 3919 | + # SORTED BY ALPHABETICALLY |
| 3920 | + $html .= "<table border=1>\n" ; |
| 3921 | + $html .= "<tr><th colspan=99 class=l><h3>In alphabetical order</h3></th></tr>\n" ; |
| 3922 | + $html .= "<tr><th colspan=99 class=l>Requests originating from a Google ip address</th></tr>\n" ; |
| 3923 | +# $html .= "<tr><th colspan=99 class=l><small>x 1000</small></th>\n" ; |
| 3924 | + $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3925 | + foreach $key (@searches_service_alpha) |
| 3926 | + { |
| 3927 | + next if $key !~ /Y$/ ; # googleIp |
| 3928 | + |
| 3929 | + ($key2 = $key) =~ s/,[YN]$// ; |
| 3930 | + $total = $searches_service_mimecat {"$key2,total,Y"} ; |
| 3931 | + $page = $searches_service_mimecat {"$key2,page,Y"} ; |
| 3932 | + $image = $searches_service_mimecat {"$key2,image,Y"} ; |
| 3933 | + $rest = $searches_service_mimecat {"$key2,other,Y"} ; |
| 3934 | + $total = &FormatCount ($total) ; |
| 3935 | + $page = &FormatCount ($page) ; |
| 3936 | + $image = &FormatCount ($image) ; |
| 3937 | + $rest = &FormatCount ($rest) ; |
| 3938 | + if ($key !~ /(?:undefined|unspecified|crawler|feedfetcher|wireless transcoder)/) |
| 3939 | + { $key = ucfirst ($key) ; } |
| 3940 | + else |
| 3941 | + { $key = "<i>$key</i>" ; } |
| 3942 | + $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3943 | + } |
| 3944 | + $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_direct</th><th class=r>$total_page_direct</th><th class=r>$total_image_direct</th><th class=r>$total_rest_direct</th></tr>\n" ; |
| 3945 | + |
| 3946 | + $html .= "<tr><th colspan=99 class=l> </th></tr>\n" ; |
| 3947 | + $html .= "<tr><th colspan=99 class=l>Requests originating from elsewhere</th></tr>\n" ; |
| 3948 | + $html .= "<tr><th class=l>Service</a><th class=r>Total</th><th class=r>Pages</th><th class=r>Images</th><th class=r>Other</th></tr>\n" ; |
| 3949 | + foreach $key (@searches_service_alpha) |
| 3950 | + { |
| 3951 | + next if $key =~ /Y$/ ; # googleIp |
| 3952 | + |
| 3953 | + ($key2 = $key) =~ s/,[YN]$// ; |
| 3954 | + $total = $searches_service_mimecat {"$key2,total,N"} ; |
| 3955 | + $page = $searches_service_mimecat {"$key2,page,N"} ; |
| 3956 | + $image = $searches_service_mimecat {"$key2,image,N"} ; |
| 3957 | + $rest = $searches_service_mimecat {"$key2,other,N"} ; |
| 3958 | + $total = &FormatCount ($total) ; |
| 3959 | + $page = &FormatCount ($page) ; |
| 3960 | + $image = &FormatCount ($image) ; |
| 3961 | + $rest = &FormatCount ($rest) ; |
| 3962 | + if ($key !~ /(?:undefined|unspecified|crawler|feedfetcher|wireless transcoder)/) |
| 3963 | + { $key = ucfirst ($key) ; } |
| 3964 | + else |
| 3965 | + { $key = "<i>$key</i>" ; } |
| 3966 | + $html .= "<tr><td class=l>$key2</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3967 | + } |
| 3968 | + $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total_indirect</th><th class=r>$total_page_indirect</th><th class=r>$total_image_indirect</th><th class=r>$total_rest_indirect</th></tr>\n" ; |
| 3969 | + $html .= "<tr><th class=l colspan=99> </td></tr>\n" ; |
| 3970 | + $html .= "<tr><th colspan=99 class=l>Top level domains</th></tr>\n" ; |
| 3971 | + |
| 3972 | + $total_total = 0 ; |
| 3973 | + $total_page = 0 ; |
| 3974 | + $total_image = 0 ; |
| 3975 | + $total_rest = 0 ; |
| 3976 | + foreach $key (@searches_toplevel_alpha) |
| 3977 | + { |
| 3978 | + $total = $searches_toplevel_mimecat {"$key,total"} ; |
| 3979 | + $page = $searches_toplevel_mimecat {"$key,page"} ; |
| 3980 | + $image = $searches_toplevel_mimecat {"$key,image"} ; |
| 3981 | + $rest = $searches_toplevel_mimecat {"$key,other"} ; |
| 3982 | + $total_total += $total ; |
| 3983 | + $total_page += $page ; |
| 3984 | + $total_image += $image ; |
| 3985 | + $total_rest += $rest ; |
| 3986 | + $total = &FormatCount ($total) ; |
| 3987 | + $page = &FormatCount ($page) ; |
| 3988 | + $image = &FormatCount ($image) ; |
| 3989 | + $rest = &FormatCount ($rest) ; |
| 3990 | + if ($key !~ /^[\_\.]/) |
| 3991 | + { $key = ".$key" ; } |
| 3992 | + if ($key =~ /^\_/) |
| 3993 | + { $key = "<i>" . substr ($key,1) . "</i>" ; } |
| 3994 | + $html .= "<tr><td class=l>$key</a></td><td class=r>$total</td><td class=r>$page</td><td class=r>$image</td><td class=r>$rest</td></tr>\n" ; |
| 3995 | + } |
| 3996 | + $total_no_tld = $searches_mimecat_tld_not_found {"total"} ; |
| 3997 | + $page_no_tld = $searches_mimecat_tld_not_found {"page"} ; |
| 3998 | + $image_no_tld = $searches_mimecat_tld_not_found {"image"} ; |
| 3999 | + $other_no_tld = $searches_mimecat_tld_not_found {"other"} ; |
| 4000 | + |
| 4001 | + $total_total += $total_no_tld ; |
| 4002 | + $total_page += $page_no_tld ; |
| 4003 | + $total_image += $image_no_tld ; |
| 4004 | + $total_rest += $other_no_tld ; |
| 4005 | + |
| 4006 | + $total_no_tld = &FormatCount ($total_no_tld) ; |
| 4007 | + $page_no_tld = &FormatCount ($page_no_tld) ; |
| 4008 | + $image_no_tld = &FormatCount ($image_no_tld) ; |
| 4009 | + $other_no_tld = &FormatCount ($other_no_tld) ; |
| 4010 | + $html .= "<tr><td class=l>undefined</a></td><td class=r>$total_no_tld</td><td class=r>$page_no_tld</td><td class=r>$image_no_tld</td><td class=r>$other_no_tld</td></tr>\n" ; |
| 4011 | + |
| 4012 | + $total_total = &FormatCount ($total_total) ; |
| 4013 | + $total_page = &FormatCount ($total_page) ; |
| 4014 | + $total_image = &FormatCount ($total_image) ; |
| 4015 | + $total_rest = &FormatCount ($total_rest) ; |
| 4016 | + $html .= "<tr><th class=l>Total</a></th><th class=r>$total_total</th><th class=r>$total_page</th><th class=r>$total_image</th><th class=r>$total_rest</th></tr>\n" ; |
| 4017 | + |
| 4018 | + $html .= "</table>\n" ; |
| 4019 | + $html .= "</td></tr>\n" ; |
| 4020 | + |
| 4021 | + |
| 4022 | + $breakdown = "Here is detailed breakdown per service of indicators that pointed to Google <small>(total ≥ 3)</small><br> <br>" . |
| 4023 | + "<table width=100%><tr><th class=l>Service</th><th class=c>Total</th><th class=c>Originating from<br>Google ip address</th><th class=c>Referer mentions<br>Google url</th><th class=c>Agent mentions<br>Google service</th></tr>\n" ; |
| 4024 | + foreach $key (@searches_service_matches_alpha) |
| 4025 | + { |
| 4026 | + $count = $searches_service_matches {$key} ; |
| 4027 | + |
| 4028 | + next if $count <= 2 ; |
| 4029 | + |
| 4030 | + $count = &FormatCount ($count) ; |
| 4031 | + ($service,$matches) = split (',', $key) ; |
| 4032 | + if ($matches =~ /x/) { $x = 'Y' } else { $x = '-' } ; |
| 4033 | + if ($matches =~ /y/) { $y = 'Y' } else { $y = '-' } ; |
| 4034 | + if ($matches =~ /z/) { $z = 'Y' } else { $z = '-' } ; |
| 4035 | + $breakdown .= "<tr><td class=l>$service</td><td class=r>$count</td><td class=c>$x</td><td class=c>$y</td><td class=c>$z</td></tr>" ; |
| 4036 | + } |
| 4037 | + $breakdown .= "</table><br.&bsp;<br>\n" ; |
| 4038 | + |
| 4039 | + |
| 4040 | + $html .= "<tr><td class=l colspan=99><a name='details' id='details'></a> <p>" . |
| 4041 | + $google_ip_ranges . |
| 4042 | + "<b>Agents</b>: as for genuine agent strings: too many crawlers indentify themselves as 'GoogleBot' to take this at face value. " . |
| 4043 | + "They are accepted as genuine Google crawler requests only when the ip address matches a known range (see above). " . |
| 4044 | + "Other records that mention GoogleBot are counted as GoogleBot? (question mark, as this may include partners, like DoCoMo). " . |
| 4045 | + "However when the agent string mentions Google Desktop or Google Earth this is always accepted" . |
| 4046 | + "<p><b>Service</b>: the service name is based on the agent string (plus for GoogleBot check for ip address, see above), if this is inconclusive it is based on the referer string." . |
| 4047 | + "<p>$breakdown" . |
| 4048 | + "<p><b>Top Level Domain 'undefined'</b>: requests with top level domain 'undefined' are nearly all requests from anonymous ip addresses (crawler and other services)" . |
| 4049 | + "<p><b>Note</b>: averages below 1 are always rounded up to 1\n" . |
| 4050 | + "</small></td></tr>\n"; |
| 4051 | + |
| 4052 | + $html .= "</table>\n" ; |
| 4053 | + |
| 4054 | + $html .= $colophon ; |
| 4055 | + |
| 4056 | + print FILE_HTML_SEARCH $html ; |
| 4057 | + close FILE_HTML_SEARCH ; |
| 4058 | +} |
| 4059 | + |
# Write the html report on requested skin files to $dir_reports/$file_html_skins.
#
# Takes no arguments and returns nothing useful; works entirely on globals:
#   reads  $header, $colophon, the $link_... / $dummy_skins navigation snippets,
#          @skins_sorted_skin (skin file paths), %skins (count per path),
#          %skin_set (value per skin name, compared against 3)
#   writes $html (overwritten), $rows, $nameprev, $name, $rest, $count
# Side effect: the leading 'skins/' is stripped in place from the elements of
# @skins_sorted_skin (foreach aliases the array elements).
sub WriteReportSkins
{
  my $file_out = "$dir_reports/$file_html_skins" ;

  # A failed open used to pass unnoticed (print to a closed handle is silent).
  # Only a warning is added: the report is still built so globals end up as before.
  open (FILE_HTML_SKINS, '>', $file_out) or warn "WriteReportSkins: could not open '$file_out' for writing: $!\n" ;

  # Fill in the placeholders of the shared page header
  $html = $header ;
  $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Skins/ ;
  $html =~ s/HEADER/Wikimedia Traffic Analysis Report - Skins/ ;
  $html =~ s/ALSO/ See also: <b>LINKS<\/b>/ ;
  $html =~ s/LINKS/$link_requests $link_origins \/ $link_methods \/ $link_scripts \/ $dummy_skins \/ $link_crawlers \/ $link_opsys \/ $link_browsers \/ $link_google/ ;
  $html =~ s/X1000/⇒ <font color=#008000><b>all counts x 1000<\/b><\/font>.<br>/ ;

  $html .= "<table border=1>\n" ;

  $html .= "<tr><td colspan=99 class=l><b>Skin</b><br>Files (≥ 3)</td></tr>\n" ;
  $rows = 0 ;
  $nameprev = "" ;
  foreach $skin (@skins_sorted_skin)
  {
    $count = &FormatCount ($skins {$skin}) ;

    # NOTE(review): the threshold is applied to the *formatted* count, not the raw one;
    # confirm that &FormatCount output still compares numerically as intended.
    next if $count < 3 ;

    $skin =~ s/^skins\/// ;
    ($name,$rest) = split ('\/', $skin, 2) ; # $name = first path component (the skin)

    next if $skin_set {$name} < 3 ;

    # Start a new section with a heading row whenever the skin name changes
    if ($name ne $nameprev)
    { $html .= "<tr><th colspan=99 class=l> <br><b>" . ucfirst ($name) . "</b></th></tr>\n" ; }
    $nameprev = $name ;

    $html .= "<tr><td class=l>$skin</td><td class=r>$count</td></tr>\n" ;
    $rows++ ;
  }
  $html .= "</table>\n" ;

  $html .= " <small>$rows rows written</small><p>" ;

# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ;
  $html .= $colophon ;

  print FILE_HTML_SKINS $html ;
  close FILE_HTML_SKINS ;
}
| 4103 | + |
| 4104 | + $html .= "</td></tr></table>\n" ; |
| 4105 | +# $html .= " <small>$rows rows written</small><p>" ; |
| 4106 | + |
| 4107 | +# $html .= "<p><b>Explanation:</b><br>'osd' = opensearchdescription / 'php.ser' = vnd.php.serialized" ; |
| 4108 | + $html .= $colophon ; |
| 4109 | + |
| 4110 | + print FILE_HTML_ORIGINS $html ; |
| 4111 | + close FILE_HTML_ORIGINS ; |
| 4112 | +} |
| 4113 | + |
# Merge the Google bot csv files found in each directory of @dirs_process into one csv
# file in $dir_reports, and append one totals row per timestamp (ip range shown as '*').
#
# Takes no arguments; works on globals:
#   reads  $dir_reports, $file_csv_google_bots, @dirs_process
#   writes %googlebots (hits accumulated per timestamp; not reset here)
# Input lines are expected as 'yyyy/mm/dd hh:..,range,hits'; lines starting with '#'
# (comments) or ':' (csv header) are skipped. Timestamps are rewritten as spreadsheet
# formulas, truncated to the hour.
sub WriteCsvGoogleBots
{
  my $file_out = "$dir_reports/$file_csv_google_bots" ;
  open (CSV_GOOGLE_BOTS_OUT, '>', $file_out) or warn "WriteCsvGoogleBots: could not open '$file_out' for writing: $!\n" ;

  # Header follows the actual column order of the rows below (hits before ip range);
  # it used to read 'Date Time,Ip Range,Hits', which mislabelled both columns.
  print CSV_GOOGLE_BOTS_OUT "Date Time,Hits,Ip Range\n" ;

  foreach $dir_process (@dirs_process)
  {
    my $file_in = "$dir_process/$file_csv_google_bots" ;
    if (! open (CSV_GOOGLE_BOTS_IN, '<', $file_in))
    {
      # A missing input file contributes no rows, as before, but is now reported
      warn "WriteCsvGoogleBots: could not open '$file_in' for reading: $!\n" ;
      next ;
    }

    while ($line = <CSV_GOOGLE_BOTS_IN>)
    {
      next if $line =~ /^#/ ; # comments
      next if $line =~ /^:/ ; # csv header (not a comment)

      chomp $line ;
      ($datetime,$range,$hits) = split (',', $line) ;
      ($date,$time)            = split (' ', $datetime) ;
      ($year,$month,$day)      = split ('\/', $date) ;
      $hour = substr ($time,0,2) ;

      # Spreadsheet formula, so the value is read as a real date/time
      $datetime = "\"=DATE($year,$month,$day)+TIME($hour,0,0)\"" ;
      print CSV_GOOGLE_BOTS_OUT "$datetime,$hits,$range\n" ;
      $googlebots {$datetime} += $hits ;
    }
    close CSV_GOOGLE_BOTS_IN ;
  }

  # Totals over all ip ranges, one row per timestamp
  foreach $datetime (sort keys %googlebots)
  { print CSV_GOOGLE_BOTS_OUT "$datetime,$googlebots{$datetime},*\n" ; }

  close CSV_GOOGLE_BOTS_OUT ;
}
| 4141 | + |
# Write the contents of %browser_languages as csv to $dir_reports/$file_csv_browser_languages,
# one 'key,value' row per hash entry, keys in the order given by keys_sorted_alpha_asc.
# The header names three columns, so each key presumably holds 'browser,languages'
# already joined by a comma - confirm against the code that fills %browser_languages.
#
# Takes no arguments; reads globals $dir_reports, $file_csv_browser_languages, %browser_languages.
sub WriteCsvBrowserLanguages
{
  my $file_out = "$dir_reports/$file_csv_browser_languages" ;

  # A failed open used to pass unnoticed; report it, then carry on as before
  open (CSV_BROWSER_LANGUAGES, '>', $file_out) or warn "WriteCsvBrowserLanguages: could not open '$file_out' for writing: $!\n" ;

  print CSV_BROWSER_LANGUAGES "Browser,Languages,Hits\n" ;
  foreach $key (keys_sorted_alpha_asc %browser_languages)
  { print CSV_BROWSER_LANGUAGES "$key,$browser_languages{$key}\n" ; }

  close CSV_BROWSER_LANGUAGES ;
}
| 4150 | + |
# Write a csv file with, per target wiki and per bot flag ("N"/"Y"), the request counts
# per time slot for the (at most) 26 countries with the highest non-bot totals for that wiki.
#
# Per target and bot flag three kinds of rows are written:
#   1. 'Bot,Wiki,Time,' followed by the total per country
#   2. 'Bot,Wiki,Time,' followed by the country names
#   3. one row per key of %times: bot flag, target, time as spreadsheet formula, count per country
# All counts are multiplied by 1000 * $multiplier and rounded to whole numbers.
#
# Takes no arguments; works on globals:
#   reads  $multiplier, $file_csv_countries_timed, %targets, %times, %country_codes,
#          %countries_totals {"bot,target"}{country}, %countries_timed {"bot,target,country,time"}
#   writes $multiplier_1000, @countries, $line, $hrs, $min, $time2
sub WriteCsvCountriesTimed
{
  $multiplier_1000 = 1000 * $multiplier ;

# open CSV_COUNTRIES_TIMED, '>', "$dir_reports/$file_csv_countries_timed" ;
  my $file_out = "/home/ezachte/$file_csv_countries_timed" ;
  open (CSV_COUNTRIES_TIMED, '>', $file_out) or warn "WriteCsvCountriesTimed: could not open '$file_out' for writing: $!\n" ;

  foreach $target (sort keys %targets)
  {
    # Countries ordered by non-bot total for this target, highest first
    @countries = sort {$countries_totals {"N,$target"}{$b} <=> $countries_totals {"N,$target"}{$a}} keys %{$countries_totals {"N,$target"}} ;

    # Only the first 26 countries (index 0..25) are reported, same cut-off as before
    my $index_last    = $#countries < 25 ? $#countries : 25 ;
    my @countries_top = @countries [0 .. $index_last] ;

    foreach $bot ("N","Y")
    {
      # Row with totals per country
      $line = "\nBot,Wiki,Time," ;
      foreach $country (@countries_top)
      { $line .= sprintf ("%.0f", $multiplier_1000 * $countries_totals {"$bot,$target"}{$country}) . "," ; }
      print CSV_COUNTRIES_TIMED "$line\n" ;

      # Row with country names
      $line = "\nBot,Wiki,Time," ;
      foreach $country (@countries_top)
      {
        $country_name = $country_codes {$country} ;
        $line .= "$country_name," ;
      }
      print CSV_COUNTRIES_TIMED "$line\n" ;

      # One row per time slot; $time is treated as minutes (hours = $time / 60, minutes = $time % 60)
      foreach $time (sort {$a <=> $b} keys %times)
      {
        $hrs   = int ($time / 60) ; # whole hours only: a plain division put fractions into =Time(h,m,s)
        $min   = $time % 60 ;
        $time2 = "\"=Time($hrs,$min,0)\"" ;

        $line = "$bot,$target,$time2," ;
        foreach $country (@countries_top)
        { $line .= sprintf ("%.0f", $multiplier_1000 * $countries_timed {"$bot,$target,$country,$time"}) . "," ; }
        print CSV_COUNTRIES_TIMED "$line\n" ;
      }
    }
  }
  close CSV_COUNTRIES_TIMED ;
}
| 4203 | + |
| 4204 | +# http://www.maxmind.com/app/iso3166 country codes |
# Write a csv file that shows per country which targets (wikis) its requests went to.
#
# Per country (keys of %countries, sorted) the following rows are written:
#   1. 'Bot,Country,' followed by the names of the (at most) 26 targets with the highest
#      non-bot totals; a target without an entry in %out_languages is shown as '[code]'
#   2. per bot flag ("N"/"Y"): the absolute count per target
#   3. per bot flag: the share per target as percentage of the total over *all* targets
#      of that country (row omitted when that total is zero)
#
# Takes no arguments; works on globals:
#   reads  $file_csv_countries_languages_visited, %countries, %country_codes, %out_languages,
#          %targets_totals {"bot,country"}{target}
#   writes @targets, $line, $country_name, $tot_targets, $target2, $target3
sub WriteCsvCountriesGoTo
{
  my $file_out = "/home/ezachte/$file_csv_countries_languages_visited" ;
  open (CSV_COUNTRIES_LANGUAGES_VISITED, '>', $file_out) or warn "WriteCsvCountriesGoTo: could not open '$file_out' for writing: $!\n" ;

  foreach $country (sort keys %countries)
  {
    # Targets ordered by non-bot total for this country, highest first
    @targets = sort {$targets_totals {"N,$country"}{$b} <=> $targets_totals {"N,$country"}{$a}} keys %{$targets_totals {"N,$country"}} ;

    # Only the first 26 targets (index 0..25) are listed, same cut-off as before
    my $index_last  = $#targets < 25 ? $#targets : 25 ;
    my @targets_top = @targets [0 .. $index_last] ;

    # Header row with target names
    $line = "\nBot,Country," ;
    foreach $target (@targets_top)
    {
      $target2 = $target ;
      $target2 =~ s/^.*?:// ; # drop everything up to and including the first colon
      $target3 = $out_languages {$target2} ;
      if ($target3 eq "")
      { $target3 = "[$target2]" ; }
      $line .= "$target3," ;
    }
    print CSV_COUNTRIES_LANGUAGES_VISITED "$line\n" ;

    # Strip control characters (newline included) from the country name.
    # The old pattern [0x00-0x1F] was a class of the literal characters '0', 'x', '1' and 'F',
    # so it removed those from the name (e.g. 'France' -> 'rance') instead of control codes.
    $country_name = $country_codes {$country} ;
    $country_name =~ s/[\x00-\x1F]//g ;

    foreach $bot ("N","Y")
    {
      # Total over all targets, not just the ones listed
      $tot_targets = 0 ;
      foreach $target (@targets)
      { $tot_targets += $targets_totals {"$bot,$country"}{$target} ; }

      # Absolute counts
      $line = "$bot,$country_name," ;
      foreach $target (@targets_top)
      { $line .= $targets_totals {"$bot,$country"}{$target} . "," ; }
      print CSV_COUNTRIES_LANGUAGES_VISITED "$line\n" ;

      # Percentages
      if ($tot_targets > 0)
      {
        $line = "$bot,$country_name," ;
        foreach $target (@targets_top)
        { $line .= sprintf ("%.1f%%", 100 * $targets_totals {"$bot,$country"}{$target} / $tot_targets) . "," ; } # '%%' = literal percent sign
        print CSV_COUNTRIES_LANGUAGES_VISITED "$line\n" ;
      }
    }
  }
  close CSV_COUNTRIES_LANGUAGES_VISITED ;
}
| 4268 | + |
# Write the html report that shows, per language, how its recent requests are spread
# over countries (one bar per country, plus an 'Other' bar for the remainder).
#
# Arguments:
#   $title       - text substituted for the TITLE and HEADER placeholders of $header
#   $views_edits - accepted for call compatibility; not used in this sub
#   $links       - html substituted for the ALSO placeholder of $header
#
# Works on globals:
#   reads  $header, $colophon, $path_out, $file_html_per_language_breakdown,
#          $requests_recently_start, $requests_recently_stop, $requests_recently_all,
#          %requests_recently_per_language, %requests_recently_per_language_per_country
#   writes $html, @countries, @index_languages, $anomaly_found and several scratch scalars
# Output goes through &PrintHtml.
#
# NOTE(review): numeric comparisons on $perc / $perc_global rely on &Percentage returning
# a string that starts with the number (presumably like '12.3%') - confirm.
# NOTE(review): @index_languages is a global that is never emptied here, so a second call
# would list every language twice in the index - confirm whether it should be lexical.
sub WriteReportPerLanguageBreakDown
{
  print "\nWriteReportPerLanguageBreakDown\n" ;

  my ($title,$views_edits,$links) = @_ ;
  my ($bars,$bar_width,$perc,$perc_tot,$perc_global) ;

  # Fill in the placeholders of the shared page header
  $html = $header ;
  $html =~ s/TITLE/$title/ ;
  $html =~ s/HEADER/$title/ ;
  $html =~ s/ALSO/$links/ ;
  $html =~ s/LINKS// ;
  $html =~ s/NOTES// ;
  $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ;
  $html =~ s/DATE// ;

  # INDEX is replaced by the list of language anchors once all languages are known
  $html .= "<p><table border=1 width=800>INDEX\n" ;

  my $languages_reported = 0 ;

  foreach $language (keys_sorted_by_value_num_desc %requests_recently_per_language)
  {
    next if $requests_recently_per_language {$language} < 100 ;

    ($language_name,$anchor_language) = &GetLanguageInfo ($language) ;

    my %requests_per_country = %{$requests_recently_per_language_per_country {$language}} ;
    @countries = keys_sorted_by_value_num_desc %requests_per_country ;

    my $requests_this_language = $requests_recently_per_language {$language} ;

    $perc_global = '..' ;
    if ($requests_recently_all > 0)
    { $perc_global = &Percentage ($requests_this_language / $requests_recently_all) ; }

    $html .= "<tr><th colspan=99 class=lh3><a id='$anchor_language' name='$anchor_language'></a><br>$language_name ($language) <small>($perc_global share of global total)</small></th></tr>\n" ;

    # Alternate the bar colour between consecutive languages
    if ($languages_reported % 2 == 0)
    { $gif = "bluebar_hor2.gif" ; }
    else
    { $gif = "greenbar_hor2.gif" ; }

    $perc_tot = 0;
    for ($l = 0 ; $l < 50 ; $l++)
    {
      my $requests_this_country  = $requests_recently_per_language_per_country {$language} {$countries [$l]} ;
      my $requests_all_countries = $requests_recently_per_language {$language} ;
      $perc = 0 ;
      if ($requests_all_countries > 0)
      {
        $perc = &Percentage ($requests_this_country / $requests_all_countries) ;

        # Stop at the first country with too small a share; the smaller the language's
        # global share, the higher the bar for a country to be listed
        last if ($perc < 0.5) || (($perc_global < 0.1) && ($perc < 1) || (($perc_global < 0.01) && ($perc < 3)) || (($perc_global < 0.001) && ($perc < 5))) ;

        $perc_tot += $perc ;
      }

      $country = $countries [$l] ;
      $country =~ s/ .*$// if length ($country) > 20 ; # long names: keep first word only
      $bar_width = int ($perc * 6) ;                   # 6 pixels per percent

      # Only the very first bar of the report is padded to the full 600 pixels
      # ($bars is not reset per language)
      $bar_100 = "" ;
      if ($bars++ == 0)
      {
        $bar_width_100 = 600 - $bar_width ;
        $bar_100 = "<img src='background.gif' width=$bar_width_100 height=15>" ;
      }

      if (($country =~ /Australia/) && ($language_name =~ /Japanese/) && ($perc > 5))
      { $perc .= " <b><a href='#anomaly' onclick='alert(\"Probably incorrectly assigned to this country.\\nOutdated Regional Internet Registry (RIR) administration may have caused this.\")';><font color='#FF0000'>(*)</font></a></b>" ; $anomaly_found = $true ;}

      $html .= "<tr><th class=l class=small nowrap>$country</th>" .
               "<td class=c>[$requests_this_country ]$perc</td>" .
               "<td class=l><img src='$gif' width=$bar_width height=15>$bar_100</td></tr>\n" ;
    }

    if ($perc_tot > 100) { $perc_tot = 100 ; }

    # Remainder not accounted for by the countries listed above
    $perc_other = sprintf '%.1f', 100 - $perc_tot ;
    if ($perc_other > 0)
    {
      $bar_width = $perc_other * 6 ;
      $html .= "<tr><th class=l class=small nowrap>Other</th>" .
               "<td class=c>$perc_other%</td>" .
               "<td class=l><img src='$gif' width=$bar_width height=15></td></tr>\n" ;
    }

    push @index_languages, "<a href='#$anchor_language'>$language_name</a> " ;

    # The counter was never advanced, so every language got the blue bar
    # and the green-bar branch above could not be reached
    $languages_reported++ ;

  # print "\n" ;
  # $html .= "<tr><td colspan=99> </td></tr>\n" ;
  }
  $html .= "</table>" ;
  $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
           "<br> Further percentages show per country share of requests per Wikipedia visited" ;
  $html .= "<p>Countries are only included if the number of requests in the period exceeds 100,000 (100 matching records in 1:1000 sampled log)" ;
  $html .= "<br>Page requests by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ;
  $html .= "<br> A few false negatives are taken for granted. " ;
  $html .= $colophon ;

  $index = &HtmlIndex (join '/ ', sort (@index_languages)) ;
  $html =~ s/INDEX/$index/ ;

  &PrintHtml ($html, "$path_out/$file_html_per_language_breakdown") ;
}
| 4374 | + |
| 4375 | +sub WriteReportPerCountryOverview |
| 4376 | +{ |
| 4377 | + print "\nWriteReportPerCountryOverview\n" ; |
| 4378 | + |
| 4379 | + my ($title,$views_edits,$links) = @_ ; |
| 4380 | + my ($link_country,$population,$icon,$bar,$bars,$bar_width,$perc,$perc_tot,$perc_global,$requests_tot) ; |
| 4381 | + my (@index_countries,@csv_countries) ; |
| 4382 | + my $views_edits_lc = lc $views_edits ; |
| 4383 | + my $views_edits_lcf = ucfirst $views_edits_lc ; |
| 4384 | + ($views_edits2 = $views_edits) =~ s/ /\<br\>/ ; |
| 4385 | + if ($views_edits =~ /edit/i) |
| 4386 | + { $MPVE = 'MPE' ; } # monthly page edits |
| 4387 | + else |
| 4388 | + { $MPVE = 'MPV' ; } # monthly page views |
| 4389 | + |
| 4390 | + $html = $header ; |
| 4391 | + $html =~ s/TITLE/$title/ ; |
| 4392 | + $html =~ s/HEADER/$title/ ; |
| 4393 | + $html =~ s/LINKS// ; |
| 4394 | + $html =~ s/ALSO/$links/ ; |
| 4395 | + $html =~ s/NOTES// ; |
| 4396 | + $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ; |
| 4397 | + $html =~ s/DATE// ; |
| 4398 | + |
| 4399 | + $html .= &HtmlSortTable ; |
| 4400 | + |
| 4401 | + $html .= "<p><table border=1 width=800 class=tablesorter id=table1>\n" ; |
| 4402 | + $html .= "<thead>\n" ; |
| 4403 | + $html .= "INDEX\n" ; |
| 4404 | + |
| 4405 | + $html .= &HtmlWorldMaps ; |
| 4406 | + |
| 4407 | + $html .= "<tr><td class=rh5 colspan=3 rowspan=1><b>Country</b></td><td class=c rowspan=2><b>Monthly<br>$views_edits2</b></td>" . |
| 4408 | + "<td class=r rowspan=2><b>Population</b></td>" . # <td class=c rowspan=2><b>$MPVE's<br>Per<br>Person</b></td>" . |
| 4409 | + "<td class=c colspan=2><b>Internet<br>Users</b></td><td class=c><b>${MPVE}'s<br>Per<br>I U</b></td>" . |
| 4410 | + "<td colspan=99 class=l rowspan=2><b>Share in Global Monthly $views_edits</b><br><small><font color=#808080>red and blue bars have different scale</font></small></td></tr>\n" ; |
| 4411 | + $html .= "<tr><td class=c><b>Name</b></td><td class=c><b>Region</b><br><img src='http://stats.wikimedia.org/Location_of_Continents2.gif'></td><td class=c><b>N/S</b></td><td class=c><b>Total</b></td><td class=c><b>/Pop.</b></td></tr>\n" ; |
| 4412 | + $html .= "<tr><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th colspan=2> </th></tr>\n" ; |
| 4413 | + $html .= "</thead><tbody>\nTOTAL\nREGIONS\n" ; |
| 4414 | + |
| 4415 | + push @csv_countries, "# Wikimedia Traffic Analysis Report - Wikipedia $views_edits Per Country - Overview\n" . |
| 4416 | + "# Report based on data from $requests_recently_start - $requests_recently_stop\n" . |
| 4417 | + "country name, country code, monthly $views_edits_lc,population,internet users,internet penetration,monthly $views_edits_lc per internet user,share of global $views_edits_lc\n" ; |
| 4418 | + |
| 4419 | + $requests_tot = 0 ; |
| 4420 | + |
| 4421 | + undef %requests_per_region ; |
| 4422 | + |
| 4423 | + foreach $country_code (keys_sorted_by_value_num_desc %requests_recently_per_country_code) |
| 4424 | + { |
| 4425 | + my ($country,$code) = split ('\|', $country_code) ; |
| 4426 | + |
| 4427 | + my $region_code = $region_codes {$code} ; |
| 4428 | + my $north_south_code = $north_south_codes {$code} ; |
| 4429 | + |
| 4430 | + $region_name = $region_code ; |
| 4431 | + $region_name =~ s/^AF$/<font color=#028702><b>Africa<\/b><\/font>/ ; |
| 4432 | + $region_name =~ s/^CA$/<font color=#249CA0><b>Central-America<\/b><\/font>/ ; |
| 4433 | + $region_name =~ s/^SA$/<font color=#FCAA03><b>South-America<\/b><\/font>/ ; |
| 4434 | + $region_name =~ s/^NA$/<font color=#C802CA><b>North-America<\/b><\/font>/ ; |
| 4435 | + $region_name =~ s/^AU$/<font color=#02AAD4><b>Australia<\/b><\/font>/ ; |
| 4436 | + $region_name =~ s/^EU$/<font color=#0100CA><b>Europe<\/b><\/font>/ ; |
| 4437 | + $region_name =~ s/^AS$/<font color=#E10202><b>Asia<\/b><\/font>/ ; |
| 4438 | + $region_name =~ s/^OC$/<font color=#02AAD4><b>Oceania<\/b><\/font>/ ; |
| 4439 | + |
| 4440 | + $north_south_name = $north_south_code ; |
| 4441 | + $north_south_name =~ s/^N$/<font color=#000BF7><b>N<\/b><\/font>/ ; |
| 4442 | + $north_south_name =~ s/^S$/<font color=#FE0B0D><b>S<\/b><\/font>/ ; |
| 4443 | + |
| 4444 | +print "\n" ; # qqq |
| 4445 | + ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ; |
| 4446 | + |
| 4447 | + my $requests_this_country = $requests_recently_per_country {$country} ; |
| 4448 | + my $requests_this_country2 = int ($requests_this_country * 1000 / $months_recently) ; |
| 4449 | + $requests_tot += $requests_this_country2 ; |
| 4450 | + |
| 4451 | + $requests_per_region {$region_code} += $requests_this_country ; |
| 4452 | + $requests_per_region {$north_south_code} += $requests_this_country ; |
| 4453 | + $requests_per_region2 {$region_code} += $requests_this_country2 ; |
| 4454 | + $requests_per_region2 {$north_south_code} += $requests_this_country2 ; |
| 4455 | + |
| 4456 | + $requests_per_person = ".." ; |
| 4457 | + if ($population > 0) |
| 4458 | + { $requests_per_person = sprintf ("%.0f", $requests_this_country2 / $population) ; } |
| 4459 | + |
| 4460 | + $requests_per_connected_person = ".." ; |
| 4461 | + if ($connected > 0) |
| 4462 | + { |
| 4463 | + if ($views_edits =~ /edit/i) |
| 4464 | + { $requests_per_connected_person = sprintf ("%.4f", $requests_this_country2 / $connected) ; } |
| 4465 | + else |
| 4466 | + { |
| 4467 | + if ($requests_this_country2 / $connected >= 1.95) |
| 4468 | + { $requests_per_connected_person = sprintf ("%.0f", $requests_this_country2 / $connected) ; } |
| 4469 | + else |
| 4470 | + { $requests_per_connected_person = sprintf ("%.1f", $requests_this_country2 / $connected) ; } |
| 4471 | + } |
| 4472 | + } |
| 4473 | + |
| 4474 | + $perc_share_total = '..' ; |
| 4475 | + if ($requests_recently_all > 0) |
| 4476 | + { $perc_share_total = &Percentage ($requests_this_country / $requests_recently_all) ; } |
| 4477 | + $perc_tot += $perc_share_total ; |
| 4478 | + |
| 4479 | + $bar = " " ; |
| 4480 | + if ($perc_share_total > 0) |
| 4481 | + { $bar = "<img src='redbar_hor.gif' width=" . (int ($perc_share_total * 10)) . " height=15>" ; } |
| 4482 | + |
| 4483 | + $perc_connected = ".." ; |
| 4484 | + if ($population > 0) |
| 4485 | + { $perc_connected = sprintf ("%.0f", 100 * $connected / $population) .'%' ; } |
| 4486 | + |
| 4487 | + # now use country names that are suitable for http://gunn.co.nz/map/ |
| 4488 | + $country2 = $country ; |
| 4489 | + $country2 =~ s/Moldova, Republic of/Moldova/ ; |
| 4490 | + $country2 =~ s/Korea, Republic of/South Korea/ ; |
| 4491 | + $country2 =~ s/Korea, Democratic People's Republic of/North Korea/ ; |
| 4492 | + $country2 =~ s/Iran, Islamic Republic of/Iran/ ; |
| 4493 | + $country2 =~ s/UAE/United Arab Emirates/ ; |
| 4494 | + $country2 =~ s/Congo - The Democratic Republic of the/Democratic Republic of the Congo/ ; |
| 4495 | + $country2 =~ s/^Congo$/Republic of the Congo/ ; |
| 4496 | + $country2 =~ s/Syrian Arab Republic/Syria/ ; |
| 4497 | + $country2 =~ s/Tanzania, United Republic of/Tanzania/ ; |
| 4498 | + $country2 =~ s/Libyan Arab Jamahiriya/Libya/ ; |
| 4499 | + $country2 =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ; |
| 4500 | + $country2 =~ s/Serbia/republic of serbia/ ; |
| 4501 | + $country2 =~ s/Lao People's Democratic Republic/Laos/ ; |
| 4502 | + |
| 4503 | + |
| 4504 | + push @csv_countries, "$country2,$code,$requests_this_country2,$population,$connected,$perc_connected,$requests_per_connected_person,$perc\n" ; |
| 4505 | + |
| 4506 | + $population2 = &i2KM2 ($population) ; |
| 4507 | + $connected2 = &i2KM2 ($connected) ; |
| 4508 | + $requests_this_country2 = &i2KM2 ($requests_this_country2) ; |
| 4509 | + $html .= "<tr><th class=rh3><a id='$country' name='$country'></a>$link_country $icon</td>" . |
| 4510 | + "<td>$region_name</td>" . |
| 4511 | + "<td>$north_south_name</td>" . |
| 4512 | + "<td>$requests_this_country2</td>" . |
| 4513 | + "<td>$population2</td>" . # <td>$requests_per_person</td>" . |
| 4514 | + "<td>$connected2</td>" . |
| 4515 | + "<td>$perc_connected</td>" . |
| 4516 | + "<td>$requests_per_connected_person</td>" . |
| 4517 | + "<td>$perc_share_total</td>" . |
| 4518 | + "<td class=l>$bar</td></tr>\n" ; |
| 4519 | + |
| 4520 | + if ($verbose) |
| 4521 | + { push @index_countries, "<a href=#$country>$country ($perc)</a>\n " ; } |
| 4522 | + else |
| 4523 | + { push @index_countries, "<a href=#$country>$country</a>\n " ; } |
| 4524 | + } |
| 4525 | + |
| 4526 | + |
| 4527 | + $requests_per_person_tot = '..' ; |
| 4528 | + |
| 4529 | + if ($population_tot > 0) |
| 4530 | + { $requests_per_person_tot = sprintf ("%.0f", $requests_tot / $population_tot) ; } |
| 4531 | + |
| 4532 | + if ($connected_tot > 0) |
| 4533 | + { |
| 4534 | + if ($views_edits =~ /edit/i) |
| 4535 | + { $requests_per_connected_person_tot = sprintf ("%.4f", $requests_tot / $connected_tot) ; } |
| 4536 | + else |
| 4537 | + { $requests_per_connected_person_tot = sprintf ("%.0f", $requests_tot / $connected_tot) ; } |
| 4538 | + } |
| 4539 | + |
| 4540 | + $perc_connected_tot = ".." ; |
| 4541 | + if ($population_tot > 0) |
| 4542 | + { $perc_connected_tot = sprintf ("%.0f", 100 * $connected_tot / $population_tot) .'%' ; } |
| 4543 | + |
| 4544 | + push @csv_countries, "world,*,$requests_tot,$population_tot,$connected_tot,$perc_connected_tot,$requests_per_connected_person_tot,100%\n" ; |
| 4545 | + |
| 4546 | + $requests_tot2 = &i2KM2 ($requests_tot) ; |
| 4547 | + $population_tot2 = &i2KM2 ($population_tot) ; |
| 4548 | + $connected_tot2 = &i2KM2 ($connected_tot) ; |
| 4549 | + |
| 4550 | + $html_total = "<tr><th class=rh3>All countries in</td>" . |
| 4551 | + "<td><b>World</b></td>" . |
| 4552 | + "<td> </td>" . |
| 4553 | + "<td>$requests_tot2</td>" . |
| 4554 | + "<td>$population_tot2</td>" . |
| 4555 | + "<td>$connected_tot2</td>" . |
| 4556 | + "<td>$perc_connected_tot</td>" . |
| 4557 | + "<td>$requests_per_connected_person_tot</td>" . |
| 4558 | + "<td>100%</th>" . |
| 4559 | + "<td class=l> </td></tr>\n" ; |
| 4560 | + $html_total .= "<tr><td colspan=99> </td></tr>" ; |
| 4561 | + |
| 4562 | + |
| 4563 | + undef @keys_regions ; |
| 4564 | +# foreach $key (sort keys %population_per_hemisphere) |
| 4565 | +# { push @keys_regions, $key ; } |
| 4566 | + $html_regions = '' ; |
| 4567 | + foreach $key (qw (N S AF AS AU EU CA NA SA OC)) |
| 4568 | + { |
| 4569 | + $region = $key ; |
| 4570 | + |
| 4571 | + $region =~ s/^N$/<font color=#000BF7><b>Global North<\/b><\/font>/ ; |
| 4572 | + $region =~ s/^S$/<font color=#FE0B0D><b>Global South<\/b><\/font>/ ; |
| 4573 | + |
| 4574 | + $region =~ s/^AF$/<font color=#028702><b>Africa<\/b><\/font>/ ; |
| 4575 | + $region =~ s/^CA$/<font color=#249CA0><b>Central-America<\/b><\/font>/ ; |
| 4576 | + $region =~ s/^SA$/<font color=#FCAA03><b>South-America<\/b><\/font>/ ; |
| 4577 | + $region =~ s/^NA$/<font color=#C802CA><b>North-America<\/b><\/font>/ ; |
| 4578 | + $region =~ s/^AU$/<font color=#02AAD4><b>Australia<\/b><\/font>/ ; |
| 4579 | + $region =~ s/^EU$/<font color=#0100CA><b>Europe<\/b><\/font>/ ; |
| 4580 | + $region =~ s/^AS$/<font color=#E10202><b>Asia<\/b><\/font>/ ; |
| 4581 | + $region =~ s/^OC$/<font color=#02AAD4><b>Oceania<\/b><\/font>/ ; |
| 4582 | + |
| 4583 | + $population_region = $population_per_region {$key} ; |
| 4584 | + $connected_region = $connected_per_region {$key} ; |
| 4585 | + $requests_region = $requests_per_region {$key} ; |
| 4586 | + $requests_region2 = $requests_per_region2 {$key} ; |
| 4587 | + |
| 4588 | + $perc_connected_region = ".." ; |
| 4589 | + if ($population_region > 0) |
| 4590 | + { $perc_connected_region = sprintf ("%.0f", 100 * $connected_region / $population_region) .'%' ; } |
| 4591 | + |
| 4592 | + $perc_share_total = '..' ; |
| 4593 | + if ($requests_recently_all > 0) |
| 4594 | + { $perc_share_total = &Percentage ($requests_region / $requests_recently_all) ; } |
| 4595 | + |
| 4596 | + $perc_connected_region = ".." ; |
| 4597 | + if ($population_region > 0) |
| 4598 | + { $perc_connected_region = sprintf ("%.0f", 100 * $connected_region / $population_region) .'%' ; } |
| 4599 | + |
| 4600 | + # $requests_region2 = int ($requests_region * 1000 / $months_recently) ; |
| 4601 | + |
| 4602 | + $requests_per_connected_person = '..' ; |
| 4603 | + if ($connected_region > 0) |
| 4604 | + { |
| 4605 | + if ($views_edits =~ /edit/i) |
| 4606 | + { $requests_per_connected_person = sprintf ("%.4f", $requests_region2 / $connected_region) ; } |
| 4607 | + else |
| 4608 | + { $requests_per_connected_person = sprintf ("%.0f", $requests_region2 / $connected_region) ; } |
| 4609 | + } |
| 4610 | + |
| 4611 | + $population_region = &i2KM2 ($population_region) ; |
| 4612 | + $connected_region = &i2KM2 ($connected_region) ; |
| 4613 | + $requests_region = &i2KM2 ($requests_region) ; |
| 4614 | + $requests_region2 = &i2KM2 ($requests_region2) ; |
| 4615 | + |
| 4616 | + $bar = " " ; |
| 4617 | + if ($perc_share_total > 0) |
| 4618 | + { $bar = "<img src='bluebar_hor.gif' width=" . (int ($perc_share_total * 3)) . " height=15>" ; } |
| 4619 | + |
| 4620 | + # $html_regions .= &WriteReportPerCountryOverviewLine ("All countries in", $region, '', $requests, $population) ; |
| 4621 | + $html_regions .= "<tr><th>All countries in</th>" . |
| 4622 | + "</td><td>$region</td>" . |
| 4623 | + "<td> </td>" . |
| 4624 | + "<td>$requests_region2</td>" . |
| 4625 | + "<td>$population_region</td>" . |
| 4626 | + "<td>$connected_region</td>" . |
| 4627 | + "<td>$perc_connected_region</td>" . |
| 4628 | + "<td>$requests_per_connected_person</td>" . |
| 4629 | + "<td>$perc_share_total</th>" . |
| 4630 | + "<td class=l>$bar</td></tr>\n" ; |
| 4631 | + |
| 4632 | + if (($key eq 'S') || (($key eq 'OC'))) |
| 4633 | + { $html_regions .= "<tr><td colspan=99> </td></tr>" ; } |
| 4634 | + } |
| 4635 | + |
| 4636 | + |
| 4637 | + $html .= "</tbody>\n</table>" ; |
| 4638 | + $html .= "<p>Countries are only included if the number of $views_edits_lc in the period exceeds 100,000 (100 matching records in 1:1000 sampled log)" ; |
| 4639 | + $html .= "<br>$views_edits_lcf by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ; |
| 4640 | + $html .= "<br> A few false negatives are taken for granted. " ; |
| 4641 | + $html .= "Country meta data collected from English Wikipedia (<a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>population</a>, <a href='http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'>internet users</a>)). " ; |
| 4642 | +# $html .= "<br>Monthly $views_edits_lc per person is calculated over total population, regardless of age and internet connectivity" ; # how come, misplaced here ?! |
| 4643 | + |
| 4644 | + $html .= &HtmlSortTableColumns; ; |
| 4645 | + $html .= $colophon ; |
| 4646 | + |
| 4647 | + $index = &HtmlIndex (join '/ ', sort (@index_countries)) ; |
| 4648 | + $html =~ s/INDEX/$index/ ; |
| 4649 | + $html =~ s/TOTAL/$html_total/ ; |
| 4650 | + $html =~ s/REGIONS/$html_regions/ ; |
| 4651 | + |
| 4652 | + &PrintHtml ($html, "$path_out/$file_html_per_country_overview") ; |
| 4653 | +} |
| 4654 | + |
| 4655 | +#sub WriteReportPerCountryOverviewLine |
| 4656 | +#{ |
| 4657 | +# my ($name,$region,$hemisphere,$population,$connected,$requests) = @_ ; |
| 4658 | +# my ($perc_requests, $perc_connected, $requests_per_connected_person) ; |
| 4659 | +# my $html ; |
| 4660 | +# $html = "<tr><th>$name</th></td><td>$region</td><td>$hemisphere</td><td>$requests</td>" . |
| 4661 | +# "<td>$population</td>" . # <td>$requests_per_person_tot</td>" . |
| 4662 | +# "<td>$connected</td><td>$perc_connected</td><td>$requests_per_connected_person</td>" . |
| 4663 | +# "<td>$perc_requests</th><td class=l> </td></tr>\n" ; |
| 4664 | +# return ($html) ; |
| 4665 | +#} |
| 4666 | + |
| 4667 | +sub WriteCsvSvgFilePerCountryOverview # colours the world map for one period in five animation frames, then writes an overview csv
| 4668 | +{ # args: label for the csv header, period key, ref to hash {period}{country|code} -> requests ; the last two args are not used in this sub
| 4669 | + my ($views_edits, $period, $ref_requests_per_period_per_country_code, $max_requests_per_connected_us, $desc_animation) = @_ ;
| 4670 | +# NOTE(review): $desc_animation is lexical here and is not passed on ; WriteWorldMapSvg reads a variable of that name which this sub does not set
| 4671 | + my %requests_per_country_code = %{$ref_requests_per_period_per_country_code -> {$period}} ; # counts for this period
| 4672 | + my %requests_per_country_code_prev = %{$ref_requests_per_period_per_country_code -> {$period_prev}} ; # counts for the period of the previous call (non-lexical $period_prev) ; presumably empty on the first call - TODO confirm
| 4673 | + $period_prev = $period ; # remembered for the next call
| 4674 | +
| 4675 | + my $description = $descriptions_per_period {$period} ;
| 4676 | + my $postfix = $descriptions_per_period {$period} ; # same lookup as $description, used below in the csv file name
| 4677 | +# $test = join '', sort values %requests_per_country_code ;
| 4678 | +# print $test . "\n\n" ;
| 4679 | + print "\nWriteCsvSvgFilePerCountryOverview\n" ;
| 4680 | +
| 4681 | + my ($link_country,$country,$code,$population,$connected,$icon,$bar,$bars,$bar_width,$perc,$perc_tot,$perc_global,$requests_tot,$requests_max,$requests_this_country,$requests_this_country2) ;
| 4682 | + my (@index_countries,@csv_countries,%svg_groups,%percentage_of_total_pageviews,%requests_per_connected_persons) ;
| 4683 | +
| 4684 | + undef @csv_countries ;
| 4685 | + $header_csv_countries = "# Wikimedia Traffic Analysis Report - Wikipedia $views_edits Per Country - Overview\n" .
| 4686 | + "# Report based on data from $description\n" .
| 4687 | + "country,code,views,population,internet users,%connected,views per user,%global views\n" ;
| 4688 | +
| 4689 | + $requests_tot = 0 ;
| 4690 | + undef %fills ; # reset the colour table that RatioAndFillColor2 fills and WriteWorldMapSvg reads
| 4691 | +
| 4692 | +# # normalize to 100% average
| 4693 | +# $requests_cnt = 0 ;
| 4694 | +# $requests_tot = 0 ;
| 4695 | +# foreach $country_code (keys %requests_per_country_code)
| 4696 | +# {
| 4697 | +# $requests_cnt ++ ;
| 4698 | +# $requests_tot += $requests_per_country_code {$country_code} ;
| 4699 | +# }
| 4700 | +
| 4701 | +# die "\$requests_cnt == 0" if $requests_cnt == 0 ;
| 4702 | +# $requests_avg = $requests_tot / $requests_cnt ;
| 4703 | +# print "requests cnt: $requests_cnt, tot: $requests_tot, avg: $requests_avg\n" ;
| 4704 | +
| 4705 | +# die "\$requests_avg == 0" if $requests_avg == 0 ;
| 4706 | +# foreach $country_code (keys %requests_per_country_code)
| 4707 | +# { $requests_per_country_code {$country_code} *= 100/$requests_avg ; }
| 4708 | +# # normalize complete
| 4709 | +
| 4710 | +# print "$code, $country: $requests_this_country\n" ;
| 4711 | + $requests_this_country = $requests_per_country_code {$country_code} ; # NOTE(review): $country_code is not assigned in this sub before this line ; the value is overwritten in every loop below
| 4712 | +
| 4713 | + foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code) # frame 1: 1/5 current + 4/5 previous period
| 4714 | + {
| 4715 | + ($country,$code) = split ('\|', $country_code) ; # keys have the form country|code
| 4716 | + ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
| 4717 | +
| 4718 | + $requests_this_country = ($requests_per_country_code {$country_code} +
| 4719 | + 4*$requests_per_country_code_prev {$country_code}) / 5 ;
| 4720 | + ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ; # 200 is the value that maps to ratio 1 ; the call also stores the colour in %fills
| 4721 | + }
| 4722 | + &WriteWorldMapSvg ("$period-1", $description) ;
| 4723 | +
| 4724 | + foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code) # frame 2: 2/5 current + 3/5 previous period
| 4725 | + {
| 4726 | + ($country,$code) = split ('\|', $country_code) ;
| 4727 | + ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
| 4728 | +
| 4729 | + $requests_this_country = (2*$requests_per_country_code {$country_code} +
| 4730 | + 3*$requests_per_country_code_prev {$country_code}) / 5 ;
| 4731 | + ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ;
| 4732 | + }
| 4733 | + &WriteWorldMapSvg ("$period-2", $description) ;
| 4734 | +
| 4735 | + foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code) # frame 3: 3/5 current + 2/5 previous period
| 4736 | + {
| 4737 | + ($country,$code) = split ('\|', $country_code) ;
| 4738 | + ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
| 4739 | +
| 4740 | + $requests_this_country = (3*$requests_per_country_code {$country_code} +
| 4741 | + 2*$requests_per_country_code_prev {$country_code}) / 5 ;
| 4742 | + ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ;
| 4743 | + }
| 4744 | + &WriteWorldMapSvg ("$period-3", $description) ;
| 4745 | +
| 4746 | + foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code) # frame 4: 4/5 current + 1/5 previous period
| 4747 | + {
| 4748 | + ($country,$code) = split ('\|', $country_code) ;
| 4749 | + ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
| 4750 | +
| 4751 | + $requests_this_country = (4*$requests_per_country_code {$country_code} +
| 4752 | + $requests_per_country_code_prev {$country_code}) / 5 ;
| 4753 | + ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ;
| 4754 | + }
| 4755 | + &WriteWorldMapSvg ("$period-4", $description) ;
| 4756 | +
| 4757 | +
| 4758 | +# print "$code, $country: $requests_this_country\n" ;
| 4759 | +
| 4760 | +
| 4761 | + foreach $country_code (keys_sorted_by_value_num_desc %requests_per_country_code) # frame 5: current period only
| 4762 | + {
| 4763 | + ($country,$code) = split ('\|', $country_code) ;
| 4764 | + ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
| 4765 | +
| 4766 | +# print "$code, $country: $requests_this_country\n" ;
| 4767 | + $requests_this_country = $requests_per_country_code {$country_code} ;
| 4768 | + ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor2 ($code, $requests_this_country, 200, $ratio_linear) ;
| 4769 | +
| 4770 | +next ; # NOTE(review): unconditional, so the rest of this loop body never runs and no csv rows or totals are collected
| 4771 | + $requests_this_country = &CorrectForMissingDays ($period, $requests_per_country_code {$country_code} * 1000, $code, "\$requests_this_country") ;
| 4772 | +
| 4773 | + $requests_tot += $requests_this_country ;
| 4774 | +
| 4775 | + $requests_per_person = ".." ;
| 4776 | + if ($population > 0)
| 4777 | + { $requests_per_person = sprintf ("%.1f", $requests_this_country / $population) ; }
| 4778 | +
| 4779 | + $requests_per_connected_person = ".." ;
| 4780 | + if ($connected > 0)
| 4781 | + {
| 4782 | + # if ($requests_this_country / $connected >= 1.95)
| 4783 | + # { $requests_per_connected_person = sprintf ("%.0f", $requests_this_country / $connected) ; }
| 4784 | + # else
| 4785 | + # { $requests_per_connected_person = sprintf ("%.1f", $requests_this_country / $connected) ; }
| 4786 | + $requests_per_connected_person = sprintf ("%.1f", $requests_this_country / $connected) ;
| 4787 | + }
| 4788 | +
| 4789 | + $perc = '..' ;
| 4790 | + $requests_all = &CorrectForMissingDays ($period, $requests_all_per_period {$period} * 1000, $code, "\$requests_all") ;
| 4791 | + if ($requests_all > 0)
| 4792 | + { $perc = &Percentage ($requests_this_country / $requests_all) ; }
| 4793 | + $perc_tot += $perc ;
| 4794 | +
| 4795 | + $perc_connected = ".." ;
| 4796 | + if ($population > 0)
| 4797 | + { $perc_connected = sprintf ("%.1f", 100 * $connected / $population) .'%' ; }
| 4798 | +
| 4799 | + # now use country names that are suitable for http://gunn.co.nz/map/
| 4800 | + $country =~ s/Moldova, Republic of/Moldova/ ;
| 4801 | + $country =~ s/Korea, Republic of/South Korea/ ;
| 4802 | + $country =~ s/Korea, Democratic People's Republic of/North Korea/ ;
| 4803 | + $country =~ s/Iran, Islamic Republic of/Iran/ ;
| 4804 | + $country =~ s/UAE/United Arab Emirates/ ;
| 4805 | + $country =~ s/Congo - The Democratic Republic of the/Democratic Republic of the Congo/ ;
| 4806 | + $country =~ s/^Congo$/Republic of the Congo/ ;
| 4807 | + $country =~ s/Syrian Arab Republic/Syria/ ;
| 4808 | + $country =~ s/Tanzania, United Republic of/Tanzania/ ;
| 4809 | + $country =~ s/Libyan Arab Jamahiriya/Libya/ ;
| 4810 | + $country =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ;
| 4811 | + $country =~ s/Serbia/republic of serbia/ ;
| 4812 | + $country =~ s/Lao People's Democratic Republic/Laos/ ;
| 4813 | +
| 4814 | + # ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor ($code, $requests_per_connected_person, $max_requests_per_connected_us, $ratio_sqrt) ;
| 4815 | + ($requests_svg,$ratio_svg,$fill_svg) = RatioAndFillColor ($code, $requests_per_person, 3, $ratio_sqrt) ;
| 4816 | + $ratio_svg = sprintf ("%.1f", $ratio_svg) ;
| 4817 | + push @csv_countries, "\"$country\",$code,$requests_this_country,$population,$connected,$perc_connected,$requests_per_connected_person,$perc,$requests_svg,$ratio_svg,$fill_svg\n" ;
| 4818 | +
| 4819 | + $requests_per_connected_persons {lc $code} = $requests_per_connected_person ;
| 4820 | + $requests_per_persons {lc $code} = $requests_per_person ;
| 4821 | + $percentage_of_total_pageviews {lc $code} = $perc ;
| 4822 | + }
| 4823 | + &WriteWorldMapSvg ("$period-5", $description) ;
| 4824 | +
| 4825 | + $requests_per_person_tot = '..' ;
| 4826 | +
| 4827 | + if ($population_tot > 0) # NOTE(review): $population_tot and $connected_tot are not set anywhere in this sub
| 4828 | + { $requests_per_person_tot = sprintf ("%.1f", $requests_tot / $population_tot) ; }
| 4829 | +
| 4830 | + if ($connected_tot > 0)
| 4831 | + { $requests_per_connected_person_tot = sprintf ("%.1f", $requests_tot / $connected_tot) ; }
| 4832 | +
| 4833 | + $perc_connected_tot = ".." ;
| 4834 | + if ($population_tot > 0)
| 4835 | + { $perc_connected_tot = sprintf ("%.1f", 100 * $connected_tot / $population_tot) .'%' ; }
| 4836 | +
| 4837 | + push @csv_countries, "world,*,$requests_tot,$population_tot,$connected_tot,$perc_connected_tot,$requests_per_connected_person_tot,100%\n" ; # because of the unconditional next above this is the only data row, and $requests_tot is still 0
| 4838 | + print "$period $requests_tot\n" ;
| 4839 | +
| 4840 | + $file_csv_per_country_overview2 = $file_csv_per_country_overview ;
| 4841 | + $file_csv_per_country_overview2 =~ s/\.csv/-$postfix.csv/ ; # insert the period description before the extension
| 4842 | + &PrintCsv ($header_csv_countries . join ('', sort @csv_countries), "$path_out/svg/$file_csv_per_country_overview2") ; # rows are sorted as plain strings
| 4843 | +
| 4844 | +# $perc_tot = 0 ;
| 4845 | +# foreach $code (keys_sorted_by_value_num_desc %requests_per_connected_persons)
| 4846 | +# {
| 4847 | +# $perc = $percentage_of_total_pageviews {$code} ;
| 4848 | +# $requests = $requests_per_connected_persons {$code} ;
| 4849 | +# $perc =~ s/\%// ;
| 4850 | +# $perc_tot += $perc ;
| 4851 | +# print "$code $requests $perc $perc_tot\n" ;
| 4852 | +# if ($perc_tot > 30)
| 4853 | +# {
| 4854 | +# $requests_max = $requests ;
| 4855 | +# print "Max requests = $requests_max\n " ;
| 4856 | +# last ;
| 4857 | +# }
| 4858 | +# }
| 4859 | +
| 4860 | +# for svg with prefined styles (InkScape only ?)
| 4861 | +# foreach $code (keys %requests_per_connected_persons)
| 4862 | +# {
| 4863 | +# $requests = $requests_per_connected_persons {$code} ;
| 4864 | +# if ($requests > $max_requests_per_connected_us)
| 4865 | +# { $requests = $max_requests_per_connected_us ; }
| 4866 | +# $svg_groups {$requests} .= "." . lc ($code) . ", " ;
| 4867 | +# }
| 4868 | +
| 4869 | +#foreach $code (keys %requests_per_connected_persons)
| 4870 | +# {
| 4871 | +# $requests = $requests_per_connected_persons {$code} ;
| 4872 | +# if ($requests > $max_requests_per_connected_us)
| 4873 | +# { $requests = $max_requests_per_connected_us ; }
| 4874 | +
| 4875 | +# $ratio = sqrt ($requests / $max_requests_per_connected_us) ;
| 4876 | +# if ($ratio >= 0.20)
| 4877 | +# {
| 4878 | +# $green = 180 ;
| 4879 | +# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ;
| 4880 | +# $blue = int ($green / 3) ;
| 4881 | +# }
| 4882 | +# else
| 4883 | +# {
| 4884 | +# $red = 220 ;
| 4885 | +# $green = int (0.5 + 220 * 5 * $ratio) ;
| 4886 | +# $blue = 0 ; #int ($green / 2) ;
| 4887 | +# }
| 4888 | +# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ;
| 4889 | +# $fill = lc hsv2rgb($ratio*120,1,1) ;
| 4890 | +
| 4891 | +# $fills {$code} = $fill ;
| 4892 | +# }
| 4893 | +}
| 4894 | + |
| 4895 | +sub WriteWorldMapSvg
| 4896 | +{
| 4897 | +  # Writes svg/world_map_$period.svg from the blank template world_map_blank_plain2.svg, with countries
| 4898 | +  # coloured from the hash %fills, then runs svg/convert.exe to render a png next to it.
| 4899 | +  # In: $period (used in the file names), $description ('text1 - text2', put into the template's text placeholders)
| 4900 | +  # Returns early, after printing a message, when the template or the output file cannot be opened.
| 4901 | +  ($period, $description) = @_ ; # note: these and all work variables below are not lexical
| 4902 | +
| 4903 | +  if (! open (SVG_IN, '<', "world_map_blank_plain2.svg")) { print "Could not open world_map_blank_plain2.svg: $!\n" ; return ; }
| 4904 | +  @lines = <SVG_IN> ;
| 4905 | +  close SVG_IN ;
| 4906 | +
| 4907 | +  ($text1,$text2) = split ' - ', $description ;
| 4908 | +  print "Animation description: $description -> $text1 | $text2\n" ;
| 4909 | +
| 4910 | +  $lines = join '', @lines ;
| 4911 | +  $lines =~ s/<circle[^>]*?>//gs ; # drop all circle elements from the template
| 4912 | +  $lines =~ s/Yyyy/$text2/ ;
| 4913 | +  $lines =~ s/Xxxx/$text1/ ;
| 4914 | +# $lines =~ s/Zzzz/Wikipedia views per internet user/ ;
| 4915 | +  $lines =~ s/Zzzz/$desc_animation/ ; # $desc_animation is not set in this sub
| 4916 | +  # pass 1: per <g> group, colour the group tag and the paths inside it
| 4917 | +  $linenum = 0 ;
| 4918 | +  @lines = split '<g', $lines ;
| 4919 | +  foreach $line (@lines)
| 4920 | +  {
| 4921 | +    @lines2 = split '<path', $line ;
| 4922 | +    # country code = first two characters of the id of the group header (or of the chunk itself when no id matches)
| 4923 | +    ($code = $lines2 [0]) =~ s/^.*?id=\"(\w+)\".*$/$1/s ;
| 4924 | +    $code = substr ($code,0,2) ;
| 4925 | +
| 4926 | +    if (defined $fills {$code})
| 4927 | +    {
| 4928 | +      $fill = $fills {$code} ;
| 4929 | +      $lines2 [0] =~ s/(id="$code[x-]?")(?:\s*\n\s*style="[^"]*")?/$1\n style="fill:$fill;fill-opacity:1;stroke:#000000;stroke-width:2.5"/s ;
| 4930 | +    }
| 4931 | +    $linenum = 0 ;
| 4932 | +    foreach $line2 (@lines2)
| 4933 | +    {
| 4934 | +      ($code = $line2) =~ s/^.*?id=\"(\w+)\".*$/$1/s ;
| 4935 | +      $code = substr ($code,0,2) ;
| 4936 | +
| 4937 | +      next if ! defined $fills {$code} ;
| 4938 | +      $fill = $fills {$code} ;
| 4939 | +
| 4940 | +      # $trace_svg = $false ;
| 4941 | +      # if (($code eq 'ne') && ($line2 =~ /id=\"$code/i))
| 4942 | +      # { $trace_svg = $true ; }
| 4943 | +      # print "A " . $line2 . "\n\n" if $trace_svg ;
| 4944 | +      # the first chunk with a known code is left alone (normally the group header handled above)
| 4945 | +      next if $linenum ++ == 0 ;
| 4946 | +      $line2 =~ s/style="[^"]*"/style="fill:$fill;fill-opacity:1;stroke:#000000;stroke-width:2.5"/s;
| 4947 | +
| 4948 | +      # print "B " . $line2 [0] . "\n\n" if $trace_svg ;
| 4949 | +    }
| 4950 | +    $line = join '<path', @lines2 ;
| 4951 | +  }
| 4952 | +  $lines = join '<g', @lines ;
| 4953 | +  # pass 2: paths still in default grey, only for codes present in %requests_per_persons
| 4954 | +  @lines = split '<path', $lines ;
| 4955 | +  foreach $line (@lines)
| 4956 | +  {
| 4957 | +    ($code = $line) =~ s/^.*?id=\"([\w-]+)\".*$/$1/s ;
| 4958 | +    next if ! defined $requests_per_persons {$code} ;
| 4959 | +    # print "A $line\n" ;
| 4960 | +    $fill = $fills {$code} ;
| 4961 | +    $line =~ s/(id="$code[x-]?")\s*\n\s*style="fill:#b9b9b9[^"]*"/$1\n style="fill:$fill;fill-opacity:1;stroke:#000000;stroke-width:2.5"/sg ;
| 4962 | +    # print "B $line\n" ;
| 4963 | +  }
| 4964 | +  $lines = join '<path', @lines ;
| 4965 | +
| 4966 | +  # if (! defined $fills {$code}) { if ($code =~ /^.{2,3}$/) { print uc($code) . ",\"CODE NOT FOUND\"\n" ; } }
| 4967 | +  # whatever is still in the template's default grey becomes dark grey with a black outline
| 4968 | +  $lines =~ s/fill:#b9b9b9;stroke:#ffffff;stroke-width:[\d\.]*/fill:#606060;stroke:#000000;stroke-width:2.5/g ;
| 4969 | +
| 4970 | +  @lines = split ("\n", $lines) ;
| 4971 | +  if (! open (SVG_OUT, '>', "svg/world_map_$period.svg")) { print "Could not write svg/world_map_$period.svg: $!\n" ; return ; }
| 4972 | +  foreach $line (@lines)
| 4973 | +  {
| 4974 | +    chomp $line ;
| 4975 | +    print SVG_OUT "$line\n" ;
| 4976 | +  }
| 4977 | +  close SVG_OUT ;
| 4978 | +
| 4979 | +  print "Convert world_map_$period.svg to png\n" ;
| 4980 | +  `svg/convert.exe svg/world_map_$period.svg png:svg/world_map_$period.png` ; # runs through the shell, $period is interpolated as is
| 4981 | +# print "Convert world_map_$period.svg to jpg\n" ;
| 4982 | +# `svg/convert.exe svg/world_map_$period.svg jpg:svg/world_map_$period.jpg` ;
| 4983 | +# print "Convert world_map_$period.svg to gif\n" ;
| 4984 | +# `svg/convert.exe svg/world_map_$period.svg gif:svg/world_map_$period.gif` ;
| 4985 | +
| 4986 | +# exit ; # qqq
| 4987 | +# exit ;
| 4988 | +# sleep (2) ; # until computer fan fixed
| 4989 | +}
| 4990 | + |
| 4991 | +sub RatioAndFillColor
| 4992 | +{
| 4993 | +  # Translate a request count into a fill colour for country $code on the world map.
| 4994 | +  # In:  $code          country code, lower-cased before use as key in %fills
| 4995 | +  #      $requests      value to plot, capped at $requests_max
| 4996 | +  #      $requests_max  value that corresponds to ratio 1
| 4997 | +  #      $ratio_sqrt    when true the square root of the ratio is used
| 4998 | +  # Out: list (capped requests, ratio, lower-cased fill colour as returned by hsv2rgb)
| 4999 | +  # Side effect: stores the fill colour in $fills {lc $code}
| 5000 | +
| 5001 | +  my ($code, $requests,$requests_max, $ratio_sqrt) = @_ ;
| 5002 | +  my ($ratio,$fill) ;
| 5003 | +
| 5004 | +  if ($requests > $requests_max)
| 5005 | +  { $requests = $requests_max ; }
| 5006 | +
| 5007 | +  # a maximum of zero used to abort the script with 'Illegal division by zero',
| 5008 | +  # now such input simply gets ratio 0
| 5009 | +  $ratio = 0 ;
| 5010 | +  if ($requests_max != 0)
| 5011 | +  { $ratio = $requests / $requests_max ; }
| 5012 | +
| 5013 | +  if ($ratio_sqrt && ($ratio > 0))
| 5014 | +  { $ratio = sqrt ($ratio) ; }
| 5015 | +
| 5016 | +  # first argument runs from 0 (ratio 0) to 120 (ratio 1), the other two are fixed at 1 ;
| 5017 | +  # presumably hue in degrees at full saturation and brightness, so red to green - TODO confirm against hsv2rgb
| 5018 | +  # (earlier experiments: hand made RGB ramp, hsv2rgb($ratio*150,0.67+$ratio*0.33,0.8-0.2*$ratio))
| 5019 | +  $fill = lc hsv2rgb($ratio*120,1,1) ;
| 5020 | +
| 5021 | +  $fills {lc $code} = $fill ;
| 5022 | +  return ($requests,$ratio,$fill) ;
| 5023 | +}
| 5024 | + |
| 5025 | +sub RatioAndFillColor2
| 5026 | +{
| 5027 | +  # Two-sided variant of RatioAndFillColor: ratio 0.5 is the neutral midpoint ($value 0),
| 5028 | +  # lower ratios use first hsv2rgb argument 0 and higher ratios 120, stronger the further from the midpoint.
| 5029 | +  # $ratio_sqrt is accepted for call compatibility but not used (the scale is always linear).
| 5030 | +  # Returns (capped requests, ratio, fill colour) and stores the colour in $fills {lc $code}.
| 5031 | +  my ($code, $requests,$requests_max, $ratio_sqrt) = @_ ;
| 5032 | +  my ($ratio,$fill,$value) ;
| 5033 | +
| 5034 | +  if ($requests > $requests_max)
| 5035 | +  { $requests = $requests_max ; }
| 5036 | +
| 5037 | +  $ratio = 0 ; # a zero maximum used to die on the division below, now it yields ratio 0
| 5038 | +  if ($requests_max != 0)
| 5039 | +  { $ratio = $requests / $requests_max ; }
| 5040 | +
| 5041 | +  # each branch used to call hsv2rgb twice and keep only the second result ; the unused first call is gone
| 5042 | +  if ($ratio >= 0.5)
| 5043 | +  {
| 5044 | +    $value = $ratio * 2 - 1 ; # 0.5 - 1 -> 0 - 1
| 5045 | +    $fill = lc hsv2rgb(120,0+$value,0.5+$value/2) ;
| 5046 | +  }
| 5047 | +  else
| 5048 | +  {
| 5049 | +    $value = 1 - $ratio * 2 ; # 0 - 0.5 -> 1 - 0
| 5050 | +    $fill = lc hsv2rgb(0,0+$value,0.5+$value/2) ;
| 5051 | +  }
| 5052 | +  $fills {lc $code} = $fill ;
| 5053 | +  return ($requests,$ratio,$fill) ;
| 5054 | +}
| 5055 | + |
| 5056 | + |
| 5057 | +sub WriteReportPerCountryBreakdown |
| 5058 | +{ |
| 5059 | + print "\nWriteReportPerCountryBreakDown\n" ; |
| 5060 | + |
| 5061 | + my ($title,$views_edits,$links,$cutoff_requests, $cutoff_percentage, $show_logcount) = @_ ; |
| 5062 | + my ($link_country,$population,$icon,$bar,$bars,$bar_width,$perc,$perc_tot,$perc_global,$requests_tot) ; |
| 5063 | + my ($requests_this_language, $requests_all_languages, $requests_used, $requests_other) ; |
| 5064 | + my @index_countries ; |
| 5065 | + my $views_edits_lc = lc $views_edits ; |
| 5066 | + |
| 5067 | + if ($show_logcount) |
| 5068 | + { $report_version = "<p>This is the extended version of this report, with even small percentages included (> $cutoff_percentage\%) (see also bottom of page). " . |
| 5069 | + "Switch to <a href='$file_html_per_country_breakdown'>regular version</a>" ; } |
| 5070 | + else |
| 5071 | + { $report_version = "<p>This is the regular version of this report, with only major percentages (> $cutoff_percentage\%) included." . |
| 5072 | + " Switch to <a href='$file_html_per_country_breakdown_huge'>extended version</a>" ; } |
| 5073 | + |
| 5074 | + $html = $header ; |
| 5075 | + $html =~ s/TITLE/$title/ ; |
| 5076 | + $html =~ s/HEADER/$title/ ; |
| 5077 | + $html =~ s/LINKS// ; |
| 5078 | + $html =~ s/ALSO/$links/ ; |
| 5079 | + $html =~ s/NOTES// ; |
| 5080 | + $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ; |
| 5081 | + $html =~ s/DATE// ; |
| 5082 | + |
| 5083 | + $html .= "<p><table border=1 width=800>INDEX\n" ; |
| 5084 | + |
| 5085 | + $html .= &HtmlWorldMaps ; |
| 5086 | + |
| 5087 | + my $anomaly_found ; |
| 5088 | + |
| 5089 | + foreach $country (keys_sorted_by_value_num_desc %requests_recently_per_country) |
| 5090 | + { |
| 5091 | + next if $requests_recently_per_country {$country} < $cutoff_requests ; |
| 5092 | + |
| 5093 | + %requests_per_language = %{$requests_recently_per_country_per_language {$country}} ; |
| 5094 | + @languages = keys_sorted_by_value_num_desc %requests_per_language ; |
| 5095 | + |
| 5096 | + $requests_this_country = $requests_recently_per_country {$country} ; |
| 5097 | + |
| 5098 | + $perc = 'n.a.' ; |
| 5099 | + if ($requests_recently_all > 0) |
| 5100 | + { $perc = &Percentage ($requests_this_country / $requests_recently_all) ; } |
| 5101 | + |
| 5102 | + ($link_country,$icon,$population) = &CountryMetaInfo ($country) ; |
| 5103 | + |
| 5104 | + $html .= "<tr><th colspan=99 class=lh3><a id='$country' name='$country'></a><br>$icon $link_country <small>($perc share of global total)</small></th></tr>\n" ; |
| 5105 | + |
| 5106 | + $perc_tot = 0; |
| 5107 | + $requests_used = 0 ; |
| 5108 | + for ($l = 0 ; $l < 50 ; $l++) |
| 5109 | + { |
| 5110 | + $requests_this_language = $requests_recently_per_country_per_language {$country} {$languages [$l]} ; |
| 5111 | + $requests_all_languages = $requests_recently_per_country {$country} ; |
| 5112 | + |
| 5113 | + last if $requests_this_language == 0 ; |
| 5114 | + |
| 5115 | + $requests_used += $requests_this_language ; |
| 5116 | + |
| 5117 | + $perc = 0 ; |
| 5118 | + if ($requests_recently_all > 0) |
| 5119 | + { |
| 5120 | + $perc = &Percentage ($requests_this_language / $requests_all_languages) ; |
| 5121 | + |
| 5122 | + last if $perc < $cutoff_percentage ; |
| 5123 | + |
| 5124 | + $perc_tot += $perc ; |
| 5125 | + } |
| 5126 | + |
| 5127 | + $language = $languages [$l] ; |
| 5128 | + if ($out_languages {$language} ne "") |
| 5129 | + { $language = $out_languages {$language} ; } |
| 5130 | + if (length ($language) > 20) |
| 5131 | + { $language =~ s/ .*$// ; } |
| 5132 | + $bar_width = int ($perc * 6) ; |
| 5133 | + |
| 5134 | + if (($country eq "Australia") && ($language eq "Japanese") && ($perc > 5)) |
| 5135 | + { $language .= " <b><a href='#anomaly' onclick='alert(\"Probably incorrectly assigned to this country.\\nOutdated Regional Internet Registry (RIR) administration may have caused this.\")';><font color='#FF0000'>(*)</font></a></b>" ; $anomaly_found = $true ;} |
| 5136 | + |
| 5137 | + $bar_100 = "" ; |
| 5138 | + if ($bars++ == 0) |
| 5139 | + { |
| 5140 | + $bar_width_100 = 600 - $bar_width ; |
| 5141 | + $bar_100 = "<img src='background.gif' width=$bar_width_100 height=15>" ; |
| 5142 | + } |
| 5143 | + |
| 5144 | + if ($language !~ /Portal/) |
| 5145 | + { $language .= " Wp" ; } |
| 5146 | + |
| 5147 | + $perc =~ s/(\.\d)0/$1/ ; # 0.10% -> 0.1% |
| 5148 | + if ($show_logcount && ($requests_this_language < 5 * $months_recently)) # show in grey to discuss threshold on foundation-l |
| 5149 | + { $perc = "<font color=#800000>$perc</font" ; } |
| 5150 | + |
| 5151 | + $html .= "<tr><th class=l class=small nowrap>$language</th>" . |
| 5152 | + ($show_logcount ? "<td class=r>$requests_this_language</td>" : "") . |
| 5153 | + "<td class=c>$perc</td>" . |
| 5154 | + "<td class=l><img src='yellowbar_hor.gif' width=$bar_width height=15>$bar_100</td></tr>\n" ; |
| 5155 | + } |
| 5156 | + |
| 5157 | + if ($perc_tot > 100) { $perc_tot = 100 ; } |
| 5158 | + $requests_other = $requests_all_languages - $requests_used ; |
| 5159 | + $perc_other = sprintf '%.1f', 100 - $perc_tot ; |
| 5160 | + if (($requests_other > 0) && ($perc_other > 0)) |
| 5161 | + { |
| 5162 | + $bar_width = $perc_other * 6 ; |
| 5163 | + $html .= "<tr><th class=l class=small nowrap>Other</th>" . |
| 5164 | + ($show_logcount ? "<td class=r>$requests_other</td>" : "") . |
| 5165 | + "<td class=c>$perc_other%</td>" . |
| 5166 | + "<td class=l><img src='yellowbar_hor.gif' width=$bar_width height=15></td></tr>\n" ; |
| 5167 | + } |
| 5168 | + |
| 5169 | + if ($verbose) |
| 5170 | + { push @index_countries, "<a href='#$country'>$country ($perc)</a> " ; } |
| 5171 | + else |
| 5172 | + { push @index_countries, "<a href='#$country'>$country</a> " ; } |
| 5173 | + |
| 5174 | + # print "\n" ; |
| 5175 | + # $html .= "<tr><td colspan=99> </td></tr>\n" ; |
| 5176 | + } |
| 5177 | + $html .= "</table>" ; |
| 5178 | + $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" . |
| 5179 | + "<br> Further percentages show per country share of $views_edits_lc per Wikipedia visited" ; |
| 5180 | + $html .= "<p><b>Countries</b> are only included if the number of requests in the period exceeds $cutoff_requests,000 ($cutoff_requests matching records in 1:1000 sampled log)" ; |
| 5181 | + $html .= "<p><b>Wikipedia's</b> are only listed for some country if the share of visitors for that particular country exceeds $cutoff_percentage\%." ; |
| 5182 | + if ($show_logcount) |
| 5183 | + { |
| 5184 | + $html .= "<p>The second column displays the actual <b>numbers of records</b> found in the 1:1000 sampled log on which the percentage is based." . |
| 5185 | + "<br>Multiply by 1000 for actual $views_edits_lc over the whole period of $months_recently months." ; |
| 5186 | + $html .= "<br>If the number of records in the sampled log does not reach the (arbitrary) number of 5 per sampled month, the percentage is flagged dark red to extra emphasize high inaccuracy." ; |
| 5187 | + } |
| 5188 | + |
| 5189 | + $html .= "<p>Page requests by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ; |
| 5190 | + $html .= "<br> A few false negatives are taken for granted. " . |
| 5191 | + "Country meta data collected from <a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>English Wikipedia</a>. " . |
| 5192 | + "Portal = <a href='http://www.wikipedia.org'>www.wikipedia.org</a>" ; |
| 5193 | +# if ($anomaly_found) |
| 5194 | +# { $html .= "<p><a id='anomaly' name='anomaly'>Probably anomaly caused by outdated <a href='http://en.wikipedia.org/wiki/Regional_Internet_Registry'>Regional Internet Registry</a> administration.\n" ; } |
| 5195 | + |
| 5196 | + $html .= $colophon ; |
| 5197 | + |
| 5198 | + $index = &HtmlIndex (join '/ ', sort (@index_countries)) ; |
| 5199 | + $html =~ s/INDEX/$index/ ; |
| 5200 | + |
| 5201 | + if (! $show_logcount) |
| 5202 | + { &PrintHtml ($html, "$path_out/$file_html_per_country_breakdown") ; } |
| 5203 | + else |
| 5204 | + { &PrintHtml ($html, "$path_out/$file_html_per_country_breakdown_huge") ; } |
| 5205 | +} |
| 5206 | + |
# Builds the per-country trends report: for every country with enough requests,
# one table section with a row per quarter showing the country's share of the
# global total and the share of its ten most requested languages.
#
# Arguments:
#   $title       - substituted for the TITLE and HEADER placeholders in $header
#   $views_edits - report flavour; 'Page Edits' changes the rowspan and skips 2009 Q3
#   $links       - substituted for the ALSO placeholder in $header
#
# Reads the globals $header, $colophon, @quarters, %out_languages, $verbose,
# $requests_start, $requests_stop, %requests_per_country,
# %requests_per_country_per_language, %requests_per_quarter,
# %requests_per_quarter_per_country and
# %requests_per_quarter_per_country_per_language.
# The finished page is handed to &PrintHtml for "$path_out/$file_html_per_country_trends".
sub WriteReportPerCountryTrends
{
  print "\nWriteReportPerCountryTrends\n" ;

  my ($title,$views_edits,$links) = @_ ;
  my ($link_country,$population,$icon,$perc,$perc_tot) ;
  # Fix: the index list must be private to this call. The original declared an
  # unused '@index_languages' and pushed to the package global @index_countries,
  # so index links accumulated over successive reports.
  my @index_countries ;
  my $views_edits_lc = lc $views_edits ;

  # fill in the placeholders of the shared page header
  $html = $header ;
  $html =~ s/TITLE/$title/ ;
  $html =~ s/HEADER/$title/ ;
  $html =~ s/LINKS// ;
  $html =~ s/ALSO/$links/ ;
  $html =~ s/NOTES// ;
  $html =~ s/X1000/. Period <b>$requests_start - $requests_stop<\/b>/ ;
  $html =~ s/DATE// ;

  # INDEX is replaced by the country index once all countries are known (see end of sub)
  $html .= "<p><table border=1 width=800>INDEX\n" ;

  $html .= &HtmlWorldMaps ;

  foreach $country (keys_sorted_by_value_num_desc %requests_per_country)
  {
    # threshold: 50 sampled records per quarter in the period
    next if $requests_per_country {$country} < 50 * ($#quarters + 1) ;

    %requests_per_language = %{$requests_per_country_per_language {$country}} ;
    @languages = keys_sorted_by_value_num_desc %requests_per_language ;

    ($link_country,$icon,$population) = &CountryMetaInfo ($country) ;

    $html .= "<tr><th colspan=99 class=lh3><a id='$country' name='$country'></a><br>$icon $link_country</th></tr>\n" ;

    # the spacer column spans the header row, the quarter rows and the bar row;
    # 'Page Edits' has one quarter row less (2009 Q3 is skipped below)
    if ($views_edits eq 'Page Edits')
    { $rowspan = $#quarters+2 ; }
    else
    { $rowspan = $#quarters+3 ; }

    # NOTE(review): the literal [ ... ] around the Total cells looks like a marker
    # for later processing - TODO confirm where it is consumed
    $html .= "<tr><th class=small>Quarter</th>[<th class=small>Total</th>]<th class=small>Share</th><th rowspan=$rowspan> </th>\n" ;
    # header cells for the ten most requested languages of this country
    for ($l = 0 ; $l < 10 ; $l++)
    {
      $language = $languages [$l] ;
      if ($out_languages {$language} ne "")
      { $language = $out_languages {$language} ; }
      if (length ($language) > 20)
      { $language =~ s/ .*$// ; } # long names: keep first word only
      $html .= "<th class=c class=small>$language</th>\n" ;
      # print " [$language] " ;
    }
    $html .= "<th>other</th>\n" ;
    $html .= "</tr>\n" ;
    # print "\n" ;

    $lines = 0 ;
    foreach $quarter (reverse @quarters)
    {
      next if $views_edits eq 'Page Edits' and $quarter =~ /2009.*?Q3/ ; # strange results, to be researched

      # line1 = row with figures, line2 = row with bars
      $line1 = "<tr>\n" ;
      $line2 = "<tr>\n" ;

      my $requests_this_country = $requests_per_quarter_per_country {$quarter} {$country} ;
      my $requests_all_countries = $requests_per_quarter {$quarter} ;

      $perc = 'n.a.' ;
      if ($requests_all_countries > 0)
      {
        $perc = &Percentage ($requests_this_country / $requests_all_countries) ;
        # print "$quarter: " . sprintf ("%9d", $requests_this_country) . " = $perc\% $country\n" ;
        $line1 .= "<th class=c nowrap> $quarter </th>[<td align=right>$requests_this_country</td>]<td align=center>$perc</td>" ;
        $line2 .= "<th nowrap> $quarter </th>[<td align=right>$requests_this_country</td>]<td align=center>$perc</td>" ;
      }

      $perc_tot = 0;
      for ($l = 0 ; $l < 10 ; $l++)
      {
        my $requests_this_language = $requests_per_quarter_per_country_per_language {$quarter} {$country} {$languages [$l]} ;
        my $requests_all_languages = $requests_per_quarter_per_country {$quarter} {$country} ;
        $perc = 0 ;
        if ($requests_all_languages > 0)
        {
          $perc = &Percentage ($requests_this_language / $requests_all_languages) ;
          $perc_tot += $perc ;
        }
        # print "[" . sprintf ("%9d", $requests_this_language) . " = $perc\%]" ;
        if ($perc != 0)
        { $line2 .= "<td class=c><img src='yellowbar_hor.gif' width=$perc height=15></td>" ; }
        else
        { $line2 .= "<td class=l> </td>" ; }

        # flag suspicious mid-range shares for Australia with a red (*) link
        if (($country eq "Australia") && (($perc < 50) && ($perc > 5)))
        { $perc .= " <b><a href='#anomaly' onclick='alert(\"Probably incorrectly assigned to this country.\\nOutdated Regional Internet Registry (RIR) administration may have caused this.\")';><font color='#FF0000'>(*)</font></a></b>" ; $anomaly_found = $true ;}
        $line1 .= "<td class=c>[$requests_this_language]$perc</td>" ;
      }
      # whatever is not covered by the ten listed languages goes to 'other'
      if ($perc_tot > 100) { $perc_tot = 100 ; }
      $perc_other = sprintf '%.1f', 100 - $perc_tot ;
      $line1 .= "<td class=c>$perc_other%</td>" ;

      $line1 .= "</tr>\n" ;
      $line2 .= "</tr>\n" ;
      $html .= $line1 ;
      # NOTE(review): $lines only counts rows actually written, so when a quarter
      # is skipped above this test may never succeed - TODO confirm intent
      if ($lines++ == $#quarters)
      { $html .= $line2 ; } # only for last quarter
    }

    if ($verbose)
    { push @index_countries, "<a href='#$country'>$country ($perc)</a> " ; }
    else
    { push @index_countries, "<a href='#$country'>$country</a> " ; }

    # print "\n" ;
    # $html .= "<tr><td colspan=99> </td></tr>\n" ;
  }
  $html .= "</table>" ;
  $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
           "<br> Further percentages show per country per quarter share of $views_edits_lc per Wikipedia visited" ;
  # NOTE(review): the threshold quoted below (100 records) differs from the one
  # applied in the loop above (50 per quarter) - TODO confirm which is meant
  $html .= "<p>Countries are only included if the number of requests in the period exceeds 100,000 (100 matching records in 1:1000 sampled log)" ;
  $html .= "<br>Page requests by bots are not included. Also all ip addresses that occur more than once on a given day are discarded for that day." ;
  $html .= "<br> A few false negatives are taken for granted. " .
           "Country meta data collected from <a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>English Wikipedia</a>. " .
           "Portal = <a href='http://www.wikipedia.org'>www.wikipedia.org</a>" ;
  $html .= $colophon ;

  # now that all countries are known, replace the INDEX placeholder
  $index = &HtmlIndex (join '/ ', sort (@index_countries)) ;
  $html =~ s/INDEX/$index/ ;

  &PrintHtml ($html, "$path_out/$file_html_per_country_trends") ;
}
| 5335 | + |
# Scales a count for a period in which days are missing from the log.
#
# Arguments:
#   $period - key into %missing_days and %correct_for_missing_days
#   $count  - count to correct
#   $code   - code the count belongs to; a trace line is printed when it matches /us/i
#   $var    - label used in the trace line only
#
# Returns $count unchanged when %missing_days has no positive entry for $period,
# otherwise $count times the correction factor, rounded to the nearest integer.
sub CorrectForMissingDays
{
  my ($period, $count, $code, $var) = @_ ;

  # nothing missing for this period: leave the count alone
  return ($count) unless $missing_days {$period} > 0 ;

  my $count_prev = $count ;
  $count = int (0.5 + $count * $correct_for_missing_days {$period}) ;

  # show one sample correction, for codes matching 'us'
  print "\nperiod $period: correct for ${missing_days {$period}} missing days = * ${correct_for_missing_days {$period}}, " .
        " e.g. for $code: $var $count_prev -> $count\n\n"
    if $code =~ /us/i ;

  return ($count) ;
}
| 5350 | + |
# Formats a count for display.
#
# An empty count yields a blank cell value, anything below 1 is shown as "1",
# and whole numbers of 4 to 12 digits get comma thousands separators.
# All other values are returned as they came in.
sub FormatCount
{
  my ($count) = @_ ;

  return (" ") if $count eq "" ;
  return ("1") if $count < 1 ;

  # leading group of 1-3 digits followed by one to three groups of 3 digits
  $count =~ s/^(\d{1,3})((?:\d\d\d){1,3})$/join (',', $1, unpack ('(A3)*', $2))/e ;

  return ($count) ;
}
| 5363 | + |
# Returns a numeric sort key for a mime type, based on its entry in %mimetypes.
# "text/html" gets 2000000000 added, png/jpeg/gif images get 1000000000 added,
# every other mime type is returned with its plain %mimetypes value.
sub SortMime
{
  my ($mime) = @_ ;

  # 2 = html, 1 = common image formats, 0 = everything else
  my $rank = 0 ;
  if ($mime eq "text/html")
  { $rank = 2 ; }
  elsif ($mime =~ /image\/(?:png|jpeg|gif)/)
  { $rank = 1 ; }

  return ($mimetypes {$mime}) if $rank == 0 ;
  return ($rank * 1000000000 + $mimetypes {$mime}) ;
}
| 5374 | + |
# Expands the abbreviated project codes in a text back to (more or less) the
# full names, e.g. 'wp:' -> 'wikipedia:'. Returns the expanded text.
sub ExpandAbbreviation
{
  my ($text) = @_ ;

  # [pattern, replacement] pairs; each is applied at most once, in this order
  my @expansions =
  (
    [qr/^[\@\*]/,    ''],
    [qr/^xx:upload/, 'upload: '],
    [qr/^wb:/,       'wikibooks:'],
    [qr/^wk:/,       'wiktionary:'],
    [qr/^wn:/,       'wikinews:'],
    [qr/^wp:/,       'wikipedia:'],
    [qr/^wq:/,       'wikiquote:'],
    [qr/^ws:/,       'wikisource:'],
    [qr/^wv:/,       'wikiversity:'],
    [qr/^wx:/,       'wikispecial:'],
    [qr/^mw:/,       'wikispecial:'], # eg bugzilla
    [qr/:!mw/,       ':mediawiki'],
    [qr/^wm:/,       'wikimedia:'],
    [qr/:wm$/,       ':wikimedia'],
    [qr/^wmf:/,      'foundation:'],
    [qr/:www$/,      ':portal'],
#   $text =~ s/^wikispecial:(.*)$/$1: /;
  ) ;

  foreach my $expansion (@expansions)
  {
    my ($pattern, $replacement) = @$expansion ;
    $text =~ s/$pattern/$replacement/ ;
  }

  return ($text) ;
}
| 5398 | + |
# Reduces a url or host name to its secondary domain,
# e.g. 'http://en.wikipedia.org/wiki/X' -> 'wikipedia', 'www.bbc.co.uk' -> 'bbc'.
#
# Argument: url or host name, with or without 'http://' / 'https://' and path.
# Returns : the label directly left of the top level domain;
#           a host without any dot is returned as is.
sub GetSecondaryDomain
{
  # top level part of a host name: generic tlds, two character country codes,
  # and country codes preceded by co/com/ne (left a package global, as before)
  $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ;

  my $domain = shift ;
  # Fix: also strip 'https://'. Before, only 'http://' was removed, so an https
  # url lost everything after its first '/' and came back as 'https:'.
  $domain =~ s/https?:\/\/// ;
  $domain =~ s/\/.*$// ; # drop path

  if ($domain !~ /\./)
  { return ($domain) ; }

  $domain =~ s/$pattern_url_post// ;      # drop top level domain
  $domain =~ s/^.*?\.([^\.]+)$/$1/ ;      # keep last remaining label
  return ($domain) ;
}
| 5414 | + |
# Opens the log file "$dir_reports/$file_log" for appending on handle FILE_LOG
# and writes the report banner to it via &Log.
# Calls abort when the file can not be opened.
sub OpenLog
{
# only shrink log when same log file is appended daily, is no longer the case
# $fileage = -M "$dir_reports/$file_log" ;
# if ($fileage > 5)
# {
#   open "FILE_LOG", "<", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
#   @log = <FILE_LOG> ;
#   close "FILE_LOG" ;
#   $lines = 0 ;
#   open "FILE_LOG", ">", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
#   foreach $line (@log)
#   {
#     if (++$lines >= $#log - 5000)
#     { print FILE_LOG $line ; }
#   }
#   close "FILE_LOG" ;
# }
# open "FILE_LOG", ">>", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;

  # Fix: use low precedence 'or'. With '||' the test applied to the (always true)
  # file name string instead of to the result of open, so abort was never reached.
  open (FILE_LOG, ">>", "$dir_reports/$file_log") or abort ("Log file '$file_log' could not be opened.") ;
  &Log ("\n\n===== Wikimedia Sampled Visitors Log Report / " . date_time_english (time) . " =====\n\n") ;
}
| 5437 | + |
# Scales a count by the global $multiplier and returns it rounded to a whole
# number (as formatted by sprintf "%.0f").
sub Normalize
{
  my ($count) = @_ ;
  my $scaled = $count * $multiplier ;
# values below 1 are not raised to 1 here -> that is done at FormatCount
  return (sprintf ("%.0f", $scaled)) ;
}
| 5445 | + |
# Writes a message to the currently selected output handle and to FILE_LOG.
# The message is kept in the package global $msg, as before.
sub Log
{
  ($msg) = @_ ;
  print $msg ;            # console
  print FILE_LOG $msg ;   # log file
}
| 5452 | + |
| 5453 | +sub InitProjectNames |
| 5454 | +{ |
| 5455 | + # copied from WikiReports.pl |
| 5456 | + |
| 5457 | + %wikipedias = ( |
| 5458 | +# mediawiki=>"http://wikimediafoundation.org Wikimedia", |
| 5459 | + nostalgia=>"http://nostalgia.wikipedia.org Nostalgia", |
| 5460 | + sources=>"http://wikisource.org Old Wikisource", |
| 5461 | + meta=>"http://meta.wikimedia.org Meta-Wiki", |
| 5462 | + beta=>"http://beta.wikiversity.org Beta", |
| 5463 | + species=>"http://species.wikipedia.org WikiSpecies", |
| 5464 | + commons=>"http://commons.wikimedia.org Commons", |
| 5465 | + foundation=>"http://wikimediafoundation.org Wikimedia Foundation", |
| 5466 | + sep11=>"http://sep11.wikipedia.org In Memoriam", |
| 5467 | + nlwikimedia=>"http://nl.wikimedia.org Wikimedia Nederland", |
| 5468 | + plwikimedia=>"http://pl.wikimedia.org Wikimedia Polska", |
| 5469 | + mediawiki=>"http://www.mediawiki.org MediaWiki", |
| 5470 | + dewikiversity=>"http://de.wikiversity.org Wikiversität", |
| 5471 | + frwikiversity=>"http://fr.wikiversity.org Wikiversität", |
| 5472 | + wikimania2005=>"http://wikimania2005.wikimedia.org Wikimania 2005", |
| 5473 | + wikimania2006=>"http://wikimania2006.wikimedia.org Wikimania 2006", |
| 5474 | + aa=>"http://aa.wikipedia.org Afar", |
| 5475 | + ab=>"http://ab.wikipedia.org Abkhazian", |
| 5476 | + ace=>"http://ace.wikipedia.org Acehnese", |
| 5477 | + af=>"http://af.wikipedia.org Afrikaans", |
| 5478 | + ak=>"http://ak.wikipedia.org Akan", # was Akana |
| 5479 | + als=>"http://als.wikipedia.org Alemannic", # was Elsatian |
| 5480 | + am=>"http://am.wikipedia.org Amharic", |
| 5481 | + an=>"http://an.wikipedia.org Aragonese", |
| 5482 | + ang=>"http://ang.wikipedia.org Anglo-Saxon", |
| 5483 | + ar=>"http://ar.wikipedia.org Arabic", |
| 5484 | + arc=>"http://arc.wikipedia.org Aramaic", |
| 5485 | + arz=>"http://arz.wikipedia.org Egyptian Arabic", |
| 5486 | + as=>"http://as.wikipedia.org Assamese", |
| 5487 | + ast=>"http://ast.wikipedia.org Asturian", |
| 5488 | + av=>"http://av.wikipedia.org Avar", # was Avienan |
| 5489 | + ay=>"http://ay.wikipedia.org Aymara", |
| 5490 | + az=>"http://az.wikipedia.org Azeri", # was Azerbaijani |
| 5491 | + ba=>"http://ba.wikipedia.org Bashkir", |
| 5492 | + bar=>"http://bar.wikipedia.org Bavarian", |
| 5493 | + bat_smg=>"http://bat-smg.wikipedia.org Samogitian", |
| 5494 | + "bat-smg"=>"http://bat-smg.wikipedia.org Samogitian", |
| 5495 | + bcl=>"http://bcl.wikipedia.org Central Bicolano", |
| 5496 | + be=>"http://be.wikipedia.org Belarusian", |
| 5497 | + "be-x-old"=>"http://be.wikipedia.org Belarusian (Tarashkevitsa)", |
| 5498 | + be_x_old=>"http://be.wikipedia.org Belarusian (Tarashkevitsa)", |
| 5499 | + bg=>"http://bg.wikipedia.org Bulgarian", |
| 5500 | + bh=>"http://bh.wikipedia.org Bihari", |
| 5501 | + bi=>"http://bi.wikipedia.org Bislama", |
| 5502 | + bm=>"http://bm.wikipedia.org Bambara", |
| 5503 | + bn=>"http://bn.wikipedia.org Bengali", |
| 5504 | + bo=>"http://bo.wikipedia.org Tibetan", |
| 5505 | + bpy=>"http://bpy.wikipedia.org Bishnupriya Manipuri", |
| 5506 | + br=>"http://br.wikipedia.org Breton", |
| 5507 | + bs=>"http://bs.wikipedia.org Bosnian", |
| 5508 | + bug=>"http://bug.wikipedia.org Buginese", |
| 5509 | + bxr=>"http://bxr.wikipedia.org Buryat", |
| 5510 | + ca=>"http://ca.wikipedia.org Catalan", |
| 5511 | + cbk_zam=>"http://cbk-zam.wikipedia.org Chavacano", |
| 5512 | + "cbk-zam"=>"http://cbk-zam.wikipedia.org Chavacano", |
| 5513 | + cdo=>"http://cdo.wikipedia.org Min Dong", |
| 5514 | + ce=>"http://ce.wikipedia.org Chechen", |
| 5515 | + ceb=>"http://ceb.wikipedia.org Cebuano", |
| 5516 | + ch=>"http://ch.wikipedia.org Chamorro", # was Chamoru |
| 5517 | + ckb=>"http://ckb.wikipedia.org Sorani", |
| 5518 | + cho=>"http://cho.wikipedia.org Choctaw", # was Chotaw |
| 5519 | + chr=>"http://chr.wikipedia.org Cherokee", |
| 5520 | + chy=>"http://chy.wikipedia.org Cheyenne", # was Setsêhestâhese |
| 5521 | + co=>"http://co.wikipedia.org Corsican", |
| 5522 | + cr=>"http://cr.wikipedia.org Cree", |
| 5523 | + crh=>"http://crh.wikipedia.org Crimean Tatar", |
| 5524 | + cs=>"http://cs.wikipedia.org Czech", |
| 5525 | + csb=>"http://csb.wikipedia.org Cashubian", # was Kashubian |
| 5526 | + cu=>"http://cv.wikipedia.org Old Church Slavonic", |
| 5527 | + cv=>"http://cv.wikipedia.org Chuvash", # was Cavas |
| 5528 | + cy=>"http://cy.wikipedia.org Welsh", |
| 5529 | + da=>"http://da.wikipedia.org Danish", |
| 5530 | + de=>"http://de.wikipedia.org German", |
| 5531 | + diq=>"http://diq.wikipedia.org Zazaki", |
| 5532 | + dk=>"http://dk.wikipedia.org Danish", |
| 5533 | + dsb=>"http://dsb.wikipedia.org Lower Sorbian", |
| 5534 | + dv=>"http://dv.wikipedia.org Divehi", |
| 5535 | + dz=>"http://dz.wikipedia.org Dzongkha", |
| 5536 | + ee=>"http://ee.wikipedia.org Ewe", |
| 5537 | + el=>"http://el.wikipedia.org Greek", |
| 5538 | + eml=>"http://eml.wikipedia.org Emilian-Romagnol", |
| 5539 | + en=>"http://en.wikipedia.org English", |
| 5540 | + eo=>"http://eo.wikipedia.org Esperanto", |
| 5541 | + es=>"http://es.wikipedia.org Spanish", |
| 5542 | + et=>"http://et.wikipedia.org Estonian", |
| 5543 | + eu=>"http://eu.wikipedia.org Basque", |
| 5544 | + ext=>"http://ext.wikipedia.org Extremaduran", |
| 5545 | + fa=>"http://fa.wikipedia.org Persian", |
| 5546 | + ff=>"http://ff.wikipedia.org Fulfulde", |
| 5547 | + fi=>"http://fi.wikipedia.org Finnish", |
| 5548 | + "fiu-vro"=>"http://fiu-vro.wikipedia.org Voro", |
| 5549 | + fiu_vro=>"http://fiu-vro.wikipedia.org Voro", |
| 5550 | + fj=>"http://fj.wikipedia.org Fijian", |
| 5551 | + fo=>"http://fo.wikipedia.org Faroese", # was Faeroese |
| 5552 | + fr=>"http://fr.wikipedia.org French", |
| 5553 | + frp=>"http://frp.wikipedia.org Arpitan", |
| 5554 | + fur=>"http://fur.wikipedia.org Friulian", |
| 5555 | + fy=>"http://fy.wikipedia.org Frisian", |
| 5556 | + ga=>"http://ga.wikipedia.org Irish", |
| 5557 | + gan=>"http://gan.wikipedia.org Gan", |
| 5558 | + gay=>"http://gay.wikipedia.org Gayo", |
| 5559 | + gd=>"http://gd.wikipedia.org Scots Gaelic", # was Scottish Gaelic |
| 5560 | + gl=>"http://gl.wikipedia.org Galician", # was Galego |
| 5561 | + glk=>"http://glk.wikipedia.org Gilaki", |
| 5562 | + gn=>"http://gn.wikipedia.org Guarani", |
| 5563 | + got=>"http://got.wikipedia.org Gothic", |
| 5564 | + gu=>"http://gu.wikipedia.org Gujarati", |
| 5565 | + gv=>"http://gv.wikipedia.org Manx", # was Manx Gaelic |
| 5566 | + ha=>"http://ha.wikipedia.org Hausa", |
| 5567 | + hak=>"http://hak.wikipedia.org Hakka", |
| 5568 | + haw=>"http://haw.wikipedia.org Hawai'ian", # was Hawaiian |
| 5569 | + he=>"http://he.wikipedia.org Hebrew", |
| 5570 | + hi=>"http://hi.wikipedia.org Hindi", |
| 5571 | + hif=>"http://hif.wikipedia.org Fiji Hindi", |
| 5572 | + ho=>"http://ho.wikipedia.org Hiri Motu", |
| 5573 | + hr=>"http://hr.wikipedia.org Croatian", |
| 5574 | + hsb=>"http://hsb.wikipedia.org Upper Sorbian", |
| 5575 | + ht=>"http://ht.wikipedia.org Haitian", |
| 5576 | + hu=>"http://hu.wikipedia.org Hungarian", |
| 5577 | + hy=>"http://hy.wikipedia.org Armenian", |
| 5578 | + hz=>"http://hz.wikipedia.org Herero", |
| 5579 | + ia=>"http://ia.wikipedia.org Interlingua", |
| 5580 | + iba=>"http://iba.wikipedia.org Iban", |
| 5581 | + id=>"http://id.wikipedia.org Indonesian", |
| 5582 | + ie=>"http://ie.wikipedia.org Interlingue", |
| 5583 | + ig=>"http://ig.wikipedia.org Igbo", |
| 5584 | + ii=>"http://ii.wikipedia.org Yi", |
| 5585 | + ik=>"http://ik.wikipedia.org Inupiak", |
| 5586 | + ilo=>"http://ilo.wikipedia.org Ilokano", |
| 5587 | + io=>"http://io.wikipedia.org Ido", |
| 5588 | + is=>"http://is.wikipedia.org Icelandic", |
| 5589 | + it=>"http://it.wikipedia.org Italian", |
| 5590 | + iu=>"http://iu.wikipedia.org Inuktitut", |
| 5591 | + ja=>"http://ja.wikipedia.org Japanese", |
| 5592 | + jbo=>"http://jbo.wikipedia.org Lojban", |
| 5593 | + jv=>"http://jv.wikipedia.org Javanese", |
| 5594 | + ka=>"http://ka.wikipedia.org Georgian", |
| 5595 | + kaa=>"http://kaa.wikipedia.org Karakalpak", |
| 5596 | + kab=>"http://ka.wikipedia.org Kabyle", |
| 5597 | + kaw=>"http://kaw.wikipedia.org Kawi", |
| 5598 | + kg=>"http://kg.wikipedia.org Kongo", |
| 5599 | + ki=>"http://ki.wikipedia.org Kikuyu", |
| 5600 | + kj=>"http://kj.wikipedia.org Kuanyama", # was Otjiwambo |
| 5601 | + kk=>"http://kk.wikipedia.org Kazakh", |
| 5602 | + kl=>"http://kl.wikipedia.org Greenlandic", |
| 5603 | + km=>"http://km.wikipedia.org Khmer", # was Cambodian |
| 5604 | + kn=>"http://kn.wikipedia.org Kannada", |
| 5605 | + ko=>"http://ko.wikipedia.org Korean", |
| 5606 | + kr=>"http://kr.wikipedia.org Kanuri", |
| 5607 | + ks=>"http://ks.wikipedia.org Kashmiri", |
| 5608 | + ksh=>"http://ksh.wikipedia.org Ripuarian", |
| 5609 | + ku=>"http://ku.wikipedia.org Kurdish", |
| 5610 | + kv=>"http://kv.wikipedia.org Komi", |
| 5611 | + kw=>"http://kw.wikipedia.org Cornish", # was Kornish |
| 5612 | + ky=>"http://ky.wikipedia.org Kirghiz", |
| 5613 | + la=>"http://la.wikipedia.org Latin", |
| 5614 | + lad=>"http://lad.wikipedia.org Ladino", |
| 5615 | + lb=>"http://lb.wikipedia.org Luxembourgish", # was Letzeburgesch |
| 5616 | + lbe=>"http://lbe.wikipedia.org Lak", |
| 5617 | + lg=>"http://lg.wikipedia.org Ganda", |
| 5618 | + li=>"http://li.wikipedia.org Limburgish", |
| 5619 | + lij=>"http://lij.wikipedia.org Ligurian", |
| 5620 | + lmo=>"http://lmo.wikipedia.org Lombard", |
| 5621 | + ln=>"http://ln.wikipedia.org Lingala", |
| 5622 | + lo=>"http://lo.wikipedia.org Laotian", |
| 5623 | + ls=>"http://ls.wikipedia.org Latino Sine Flexione", |
| 5624 | + lt=>"http://lt.wikipedia.org Lithuanian", |
| 5625 | + lv=>"http://lv.wikipedia.org Latvian", |
| 5626 | + mad=>"http://mad.wikipedia.org Madurese", |
| 5627 | + mak=>"http://mak.wikipedia.org Makasar", |
| 5628 | + map_bms=>"http://map-bms.wikipedia.org Banyumasan", |
| 5629 | + "map-bms"=>"http://map-bms.wikipedia.org Banyumasan", |
| 5630 | + mdf=>"http://mdf.wikipedia.org Moksha", |
| 5631 | + mg=>"http://mg.wikipedia.org Malagasy", |
| 5632 | + mh=>"http://mh.wikipedia.org Marshallese", |
| 5633 | + mhr=>"http://mhr.wikipedia.org Eastern Mari", |
| 5634 | + mi=>"http://mi.wikipedia.org Maori", |
| 5635 | + min=>"http://min.wikipedia.org Minangkabau", |
| 5636 | + minnan=>"http://minnan.wikipedia.org Minnan", |
| 5637 | + mk=>"http://mk.wikipedia.org Macedonian", |
| 5638 | + ml=>"http://ml.wikipedia.org Malayalam", |
| 5639 | + mn=>"http://mn.wikipedia.org Mongolian", |
| 5640 | + mo=>"http://mo.wikipedia.org Moldavian", |
| 5641 | + mr=>"http://mr.wikipedia.org Marathi", |
| 5642 | + ms=>"http://ms.wikipedia.org Malay", |
| 5643 | + mt=>"http://mt.wikipedia.org Maltese", |
| 5644 | + mus=>"http://mus.wikipedia.org Muskogee", |
| 5645 | + mwl=>"http://mwl.wikipedia.org Mirandese", |
| 5646 | + my=>"http://my.wikipedia.org Burmese", |
| 5647 | + myv=>"http://myv.wikipedia.org Erzya", |
| 5648 | + mzn=>"http://mzn.wikipedia.org Mazandarani", |
| 5649 | + na=>"http://na.wikipedia.org Nauruan", # was Nauru |
| 5650 | + nah=>"http://nah.wikipedia.org Nahuatl", |
| 5651 | + nap=>"http://nap.wikipedia.org Neapolitan", |
| 5652 | + nds=>"http://nds.wikipedia.org Low Saxon", |
| 5653 | + nds_nl=>"http://nds-nl.wikipedia.org Dutch Low Saxon", |
| 5654 | + "nds-nl"=>"http://nds-nl.wikipedia.org Dutch Low Saxon", |
| 5655 | + ne=>"http://ne.wikipedia.org Nepali", |
| 5656 | + new=>"http://new.wikipedia.org Nepal Bhasa", |
| 5657 | + ng=>"http://ng.wikipedia.org Ndonga", |
| 5658 | + nl=>"http://nl.wikipedia.org Dutch", |
| 5659 | + nov=>"http://nov.wikipedia.org Novial", |
| 5660 | + nrm=>"http://nrm.wikipedia.org Norman", |
| 5661 | + nn=>"http://nn.wikipedia.org Nynorsk", # was Neo-Norwegian |
| 5662 | + no=>"http://no.wikipedia.org Norwegian", |
| 5663 | + nv=>"http://nv.wikipedia.org Navajo", # was Avayo |
| 5664 | + ny=>"http://ny.wikipedia.org Chichewa", |
| 5665 | + oc=>"http://oc.wikipedia.org Occitan", |
| 5666 | + om=>"http://om.wikipedia.org Oromo", |
| 5667 | + or=>"http://or.wikipedia.org Oriya", |
| 5668 | + os=>"http://os.wikipedia.org Ossetic", |
| 5669 | + pa=>"http://pa.wikipedia.org Punjabi", |
| 5670 | + pag=>"http://pag.wikipedia.org Pangasinan", |
| 5671 | + pam=>"http://pam.wikipedia.org Kapampangan", |
| 5672 | + pap=>"http://pap.wikipedia.org Papiamentu", |
| 5673 | + pdc=>"http://pdc.wikipedia.org Pennsylvania German", |
| 5674 | + pi=>"http://pi.wikipedia.org Pali", |
| 5675 | + pih=>"http://pih.wikipedia.org Norfolk", |
| 5676 | + pl=>"http://pl.wikipedia.org Polish", |
| 5677 | + pms=>"http://pms.wikipedia.org Piedmontese", |
| 5678 | + pnb=>"http://pnb.wikipedia.org Western Panjabi", |
| 5679 | + pnt=>"http://pnt.wikipedia.org Pontic", |
| 5680 | + ps=>"http://ps.wikipedia.org Pashto", |
| 5681 | + pt=>"http://pt.wikipedia.org Portuguese", |
| 5682 | + qu=>"http://qu.wikipedia.org Quechua", |
| 5683 | + rm=>"http://rm.wikipedia.org Romansh", # was Rhaeto-Romance |
| 5684 | + rmy=>"http://rmy.wikipedia.org Romani", |
| 5685 | + rn=>"http://rn.wikipedia.org Kirundi", |
| 5686 | + ro=>"http://ro.wikipedia.org Romanian", |
| 5687 | + roa_rup=>"http://roa-rup.wikipedia.org Aromanian", |
| 5688 | + "roa-rup"=>"http://roa-rup.wikipedia.org Aromanian", |
| 5689 | + roa_tara=>"http://roa-tara.wikipedia.org Tarantino", |
| 5690 | + "roa-tara"=>"http://roa-tara.wikipedia.org Tarantino", |
| 5691 | + ru=>"http://ru.wikipedia.org Russian", |
| 5692 | + ru_sib=>"http://ru-sib.wikipedia.org Siberian", |
| 5693 | + "ru-sib"=>"http://ru-sib.wikipedia.org Siberian", |
| 5694 | + rw=>"http://rw.wikipedia.org Kinyarwanda", |
| 5695 | + sa=>"http://sa.wikipedia.org Sanskrit", |
| 5696 | + sah=>"http://sah.wikipedia.org Sakha", |
| 5697 | + sc=>"http://sc.wikipedia.org Sardinian", |
| 5698 | + scn=>"http://scn.wikipedia.org Sicilian", |
| 5699 | + sco=>"http://sco.wikipedia.org Scots", |
| 5700 | + sd=>"http://sd.wikipedia.org Sindhi", |
| 5701 | + se=>"http://se.wikipedia.org Northern Sami", |
| 5702 | + sg=>"http://sg.wikipedia.org Sangro", |
| 5703 | + sh=>"http://sh.wikipedia.org Serbo-Croatian", |
| 5704 | + si=>"http://si.wikipedia.org Sinhala", # was Singhalese |
| 5705 | + simple=>"http://simple.wikipedia.org Simple English", |
| 5706 | + sk=>"http://sk.wikipedia.org Slovak", |
| 5707 | + sl=>"http://sl.wikipedia.org Slovene", |
| 5708 | + sm=>"http://sm.wikipedia.org Samoan", |
| 5709 | + sn=>"http://sn.wikipedia.org Shona", |
| 5710 | + so=>"http://so.wikipedia.org Somali", # was Somalian |
| 5711 | + sq=>"http://sq.wikipedia.org Albanian", |
| 5712 | + sr=>"http://sr.wikipedia.org Serbian", |
| 5713 | + srn=>"http://srn.wikipedia.org Sranan", |
| 5714 | + ss=>"http://ss.wikipedia.org Siswati", |
| 5715 | + st=>"http://st.wikipedia.org Sesotho", |
| 5716 | + stq=>"http://stq.wikipedia.org Saterland Frisian", |
| 5717 | + su=>"http://su.wikipedia.org Sundanese", |
| 5718 | + sv=>"http://sv.wikipedia.org Swedish", |
| 5719 | + sw=>"http://sw.wikipedia.org Swahili", |
| 5720 | + szl=>"http://szl.wikipedia.org Silesian", |
| 5721 | + ta=>"http://ta.wikipedia.org Tamil", |
| 5722 | + te=>"http://te.wikipedia.org Telugu", |
| 5723 | + test=>"http://test.wikipedia.org Test", |
| 5724 | + tet=>"http://tet.wikipedia.org Tetum", |
| 5725 | + tg=>"http://tg.wikipedia.org Tajik", |
| 5726 | + th=>"http://th.wikipedia.org Thai", |
| 5727 | + ti=>"http://ti.wikipedia.org Tigrinya", |
| 5728 | + tk=>"http://tk.wikipedia.org Turkmen", |
| 5729 | + tl=>"http://tl.wikipedia.org Tagalog", |
| 5730 | + tlh=>"http://tlh.wikipedia.org Klingon", # was Klignon |
| 5731 | + tn=>"http://tn.wikipedia.org Setswana", |
| 5732 | + to=>"http://to.wikipedia.org Tongan", |
| 5733 | + tokipona=>"http://tokipona.wikipedia.org Tokipona", |
| 5734 | + tpi=>"http://tpi.wikipedia.org Tok Pisin", |
| 5735 | + tr=>"http://tr.wikipedia.org Turkish", |
| 5736 | + ts=>"http://ts.wikipedia.org Tsonga", |
| 5737 | + tt=>"http://tt.wikipedia.org Tatar", |
| 5738 | + tum=>"http://tum.wikipedia.org Tumbuka", |
| 5739 | + turn=>"http://turn.wikipedia.org Turnbuka", |
| 5740 | + tw=>"http://tw.wikipedia.org Twi", |
| 5741 | + ty=>"http://ty.wikipedia.org Tahitian", |
| 5742 | + udm=>"http://udm.wikipedia.org Udmurt", |
| 5743 | + ug=>"http://ug.wikipedia.org Uighur", |
| 5744 | + uk=>"http://uk.wikipedia.org Ukrainian", |
| 5745 | + ur=>"http://ur.wikipedia.org Urdu", |
| 5746 | + uz=>"http://uz.wikipedia.org Uzbek", |
| 5747 | + ve=>"http://ve.wikipedia.org Venda", # was Lushaka |
| 5748 | + vec=>"http://vec.wikipedia.org Venetian", |
| 5749 | + vi=>"http://vi.wikipedia.org Vietnamese", |
| 5750 | + vls=>"http://vls.wikipedia.org West Flemish", |
| 5751 | + vo=>"http://vo.wikipedia.org Volapük", |
| 5752 | + wa=>"http://wa.wikipedia.org Walloon", |
| 5753 | + war=>"http://war.wikipedia.org Waray-Waray", |
| 5754 | + wo=>"http://wo.wikipedia.org Wolof", |
| 5755 | + wuu=>"http://wuu.wikipedia.org Wu", |
| 5756 | + xal=>"http://xal.wikipedia.org Kalmyk", |
| 5757 | + xh=>"http://xh.wikipedia.org Xhosa", |
| 5758 | + yi=>"http://yi.wikipedia.org Yiddish", |
| 5759 | + yo=>"http://yo.wikipedia.org Yoruba", |
| 5760 | + za=>"http://za.wikipedia.org Zhuang", |
| 5761 | + zea=>"http://zea.wikipedia.org Zealandic", |
| 5762 | + zh=>"http://zh.wikipedia.org Chinese", |
| 5763 | + zh_min_nan=>"http://zh-min-nan.wikipedia.org Min Nan", |
| 5764 | + "zh-min-nan"=>"http://zh-min-nan.wikipedia.org Min Nan", |
| 5765 | + zh_classical=>"http://zh-classical.wikipedia.org Classical Chinese", |
| 5766 | + "zh-classical"=>"http://zh-classical.wikipedia.org Classical Chinese", |
| 5767 | + zh_yue=>"http://zh-yue.wikipedia.org Cantonese", |
| 5768 | + "zh-yue"=>"http://zh-yue.wikipedia.org Cantonese", |
| 5769 | + zu=>"http://zu.wikipedia.org Zulu", |
| 5770 | + zz=>" All languages", |
| 5771 | + zzz=>" All languages except English" |
| 5772 | + ); |
| 5773 | + |
| 5774 | + foreach $key (keys %wikipedias) |
| 5775 | + { |
| 5776 | + my $wikipedia = $wikipedias {$key} ; |
| 5777 | + $out_urls {$key} = $wikipedia ; |
| 5778 | + $out_languages {$key} = $wikipedia ; |
| 5779 | + $out_urls {$key} =~ s/(^[^\s]+).*$/$1/ ; |
| 5780 | + $out_languages {$key} =~ s/^[^\s]+\s+(.*)$/$1/ ; |
| 5781 | + $out_article {$key} = "http://en.wikipedia.org/wiki/" . $out_languages {$key} . "_language" ; |
| 5782 | + $out_article {$key} =~ s/ /_/g ; |
| 5783 | + $out_urls {$key} =~ s/(^[^\s]+).*$/$1/ ; |
| 5784 | + } |
| 5785 | + $out_languages {"www"} = "Portal" ; |
| 5786 | +} |
| 5787 | + |
| 5788 | + |
# Format a fraction as a percentage string for use in a table cell.
#
# The fraction is multiplied by 100 and rendered with more decimals the
# smaller it is, so that tiny non-zero shares do not all collapse to 0.0%:
#   exactly 100     -> '100%'
#   exactly 0       -> ' '          (blank cell)
#   below 0.00001   -> '0.00001%'   (fixed text)
#   below 0.0001    -> 5 decimals
#   below 0.001     -> 4 decimals
#   below 0.01      -> 3 decimals
#   below 0.1       -> 2 decimals
#   anything else   -> 1 decimal
#
# param  $perc  fraction, e.g. 0.25 for 25%
# return formatted string
sub Percentage
{
  my $perc = shift ;
  $perc = 100 * $perc ;

  # All tests form one chain. The test for 100 used to be a separate 'if',
  # so its result '100%' fell into the chain below and was reformatted
  # by the final 'else' as '100.0%'.
  # A literal percent sign in a sprintf format is written as '%%'.
  if    ($perc == 100)     { $perc = '100%' ; }
  elsif ($perc == 0)       { $perc = ' ' ; }
  elsif ($perc <  0.00001) { $perc = '0.00001%' ; }
  elsif ($perc <  0.0001)  { $perc = sprintf ("%.5f%%", $perc) ; }
  elsif ($perc <  0.001)   { $perc = sprintf ("%.4f%%", $perc) ; }
  elsif ($perc <  0.01)    { $perc = sprintf ("%.3f%%", $perc) ; }
  elsif ($perc <  0.1)     { $perc = sprintf ("%.2f%%", $perc) ; }
  else                     { $perc = sprintf ("%.1f%%", $perc) ; }

  return ($perc) ;
}
| 5803 | + |
# Scrape per-country meta data from two English Wikipedia list pages and
# write it to "$path_out/SquidReportCountryMetaInfo.csv".
#
# Pass 1, List_of_countries_by_population:
#   for every table row that contains a flag icon a csv record
#     country,link,population,connected,icon
#   is stored in %countries, keyed by country name. 'connected' is a literal
#   placeholder; commas in the population figure are replaced by underscores.
# Pass 2, List_of_countries_by_number_of_Internet_users:
#   the placeholder is replaced by the figure found on this page, after a few
#   country names have been mapped to the spelling used in pass 1.
#
# Reads package variable $path_out and fills package hash %countries.
# Dies when a page cannot be fetched or the csv file cannot be written.
sub ReadWikipedia
{
  use LWP::Simple qw($ua get);

  $ua->agent('Wikipedia Wikicounts job');
  $ua->timeout(60);

  my $url  = 'http://en.wikipedia.org/wiki/List_of_countries_by_population';
  # 'get $url || die' applied '||' to $url, so a failed download went unnoticed
  my $html = get ($url) ;
  die "Timed out!" if ! defined $html ;

# for offline testing:
# open TEST, '<', 'List_of_countries_by_population.html' ;
# @lines = <TEST> ;
# $html = join "\n", @lines ;
# close TEST ;

  # put the whole page on one line (newline -> literal '\n'), then split it
  # in front of every <tr and remove everything behind </tr>
  $html =~ s/\n/\\n/gs ;
  foreach my $line (split "(?=<tr)", $html)
  {
    next if $line !~ /^<tr/ ;
    next if $line !~ /class=\"flagicon\"/ ;

    $line =~ s/(?<=<\/tr>).*$// ;

    my @cells = split "(?=<td)", $line ;
    my $cell  = $cells [2] ; # this cell is searched for flag image, name and article link

    my $icon = "n.a." ;
    if ($cell =~ /<img /)
    {
      $icon = $cell ;
      $icon =~ s/^.*?(<img[^>]*>).*$/$1/ ;
      $icon =~ s/class=\"[^\"]*\"// ;
      $icon =~ s/\s*\/>/>/ ;
    }

    # the two fallbacks below used to assign an unused variable $title, which
    # left $country / $link at the value of the previously processed row
    my $country = "n.a." ;
    if ($cell =~ /title/)
    {
      $country = $cell ;
      $country =~ s/^.*?<a [^>]*>([^<]*)<.*$/$1/ ;
    }

    my $link = "n.a." ;
    if ($cell =~ /<a /)
    {
      $link = $cell ;
      $link =~ s/^.*?(<a [^>]*>.*?<\/a>).*$/$1/ ;
      $link =~ s/\/wiki/http:\/\/en.wikipedia.org\/wiki/ ;
    }

    (my $population = $cells [3]) =~ s/<td[^>]*>(.*?)<.*$/$1/ ;
    $population =~ s/,/_/g ;

    # NOTE(review): as written these three substitutions replace a comma by a
    # comma. Presumably they were meant to escape commas so that a field can
    # not break the csv record - confirm against the repository copy.
    $country =~ s/,/,/g ;
    $link    =~ s/,/,/g ;
    $icon    =~ s/,/,/g ;

    $countries {$country} = "$country,$link,$population,connected,$icon\n" ;
  }

  $url  = 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users';
  $html = get ($url) ;
  die "Timed out!" if ! defined $html ;

  # same row splitting as above
  $html =~ s/\n/\\n/gs ;
  foreach my $line (split "(?=<tr)", $html)
  {
    next if $line !~ /^<tr/ ;
    next if $line !~ /class=\"flagicon\"/ ;

    $line =~ s/(?<=<\/tr>).*$// ;

    my @cells = split "(?=<td)", $line ;

    my $country = "n.a." ;
    if ($cells [2] =~ /title/)
    {
      $country = $cells [2] ;
      $country =~ s/^.*?<a [^>]*>([^<]*)<.*$/$1/ ;
    }

    (my $connected = $cells [3]) =~ s/<td[^>]*>(.*?)<.*$/$1/ ;
    $connected =~ s/,/_/g ;

    $country =~ s/,/,/g ; # NOTE(review): no-op as written, see remark in first loop

    # map names on this page to the spelling used as key in %countries
    $country =~ s/Bosnia-Herzegovina/Bosnia and Herzegovina/ ;
    $country =~ s/Cote d'Ivoire/C\xF4te d'Ivoire/ ; # \xF4 = o with circumflex, written as escape so it survives re-encoding of this file
    $country =~ s/Macao/Macau/ ; # will be changed back later
    $country =~ s/Samoa/American Samoa/ ;
    $country =~ s/Timor Leste/Timor-Leste/ ;
    $country =~ s/UAE/United Arab Emirates/ ;

    $countries {$country} =~ s/connected/$connected/ ;
  }

  my $file_meta_info = "$path_out/SquidReportCountryMetaInfo.csv" ;
  open my $fh_meta_info, '>', $file_meta_info or die "Could not open '$file_meta_info' for writing: $!" ;
  foreach my $name (sort keys %countries)
  { print {$fh_meta_info} $countries {$name} ; }
  close $fh_meta_info or die "Could not close '$file_meta_info': $!" ;
}
| 5917 | + |
# Look up the display name for a language code.
#
# param  $language  key for package hash %out_languages
# return list (name, anchor):
#          name   = entry from %out_languages, or "<code> (?)" when the
#                   hash has no non-empty entry for the code
#          anchor = name with every space replaced by an underscore
sub GetLanguageInfo
{
  my ($code) = @_ ;

  my $name = $out_languages {$code} ;
  $name = "$code (?)" if $name eq "" ;

  (my $anchor = $name) =~ tr/ /_/ ;

  return ($name, $anchor) ;
}
| 5928 | + |
# Return the meta data stored for a country in package hash %country_meta_info.
#
# The hash value is split on commas into link, population, connected, icon.
# Underscores are removed from both figures and ' border=1' is inserted
# before the first '>' of the icon.
#
# param  $country  key for %country_meta_info
# return list (link, icon, population, connected);
#        when the hash has no non-empty entry: ($country, '', '..', '..'),
#        and the missing country is reported on STDOUT once per country
#        (bookkeeping in %country_meta_info_not_found_reported)
#
# NOTE(review): $connected is assigned as a package variable here, not as a lexical.
sub CountryMetaInfo
{
  my ($country) = @_ ;
print "Country '$country'\n" ; # qqq

  my $meta_info = $country_meta_info {$country} ;

  if ($meta_info eq "")
  {
    print "_Meta info not found for country '$country'\n"
      if $country_meta_info_not_found_reported {$country} ++ == 0 ;
    return ($country,'','..','..') ;
  }

  my ($link_country, $population, $icon) ;
  ($link_country,$population,$connected,$icon) = split ',', $meta_info ;

  foreach my $figure ($population, $connected)
  { $figure =~ s/_//g ; }

  # NOTE(review): as written these replace a comma by a comma
  foreach my $field ($link_country, $icon)
  { $field =~ s/,/,/g ; }

  $icon =~ s/>/ border=1>/ ;

  return ($link_country,$icon,$population,$connected) ;
}
| 5952 | + |
# Abbreviate a number with a K (thousands) or M (millions) suffix.
#
#   0                    -> ' '
#   below 1000           -> returned unchanged
#   1000 .. 9999         -> one decimal  + ' K'
#   10000 .. 999999      -> no decimals  + ' K'
#   1000000 .. 99999999  -> one decimal  + ' M'
#   100000000 and up     -> no decimals  + ' M', with a comma inserted
#                           before the last three digits when there are
#                           more than three
#
# The suffixes are package variables $out_thousand / $out_million, which
# are (re)assigned on every call.
#
# param  $v  number
# return formatted string (or the unchanged value)
sub i2KM
{
  my ($value) = @_ ;

  $out_thousand = 'K' ;
  $out_million  = 'M' ;

  return (" ") if $value == 0 ;

  if ($value >= 100000000)
  {
    my $text = sprintf ("%.0f", $value / 1000000) . " " . $out_million ;
    $text =~ s/(\d+?)(\d\d\d[^\d])/$1,$2/ ;
    return ($text) ;
  }

  return (sprintf ("%.1f", $value / 1000000) . " " . $out_million)  if $value >= 1000000 ;
  return (sprintf ("%.0f", $value / 1000)    . " " . $out_thousand) if $value >= 10000 ;
  return (sprintf ("%.1f", $value / 1000)    . " " . $out_thousand) if $value >= 1000 ;

  return ($value) ;
}
| 5975 | + |
# Coarser variant of i2KM, which also passes through non-numeric input.
#
#   anything that is not digits only -> returned unchanged
#   0 (or empty string)              -> ' '
#   below 1000                       -> returned unchanged
#   1000 .. 999999                   -> no decimals + ' K'
#   1000000 .. 9999999               -> one decimal + ' M'
#   10000000 and up                  -> no decimals + ' M'
#
# The suffixes are package variables $out_thousand / $out_million, which
# are (re)assigned on every call, also when the input is passed through.
#
# param  $v  number or arbitrary text
# return formatted string (or the unchanged value)
sub i2KM2
{
  my ($value) = @_ ;

  $out_thousand = 'K' ;
  $out_million  = 'M' ;

  if ($value !~ /^\d*$/)
  { return $value ; }

  if ($value == 0)
  { return (" ") ; }

  my $millions  = $value / 1000000 ;
  my $thousands = $value / 1000 ;

  return (sprintf ("%.0f", $millions)  . " " . $out_million)  if $value >= 10000000 ;
  return (sprintf ("%.1f", $millions)  . " " . $out_million)  if $value >= 1000000 ;
  return (sprintf ("%.0f", $thousands) . " " . $out_thousand) if $value >= 1000 ;

  return ($value) ;
}
| 5995 | + |
| 5996 | +# format: function(s) { return $.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/\\ \\;/g,"").replace(/M/i,"000000").replace(/М/,"000000").replace(/K/i,"000").replace(/К/i,"000")); }, |
| 5997 | + |
# Replace one <a ..>text</a> link in a piece of html by its plain text.
#
# The html is cut in front of every '<a ', and in piece number $index
# everything up to and including the first link is replaced by the text of
# that link. Text in front of the first '<a ' counts as a piece of its own.
#
# param  $links  html with zero or more links
# param  $index  number (0-based) of the piece to strip
# return the html with the pieces glued together again
sub UnLink
{
  my ($links, $index) = @_ ;

  my @pieces = split '(?=<a )', $links ;
  $pieces [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/$1/ ;

  return (join '', @pieces) ;
}
| 6009 | + |
# Write a html page to file, after removing optional text.
#
# Text between square brackets is optional: with $verbose set only the
# brackets are removed, otherwise brackets and content are removed.
# $verbose is set to $false here, so the content is always removed.
# The first occurrence of 'and images' is removed as well.
#
# param  $html  page content
# param  $path  file to (over)write
# Dies when the file can not be opened, written or closed; before, such
# failures were ignored and the report was silently missing.
#
# NOTE(review): $html, $path and $verbose are package variables, not
# lexicals; kept that way as code outside this sub may read them.
sub PrintHtml
{
  ($html, $path) = @_ ;

  $verbose = $false ;
  if ($verbose)
  { $html =~ s/\[([^\]]*)\]/$1/g ; }
  else
  { $html =~ s/\[([^\]]*)\]//g ; }

  $html =~ s/and images// ; # all data [and images] on this page are in the public domain

  open my $fh_html, '>', $path or die "PrintHtml: could not open '$path' for writing: $!" ;
  print {$fh_html} $html       or die "PrintHtml: could not write to '$path': $!" ;
  close $fh_html               or die "PrintHtml: could not close '$path': $!" ;
}
| 6025 | + |
# Write csv content to file, unchanged.
#
# param  $csv   file content
# param  $path  file to (over)write
# Dies when the file can not be opened, written or closed; before, such
# failures were ignored and the file was silently missing.
#
# NOTE(review): $csv and $path are package variables, not lexicals;
# kept that way as code outside this sub may read them.
sub PrintCsv
{
  ($csv, $path) = @_ ;

  open my $fh_csv, '>', $path or die "PrintCsv: could not open '$path' for writing: $!" ;
  print {$fh_csv} $csv        or die "PrintCsv: could not write to '$path': $!" ;
  close $fh_csv               or die "PrintCsv: could not close '$path': $!" ;
}
| 6034 | + |
# Return the html fragment that makes report tables sortable:
# - script tags for jquery-1.3.2.min.js and jquery.tablesorter.js
# - three custom tablesorter parsers:
#     nohtml     : text, with tags and blanks removed
#     millions   : numeric, with tags and blanks removed, K -> 000, M -> 000000
#     digitsonly : numeric, with tags, blanks and commas removed, first '-' -> '-1'
# - style rules for table.tablesorter (several rule bodies are commented out)
#
# The last style rule used to select '.headerSorthown', a class which is
# defined nowhere; it now reads '.headerSortDown' like the rule above it.
#
# return html as one string
sub HtmlSortTable
{
  my $html = <<__HTML_SORT_TABLE__ ;

<script src="jquery-1.3.2.min.js" type="text/javascript"></script>
<script src="jquery.tablesorter.js" type="text/javascript"></script>

<script type="text/javascript">
\$.tablesorter.addParser({
 id: "nohtml",
 is: function(s) { return false; },
 format: function(s) { return s.replace(/<.*?>/g,"").replace(/ /g,""); },
 type: "text"
});

\$.tablesorter.addParser({
 id: "millions",
 is: function(s) { return false; },
 format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); },
 type: "numeric"
});


\$.tablesorter.addParser({
 id: "digitsonly",
 is: function(s) { return false; },
 format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<.*?>/g,"").replace(/ /g,"").replace(/,/g,"").replace(/-/,"-1")); },
 type: "numeric"
});
</script>

<style type="text/css">
table.tablesorter
{
/*
 font-family:arial;
 background-color: #CDCDCD;
 margin:10px 0pt 15px;
 font-size: 7pt;
 width: 80%;
 text-align: left;
*/
}
table.tablesorter thead tr th, table.tablesorter tfoot tr th
{
/*
 background-color: #99D;
 border: 1px solid #FFF;
 font-size: 8pt;
 padding: 4px;
*/
}
table.tablesorter thead tr .header
{
 background-color: #ffffdd;
 background-image: url(bg.gif);
 background-repeat: no-repeat;
 background-position: center right;
 cursor: pointer;
}
table.tablesorter tbody th
{
/*
 color: #3D3D3D;
 padding: 4px;
 background-color: #CCF;
 vertical-align: top;
*/
}
table.tablesorter tbody tr.odd th
{
 background-color:#eeeeaa;
 background-image:url(asc.gif);
}
table.tablesorter thead tr .headerSortUp
{
 background-color:#eeeeaa;
 background-image:url(asc.gif);
}
table.tablesorter thead tr .headerSortDown
{
 background-color:#eeeeaa;
 background-image:url(desc.gif);
}
table.tablesorter thead tr .headerSortDown, table.tablesorter thead tr .headerSortUp
{
 background-color: #eeeeaa;
}
</style>
__HTML_SORT_TABLE__
return ($html) ;
}
| 6127 | + |
# Return the script fragment that activates tablesorter on the table with
# id 'table1' and assigns a parser to each column:
#   columns 0-2 : nohtml
#   columns 3-5 : millions
#   columns 6-7 : digitsonly
#
# NOTE(review): the headers object lists keys 6 and 7 twice with the same
# value. Possibly the second pair was meant for columns 8 and 9 - confirm
# against the number of columns of the generated table.
#
# return html as one string
sub HtmlSortTableColumns
{
  return (<<__SORT_TABLE_COLUMNS__) ;

<script type='text/javascript'>
\$('#table1').tablesorter({
 // debug:true,
 headers:{0:{sorter:'nohtml'},1:{sorter:'nohtml'},2:{sorter:'nohtml'},3:{sorter:'millions'},4:{sorter:'millions'},5:{sorter:'millions'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'}}
});
</script>
__SORT_TABLE_COLUMNS__
}
| 6141 | + |
# Return two table rows that hold a collapsible index:
# a 'Show index' / 'Hide index' link, and the index itself in a span which
# is hidden initially. A small javascript function toggles the span and
# the link text.
#
# param  $index  html for the index, inserted as is between two newlines
# return html as one string
#
# NOTE(review): $index is assigned as a package variable here, not as a lexical.
sub HtmlIndex
{
  ($index) = @_ ;

  return (<<__INDEX_ROWS__) ;

<script type="text/javascript">
<!--
function toggle_visibility_index()
{
 var index = document.getElementById('index');
 var toggle = document.getElementById('toggle');
 if (index.style.display == 'block')
 {
 index.style.display = 'none';
 toggle.innerHTML = 'Show index';
 }
 else
 {
 index.style.display = 'block';
 toggle.innerHTML = 'Hide index';
 }
}
//-->
</script>

<tr><td class=r colspan=99><a href="#" id='toggle' onclick="toggle_visibility_index();">Show index</a></td></tr>
<tr><td class=l colspan=99><span id='index' style="display:none">\n$index\n</span></td></tr>
__INDEX_ROWS__
}
| 6174 | + |
# Convert a colour from hue/saturation/value to red/green/blue.
#
# param  $h  hue in degrees, 0 .. 360 (360 is the same hue as 0)
# param  $s  saturation, 0 .. 1
# param  $v  value, 0 .. 1
# return list (r, g, b), each a possibly fractional number in 0 .. 255;
#        (undef, undef, undef) when an argument is missing or out of range
sub hsv_to_rgb {

  my $h = shift;
  my $s = shift;
  my $v = shift;

  # limit this to h values between 0 and 360 and s/v values
  # between 0 and 1

  unless (defined($h) && defined($s) && defined($v) &&
          $h >= 0 && $s >= 0 && $v >= 0 &&
          $h <= 360 && $s <= 1 && $v <= 1) {
    return (undef, undef, undef);
  }

  my $r;
  my $g;
  my $b;

  # 0.003 is less than 1/255; use this to make the floating point
  # approximation of zero, since the resulting rgb values will
  # normally be used as integers between 0 and 255. Feel free to
  # change this approximation of zero to something else, if this
  # suits you.

  if ($s < 0.003) {
    $r = $g = $b = $v;
  }
  else {

    $h /= 60;
    my $sector   = int($h);
    my $fraction = $h - $sector;

    # h == 360 is accepted above and yields sector 6 with fraction 0.
    # It used to be handled by the final 'else' (sector 5), which gave
    # magenta (v,p,v) instead of the red (v,p,p) that h == 0 gives.
    $sector = 0 if $sector == 6;

    my $p = $v * (1 - $s);
    my $q = $v * (1 - ($s * $fraction));
    my $t = $v * (1 - ($s * (1 - $fraction)));

    if    ($sector == 0) { $r = $v; $g = $t; $b = $p; }
    elsif ($sector == 1) { $r = $q; $g = $v; $b = $p; }
    elsif ($sector == 2) { $r = $p; $g = $v; $b = $t; }
    elsif ($sector == 3) { $r = $p; $g = $q; $b = $v; }
    elsif ($sector == 4) { $r = $t; $g = $p; $b = $v; }
    else                 { $r = $v; $g = $p; $b = $q; }
  }

  # Convert the r/g/b values to all be between 0 and 255; use the
  # ol' 0.003 approximation again, with the same comment as above.

  $r = ($r < 0.003 ? 0.0 : $r * 255);
  $g = ($g < 0.003 ? 0.0 : $g * 255);
  $b = ($b < 0.003 ? 0.0 : $b * 255);

  return ($r, $g, $b);
}
| 6230 | + |
# Convert a colour from hue/saturation/value to a html colour code.
#
# The conversion itself is done by hsv_to_rgb; each of the three results
# is truncated to an integer and printed as two uppercase hex digits.
#
# param  $h, $s, $v  passed on to hsv_to_rgb unchanged
# return string '#RRGGBB'
sub hsv2rgb
{
  my ($h,$s,$v) = @_;

  my ($red, $green, $blue) = hsv_to_rgb ($h,$s,$v) ;

  return (sprintf ("#%02X%02X%02X", int ($red), int ($green), int ($blue))) ;
}
| 6239 | + |
# Return a table row with three world maps from Wikimedia Commons, each
# with a link to the related article on the English Wikipedia:
# world population, internet penetration and the north-south divide.
#
# The maps are laid out in a nested table. Its first row used to start
# without <tr> (two <tr> against three </tr> in the fragment); the tag has
# been added so that the markup is balanced.
#
# return html as one string
sub HtmlWorldMaps
{
my $html_worldmaps = <<__HTML_WORLD_MAPS__ ;
<tr><td colspan=99 align=center>
<table width='100%' align=center><tr><td align=left>
<small>
<img src='http://upload.wikimedia.org/wikipedia/commons/thumb/b/b1/World_population.PNG/400px-World_population.PNG' border='1'>
<br><a href='http://en.wikipedia.org/wiki/List_of_countries_by_population'>Countries by population</a> - English Wikipedia
</small>
</td><td>
<small>
<img src='http://upload.wikimedia.org/wikipedia/commons/thumb/a/af/Internet_Penetration.png/400px-Internet_Penetration.png' border='1'>
<br><a href='http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'>Internet penetration</a> (% of population) - English Wikipedia
</small>
</td></tr>
<tr><td>
<small>
<img src='http://upload.wikimedia.org/wikipedia/commons/thumb/4/46/North_South_divide.svg/400px-North_South_divide.svg.png' border='1'>
<br><a href='http://en.wikipedia.org/wiki/North-South_divide'>Global North South</a> - English Wikipedia
</small>
</td></tr>
</table>
</td></tr>
__HTML_WORLD_MAPS__

return $html_worldmaps ;
}