r86327 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86326‎ | r86327 | r86328 >
Date:17:50, 18 April 2011
Author:ezachte
Status:deferred
Tags:
Comment:
fixed region codes for 4 countries, fixed selection of squid log files, several report tweaks (overview report: extra column for perc. world population per country in/ reordered columns)
Modified paths:
  • /trunk/wikistats/squids/SquidCountArchive.pl (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm (added) (history)
  • /trunk/wikistats/squids/SquidReportArchive.pl (modified) (history)

Diff [purge]

Index: trunk/wikistats/squids/SquidCountArchive.pl
@@ -11,7 +11,7 @@
1212 use SquidCountArchiveWriteOutput ;
1313
1414 # set defaults mainly for tests on local machine
15 - default_argv "-d 2010/05/10" ;
 15+ default_argv "-d 2011/04/01" ;
1616
1717 # http://wikitech.wikimedia.org/view/Squid_log_format
1818 # 1. Hostname
@@ -45,7 +45,7 @@
4646 if (! $job_runs_on_production_server)
4747 {
4848 $test = $true ;
49 - $file_test = "w:/# Out Locke/sampled-1000-log-20100510b.txt" ;
 49+ $file_test = "w:/# Out Locke/sampled-1000-log-20110401.txt" ;
5050 # $file_test = getcwd . "/SquidDataFilterFY.txt" ;
5151 if (! -e $file_test)
5252 { abort "Test input file '$file_test' not found" ; }
@@ -58,8 +58,8 @@
5959 else
6060 { $path_root = "w:/! perl/squids/archive/test" ; }
6161
62 - $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Palm Pre|Playstation|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|Wikiamo|Wikipanion" ;
63 - $tags_mobile_upd = "May 2010" ;
 62+ $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|Kindle|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Opera Mobi|Palm Pre|Playstation|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|Wikiamo|Wikipanion" ;
 63+ $tags_mobile_upd = "March 2011" ;
6464
6565 $pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ;
6666 $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ;
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
@@ -0,0 +1,1188 @@
 2+# to do: study http://www.zytrax.com/tech/web/mobile_ids.html
 3+
 4+sub ProcessLine
 5+{
 6+ my $line = shift ;
 7+
 8+ my @fields = split (' ', $line) ;
 9+ $time = $fields [2] ;
 10+ $date = substr ($time,0,10) ;
 11+
 12+ $client_ip = $fields [4] ;
 13+ $mime = $fields [10] ;
 14+
 15+ if ($scan_ip_frequencies) # phase 1
 16+ {
 17+ return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ;
 18+
 19+ if ($mime eq "text/html")
 20+ {
 21+ $ip_frequencies {$client_ip} ++ ;
 22+ $html_pages_found ++ ;
 23+ }
 24+
 25+ return ;
 26+ }
 27+
 28+ # remember for each squid per hour lowest and highest sequence number and number of events
 29+ # later calc per hour average distance between events = (higest - lowest sequence number) / events - 1
 30+ # distance between consecutive events that lay in different hour bin are ignored, begligible
 31+ $squid = $fields [0] ;
 32+ $seqno = $fields [1] ;
 33+ $hour = substr ($time, 11, 2) ;
 34+
 35+ # init for new found or restarted squid
 36+ # note seqno can be negative! probably unsigned int printed as signed int, 3rd clause deals with this
 37+ if (($squid_seqno {$squid} == 0) || ($seqno < $squid_seqno {$squid}) || (($seqno > 0) && ($squid_seqno {$squid} < 0)))
 38+ { $squid_seqno {$squid} = $seqno ; }
 39+ else
 40+ {
 41+ $squid_events {"$squid,$hour"} ++ ;
 42+ $delta = $seqno - $squid_seqno {$squid};
 43+ $squid_delta {"$squid,$hour"} += $delta ;
 44+ $squid_seqno {$squid} = $seqno ;
 45+ }
 46+
 47+ # now parse all other fields
 48+
 49+ $status = $fields [5] ;
 50+ $size = $fields [6] ;
 51+ $method = $fields [7] ;
 52+ $url = lc ($fields [8]) ;
 53+
 54+ $referer = lc ($fields [11]) ;
 55+ $agent = $fields [13] ;
 56+
 57+ $url =~ s/^http\w?\:\/\///o ;
 58+ $url =~ s/\%3A/:/gio ;
 59+ $url =~ s/\%3B/;/gio ;
 60+ $url =~ s/\&amp;/\&/gio ;
 61+
 62+ ($agent2 = $agent) =~ s/\%20/ /g ; # mainly to make line content more readable on debugging
 63+ $agent2 =~ s/\%2F/\//g ; # mainly to make line content more readable on debugging
 64+ $agents_raw {$agent2}++ ;
 65+
 66+ ($file,$ext) = &GetFileExt ($url) ;
 67+ $exts {$ext}++ ;
 68+
 69+ if (($ext eq "js") || ($ext eq "css"))
 70+ { $scripts {"$ext,$file,"} ++ ; }
 71+
 72+ $title = "" ;
 73+ $parm = "" ;
 74+ if ($ext eq "php")
 75+ {
 76+ if ($url =~ /title=/o)
 77+ {
 78+ $title = $url ;
 79+ $title =~ s/^.*?title=//o ;
 80+ $title =~ s/\&.*$//o ;
 81+ }
 82+ ($url,$parm) = &NormalizeParms ($url) ;
 83+ if ($parm eq "?") { return ; } # error
 84+ $file =~ s/,/&comma;/go ;
 85+ $parm =~ s/,/&comma;/go ;
 86+ $scripts {"php,$file,$parm"} ++ ;
 87+ $ext .= "($file)" ; # add filename behind extension php
 88+ }
 89+
 90+ if ($mime eq "text/html")
 91+ {
 92+ $mimecat = "page" ;
 93+ $tot_mime_html ++ ;
 94+ }
 95+ elsif ($mime =~ /(?:gif|png|jpeg)/o)
 96+ { $mimecat = "image" ; }
 97+ else
 98+ { $mimecat = "other" ; }
 99+
 100+ if ($job_runs_on_production_server)
 101+ {
 102+ $country = $fields [14] ;
 103+ if (($country eq "") || ($country =~ /null/))
 104+ { $country = "--" ; }
 105+ }
 106+ else
 107+ {
 108+ $country = $fields [14] ;
 109+ if ($country eq "")
 110+ {
 111+ if (++ $fake_country_code % 3 == 0)
 112+ { $country = "XX" ; }
 113+ else
 114+ { $country = "YY" ; }
 115+ }
 116+ }
 117+
 118+ if ($line =~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/io)
 119+ {
 120+ $banners {"$country,$url"} ++ ;
 121+ $banner_requests_ignored ++ ;
 122+ return ;
 123+ }
 124+
 125+ $countries {$country}++ ;
 126+
 127+ $agent2 = $agent ;
 128+ $agent2 =~ s/\%20/ /g ;
 129+
 130+ # remove all mentions of .NET CLR
 131+ # http://en.wikipedia.org/wiki/Common_Language_Runtime
 132+ $agent2 =~ s/\.NET CLR [0-9.]+\s*;?\s*//go ;
 133+ $agent2 =~ s/\(\s*\)//go ;
 134+
 135+ # e.g. BlackBerry8310/4.2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102 -> BlackBerry8310/4.2.2
 136+ if ($agent2 =~ BlackBerry)
 137+ { $agent2 =~ s/^.*?BlackBerry\d+\/([^\s]*).*$/BlackBerry\/$1/io ; } # keep
 138+
 139+ $agent2 =~ s/Android (\d)/Android\/$1/o ;
 140+ $agent2 =~ s/Safari(\d)/Safari\/$1/o ;
 141+ $agent2 =~ s/Browser\/NetFront/NetFront/o ;
 142+ $agent2 =~ s/Browser\/VF-NetFront/NetFront/o ;
 143+ $agent2 =~ s/jig browser (\d)/JigBrowser\/$1/o ;
 144+ $agent2 =~ s/jig browser9 (\d)/JigBrowser\/$1/o ;
 145+ $agent2 =~ s/jig browser web; (\d)/JigBrowser9\/$1/o ;
 146+
 147+ # Remove explanation for KHTML
 148+ $agent2 =~ s/\(KHTML, like Gecko\)/KHTML/o ;
 149+ $agent2 =~ s/(KHTML[^\s]*) \(like Gecko\)/$1/o ;
 150+
 151+ # Remove name of Ubuntu release (or name -> number)
 152+ $agent2 =~ s/(Ubuntu\/[0-9\.]+)\s*\(\w+\)/$1/gio ;
 153+ $agent2 =~ s/\(Ubuntu-(\w)\w+\)/("Ubuntu\/".(ord (lc($1))-ord('a')+1))/gieo ;
 154+ $agent2 =~ s/Ubuntu\/([a-zA-Z])\w+/("Ubuntu\/".(ord (lc($1))-ord('a')+1))/gieo ;
 155+
 156+ $agent2 =~ s/;\s*U\s*;/;/o ;
 157+
 158+ if ($agent2 =~ /GoogleBot/io)
 159+ {
 160+ $client_ip2 = &MatchIpRange ($client_ip) ;
 161+ if ($client_ip2 =~ /!google/o)
 162+ {
 163+ $agent2 .= " |Google ip address" ;
 164+ $client_ip_range = $client_ip ;
 165+ $client_ip_range =~ s/\.\d+$//o ;
 166+ $google_bot_hits {substr ($time,0,13).','.$client_ip_range} ++ ;
 167+ }
 168+ elsif ($agent2 !~ /compatible GoogleBot/io)
 169+ {
 170+ $agent2 .= " |no Google ip address" ;
 171+ $ip_bot_no_google {$client_ip}++ ;
 172+ }
 173+ }
 174+
 175+ $bot = $false ;
 176+
 177+ # url in agent string should only occur for bots (agent string is free format, no rules, just conventions)
 178+ # exception: Embedded Web Browser from: http://bsalsa.com/,
 179+ # see also http://www.bsalsa.com/forum/showthread.php?t=724
 180+ if (($agent2 =~ /http:\/\//o) && ($agent2 !~ /bsalsa.com/o))
 181+ {
 182+ if ($agent2 !~ /MSIE \d+\/\d+/o) # most likely false positives
 183+ {
 184+ $bot = $true ;
 185+ @bots {"$mime,$agent2"} ++ ;
 186+ }
 187+ }
 188+ elsif (($agent2 =~ /bot/io) || (($agent2 =~ /crawl(?:er)?/io) && ($agent2 !~ /MSIEcrawler/io)) || ($agent2 =~ /spider/io) || ($agent2 =~ /parser/io))
 189+ {
 190+ $bot = $true ;
 191+ @bots {"$mime,$agent2"} ++ ;
 192+ }
 193+
 194+ # GECKO
 195+ $gecko = "" ;
 196+ if ($agent2 =~ /Gecko\/\d{4,}/io)
 197+ { ($gecko = $agent2) =~ s/^.*?Gecko\/(\d{4}).*$/Gecko\/$1/io ; }
 198+
 199+ # APPLEWEBKIT
 200+ $applewebkit = "" ;
 201+ if ($agent2 =~ /AppleWebKit/io)
 202+ {
 203+ ($applewebkit = $agent2) =~ s/^.*?AppleWebKit\/(\d+\.\d+).*$/AppleWebKit\/$1/io ;
 204+ $applewebkit =~ s/^.*?AppleWebKit\/(\d+).*$/AppleWebKit\/$1/io ;
 205+ $applewebkit =~ s/\/(\d\d)$/\/0$1/o ;
 206+
 207+ if ($agent2 =~ /Mozilla.{1,8}\(/io)
 208+ {
 209+ $agent3 = $agent2 ;
 210+ $agent3 =~ s/^[^\(]*\(//o ;
 211+ $agent3 =~ s/;.*$//o ;
 212+ $agent3 =~ s/\).*$//o ;
 213+ $agent3 =~ s/^\s+//o ;
 214+ $agent3 =~ s/\s+$//o ;
 215+ $agent3 = substr ($agent3,0,20) ;
 216+
 217+ $platform = '' ;
 218+ if ($agent2 =~ /Chrome/io)
 219+ { ($platform = $agent2) =~ s/^.*?(Chrome\/?\s*\d+\.?\d*).*$/$1/io ; }
 220+ elsif ($agent2 =~ /Android/io)
 221+ { ($platform = $agent2) =~ s/^.*?(Android\/?\s*\d+\.?\d*).*$/$1/io ; }
 222+ elsif ($agent2 =~ /(?:iPad|iPod|iPhone)/io)
 223+ {
 224+ ($platform = $agent2) =~ s/^.*?(OS\s*\d\_?\d?).*$/$1/io ;
 225+ $platform =~ s/_/./go ;
 226+ }
 227+ elsif ($agent2 =~ /Kindle/io)
 228+ { ($platform = $agent2) =~ s/^.*?(Kindle\/?\s*\d+\.?\d*).*$/$1/io ; }
 229+ elsif ($agent2 =~ /Safari/io)
 230+ { ($platform = $agent2) =~ s/^.*?(Safari\/\d+).*$/$1/io ; }
 231+
 232+ if (($agent2 =~ /Symbian/i) && ($agent3 !~ /Symbian/io))
 233+ { ($platform = $agent2) =~ s/^.*?(Symbian[\w\d\.\/]+).*$/$1/io ; }
 234+
 235+ if ($platform ne '')
 236+ {
 237+ $platform =~ s/^\s+//o ;
 238+ $platform =~ s/\s+$//o ;
 239+ $platform = " $platform" ;
 240+ }
 241+
 242+ $applewebkit .= " ($agent3$platform)" ;
 243+ }
 244+
 245+ if (($agent2 =~ /Nokia/io) && ($applewebkit !~ /Nokia/io))
 246+ { $applewebkit .= " (Nokia)" ; }
 247+
 248+ # if ($agent2 =~ /\(iPad/i) { $applewebkit .= " (iPad)" ; }
 249+ # elsif ($agent2 =~ /\(iPod/i) { $applewebkit .= " (iPod)" ; }
 250+ # elsif ($agent2 =~ /\(iPhone/i) { $applewebkit .= " (iPhone)" ; }
 251+ # elsif ($agent2 =~ /\(Windows/i) { $applewebkit .= " (Win)" ; }
 252+ # elsif ($agent2 =~ /\(Macintosh/i) { $applewebkit .= " (Mac)" ; }
 253+ # else { $applewebkit .= " (--)" ; }
 254+ }
 255+
 256+ # MOBILE
 257+ $mobile = '-' ;
 258+ if ($agent2 =~ /(?:$tags_mobile)/io)
 259+ { $mobile = 'M' ; }
 260+
 261+ $os = ".." ;
 262+
 263+ if ($agent2 =~ /(?:Wikiamo|Wikipanion)/io) { $os = "iPhone" ; }
 264+ elsif ($agent2 =~ /BlackBerry/io) {($os = $agent2) =~ s/^.*?BlackBerry[^\/]*\/(\d+\.\d+).*$/BlackBerry\/$1/io ; } # BlackBerry/8320/4.2 -> BlackBerry/4.2
 265+ elsif ($agent2 =~ /DoCoMo/io) { $os = "DoCoMo" ; }
 266+ elsif ($agent2 =~ /iPad/io) { $version = "iPad" ; ($os = $agent2) =~ s/^.*?(iPad OS \d+\_\d+).*$/$1/io ; }
 267+ elsif ($agent2 =~ /iPod/io) { $version = "iPod" ; ($os = $agent2) =~ s/^.*?(iPhone OS \d+\_\d+).*$/$1/io ; }
 268+ elsif ($agent2 =~ /iPhone/io) { $version = "iPhone" ; ($os = $agent2) =~ s/^.*?(iPhone OS \d+\_\d+).*$/$1/io ; }
 269+ elsif ($agent2 =~ /webOS.* Pre/io) { $version = "Pre" ; ($os = $agent2) =~ s/^.*?(webOs\/\d+\.?\d*).*$/$1/io ; } # Palm Pre
 270+ elsif ($agent2 =~ /Intel Mac/io) { $os = "Mac Intel" ; }
 271+ elsif ($agent2 =~ /PPC Mac/io) { $os = "Mac PowerPC" ; }
 272+ elsif ($agent2 =~ /Mac_PowerPC/io) { $os = "Mac PowerPC" ; }
 273+ elsif ($agent2 =~ /Macintosh.*PPC/io) { $os = "Mac PowerPC" ; }
 274+ elsif ($agent2 =~ /Mac OS/io) { $os = "Mac" ; }
 275+ elsif ($agent2 =~ /MacBook/io) { $os = "Mac" ; }
 276+ elsif ($agent2 =~ /iMac/io) { $os = "iMac" ; }
 277+ elsif ($agent2 =~ /Power.*Macintosh/io) { $os = "Mac PowerPC" ; }
 278+ elsif ($agent2 =~ /FreeBSD/io) { $os = "FreeBSD" ; }
 279+ elsif ($agent2 =~ /OpenBSD/io) { $os = "OpenBSD" ; }
 280+ elsif ($agent2 =~ /SunOS/io) { $os = "SunOS" ; }
 281+ elsif ($agent2 =~ /PlayStation/io) { $os = "PlayStation" ; }
 282+ elsif ($agent2 =~ /SymbianOS/io) { ($os = $agent2) =~ s/^.*?SymbianOS[^\/]*\/(\d+\.\d+).*$/SymbianOS\/$1/io ; }
 283+ elsif ($agent2 =~ /Symbian.*OS/io) { $os = "SymbianOS/0.0" ; }
 284+# elsif ($agent2 =~ /Linux i686/io) { $os = "Linux i686" ; }
 285+# elsif ($agent2 =~ /Linux x86_64/io) { $os = "Linux x86_64" ; }
 286+# elsif ($agent2 =~ /Linux armv\d+/io) { $os = "Linux armv" ; }
 287+# elsif ($agent2 =~ /Linux ppc\d+/io) { $os = "Linux ppc" ; }
 288+# elsif ($agent2 =~ /Linux mips/io) { $os = "Linux mips" ; }
 289+ elsif ($agent2 =~ /Linux/io) { $os = "Linux" ; }
 290+ elsif ($agent2 =~ /Win95/io) { $os = "Windows 95" ; }
 291+ elsif ($agent2 =~ /Win(?:dows)[+\s-]?98/io) { $os = "Windows 98" ; }
 292+ elsif ($agent2 =~ /Win(?:dows)?[+\s-]?9x/io) { $os = "Windows 9x" ; }
 293+ elsif ($agent2 =~ /WinNT4.0/io) { $os = "Windows NT 4.0" ; }
 294+ elsif ($agent2 =~ /Windows XP/io) { $os = "Windows XP" ; } # Windows XP 2600.xpsp.14648-27197 -> Windows XP
 295+ elsif ($agent2 =~ /Windows CE/io) { $os = "Windows CE" ; }
 296+ elsif ($agent2 =~ /Windows; PPC/io) { $os = "Windows CE" ; }
 297+ elsif ($agent2 =~ /NT \d+\.\d+.*Windows/io) { ($os = $agent2) =~ s/^.*?NT (\d+\.\d+).*$/Windows NT $1/io ; }
 298+ elsif ($agent2 =~ /Windows NT \d+\.\d+/io) { ($os = $agent2) =~ s/^.*?Windows NT (\d+\.\d+).*$/Windows NT $1/io ; }
 299+ elsif ($agent2 =~ /Windows NT/io) { $os = "Windows NT" ; }
 300+ elsif ($agent2 =~ /Windows VISTA/io) { $os = "Windows VISTA" ; }
 301+ elsif ($agent2 =~ /Windows 7/io) { $os = "Windows 7" ; }
 302+# elsif ($agent2 =~ /Windows/io) { ($os = $agent2) =~ s/^.*?(Windows.{10,10}[^;\(\)\[\]]*).*$/$1/io ; }
 303+ elsif ($agent2 =~ /Windows/io) { $os = "Windows" ; }
 304+ elsif ($agent2 =~ /Win32/io) { $os = "Windows 32" ; }
 305+ elsif ($agent2 =~ /Wii/io) { $os = "Wii" ; }
 306+ elsif ($agent2 =~ /SonyEricsson/io) { $os = "SonyEricsson" ; }
 307+ elsif ($agent2 =~ /Samsung/io) { $os = "Samsung" ; }
 308+ elsif ($agent2 =~ /Nokia/io) { $os = "Nokia" ; }
 309+ elsif ($agent2 =~ /Palm Pre/io) { $os = "Palm Pre" ; }
 310+ elsif ($agent2 =~ /Vodafone/io) { $os = "Vodafone" ; }
 311+ elsif ($agent2 =~ /Danger/io) { $os = "Danger" ; }
 312+ elsif ($agent2 =~ /J2ME\/MIDP/io) { $os = "Java/ME" ; }
 313+ elsif ($agent2 =~ /Kindle/io) { $os = "Kindle" ; }
 314+
 315+ if (($os eq '..') && ($mobile eq 'M'))
 316+ {
 317+ $os = "Mobile other" ;
 318+ $mobile_other {$agent2} ++ ;
 319+ }
 320+
 321+ if ($version =~ /(?:Ipod|Iphone)/io)
 322+ {
 323+ if ($os !~ /Iphone OS \d/io)
 324+ { $os = "iPhone OS 1_X" ; }
 325+ if ($agent2 !~ /(?:Opera|Safari)/io)
 326+ { $agent2 .= " Safari/0.0" ; }
 327+ }
 328+ elsif ($version =~ /(?:Ipad)/io)
 329+ {
 330+ if ($os !~ /Ipad OS \d/io)
 331+ { $os = "iPad OS 1_X" ; }
 332+ if ($agent2 !~ /(?:Opera|Safari)/io)
 333+ { $agent2 .= " Safari/0.0" ; }
 334+ }
 335+
 336+ if (($os =~ /Mac/o) && ($agent2 =~ /OS X/o))
 337+ {
 338+ ($osx = $agent2) =~ s/^.*?(OS X[^;\(\)\[\]]*).*$/$1/o ;
 339+ $osx =~ s/(\d+\_\d+).*$/$1/o ;
 340+ $osx =~ s/_/\./o ;
 341+ $os = "$os $osx" ;
 342+ }
 343+
 344+ if ($os =~ /Linux/o)
 345+ {
 346+ ($osx = $agent2) =~ s/^.*?((?:Android|Ubuntu|Gentoo|PCLinuxOS|CentOS|Red Hat|Mandriva|SUSE|Fedora|Epiphany|Debian|Motor\w+)[^\s;\[\]\(\)]*).*$/ucfirst($1)/ieo ;
 347+ if ($osx ne $agent2)
 348+ {
 349+ $osx =~ s/(\d+\_\d+).*$/$1/o ;
 350+ $osx =~ s/^([^-]*)-/$1\//o ; # Debian-1.0 -> Debian/1.0
 351+ $osx =~ s/_/\./o ;
 352+ $osx =~ s/(\d+\.\d+).*$/$1/o ;
 353+ $osx =~ s/^(Motor)(\w+).*$/ucfirst(lc($1)).uc($2)/ieo ;
 354+ $os = "$os $osx" ;
 355+ }
 356+ }
 357+
 358+ $os =~ s/(Windows NT \d+\.\d+).*$/$1/o ;
 359+
 360+ if ($bot)
 361+ { $agent2 = "BOT $agent2" ; }
 362+
 363+ elsif ($agent2 eq "-")
 364+ {;}
 365+
 366+ # KINDLE
 367+ elsif ($agent2 =~ /Kindle/io)
 368+ { ($version = $agent2) =~ s/^.*?(Kindle \d+\.\d+).*$/$1/io ; }
 369+
 370+ # IEMOBILE
 371+ elsif ($agent2 =~ /IEMobile/io)
 372+ { ($version = $agent2) =~ s/^.*?(IEMobile \d+\.\d+).*$/$1/io ; }
 373+
 374+ # PALM PRE
 375+ elsif ($agent2 =~ /webOS\/\d+\.\d+.*Pre\/\d/io)
 376+ { ($version = $agent2) =~ s/^.*?(Pre\/\d+\.?\d*).*$/Palm_$1/o ; }
 377+
 378+ # ANDROID
 379+ elsif ($agent2 =~ /Android\/\d+/io)
 380+ { ($version = $agent2) =~ s/^.*?(Android\/\d+\.?\d*).*$/$1/o ; }
 381+
 382+ # EXPLORER
 383+ elsif ($agent2 =~ /Mozilla\/\d+\.\d+ \(compatible;.*MSIE/io)
 384+ { ($version = $agent2) =~ s/^.*?(MSIE \d+\.\d+).*$/$1/o ; }
 385+
 386+ # CHROME
 387+ elsif ($agent2 =~ /Chrome\/\d/io) # Chrome sometimes mimicked Safari to work around Hotmail bug
 388+ {
 389+ $agent2 =~ s/Windows NT \d\.\d/Windows/o ;
 390+ $agent2 =~ s/(Chrome\/\d+\.\d+)[^;\) ]+/$1/o ;
 391+
 392+ $agent2 = &ExtractLanguage ($agent2, 'Chrome') ;
 393+
 394+ ($version = $agent2) =~ s/^.*?(Chrome\/\d+\.\d+).*$/$1/o ;
 395+ }
 396+
 397+ # SAFARI
 398+ elsif ($agent2 =~ /Safari\/[^\s]+$/io)
 399+ {
 400+ $agent2 = &ExtractLanguage ($agent2, 'Safari') ;
 401+ $agent2 =~ s/(Safari\/\d+\.\d+)[^;\) ]+/$1/o ;
 402+ if ($agent2 =~ /Safari\/\d+\.\d+/o)
 403+ { ($version = $agent2) =~ s/^.*?(Safari\/\d+\.\d+).*$/$1/o ; }
 404+ elsif ($agent2 =~ /Safari\/\d+/o)
 405+ { ($version = $agent2) =~ s/^.*?(Safari\/\d+).*$/$1/o ; }
 406+ }
 407+
 408+ # FIREFOX
 409+ elsif ($agent2 =~ /Firefox\/[^\s]+/io)
 410+ {
 411+ $agent2 = &ExtractLanguage ($agent2, 'Firefox') ;
 412+ $agent2 =~ s/X11; Linux [^;]+/Linux/o ;
 413+ $agent2 =~ s/(Firefox\/\d+\.\d+)[^;\) ]+/$1/o ;
 414+
 415+ if ($agent2 =~ /Firefox\/\d+\.\d+/o)
 416+ { ($version = $agent2) =~ s/^.*?(Firefox\/\d+\.\d+).*$/$1/o ; }
 417+ elsif ($agent2 =~ /Firefox\/\d+/o)
 418+ { ($version = $agent2) =~ s/^.*?(Firefox\/\d+).*$/$1/o ; }
 419+ }
 420+
 421+ # OPERA
 422+ # new format
 423+ elsif ($agent2 =~ /^Opera\/\d/io)
 424+ {
 425+ if ($agent2 =~ /Version\//o)
 426+ { ($version = $agent2) =~ s/^.*?Version\/(\d+\.\d+).*$/Opera\/$1/o ; }
 427+ else
 428+ { ($version = $agent2) =~ s/^.*?(Opera\/\d+\.\d+).*$/$1/o ; }
 429+
 430+ $agent2 =~ s/Windows NT \d\.\d/Windows/o ;
 431+ $agent2 =~ s/X11; Linux [^;]+/Linux/o ;
 432+ $agent2 =~ s/(Opera Mini\/\d+\.\d+)[^;\) ]+/$1/o ;
 433+ $agent2 =~ s/J2ME\/MIDP/Java mobile (J2ME)/o ; # J2ME\/MIDP
 434+
 435+ $agent2 = &ExtractLanguage ($agent2, 'Opera') ;
 436+
 437+ if ($agent2 =~ /Opera Mini/o)
 438+ {
 439+ if ($agent2 =~ /Opera Mini\/\d+\.\d+/o)
 440+ { ($mini = $agent2) =~ s/^.*?Opera (Mini\/\d+\.\d+).*$/$1/o ; }
 441+ else
 442+ { $mini = "Mini/?.?" ; }
 443+ $version = "$version ($mini)" ;
 444+ }
 445+ elsif ($agent2 =~ /Opera Mobi/o)
 446+ {
 447+ if ($agent2 =~ /Opera Mobi\/\d+\.\d+/o)
 448+ { ($mobi = $agent2) =~ s/^.*?Opera (Mobi\/\d+\.\d+).*$/$1/o ; }
 449+ else
 450+ { $mobi = "Mobi/?.?" ; }
 451+ $version = "$version ($mobi)" ;
 452+ }
 453+
 454+ $version =~ s/^\s*(.*?)\s*$/$1/o ;
 455+ }
 456+
 457+ # old format
 458+ elsif ($agent2 =~ /^Mozilla.*\(compatible.*Opera \d/io)
 459+ {
 460+ $agent2 =~ s/Opera (\d+\.\d+)/Opera\/$1/o ;
 461+ $agent2 =~ s/Windows NT \d\.\d/Windows/o ;
 462+ $agent2 =~ s/X11; Linux [^;\)]+/Linux/o ;
 463+ ($version = $agent2) =~ s/^.*?(Opera\/\d+\.\d+).*$/$1/o ;
 464+ $version =~ s/^\s*(.*?)\s*$/$1/o ; # remove leading/trailing spaces
 465+ }
 466+
 467+ # BLACKBERRY
 468+ elsif ($agent2 =~ /BlackBerry\d+/io)
 469+ {
 470+ $agent2 =~ s/(\/\d+\.\d+).*$/$1/o ;
 471+ $agent2 =~ s/BlackBerry/BlackBerry\//o ;
 472+ $version = $agent2 ;
 473+ }
 474+
 475+ # KONQUEROR
 476+ elsif ($agent2 =~ /Konqueror\/\d/io) # Chrome sometimes mimicked Safari to work around Hotmail bug
 477+ {
 478+ $agent2 =~ s/(Konqueror\/\d+\.\d+)[^;\) ]+/$1/o ;
 479+
 480+ ($version = $agent2) =~ s/^.*?(Konqueror\/\d+\.\d+).*$/$1/o ;
 481+ }
 482+
 483+ # WGET
 484+ elsif ($agent2 =~ /Wget\/\d/io)
 485+ {
 486+ $agent2 =~ s/(Wget\/\d+\.\d+)[^;\) ]+/$1/io ;
 487+
 488+ ($version = $agent2) =~ s/^.*?Wget\/(\d+\.\d+).*$/$1/io ;
 489+ }
 490+
 491+ elsif ($os =~ /Iphone OS \d/io)
 492+ { $os = "iPhone OS 1_X" ; }
 493+ elsif ($os =~ /Ipad OS \d/io)
 494+ { $os = "iPad OS 1_X" ; }
 495+
 496+ else
 497+ {
 498+ $browserfound = $false ;
 499+
 500+ @browsers = qw (GranParadiso IceWeasel JigBrowser K-Meleon NetFront Netscape SeaMonkey Shiretoko Sleipnir Songbird) ;
 501+ foreach $browser (@browsers)
 502+ {
 503+ if ($agent2 =~ /$browser/i)
 504+ {
 505+ ($version = $agent2) =~ s/.*?($browser\/\d+\.\d+).*$/$1/i ;
 506+ $browserfound = $true ;
 507+ last ;
 508+ }
 509+ }
 510+ if (! $browserfound)
 511+ {
 512+ ($version = $agent2) =~ s/(^[a-zA-Z0-9-_]+\/\d+\.\d+).*$/$1/io ;
 513+ $version =~ s/[;\[\]\(\)].*$//o ;
 514+ $version =~ s/(\d+\.\d+).*$/$1/o ;
 515+ }
 516+
 517+ $agent2 = "*[$version] [$os] --- $agent2" ;
 518+ }
 519+
 520+ if ((! $bot) && ($agent ne "-"))
 521+ {
 522+ $engine =~ s/,/&comma;/go ;
 523+ if ($gecko ne "")
 524+ { $engines {$gecko} ++ ; }
 525+ elsif ($applewebkit ne "")
 526+ {
 527+ $applewebkit =~ s/AppleWebKit\//AppleWebKit /o ;
 528+ $engines {$applewebkit} ++ ;
 529+ }
 530+
 531+ $version =~ s/,/&comma;/go ;
 532+ if ($os =~ /playstation/io)
 533+ { $version = "NetFront (PlayStation)" ; }
 534+
 535+ $clients {"$mobile,$version"}++ ;
 536+
 537+ $operating_systems =~ s/,/&comma;/go ;
 538+ $operating_systems {"$mobile,$os"} ++ ;
 539+ }
 540+
 541+ if ($count_hits_per_ip_range)
 542+ {
 543+ $client_ip_range = $client_ip ;
 544+ $client_ip_range =~ s/\.\d+$//o ;
 545+ $cnt_ip_ranges {$client_ip_range}++ ;
 546+ }
 547+
 548+ if ($status =~ /^TCP/)
 549+ {
 550+ $statusses {"$method:$status"}++ ;
 551+ $statusses {"$method:total"}++ ;
 552+ }
 553+ else
 554+ { $statusses_non_tcp ++ ; }
 555+
 556+ if ($url =~ /org\/skins/o)
 557+ {
 558+ ($url2 = $url) =~ s/^.*?\/skins/skins/o ;
 559+ $skins {$url2} ++ ;
 560+ }
 561+
 562+ if ($url =~ /^upload\.wikimedia\.org\//o) # count image size if applicable
 563+ { &ProcessUploadPath ($url) ; }
 564+
 565+ ($url2 = $url) =~ s/\.php\?.*$/\.php\?../go ;
 566+ ($domain,$location) = split ('\/',$url2,2) ;
 567+ $domain_original = $domain ;
 568+
 569+ # for diagnostics
 570+ if (($referer =~ /google/o) || ($agent =~ /google/io))
 571+ { $googles++ ; }
 572+
 573+ $referer =~ s/^http\w?\:\/\///o ;
 574+ $referer =~ s/\.php\?.*$/\.php\?../go ;
 575+ $referer =~ s/\/.*$//o ;
 576+ $referer_original = $referer ;
 577+
 578+ # $domain_mobile = $false ;
 579+ # if ($domain =~ /m\.wikipedia/o)
 580+ # {
 581+ # print "Domain 1 $domain\n" ;
 582+ # $domain_mobile = $true ;
 583+ # }
 584+
 585+ $domain = &Abbreviate ($domain) ;
 586+ if (($domain =~ /\./o) ||
 587+ ($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o))
 588+ {
 589+ $unrecognized_domains {$domain_original} ++ ;
 590+ $domain = 'other' ;
 591+ }
 592+
 593+ # if ($domain_mobile)
 594+ # { print "Domain 2 $domain\n" ; }
 595+
 596+ # $referer_mobile = $false ;
 597+ # if ($referer =~ /m\.wikipedia/o)
 598+ # {
 599+ # print "Referer 1 $referer\n" ;
 600+ # $referer_mobile = $true ;
 601+ # }
 602+
 603+ $referer = &Abbreviate ($referer) ;
 604+ $referer_external = ($referer !~ /^[\*\@]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o) ;
 605+
 606+ if ($referer_external)
 607+ {
 608+ $tot_referers_external++ ;
 609+
 610+ ($origin, $toplevel) = &DetectOrigin ($client_ip, $referer_original, $agent, $mime, $mimecat, $service, $ext) ;
 611+
 612+ &CountOrigin ("external", $origin, $toplevel, $mimecat) ;
 613+
 614+ if ($origin !~ /^\!/o)
 615+ { $origins_unsimplified {$referer_original} ++ ; }
 616+ else
 617+ {
 618+ $origin_simplified {"$origin [$referer] <- $referer_original"} ++ ;
 619+ $origins_external {$origin} ++ ;
 620+ }
 621+ }
 622+ else
 623+ {
 624+ $tot_referers_internal ++ ;
 625+ $referers_internal {$referer} ++ ;
 626+ $referer =~ s/!//go ; # ! was marker to signal pattern was recognized as wikimedia project
 627+ &CountOrigin ("internal", $referer, "org" , $mimecat) ;
 628+ }
 629+
 630+ $domain =~ s/!//o ;
 631+ $referer =~ s/!//o ;
 632+ $domain =~ s/\:\d+$//o ; # remove port number
 633+ $referer =~ s/\:\d+$//o ; # remove port number
 634+ if ($domain =~ /!/o)
 635+ { print ERR "still ! in domain: '$domain' <- '$domain_original'\n" ; }
 636+
 637+ $requests {"$domain|$referer|$ext|$mime|$parm"}++ ;
 638+
 639+ $clients_by_wiki {"$mobile,$version,$domain"}++ ;
 640+
 641+ if ($bot)
 642+ { $ind_bot = 'bot=Y' ; }
 643+ else
 644+ { $ind_bot = 'bot=N' ; }
 645+
 646+ if (($domain =~ /^\@/) || ($domain =~ /^\*/))
 647+ {
 648+ # print "Requests wap $domain | $ext | $mime | $parm | $country | $ind_bot\n" ;
 649+ $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
 650+ }
 651+
 652+ if ($domain =~ /^\%/)
 653+ {
 654+ # print "Requests m $domain | $ext | $mime | $parm | $country | $ind_bot\n" ;
 655+ $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
 656+ }
 657+ # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name)
 658+ if (($url =~ /index.php\?/o) && ($title !~ /:/o) && ($mime eq "text/html") && (($url =~ /action=edit/o) || ($url =~ /action=submit/o)))
 659+ {
 660+
 661+ if (($referer ne "-") && ($referer !~ /^..:/o))
 662+ { $referer = "ext" ; }
 663+
 664+ $key = "$client_ip|$ind_bot|$domain|$referer|$status|$mime|$parm" ;
 665+ $key =~ s/,/&comma;/go ;
 666+ $key =~ s/\|/,/go ;
 667+
 668+ $index_php_raw {$key}++ ;
 669+ $client_ip_record_cnt {$client_ip}++ ;
 670+ }
 671+
 672+ if ($mimecat eq "page")
 673+ {
 674+ $tot_mime_html2 ++ ;
 675+
 676+ if (($ind_bot =~ /N/) and ($ip_frequencies {$client_ip} > 2))
 677+ { $ind_bot = 'bot=Y' ; }
 678+
 679+ $countries_views {"$ind_bot,$domain,$country"} ++ ;
 680+
 681+ # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name)
 682+ if (($url =~ /index.php\?/o) && ($title !~ /:/) && ($mime eq "text/html") && ($url =~ /action=submit/o) && ($status =~ /302/o))
 683+ { $countries_saves {"$ind_bot,$domain,$country"} ++ ; }
 684+
 685+ $time_hh = substr ($time,11,2) ;
 686+ $time_mm = substr ($time,14,2) ;
 687+ $time_tt = $time_hh * 60 + $time_mm ;
 688+ $time_tt2 = $time_tt - $time_tt % 15 ;
 689+ $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} ++ ;
 690+ }
 691+}
 692+
 693+sub ExtractLanguage
 694+{
 695+ my $agent = shift ;
 696+ my $application = shift ;
 697+ my $language ;
 698+
 699+ $regexp_lang = "[a-z]{2}(?:-[a-zA-Z]{2,3})?(?:-[a-zA-Z]{2,3})?" ;
 700+ ($language = $agent) =~ s/^.*?; ($regexp_lang)[\);].*$/$1/o ;
 701+ if ($language eq $agent)
 702+ { $languages_unrecognized {$agent} ++ ; }
 703+ else
 704+ {
 705+ $languages {"$application,$language"} ++ ;
 706+ $agent =~ s/ $language//o ;
 707+ }
 708+ return ($agent) ;
 709+}
 710+
 711+sub GetFileExt
 712+{
 713+ my $url = shift ;
 714+ my ($file, $ext) ;
 715+ $url =~ s/\?.*$//o ;
 716+ ($file = $url) =~ s/^([^\/]*\/)+//o ; # drop path before file
 717+
 718+ if ($file =~ /^[^\.]*$/o) # no extension
 719+ { $ext = "none" ; }
 720+ else
 721+ {
 722+ ($ext = $file) =~ s/^.*?\.([^\.]+)$/$1/o ;
 723+ if ($ext =~ /[^a-zA-Z]/o)
 724+ { $ext = "invalid" ; }
 725+ }
 726+ $ext = lc ($ext) ;
 727+ $ext =~ s/^(jpg|jpeg)$/jp[e]g/go ;
 728+
 729+ return ($file, $ext) ;
 730+
 731+ # obsolete alternate code ?
 732+ # implied php request returns html
 733+ # if ($url =~ /\/wiki\//o) { $ext = "html <- /wiki/" ; }
 734+ # elsif ($url =~ /\.org\/?$/o) { $ext = "html <- *.org" ; }
 735+ # elsif ($url =~ /\.com\/?$/o) { $ext = "html <- *.com" ; }
 736+ # elsif ($url =~ /\/wiki\?title=/o) { $ext = "html <- /wiki?title=.." ; }
 737+ #
 738+ # if ($mime =~ /(?:xml|html)/o)
 739+ # { $ext = "none (mimetype:$mime)" ; }
 740+ # else
 741+ # {
 742+ # $url =~ s/\?.*$//o ;
 743+ # ($file = $url) =~ s/^([^\/]*\/)+//o ; # drop path before file
 744+ #
 745+ # if ($file =~ /^[^\.]*$/o) # no extension
 746+ # { $ext = "none (mimetype:$mime)" ;
 747+ # print "\n\n$mime\n$line\n" ;
 748+ # $ext = "none" ; }
 749+ # else
 750+ # {
 751+ # ($ext = $file) =~ s/^.*?\.([^\.]+)$/$1/o ;
 752+ # if ($ext =~ /[^a-zA-Z]/o)
 753+ # { $ext = "invalid" ; }
 754+ # }
 755+ # }
 756+ #
 757+ # $ext = lc ($ext) ;
 758+ # $ext =~ s/^(jpg|jpeg)$/jp[e]g/go ;
 759+ #
 760+ # return ($file, $ext) ;
 761+}
 762+
 763+sub NormalizeParms
 764+{
 765+ my $url = shift ;
 766+
 767+ $invalid = $false ;
 768+ my ($url2,$parm) = split ('\?', $url) ;
 769+ $parm =~ s/^\&+//o ;
 770+ $parm =~ s/\&+$//o ;
 771+ $parm =~ s/\&\&+/\&/o ;
 772+ $parm =~ s/\&quot;/'/go ; # invalid in url ?, accept for now
 773+ @parms = split ('\&', $parm) ;
 774+ @parms = sort @parms ;
 775+
 776+ foreach $parm (@parms)
 777+ {
 778+ next if $parm eq "" ;
 779+
 780+ if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
 781+ { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid = $true ; last }
 782+
 783+ ($keyword,$data) = split ('\=', $parm) ;
 784+ if ($keyword eq "")
 785+ { $keyword = "[empty]" ; }
 786+ if ($keyword ne "redlink")
 787+ {
 788+ if (($keyword !~ /^(?:action|ctype|gen|usemsgcache)$/) || ($data !~ /^[a-zA-Z\-\_\/]*$/o))
 789+ { $parm =~ s/=.+/=../o ; } # show generalized version of parameter, without specifics
 790+ }
 791+ }
 792+
 793+ if ($invalid)
 794+ {
 795+ print $error ;
 796+ print ERR $error ;
 797+ return ("?","?") ;
 798+ }
 799+
 800+ $parm = join ('&', @parms) ;
 801+ $url = "$url2\?$parm" ;
 802+ return ($url,$parm) ;
 803+}
 804+
 805+sub Abbreviate
 806+{
 807+ my $domain = shift ;
 808+
 809+ $domain =~ s/www\.([^\.]+\.[^\.]+\.[^\.]+)/$1/o ;
 810+ $domain =~ s/\.com/\.org/o ;
 811+ $domain =~ s/^([^\.]+\.org)/www.$1/o ;
 812+
 813+ if ($domain !~ /\.org/o)
 814+ { $domain =~ s/www\.(wik[^\.\/]+)\.([^\.\/]+)/$2.$1.org/o ; }
 815+
 816+ $legend = "# wx = wikispecial (commons|mediawiki|meta|foundation|species)\n" ;
 817+ $legend .= "# xx:upload = upload.wikimedia.org\n" ;
 818+ $domain =~ s/commons\.wikimedia\.org/!wx:commons/o ;
 819+ $domain =~ s/www\.mediawiki\.org/!wx:mediawiki/o ;
 820+ $domain =~ s/meta\.wikipedia\.org/!wx:meta/o ;
 821+ $domain =~ s/meta\.wikimedia\.org/!wx:meta/o ;
 822+ $domain =~ s/foundation\.wikimedia\.org/!wx:foundation/o ;
 823+ $domain =~ s/species\.wikimedia\.org/!wx:species/o ;
 824+ $domain =~ s/upload\.wikimedia\.org/!xx:upload/o ;
 825+
 826+ $legend .= "# wmf = wikimediafoundation\n" ;
 827+ $legend .= "# wb = wikibooks\n" ;
 828+ $legend .= "# wn = wikinews\n" ;
 829+ $legend .= "# wp = wikipedia\n" ;
 830+ $legend .= "# wq = wikiquote\n" ;
 831+ $legend .= "# ws = wikisource\n" ;
 832+ $legend .= "# wv = wikiversity\n" ;
 833+ $legend .= "# wk = wiktionary\n" ;
 834+ $legend .= "# wm = wikimedia\n" ;
 835+ $legend .= "# mw = mediawiki\n" ;
 836+ $legend .= "# \@ = .mobile.\n" ;
 837+ $legend .= "# \* = .wap.\n" ;
 838+ $legend .= "# \% = .m.\n" ;
 839+
 840+ $domain =~ s/wikimediafoundation/!wmf/o ;
 841+ $domain =~ s/wikibooks/!wb/o ;
 842+ $domain =~ s/wikinews/!wn/o ;
 843+ $domain =~ s/wikipedia/!wp/o ;
 844+ $domain =~ s/wikiquote/!wq/o ;
 845+ $domain =~ s/wikisource/!ws/o ;
 846+ $domain =~ s/wikiversity/!wv/o ;
 847+ $domain =~ s/wiktionary/!wk/o ;
 848+ $domain =~ s/wikimedia/!wm/o ;
 849+ $domain =~ s/mediawiki/!mw/o ;
 850+
 851+ $domain =~ s/\.mobile\./.@/o ;
 852+ $domain =~ s/\.wap\./.*/o ;
 853+ $domain =~ s/\.m\./.%/o ;
 854+
 855+ if ($domain =~ /^error:/o)
 856+ { $domain_errors {$domain}++ ; }
 857+ $domain =~ s/error:.*$/!error:1/o ;
 858+
 859+ $domain =~ s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/o ;
 860+
 861+ $domain =~ s/\s//g ;
 862+
 863+ return ($domain) ;
 864+}
 865+
 866+sub DetectOrigin
 867+{
 868+# this simplification is a rather loose approximation, not rigidly according to domain name standards, as that would require further study
 869+
 870+# three reasons to count search engine 'xxx':
 871+# 1 $referer contains 'xxx'
 872+# 2 $client_ip is known to belong to 'xxx'
 873+# 3 agent shows request (probably) came from 'xxx'
 874+
 875+ my $client_ip = shift ;
 876+ my $referer = shift ;
 877+ my $agent = shift ;
 878+ my $mime = shift ;
 879+ my $mimecat = shift ;
 880+ my $service = shift ;
 881+ my $ext = shift ;
 882+
 883+ $client_ip =~ s/\:\d+$//o ;
 884+ $referer =~ s/\:\d+$//o ;
 885+
 886+ my $referer_original = $referer ;
 887+ my $origin ;
 888+
 889+ if ($referer ne '-')
 890+ { $origin = $referer ; }
 891+ else
 892+ { $origin = $client_ip ; }
 893+
 894+ my $origin_original = $origin ;
 895+
 896+ if (is_valid_ip_address ($client_ip)) # always ?
 897+ { $client_ip = &MatchIpRange ($client_ip) ; }
 898+
 899+ if (is_valid_ip_address ($referer)) # never ?
 900+ {
 901+ $top_level_domain = "-" ;
 902+ $referer = &MatchIpRange ($referer) ;
 903+ }
 904+ else
 905+ {
 906+ $top_level_domain = &GetTopLevelDomain ($referer) ;
 907+ if ($top_level_domain eq "")
 908+ {
 909+ $secondary_domain = "invalid" ;
 910+ $referer = "invalid" ;
 911+ $origin = "invalid origin" ;
 912+ }
 913+ else
 914+ { $secondary_domain = &GetSecondaryDomain ($referer) ; }
 915+ if ($secondary_domain eq "google")
 916+ {
 917+ $referer =~ s/$pattern_url_post//o ;
 918+ $referer =~ s/^${pattern_url_pre}maps\.google$/!google:maps/o ;
 919+ $referer =~ s/^${pattern_url_pre}images\.google$/!google:image search/o ;
 920+ $referer =~ s/^${pattern_url_pre}translate\.google$/!google:translate/o ;
 921+ $referer =~ s/^${pattern_url_pre}mail\.google$/!google:mail/o ;
 922+ $referer =~ s/^${pattern_url_pre}toolbar\.google$/!google:toolbar/o ;
 923+ $referer =~ s/^${pattern_url_pre}gmodules$/!google:gmodules/o ;
 924+ $referer =~ s/^${pattern_url_pre}google$/!google:web search/o ;
 925+ $referer =~ s/^${pattern_url_pre}www\.google/!google:web search/o ;
 926+ if ($referer !~ /!/)
 927+ { print "google referer not recognized: '$referer_original'\n" ; }
 928+ }
 929+
 930+ # test code
 931+ # if ($secondary_domain !~ /(?:-|google|yahoo)/o)
 932+ # { print "$secondary_domain <= $referer\n" ; }
 933+ }
 934+
 935+ ($service,$agent) = &MatchAgent ($agent, $client_ip, $mime, $ext) ;
 936+
 937+ if (($top_level_domain eq "-") && ($client_ip =~ /!google:ip/io))
 938+ { $top_level_domain = "ip:$service" ; }
 939+
 940+ if (($client_ip =~ /!.*google/io) || ($referer =~ /!.*google/io) || ($agent =~ /!.*google/io))
 941+ {
 942+ if ($referer =~ /!.*google/io)
 943+ { $origin = "google (by referer)" } # $referer_original ; }
 944+ elsif ($client_ip =~ /!.*google/io)
 945+ { $origin = "google (by ip)" ; }
 946+ else
 947+ { $origin = "google (by agent)" ; }
 948+
 949+ if ($client_ip =~ /!.*google/io) { $google_x = "x" ; } else { $google_x = "-" ; }
 950+ if ($referer =~ /!.*google/io) { $google_y = "y" ; } else { $google_y = "-" ; }
 951+ if ($agent =~ /!.*google/io) { $google_z = "z" ; } else { $google_z = "-" ; }
 952+
 953+ $googlematch = "$google_x $google_y $google_z" ;
 954+
 955+ $referer2 = $referer ; if ($referer2 !~ /^!.*google:/io) { $referer2 = ".." ; } else { $referer2 =~ s/^!google://o ; }
 956+ $agent2 = $agent ; if ($agent2 !~ /^!.*google:/io) { $agent2 = ".." ; } else { $agent2 =~ s/^!google://o ; }
 957+
 958+ $top_level_domain =~ s/^.*\.//o ; # co.uk -> uk
 959+
 960+ if (($service eq "..") && ($referer =~ /!google:/o) && ($referer !~ /!google:ip/o))
 961+ { ($service = $referer) =~ s/^.*?:(.*$)/ucfirst($1)/eo ; }
 962+
 963+ if (($service eq "GoogleBot") && ($client_ip !~ /!.*google/io))
 964+ { $service = "GoogleBot?" ; }
 965+
 966+ $service =~ s/^\.\.$/Other/o ;
 967+
 968+ # only found in agent string -> except Google Earth and Google Desktop, ignore others (Toolbar , GoogleBot)
 969+ $accept = " " ;
 970+ if (($googlematch eq "- - z") && ($service =~ /GoogleBot/io))
 971+ {
 972+ $service = "GoogleBot?" ;
 973+ $google_imposters {$agent}++ ;
 974+ }
 975+
 976+ # obsolete? to be considered ?
 977+ # if (($googlematch ne "- - z") || ($service =~ /(?:Earth|Desktop)/o))
 978+ # { $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ; }
 979+ # else
 980+ # { $accept = "not" ; }
 981+
 982+ $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ;
 983+
 984+ $googlebins2 {"$accept [$googlematch] " . sprintf ("%-14s",$service) . $referer} ++ ;
 985+ $googlebins {$googlematch}++ ;
 986+ }
 987+
 988+ # test only: make yahoo's treatment of languages look like google's
 989+ # $origin =~ s/^([a-zA-Z0-9-]+)\.([a-zA-Z0-9-]+\.yahoo.com)/$2.$1/o ;
 990+
 991+
 992+ $origin =~ s/^localhost(\:.*)?$/!localhost/o ;
 993+ $origin =~ s/\:\d+$//o ; # remove port number
 994+
 995+ # $origin =~ s/${pattern_url_pre}mail\.live$/!microsoft live mail/o ;
 996+ # $origin =~ s/${pattern_url_pre}msn.$/!microsoft MSN/o ;
 997+ # $origin =~ s/${pattern_url_pre}msdn.$/!microsoft MSDN/o ;
 998+
 999+ # $origin =~ s/${pattern_url_pre}dailynews\.yahoo$/!yahoo news/o ;
 1000+ # $origin =~ s/${pattern_url_pre}mail\.yahoo$/!yahoo mail/o ;
 1001+ # $origin =~ s/${pattern_url_pre}search.yahoo$/!yahoo search/o ;
 1002+
 1003+ # if (($origin !~ /^ip:!/o) && ($origin !~ /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})/o))
 1004+ # {
 1005+ # $origin =~ s/${pattern_url_pre}([a-zA-Z0-9-]+)$/!$1/o ;
 1006+ # print "$origin\n" ;
 1007+ # }
 1008+
 1009+ if ($origin =~ /wiki/o)
 1010+ { $wikis {$origin} ++ ; }
 1011+
 1012+ if ($origin eq "wikipedia")
 1013+ {
 1014+ # print "incomplete origin: $origin <= $referer_original\n$line\n\n" ;
 1015+ $origin = "!error:4" ;
 1016+ }
 1017+
 1018+ return ($origin, $top_level_domain) ;
 1019+}
 1020+
 1021+sub MatchAgent
 1022+{
 1023+ my $agent = shift ;
 1024+ my $client_ip = shift ;
 1025+ my $mime = shift ;
 1026+ my $ext = shift ;
 1027+
 1028+ ($client_ip_range = $client_ip) =~ s/\.\d+\.\d+$//o ;
 1029+
 1030+ $service = '..' ;
 1031+ if ($agent =~ /google/io)
 1032+ {
 1033+ if ($agent =~ /Googlebot/io) { $service = "GoogleBot" ; $agent = "!GoogleBot" ; }
 1034+ elsif ($agent =~ /FeedFetcher-Google/io) { $service = "FeedFetcher" ; $agent = "!FeedFetcher-Google" ; }
 1035+ elsif ($agent =~ /Google.*?Wireless.*?Transcoder/io) { $service = "Wireless" ; $agent = "!GoogleWirelessTranscoder" ; }
 1036+ elsif ($agent =~ /Google.*?Desktop/io) { $service = "Desktop" ; $agent = "!GoogleDesktop" ; }
 1037+ elsif ($agent =~ /GoogleEarth/io) { $service = "Earth" ; $agent = "!GoogleEarth" ; }
 1038+ elsif ($agent =~ /GoogleToolbar/io) { $service = "Toolbar" ; $agent = "!GoogleToolbar" ; }
 1039+ elsif ($agent =~ /Google.*?Keyword.*?Tool/io) { $service = "KeywordTool" ; $agent = "!GoogleKeywordTool" ; }
 1040+ elsif ($agent =~ /GoogleT\d/io) { $service = "Toolbar" ; $agent =~ s/^.*?(GoogleT\d+).*$/"!".$1/e ; }
 1041+ elsif ($agent =~ /translate\.google\.com/io) { $service = "Translate" ; $agent = "!GoogleTranslate" ; }
 1042+ else { $service = "Other" ; $agent = "!GoogleOther" ; }
 1043+
 1044+ $googlebots {"$agent,$client_ip_range,$service,$mime,$ext"} ++ ;
 1045+ }
 1046+
 1047+# if ($agent =~ /yahoo/io)
 1048+# {
 1049+# if ($agent =~ /ysearch\/slurp/o)
 1050+# { $service = "bot" ; $agent = "!YahooBot" ; }
 1051+
 1052+# @yahoobots {"$agent,$client_ip_range,$mime,$ext"} ++ ;
 1053+# }
 1054+
 1055+ return ($service, $agent) ;
 1056+}
 1057+
 1058+sub MatchIpRange
 1059+{
 1060+ my $address = shift ;
 1061+
 1062+ $address =~ s/\:.*$//o ; # remove port number
 1063+
 1064+ # test code
 1065+ # $address_original = $address ;
 1066+
 1067+ $address =~ s/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/sprintf("%03d",$1).".".sprintf("%03d",$2).".".sprintf("%03d",$3).".".sprintf("%03d",$4)/eo ;
 1068+ $address_11 = substr ($address,0,11) ;
 1069+
 1070+ if (($address_11 ge "064.233.160") && ($address_11 le "064.233.191")) { $address = "!google:IP064" ; }
 1071+ elsif (($address_11 ge "066.249.064") && ($address_11 le "066.249.095")) { $address = "!google:IP066" ; }
 1072+ elsif (($address_11 ge "066.102.000") && ($address_11 le "066.102.015")) { $address = "!google:IP066" ; }
 1073+ elsif (($address_11 ge "072.014.192") && ($address_11 le "072.014.255")) { $address = "!google:IP072" ; }
 1074+ elsif (($address_11 ge "074.125.000") && ($address_11 le "074.125.255")) { $address = "!google:IP074" ; }
 1075+ elsif (($address_11 ge "209.085.128") && ($address_11 le "209.085.255")) { $address = "!google:IP209" ; }
 1076+ elsif (($address_11 ge "216.239.032") && ($address_11 le "216.239.063")) { $address = "!google:IP216" ; }
 1077+ elsif (($address ge "070.089.039.152") && ($address le "070.089.039.159")) { $address = "!google:IP070" ; }
 1078+ elsif (($address ge "070.090.219.072") && ($address le "070.090.219.079")) { $address = "!google:IP070" ; }
 1079+ elsif (($address ge "070.090.219.048") && ($address le "070.090.219.055")) { $address = "!google:IP070" ; }
 1080+
 1081+ elsif (($address_11 ge "067.195.000") && ($address_11 le "067.195.255")) { $address = "!yahoo:IP067" ; }
 1082+ elsif (($address_11 ge "072.030.000") && ($address_11 le "072.030.255")) { $address = "!yahoo:IP072" ; }
 1083+ elsif (($address_11 ge "074.006.000") && ($address_11 le "074.006.255")) { $address = "!yahoo:IP074" ; }
 1084+ elsif (($address_11 ge "209.191.064") && ($address_11 le "209.191.127")) { $address = "!yahoo:IP209" ; }
 1085+
 1086+ $address =~ s/IP\d+/ip/o ; # no need for detailed ranges for now
 1087+
 1088+ # test code
 1089+ # @fields = split ('\.', $address) ;
 1090+ # foreach $field (@fields)
 1091+ # { $field =~ s/^0+(\d)/$1/o ; }
 1092+ # $address2 = join ('.', @fields) ;
 1093+ # if ($address2 ne $address_original)
 1094+ # { print "MatchIpRange: '$address2' <- $address_original\n" ; }
 1095+
 1096+ return ($address) ;
 1097+}
 1098+
 1099+# see http://en.wikipedia.org/wiki/Domain_name
 1100+sub GetTopLevelDomain
 1101+{
 1102+ my $domain = shift ;
 1103+ $domain =~ s/\:\d+$//o ; # remove port number
 1104+
 1105+ if ($domain eq '-')
 1106+ { $top_level_domain = '-' ; }
 1107+ elsif ($domain =~ /!?localhost/o)
 1108+ { $top_level_domain = 'localhost' ; }
 1109+ elsif ($domain !~ /.+\..+/o)
 1110+ { $top_level_domain = '' ; }
 1111+ else
 1112+ {
 1113+ ($top_level_domain = $domain) =~ s/^.*?($pattern_url_post)/$1/o ;
 1114+ if ($domain eq $top_level_domain)
 1115+ { $top_level_domain = '-other-' ; }
 1116+ }
 1117+ return ($top_level_domain) ;
 1118+}
 1119+
 1120+sub GetSecondaryDomain
 1121+{
 1122+ my $domain = shift ;
 1123+ $domain =~ s/\:\d+$//o ; # remove port number
 1124+
 1125+ if ($domain !~ /\./)
 1126+ { return ($domain) ; }
 1127+
 1128+ $domain =~ s/$pattern_url_post//o ;
 1129+ $domain =~ s/^.*?\.([^\.]+)$/$1/o ;
 1130+ return ($domain) ;
 1131+}
 1132+
 1133+sub CountOrigin
 1134+{
 1135+ my $source = shift ;
 1136+ my $origin = shift ;
 1137+ my $toplevel = shift ;
 1138+ my $mimecat = shift ;
 1139+
 1140+ if ($source eq "external")
 1141+ {
 1142+ $tot_origins_external_counted ++ ;
 1143+ $origin =~ s/\:.*$//o ;
 1144+ if (is_valid_ip_address ($origin))
 1145+ { $origin = "unmatched ip address" ; $toplevel = "" ; }
 1146+ elsif ($origin =~ /^!error/o)
 1147+ { $origin = "invalid origin" ; $toplevel = "" ; }
 1148+ elsif ($origin =~ /^!localhost/o)
 1149+ { $origin = "localhost" ; $toplevel = "" ; }
 1150+ else
 1151+ {
 1152+ if (($origin =~ /!/o) && ($origin !~ /!error/o))
 1153+ { print "CountOrigin: $origin\n" ; }
 1154+ $origin = &GetSecondaryDomain ($origin) ;
 1155+ # print "$origin\n" ;
 1156+ }
 1157+ }
 1158+ $origins {"$source,$origin,$toplevel,$mimecat"} ++ ;
 1159+}
 1160+
 1161+sub ProcessUploadPath
 1162+{
 1163+ my $url = shift ;
 1164+ my ($file,$folder,$path,$size,$sizerange) ;
 1165+ ($path = $url) =~ s/^.*?\.org\///o ;
 1166+ ($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path
 1167+
 1168+ $binaries {$file} ++ ;
 1169+
 1170+ if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io)
 1171+ {
 1172+ ($folder = $path) =~ s/\/[^\/]*$/\//o ; # remove file
 1173+ $folder =~ s/\/[^\/]{1,1}\/[^\/]{2,2}\/.*$//o ; # remove /x/yy/ part and beyond
 1174+ $folder =~ s/\/[^\/]{1,1}\/[^\/]{2,2}\/.*$//o ; # remove /x/yy/ part and beyond, can occur twice (in thumbs)
 1175+ $folder =~ s/\/thumb//o ;
 1176+ $folder =~ s/^math\/.*$/math/o ;
 1177+ # print "$folder <- $upload\n" ;
 1178+ if ($file =~ /\d+px/o)
 1179+ {
 1180+ ($size = $file) =~ s/^.*?(\d+)px.*$/$1/o ;
 1181+ $sizerange = sprintf ("%5d",(int ($size / 20)) * 20) . "-" . sprintf ("%5d",(((int ($size / 20))+1) * 20 - 1)) ;
 1182+ $imagesizes {$sizerange} ++ ;
 1183+ }
 1184+ else
 1185+ { $imagesizes {"???"} ++ ; }
 1186+ }
 1187+}
 1188+
 1189+1;
Index: trunk/wikistats/squids/SquidReportArchive.pl
@@ -7,90 +7,10 @@
88
99 # $quarter_only = '2010 Q3' ; # if not empty filter process for this quarter only
1010
11 - # set defaults mainly for tests on local machine
12 -# default_argv "-m 201009 " ;
 11+# set defaults mainly for tests on local machine
 12+# default_argv "-m 201010 " ;
1313 default_argv "-c " ;
1414
15 -# $html = "<html><body bgcolor=black><table>" ;
16 -# for ($i = 4 ; $i >= 0 ; $i-=0.5)
17 -# {
18 -# ($requests,$ratio,$fill) = RatioAndFillColor1 ('',$i,4, $ratio_sqrt) ;
19 -# print sprintf ("%.1f",$i) . ": $fill\n" ;
20 -# $i2 = sprintf ("%0.1f", $i) ;
21 -# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15>&nbsp;</td><td width=50 style=\"background:$fill\">&nbsp;</td><td width=15>&nbsp;</td><td><font color=grey> $fill</font></td></tr>" ;
22 -# }
23 -# $html .= "<tr><td height=30 colspan=99>&nbsp;</td></tr>" ;
24 -# for ($i = 4 ; $i >= 0 ; $i-=0.5)
25 -# {
26 -# ($requests,$ratio,$fill) = RatioAndFillColor2 ('',$i,4, $ratio_sqrt) ;
27 -# print sprintf ("%.1f",$i) . ": $fill\n" ;
28 -# $i2 = sprintf ("%0.1f", $i) ;
29 -# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15>&nbsp;</td><td width=50 style=\"background:$fill\">&nbsp;</td><td width=15>&nbsp;</td><td><font color=grey> $fill</font></td></tr>" ;
30 -# }
31 -# $html .= "</table><body></html>" ;
32 -# open HTML, '>', 'color_range2.html' ;
33 -# print HTML $html ;
34 -# close HTML ;
35 -# exit ;
36 -
37 -#sub RatioAndFillColor1
38 -#{
39 -# my ($code, $requests,$requests_max) = @_ ;
40 -# my ($ratio,$green,$red,$blue,$fill) ;
41 -
42 -# if ($requests > $requests_max)
43 -# { $requests = $requests_max ; }
44 -
45 -# $ratio = sqrt ($requests / $requests_max) ;
46 -# if ($ratio >= 0.20)
47 -# {
48 -# $green = 180 ;
49 -# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ;
50 -# $blue = int ($green / 3) ;
51 -# }
52 -# else
53 -# {
54 -# $red = 220 ;
55 -# $green = int (0.5 + 220 * 5 * $ratio) ;
56 -# $blue = 0 ; #int ($green / 2) ;
57 -# }
58 -
59 -# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ;
60 -# $fill = lc hsv2rgb($ratio*150,0.67+$ratio*0.33,0.8-0.2*$ratio) ;
61 -
62 -# $fills {lc $code} = $fill ;
63 -# return ($requests,$ratio,$fill) ;
64 -#}
65 -
66 -#sub RatioAndFillColor2
67 -#{
68 -# my ($code, $requests,$requests_max) = @_ ;
69 -# my ($ratio,$green,$red,$blue,$fill) ;
70 -
71 -# if ($requests > $requests_max)
72 -# { $requests = $requests_max ; }
73 -
74 -# $ratio = $requests / $requests_max ;
75 -# if ($ratio >= 0.20)
76 -# {
77 -# $green = 180 ;
78 -# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ;
79 -# $blue = int ($green / 3) ;
80 -# }
81 -# else
82 -# {
83 -# $red = 220 ;
84 -# $green = int (0.5 + 220 * 5 * $ratio) ;
85 -# $blue = 0 ; #int ($green / 2) ;
86 -# }
87 -
88 -# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ;
89 -# $fill = lc hsv2rgb($ratio*150,1-$ratio*0.334,0.6) ;
90 -
91 -# $fills {lc $code} = $fill ;
92 -# return ($requests,$ratio,$fill) ;
93 -#}
94 -
9515 # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
9616 # ReportOrigin how to handle '!error <-> other
9717 # SquidReportOrigins.htm total count<->alpha are not the same (+ skip total for "google (total)")
@@ -104,8 +24,6 @@
10525
10626 # http://www.linux.com/community/blogs/Convert-a-.svg-file-to-a-.png-in-Ubuntu.html
10727
108 -# use CGI::Carp qw(fatalsToBrowser);
109 -# use Getopt::Std ;
11028 use Time::Local ;
11129 use Cwd;
11230
@@ -117,22 +35,24 @@
11836 if (-d "/a/squid")
11937 {
12038 print "\n\nJob runs on server $hostname\n\n" ;
121 - $path_root = "/a/ezachte" ;
 39+ $path_in = "/a/ezachte" ;
 40+ $path_out = "/a/ezachte" ;
12241 }
12342 elsif ($hostname eq 'bayes')
12443 {
12544 print "\n\nJob runs on server $hostname\n\n" ;
126 - $path_root = "/home/ezachte/wikistats/animation" ;
 45+ $path_in = "/home/ezachte/wikistats/animation" ;
 46+ $path_out = "/home/ezachte/wikistats/animation" ;
12747 }
12848 else
12949 {
13050 print "Job runs local for tests\n\n" ;
131 - $path_root = "W:/! Perl/Squids/Archive/test5" ;
 51+ $path_in = "W:/# Out Locke" ;
 52+ $path_out = "W:/# Out Test/Locke" ;
13253 }
133 - $path_in = $path_root ;
134 - $path_out = $path_root ;
13554
136 - print "Path root = $path_root\n" ;
 55+ print "Path in = $path_in\n" ;
 56+ print "Path out = $path_out\n" ;
13757
13858 # periodically harvest updated metrics from
13959 # 'http://en.wikipedia.org/wiki/List_of_countries_by_population'
@@ -149,20 +69,21 @@
15070
15171 &InitProjectNames ;
15272
 73+ $file_csv_country_codes = "CountryCodes.csv" ;
 74+ $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
 75+
 76+ &ReadInputCountriesNames ;
 77+
15378 if ($reportcountries)
15479 {
15580 $project_mode = "wp" ;
15681
157 - $file_csv_country_codes = "CountryCodes.csv" ;
158 - $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
159 -
160 - &ReadInputCountriesNames ;
16182 &ReadInputCountriesMeta ;
16283
16384 &CollectRegionCounts ;
16485
16586 &ReportCountries ('Saves');
166 - &ReportCountries ('Views');
 87+# &ReportCountries ('Views');
16788
16889 exit ;
16990 }
@@ -170,12 +91,10 @@
17192 $reportdaysback = $options {"d"} ;
17293 $reportmonth = $options {"m"} ;
17394
174 - if (($reportmonth !~ /^\d{6}$/) && ($reportdaysback !~ /^-\d+/))
175 - { print "Specify month as -m yyyymm or days back as -d -[days] (e.g. -d -1 for yesterday)" ; exit ; }
 95+ if (($reportmonth !~ /^\d\d\d\d-\d\d$/) && ($reportdaysback !~ /^-\d+$/))
 96+ { print "Specify month as -m yyyy-mm or days back as -d -[days] (e.g. -d -1 for yesterday)" ; exit ; }
17697
177 - if ($reportmonth =~ /^\d{6}$/)
178 - { $reportmonth = substr ($reportmonth,0,4) . "-" . substr ($reportmonth,4,2) ; }
179 - else
 98+ if ($reportdaysback =~ /^-\d+$/)
18099 {
181100 ($sec,$min,$hour,$day,$month,$year) = localtime (time+$reportdaysback*86400) ;
182101 $reportmonth = sprintf ("%04d-%02d",$year+1900,$month+1) ;
@@ -225,27 +144,16 @@
226145 $file_csv_clients = "public/SquidDataClients.csv" ;
227146 $file_csv_google_bots = "public/SquidDataGoogleBots.csv" ;
228147 $file_csv_indexphp = "public/SquidDataIndexPhp.csv" ;
229 - $file_csv_countries_languages_visited = "public/SquidDataCountriesViews.csv" ;
230 - $file_csv_countries_timed = "public/SquidDataCountriesViewsTimed.csv" ;
231148 $file_csv_browser_languages = "public/SquidDataLanguages.csv" ;
232149
 150+ $file_csv_countries_languages_visited = "SquidDataCountriesViews.csv" ;
 151+ $file_csv_countries_timed = "SquidDataCountriesViewsTimed.csv" ;
 152+
233153 print "\n\nJob SquidReportArchive.pl\n\n" ;
234154
235 -# if (! -d "/a/squid")
236 -# {
237 -# if (! -e $file_csv_requests) { $file_csv_requests =~ s/\./Test./ }
238 -# if (! -e $file_csv_methods) { $file_csv_methods =~ s/\./Test./ }
239 -# if (! -e $file_csv_skins) { $file_csv_skins =~ s/\./Test./ }
240 -# if (! -e $file_csv_scripts) { $file_csv_scripts =~ s/\./Test./ }
241 -# if (! -e $file_csv_opsys) { $file_csv_opsys =~ s/\./Test./ }
242 -# if (! -e $file_csv_origins) { $file_csv_origins =~ s/\./Test./ }
243 -# if (! -e $file_csv_google) { $file_csv_google =~ s/\./Test./ }
244 -# if (! -e $file_csv_crawlers) { $file_csv_crawlers =~ s/\./Test./ }
245 -# }
 155+ if (! -d "$path_in/$reportmonth")
 156+ { print "Directory not found: $path_in\/$reportmonth\n" ; exit ; }
246157
247 - if (! -d "$path_root/$reportmonth")
248 - { print "Directory not found: $path_root\/$reportmonth\n" ; exit ; }
249 -
250158 # for ($month = 4 ; $month <= 10 ; $month ++)
251159 # {
252160 # $reportmonth = "2009-" . sprintf ("%02d", $month) ;
@@ -255,7 +163,7 @@
256164 # last if ($month == 10) && ($day > 24) # temp code stay with DST summer time zone for SV
257165
258166 $date = $reportmonth . "-". sprintf ("%02d", $day) ;
259 - $dir = "$path_root/$reportmonth/$date" ;
 167+ $dir = "$path_in/$reportmonth/$date" ;
260168
261169 if (-d $dir)
262170 {
@@ -277,7 +185,8 @@
278186 if ($#dirs_process < 0)
279187 { print "No valid data to process.\n" ; exit ; }
280188
281 - $dir_reports = "$path_root/$reportmonth" ;
 189+ $path_reports = "$path_in/$reportmonth" ;
 190+ print "Write report to $path_reports\n" ;
282191
283192 $google_ip_ranges = "<b>IP ranges:</b> known ip ranges for Google are 64.233.[160.0-191.255], 66.249.[64.0-95.255], 66.102.[0.0-15.255], 72.14.[192.0-255.255], <br>74.125.[0.0-255.255], " .
284193 "209.085.[128.0-255.255], 216.239.[32.0-63.255] and a few minor other subranges</small><p>\n" ;
@@ -288,7 +197,7 @@
289198
290199 # &ReadDate ; date range was read from csv file
291200
292 - foreach $dir_process (@dirs_process)
 201+ foreach $path_process (@dirs_process)
293202 {
294203 $days_input_found ++ ;
295204
@@ -303,18 +212,21 @@
304213 &ReadInputSkins ;
305214 &ReadInputIndexPhp ;
306215 &ReadInputBrowserLanguages ;
307 -# &ReadInputCountriesTimed ;
 216+ &ReadInputCountriesTimed ;
308217 }
309218
310219 #&ReadCountryCodes ;
311220
312 - print "\nDays input = $days_input_found\n" ;
313 - $multiplier = 1 / $days_input_found ;
314 - print "\nMultiplier = " . sprintf ("%.4f", $multiplier) . "\n" ;
 221+ if ($days_input_found > 0)
 222+ {
 223+ print "\nDays input = $days_input_found\n" ;
 224+ $multiplier = 1 / $days_input_found ;
 225+ print "\nMultiplier = " . sprintf ("%.4f", $multiplier) . "\n" ;
 226+ }
 227+ else { print "\nDays input = 0 (zero!)\n" ; }
315228
316 -#&WriteCsvCountriesTimed ;
317 -#&WriteCsvCountriesGoTo ;
318 -#exit ;
 229+ &WriteCsvCountriesTimed ;
 230+ &WriteCsvCountriesGoTo ;
319231
320232 foreach $key (keys_sorted_alpha_desc %edit_submit)
321233 { print "YYY " . sprintf ("%5d", $edit_submit {$key}) . ": $key\n" ; }
@@ -324,7 +236,6 @@
325237
326238 print "\n\n" ;
327239
328 -
329240 foreach $domain (keys_sorted_by_value_num_desc %edit_submit_bot_sort)
330241 {
331242 $cnt = $edit_submit_bot_sort {$domain} ;
@@ -385,18 +296,18 @@
386297 # &WriteCsvCountriesTimed ;
387298 # &WriteCsvCountriesTargets ;
388299 close "FILE_LOG" ;
389 - print "\nReady\n\n" ;
390300
391301 if (-d "/a/squid")
392302 {
393 -# $cmd = "tar -cf $dir_reports/$date_last\-csv.tar $dir_reports_in/*.csv | bzip2 $dir_reports/$date_last\-csv.tar" ;
 303+# $cmd = "tar -cf $path_reports/$date_last\-csv.tar $path_reports_in/*.csv | bzip2 $path_reports/$date_last\-csv.tar" ;
394304 # print "cmd = '$cmd'\n" ;
395305 # `$cmd` ;
396 - $cmd = "tar -cf $dir_reports/$reportmonth\-html.tar $dir_reports/*.htm | bzip2 $dir_reports/$reportmonth\-html.tar" ;
 306+ $cmd = "tar -cf $path_reports/$reportmonth\-html.tar $path_reports/*.htm | bzip2 $path_reports/$reportmonth\-html.tar" ;
397307 print "cmd = '$cmd'\n" ;
398308 `$cmd` ;
399309 }
400310
 311+ print "\nReady\n\n" ;
401312 exit ;
402313
403314 sub ReportCountries
@@ -408,12 +319,14 @@
409320 $selection = 'PageViews' ;
410321 $selection2 = 'Visits' ;
411322 $views_edits = 'Page Views' ;
 323+ $offset_links = 0 ;
412324 }
413325 else
414326 {
415327 $selection = 'PageEdits' ;
416328 $selection2 = 'Saves' ;
417329 $views_edits = 'Page Edits' ;
 330+ $offset_links = 4 ;
418331 }
419332
420333 ($quarter_only2 = $quarter_only) =~ s/ // ;
@@ -450,31 +363,36 @@
451364
452365 $title_main = "Wikimedia Traffic Analysis Report" ;
453366
454 - $links = "<p>&nbsp;Also: <b>$views_edits Per Country</b> - " .
 367+ $links = "<p>&nbsp;<b>Page Views Per Country</b> - " .
455368 "<a href='$file_html_per_country_overview'>Overview</a> / " .
456369 "<a href='$file_html_per_country_breakdown'>Breakdown</a> / " .
457370 "<a href='$file_html_per_country_trends'>Trends</a>,&nbsp;&nbsp;&nbsp;&nbsp;" .
458 - "<b>$views_edits Per Wikipedia Language - </b> " .
 371+ "<b>Page Views Per Wikipedia Language - </b> " .
459372 "<a href='$file_html_per_language_breakdown'>Breakdown</a>" ;
460373
461 - $title = "$title_main - Wikipedia $views_edits Per Country - Overview" ;
462 - &WriteReportPerCountryOverview ($title, $views_edits, &UnLink ($links,1)) ; ;
 374+ ($links_views = $links) =~ s/Edits/Views/g ;
 375+ ($links_edits = $links) =~ s/Views/Edits/g ;
463376
464 - $title = "$title_main - Wikipedia $views_edits Per Country - Breakdown" ;
465 - &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 100, $cutoff_percentage = 1, $show_logcount = $false) ;
466 - &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 10, $cutoff_percentage = 0.1, $show_logcount = $true) ;
 377+ $links = "$links_views\n$links_edits\n" ;
467378
468 - $title = "$title_main - Wikipedia $views_edits Per Country - Trends" ;
469 - &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,3)) ;
 379+ $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Overview" ;
 380+ &WriteReportPerCountryOverview ($title, $views_edits, &UnLink ($links,$offset_links+1)) ;
470381
 382+ $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Breakdown" ;
 383+ &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,$offset_links+2),$cutoff_requests = 100, $cutoff_percentage = 1, $show_logcount = $false) ;
 384+ &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,$offset_links+2),$cutoff_requests = 10, $cutoff_percentage = 0.1, $show_logcount = $true) ;
 385+
 386+ $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Trends" ;
 387+ &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,$offset_links+3)) ;
 388+
471389 $links =~ s/,.*$// ;
472 - $title = "$title_main - $views_edits Per Wikipedia Language - Breakdown" ;
473 - &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,4)) ;
 390+ $title = "$title_main - <font color=#008000>$views_edits Per Wikipedia Language</font> - Breakdown" ;
 391+ &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,$offset_links+4)) ;
474392 }
475393
476394 sub ReadDate
477395 {
478 - open CSV_CRAWLERS, '<', "$dir_process/$file_csv_crawlers" ;
 396+ open CSV_CRAWLERS, '<', "$path_process/$file_csv_crawlers" ;
479397 $line = <CSV_CRAWLERS> ;
480398 close CSV_CRAWLERS ;
481399 # print "DATE LINE $line\n" ;
@@ -541,11 +459,12 @@
542460 "body {font-family:arial,sans-serif; font-size:12px }\n" .
543461 "h2 {margin:0px 0px 3px 0px; font-size:18px}\n" .
544462 "table {font-size:12px ;}\n" .
545 - "td {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top}\n" .
 463+ "td {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:middle}\n" .
546464 "th {white-space:nowrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top ; font-width:bold}\n" .
547465 "th.small {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:11px ; vertical-align:top ; font-width:bold}\n" .
548 - "td.hl {text-align:left;}\n" .
549 - "td.hr {text-align:right;}\n" .
 466+ "td.hl {text-align:left;vertical-align:top;}\n" .
 467+ "td.hr {text-align:right;vertical-align:top;}\n" .
 468+ "td.hc {text-align:center;vertical-align:top;}\n" .
550469 "td.r {text-align:right; border: inset 1px #FFFFFF}\n" .
551470 "td.c {text-align:center; border: inset 1px #FFFFFF}\n" .
552471 "td.l {text-align:left; border: inset 1px #FFFFFF}\n" .
@@ -614,7 +533,7 @@
615534 {
616535 chomp ($line) ;
617536 ($code,$region,$north_south,$name) = split (',',$line,4) ;
618 - $country_codes {$code} = $name ;
 537+ $country_codes {$code} = unicode_to_html ($name) ;
619538 # print "$code => $name\n" ;
620539 }
621540 }
@@ -623,7 +542,7 @@
624543
625544 sub ReadInputClients
626545 {
627 - my $file_csv = "$dir_process/$file_csv_clients" ;
 546+ my $file_csv = "$path_process/$file_csv_clients" ;
628547 if (! -e $file_csv)
629548 { abort ("Function ReadInputClients: file $file_csv not found!!!") ; }
630549 open CSV_CLIENTS, '<', $file_csv ;
@@ -712,7 +631,7 @@
713632
714633 sub ReadInputCrawlers
715634 {
716 - my $file_csv = "$dir_process/$file_csv_crawlers" ;
 635+ my $file_csv = "$path_process/$file_csv_crawlers" ;
717636 if (! -e $file_csv)
718637 { abort ("Function ReadInputCrawlers: file $file_csv not found!!!\n") ; }
719638 open CSV_CRAWLERS, '<', $file_csv ;
@@ -767,7 +686,7 @@
768687
769688 sub ReadInputMethods
770689 {
771 - my $file_csv = "$dir_process/$file_csv_methods" ;
 690+ my $file_csv = "$path_process/$file_csv_methods" ;
772691 if (! -e $file_csv)
773692 { abort ("Function ReadInputMethods: file $file_csv not found!!!") ; }
774693 open CSV_METHODS, '<', $file_csv ;
@@ -785,7 +704,7 @@
786705
787706 sub ReadInputMimeTypes
788707 {
789 - my $file_csv = "$dir_process/$file_csv_requests" ;
 708+ my $file_csv = "$path_process/$file_csv_requests" ;
790709 if (! -e $file_csv)
791710 { abort ("Function ReadInputMimeTypes: file $file_csv not found!!!") ; }
792711 open CSV_REQUESTS, '<', $file_csv ;
@@ -872,7 +791,7 @@
873792
874793 sub ReadInputOpSys
875794 {
876 - my $file_csv = "$dir_process/$file_csv_opsys" ;
 795+ my $file_csv = "$path_process/$file_csv_opsys" ;
877796 if (! -e $file_csv)
878797 { abort ("Function ReadInputOpSys: file $file_csv not found!!!") ; }
879798 open CSV_OPSYS, '<', $file_csv ;
@@ -918,7 +837,7 @@
919838
920839 sub ReadInputOrigins
921840 {
922 - my $file_csv = "$dir_process/$file_csv_origins" ;
 841+ my $file_csv = "$path_process/$file_csv_origins" ;
923842 if (! -e $file_csv)
924843 { abort ("Function ReadInputOrigins: file $file_csv not found!!!") ; }
925844 open CSV_ORIGINS, '<', $file_csv ;
@@ -1001,7 +920,7 @@
1002921
1003922 sub ReadInputScripts
1004923 {
1005 - my $file_csv = "$dir_process/$file_csv_scripts" ;
 924+ my $file_csv = "$path_process/$file_csv_scripts" ;
1006925 if (! -e $file_csv)
1007926 { abort ("Function ReadInputScripts: file $file_csv not found!!!") ; }
1008927 open CSV_SCRIPTS, '<', $file_csv ;
@@ -1035,7 +954,7 @@
1036955 # foreach $key (keys_sorted_by_value_num_desc %actions)
1037956 # { print "$key: " . $actions {$key} . "\n" ; }
1038957
1039 - open CSV_SCRIPTS, '<', "$dir_process/$file_csv_scripts" ;
 958+ open CSV_SCRIPTS, '<', "$path_process/$file_csv_scripts" ;
1040959 read_script:
1041960 while ($line = <CSV_SCRIPTS>)
1042961 {
@@ -1101,7 +1020,7 @@
11021021
11031022 sub ReadInputGoogle
11041023 {
1105 - my $file_csv = "$dir_process/$file_csv_google" ;
 1024+ my $file_csv = "$path_process/$file_csv_google" ;
11061025 if (! -e $file_csv)
11071026 { abort ("Function ReadInputGoogle: file $file_csv not found!!!") ; }
11081027 open CSV_SEARCH, '<', $file_csv ;
@@ -1158,7 +1077,7 @@
11591078
11601079 sub ReadInputSkins
11611080 {
1162 - my $file_csv = "$dir_process/$file_csv_skins" ;
 1081+ my $file_csv = "$path_process/$file_csv_skins" ;
11631082 if (! -e $file_csv)
11641083 { abort ("Function ReadInputSkins: file $file_csv not found!!!") ; }
11651084 open CSV_SKINS, '<', $file_csv ;
@@ -1179,7 +1098,7 @@
11801099
11811100 sub ReadInputIndexPhp
11821101 {
1183 - my $file_csv = "$dir_process/$file_csv_indexphp" ;
 1102+ my $file_csv = "$path_process/$file_csv_indexphp" ;
11841103 if (! -e $file_csv)
11851104 { abort ("Function ReadInputIndexPhp: file $file_csv not found!!!") ; }
11861105 open CSV_INDEXPHP, '<', $file_csv ;
@@ -1305,7 +1224,7 @@
13061225
13071226 sub ReadInputCountriesTimed
13081227 {
1309 - my $file_csv = "$dir_process/$file_csv_countries_timed" ;
 1228+ my $file_csv = "$path_process/public/$file_csv_countries_timed" ;
13101229 if (! -e $file_csv)
13111230 { abort ("Function ReadInputSkins: file $file_csv not found!!! ") ; }
13121231 open CSV_COUNTRIES, '<', $file_csv ;
@@ -1346,6 +1265,8 @@
13471266
13481267 next if $line =~ /^#/ ;
13491268
 1269+ $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
 1270+
13501271 ($country_code,$region_code,$north_south_code,$country_name) = split (',', $line,4) ;
13511272 $region_codes {$country_code} = $region_code ;
13521273 $north_south_codes {$country_code} = $north_south_code ;
@@ -1363,6 +1284,7 @@
13641285 # if ($country_meta_info_not_found_reported {$country} ++ == 0)
13651286 # { print "Meta info not found for country '$country'\n" ; }
13661287 # }
 1288+ $country_name =~ s/^C..?te d/C&ocirc;te d/ ;
13671289
13681290 $country_names {$country_code} = $country_name ;
13691291 $country_codes_all {"$country_name|$country_code"} ++ ;
@@ -1377,13 +1299,14 @@
13781300 while ($line = <COUNTRY_META_INFO>)
13791301 {
13801302 chomp $line ;
 1303+
 1304+ $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
 1305+
13811306 ($country,$link,$population,$connected,$icon) = split ',', $line ;
1382 -print "$line\n" ; # qqq
13831307 $country =~ s/&comma;/,/g ;
13841308
13851309 # use country names as given by MaxMind
13861310 $country =~ s/Brunei/Brunei Darussalam/ ;
1387 - $country =~ s/C..?te d'Ivoire/Cote d'Ivoire/ ;
13881311 $country =~ s/Congo, The Democratic Republic of the/Republic of the Congo/ ;
13891312 $country =~ s/Dem. Rep. of Congo/Congo - The Democratic Republic of the/ ;
13901313 $country =~ s/East timor/Timor-Leste/ ;
@@ -1404,10 +1327,8 @@
14051328 $country =~ s/U.S. Virgin Islands/Virgin Islands, British/ ;
14061329 $country =~ s/Vatican City/Holy See (Vatican City State)/ ;
14071330 $country =~ s/^Korea$/South Korea/ ;
1408 -
14091331 $connected =~ s/connected/../g ;
14101332 $country_meta_info {$country} = "$link,$population,$connected,$icon" ;
1411 -print "meta info found for '$country'\n" ; # qqq
14121333
14131334 if ($country eq "United States")
14141335 { ($connected_us = $connected) =~ s/_//g ; }
@@ -1451,7 +1372,7 @@
14521373 undef %yyyymm_ ;
14531374 undef %quarters ;
14541375 undef %requests_unknown_per_quarter ;
1455 - undef %country_codes ;
 1376+# undef %country_codes ;
14561377 undef %requests_all ;
14571378 undef %requests_all_per_period ;
14581379 undef %requests_per_quarter ;
@@ -1786,7 +1707,7 @@
17871708
17881709 sub ReadInputBrowserLanguages
17891710 {
1790 - my $file_csv = "$dir_process/$file_csv_browser_languages" ;
 1711+ my $file_csv = "$path_process/$file_csv_browser_languages" ;
17911712 if (! -e $file_csv)
17921713 { abort ("Function ReadInputBrowserLanguages: file $file_csv not found!!! ") ; }
17931714 open CSV_BROWSER_LANGUAGES, '<', $file_csv ;
@@ -2025,7 +1946,7 @@
20261947
20271948 sub WriteReportClients
20281949 {
2029 - open FILE_HTML_CLIENTS, '>', "$dir_reports/$file_html_clients" ;
 1950+ open FILE_HTML_CLIENTS, '>', "$path_reports/$file_html_clients" ;
20301951
20311952 $html = $header ;
20321953 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Browsers e.a./ ;
@@ -2316,7 +2237,7 @@
23172238
23182239 sub WriteReportCrawlers
23192240 {
2320 - open FILE_HTML_CRAWLERS, '>', "$dir_reports/$file_html_crawlers" ;
 2241+ open FILE_HTML_CRAWLERS, '>', "$path_reports/$file_html_crawlers" ;
23212242
23222243 $html = $header ;
23232244 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Crawler requests/ ;
@@ -2490,7 +2411,7 @@
24912412
24922413 sub WriteReportMethods
24932414 {
2494 - open FILE_HTML_METHODS, '>', "$dir_reports/$file_html_methods" ;
 2415+ open FILE_HTML_METHODS, '>', "$path_reports/$file_html_methods" ;
24952416
24962417 $html = $header ;
24972418 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Request Methods/ ;
@@ -2570,7 +2491,7 @@
25712492
25722493 sub WriteReportMimeTypes
25732494 {
2574 - open FILE_HTML_REQUESTS, '>', "$dir_reports/$file_html_requests" ;
 2495+ open FILE_HTML_REQUESTS, '>', "$path_reports/$file_html_requests" ;
25752496
25762497 $html = $header ;
25772498 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by destination/ ;
@@ -2740,7 +2661,7 @@
27412662
27422663 sub WriteReportOpSys
27432664 {
2744 - open FILE_HTML_OPSYS, '>', "$dir_reports/$file_html_opsys" ;
 2665+ open FILE_HTML_OPSYS, '>', "$path_reports/$file_html_opsys" ;
27452666
27462667 $html = $header ;
27472668 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Operating Systems/ ;
@@ -2955,7 +2876,7 @@
29562877 # http://en.wikipedia.org/wiki/Domain_name
29572878 sub WriteReportOrigins
29582879 {
2959 - open FILE_HTML_ORIGINS, '>', "$dir_reports/$file_html_origins" ;
 2880+ open FILE_HTML_ORIGINS, '>', "$path_reports/$file_html_origins" ;
29602881
29612882 $html = $header ;
29622883 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by origin/ ;
@@ -3560,7 +3481,7 @@
35613482
35623483 sub WriteReportScripts
35633484 {
3564 - open FILE_HTML_SCRIPTS, '>', "$dir_reports/$file_html_scripts" ;
 3485+ open FILE_HTML_SCRIPTS, '>', "$path_reports/$file_html_scripts" ;
35653486
35663487 $html = $header ;
35673488 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Scripts/ ;
@@ -3737,7 +3658,7 @@
37383659
37393660 sub WriteReportGoogle
37403661 {
3741 - open FILE_HTML_SEARCH, '>', "$dir_reports/$file_html_google" ;
 3662+ open FILE_HTML_SEARCH, '>', "$path_reports/$file_html_google" ;
37423663
37433664 $html = $header ;
37443665 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Google requests/ ;
@@ -4058,7 +3979,7 @@
40593980
40603981 sub WriteReportSkins
40613982 {
4062 - open FILE_HTML_SKINS, '>', "$dir_reports/$file_html_skins" ;
 3983+ open FILE_HTML_SKINS, '>', "$path_reports/$file_html_skins" ;
40633984
40643985 $html = $header ;
40653986 $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Skins/ ;
@@ -4112,11 +4033,11 @@
41134034
41144035 sub WriteCsvGoogleBots
41154036 {
4116 - open CSV_GOOGLE_BOTS_OUT, '>', "$dir_reports/$file_csv_google_bots" ;
 4037+ open CSV_GOOGLE_BOTS_OUT, '>', "$path_reports/$file_csv_google_bots" ;
41174038 print CSV_GOOGLE_BOTS_OUT "Date Time,Ip Range,Hits\n" ;
4118 - foreach $dir_process (@dirs_process)
 4039+ foreach $path_process (@dirs_process)
41194040 {
4120 - open CSV_GOOGLE_BOTS_IN, '<', "$dir_process/$file_csv_google_bots" ;
 4041+ open CSV_GOOGLE_BOTS_IN, '<', "$path_process/$file_csv_google_bots" ;
41214042 while ($line = <CSV_GOOGLE_BOTS_IN>)
41224043 {
41234044 next if $line =~ /^#/ ; # comments
@@ -4140,7 +4061,7 @@
41414062
41424063 sub WriteCsvBrowserLanguages
41434064 {
4144 - open CSV_BROWSER_LANGUAGES, '>', "$dir_reports/$file_csv_browser_languages" ;
 4065+ open CSV_BROWSER_LANGUAGES, '>', "$path_reports/$file_csv_browser_languages" ;
41454066 print CSV_BROWSER_LANGUAGES "Browser,Languages,Hits\n" ;
41464067 foreach $key (keys_sorted_alpha_asc %browser_languages)
41474068 { print CSV_BROWSER_LANGUAGES "$key,${browser_languages {$key}}\n" ; }
@@ -4150,8 +4071,8 @@
41514072 sub WriteCsvCountriesTimed
41524073 {
41534074 $multiplier_1000 = 1000 * $multiplier ;
4154 -# open CSV_COUNTRIES_TIMED, '>', "$dir_reports/$file_csv_countries_timed" ;
4155 - open CSV_COUNTRIES_TIMED, '>', "/home/ezachte/$file_csv_countries_timed" ;
 4075+ print "WriteCsvCountriesTimed: $path_out/$file_csv_countries_timed\n" ;
 4076+ open CSV_COUNTRIES_TIMED, '>', "$path_out/$file_csv_countries_timed" ;
41564077
41574078 foreach $target (sort keys %targets)
41584079 {
@@ -4173,7 +4094,7 @@
41744095 $cnt_countries = 0 ;
41754096 foreach $country (@countries)
41764097 {
4177 - $country_name = $country_codes {$country} ;
 4098+ $country_name = $country_names {$country} ;
41784099 $line .= "$country_name," ;
41794100
41804101 last if $cnt_countries++ >= 25 ;
@@ -4203,8 +4124,8 @@
42044125 # http://www.maxmind.com/app/iso3166 country codes
42054126 sub WriteCsvCountriesGoTo
42064127 {
4207 -# open CSV_COUNTRIES_TIMED, '>', "$dir_reports/$file_csv_countries_timed" ;
4208 - open CSV_COUNTRIES_LANGUAGES_VISITED, '>', "/home/ezachte/$file_csv_countries_languages_visited" ;
 4128+ print "WriteCsvCountriesGoTo: $path_out/$file_csv_countries_languages_visited\n" ;
 4129+ open CSV_COUNTRIES_LANGUAGES_VISITED, '>', "$path_out/$file_csv_countries_languages_visited" ;
42094130
42104131 foreach $country (sort keys %countries)
42114132 {
@@ -4227,9 +4148,9 @@
42284149
42294150 foreach $bot ("N","Y")
42304151 {
4231 - $country_name = $country_codes {$country} ;
 4152+ $country_name = $country_names {$country} ;
42324153 $country_name =~ s/\n//gs ;
4233 - $country_name =~ s/[0x00-0x1F]//gs ;
 4154+ $country_name =~ s/[\x00-\x1F]//gs ;
42344155
42354156 $cnt_targets = 0 ;
42364157 $tot_targets = 0 ;
@@ -4283,6 +4204,8 @@
42844205 $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ;
42854206 $html =~ s/DATE// ;
42864207
 4208+ $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
 4209+
42874210 $html .= "<p><table border=1 width=800>INDEX\n" ;
42884211
42894212 my $languages_reported ;
@@ -4380,7 +4303,7 @@
43814304 my (@index_countries,@csv_countries) ;
43824305 my $views_edits_lc = lc $views_edits ;
43834306 my $views_edits_lcf = ucfirst $views_edits_lc ;
4384 - ($views_edits2 = $views_edits) =~ s/ /\<br\>/ ;
 4307+# ($views_edits2 = $views_edits) =~ s/ /\<br\>/ ;
43854308 if ($views_edits =~ /edit/i)
43864309 { $MPVE = 'MPE' ; } # monthly page edits
43874310 else
@@ -4397,17 +4320,22 @@
43984321
43994322 $html .= &HtmlSortTable ;
44004323
 4324+ $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
 4325+
44014326 $html .= "<p><table border=1 width=800 class=tablesorter id=table1>\n" ;
44024327 $html .= "<thead>\n" ;
44034328 $html .= "INDEX\n" ;
44044329
44054330 $html .= &HtmlWorldMaps ;
44064331
4407 - $html .= "<tr><td class=rh5 colspan=3 rowspan=1><b>Country</b></td><td class=c rowspan=2><b>Monthly<br>$views_edits2</b></td>" .
4408 - "<td class=r rowspan=2><b>Population</b></td>" . # <td class=c rowspan=2><b>$MPVE's<br>Per<br>Person</b></td>" .
4409 - "<td class=c colspan=2><b>Internet<br>Users</b></td><td class=c><b>${MPVE}'s<br>Per<br>I U</b></td>" .
4410 - "<td colspan=99 class=l rowspan=2><b>Share in Global Monthly $views_edits</b><br><small><font color=#808080>red and blue bars have different scale</font></small></td></tr>\n" ;
4411 - $html .= "<tr><td class=c><b>Name</b></td><td class=c><b>Region</b><br><img src='http://stats.wikimedia.org/Location_of_Continents2.gif'></td><td class=c><b>N/S</b></td><td class=c><b>Total</b></td><td class=c><b>/Pop.</b></td></tr>\n" ;
 4332+ $html .= "<tr><td class=hr colspan=3 rowspan=1><b>Location</b></td>" .
 4333+ "<td class=hc colspan=2 rowspan=2><b>Population</b><br><small><font color=#404040>absolute count and percentage of world population</font></small></td>" . # <td class=hc rowspan=2><b>$MPVE's<br>Per<br>Person</b></td>" .
 4334+ "<td class=hc colspan=2 rowspan=2><b>Internet<br>Users</b><br><small><font color=#404040>absolute count and percentage of country population</font></small></td>" .
 4335+ "<td class=hl colspan=4 rowspan=1><b>Monthly $views_edits</b></td></tr>\n" ;
 4336+# $html .= "<tr>" .
 4337+# # "<td class=hc><b>${MPVE}'s<br>Per<br>I U</b></td>" .
 4338+# "<td colspan=99 class=hc><b>Share in Global Monthly $views_edits</b><br><small><font color=#808080>red and blue bars have different scale</font></small></td></tr>\n" ;
 4339+ $html .= "<tr><td class=hr><b>Country</b></td><td class=hc><b>Region</b><br><img src='http://stats.wikimedia.org/Location_of_Continents2.gif'></td><td class=hc><b>N/S</b></td><td class=hc colspan=2><small><font color=#404040>absolute count and edits per internet user</font></small></td><td class=hl colspan=2><small>share of global total<font color=#808080><p>note:blue and red bars have different scale</font></small></td></tr>\n" ;
44124340 $html .= "<tr><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th>&nbsp;</th><th colspan=2>&nbsp;</th></tr>\n" ;
44134341 $html .= "</thead><tbody>\nTOTAL\nREGIONS\n" ;
44144342
@@ -4440,7 +4368,6 @@
44414369 $north_south_name =~ s/^N$/<font color=#000BF7><b>N<\/b><\/font>/ ;
44424370 $north_south_name =~ s/^S$/<font color=#FE0B0D><b>S<\/b><\/font>/ ;
44434371
4444 -print "\n" ; # qqq
44454372 ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ;
44464373
44474374 my $requests_this_country = $requests_recently_per_country {$country} ;
@@ -4495,7 +4422,7 @@
44964423 $country2 =~ s/Syrian Arab Republic/Syria/ ;
44974424 $country2 =~ s/Tanzania, United Republic of/Tanzania/ ;
44984425 $country2 =~ s/Libyan Arab Jamahiriya/Libya/ ;
4499 - $country2 =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ;
 4426+ $country2 =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/ ;
45004427 $country2 =~ s/Serbia/republic of serbia/ ;
45014428 $country2 =~ s/Lao People's Democratic Republic/Laos/ ;
45024429
@@ -4505,13 +4432,18 @@
45064433 $population2 = &i2KM2 ($population) ;
45074434 $connected2 = &i2KM2 ($connected) ;
45084435 $requests_this_country2 = &i2KM2 ($requests_this_country2) ;
 4436+ $perc_population = &Percentage ($population / $population_tot) ;
 4437+ if ($perc_population =~ /\.0\d/)
 4438+ { $perc_population = "<small>$perc_population</small>" ; }
 4439+
45094440 $html .= "<tr><th class=rh3><a id='$country' name='$country'></a>$link_country $icon</td>" .
45104441 "<td>$region_name</td>" .
45114442 "<td>$north_south_name</td>" .
4512 - "<td>$requests_this_country2</td>" .
45134443 "<td>$population2</td>" . # <td>$requests_per_person</td>" .
 4444+ "<td>$perc_population</td>" . # <td>$requests_per_person</td>" .
45144445 "<td>$connected2</td>" .
45154446 "<td>$perc_connected</td>" .
 4447+ "<td>$requests_this_country2</td>" .
45164448 "<td>$requests_per_connected_person</td>" .
45174449 "<td>$perc_share_total</td>" .
45184450 "<td class=l>$bar</td></tr>\n" ;
@@ -4549,10 +4481,11 @@
45504482 $html_total = "<tr><th class=rh3>All countries in</td>" .
45514483 "<td><b>World</b></td>" .
45524484 "<td>&nbsp;</td>" .
4553 - "<td>$requests_tot2</td>" .
45544485 "<td>$population_tot2</td>" .
 4486+ "<td>100%</td>" .
45554487 "<td>$connected_tot2</td>" .
45564488 "<td>$perc_connected_tot</td>" .
 4489+ "<td>$requests_tot2</td>" .
45574490 "<td>$requests_per_connected_person_tot</td>" .
45584491 "<td>100%</th>" .
45594492 "<td class=l>&nbsp;</td></tr>\n" ;
@@ -4592,9 +4525,9 @@
45934526 if ($requests_recently_all > 0)
45944527 { $perc_share_total = &Percentage ($requests_region / $requests_recently_all) ; }
45954528
4596 - $perc_connected_region = ".." ;
 4529+ $perc_population_region = ".." ;
45974530 if ($population_region > 0)
4598 - { $perc_connected_region = sprintf ("%.0f", 100 * $connected_region / $population_region) .'%' ; }
 4531+ { $perc_population_region = &Percentage ($population_region / $population_tot) ; }
45994532
46004533 # $requests_region2 = int ($requests_region * 1000 / $months_recently) ;
46014534
@@ -4620,10 +4553,11 @@
46214554 $html_regions .= "<tr><th>All countries in</th>" .
46224555 "</td><td>$region</td>" .
46234556 "<td>&nbsp;</td>" .
4624 - "<td>$requests_region2</td>" .
46254557 "<td>$population_region</td>" .
 4558+ "<td>$perc_population_region</td>" .
46264559 "<td>$connected_region</td>" .
46274560 "<td>$perc_connected_region</td>" .
 4561+ "<td>$requests_region2</td>" .
46284562 "<td>$requests_per_connected_person</td>" .
46294563 "<td>$perc_share_total</th>" .
46304564 "<td class=l>$bar</td></tr>\n" ;
@@ -4806,7 +4740,7 @@
48074741 $country =~ s/Syrian Arab Republic/Syria/ ;
48084742 $country =~ s/Tanzania, United Republic of/Tanzania/ ;
48094743 $country =~ s/Libyan Arab Jamahiriya/Libya/ ;
4810 - $country =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ;
 4744+ $country =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/ ;
48114745 $country =~ s/Serbia/republic of serbia/ ;
48124746 $country =~ s/Lao People's Democratic Republic/Laos/ ;
48134747
@@ -5079,6 +5013,8 @@
50805014 $html =~ s/X1000/.&nbsp;Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ;
50815015 $html =~ s/DATE// ;
50825016
 5017+ $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
 5018+
50835019 $html .= "<p><table border=1 width=800>INDEX\n" ;
50845020
50855021 $html .= &HtmlWorldMaps ;
@@ -5221,6 +5157,8 @@
52225158 $html =~ s/X1000/.&nbsp;Period <b>$requests_start - $requests_stop<\/b>/ ;
52235159 $html =~ s/DATE// ;
52245160
 5161+ $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
 5162+
52255163 $html .= "<p><table border=1 width=800>INDEX\n" ;
52265164
52275165 $html .= &HtmlWorldMaps ;
@@ -5414,14 +5352,14 @@
54155353 sub OpenLog
54165354 {
54175355 # only shrink log when same log file is appended daily, is no longer the case
5418 -# $fileage = -M "$dir_reports/$file_log" ;
 5356+# $fileage = -M "$path_reports/$file_log" ;
54195357 # if ($fileage > 5)
54205358 # {
5421 -# open "FILE_LOG", "<", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
 5359+# open "FILE_LOG", "<", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
54225360 # @log = <FILE_LOG> ;
54235361 # close "FILE_LOG" ;
54245362 # $lines = 0 ;
5425 -# open "FILE_LOG", ">", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
 5363+# open "FILE_LOG", ">", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
54265364 # foreach $line (@log)
54275365 # {
54285366 # if (++$lines >= $#log - 5000)
@@ -5429,8 +5367,8 @@
54305368 # }
54315369 # close "FILE_LOG" ;
54325370 # }
5433 -# open "FILE_LOG", ">>", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
5434 - open "FILE_LOG", ">>", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
 5371+# open "FILE_LOG", ">>", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
 5372+ open "FILE_LOG", ">>", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ;
54355373 &Log ("\n\n===== Wikimedia Sampled Visitors Log Report / " . date_time_english (time) . " =====\n\n") ;
54365374 }
54375375
@@ -5899,7 +5837,7 @@
59005838
59015839 $country =~ s/,/&comma;/g ;
59025840 $country =~ s/Bosnia-Herzegovina/Bosnia and Herzegovina/ ;
5903 - $country =~ s/Cote d'Ivoire/C�te d'Ivoire/ ;
 5841+ $country =~ s/Cote d'Ivoire/C&ocirc;te d'Ivoire/ ;
59045842 $country =~ s/Macao/Macau/ ; # will be changed back later
59055843 $country =~ s/Samoa/American Samoa/ ;
59065844 $country =~ s/Timor Leste/Timor-Leste/ ;
@@ -5928,12 +5866,11 @@
59295867 sub CountryMetaInfo
59305868 {
59315869 my $country = shift ;
5932 -print "Country '$country'\n" ; # qqq
59335870 my ($link_country,$icon,$population) ;
59345871 if ($country_meta_info {$country} eq "")
59355872 {
59365873 if ($country_meta_info_not_found_reported {$country} ++ == 0)
5937 - { print "_Meta info not found for country '$country'\n" ; }
 5874+ { print "Meta info not found for country '$country'\n" ; }
59385875 $link_country = $country ;
59395876 return ($country,'','..','..') ;
59405877 }
@@ -6000,7 +5937,7 @@
60015938 # print "\n\nUnLink $index\n\n" ;
60025939 my @segments = split '(?=<a )', $links ;
60035940 # print "SEGMENT 1 $segments[$index]\n" ;
6004 - $segments [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/$1/ ;
 5941+ $segments [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/<font color=#008000><b>$1<\/b><\/font>/ ;
60055942 # print "SEGMENT 2 $segments[$index]\n" ;
60065943 $links = join '', @segments ;
60075944 return ($links) ;
@@ -6049,11 +5986,12 @@
60505987 \$.tablesorter.addParser({
60515988 id: "millions",
60525989 is: function(s) { return false; },
6053 - format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,"").replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
 5990+//failed so far to turn 1.2M into 1200000, so figures with decimal point are sorted out of place
 5991+//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,"").replace(/\\.(\d)M/,$1+"00000").replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
 5992+ format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/&nbsp;/g,""). replace(/M/,"000000").replace(/&#1052;/,"000000").replace(/K/,"000").replace(/&#1050;/i,"000")); },
60545993 type: "numeric"
60555994 });
60565995
6057 -
60585996 \$.tablesorter.addParser({
60595997 id: "digitsonly",
60605998 is: function(s) { return false; },
@@ -6131,7 +6069,7 @@
61326070 <script type='text/javascript'>
61336071 \$('#table1').tablesorter({
61346072 // debug:true,
6135 - headers:{0:{sorter:'nohtml'},1:{sorter:'nohtml'},2:{sorter:'nohtml'},3:{sorter:'millions'},4:{sorter:'millions'},5:{sorter:'millions'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'}}
 6073+ headers:{0:{sorter:'nohtml'},1:{sorter:'nohtml'},2:{sorter:'nohtml'},3:{sorter:'millions'},4:{sorter:'digitsonly'},5:{sorter:'millions'},6:{sorter:'digitsonly'},7:{sorter:'millions'},8:{sorter:'digitsonly'},9:{sorter:'digitsonly'}}
61366074 });
61376075 </script>
61386076 __HTML_SORT_TABLE_COLUMNS__
@@ -6263,3 +6201,4 @@
62646202
62656203 return $html_worldmaps ;
62666204 }
 6205+

Status & tagging log