Index: trunk/wikistats/squids/SquidCountArchive.pl |
— | — | @@ -11,7 +11,7 @@ |
12 | 12 | use SquidCountArchiveWriteOutput ; |
13 | 13 | |
14 | 14 | # set defaults mainly for tests on local machine |
15 | | - default_argv "-d 2010/05/10" ; |
| 15 | + default_argv "-d 2011/04/01" ; |
16 | 16 | |
17 | 17 | # http://wikitech.wikimedia.org/view/Squid_log_format |
18 | 18 | # 1. Hostname |
— | — | @@ -45,7 +45,7 @@ |
46 | 46 | if (! $job_runs_on_production_server) |
47 | 47 | { |
48 | 48 | $test = $true ; |
49 | | - $file_test = "w:/# Out Locke/sampled-1000-log-20100510b.txt" ; |
| 49 | + $file_test = "w:/# Out Locke/sampled-1000-log-20110401.txt" ; |
50 | 50 | # $file_test = getcwd . "/SquidDataFilterFY.txt" ; |
51 | 51 | if (! -e $file_test) |
52 | 52 | { abort "Test input file '$file_test' not found" ; } |
— | — | @@ -58,8 +58,8 @@ |
59 | 59 | else |
60 | 60 | { $path_root = "w:/! perl/squids/archive/test" ; } |
61 | 61 | |
62 | | - $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Palm Pre|Playstation|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|Wikiamo|Wikipanion" ; |
63 | | - $tags_mobile_upd = "May 2010" ; |
| 62 | + $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|Kindle|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Opera Mobi|Palm Pre|Playstation|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|Wikiamo|Wikipanion" ; |
| 63 | + $tags_mobile_upd = "March 2011" ; |
64 | 64 | |
65 | 65 | $pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ; |
66 | 66 | $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ; |
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm |
— | — | @@ -0,0 +1,1188 @@ |
| 2 | +# to do: study http://www.zytrax.com/tech/web/mobile_ids.html |
| 3 | + |
| 4 | +sub ProcessLine |
| 5 | +{ |
| 6 | + my $line = shift ; |
| 7 | + |
| 8 | + my @fields = split (' ', $line) ; |
| 9 | + $time = $fields [2] ; |
| 10 | + $date = substr ($time,0,10) ; |
| 11 | + |
| 12 | + $client_ip = $fields [4] ; |
| 13 | + $mime = $fields [10] ; |
| 14 | + |
| 15 | + if ($scan_ip_frequencies) # phase 1 |
| 16 | + { |
| 17 | + return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ; |
| 18 | + |
| 19 | + if ($mime eq "text/html") |
| 20 | + { |
| 21 | + $ip_frequencies {$client_ip} ++ ; |
| 22 | + $html_pages_found ++ ; |
| 23 | + } |
| 24 | + |
| 25 | + return ; |
| 26 | + } |
| 27 | + |
| 28 | + # remember for each squid per hour lowest and highest sequence number and number of events |
| 29 | + # later calc per hour average distance between events = (highest - lowest sequence number) / events - 1 |
| 30 | + # distance between consecutive events that lie in different hour bins is ignored, negligible |
| 31 | + $squid = $fields [0] ; |
| 32 | + $seqno = $fields [1] ; |
| 33 | + $hour = substr ($time, 11, 2) ; |
| 34 | + |
| 35 | + # init for new found or restarted squid |
| 36 | + # note seqno can be negative! probably unsigned int printed as signed int, 3rd clause deals with this |
| 37 | + if (($squid_seqno {$squid} == 0) || ($seqno < $squid_seqno {$squid}) || (($seqno > 0) && ($squid_seqno {$squid} < 0))) |
| 38 | + { $squid_seqno {$squid} = $seqno ; } |
| 39 | + else |
| 40 | + { |
| 41 | + $squid_events {"$squid,$hour"} ++ ; |
| 42 | + $delta = $seqno - $squid_seqno {$squid}; |
| 43 | + $squid_delta {"$squid,$hour"} += $delta ; |
| 44 | + $squid_seqno {$squid} = $seqno ; |
| 45 | + } |
| 46 | + |
| 47 | + # now parse all other fields |
| 48 | + |
| 49 | + $status = $fields [5] ; |
| 50 | + $size = $fields [6] ; |
| 51 | + $method = $fields [7] ; |
| 52 | + $url = lc ($fields [8]) ; |
| 53 | + |
| 54 | + $referer = lc ($fields [11]) ; |
| 55 | + $agent = $fields [13] ; |
| 56 | + |
| 57 | + $url =~ s/^http\w?\:\/\///o ; |
| 58 | + $url =~ s/\%3A/:/gio ; |
| 59 | + $url =~ s/\%3B/;/gio ; |
| 60 | + $url =~ s/\&/\&/gio ; |
| 61 | + |
| 62 | + ($agent2 = $agent) =~ s/\%20/ /g ; # mainly to make line content more readable on debugging |
| 63 | + $agent2 =~ s/\%2F/\//g ; # mainly to make line content more readable on debugging |
| 64 | + $agents_raw {$agent2}++ ; |
| 65 | + |
| 66 | + ($file,$ext) = &GetFileExt ($url) ; |
| 67 | + $exts {$ext}++ ; |
| 68 | + |
| 69 | + if (($ext eq "js") || ($ext eq "css")) |
| 70 | + { $scripts {"$ext,$file,"} ++ ; } |
| 71 | + |
| 72 | + $title = "" ; |
| 73 | + $parm = "" ; |
| 74 | + if ($ext eq "php") |
| 75 | + { |
| 76 | + if ($url =~ /title=/o) |
| 77 | + { |
| 78 | + $title = $url ; |
| 79 | + $title =~ s/^.*?title=//o ; |
| 80 | + $title =~ s/\&.*$//o ; |
| 81 | + } |
| 82 | + ($url,$parm) = &NormalizeParms ($url) ; |
| 83 | + if ($parm eq "?") { return ; } # error |
| 84 | + $file =~ s/,/,/go ; |
| 85 | + $parm =~ s/,/,/go ; |
| 86 | + $scripts {"php,$file,$parm"} ++ ; |
| 87 | + $ext .= "($file)" ; # add filename behind extension php |
| 88 | + } |
| 89 | + |
| 90 | + if ($mime eq "text/html") |
| 91 | + { |
| 92 | + $mimecat = "page" ; |
| 93 | + $tot_mime_html ++ ; |
| 94 | + } |
| 95 | + elsif ($mime =~ /(?:gif|png|jpeg)/o) |
| 96 | + { $mimecat = "image" ; } |
| 97 | + else |
| 98 | + { $mimecat = "other" ; } |
| 99 | + |
| 100 | + if ($job_runs_on_production_server) |
| 101 | + { |
| 102 | + $country = $fields [14] ; |
| 103 | + if (($country eq "") || ($country =~ /null/)) |
| 104 | + { $country = "--" ; } |
| 105 | + } |
| 106 | + else |
| 107 | + { |
| 108 | + $country = $fields [14] ; |
| 109 | + if ($country eq "") |
| 110 | + { |
| 111 | + if (++ $fake_country_code % 3 == 0) |
| 112 | + { $country = "XX" ; } |
| 113 | + else |
| 114 | + { $country = "YY" ; } |
| 115 | + } |
| 116 | + } |
| 117 | + |
| 118 | + if ($line =~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/io) |
| 119 | + { |
| 120 | + $banners {"$country,$url"} ++ ; |
| 121 | + $banner_requests_ignored ++ ; |
| 122 | + return ; |
| 123 | + } |
| 124 | + |
| 125 | + $countries {$country}++ ; |
| 126 | + |
| 127 | + $agent2 = $agent ; |
| 128 | + $agent2 =~ s/\%20/ /g ; |
| 129 | + |
| 130 | + # remove all mentions of .NET CLR |
| 131 | + # http://en.wikipedia.org/wiki/Common_Language_Runtime |
| 132 | + $agent2 =~ s/\.NET CLR [0-9.]+\s*;?\s*//go ; |
| 133 | + $agent2 =~ s/\(\s*\)//go ; |
| 134 | + |
| 135 | + # e.g. BlackBerry8310/4.2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102 -> BlackBerry8310/4.2.2 |
| 136 | + if ($agent2 =~ BlackBerry) |
| 137 | + { $agent2 =~ s/^.*?BlackBerry\d+\/([^\s]*).*$/BlackBerry\/$1/io ; } # keep |
| 138 | + |
| 139 | + $agent2 =~ s/Android (\d)/Android\/$1/o ; |
| 140 | + $agent2 =~ s/Safari(\d)/Safari\/$1/o ; |
| 141 | + $agent2 =~ s/Browser\/NetFront/NetFront/o ; |
| 142 | + $agent2 =~ s/Browser\/VF-NetFront/NetFront/o ; |
| 143 | + $agent2 =~ s/jig browser (\d)/JigBrowser\/$1/o ; |
| 144 | + $agent2 =~ s/jig browser9 (\d)/JigBrowser\/$1/o ; |
| 145 | + $agent2 =~ s/jig browser web; (\d)/JigBrowser9\/$1/o ; |
| 146 | + |
| 147 | + # Remove explanation for KHTML |
| 148 | + $agent2 =~ s/\(KHTML, like Gecko\)/KHTML/o ; |
| 149 | + $agent2 =~ s/(KHTML[^\s]*) \(like Gecko\)/$1/o ; |
| 150 | + |
| 151 | + # Remove name of Ubuntu release (or name -> number) |
| 152 | + $agent2 =~ s/(Ubuntu\/[0-9\.]+)\s*\(\w+\)/$1/gio ; |
| 153 | + $agent2 =~ s/\(Ubuntu-(\w)\w+\)/("Ubuntu\/".(ord (lc($1))-ord('a')+1))/gieo ; |
| 154 | + $agent2 =~ s/Ubuntu\/([a-zA-Z])\w+/("Ubuntu\/".(ord (lc($1))-ord('a')+1))/gieo ; |
| 155 | + |
| 156 | + $agent2 =~ s/;\s*U\s*;/;/o ; |
| 157 | + |
| 158 | + if ($agent2 =~ /GoogleBot/io) |
| 159 | + { |
| 160 | + $client_ip2 = &MatchIpRange ($client_ip) ; |
| 161 | + if ($client_ip2 =~ /!google/o) |
| 162 | + { |
| 163 | + $agent2 .= " |Google ip address" ; |
| 164 | + $client_ip_range = $client_ip ; |
| 165 | + $client_ip_range =~ s/\.\d+$//o ; |
| 166 | + $google_bot_hits {substr ($time,0,13).','.$client_ip_range} ++ ; |
| 167 | + } |
| 168 | + elsif ($agent2 !~ /compatible GoogleBot/io) |
| 169 | + { |
| 170 | + $agent2 .= " |no Google ip address" ; |
| 171 | + $ip_bot_no_google {$client_ip}++ ; |
| 172 | + } |
| 173 | + } |
| 174 | + |
| 175 | + $bot = $false ; |
| 176 | + |
| 177 | + # url in agent string should only occur for bots (agent string is free format, no rules, just conventions) |
| 178 | + # exception: Embedded Web Browser from: http://bsalsa.com/, |
| 179 | + # see also http://www.bsalsa.com/forum/showthread.php?t=724 |
| 180 | + if (($agent2 =~ /http:\/\//o) && ($agent2 !~ /bsalsa.com/o)) |
| 181 | + { |
| 182 | + if ($agent2 !~ /MSIE \d+\/\d+/o) # most likely false positives |
| 183 | + { |
| 184 | + $bot = $true ; |
| 185 | + @bots {"$mime,$agent2"} ++ ; |
| 186 | + } |
| 187 | + } |
| 188 | + elsif (($agent2 =~ /bot/io) || (($agent2 =~ /crawl(?:er)?/io) && ($agent2 !~ /MSIEcrawler/io)) || ($agent2 =~ /spider/io) || ($agent2 =~ /parser/io)) |
| 189 | + { |
| 190 | + $bot = $true ; |
| 191 | + @bots {"$mime,$agent2"} ++ ; |
| 192 | + } |
| 193 | + |
| 194 | + # GECKO |
| 195 | + $gecko = "" ; |
| 196 | + if ($agent2 =~ /Gecko\/\d{4,}/io) |
| 197 | + { ($gecko = $agent2) =~ s/^.*?Gecko\/(\d{4}).*$/Gecko\/$1/io ; } |
| 198 | + |
| 199 | + # APPLEWEBKIT |
| 200 | + $applewebkit = "" ; |
| 201 | + if ($agent2 =~ /AppleWebKit/io) |
| 202 | + { |
| 203 | + ($applewebkit = $agent2) =~ s/^.*?AppleWebKit\/(\d+\.\d+).*$/AppleWebKit\/$1/io ; |
| 204 | + $applewebkit =~ s/^.*?AppleWebKit\/(\d+).*$/AppleWebKit\/$1/io ; |
| 205 | + $applewebkit =~ s/\/(\d\d)$/\/0$1/o ; |
| 206 | + |
| 207 | + if ($agent2 =~ /Mozilla.{1,8}\(/io) |
| 208 | + { |
| 209 | + $agent3 = $agent2 ; |
| 210 | + $agent3 =~ s/^[^\(]*\(//o ; |
| 211 | + $agent3 =~ s/;.*$//o ; |
| 212 | + $agent3 =~ s/\).*$//o ; |
| 213 | + $agent3 =~ s/^\s+//o ; |
| 214 | + $agent3 =~ s/\s+$//o ; |
| 215 | + $agent3 = substr ($agent3,0,20) ; |
| 216 | + |
| 217 | + $platform = '' ; |
| 218 | + if ($agent2 =~ /Chrome/io) |
| 219 | + { ($platform = $agent2) =~ s/^.*?(Chrome\/?\s*\d+\.?\d*).*$/$1/io ; } |
| 220 | + elsif ($agent2 =~ /Android/io) |
| 221 | + { ($platform = $agent2) =~ s/^.*?(Android\/?\s*\d+\.?\d*).*$/$1/io ; } |
| 222 | + elsif ($agent2 =~ /(?:iPad|iPod|iPhone)/io) |
| 223 | + { |
| 224 | + ($platform = $agent2) =~ s/^.*?(OS\s*\d\_?\d?).*$/$1/io ; |
| 225 | + $platform =~ s/_/./go ; |
| 226 | + } |
| 227 | + elsif ($agent2 =~ /Kindle/io) |
| 228 | + { ($platform = $agent2) =~ s/^.*?(Kindle\/?\s*\d+\.?\d*).*$/$1/io ; } |
| 229 | + elsif ($agent2 =~ /Safari/io) |
| 230 | + { ($platform = $agent2) =~ s/^.*?(Safari\/\d+).*$/$1/io ; } |
| 231 | + |
| 232 | + if (($agent2 =~ /Symbian/i) && ($agent3 !~ /Symbian/io)) |
| 233 | + { ($platform = $agent2) =~ s/^.*?(Symbian[\w\d\.\/]+).*$/$1/io ; } |
| 234 | + |
| 235 | + if ($platform ne '') |
| 236 | + { |
| 237 | + $platform =~ s/^\s+//o ; |
| 238 | + $platform =~ s/\s+$//o ; |
| 239 | + $platform = " $platform" ; |
| 240 | + } |
| 241 | + |
| 242 | + $applewebkit .= " ($agent3$platform)" ; |
| 243 | + } |
| 244 | + |
| 245 | + if (($agent2 =~ /Nokia/io) && ($applewebkit !~ /Nokia/io)) |
| 246 | + { $applewebkit .= " (Nokia)" ; } |
| 247 | + |
| 248 | + # if ($agent2 =~ /\(iPad/i) { $applewebkit .= " (iPad)" ; } |
| 249 | + # elsif ($agent2 =~ /\(iPod/i) { $applewebkit .= " (iPod)" ; } |
| 250 | + # elsif ($agent2 =~ /\(iPhone/i) { $applewebkit .= " (iPhone)" ; } |
| 251 | + # elsif ($agent2 =~ /\(Windows/i) { $applewebkit .= " (Win)" ; } |
| 252 | + # elsif ($agent2 =~ /\(Macintosh/i) { $applewebkit .= " (Mac)" ; } |
| 253 | + # else { $applewebkit .= " (--)" ; } |
| 254 | + } |
| 255 | + |
| 256 | + # MOBILE |
| 257 | + $mobile = '-' ; |
| 258 | + if ($agent2 =~ /(?:$tags_mobile)/io) |
| 259 | + { $mobile = 'M' ; } |
| 260 | + |
| 261 | + $os = ".." ; |
| 262 | + |
| 263 | + if ($agent2 =~ /(?:Wikiamo|Wikipanion)/io) { $os = "iPhone" ; } |
| 264 | + elsif ($agent2 =~ /BlackBerry/io) {($os = $agent2) =~ s/^.*?BlackBerry[^\/]*\/(\d+\.\d+).*$/BlackBerry\/$1/io ; } # BlackBerry/8320/4.2 -> BlackBerry/4.2 |
| 265 | + elsif ($agent2 =~ /DoCoMo/io) { $os = "DoCoMo" ; } |
| 266 | + elsif ($agent2 =~ /iPad/io) { $version = "iPad" ; ($os = $agent2) =~ s/^.*?(iPad OS \d+\_\d+).*$/$1/io ; } |
| 267 | + elsif ($agent2 =~ /iPod/io) { $version = "iPod" ; ($os = $agent2) =~ s/^.*?(iPhone OS \d+\_\d+).*$/$1/io ; } |
| 268 | + elsif ($agent2 =~ /iPhone/io) { $version = "iPhone" ; ($os = $agent2) =~ s/^.*?(iPhone OS \d+\_\d+).*$/$1/io ; } |
| 269 | + elsif ($agent2 =~ /webOS.* Pre/io) { $version = "Pre" ; ($os = $agent2) =~ s/^.*?(webOs\/\d+\.?\d*).*$/$1/io ; } # Palm Pre |
| 270 | + elsif ($agent2 =~ /Intel Mac/io) { $os = "Mac Intel" ; } |
| 271 | + elsif ($agent2 =~ /PPC Mac/io) { $os = "Mac PowerPC" ; } |
| 272 | + elsif ($agent2 =~ /Mac_PowerPC/io) { $os = "Mac PowerPC" ; } |
| 273 | + elsif ($agent2 =~ /Macintosh.*PPC/io) { $os = "Mac PowerPC" ; } |
| 274 | + elsif ($agent2 =~ /Mac OS/io) { $os = "Mac" ; } |
| 275 | + elsif ($agent2 =~ /MacBook/io) { $os = "Mac" ; } |
| 276 | + elsif ($agent2 =~ /iMac/io) { $os = "iMac" ; } |
| 277 | + elsif ($agent2 =~ /Power.*Macintosh/io) { $os = "Mac PowerPC" ; } |
| 278 | + elsif ($agent2 =~ /FreeBSD/io) { $os = "FreeBSD" ; } |
| 279 | + elsif ($agent2 =~ /OpenBSD/io) { $os = "OpenBSD" ; } |
| 280 | + elsif ($agent2 =~ /SunOS/io) { $os = "SunOS" ; } |
| 281 | + elsif ($agent2 =~ /PlayStation/io) { $os = "PlayStation" ; } |
| 282 | + elsif ($agent2 =~ /SymbianOS/io) { ($os = $agent2) =~ s/^.*?SymbianOS[^\/]*\/(\d+\.\d+).*$/SymbianOS\/$1/io ; } |
| 283 | + elsif ($agent2 =~ /Symbian.*OS/io) { $os = "SymbianOS/0.0" ; } |
| 284 | +# elsif ($agent2 =~ /Linux i686/io) { $os = "Linux i686" ; } |
| 285 | +# elsif ($agent2 =~ /Linux x86_64/io) { $os = "Linux x86_64" ; } |
| 286 | +# elsif ($agent2 =~ /Linux armv\d+/io) { $os = "Linux armv" ; } |
| 287 | +# elsif ($agent2 =~ /Linux ppc\d+/io) { $os = "Linux ppc" ; } |
| 288 | +# elsif ($agent2 =~ /Linux mips/io) { $os = "Linux mips" ; } |
| 289 | + elsif ($agent2 =~ /Linux/io) { $os = "Linux" ; } |
| 290 | + elsif ($agent2 =~ /Win95/io) { $os = "Windows 95" ; } |
| 291 | + elsif ($agent2 =~ /Win(?:dows)[+\s-]?98/io) { $os = "Windows 98" ; } |
| 292 | + elsif ($agent2 =~ /Win(?:dows)?[+\s-]?9x/io) { $os = "Windows 9x" ; } |
| 293 | + elsif ($agent2 =~ /WinNT4.0/io) { $os = "Windows NT 4.0" ; } |
| 294 | + elsif ($agent2 =~ /Windows XP/io) { $os = "Windows XP" ; } # Windows XP 2600.xpsp.14648-27197 -> Windows XP |
| 295 | + elsif ($agent2 =~ /Windows CE/io) { $os = "Windows CE" ; } |
| 296 | + elsif ($agent2 =~ /Windows; PPC/io) { $os = "Windows CE" ; } |
| 297 | + elsif ($agent2 =~ /NT \d+\.\d+.*Windows/io) { ($os = $agent2) =~ s/^.*?NT (\d+\.\d+).*$/Windows NT $1/io ; } |
| 298 | + elsif ($agent2 =~ /Windows NT \d+\.\d+/io) { ($os = $agent2) =~ s/^.*?Windows NT (\d+\.\d+).*$/Windows NT $1/io ; } |
| 299 | + elsif ($agent2 =~ /Windows NT/io) { $os = "Windows NT" ; } |
| 300 | + elsif ($agent2 =~ /Windows VISTA/io) { $os = "Windows VISTA" ; } |
| 301 | + elsif ($agent2 =~ /Windows 7/io) { $os = "Windows 7" ; } |
| 302 | +# elsif ($agent2 =~ /Windows/io) { ($os = $agent2) =~ s/^.*?(Windows.{10,10}[^;\(\)\[\]]*).*$/$1/io ; } |
| 303 | + elsif ($agent2 =~ /Windows/io) { $os = "Windows" ; } |
| 304 | + elsif ($agent2 =~ /Win32/io) { $os = "Windows 32" ; } |
| 305 | + elsif ($agent2 =~ /Wii/io) { $os = "Wii" ; } |
| 306 | + elsif ($agent2 =~ /SonyEricsson/io) { $os = "SonyEricsson" ; } |
| 307 | + elsif ($agent2 =~ /Samsung/io) { $os = "Samsung" ; } |
| 308 | + elsif ($agent2 =~ /Nokia/io) { $os = "Nokia" ; } |
| 309 | + elsif ($agent2 =~ /Palm Pre/io) { $os = "Palm Pre" ; } |
| 310 | + elsif ($agent2 =~ /Vodafone/io) { $os = "Vodafone" ; } |
| 311 | + elsif ($agent2 =~ /Danger/io) { $os = "Danger" ; } |
| 312 | + elsif ($agent2 =~ /J2ME\/MIDP/io) { $os = "Java/ME" ; } |
| 313 | + elsif ($agent2 =~ /Kindle/io) { $os = "Kindle" ; } |
| 314 | + |
| 315 | + if (($os eq '..') && ($mobile eq 'M')) |
| 316 | + { |
| 317 | + $os = "Mobile other" ; |
| 318 | + $mobile_other {$agent2} ++ ; |
| 319 | + } |
| 320 | + |
| 321 | + if ($version =~ /(?:Ipod|Iphone)/io) |
| 322 | + { |
| 323 | + if ($os !~ /Iphone OS \d/io) |
| 324 | + { $os = "iPhone OS 1_X" ; } |
| 325 | + if ($agent2 !~ /(?:Opera|Safari)/io) |
| 326 | + { $agent2 .= " Safari/0.0" ; } |
| 327 | + } |
| 328 | + elsif ($version =~ /(?:Ipad)/io) |
| 329 | + { |
| 330 | + if ($os !~ /Ipad OS \d/io) |
| 331 | + { $os = "iPad OS 1_X" ; } |
| 332 | + if ($agent2 !~ /(?:Opera|Safari)/io) |
| 333 | + { $agent2 .= " Safari/0.0" ; } |
| 334 | + } |
| 335 | + |
| 336 | + if (($os =~ /Mac/o) && ($agent2 =~ /OS X/o)) |
| 337 | + { |
| 338 | + ($osx = $agent2) =~ s/^.*?(OS X[^;\(\)\[\]]*).*$/$1/o ; |
| 339 | + $osx =~ s/(\d+\_\d+).*$/$1/o ; |
| 340 | + $osx =~ s/_/\./o ; |
| 341 | + $os = "$os $osx" ; |
| 342 | + } |
| 343 | + |
| 344 | + if ($os =~ /Linux/o) |
| 345 | + { |
| 346 | + ($osx = $agent2) =~ s/^.*?((?:Android|Ubuntu|Gentoo|PCLinuxOS|CentOS|Red Hat|Mandriva|SUSE|Fedora|Epiphany|Debian|Motor\w+)[^\s;\[\]\(\)]*).*$/ucfirst($1)/ieo ; |
| 347 | + if ($osx ne $agent2) |
| 348 | + { |
| 349 | + $osx =~ s/(\d+\_\d+).*$/$1/o ; |
| 350 | + $osx =~ s/^([^-]*)-/$1\//o ; # Debian-1.0 -> Debian/1.0 |
| 351 | + $osx =~ s/_/\./o ; |
| 352 | + $osx =~ s/(\d+\.\d+).*$/$1/o ; |
| 353 | + $osx =~ s/^(Motor)(\w+).*$/ucfirst(lc($1)).uc($2)/ieo ; |
| 354 | + $os = "$os $osx" ; |
| 355 | + } |
| 356 | + } |
| 357 | + |
| 358 | + $os =~ s/(Windows NT \d+\.\d+).*$/$1/o ; |
| 359 | + |
| 360 | + if ($bot) |
| 361 | + { $agent2 = "BOT $agent2" ; } |
| 362 | + |
| 363 | + elsif ($agent2 eq "-") |
| 364 | + {;} |
| 365 | + |
| 366 | + # KINDLE |
| 367 | + elsif ($agent2 =~ /Kindle/io) |
| 368 | + { ($version = $agent2) =~ s/^.*?(Kindle \d+\.\d+).*$/$1/io ; } |
| 369 | + |
| 370 | + # IEMOBILE |
| 371 | + elsif ($agent2 =~ /IEMobile/io) |
| 372 | + { ($version = $agent2) =~ s/^.*?(IEMobile \d+\.\d+).*$/$1/io ; } |
| 373 | + |
| 374 | + # PALM PRE |
| 375 | + elsif ($agent2 =~ /webOS\/\d+\.\d+.*Pre\/\d/io) |
| 376 | + { ($version = $agent2) =~ s/^.*?(Pre\/\d+\.?\d*).*$/Palm_$1/o ; } |
| 377 | + |
| 378 | + # ANDROID |
| 379 | + elsif ($agent2 =~ /Android\/\d+/io) |
| 380 | + { ($version = $agent2) =~ s/^.*?(Android\/\d+\.?\d*).*$/$1/o ; } |
| 381 | + |
| 382 | + # EXPLORER |
| 383 | + elsif ($agent2 =~ /Mozilla\/\d+\.\d+ \(compatible;.*MSIE/io) |
| 384 | + { ($version = $agent2) =~ s/^.*?(MSIE \d+\.\d+).*$/$1/o ; } |
| 385 | + |
| 386 | + # CHROME |
| 387 | + elsif ($agent2 =~ /Chrome\/\d/io) # Chrome sometimes mimicked Safari to work around Hotmail bug |
| 388 | + { |
| 389 | + $agent2 =~ s/Windows NT \d\.\d/Windows/o ; |
| 390 | + $agent2 =~ s/(Chrome\/\d+\.\d+)[^;\) ]+/$1/o ; |
| 391 | + |
| 392 | + $agent2 = &ExtractLanguage ($agent2, 'Chrome') ; |
| 393 | + |
| 394 | + ($version = $agent2) =~ s/^.*?(Chrome\/\d+\.\d+).*$/$1/o ; |
| 395 | + } |
| 396 | + |
| 397 | + # SAFARI |
| 398 | + elsif ($agent2 =~ /Safari\/[^\s]+$/io) |
| 399 | + { |
| 400 | + $agent2 = &ExtractLanguage ($agent2, 'Safari') ; |
| 401 | + $agent2 =~ s/(Safari\/\d+\.\d+)[^;\) ]+/$1/o ; |
| 402 | + if ($agent2 =~ /Safari\/\d+\.\d+/o) |
| 403 | + { ($version = $agent2) =~ s/^.*?(Safari\/\d+\.\d+).*$/$1/o ; } |
| 404 | + elsif ($agent2 =~ /Safari\/\d+/o) |
| 405 | + { ($version = $agent2) =~ s/^.*?(Safari\/\d+).*$/$1/o ; } |
| 406 | + } |
| 407 | + |
| 408 | + # FIREFOX |
| 409 | + elsif ($agent2 =~ /Firefox\/[^\s]+/io) |
| 410 | + { |
| 411 | + $agent2 = &ExtractLanguage ($agent2, 'Firefox') ; |
| 412 | + $agent2 =~ s/X11; Linux [^;]+/Linux/o ; |
| 413 | + $agent2 =~ s/(Firefox\/\d+\.\d+)[^;\) ]+/$1/o ; |
| 414 | + |
| 415 | + if ($agent2 =~ /Firefox\/\d+\.\d+/o) |
| 416 | + { ($version = $agent2) =~ s/^.*?(Firefox\/\d+\.\d+).*$/$1/o ; } |
| 417 | + elsif ($agent2 =~ /Firefox\/\d+/o) |
| 418 | + { ($version = $agent2) =~ s/^.*?(Firefox\/\d+).*$/$1/o ; } |
| 419 | + } |
| 420 | + |
| 421 | + # OPERA |
| 422 | + # new format |
| 423 | + elsif ($agent2 =~ /^Opera\/\d/io) |
| 424 | + { |
| 425 | + if ($agent2 =~ /Version\//o) |
| 426 | + { ($version = $agent2) =~ s/^.*?Version\/(\d+\.\d+).*$/Opera\/$1/o ; } |
| 427 | + else |
| 428 | + { ($version = $agent2) =~ s/^.*?(Opera\/\d+\.\d+).*$/$1/o ; } |
| 429 | + |
| 430 | + $agent2 =~ s/Windows NT \d\.\d/Windows/o ; |
| 431 | + $agent2 =~ s/X11; Linux [^;]+/Linux/o ; |
| 432 | + $agent2 =~ s/(Opera Mini\/\d+\.\d+)[^;\) ]+/$1/o ; |
| 433 | + $agent2 =~ s/J2ME\/MIDP/Java mobile (J2ME)/o ; # J2ME\/MIDP |
| 434 | + |
| 435 | + $agent2 = &ExtractLanguage ($agent2, 'Opera') ; |
| 436 | + |
| 437 | + if ($agent2 =~ /Opera Mini/o) |
| 438 | + { |
| 439 | + if ($agent2 =~ /Opera Mini\/\d+\.\d+/o) |
| 440 | + { ($mini = $agent2) =~ s/^.*?Opera (Mini\/\d+\.\d+).*$/$1/o ; } |
| 441 | + else |
| 442 | + { $mini = "Mini/?.?" ; } |
| 443 | + $version = "$version ($mini)" ; |
| 444 | + } |
| 445 | + elsif ($agent2 =~ /Opera Mobi/o) |
| 446 | + { |
| 447 | + if ($agent2 =~ /Opera Mobi\/\d+\.\d+/o) |
| 448 | + { ($mobi = $agent2) =~ s/^.*?Opera (Mobi\/\d+\.\d+).*$/$1/o ; } |
| 449 | + else |
| 450 | + { $mobi = "Mobi/?.?" ; } |
| 451 | + $version = "$version ($mobi)" ; |
| 452 | + } |
| 453 | + |
| 454 | + $version =~ s/^\s*(.*?)\s*$/$1/o ; |
| 455 | + } |
| 456 | + |
| 457 | + # old format |
| 458 | + elsif ($agent2 =~ /^Mozilla.*\(compatible.*Opera \d/io) |
| 459 | + { |
| 460 | + $agent2 =~ s/Opera (\d+\.\d+)/Opera\/$1/o ; |
| 461 | + $agent2 =~ s/Windows NT \d\.\d/Windows/o ; |
| 462 | + $agent2 =~ s/X11; Linux [^;\)]+/Linux/o ; |
| 463 | + ($version = $agent2) =~ s/^.*?(Opera\/\d+\.\d+).*$/$1/o ; |
| 464 | + $version =~ s/^\s*(.*?)\s*$/$1/o ; # remove leading/trailing spaces |
| 465 | + } |
| 466 | + |
| 467 | + # BLACKBERRY |
| 468 | + elsif ($agent2 =~ /BlackBerry\d+/io) |
| 469 | + { |
| 470 | + $agent2 =~ s/(\/\d+\.\d+).*$/$1/o ; |
| 471 | + $agent2 =~ s/BlackBerry/BlackBerry\//o ; |
| 472 | + $version = $agent2 ; |
| 473 | + } |
| 474 | + |
| 475 | + # KONQUEROR |
| 476 | + elsif ($agent2 =~ /Konqueror\/\d/io) # Chrome sometimes mimicked Safari to work around Hotmail bug |
| 477 | + { |
| 478 | + $agent2 =~ s/(Konqueror\/\d+\.\d+)[^;\) ]+/$1/o ; |
| 479 | + |
| 480 | + ($version = $agent2) =~ s/^.*?(Konqueror\/\d+\.\d+).*$/$1/o ; |
| 481 | + } |
| 482 | + |
| 483 | + # WGET |
| 484 | + elsif ($agent2 =~ /Wget\/\d/io) |
| 485 | + { |
| 486 | + $agent2 =~ s/(Wget\/\d+\.\d+)[^;\) ]+/$1/io ; |
| 487 | + |
| 488 | + ($version = $agent2) =~ s/^.*?Wget\/(\d+\.\d+).*$/$1/io ; |
| 489 | + } |
| 490 | + |
| 491 | + elsif ($os =~ /Iphone OS \d/io) |
| 492 | + { $os = "iPhone OS 1_X" ; } |
| 493 | + elsif ($os =~ /Ipad OS \d/io) |
| 494 | + { $os = "iPad OS 1_X" ; } |
| 495 | + |
| 496 | + else |
| 497 | + { |
| 498 | + $browserfound = $false ; |
| 499 | + |
| 500 | + @browsers = qw (GranParadiso IceWeasel JigBrowser K-Meleon NetFront Netscape SeaMonkey Shiretoko Sleipnir Songbird) ; |
| 501 | + foreach $browser (@browsers) |
| 502 | + { |
| 503 | + if ($agent2 =~ /$browser/i) |
| 504 | + { |
| 505 | + ($version = $agent2) =~ s/.*?($browser\/\d+\.\d+).*$/$1/i ; |
| 506 | + $browserfound = $true ; |
| 507 | + last ; |
| 508 | + } |
| 509 | + } |
| 510 | + if (! $browserfound) |
| 511 | + { |
| 512 | + ($version = $agent2) =~ s/(^[a-zA-Z0-9-_]+\/\d+\.\d+).*$/$1/io ; |
| 513 | + $version =~ s/[;\[\]\(\)].*$//o ; |
| 514 | + $version =~ s/(\d+\.\d+).*$/$1/o ; |
| 515 | + } |
| 516 | + |
| 517 | + $agent2 = "*[$version] [$os] --- $agent2" ; |
| 518 | + } |
| 519 | + |
| 520 | + if ((! $bot) && ($agent ne "-")) |
| 521 | + { |
| 522 | + $engine =~ s/,/,/go ; |
| 523 | + if ($gecko ne "") |
| 524 | + { $engines {$gecko} ++ ; } |
| 525 | + elsif ($applewebkit ne "") |
| 526 | + { |
| 527 | + $applewebkit =~ s/AppleWebKit\//AppleWebKit /o ; |
| 528 | + $engines {$applewebkit} ++ ; |
| 529 | + } |
| 530 | + |
| 531 | + $version =~ s/,/,/go ; |
| 532 | + if ($os =~ /playstation/io) |
| 533 | + { $version = "NetFront (PlayStation)" ; } |
| 534 | + |
| 535 | + $clients {"$mobile,$version"}++ ; |
| 536 | + |
| 537 | + $operating_systems =~ s/,/,/go ; |
| 538 | + $operating_systems {"$mobile,$os"} ++ ; |
| 539 | + } |
| 540 | + |
| 541 | + if ($count_hits_per_ip_range) |
| 542 | + { |
| 543 | + $client_ip_range = $client_ip ; |
| 544 | + $client_ip_range =~ s/\.\d+$//o ; |
| 545 | + $cnt_ip_ranges {$client_ip_range}++ ; |
| 546 | + } |
| 547 | + |
| 548 | + if ($status =~ /^TCP/) |
| 549 | + { |
| 550 | + $statusses {"$method:$status"}++ ; |
| 551 | + $statusses {"$method:total"}++ ; |
| 552 | + } |
| 553 | + else |
| 554 | + { $statusses_non_tcp ++ ; } |
| 555 | + |
| 556 | + if ($url =~ /org\/skins/o) |
| 557 | + { |
| 558 | + ($url2 = $url) =~ s/^.*?\/skins/skins/o ; |
| 559 | + $skins {$url2} ++ ; |
| 560 | + } |
| 561 | + |
| 562 | + if ($url =~ /^upload\.wikimedia\.org\//o) # count image size if applicable |
| 563 | + { &ProcessUploadPath ($url) ; } |
| 564 | + |
| 565 | + ($url2 = $url) =~ s/\.php\?.*$/\.php\?../go ; |
| 566 | + ($domain,$location) = split ('\/',$url2,2) ; |
| 567 | + $domain_original = $domain ; |
| 568 | + |
| 569 | + # for diagnostics |
| 570 | + if (($referer =~ /google/o) || ($agent =~ /google/io)) |
| 571 | + { $googles++ ; } |
| 572 | + |
| 573 | + $referer =~ s/^http\w?\:\/\///o ; |
| 574 | + $referer =~ s/\.php\?.*$/\.php\?../go ; |
| 575 | + $referer =~ s/\/.*$//o ; |
| 576 | + $referer_original = $referer ; |
| 577 | + |
| 578 | + # $domain_mobile = $false ; |
| 579 | + # if ($domain =~ /m\.wikipedia/o) |
| 580 | + # { |
| 581 | + # print "Domain 1 $domain\n" ; |
| 582 | + # $domain_mobile = $true ; |
| 583 | + # } |
| 584 | + |
| 585 | + $domain = &Abbreviate ($domain) ; |
| 586 | + if (($domain =~ /\./o) || |
| 587 | + ($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o)) |
| 588 | + { |
| 589 | + $unrecognized_domains {$domain_original} ++ ; |
| 590 | + $domain = 'other' ; |
| 591 | + } |
| 592 | + |
| 593 | + # if ($domain_mobile) |
| 594 | + # { print "Domain 2 $domain\n" ; } |
| 595 | + |
| 596 | + # $referer_mobile = $false ; |
| 597 | + # if ($referer =~ /m\.wikipedia/o) |
| 598 | + # { |
| 599 | + # print "Referer 1 $referer\n" ; |
| 600 | + # $referer_mobile = $true ; |
| 601 | + # } |
| 602 | + |
| 603 | + $referer = &Abbreviate ($referer) ; |
| 604 | + $referer_external = ($referer !~ /^[\*\@]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o) ; |
| 605 | + |
| 606 | + if ($referer_external) |
| 607 | + { |
| 608 | + $tot_referers_external++ ; |
| 609 | + |
| 610 | + ($origin, $toplevel) = &DetectOrigin ($client_ip, $referer_original, $agent, $mime, $mimecat, $service, $ext) ; |
| 611 | + |
| 612 | + &CountOrigin ("external", $origin, $toplevel, $mimecat) ; |
| 613 | + |
| 614 | + if ($origin !~ /^\!/o) |
| 615 | + { $origins_unsimplified {$referer_original} ++ ; } |
| 616 | + else |
| 617 | + { |
| 618 | + $origin_simplified {"$origin [$referer] <- $referer_original"} ++ ; |
| 619 | + $origins_external {$origin} ++ ; |
| 620 | + } |
| 621 | + } |
| 622 | + else |
| 623 | + { |
| 624 | + $tot_referers_internal ++ ; |
| 625 | + $referers_internal {$referer} ++ ; |
| 626 | + $referer =~ s/!//go ; # ! was marker to signal pattern was recognized as wikimedia project |
| 627 | + &CountOrigin ("internal", $referer, "org" , $mimecat) ; |
| 628 | + } |
| 629 | + |
| 630 | + $domain =~ s/!//o ; |
| 631 | + $referer =~ s/!//o ; |
| 632 | + $domain =~ s/\:\d+$//o ; # remove port number |
| 633 | + $referer =~ s/\:\d+$//o ; # remove port number |
| 634 | + if ($domain =~ /!/o) |
| 635 | + { print ERR "still ! in domain: '$domain' <- '$domain_original'\n" ; } |
| 636 | + |
| 637 | + $requests {"$domain|$referer|$ext|$mime|$parm"}++ ; |
| 638 | + |
| 639 | + $clients_by_wiki {"$mobile,$version,$domain"}++ ; |
| 640 | + |
| 641 | + if ($bot) |
| 642 | + { $ind_bot = 'bot=Y' ; } |
| 643 | + else |
| 644 | + { $ind_bot = 'bot=N' ; } |
| 645 | + |
| 646 | + if (($domain =~ /^\@/) || ($domain =~ /^\*/)) |
| 647 | + { |
| 648 | + # print "Requests wap $domain | $ext | $mime | $parm | $country | $ind_bot\n" ; |
| 649 | + $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ; |
| 650 | + } |
| 651 | + |
| 652 | + if ($domain =~ /^\%/) |
| 653 | + { |
| 654 | + # print "Requests m $domain | $ext | $mime | $parm | $country | $ind_bot\n" ; |
| 655 | + $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ; |
| 656 | + } |
| 657 | + # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name) |
| 658 | + if (($url =~ /index.php\?/o) && ($title !~ /:/o) && ($mime eq "text/html") && (($url =~ /action=edit/o) || ($url =~ /action=submit/o))) |
| 659 | + { |
| 660 | + |
| 661 | + if (($referer ne "-") && ($referer !~ /^..:/o)) |
| 662 | + { $referer = "ext" ; } |
| 663 | + |
| 664 | + $key = "$client_ip|$ind_bot|$domain|$referer|$status|$mime|$parm" ; |
| 665 | + $key =~ s/,/,/go ; |
| 666 | + $key =~ s/\|/,/go ; |
| 667 | + |
| 668 | + $index_php_raw {$key}++ ; |
| 669 | + $client_ip_record_cnt {$client_ip}++ ; |
| 670 | + } |
| 671 | + |
| 672 | + if ($mimecat eq "page") |
| 673 | + { |
| 674 | + $tot_mime_html2 ++ ; |
| 675 | + |
| 676 | + if (($ind_bot =~ /N/) and ($ip_frequencies {$client_ip} > 2)) |
| 677 | + { $ind_bot = 'bot=Y' ; } |
| 678 | + |
| 679 | + $countries_views {"$ind_bot,$domain,$country"} ++ ; |
| 680 | + |
| 681 | + # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name) |
| 682 | + if (($url =~ /index.php\?/o) && ($title !~ /:/) && ($mime eq "text/html") && ($url =~ /action=submit/o) && ($status =~ /302/o)) |
| 683 | + { $countries_saves {"$ind_bot,$domain,$country"} ++ ; } |
| 684 | + |
| 685 | + $time_hh = substr ($time,11,2) ; |
| 686 | + $time_mm = substr ($time,14,2) ; |
| 687 | + $time_tt = $time_hh * 60 + $time_mm ; |
| 688 | + $time_tt2 = $time_tt - $time_tt % 15 ; |
| 689 | + $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} ++ ; |
| 690 | + } |
| 691 | +} |
| 692 | + |
# ExtractLanguage ($agent, $application)
#
# Looks in the user agent string for a language code (e.g. "en", "en-US", "pt-br")
# that is preceded by "; " and followed by ";" or ")".
#
# - found    : counts it in %languages under key "$application,$language" and returns
#              the agent string with the first occurrence of " $language" removed
# - not found: counts the agent string in %languages_unrecognized and returns it unchanged
#
# Side effects: updates package globals %languages / %languages_unrecognized and
# (re)assigns package global $regexp_lang on every call.
sub ExtractLanguage
{
  my $agent       = shift ;
  my $application = shift ;
  my $language ;

  $regexp_lang = "[a-z]{2}(?:-[a-zA-Z]{2,3})?(?:-[a-zA-Z]{2,3})?" ;

  # $regexp_lang never changes, so compiling this pattern once (/o) is safe
  ($language = $agent) =~ s/^.*?; ($regexp_lang)[\);].*$/$1/o ;

  if ($language eq $agent) # substitution did not match -> no language code found
  { $languages_unrecognized {$agent} ++ ; }
  else
  {
    $languages {"$application,$language"} ++ ;

    # deliberately no /o here: $language differs per call, with /o the pattern would be
    # frozen to the language found on the very first call, so agents with any other
    # language would be counted but never have their language code stripped
    $agent =~ s/ \Q$language\E// ;
  }

  return ($agent) ;
}
| 710 | + |
# GetFileExt ($url)
#
# Returns a two element list ($file, $ext):
#   $file = last path segment of $url, query string ('?' and everything after it) removed
#   $ext  = lower cased extension of $file (text after the last dot), where
#           "none"    = $file contains no dot at all
#           "invalid" = extension is empty or contains anything but letters a-z / A-Z
#           "jp[e]g"  = jpg and jpeg folded into one bucket
sub GetFileExt
{
  my ($url) = @_ ;

  # strip query string, then everything up to and including the last slash
  (my $path = $url)  =~ s/\?.*$//o ;
  (my $file = $path) =~ s/^([^\/]*\/)+//o ;

  my $ext ;
  if (index ($file, '.') < 0)
  { $ext = "none" ; }
  else
  {
    # text behind last dot; when the pattern fails (e.g. name ends on a dot)
    # the full file name is tested, which then ends up as "invalid"
    if ($file =~ /^.*?\.([^\.]+)$/o)
    { $ext = $1 ; }
    else
    { $ext = $file ; }

    $ext = "invalid" if $ext =~ /[^a-zA-Z]/o ;
  }

  $ext = lc ($ext) ;
  if (($ext eq "jpg") || ($ext eq "jpeg"))
  { $ext = "jp[e]g" ; }

  return ($file, $ext) ;

  # obsolete alternate code ?
  # implied php request returns html
  # if ($url =~ /\/wiki\//o) { $ext = "html <- /wiki/" ; }
  # elsif ($url =~ /\.org\/?$/o) { $ext = "html <- *.org" ; }
  # elsif ($url =~ /\.com\/?$/o) { $ext = "html <- *.com" ; }
  # elsif ($url =~ /\/wiki\?title=/o) { $ext = "html <- /wiki?title=.." ; }
  #
  # if ($mime =~ /(?:xml|html)/o)
  # { $ext = "none (mimetype:$mime)" ; }
  # else
  # {
  # $url =~ s/\?.*$//o ;
  # ($file = $url) =~ s/^([^\/]*\/)+//o ; # drop path before file
  #
  # if ($file =~ /^[^\.]*$/o) # no extension
  # { $ext = "none (mimetype:$mime)" ;
  # print "\n\n$mime\n$line\n" ;
  # $ext = "none" ; }
  # else
  # {
  # ($ext = $file) =~ s/^.*?\.([^\.]+)$/$1/o ;
  # if ($ext =~ /[^a-zA-Z]/o)
  # { $ext = "invalid" ; }
  # }
  # }
  #
  # $ext = lc ($ext) ;
  # $ext =~ s/^(jpg|jpeg)$/jp[e]g/go ;
  #
  # return ($file, $ext) ;
}
| 762 | + |
# Normalize the query string of a url, so that requests which differ only in
# parameter order or parameter values are counted as one.
#
# Parameter:
#   $url - request url; the text after the first '?' is treated as query string
#
# Processing:
#   - leading/trailing ampersands are dropped, runs of ampersands are collapsed
#   - double quotes are replaced by single quotes
#   - parameters are sorted alphabetically
#   - parameter values are replaced by '..', except for
#       redlink (kept as is) and
#       action|ctype|gen|usemsgcache when the value consists of letters, '-', '_', '/' only
#
# Side effects: sets globals $invalid, @parms, $keyword, $data and (on error) $error.
#
# Returns ($url, $parm): normalized url and normalized query string,
# or ("?","?") when a parameter without '=' contains unexpected characters
# (the error is then printed to STDOUT and to file handle ERR).
sub NormalizeParms
{
  my $url = shift ;

  $invalid = $false ;
  my ($url2,$parm) = split ('\?', $url) ;
  $parm =~ s/^\&+//o ;
  $parm =~ s/\&+$//o ;
  $parm =~ s/\&\&+/\&/go ; # /g: collapse every run of ampersands, not just the first,
                           # otherwise empty parameters survive and the result starts with '&'
  $parm =~ s/\"/'/go ;     # invalid in url ?, accept for now
  @parms = split ('\&', $parm) ;
  @parms = sort @parms ;

  # $parm aliases the elements of @parms: edits below change the array in place
  foreach $parm (@parms)
  {
    next if $parm eq "" ;

    # a parameter without value should at least look like a plain keyword
    if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
    { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid = $true ; last }

    ($keyword,$data) = split ('\=', $parm) ;
    if ($keyword eq "")
    { $keyword = "[empty]" ; }
    if ($keyword ne "redlink")
    {
      if (($keyword !~ /^(?:action|ctype|gen|usemsgcache)$/) || ($data !~ /^[a-zA-Z\-\_\/]*$/o))
      { $parm =~ s/=.+/=../o ; } # show generalized version of parameter, without specifics
    }
  }

  if ($invalid)
  {
    print $error ;
    print ERR $error ;
    return ("?","?") ;
  }

  $parm = join ('&', @parms) ;
  $url = "$url2\?$parm" ;
  return ($url,$parm) ;
}
| 804 | + |
# Abbreviate a Wikimedia domain name to a short code like '!wp:en'.
#
# Parameter:
#   $domain - domain name (or a string starting with 'error:')
#
# Side effects:
#   - rebuilds global $legend, which explains the codes used
#   - counts domains starting with 'error:' in %domain_errors
#
# Returns the abbreviated domain. Each substitution is applied once, in the
# order listed; the order matters (e.g. 'wikimediafoundation' before 'wikimedia').
sub Abbreviate
{
  my ($domain) = @_ ;

  $legend = join ('',
    "# wx = wikispecial (commons|mediawiki|meta|foundation|species)\n",
    "# xx:upload = upload.wikimedia.org\n",
    "# wmf = wikimediafoundation\n",
    "# wb = wikibooks\n",
    "# wn = wikinews\n",
    "# wp = wikipedia\n",
    "# wq = wikiquote\n",
    "# ws = wikisource\n",
    "# wv = wikiversity\n",
    "# wk = wiktionary\n",
    "# wm = wikimedia\n",
    "# mw = mediawiki\n",
    "# \@ = .mobile.\n",
    "# \* = .wap.\n",
    "# \% = .m.\n") ;

  # bring domain into canonical form [language.]project.org
  $domain =~ s/www\.([^\.]+\.[^\.]+\.[^\.]+)/$1/ ;
  $domain =~ s/\.com/\.org/ ;
  $domain =~ s/^([^\.]+\.org)/www.$1/ ;

  if ($domain !~ /\.org/)
  { $domain =~ s/www\.(wik[^\.\/]+)\.([^\.\/]+)/$2.$1.org/ ; }

  # literal replacements: [pattern, replacement], applied top to bottom
  my @abbreviations =
  (
    # special wikis
    [qr/commons\.wikimedia\.org/,    '!wx:commons'],
    [qr/www\.mediawiki\.org/,        '!wx:mediawiki'],
    [qr/meta\.wikipedia\.org/,       '!wx:meta'],
    [qr/meta\.wikimedia\.org/,       '!wx:meta'],
    [qr/foundation\.wikimedia\.org/, '!wx:foundation'],
    [qr/species\.wikimedia\.org/,    '!wx:species'],
    [qr/upload\.wikimedia\.org/,     '!xx:upload'],

    # project names
    [qr/wikimediafoundation/,        '!wmf'],
    [qr/wikibooks/,                  '!wb'],
    [qr/wikinews/,                   '!wn'],
    [qr/wikipedia/,                  '!wp'],
    [qr/wikiquote/,                  '!wq'],
    [qr/wikisource/,                 '!ws'],
    [qr/wikiversity/,                '!wv'],
    [qr/wiktionary/,                 '!wk'],
    [qr/wikimedia/,                  '!wm'],
    [qr/mediawiki/,                  '!mw'],

    # mobile sub domains
    [qr/\.mobile\./,                 '.@'],
    [qr/\.wap\./,                    '.*'],
    [qr/\.m\./,                      '.%'],
  ) ;

  foreach my $abbreviation (@abbreviations)
  {
    my ($pattern, $code) = @$abbreviation ;
    $domain =~ s/$pattern/$code/ ;
  }

  if ($domain =~ /^error:/)
  { $domain_errors {$domain}++ ; }
  $domain =~ s/error:.*$/!error:1/ ;

  # language.project.org -> project:language
  $domain =~ s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/ ;

  $domain =~ s/\s//g ;

  return ($domain) ;
}
| 865 | + |
# Determine where a request originated from.
#
# This simplification is a rather loose approximation, not rigidly according
# to domain name standards, as that would require further study.
#
# Three reasons to count search engine 'xxx':
# 1 $referer contains 'xxx'
# 2 $client_ip is known to belong to 'xxx'
# 3 agent shows request (probably) came from 'xxx'
#
# Parameters: $client_ip, $referer, $agent, $mime, $mimecat, $service, $ext
#   ($service as passed in is overwritten by the result of MatchAgent)
#
# Side effects: sets globals $top_level_domain, $secondary_domain, and for
# google traffic $google_x/y/z, $googlematch, $referer2, $agent2, $accept;
# counts in %search, %googlebins, %googlebins2, %google_imposters, %wikis.
#
# Returns ($origin, $top_level_domain).
sub DetectOrigin
{
  my ($client_ip, $referer, $agent, $mime, $mimecat, $service, $ext) = @_ ;

  # remove port numbers
  $client_ip =~ s/\:\d+$// ;
  $referer   =~ s/\:\d+$// ;

  my $referer_original = $referer ;

  # without referer the client ip address is all we have
  my $origin = ($referer ne '-') ? $referer : $client_ip ;

  if (is_valid_ip_address ($client_ip)) # always ?
  { $client_ip = MatchIpRange ($client_ip) ; }

  if (is_valid_ip_address ($referer)) # never ?
  {
    $top_level_domain = "-" ;
    $referer = MatchIpRange ($referer) ;
  }
  else
  {
    $top_level_domain = GetTopLevelDomain ($referer) ;
    if ($top_level_domain eq "")
    {
      $secondary_domain = "invalid" ;
      $referer          = "invalid" ;
      $origin           = "invalid origin" ;
    }
    else
    { $secondary_domain = GetSecondaryDomain ($referer) ; }

    if ($secondary_domain eq "google")
    {
      # classify google referer by sub domain, top level domain is irrelevant here
      $referer =~ s/$pattern_url_post//o ;
      $referer =~ s/^${pattern_url_pre}maps\.google$/!google:maps/o ;
      $referer =~ s/^${pattern_url_pre}images\.google$/!google:image search/o ;
      $referer =~ s/^${pattern_url_pre}translate\.google$/!google:translate/o ;
      $referer =~ s/^${pattern_url_pre}mail\.google$/!google:mail/o ;
      $referer =~ s/^${pattern_url_pre}toolbar\.google$/!google:toolbar/o ;
      $referer =~ s/^${pattern_url_pre}gmodules$/!google:gmodules/o ;
      $referer =~ s/^${pattern_url_pre}google$/!google:web search/o ;
      $referer =~ s/^${pattern_url_pre}www\.google/!google:web search/o ;
      if ($referer !~ /!/)
      { print "google referer not recognized: '$referer_original'\n" ; }
    }

    # test code
    # if ($secondary_domain !~ /(?:-|google|yahoo)/o)
    # { print "$secondary_domain <= $referer\n" ; }
  }

  ($service,$agent) = MatchAgent ($agent, $client_ip, $mime, $ext) ;

  if (($top_level_domain eq "-") && ($client_ip =~ /!google:ip/i))
  { $top_level_domain = "ip:$service" ; }

  # which of the three clues point to google ?
  my $google_by_ip      = ($client_ip =~ /!.*google/i) ;
  my $google_by_referer = ($referer   =~ /!.*google/i) ;
  my $google_by_agent   = ($agent     =~ /!.*google/i) ;

  if ($google_by_ip || $google_by_referer || $google_by_agent)
  {
    if ($google_by_referer)
    { $origin = "google (by referer)" ; } # $referer_original ; }
    elsif ($google_by_ip)
    { $origin = "google (by ip)" ; }
    else
    { $origin = "google (by agent)" ; }

    $google_x = $google_by_ip      ? "x" : "-" ;
    $google_y = $google_by_referer ? "y" : "-" ;
    $google_z = $google_by_agent   ? "z" : "-" ;

    $googlematch = "$google_x $google_y $google_z" ;

    $referer2 = $referer ;
    if ($referer2 !~ /^!.*google:/i)
    { $referer2 = ".." ; }
    else
    { $referer2 =~ s/^!google:// ; }

    $agent2 = $agent ;
    if ($agent2 !~ /^!.*google:/i)
    { $agent2 = ".." ; }
    else
    { $agent2 =~ s/^!google:// ; }

    $top_level_domain =~ s/^.*\.// ; # co.uk -> uk

    # agent did not tell which service: derive it from the referer
    if (($service eq "..") && ($referer =~ /!google:/) && ($referer !~ /!google:ip/))
    { ($service = $referer) =~ s/^.*?:(.*$)/ucfirst($1)/e ; }

    if (($service eq "GoogleBot") && (! $google_by_ip))
    { $service = "GoogleBot?" ; }

    $service =~ s/^\.\.$/Other/ ;

    # only found in agent string -> except Google Earth and Google Desktop, ignore others (Toolbar , GoogleBot)
    $accept = " " ;
    if (($googlematch eq "- - z") && ($service =~ /GoogleBot/i))
    {
      $service = "GoogleBot?" ;
      $google_imposters {$agent}++ ;
    }

    # obsolete? to be considered ?
    # if (($googlematch ne "- - z") || ($service =~ /(?:Earth|Desktop)/o))
    # { $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ; }
    # else
    # { $accept = "not" ; }

    $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ;

    $googlebins2 {"$accept [$googlematch] " . sprintf ("%-14s",$service) . $referer} ++ ;
    $googlebins {$googlematch}++ ;
  }

  # test only: make yahoo's treatment of languages look like google's
  # $origin =~ s/^([a-zA-Z0-9-]+)\.([a-zA-Z0-9-]+\.yahoo.com)/$2.$1/o ;

  $origin =~ s/^localhost(\:.*)?$/!localhost/ ;
  $origin =~ s/\:\d+$// ; # remove port number

  # $origin =~ s/${pattern_url_pre}mail\.live$/!microsoft live mail/o ;
  # $origin =~ s/${pattern_url_pre}msn.$/!microsoft MSN/o ;
  # $origin =~ s/${pattern_url_pre}msdn.$/!microsoft MSDN/o ;

  # $origin =~ s/${pattern_url_pre}dailynews\.yahoo$/!yahoo news/o ;
  # $origin =~ s/${pattern_url_pre}mail\.yahoo$/!yahoo mail/o ;
  # $origin =~ s/${pattern_url_pre}search.yahoo$/!yahoo search/o ;

  # if (($origin !~ /^ip:!/o) && ($origin !~ /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})/o))
  # {
  #   $origin =~ s/${pattern_url_pre}([a-zA-Z0-9-]+)$/!$1/o ;
  #   print "$origin\n" ;
  # }

  if ($origin =~ /wiki/)
  { $wikis {$origin} ++ ; }

  if ($origin eq "wikipedia")
  {
    # print "incomplete origin: $origin <= $referer_original\n$line\n\n" ;
    $origin = "!error:4" ;
  }

  return ($origin, $top_level_domain) ;
}
| 1020 | + |
# Recognize Google services from the user agent string.
#
# Parameters:
#   $agent     - user agent string
#   $client_ip - client ip address (only used for the %googlebots key)
#   $mime      - mime type          (only used for the %googlebots key)
#   $ext       - file extension     (only used for the %googlebots key)
#
# Side effects: sets globals $client_ip_range and $service; for agents that
# mention google, counts "$agent,$client_ip_range,$service,$mime,$ext" in %googlebots.
#
# Returns ($service, $agent):
#   agent does not mention google -> ('..', unchanged agent)
#   otherwise                     -> service name and agent label starting with '!'
sub MatchAgent
{
  my ($agent, $client_ip, $mime, $ext) = @_ ;

  # drop last two numbers of ip address
  $client_ip_range = $client_ip ;
  $client_ip_range =~ s/\.\d+\.\d+$// ;

  $service = '..' ;

  if ($agent =~ /google/i)
  {
    # [pattern, service, agent label], first match wins
    # label undef: label is taken from the agent string itself
    my @signatures =
    (
      [qr/Googlebot/i,                        "GoogleBot",   "!GoogleBot"],
      [qr/FeedFetcher-Google/i,               "FeedFetcher", "!FeedFetcher-Google"],
      [qr/Google.*?Wireless.*?Transcoder/i,   "Wireless",    "!GoogleWirelessTranscoder"],
      [qr/Google.*?Desktop/i,                 "Desktop",     "!GoogleDesktop"],
      [qr/GoogleEarth/i,                      "Earth",       "!GoogleEarth"],
      [qr/GoogleToolbar/i,                    "Toolbar",     "!GoogleToolbar"],
      [qr/Google.*?Keyword.*?Tool/i,          "KeywordTool", "!GoogleKeywordTool"],
      [qr/GoogleT\d/i,                        "Toolbar",     undef],
      [qr/translate\.google\.com/i,           "Translate",   "!GoogleTranslate"],
    ) ;

    my $recognized = 0 ;
    foreach my $signature (@signatures)
    {
      my ($pattern, $name, $label) = @$signature ;
      next if $agent !~ $pattern ;

      $service = $name ;
      if (defined $label)
      { $agent = $label ; }
      else
      { $agent =~ s/^.*?(GoogleT\d+).*$/"!".$1/e ; } # e.g. !GoogleT5

      $recognized = 1 ;
      last ;
    }

    if (! $recognized)
    { $service = "Other" ; $agent = "!GoogleOther" ; }

    $googlebots {"$agent,$client_ip_range,$service,$mime,$ext"} ++ ;
  }

# if ($agent =~ /yahoo/io)
# {
#   if ($agent =~ /ysearch\/slurp/o)
#   { $service = "bot" ; $agent = "!YahooBot" ; }

#   @yahoobots {"$agent,$client_ip_range,$mime,$ext"} ++ ;
# }

  return ($service, $agent) ;
}
| 1057 | + |
# Replace an ip address by '!google:ip' or '!yahoo:ip' when it falls into one
# of the ip ranges listed below.
#
# Parameter:
#   $address - ip address, port number optional
#
# Side effect: sets global $address_11 (first 11 characters of padded address).
#
# Returns '!google:ip', '!yahoo:ip', or, when no range matches, the address
# without port number and with each of its four numbers zero padded to three
# digits (e.g. 192.168.001.001), which makes plain string comparison valid.
sub MatchIpRange
{
  my ($address) = @_ ;

  $address =~ s/\:.*$// ; # remove port number

  # test code
  # $address_original = $address ;

  $address =~ s/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/sprintf("%03d",$1).".".sprintf("%03d",$2).".".sprintf("%03d",$3).".".sprintf("%03d",$4)/e ;
  $address_11 = substr ($address,0,11) ;

  # [compare first 11 characters only ?, lowest, highest, label], first match wins
  my @ranges =
  (
    [1, "064.233.160",     "064.233.191",     "!google:IP064"],
    [1, "066.249.064",     "066.249.095",     "!google:IP066"],
    [1, "066.102.000",     "066.102.015",     "!google:IP066"],
    [1, "072.014.192",     "072.014.255",     "!google:IP072"],
    [1, "074.125.000",     "074.125.255",     "!google:IP074"],
    [1, "209.085.128",     "209.085.255",     "!google:IP209"],
    [1, "216.239.032",     "216.239.063",     "!google:IP216"],
    [0, "070.089.039.152", "070.089.039.159", "!google:IP070"],
    [0, "070.090.219.072", "070.090.219.079", "!google:IP070"],
    [0, "070.090.219.048", "070.090.219.055", "!google:IP070"],

    [1, "067.195.000",     "067.195.255",     "!yahoo:IP067"],
    [1, "072.030.000",     "072.030.255",     "!yahoo:IP072"],
    [1, "074.006.000",     "074.006.255",     "!yahoo:IP074"],
    [1, "209.191.064",     "209.191.127",     "!yahoo:IP209"],
  ) ;

  foreach my $range (@ranges)
  {
    my ($prefix_only, $lowest, $highest, $label) = @$range ;
    my $probe = $prefix_only ? $address_11 : $address ;
    if (($probe ge $lowest) && ($probe le $highest))
    {
      $address = $label ;
      last ;
    }
  }

  $address =~ s/IP\d+/ip/ ; # no need for detailed ranges for now

  # test code
  # @fields = split ('\.', $address) ;
  # foreach $field (@fields)
  # { $field =~ s/^0+(\d)/$1/o ; }
  # $address2 = join ('.', @fields) ;
  # if ($address2 ne $address_original)
  # { print "MatchIpRange: '$address2' <- $address_original\n" ; }

  return ($address) ;
}
| 1098 | + |
| 1099 | +# see http://en.wikipedia.org/wiki/Domain_name |
# Determine the top level domain of a domain name.
#
# Parameter:
#   $domain - domain name, port number optional
#
# Side effect: the result is also stored in global $top_level_domain.
#
# Returns
#   '-'          when domain is '-'
#   'localhost'  when domain contains 'localhost'
#   ''           when domain does not contain a dot with text on both sides
#   '-other-'    when global pattern $pattern_url_post does not match
#   otherwise the part of the domain matched by $pattern_url_post
sub GetTopLevelDomain
{
  my ($domain) = @_ ;
  $domain =~ s/\:\d+$// ; # remove port number

  my $found ;
  if ($domain eq '-')
  { $found = '-' ; }
  elsif ($domain =~ /!?localhost/)
  { $found = 'localhost' ; }
  elsif ($domain !~ /.+\..+/)
  { $found = '' ; }
  else
  {
    $found = $domain ;
    $found =~ s/^.*?($pattern_url_post)/$1/o ;
    $found = '-other-' if $found eq $domain ; # nothing stripped -> pattern did not match
  }

  $top_level_domain = $found ;
  return ($top_level_domain) ;
}
| 1119 | + |
# Determine the secondary domain of a domain name, e.g. 'google' for 'www.google.com'.
#
# Parameter:
#   $domain - domain name, port number optional
#
# Returns the domain without port number when it contains no dot; otherwise
# the last dot separated part that remains after the top level domain
# (as matched by global pattern $pattern_url_post) has been removed.
sub GetSecondaryDomain
{
  my ($domain) = @_ ;
  $domain =~ s/\:\d+$// ; # remove port number

  if ($domain =~ /\./)
  {
    $domain =~ s/$pattern_url_post//o ; # strip top level domain
    $domain =~ s/^.*?\.([^\.]+)$/$1/ ;  # strip sub domains
  }

  return ($domain) ;
}
| 1132 | + |
# Count one request in %origins, key "$source,$origin,$toplevel,$mimecat".
#
# Parameters:
#   $source   - only 'external' gets special treatment, see below
#   $origin   - origin of the request
#   $toplevel - top level domain of the origin
#   $mimecat  - mime category of the request
#
# For external sources $tot_origins_external_counted is incremented and the
# origin is simplified first:
#   ip address           -> 'unmatched ip address' (top level domain cleared)
#   starts with !error   -> 'invalid origin'       (top level domain cleared)
#   starts with !localhost -> 'localhost'          (top level domain cleared)
#   anything else        -> secondary domain, see GetSecondaryDomain
sub CountOrigin
{
  my ($source, $origin, $toplevel, $mimecat) = @_ ;

  if ($source eq "external")
  {
    $tot_origins_external_counted ++ ;
    $origin =~ s/\:.*$// ;

    if (is_valid_ip_address ($origin))
    { ($origin, $toplevel) = ("unmatched ip address", "") ; }
    elsif ($origin =~ /^!error/)
    { ($origin, $toplevel) = ("invalid origin", "") ; }
    elsif ($origin =~ /^!localhost/)
    { ($origin, $toplevel) = ("localhost", "") ; }
    else
    {
      # other '!' codes are not expected here, report them
      if (($origin =~ /!/) && ($origin !~ /!error/))
      { print "CountOrigin: $origin\n" ; }

      $origin = GetSecondaryDomain ($origin) ;
      # print "$origin\n" ;
    }
  }

  $origins {"$source,$origin,$toplevel,$mimecat"} ++ ;
}
| 1160 | + |
# Collect counts for a request to the upload server.
#
# Parameter:
#   $url - url of the uploaded file
#
# Side effects:
#   - counts the file name in %binaries
#   - for gif/jpg/jpeg/png/svg files counts the image size in %imagesizes:
#     key is a range of 20 pixels (e.g. '  120-  139') taken from '<n>px' in
#     the file name, or '???' when the file name contains no such size
sub ProcessUploadPath
{
  my ($url) = @_ ;
  my ($folder, $size, $sizerange) ;

  my $path = $url ;
  $path =~ s/^.*?\.org\/// ;          # remove everything up to first '.org/'

  my $file = $path ;
  $file =~ s/^.*\/([^\/]*)$/$1/g ;    # remove path

  $binaries {$file} ++ ;

  if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/i)
  {
    # note: $folder is derived here but not used further in this sub
    $folder = $path ;
    $folder =~ s/\/[^\/]*$/\// ;                       # remove file
    $folder =~ s/\/[^\/]{1,1}\/[^\/]{2,2}\/.*$// ;     # remove /x/yy/ part and beyond
    $folder =~ s/\/[^\/]{1,1}\/[^\/]{2,2}\/.*$// ;     # remove /x/yy/ part and beyond, can occur twice (in thumbs)
    $folder =~ s/\/thumb// ;
    $folder =~ s/^math\/.*$/math/ ;
    # print "$folder <- $upload\n" ;

    if ($file =~ /\d+px/)
    {
      $size = $file ;
      $size =~ s/^.*?(\d+)px.*$/$1/ ;

      my $bin = int ($size / 20) ;    # ranges are 20 pixels wide
      $sizerange = sprintf ("%5d", $bin * 20) . "-" . sprintf ("%5d", ($bin + 1) * 20 - 1) ;
      $imagesizes {$sizerange} ++ ;
    }
    else
    { $imagesizes {"???"} ++ ; }
  }
}
| 1188 | + |
| 1189 | +1; |
Index: trunk/wikistats/squids/SquidReportArchive.pl |
— | — | @@ -7,90 +7,10 @@ |
8 | 8 | |
9 | 9 | # $quarter_only = '2010 Q3' ; # if not empty filter process for this quarter only |
10 | 10 | |
11 | | - # set defaults mainly for tests on local machine |
12 | | -# default_argv "-m 201009 " ; |
| 11 | +# set defaults mainly for tests on local machine |
| 12 | +# default_argv "-m 201010 " ; |
13 | 13 | default_argv "-c " ; |
14 | 14 | |
15 | | -# $html = "<html><body bgcolor=black><table>" ; |
16 | | -# for ($i = 4 ; $i >= 0 ; $i-=0.5) |
17 | | -# { |
18 | | -# ($requests,$ratio,$fill) = RatioAndFillColor1 ('',$i,4, $ratio_sqrt) ; |
19 | | -# print sprintf ("%.1f",$i) . ": $fill\n" ; |
20 | | -# $i2 = sprintf ("%0.1f", $i) ; |
21 | | -# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15> </td><td width=50 style=\"background:$fill\"> </td><td width=15> </td><td><font color=grey> $fill</font></td></tr>" ; |
22 | | -# } |
23 | | -# $html .= "<tr><td height=30 colspan=99> </td></tr>" ; |
24 | | -# for ($i = 4 ; $i >= 0 ; $i-=0.5) |
25 | | -# { |
26 | | -# ($requests,$ratio,$fill) = RatioAndFillColor2 ('',$i,4, $ratio_sqrt) ; |
27 | | -# print sprintf ("%.1f",$i) . ": $fill\n" ; |
28 | | -# $i2 = sprintf ("%0.1f", $i) ; |
29 | | -# $html .= "<tr><td align=right><font color=grey>$i2</font></td><td width=15> </td><td width=50 style=\"background:$fill\"> </td><td width=15> </td><td><font color=grey> $fill</font></td></tr>" ; |
30 | | -# } |
31 | | -# $html .= "</table><body></html>" ; |
32 | | -# open HTML, '>', 'color_range2.html' ; |
33 | | -# print HTML $html ; |
34 | | -# close HTML ; |
35 | | -# exit ; |
36 | | - |
37 | | -#sub RatioAndFillColor1 |
38 | | -#{ |
39 | | -# my ($code, $requests,$requests_max) = @_ ; |
40 | | -# my ($ratio,$green,$red,$blue,$fill) ; |
41 | | - |
42 | | -# if ($requests > $requests_max) |
43 | | -# { $requests = $requests_max ; } |
44 | | - |
45 | | -# $ratio = sqrt ($requests / $requests_max) ; |
46 | | -# if ($ratio >= 0.20) |
47 | | -# { |
48 | | -# $green = 180 ; |
49 | | -# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ; |
50 | | -# $blue = int ($green / 3) ; |
51 | | -# } |
52 | | -# else |
53 | | -# { |
54 | | -# $red = 220 ; |
55 | | -# $green = int (0.5 + 220 * 5 * $ratio) ; |
56 | | -# $blue = 0 ; #int ($green / 2) ; |
57 | | -# } |
58 | | - |
59 | | -# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ; |
60 | | -# $fill = lc hsv2rgb($ratio*150,0.67+$ratio*0.33,0.8-0.2*$ratio) ; |
61 | | - |
62 | | -# $fills {lc $code} = $fill ; |
63 | | -# return ($requests,$ratio,$fill) ; |
64 | | -#} |
65 | | - |
66 | | -#sub RatioAndFillColor2 |
67 | | -#{ |
68 | | -# my ($code, $requests,$requests_max) = @_ ; |
69 | | -# my ($ratio,$green,$red,$blue,$fill) ; |
70 | | - |
71 | | -# if ($requests > $requests_max) |
72 | | -# { $requests = $requests_max ; } |
73 | | - |
74 | | -# $ratio = $requests / $requests_max ; |
75 | | -# if ($ratio >= 0.20) |
76 | | -# { |
77 | | -# $green = 180 ; |
78 | | -# $red = 180 - int (0.5 + 180 * 5/4 * ($ratio-0.20)) ; |
79 | | -# $blue = int ($green / 3) ; |
80 | | -# } |
81 | | -# else |
82 | | -# { |
83 | | -# $red = 220 ; |
84 | | -# $green = int (0.5 + 220 * 5 * $ratio) ; |
85 | | -# $blue = 0 ; #int ($green / 2) ; |
86 | | -# } |
87 | | - |
88 | | -# $fill = "\#" . sprintf ("%02x%02x%02x",$red,$green,$blue) ; |
89 | | -# $fill = lc hsv2rgb($ratio*150,1-$ratio*0.334,0.6) ; |
90 | | - |
91 | | -# $fills {lc $code} = $fill ; |
92 | | -# return ($requests,$ratio,$fill) ; |
93 | | -#} |
94 | | - |
95 | 15 | # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs |
96 | 16 | # ReportOrigin how to handle '!error <-> other |
97 | 17 | # SquidReportOrigins.htm total count<->alpha are not the same (+ skip total for "google (total)") |
— | — | @@ -104,8 +24,6 @@ |
105 | 25 | |
106 | 26 | # http://www.linux.com/community/blogs/Convert-a-.svg-file-to-a-.png-in-Ubuntu.html |
107 | 27 | |
108 | | -# use CGI::Carp qw(fatalsToBrowser); |
109 | | -# use Getopt::Std ; |
110 | 28 | use Time::Local ; |
111 | 29 | use Cwd; |
112 | 30 | |
— | — | @@ -117,22 +35,24 @@ |
118 | 36 | if (-d "/a/squid") |
119 | 37 | { |
120 | 38 | print "\n\nJob runs on server $hostname\n\n" ; |
121 | | - $path_root = "/a/ezachte" ; |
| 39 | + $path_in = "/a/ezachte" ; |
| 40 | + $path_out = "/a/ezachte" ; |
122 | 41 | } |
123 | 42 | elsif ($hostname eq 'bayes') |
124 | 43 | { |
125 | 44 | print "\n\nJob runs on server $hostname\n\n" ; |
126 | | - $path_root = "/home/ezachte/wikistats/animation" ; |
| 45 | + $path_in = "/home/ezachte/wikistats/animation" ; |
| 46 | + $path_out = "/home/ezachte/wikistats/animation" ; |
127 | 47 | } |
128 | 48 | else |
129 | 49 | { |
130 | 50 | print "Job runs local for tests\n\n" ; |
131 | | - $path_root = "W:/! Perl/Squids/Archive/test5" ; |
| 51 | + $path_in = "W:/# Out Locke" ; |
| 52 | + $path_out = "W:/# Out Test/Locke" ; |
132 | 53 | } |
133 | | - $path_in = $path_root ; |
134 | | - $path_out = $path_root ; |
135 | 54 | |
136 | | - print "Path root = $path_root\n" ; |
| 55 | + print "Path in = $path_in\n" ; |
| 56 | + print "Path out = $path_out\n" ; |
137 | 57 | |
138 | 58 | # periodically harvest updated metrics from |
139 | 59 | # 'http://en.wikipedia.org/wiki/List_of_countries_by_population' |
— | — | @@ -149,20 +69,21 @@ |
150 | 70 | |
151 | 71 | &InitProjectNames ; |
152 | 72 | |
| 73 | + $file_csv_country_codes = "CountryCodes.csv" ; |
| 74 | + $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ; |
| 75 | + |
| 76 | + &ReadInputCountriesNames ; |
| 77 | + |
153 | 78 | if ($reportcountries) |
154 | 79 | { |
155 | 80 | $project_mode = "wp" ; |
156 | 81 | |
157 | | - $file_csv_country_codes = "CountryCodes.csv" ; |
158 | | - $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ; |
159 | | - |
160 | | - &ReadInputCountriesNames ; |
161 | 82 | &ReadInputCountriesMeta ; |
162 | 83 | |
163 | 84 | &CollectRegionCounts ; |
164 | 85 | |
165 | 86 | &ReportCountries ('Saves'); |
166 | | - &ReportCountries ('Views'); |
| 87 | +# &ReportCountries ('Views'); |
167 | 88 | |
168 | 89 | exit ; |
169 | 90 | } |
— | — | @@ -170,12 +91,10 @@ |
171 | 92 | $reportdaysback = $options {"d"} ; |
172 | 93 | $reportmonth = $options {"m"} ; |
173 | 94 | |
174 | | - if (($reportmonth !~ /^\d{6}$/) && ($reportdaysback !~ /^-\d+/)) |
175 | | - { print "Specify month as -m yyyymm or days back as -d -[days] (e.g. -d -1 for yesterday)" ; exit ; } |
| 95 | + if (($reportmonth !~ /^\d\d\d\d-\d\d$/) && ($reportdaysback !~ /^-\d+$/)) |
| 96 | + { print "Specify month as -m yyyy-mm or days back as -d -[days] (e.g. -d -1 for yesterday)" ; exit ; } |
176 | 97 | |
177 | | - if ($reportmonth =~ /^\d{6}$/) |
178 | | - { $reportmonth = substr ($reportmonth,0,4) . "-" . substr ($reportmonth,4,2) ; } |
179 | | - else |
| 98 | + if ($reportdaysback =~ /^-\d+$/) |
180 | 99 | { |
181 | 100 | ($sec,$min,$hour,$day,$month,$year) = localtime (time+$reportdaysback*86400) ; |
182 | 101 | $reportmonth = sprintf ("%04d-%02d",$year+1900,$month+1) ; |
— | — | @@ -225,27 +144,16 @@ |
226 | 145 | $file_csv_clients = "public/SquidDataClients.csv" ; |
227 | 146 | $file_csv_google_bots = "public/SquidDataGoogleBots.csv" ; |
228 | 147 | $file_csv_indexphp = "public/SquidDataIndexPhp.csv" ; |
229 | | - $file_csv_countries_languages_visited = "public/SquidDataCountriesViews.csv" ; |
230 | | - $file_csv_countries_timed = "public/SquidDataCountriesViewsTimed.csv" ; |
231 | 148 | $file_csv_browser_languages = "public/SquidDataLanguages.csv" ; |
232 | 149 | |
| 150 | + $file_csv_countries_languages_visited = "SquidDataCountriesViews.csv" ; |
| 151 | + $file_csv_countries_timed = "SquidDataCountriesViewsTimed.csv" ; |
| 152 | + |
233 | 153 | print "\n\nJob SquidReportArchive.pl\n\n" ; |
234 | 154 | |
235 | | -# if (! -d "/a/squid") |
236 | | -# { |
237 | | -# if (! -e $file_csv_requests) { $file_csv_requests =~ s/\./Test./ } |
238 | | -# if (! -e $file_csv_methods) { $file_csv_methods =~ s/\./Test./ } |
239 | | -# if (! -e $file_csv_skins) { $file_csv_skins =~ s/\./Test./ } |
240 | | -# if (! -e $file_csv_scripts) { $file_csv_scripts =~ s/\./Test./ } |
241 | | -# if (! -e $file_csv_opsys) { $file_csv_opsys =~ s/\./Test./ } |
242 | | -# if (! -e $file_csv_origins) { $file_csv_origins =~ s/\./Test./ } |
243 | | -# if (! -e $file_csv_google) { $file_csv_google =~ s/\./Test./ } |
244 | | -# if (! -e $file_csv_crawlers) { $file_csv_crawlers =~ s/\./Test./ } |
245 | | -# } |
| 155 | + if (! -d "$path_in/$reportmonth") |
| 156 | + { print "Directory not found: $path_in\/$reportmonth\n" ; exit ; } |
246 | 157 | |
247 | | - if (! -d "$path_root/$reportmonth") |
248 | | - { print "Directory not found: $path_root\/$reportmonth\n" ; exit ; } |
249 | | - |
250 | 158 | # for ($month = 4 ; $month <= 10 ; $month ++) |
251 | 159 | # { |
252 | 160 | # $reportmonth = "2009-" . sprintf ("%02d", $month) ; |
— | — | @@ -255,7 +163,7 @@ |
256 | 164 | # last if ($month == 10) && ($day > 24) # temp code stay with DST summer time zone for SV |
257 | 165 | |
258 | 166 | $date = $reportmonth . "-". sprintf ("%02d", $day) ; |
259 | | - $dir = "$path_root/$reportmonth/$date" ; |
| 167 | + $dir = "$path_in/$reportmonth/$date" ; |
260 | 168 | |
261 | 169 | if (-d $dir) |
262 | 170 | { |
— | — | @@ -277,7 +185,8 @@ |
278 | 186 | if ($#dirs_process < 0) |
279 | 187 | { print "No valid data to process.\n" ; exit ; } |
280 | 188 | |
281 | | - $dir_reports = "$path_root/$reportmonth" ; |
| 189 | + $path_reports = "$path_in/$reportmonth" ; |
| 190 | + print "Write report to $path_reports\n" ; |
282 | 191 | |
283 | 192 | $google_ip_ranges = "<b>IP ranges:</b> known ip ranges for Google are 64.233.[160.0-191.255], 66.249.[64.0-95.255], 66.102.[0.0-15.255], 72.14.[192.0-255.255], <br>74.125.[0.0-255.255], " . |
284 | 193 | "209.085.[128.0-255.255], 216.239.[32.0-63.255] and a few minor other subranges</small><p>\n" ; |
— | — | @@ -288,7 +197,7 @@ |
289 | 198 | |
290 | 199 | # &ReadDate ; date range was read from csv file |
291 | 200 | |
292 | | - foreach $dir_process (@dirs_process) |
| 201 | + foreach $path_process (@dirs_process) |
293 | 202 | { |
294 | 203 | $days_input_found ++ ; |
295 | 204 | |
— | — | @@ -303,18 +212,21 @@ |
304 | 213 | &ReadInputSkins ; |
305 | 214 | &ReadInputIndexPhp ; |
306 | 215 | &ReadInputBrowserLanguages ; |
307 | | -# &ReadInputCountriesTimed ; |
| 216 | + &ReadInputCountriesTimed ; |
308 | 217 | } |
309 | 218 | |
310 | 219 | #&ReadCountryCodes ; |
311 | 220 | |
312 | | - print "\nDays input = $days_input_found\n" ; |
313 | | - $multiplier = 1 / $days_input_found ; |
314 | | - print "\nMultiplier = " . sprintf ("%.4f", $multiplier) . "\n" ; |
| 221 | + if ($days_input_found > 0) |
| 222 | + { |
| 223 | + print "\nDays input = $days_input_found\n" ; |
| 224 | + $multiplier = 1 / $days_input_found ; |
| 225 | + print "\nMultiplier = " . sprintf ("%.4f", $multiplier) . "\n" ; |
| 226 | + } |
| 227 | + else { print "\nDays input = 0 (zero!)\n" ; } |
315 | 228 | |
316 | | -#&WriteCsvCountriesTimed ; |
317 | | -#&WriteCsvCountriesGoTo ; |
318 | | -#exit ; |
| 229 | + &WriteCsvCountriesTimed ; |
| 230 | + &WriteCsvCountriesGoTo ; |
319 | 231 | |
320 | 232 | foreach $key (keys_sorted_alpha_desc %edit_submit) |
321 | 233 | { print "YYY " . sprintf ("%5d", $edit_submit {$key}) . ": $key\n" ; } |
— | — | @@ -324,7 +236,6 @@ |
325 | 237 | |
326 | 238 | print "\n\n" ; |
327 | 239 | |
328 | | - |
329 | 240 | foreach $domain (keys_sorted_by_value_num_desc %edit_submit_bot_sort) |
330 | 241 | { |
331 | 242 | $cnt = $edit_submit_bot_sort {$domain} ; |
— | — | @@ -385,18 +296,18 @@ |
386 | 297 | # &WriteCsvCountriesTimed ; |
387 | 298 | # &WriteCsvCountriesTargets ; |
388 | 299 | close "FILE_LOG" ; |
389 | | - print "\nReady\n\n" ; |
390 | 300 | |
391 | 301 | if (-d "/a/squid") |
392 | 302 | { |
393 | | -# $cmd = "tar -cf $dir_reports/$date_last\-csv.tar $dir_reports_in/*.csv | bzip2 $dir_reports/$date_last\-csv.tar" ; |
| 303 | +# $cmd = "tar -cf $path_reports/$date_last\-csv.tar $path_reports_in/*.csv | bzip2 $path_reports/$date_last\-csv.tar" ; |
394 | 304 | # print "cmd = '$cmd'\n" ; |
395 | 305 | # `$cmd` ; |
396 | | - $cmd = "tar -cf $dir_reports/$reportmonth\-html.tar $dir_reports/*.htm | bzip2 $dir_reports/$reportmonth\-html.tar" ; |
| 306 | + $cmd = "tar -cf $path_reports/$reportmonth\-html.tar $path_reports/*.htm | bzip2 $path_reports/$reportmonth\-html.tar" ; |
397 | 307 | print "cmd = '$cmd'\n" ; |
398 | 308 | `$cmd` ; |
399 | 309 | } |
400 | 310 | |
| 311 | + print "\nReady\n\n" ; |
401 | 312 | exit ; |
402 | 313 | |
403 | 314 | sub ReportCountries |
— | — | @@ -408,12 +319,14 @@ |
409 | 320 | $selection = 'PageViews' ; |
410 | 321 | $selection2 = 'Visits' ; |
411 | 322 | $views_edits = 'Page Views' ; |
| 323 | + $offset_links = 0 ; |
412 | 324 | } |
413 | 325 | else |
414 | 326 | { |
415 | 327 | $selection = 'PageEdits' ; |
416 | 328 | $selection2 = 'Saves' ; |
417 | 329 | $views_edits = 'Page Edits' ; |
| 330 | + $offset_links = 4 ; |
418 | 331 | } |
419 | 332 | |
420 | 333 | ($quarter_only2 = $quarter_only) =~ s/ // ; |
— | — | @@ -450,31 +363,36 @@ |
451 | 364 | |
452 | 365 | $title_main = "Wikimedia Traffic Analysis Report" ; |
453 | 366 | |
454 | | - $links = "<p> Also: <b>$views_edits Per Country</b> - " . |
| 367 | + $links = "<p> <b>Page Views Per Country</b> - " . |
455 | 368 | "<a href='$file_html_per_country_overview'>Overview</a> / " . |
456 | 369 | "<a href='$file_html_per_country_breakdown'>Breakdown</a> / " . |
457 | 370 | "<a href='$file_html_per_country_trends'>Trends</a>, " . |
458 | | - "<b>$views_edits Per Wikipedia Language - </b> " . |
| 371 | + "<b>Page Views Per Wikipedia Language - </b> " . |
459 | 372 | "<a href='$file_html_per_language_breakdown'>Breakdown</a>" ; |
460 | 373 | |
461 | | - $title = "$title_main - Wikipedia $views_edits Per Country - Overview" ; |
462 | | - &WriteReportPerCountryOverview ($title, $views_edits, &UnLink ($links,1)) ; ; |
| 374 | + ($links_views = $links) =~ s/Edits/Views/g ; |
| 375 | + ($links_edits = $links) =~ s/Views/Edits/g ; |
463 | 376 | |
464 | | - $title = "$title_main - Wikipedia $views_edits Per Country - Breakdown" ; |
465 | | - &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 100, $cutoff_percentage = 1, $show_logcount = $false) ; |
466 | | - &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,2),$cutoff_requests = 10, $cutoff_percentage = 0.1, $show_logcount = $true) ; |
| 377 | + $links = "$links_views\n$links_edits\n" ; |
467 | 378 | |
468 | | - $title = "$title_main - Wikipedia $views_edits Per Country - Trends" ; |
469 | | - &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,3)) ; |
| 379 | + $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Overview" ; |
| 380 | + &WriteReportPerCountryOverview ($title, $views_edits, &UnLink ($links,$offset_links+1)) ; |
470 | 381 | |
| 382 | + $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Breakdown" ; |
| 383 | + &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,$offset_links+2),$cutoff_requests = 100, $cutoff_percentage = 1, $show_logcount = $false) ; |
| 384 | + &WriteReportPerCountryBreakdown ($title, $views_edits, &UnLink ($links,$offset_links+2),$cutoff_requests = 10, $cutoff_percentage = 0.1, $show_logcount = $true) ; |
| 385 | + |
| 386 | + $title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Trends" ; |
| 387 | + &WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,$offset_links+3)) ; |
| 388 | + |
471 | 389 | $links =~ s/,.*$// ; |
472 | | - $title = "$title_main - $views_edits Per Wikipedia Language - Breakdown" ; |
473 | | - &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,4)) ; |
| 390 | + $title = "$title_main - <font color=#008000>$views_edits Per Wikipedia Language</font> - Breakdown" ; |
| 391 | + &WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,$offset_links+4)) ; |
474 | 392 | } |
475 | 393 | |
476 | 394 | sub ReadDate |
477 | 395 | { |
478 | | - open CSV_CRAWLERS, '<', "$dir_process/$file_csv_crawlers" ; |
| 396 | + open CSV_CRAWLERS, '<', "$path_process/$file_csv_crawlers" ; |
479 | 397 | $line = <CSV_CRAWLERS> ; |
480 | 398 | close CSV_CRAWLERS ; |
481 | 399 | # print "DATE LINE $line\n" ; |
— | — | @@ -541,11 +459,12 @@ |
542 | 460 | "body {font-family:arial,sans-serif; font-size:12px }\n" . |
543 | 461 | "h2 {margin:0px 0px 3px 0px; font-size:18px}\n" . |
544 | 462 | "table {font-size:12px ;}\n" . |
545 | | - "td {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top}\n" . |
| 463 | + "td {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:middle}\n" . |
546 | 464 | "th {white-space:nowrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:12px ; vertical-align:top ; font-width:bold}\n" . |
547 | 465 | "th.small {white-space:wrap; text-align:right; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:11px ; vertical-align:top ; font-width:bold}\n" . |
548 | | - "td.hl {text-align:left;}\n" . |
549 | | - "td.hr {text-align:right;}\n" . |
| 466 | + "td.hl {text-align:left;vertical-align:top;}\n" . |
| 467 | + "td.hr {text-align:right;vertical-align:top;}\n" . |
| 468 | + "td.hc {text-align:center;vertical-align:top;}\n" . |
550 | 469 | "td.r {text-align:right; border: inset 1px #FFFFFF}\n" . |
551 | 470 | "td.c {text-align:center; border: inset 1px #FFFFFF}\n" . |
552 | 471 | "td.l {text-align:left; border: inset 1px #FFFFFF}\n" . |
— | — | @@ -614,7 +533,7 @@ |
615 | 534 | { |
616 | 535 | chomp ($line) ; |
617 | 536 | ($code,$region,$north_south,$name) = split (',',$line,4) ; |
618 | | - $country_codes {$code} = $name ; |
| 537 | + $country_codes {$code} = unicode_to_html ($name) ; |
619 | 538 | # print "$code => $name\n" ; |
620 | 539 | } |
621 | 540 | } |
— | — | @@ -623,7 +542,7 @@ |
624 | 543 | |
625 | 544 | sub ReadInputClients |
626 | 545 | { |
627 | | - my $file_csv = "$dir_process/$file_csv_clients" ; |
| 546 | + my $file_csv = "$path_process/$file_csv_clients" ; |
628 | 547 | if (! -e $file_csv) |
629 | 548 | { abort ("Function ReadInputClients: file $file_csv not found!!!") ; } |
630 | 549 | open CSV_CLIENTS, '<', $file_csv ; |
— | — | @@ -712,7 +631,7 @@ |
713 | 632 | |
714 | 633 | sub ReadInputCrawlers |
715 | 634 | { |
716 | | - my $file_csv = "$dir_process/$file_csv_crawlers" ; |
| 635 | + my $file_csv = "$path_process/$file_csv_crawlers" ; |
717 | 636 | if (! -e $file_csv) |
718 | 637 | { abort ("Function ReadInputCrawlers: file $file_csv not found!!!\n") ; } |
719 | 638 | open CSV_CRAWLERS, '<', $file_csv ; |
— | — | @@ -767,7 +686,7 @@ |
768 | 687 | |
769 | 688 | sub ReadInputMethods |
770 | 689 | { |
771 | | - my $file_csv = "$dir_process/$file_csv_methods" ; |
| 690 | + my $file_csv = "$path_process/$file_csv_methods" ; |
772 | 691 | if (! -e $file_csv) |
773 | 692 | { abort ("Function ReadInputMethods: file $file_csv not found!!!") ; } |
774 | 693 | open CSV_METHODS, '<', $file_csv ; |
— | — | @@ -785,7 +704,7 @@ |
786 | 705 | |
787 | 706 | sub ReadInputMimeTypes |
788 | 707 | { |
789 | | - my $file_csv = "$dir_process/$file_csv_requests" ; |
| 708 | + my $file_csv = "$path_process/$file_csv_requests" ; |
790 | 709 | if (! -e $file_csv) |
791 | 710 | { abort ("Function ReadInputMimeTypes: file $file_csv not found!!!") ; } |
792 | 711 | open CSV_REQUESTS, '<', $file_csv ; |
— | — | @@ -872,7 +791,7 @@ |
873 | 792 | |
874 | 793 | sub ReadInputOpSys |
875 | 794 | { |
876 | | - my $file_csv = "$dir_process/$file_csv_opsys" ; |
| 795 | + my $file_csv = "$path_process/$file_csv_opsys" ; |
877 | 796 | if (! -e $file_csv) |
878 | 797 | { abort ("Function ReadInputOpSys: file $file_csv not found!!!") ; } |
879 | 798 | open CSV_OPSYS, '<', $file_csv ; |
— | — | @@ -918,7 +837,7 @@ |
919 | 838 | |
920 | 839 | sub ReadInputOrigins |
921 | 840 | { |
922 | | - my $file_csv = "$dir_process/$file_csv_origins" ; |
| 841 | + my $file_csv = "$path_process/$file_csv_origins" ; |
923 | 842 | if (! -e $file_csv) |
924 | 843 | { abort ("Function ReadInputOrigins: file $file_csv not found!!!") ; } |
925 | 844 | open CSV_ORIGINS, '<', $file_csv ; |
— | — | @@ -1001,7 +920,7 @@ |
1002 | 921 | |
1003 | 922 | sub ReadInputScripts |
1004 | 923 | { |
1005 | | - my $file_csv = "$dir_process/$file_csv_scripts" ; |
| 924 | + my $file_csv = "$path_process/$file_csv_scripts" ; |
1006 | 925 | if (! -e $file_csv) |
1007 | 926 | { abort ("Function ReadInputScripts: file $file_csv not found!!!") ; } |
1008 | 927 | open CSV_SCRIPTS, '<', $file_csv ; |
— | — | @@ -1035,7 +954,7 @@ |
1036 | 955 | # foreach $key (keys_sorted_by_value_num_desc %actions) |
1037 | 956 | # { print "$key: " . $actions {$key} . "\n" ; } |
1038 | 957 | |
1039 | | - open CSV_SCRIPTS, '<', "$dir_process/$file_csv_scripts" ; |
| 958 | + open CSV_SCRIPTS, '<', "$path_process/$file_csv_scripts" ; |
1040 | 959 | read_script: |
1041 | 960 | while ($line = <CSV_SCRIPTS>) |
1042 | 961 | { |
— | — | @@ -1101,7 +1020,7 @@ |
1102 | 1021 | |
1103 | 1022 | sub ReadInputGoogle |
1104 | 1023 | { |
1105 | | - my $file_csv = "$dir_process/$file_csv_google" ; |
| 1024 | + my $file_csv = "$path_process/$file_csv_google" ; |
1106 | 1025 | if (! -e $file_csv) |
1107 | 1026 | { abort ("Function ReadInputGoogle: file $file_csv not found!!!") ; } |
1108 | 1027 | open CSV_SEARCH, '<', $file_csv ; |
— | — | @@ -1158,7 +1077,7 @@ |
1159 | 1078 | |
1160 | 1079 | sub ReadInputSkins |
1161 | 1080 | { |
1162 | | - my $file_csv = "$dir_process/$file_csv_skins" ; |
| 1081 | + my $file_csv = "$path_process/$file_csv_skins" ; |
1163 | 1082 | if (! -e $file_csv) |
1164 | 1083 | { abort ("Function ReadInputSkins: file $file_csv not found!!!") ; } |
1165 | 1084 | open CSV_SKINS, '<', $file_csv ; |
— | — | @@ -1179,7 +1098,7 @@ |
1180 | 1099 | |
1181 | 1100 | sub ReadInputIndexPhp |
1182 | 1101 | { |
1183 | | - my $file_csv = "$dir_process/$file_csv_indexphp" ; |
| 1102 | + my $file_csv = "$path_process/$file_csv_indexphp" ; |
1184 | 1103 | if (! -e $file_csv) |
1185 | 1104 | { abort ("Function ReadInputIndexPhp: file $file_csv not found!!!") ; } |
1186 | 1105 | open CSV_INDEXPHP, '<', $file_csv ; |
— | — | @@ -1305,7 +1224,7 @@ |
1306 | 1225 | |
1307 | 1226 | sub ReadInputCountriesTimed |
1308 | 1227 | { |
1309 | | - my $file_csv = "$dir_process/$file_csv_countries_timed" ; |
| 1228 | + my $file_csv = "$path_process/public/$file_csv_countries_timed" ; |
1310 | 1229 | if (! -e $file_csv) |
1311 | 1230 | { abort ("Function ReadInputSkins: file $file_csv not found!!! ") ; } |
1312 | 1231 | open CSV_COUNTRIES, '<', $file_csv ; |
— | — | @@ -1346,6 +1265,8 @@ |
1347 | 1266 | |
1348 | 1267 | next if $line =~ /^#/ ; |
1349 | 1268 | |
| 1269 | + $line =~ s/C..?te d'Ivoire/Côte d'Ivoire/g ; |
| 1270 | + |
1350 | 1271 | ($country_code,$region_code,$north_south_code,$country_name) = split (',', $line,4) ; |
1351 | 1272 | $region_codes {$country_code} = $region_code ; |
1352 | 1273 | $north_south_codes {$country_code} = $north_south_code ; |
— | — | @@ -1363,6 +1284,7 @@ |
1364 | 1285 | # if ($country_meta_info_not_found_reported {$country} ++ == 0) |
1365 | 1286 | # { print "Meta info not found for country '$country'\n" ; } |
1366 | 1287 | # } |
| 1288 | + $country_name =~ s/^C..?te d/Côte d/ ; |
1367 | 1289 | |
1368 | 1290 | $country_names {$country_code} = $country_name ; |
1369 | 1291 | $country_codes_all {"$country_name|$country_code"} ++ ; |
— | — | @@ -1377,13 +1299,14 @@ |
1378 | 1300 | while ($line = <COUNTRY_META_INFO>) |
1379 | 1301 | { |
1380 | 1302 | chomp $line ; |
| 1303 | + |
| 1304 | + $line =~ s/C..?te d'Ivoire/Côte d'Ivoire/g ; |
| 1305 | + |
1381 | 1306 | ($country,$link,$population,$connected,$icon) = split ',', $line ; |
1382 | | -print "$line\n" ; # qqq |
1383 | 1307 | $country =~ s/,/,/g ; |
1384 | 1308 | |
1385 | 1309 | # use country names as given by MaxMind |
1386 | 1310 | $country =~ s/Brunei/Brunei Darussalam/ ; |
1387 | | - $country =~ s/C..?te d'Ivoire/Cote d'Ivoire/ ; |
1388 | 1311 | $country =~ s/Congo, The Democratic Republic of the/Republic of the Congo/ ; |
1389 | 1312 | $country =~ s/Dem. Rep. of Congo/Congo - The Democratic Republic of the/ ; |
1390 | 1313 | $country =~ s/East timor/Timor-Leste/ ; |
— | — | @@ -1404,10 +1327,8 @@ |
1405 | 1328 | $country =~ s/U.S. Virgin Islands/Virgin Islands, British/ ; |
1406 | 1329 | $country =~ s/Vatican City/Holy See (Vatican City State)/ ; |
1407 | 1330 | $country =~ s/^Korea$/South Korea/ ; |
1408 | | - |
1409 | 1331 | $connected =~ s/connected/../g ; |
1410 | 1332 | $country_meta_info {$country} = "$link,$population,$connected,$icon" ; |
1411 | | -print "meta info found for '$country'\n" ; # qqq |
1412 | 1333 | |
1413 | 1334 | if ($country eq "United States") |
1414 | 1335 | { ($connected_us = $connected) =~ s/_//g ; } |
— | — | @@ -1451,7 +1372,7 @@ |
1452 | 1373 | undef %yyyymm_ ; |
1453 | 1374 | undef %quarters ; |
1454 | 1375 | undef %requests_unknown_per_quarter ; |
1455 | | - undef %country_codes ; |
| 1376 | +# undef %country_codes ; |
1456 | 1377 | undef %requests_all ; |
1457 | 1378 | undef %requests_all_per_period ; |
1458 | 1379 | undef %requests_per_quarter ; |
— | — | @@ -1786,7 +1707,7 @@ |
1787 | 1708 | |
1788 | 1709 | sub ReadInputBrowserLanguages |
1789 | 1710 | { |
1790 | | - my $file_csv = "$dir_process/$file_csv_browser_languages" ; |
| 1711 | + my $file_csv = "$path_process/$file_csv_browser_languages" ; |
1791 | 1712 | if (! -e $file_csv) |
1792 | 1713 | { abort ("Function ReadInputBrowserLanguages: file $file_csv not found!!! ") ; } |
1793 | 1714 | open CSV_BROWSER_LANGUAGES, '<', $file_csv ; |
— | — | @@ -2025,7 +1946,7 @@ |
2026 | 1947 | |
2027 | 1948 | sub WriteReportClients |
2028 | 1949 | { |
2029 | | - open FILE_HTML_CLIENTS, '>', "$dir_reports/$file_html_clients" ; |
| 1950 | + open FILE_HTML_CLIENTS, '>', "$path_reports/$file_html_clients" ; |
2030 | 1951 | |
2031 | 1952 | $html = $header ; |
2032 | 1953 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Browsers e.a./ ; |
— | — | @@ -2316,7 +2237,7 @@ |
2317 | 2238 | |
2318 | 2239 | sub WriteReportCrawlers |
2319 | 2240 | { |
2320 | | - open FILE_HTML_CRAWLERS, '>', "$dir_reports/$file_html_crawlers" ; |
| 2241 | + open FILE_HTML_CRAWLERS, '>', "$path_reports/$file_html_crawlers" ; |
2321 | 2242 | |
2322 | 2243 | $html = $header ; |
2323 | 2244 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Crawler requests/ ; |
— | — | @@ -2490,7 +2411,7 @@ |
2491 | 2412 | |
2492 | 2413 | sub WriteReportMethods |
2493 | 2414 | { |
2494 | | - open FILE_HTML_METHODS, '>', "$dir_reports/$file_html_methods" ; |
| 2415 | + open FILE_HTML_METHODS, '>', "$path_reports/$file_html_methods" ; |
2495 | 2416 | |
2496 | 2417 | $html = $header ; |
2497 | 2418 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Request Methods/ ; |
— | — | @@ -2570,7 +2491,7 @@ |
2571 | 2492 | |
2572 | 2493 | sub WriteReportMimeTypes |
2573 | 2494 | { |
2574 | | - open FILE_HTML_REQUESTS, '>', "$dir_reports/$file_html_requests" ; |
| 2495 | + open FILE_HTML_REQUESTS, '>', "$path_reports/$file_html_requests" ; |
2575 | 2496 | |
2576 | 2497 | $html = $header ; |
2577 | 2498 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by destination/ ; |
— | — | @@ -2740,7 +2661,7 @@ |
2741 | 2662 | |
2742 | 2663 | sub WriteReportOpSys |
2743 | 2664 | { |
2744 | | - open FILE_HTML_OPSYS, '>', "$dir_reports/$file_html_opsys" ; |
| 2665 | + open FILE_HTML_OPSYS, '>', "$path_reports/$file_html_opsys" ; |
2745 | 2666 | |
2746 | 2667 | $html = $header ; |
2747 | 2668 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Operating Systems/ ; |
— | — | @@ -2955,7 +2876,7 @@ |
2956 | 2877 | # http://en.wikipedia.org/wiki/Domain_name |
2957 | 2878 | sub WriteReportOrigins |
2958 | 2879 | { |
2959 | | - open FILE_HTML_ORIGINS, '>', "$dir_reports/$file_html_origins" ; |
| 2880 | + open FILE_HTML_ORIGINS, '>', "$path_reports/$file_html_origins" ; |
2960 | 2881 | |
2961 | 2882 | $html = $header ; |
2962 | 2883 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Requests by origin/ ; |
— | — | @@ -3560,7 +3481,7 @@ |
3561 | 3482 | |
3562 | 3483 | sub WriteReportScripts |
3563 | 3484 | { |
3564 | | - open FILE_HTML_SCRIPTS, '>', "$dir_reports/$file_html_scripts" ; |
| 3485 | + open FILE_HTML_SCRIPTS, '>', "$path_reports/$file_html_scripts" ; |
3565 | 3486 | |
3566 | 3487 | $html = $header ; |
3567 | 3488 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Scripts/ ; |
— | — | @@ -3737,7 +3658,7 @@ |
3738 | 3659 | |
3739 | 3660 | sub WriteReportGoogle |
3740 | 3661 | { |
3741 | | - open FILE_HTML_SEARCH, '>', "$dir_reports/$file_html_google" ; |
| 3662 | + open FILE_HTML_SEARCH, '>', "$path_reports/$file_html_google" ; |
3742 | 3663 | |
3743 | 3664 | $html = $header ; |
3744 | 3665 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Google requests/ ; |
— | — | @@ -4058,7 +3979,7 @@ |
4059 | 3980 | |
4060 | 3981 | sub WriteReportSkins |
4061 | 3982 | { |
4062 | | - open FILE_HTML_SKINS, '>', "$dir_reports/$file_html_skins" ; |
| 3983 | + open FILE_HTML_SKINS, '>', "$path_reports/$file_html_skins" ; |
4063 | 3984 | |
4064 | 3985 | $html = $header ; |
4065 | 3986 | $html =~ s/TITLE/Wikimedia Traffic Analysis Report - Skins/ ; |
— | — | @@ -4112,11 +4033,11 @@ |
4113 | 4034 | |
4114 | 4035 | sub WriteCsvGoogleBots |
4115 | 4036 | { |
4116 | | - open CSV_GOOGLE_BOTS_OUT, '>', "$dir_reports/$file_csv_google_bots" ; |
| 4037 | + open CSV_GOOGLE_BOTS_OUT, '>', "$path_reports/$file_csv_google_bots" ; |
4117 | 4038 | print CSV_GOOGLE_BOTS_OUT "Date Time,Ip Range,Hits\n" ; |
4118 | | - foreach $dir_process (@dirs_process) |
| 4039 | + foreach $path_process (@dirs_process) |
4119 | 4040 | { |
4120 | | - open CSV_GOOGLE_BOTS_IN, '<', "$dir_process/$file_csv_google_bots" ; |
| 4041 | + open CSV_GOOGLE_BOTS_IN, '<', "$path_process/$file_csv_google_bots" ; |
4121 | 4042 | while ($line = <CSV_GOOGLE_BOTS_IN>) |
4122 | 4043 | { |
4123 | 4044 | next if $line =~ /^#/ ; # comments |
— | — | @@ -4140,7 +4061,7 @@ |
4141 | 4062 | |
4142 | 4063 | sub WriteCsvBrowserLanguages |
4143 | 4064 | { |
4144 | | - open CSV_BROWSER_LANGUAGES, '>', "$dir_reports/$file_csv_browser_languages" ; |
| 4065 | + open CSV_BROWSER_LANGUAGES, '>', "$path_reports/$file_csv_browser_languages" ; |
4145 | 4066 | print CSV_BROWSER_LANGUAGES "Browser,Languages,Hits\n" ; |
4146 | 4067 | foreach $key (keys_sorted_alpha_asc %browser_languages) |
4147 | 4068 | { print CSV_BROWSER_LANGUAGES "$key,${browser_languages {$key}}\n" ; } |
— | — | @@ -4150,8 +4071,8 @@ |
4151 | 4072 | sub WriteCsvCountriesTimed |
4152 | 4073 | { |
4153 | 4074 | $multiplier_1000 = 1000 * $multiplier ; |
4154 | | -# open CSV_COUNTRIES_TIMED, '>', "$dir_reports/$file_csv_countries_timed" ; |
4155 | | - open CSV_COUNTRIES_TIMED, '>', "/home/ezachte/$file_csv_countries_timed" ; |
| 4075 | + print "WriteCsvCountriesTimed: $path_out/$file_csv_countries_timed\n" ; |
| 4076 | + open CSV_COUNTRIES_TIMED, '>', "$path_out/$file_csv_countries_timed" ; |
4156 | 4077 | |
4157 | 4078 | foreach $target (sort keys %targets) |
4158 | 4079 | { |
— | — | @@ -4173,7 +4094,7 @@ |
4174 | 4095 | $cnt_countries = 0 ; |
4175 | 4096 | foreach $country (@countries) |
4176 | 4097 | { |
4177 | | - $country_name = $country_codes {$country} ; |
| 4098 | + $country_name = $country_names {$country} ; |
4178 | 4099 | $line .= "$country_name," ; |
4179 | 4100 | |
4180 | 4101 | last if $cnt_countries++ >= 25 ; |
— | — | @@ -4203,8 +4124,8 @@ |
4204 | 4125 | # http://www.maxmind.com/app/iso3166 country codes |
4205 | 4126 | sub WriteCsvCountriesGoTo |
4206 | 4127 | { |
4207 | | -# open CSV_COUNTRIES_TIMED, '>', "$dir_reports/$file_csv_countries_timed" ; |
4208 | | - open CSV_COUNTRIES_LANGUAGES_VISITED, '>', "/home/ezachte/$file_csv_countries_languages_visited" ; |
| 4128 | + print "WriteCsvCountriesGoTo: $path_out/$file_csv_countries_languages_visited\n" ; |
| 4129 | + open CSV_COUNTRIES_LANGUAGES_VISITED, '>', "$path_out/$file_csv_countries_languages_visited" ; |
4209 | 4130 | |
4210 | 4131 | foreach $country (sort keys %countries) |
4211 | 4132 | { |
— | — | @@ -4227,9 +4148,9 @@ |
4228 | 4149 | |
4229 | 4150 | foreach $bot ("N","Y") |
4230 | 4151 | { |
4231 | | - $country_name = $country_codes {$country} ; |
| 4152 | + $country_name = $country_names {$country} ; |
4232 | 4153 | $country_name =~ s/\n//gs ; |
4233 | | - $country_name =~ s/[0x00-0x1F]//gs ; |
| 4154 | + $country_name =~ s/[\x00-\x1F]//gs ; |
4234 | 4155 | |
4235 | 4156 | $cnt_targets = 0 ; |
4236 | 4157 | $tot_targets = 0 ; |
— | — | @@ -4283,6 +4204,8 @@ |
4284 | 4205 | $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ; |
4285 | 4206 | $html =~ s/DATE// ; |
4286 | 4207 | |
| 4208 | + $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ; |
| 4209 | + |
4287 | 4210 | $html .= "<p><table border=1 width=800>INDEX\n" ; |
4288 | 4211 | |
4289 | 4212 | my $languages_reported ; |
— | — | @@ -4380,7 +4303,7 @@ |
4381 | 4304 | my (@index_countries,@csv_countries) ; |
4382 | 4305 | my $views_edits_lc = lc $views_edits ; |
4383 | 4306 | my $views_edits_lcf = ucfirst $views_edits_lc ; |
4384 | | - ($views_edits2 = $views_edits) =~ s/ /\<br\>/ ; |
| 4307 | +# ($views_edits2 = $views_edits) =~ s/ /\<br\>/ ; |
4385 | 4308 | if ($views_edits =~ /edit/i) |
4386 | 4309 | { $MPVE = 'MPE' ; } # monthly page edits |
4387 | 4310 | else |
— | — | @@ -4397,17 +4320,22 @@ |
4398 | 4321 | |
4399 | 4322 | $html .= &HtmlSortTable ; |
4400 | 4323 | |
| 4324 | + $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ; |
| 4325 | + |
4401 | 4326 | $html .= "<p><table border=1 width=800 class=tablesorter id=table1>\n" ; |
4402 | 4327 | $html .= "<thead>\n" ; |
4403 | 4328 | $html .= "INDEX\n" ; |
4404 | 4329 | |
4405 | 4330 | $html .= &HtmlWorldMaps ; |
4406 | 4331 | |
4407 | | - $html .= "<tr><td class=rh5 colspan=3 rowspan=1><b>Country</b></td><td class=c rowspan=2><b>Monthly<br>$views_edits2</b></td>" . |
4408 | | - "<td class=r rowspan=2><b>Population</b></td>" . # <td class=c rowspan=2><b>$MPVE's<br>Per<br>Person</b></td>" . |
4409 | | - "<td class=c colspan=2><b>Internet<br>Users</b></td><td class=c><b>${MPVE}'s<br>Per<br>I U</b></td>" . |
4410 | | - "<td colspan=99 class=l rowspan=2><b>Share in Global Monthly $views_edits</b><br><small><font color=#808080>red and blue bars have different scale</font></small></td></tr>\n" ; |
4411 | | - $html .= "<tr><td class=c><b>Name</b></td><td class=c><b>Region</b><br><img src='http://stats.wikimedia.org/Location_of_Continents2.gif'></td><td class=c><b>N/S</b></td><td class=c><b>Total</b></td><td class=c><b>/Pop.</b></td></tr>\n" ; |
| 4332 | + $html .= "<tr><td class=hr colspan=3 rowspan=1><b>Location</b></td>" . |
| 4333 | + "<td class=hc colspan=2 rowspan=2><b>Population</b><br><small><font color=#404040>absolute count and percentage of world population</font></small></td>" . # <td class=hc rowspan=2><b>$MPVE's<br>Per<br>Person</b></td>" . |
| 4334 | + "<td class=hc colspan=2 rowspan=2><b>Internet<br>Users</b><br><small><font color=#404040>absolute count and percentage of country population</font></small></td>" . |
| 4335 | + "<td class=hl colspan=4 rowspan=1><b>Monthly $views_edits</b></td></tr>\n" ; |
| 4336 | +# $html .= "<tr>" . |
| 4337 | +# # "<td class=hc><b>${MPVE}'s<br>Per<br>I U</b></td>" . |
| 4338 | +# "<td colspan=99 class=hc><b>Share in Global Monthly $views_edits</b><br><small><font color=#808080>red and blue bars have different scale</font></small></td></tr>\n" ; |
| 4339 | + $html .= "<tr><td class=hr><b>Country</b></td><td class=hc><b>Region</b><br><img src='http://stats.wikimedia.org/Location_of_Continents2.gif'></td><td class=hc><b>N/S</b></td><td class=hc colspan=2><small><font color=#404040>absolute count and edits per internet user</font></small></td><td class=hl colspan=2><small>share of global total<font color=#808080><p>note:blue and red bars have different scale</font></small></td></tr>\n" ; |
4412 | 4340 | $html .= "<tr><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th> </th><th colspan=2> </th></tr>\n" ; |
4413 | 4341 | $html .= "</thead><tbody>\nTOTAL\nREGIONS\n" ; |
4414 | 4342 | |
— | — | @@ -4440,7 +4368,6 @@ |
4441 | 4369 | $north_south_name =~ s/^N$/<font color=#000BF7><b>N<\/b><\/font>/ ; |
4442 | 4370 | $north_south_name =~ s/^S$/<font color=#FE0B0D><b>S<\/b><\/font>/ ; |
4443 | 4371 | |
4444 | | -print "\n" ; # qqq |
4445 | 4372 | ($link_country,$icon,$population,$connected) = &CountryMetaInfo ($country) ; |
4446 | 4373 | |
4447 | 4374 | my $requests_this_country = $requests_recently_per_country {$country} ; |
— | — | @@ -4495,7 +4422,7 @@ |
4496 | 4423 | $country2 =~ s/Syrian Arab Republic/Syria/ ; |
4497 | 4424 | $country2 =~ s/Tanzania, United Republic of/Tanzania/ ; |
4498 | 4425 | $country2 =~ s/Libyan Arab Jamahiriya/Libya/ ; |
4499 | | - $country2 =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ; |
| 4426 | + $country2 =~ s/C..?te d'Ivoire/Côte d'Ivoire/ ; |
4500 | 4427 | $country2 =~ s/Serbia/republic of serbia/ ; |
4501 | 4428 | $country2 =~ s/Lao People's Democratic Republic/Laos/ ; |
4502 | 4429 | |
— | — | @@ -4505,13 +4432,18 @@ |
4506 | 4433 | $population2 = &i2KM2 ($population) ; |
4507 | 4434 | $connected2 = &i2KM2 ($connected) ; |
4508 | 4435 | $requests_this_country2 = &i2KM2 ($requests_this_country2) ; |
| 4436 | + $perc_population = &Percentage ($population / $population_tot) ; |
| 4437 | + if ($perc_population =~ /\.0\d/) |
| 4438 | + { $perc_population = "<small>$perc_population</small>" ; } |
| 4439 | + |
4509 | 4440 | $html .= "<tr><th class=rh3><a id='$country' name='$country'></a>$link_country $icon</td>" . |
4510 | 4441 | "<td>$region_name</td>" . |
4511 | 4442 | "<td>$north_south_name</td>" . |
4512 | | - "<td>$requests_this_country2</td>" . |
4513 | 4443 | "<td>$population2</td>" . # <td>$requests_per_person</td>" . |
| 4444 | + "<td>$perc_population</td>" . # <td>$requests_per_person</td>" . |
4514 | 4445 | "<td>$connected2</td>" . |
4515 | 4446 | "<td>$perc_connected</td>" . |
| 4447 | + "<td>$requests_this_country2</td>" . |
4516 | 4448 | "<td>$requests_per_connected_person</td>" . |
4517 | 4449 | "<td>$perc_share_total</td>" . |
4518 | 4450 | "<td class=l>$bar</td></tr>\n" ; |
— | — | @@ -4549,10 +4481,11 @@ |
4550 | 4482 | $html_total = "<tr><th class=rh3>All countries in</td>" . |
4551 | 4483 | "<td><b>World</b></td>" . |
4552 | 4484 | "<td> </td>" . |
4553 | | - "<td>$requests_tot2</td>" . |
4554 | 4485 | "<td>$population_tot2</td>" . |
| 4486 | + "<td>100%</td>" . |
4555 | 4487 | "<td>$connected_tot2</td>" . |
4556 | 4488 | "<td>$perc_connected_tot</td>" . |
| 4489 | + "<td>$requests_tot2</td>" . |
4557 | 4490 | "<td>$requests_per_connected_person_tot</td>" . |
4558 | 4491 | "<td>100%</th>" . |
4559 | 4492 | "<td class=l> </td></tr>\n" ; |
— | — | @@ -4592,9 +4525,9 @@ |
4593 | 4526 | if ($requests_recently_all > 0) |
4594 | 4527 | { $perc_share_total = &Percentage ($requests_region / $requests_recently_all) ; } |
4595 | 4528 | |
4596 | | - $perc_connected_region = ".." ; |
| 4529 | + $perc_population_region = ".." ; |
4597 | 4530 | if ($population_region > 0) |
4598 | | - { $perc_connected_region = sprintf ("%.0f", 100 * $connected_region / $population_region) .'%' ; } |
| 4531 | + { $perc_population_region = &Percentage ($population_region / $population_tot) ; } |
4599 | 4532 | |
4600 | 4533 | # $requests_region2 = int ($requests_region * 1000 / $months_recently) ; |
4601 | 4534 | |
— | — | @@ -4620,10 +4553,11 @@ |
4621 | 4554 | $html_regions .= "<tr><th>All countries in</th>" . |
4622 | 4555 | "</td><td>$region</td>" . |
4623 | 4556 | "<td> </td>" . |
4624 | | - "<td>$requests_region2</td>" . |
4625 | 4557 | "<td>$population_region</td>" . |
| 4558 | + "<td>$perc_population_region</td>" . |
4626 | 4559 | "<td>$connected_region</td>" . |
4627 | 4560 | "<td>$perc_connected_region</td>" . |
| 4561 | + "<td>$requests_region2</td>" . |
4628 | 4562 | "<td>$requests_per_connected_person</td>" . |
4629 | 4563 | "<td>$perc_share_total</th>" . |
4630 | 4564 | "<td class=l>$bar</td></tr>\n" ; |
— | — | @@ -4806,7 +4740,7 @@ |
4807 | 4741 | $country =~ s/Syrian Arab Republic/Syria/ ; |
4808 | 4742 | $country =~ s/Tanzania, United Republic of/Tanzania/ ; |
4809 | 4743 | $country =~ s/Libyan Arab Jamahiriya/Libya/ ; |
4810 | | - $country =~ s/C..?te d'Ivoire/C\xC3\xB4te d'Ivoire/ ; |
| 4744 | + $country =~ s/C..?te d'Ivoire/Côte d'Ivoire/ ; |
4811 | 4745 | $country =~ s/Serbia/republic of serbia/ ; |
4812 | 4746 | $country =~ s/Lao People's Democratic Republic/Laos/ ; |
4813 | 4747 | |
— | — | @@ -5079,6 +5013,8 @@ |
5080 | 5014 | $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ; |
5081 | 5015 | $html =~ s/DATE// ; |
5082 | 5016 | |
| 5017 | + $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ; |
| 5018 | + |
5083 | 5019 | $html .= "<p><table border=1 width=800>INDEX\n" ; |
5084 | 5020 | |
5085 | 5021 | $html .= &HtmlWorldMaps ; |
— | — | @@ -5221,6 +5157,8 @@ |
5222 | 5158 | $html =~ s/X1000/. Period <b>$requests_start - $requests_stop<\/b>/ ; |
5223 | 5159 | $html =~ s/DATE// ; |
5224 | 5160 | |
| 5161 | + $html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ; |
| 5162 | + |
5225 | 5163 | $html .= "<p><table border=1 width=800>INDEX\n" ; |
5226 | 5164 | |
5227 | 5165 | $html .= &HtmlWorldMaps ; |
— | — | @@ -5414,14 +5352,14 @@ |
5415 | 5353 | sub OpenLog |
5416 | 5354 | { |
5417 | 5355 | # only shrink log when same log file is appended daily, is no longer the case |
5418 | | -# $fileage = -M "$dir_reports/$file_log" ; |
| 5356 | +# $fileage = -M "$path_reports/$file_log" ; |
5419 | 5357 | # if ($fileage > 5) |
5420 | 5358 | # { |
5421 | | -# open "FILE_LOG", "<", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
| 5359 | +# open "FILE_LOG", "<", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
5422 | 5360 | # @log = <FILE_LOG> ; |
5423 | 5361 | # close "FILE_LOG" ; |
5424 | 5362 | # $lines = 0 ; |
5425 | | -# open "FILE_LOG", ">", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
| 5363 | +# open "FILE_LOG", ">", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
5426 | 5364 | # foreach $line (@log) |
5427 | 5365 | # { |
5428 | 5366 | # if (++$lines >= $#log - 5000) |
— | — | @@ -5429,8 +5367,8 @@ |
5430 | 5368 | # } |
5431 | 5369 | # close "FILE_LOG" ; |
5432 | 5370 | # } |
5433 | | -# open "FILE_LOG", ">>", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
5434 | | - open "FILE_LOG", ">>", "$dir_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
| 5371 | +# open "FILE_LOG", ">>", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
| 5372 | + open "FILE_LOG", ">>", "$path_reports/$file_log" || abort ("Log file '$file_log' could not be opened.") ; |
5435 | 5373 | &Log ("\n\n===== Wikimedia Sampled Visitors Log Report / " . date_time_english (time) . " =====\n\n") ; |
5436 | 5374 | } |
5437 | 5375 | |
— | — | @@ -5899,7 +5837,7 @@ |
5900 | 5838 | |
5901 | 5839 | $country =~ s/,/,/g ; |
5902 | 5840 | $country =~ s/Bosnia-Herzegovina/Bosnia and Herzegovina/ ; |
5903 | | - $country =~ s/Cote d'Ivoire/C�te d'Ivoire/ ; |
| 5841 | + $country =~ s/Cote d'Ivoire/Côte d'Ivoire/ ; |
5904 | 5842 | $country =~ s/Macao/Macau/ ; # will be changed back later |
5905 | 5843 | $country =~ s/Samoa/American Samoa/ ; |
5906 | 5844 | $country =~ s/Timor Leste/Timor-Leste/ ; |
— | — | @@ -5928,12 +5866,11 @@ |
5929 | 5867 | sub CountryMetaInfo |
5930 | 5868 | { |
5931 | 5869 | my $country = shift ; |
5932 | | -print "Country '$country'\n" ; # qqq |
5933 | 5870 | my ($link_country,$icon,$population) ; |
5934 | 5871 | if ($country_meta_info {$country} eq "") |
5935 | 5872 | { |
5936 | 5873 | if ($country_meta_info_not_found_reported {$country} ++ == 0) |
5937 | | - { print "_Meta info not found for country '$country'\n" ; } |
| 5874 | + { print "Meta info not found for country '$country'\n" ; } |
5938 | 5875 | $link_country = $country ; |
5939 | 5876 | return ($country,'','..','..') ; |
5940 | 5877 | } |
— | — | @@ -6000,7 +5937,7 @@ |
6001 | 5938 | # print "\n\nUnLink $index\n\n" ; |
6002 | 5939 | my @segments = split '(?=<a )', $links ; |
6003 | 5940 | # print "SEGMENT 1 $segments[$index]\n" ; |
6004 | | - $segments [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/$1/ ; |
| 5941 | + $segments [$index] =~ s/^.*?<a .*?>([^<]*)<\/a>/<font color=#008000><b>$1<\/b><\/font>/ ; |
6005 | 5942 | # print "SEGMENT 2 $segments[$index]\n" ; |
6006 | 5943 | $links = join '', @segments ; |
6007 | 5944 | return ($links) ; |
— | — | @@ -6049,11 +5986,12 @@ |
6050 | 5987 | \$.tablesorter.addParser({ |
6051 | 5988 | id: "millions", |
6052 | 5989 | is: function(s) { return false; }, |
6053 | | - format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
| 5990 | +//failed so far to turn 1.2M into 1200000, so figures with decimal point are sorted out of place |
| 5991 | +//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/\\.(\d)M/,$1+"00000").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
| 5992 | + format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,""). replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); }, |
6054 | 5993 | type: "numeric" |
6055 | 5994 | }); |
6056 | 5995 | |
6057 | | - |
6058 | 5996 | \$.tablesorter.addParser({ |
6059 | 5997 | id: "digitsonly", |
6060 | 5998 | is: function(s) { return false; }, |
— | — | @@ -6131,7 +6069,7 @@ |
6132 | 6070 | <script type='text/javascript'> |
6133 | 6071 | \$('#table1').tablesorter({ |
6134 | 6072 | // debug:true, |
6135 | | - headers:{0:{sorter:'nohtml'},1:{sorter:'nohtml'},2:{sorter:'nohtml'},3:{sorter:'millions'},4:{sorter:'millions'},5:{sorter:'millions'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'},6:{sorter:'digitsonly'},7:{sorter:'digitsonly'}} |
| 6073 | + headers:{0:{sorter:'nohtml'},1:{sorter:'nohtml'},2:{sorter:'nohtml'},3:{sorter:'millions'},4:{sorter:'digitsonly'},5:{sorter:'millions'},6:{sorter:'digitsonly'},7:{sorter:'millions'},8:{sorter:'digitsonly'},9:{sorter:'digitsonly'}} |
6136 | 6074 | }); |
6137 | 6075 | </script> |
6138 | 6076 | __HTML_SORT_TABLE_COLUMNS__ |
— | — | @@ -6263,3 +6201,4 @@ |
6264 | 6202 | |
6265 | 6203 | return $html_worldmaps ; |
6266 | 6204 | } |
| 6205 | + |