Index: trunk/wikistats/squids/config.pm |
— | — | @@ -1,6 +0,0 @@ |
2 | | - #!/usr/bin/perl |
3 | | - |
4 | | - $liblocation = "/home/ezachte/lib" ; |
5 | | - $default_argv = "-d 2011/04/01" ; |
6 | | - $dir_in = "/a/squid/archive" ; |
7 | | - $logname = "sampled-1000.log" ; |
\ No newline at end of file |
Index: trunk/wikistats/squids/SquidCountryScanConfig.pm |
— | — | @@ -1,7 +1,7 @@ |
2 | 2 | #!/usr/bin/perl
|
3 | 3 |
|
4 | | - $liblocation = "/home/ezachte/lib" ;
|
| 4 | + $cfg_liblocation = "/home/ezachte/lib" ;
|
5 | 5 |
|
6 | | - $path_root_production = "/a/ezachte/" ;
|
7 | | - $path_root_test = "w:/! perl/squids/archive/" ; # Erik
|
8 | | -# $path_root_test = "?" ; # André
|
| 6 | + $cfg_path_root_production = "/a/ezachte/" ;
|
| 7 | + $cfg_path_root_test = "w:/! perl/squids/archive/" ; # Erik
|
| 8 | +# $cfg_path_root_test = "?" ; # André
|
Index: trunk/wikistats/squids/SquidReportArchiveConfig.pm |
— | — | @@ -1,15 +1,16 @@ |
2 | 2 | #!/usr/bin/perl
|
3 | 3 |
|
4 | | - $liblocation = "/home/ezachte/lib" ;
|
| 4 | + $cfg_liblocation = "/home/ezachte/lib" ;
|
5 | 5 |
|
6 | | -# $path_in_local = "W:/# Out Locke" ; # Erik
|
7 | | -# $path_out_local = "W:/# Out Test/Locke" ; # Erik
|
| 6 | + $cfg_path_in_production = "/a/ezachte" ;
|
| 7 | + $cfg_path_out_production = "/a/ezachte" ;
|
| 8 | +# $cfg_path_in_test = "W:/# Out Locke" ; # Erik
|
| 9 | +# $cfg_path_out_test = "W:/# Out Test/Locke" ; # Erik
|
| 10 | + $cfg_path_in_test = "/srv/erik/" ; # André
|
| 11 | + $cfg_path_out_test = "/srv/erik/" ; # André
|
8 | 12 |
|
9 | | - $path_in = "/srv/erik/" ; # André
|
10 | | - $path_out = "/srv/erik/" ; # André
|
11 | | -
|
12 | | -# set defaults for tests on local machine
|
13 | | -# $default_argv = "-m 2011-07" ; # monthly report
|
14 | | -# $default_argv = "-w" ; # refresh country info from Wikipedia (population etc)
|
15 | | -# $default_argv = "-c" ; # country/regional reports
|
16 | | - $default_argv = "-c -q 2011Q4" ; # country/regional reports based on data for one quarter only
|
| 13 | +# set default arguments for test on local machine
|
| 14 | +# $cfg_default_argv = "-m 2011-07" ; # monthly report
|
| 15 | +# $cfg_default_argv = "-w" ; # refresh country info from Wikipedia (population etc)
|
| 16 | +# $cfg_default_argv = "-c" ; # country/regional reports
|
| 17 | + $cfg_default_argv = "-c -q 2011Q4" ; # country/regional reports based on data for one quarter only
|
Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm |
— | — | @@ -4,7 +4,7 @@ |
5 | 5 | # test: |
6 | 6 | # echo 125.123.123.123 | /usr/local/bin/geoiplogtag 1 |
7 | 7 | # refresh: bayes:/usr/share/GeoIP> wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz |
8 | | -use config ; |
| 8 | +use SquidCountArchiveConfig ; |
9 | 9 | |
10 | 10 | sub CollectFilesToProcess |
11 | 11 | { |
— | — | @@ -24,6 +24,8 @@ |
25 | 25 | |
26 | 26 | my ($date_archived) ; |
27 | 27 | |
| 28 | + $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : $cfg_dir_in_test ; |
| 29 | + |
28 | 30 | $some_files_found = $false ; |
29 | 31 | $full_range_found = $false ; |
30 | 32 | |
— | — | @@ -47,7 +49,7 @@ |
48 | 50 | $date_archived = sprintf ("%4d%02d%02d", $year+1900, $month+1, $day) ; |
49 | 51 | print "\n- Inspect file saved $days_ago_inspect days ago: $logname-$date_archived.gz\n" ; |
50 | 52 | |
51 | | - my $file = "$dir_in/$logname-$date_archived.gz" ; |
| 53 | + my $file = "$dir_in/$cfg_logname-$date_archived.gz" ; |
52 | 54 | |
53 | 55 | if (! -e $file) |
54 | 56 | { print "- File not found: $file\n" ; } |
— | — | @@ -173,8 +175,8 @@ |
174 | 176 | else |
175 | 177 | { |
176 | 178 | open IN, '<', $file_in ; |
177 | | - # $fields_expected = 14 ; |
178 | | - $fields_expected = 13 ; |
| 179 | + $fields_expected = 14 ; # add fake country code |
| 180 | + # $fields_expected = 13 ; |
179 | 181 | } |
180 | 182 | |
181 | 183 | $line = "" ; |
— | — | @@ -192,6 +194,12 @@ |
193 | 195 | # ugly Q&D code to circumvent spaces in agent string |
194 | 196 | # $line2 = $line ; |
195 | 197 | chomp $line ; |
| 198 | + |
| 199 | + if ($test) |
| 200 | + { $line .= ' XX' ; } |
| 201 | + |
| 202 | + $line =~ s/x-www-form-urlencoded; charset=UTF-8/x-www-form-urlencoded;%20charset=UTF-8/ ; # log lines are space delimited, other spaces should be encoded |
| 203 | + |
196 | 204 | @fields = split (' ', $line) ; |
197 | 205 | # next if $line =~ /upload/ ; |
198 | 206 | # next if $line !~ /en\.m\.wikipedia/ ; |
— | — | @@ -201,23 +209,46 @@ |
202 | 210 | #next if $fields [9] =~ /NONE/ ; |
203 | 211 | if ($#fields > 14) |
204 | 212 | { |
| 213 | +if (! $scan_ip_frequencies) |
| 214 | +{ |
205 | 215 | # print "line $line2\n" ; |
206 | 216 | # print "fields " . $#fields . "\n$line\n" ; |
| 217 | +} |
| 218 | + |
207 | 219 | $country_code = $fields [$#fields] ; |
208 | 220 | $fields [$#fields] = '' ; |
209 | 221 | $line = join (' ', @fields) ; |
210 | | -# print "2 $line\n" ; |
211 | 222 | @fields = split (' ', $line, 14) ; |
212 | 223 | $fields [14] = $country_code ; |
| 224 | + $fields [13] =~ s/ /%20/g ; |
| 225 | + |
| 226 | +if (! $scan_ip_frequencies) |
| 227 | +{ |
| 228 | +# print "2 $line\n" ; |
213 | 229 | # print "\n\n12: " . $fields [12] . "\n" ; |
214 | 230 | # print "13: " . $fields [13] . "\n" ; |
215 | 231 | # print "14: " . $fields [14] . "\n" ; |
216 | 232 | # print "15: " . $fields [15] . "\n" ; |
| 233 | +} |
217 | 234 | } |
218 | 235 | |
219 | | - if ($#fields < $fields_expected) { $fields_too_few ++ ; print "invalid field count " . $#fields . "\n" ; next ; } |
220 | | - if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid field count " . $#fields . "\n" ; next ; } |
| 236 | + if ($#fields < $fields_expected) |
| 237 | + { |
| 238 | + $fields_too_few ++ ; |
| 239 | + print "invalid field count " . $#fields . "\n" ; |
| 240 | + print ERR $#fields . " fields: \"$line\"\n" ; |
| 241 | + next ; |
| 242 | + } |
221 | 243 | |
| 244 | + if ($#fields > $fields_expected) |
| 245 | + { |
| 246 | + @a = @fields ; |
| 247 | + $fields_too_many ++ ; |
| 248 | + print "invalid field count " . $#fields . "\n" ; |
| 249 | + print ERR $#fields . " fields: \"$line\"\n" ; |
| 250 | + next ; |
| 251 | + } |
| 252 | + |
222 | 253 | $time = $fields [2] ; |
223 | 254 | |
224 | 255 | if (($oldest_time_read eq "") || ($time lt $oldest_time_read)) |
— | — | @@ -230,7 +261,7 @@ |
231 | 262 | |
232 | 263 | if ($time lt $time_to_start) |
233 | 264 | { |
234 | | - if (++ $times % 100000 == 0) |
| 265 | + if (++ $times % 1000000 == 0) |
235 | 266 | { print "[$time]\n" ; } |
236 | 267 | next ; |
237 | 268 | } |
— | — | @@ -266,12 +297,23 @@ |
267 | 298 | #next if $line !~ /http:\/\/\w+\.m\./ ; |
268 | 299 | #print "$line\n" ; |
269 | 300 | &ProcessLine ($line) ; |
270 | | - if (++ $lines_processed % 10000 == 0) |
| 301 | + if (++ $lines_processed % 50000 == 0) |
271 | 302 | { |
| 303 | + if (! $scan_ip_frequencies) # phase 2 |
| 304 | + { |
| 305 | + $perc_mobile_all = '-' ; |
| 306 | + if ($records {"*,*"} > 0) |
| 307 | + { $perc_mobile_all = sprintf ("%.1f", 100 * $records {"M,*"} / $records {"*,*"}) ; } |
| 308 | + $perc_mobile_pages = '-' ; |
| 309 | + if ($records {"*,page"} > 0) |
| 310 | + { $perc_mobile_pages = sprintf ("%.1f", 100 * $records {"M,page"} / $records {"*,page"}) ; } |
| 311 | + $perc_mobile = " (mobile: all $perc_mobile_all\%, pages $perc_mobile_pages\%)" ; |
| 312 | + } |
| 313 | + |
272 | 314 | if ($banner_requests_ignored == 0) |
273 | | - { print "$time $lines_processed\n" ; } |
| 315 | + { print "$time $lines_processed$perc_mobile\n" ; } |
274 | 316 | else |
275 | | - { print "$time $lines_processed ($banner_requests_ignored banner requests ignored)\n" ; } |
| 317 | + { print "$time $lines_processed$perc_mobile ($banner_requests_ignored banner requests ignored)\n" ; } |
276 | 318 | } |
277 | 319 | if ($test and $lines_processed >= $test_maxlines) |
278 | 320 | { last ; } |
Index: trunk/wikistats/squids/SquidCountryScan.pl |
— | — | @@ -4,7 +4,7 @@ |
5 | 5 | ## sub ProcessRawData <- SquidDataCountries.csv -> ?? |
6 | 6 | |
7 | 7 | use SquidCountryScanConfig ; |
8 | | - use lib $liblocation ; |
| 8 | + use lib $cfg_liblocation ; |
9 | 9 | use EzLib ; |
10 | 10 | $trace_on_exit = $true ; |
11 | 11 | |
— | — | @@ -23,7 +23,7 @@ |
24 | 24 | # exit ; |
25 | 25 | } |
26 | 26 | |
27 | | - $path_root = $job_runs_on_production_server ? $path_root_production : $path_root_test ; |
| 27 | + $path_root = $job_runs_on_production_server ? $cfg_path_root_production : $cfg_path_root_test ; |
28 | 28 | |
29 | 29 | $file_raw_data_monthly_visits = "$path_root/SquidDataVisitsPerCountryMonthly.csv" ; |
30 | 30 | $file_raw_data_daily_visits = "$path_root/SquidDataVisitsPerCountryDaily.csv" ; |
Index: trunk/wikistats/squids/SquidReportArchive.pl |
— | — | @@ -1,13 +1,13 @@ |
2 | 2 | #!/usr/bin/perl |
3 | 3 | |
4 | 4 | use SquidReportArchiveConfig ; |
5 | | - use lib $liblocation ; |
| 5 | + use lib $cfg_liblocation ; |
6 | 6 | |
7 | 7 | use EzLib ; |
8 | 8 | $trace_on_exit = $true ; |
9 | 9 | ez_lib_version (2) ; |
10 | 10 | |
11 | | - default_argv ($default_argv) ; |
| 11 | + default_argv ($cfg_default_argv) ; |
12 | 12 | |
13 | 13 | # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs |
14 | 14 | # ReportOrigin how to handle '!error <-> other |
— | — | @@ -32,12 +32,12 @@ |
33 | 33 | |
34 | 34 | undef %country_code_not_specified_reported ; |
35 | 35 | |
36 | | - if (-d "/a/squid") |
37 | | - { |
38 | | - &Log ("\n\nJob runs on server $hostname\n\n") ; |
39 | | - $path_in = "/a/ezachte" ; |
40 | | - $path_out = "/a/ezachte" ; |
41 | | - } |
| 36 | + $path_in = $job_runs_on_production_server ? $cfg_path_in_production : $cfg_path_in_test ; |
| 37 | + $path_out = $job_runs_on_production_server ? $cfg_path_out_production : $cfg_path_out_test ; |
| 38 | + |
| 39 | + &Log ("Path in = $path_in\n") ; |
| 40 | + &Log ("Path out = $path_out\n") ; |
| 41 | + |
42 | 42 | # following test needs to change -> remove server name dependency (new run argument ?) |
43 | 43 | # elsif ($hostname eq 'bayes') |
44 | 44 | # { |
— | — | @@ -45,16 +45,7 @@ |
46 | 46 | # $path_in = "/home/ezachte/wikistats/animation" ; |
47 | 47 | # $path_out = "/home/ezachte/wikistats/animation" ; |
48 | 48 | # } |
49 | | - else |
50 | | - { |
51 | | - print "Job runs local for tests\n\n" ; |
52 | | - $path_in = $path_in_local ; |
53 | | - $path_out = $path_out_local ; |
54 | | - } |
55 | 49 | |
56 | | - &Log ("Path in = $path_in\n") ; |
57 | | - &Log ("Path out = $path_out\n") ; |
58 | | - |
59 | 50 | $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ; |
60 | 51 | |
61 | 52 | # periodically harvest updated metrics from |
Index: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm |
— | — | @@ -1,7 +1,7 @@ |
2 | 2 | #!/usr/bin/perl |
3 | 3 | |
4 | | - use config ; |
5 | | - use lib $liblocation ; |
| 4 | + use SquidCountArchiveConfig ; |
| 5 | + use lib $cfg_liblocation ; |
6 | 6 | use EzLib ; |
7 | 7 | |
8 | 8 | sub WriteOutputIpFrequencies |
Index: trunk/wikistats/squids/SquidCountArchive.pl |
— | — | @@ -1,7 +1,7 @@ |
2 | 2 | #!/usr/bin/perl |
3 | 3 | |
4 | | - use config ; |
5 | | - use lib $liblocation ; |
| 4 | + use SquidCountArchiveConfig ; |
| 5 | + use lib $cfg_liblocation ; |
6 | 6 | use EzLib ; |
7 | 7 | |
8 | 8 | $trace_on_exit = $true ; |
— | — | @@ -13,8 +13,7 @@ |
14 | 14 | use SquidCountArchiveReadInput ; |
15 | 15 | use SquidCountArchiveWriteOutput ; |
16 | 16 | |
17 | | - # set defaults mainly for tests on local machine |
18 | | - default_argv $default_argv; |
| 17 | + default_argv $cfg_default_argv ; |
19 | 18 | |
20 | 19 | # http://wikitech.wikimedia.org/view/Squid_log_format |
21 | 20 | # 1. Hostname |
— | — | @@ -43,25 +42,24 @@ |
44 | 43 | # todo: parm -r root folder |
45 | 44 | |
46 | 45 | $test = $false ; |
47 | | - $test_maxlines = 4000000 ; |
| 46 | + $test_maxlines = $cfg_text_maxlines ; |
| 47 | + $file_test = $cfg_file_test ; |
48 | 48 | |
49 | 49 | if (! $job_runs_on_production_server) |
50 | 50 | { |
51 | 51 | $test = $true ; |
52 | | - $file_test = "w:/# Out Locke/sampled-1000-log-20110401.txt" ; |
53 | | - # $file_test = getcwd . "/SquidDataFilterFY.txt" ; |
54 | | - if (! -e $file_test) |
| 52 | + if (! -e $cfg_file_test) |
55 | 53 | { abort "Test input file '$file_test' not found" ; } |
56 | 54 | } |
57 | 55 | |
58 | 56 | $time_start = time ; |
59 | 57 | |
60 | | - $path_root = "/srv/erik" ; |
| 58 | + $path_root = $job_runs_on_production_server ? $cfg_path_root_production : $cfg_path_root_test ; |
61 | 59 | |
62 | 60 | $tags_wiki_mobile = "Wikiamo|Wikipanion|Wikimedia" ; |
63 | 61 | |
64 | 62 | $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|Kindle|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Opera Mobi|Palm|Playstation Portable|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|HTC_Touch|KDDI|FOMA|HTC_HD2|Polaris|Teleca" ; |
65 | | - $tags_mobile_upd = "August 2011" ; |
| 63 | + $tags_mobile_upd = "February 2012" ; |
66 | 64 | |
67 | 65 | $pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ; |
68 | 66 | $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ; |
— | — | @@ -502,6 +500,7 @@ |
503 | 501 | undef %origins_external ; |
504 | 502 | undef %origins_unsimplified ; |
505 | 503 | undef %referers_internal ; |
| 504 | + undef %records ; |
506 | 505 | undef %requests ; |
507 | 506 | undef %scripts ; |
508 | 507 | undef %search ; |
— | — | @@ -510,6 +509,7 @@ |
511 | 510 | undef %squid_events ; |
512 | 511 | undef %squid_seqno ; |
513 | 512 | undef %statusses ; |
| 513 | + undef %total_clients ; |
514 | 514 | undef %unrecognized_domains ; |
515 | 515 | undef %wikis ; |
516 | 516 | # undef @files ; |
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm |
— | — | @@ -18,14 +18,22 @@ |
19 | 19 | { $mime = "text/html" ; } |
20 | 20 | } |
21 | 21 | |
| 22 | + $count_event = 1 ; |
| 23 | + # from Oct 16, 2011 00:00 hrs till Nov 29, 2011 20:00 hrs one of the two servers which process requests to the mobile site did not send log lines |
| 24 | + # since the two servers are load-balanced, selected stats (e.g. breakdown browser, OS) can be repaired by counting requests to mobile site twice in this period |
| 25 | + # note: do not count twice for metrics where specific ip addresses are considered, this would inflate number of assumed bots (based on ip address freq.) |
| 26 | + if ((($time ge '2011-10-16T00') && ($time lt '2011-11-29T20')) and |
| 27 | + ($url =~ /\.m\./)) |
| 28 | + { $count_event = 2 ; } |
| 29 | + |
22 | 30 | if ($scan_ip_frequencies) # phase 1 |
23 | 31 | { |
24 | 32 | return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ; |
25 | 33 | |
26 | 34 | if ($mime eq "text/html") |
27 | 35 | { |
28 | | - $ip_frequencies {$client_ip} ++ ; |
29 | | - $html_pages_found ++ ; |
| 36 | + $ip_frequencies {$client_ip} ++ ; # do not use $count_event here! |
| 37 | + $html_pages_found += $count_event ; |
30 | 38 | } |
31 | 39 | |
32 | 40 | return ; |
— | — | @@ -33,8 +41,8 @@ |
34 | 42 | |
35 | 43 | |
36 | 44 | # remember for each squid per hour lowest and highest sequence number and number of events |
37 | | - # later calc per hour average distance between events = (higest - lowest sequence number) / events - 1 |
38 | | - # distance between consecutive events that lay in different hour bin are ignored, begligible |
| 45 | + # later calc per hour average distance between events = (highest - lowest sequence number) / events - 1 |
| 46 | + # distance between consecutive events that lay in different hour bin are ignored, negligible |
39 | 47 | $squid = $fields [0] ; |
40 | 48 | $seqno = $fields [1] ; |
41 | 49 | $hour = substr ($time, 11, 2) ; |
— | — | @@ -64,12 +72,12 @@ |
65 | 73 | |
66 | 74 | if ($url =~ /\.m\.wikipedia.org/) |
67 | 75 | { |
68 | | - $url_wikipedia_mobile ++ ; |
69 | | - $status_url_wikipedia_mobile {$status} ++ ; |
70 | | - $status_mime_url_wikipedia_mobile {"$status,$mime"} ++ ; |
| 76 | + $url_wikipedia_mobile += $count_event ; |
| 77 | + $status_url_wikipedia_mobile {$status} += $count_event ; |
| 78 | + $status_mime_url_wikipedia_mobile {"$status,$mime"} += $count_event ; |
71 | 79 | if ($status eq "TCP_MISS/302") |
72 | 80 | { |
73 | | - $redirected_to_mobile ++ ; |
| 81 | + $redirected_to_mobile += $count_event ; |
74 | 82 | return ; |
75 | 83 | } |
76 | 84 | } |
— | — | @@ -81,13 +89,13 @@ |
82 | 90 | |
83 | 91 | ($agent2 = $agent) =~ s/\%20/ /g ; # mainly to make line content more readable on debugging |
84 | 92 | $agent2 =~ s/\%2F/\//g ; # mainly to make line content more readable on debugging |
85 | | - $agents_raw {$agent2}++ ; |
| 93 | + $agents_raw {$agent2} += $count_event ; |
86 | 94 | |
87 | 95 | ($file,$ext) = &GetFileExt ($url) ; |
88 | | - $exts {$ext}++ ; |
| 96 | + $exts {$ext} += $count_event ; |
89 | 97 | |
90 | 98 | if (($ext eq "js") || ($ext eq "css")) |
91 | | - { $scripts {"$ext,$file,"} ++ ; } |
| 99 | + { $scripts {"$ext,$file,"} += $count_event ; } |
92 | 100 | |
93 | 101 | $title = "" ; |
94 | 102 | $parm = "" ; |
— | — | @@ -103,14 +111,14 @@ |
104 | 112 | if ($parm eq "?") { return ; } # error |
105 | 113 | $file =~ s/,/,/go ; |
106 | 114 | $parm =~ s/,/,/go ; |
107 | | - $scripts {"php,$file,$parm"} ++ ; |
| 115 | + $scripts {"php,$file,$parm"} += $count_event ; |
108 | 116 | $ext .= "($file)" ; # add filename behind extension php |
109 | 117 | } |
110 | 118 | |
111 | 119 | if ($mime eq "text/html") |
112 | 120 | { |
113 | 121 | $mimecat = "page" ; |
114 | | - $tot_mime_html ++ ; |
| 122 | + $tot_mime_html += $count_event ; |
115 | 123 | } |
116 | 124 | elsif ($mime =~ /(?:gif|png|jpeg)/o) |
117 | 125 | { $mimecat = "image" ; } |
— | — | @@ -137,12 +145,12 @@ |
138 | 146 | |
139 | 147 | if ($line =~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/io) |
140 | 148 | { |
141 | | - $banners {"$country,$url"} ++ ; |
142 | | - $banner_requests_ignored ++ ; |
| 149 | + $banners {"$country,$url"} += $count_event ; |
| 150 | + $banner_requests_ignored += $count_event ; |
143 | 151 | return ; |
144 | 152 | } |
145 | 153 | |
146 | | - $countries {$country}++ ; |
| 154 | + $countries {$country} += $count_event ; ; |
147 | 155 | |
148 | 156 | $agent2 = $agent ; |
149 | 157 | $agent2 =~ s/\%20/ /g ; |
— | — | @@ -202,13 +210,13 @@ |
203 | 211 | if ($agent2 !~ /MSIE \d+\/\d+/o) # most likely false positives |
204 | 212 | { |
205 | 213 | $bot = $true ; |
206 | | - @bots {"$mime,$agent2"} ++ ; |
| 214 | + @bots {"$mime,$agent2"} += $count_event ; |
207 | 215 | } |
208 | 216 | } |
209 | 217 | elsif (($agent2 =~ /bot/io) || (($agent2 =~ /crawl(?:er)?/io) && ($agent2 !~ /MSIEcrawler/io)) || ($agent2 =~ /spider/io) || ($agent2 =~ /parser/io)) |
210 | 218 | { |
211 | 219 | $bot = $true ; |
212 | | - @bots {"$mime,$agent2"} ++ ; |
| 220 | + @bots {"$mime,$agent2"} += $count_event ; |
213 | 221 | } |
214 | 222 | |
215 | 223 | # GECKO |
— | — | @@ -337,7 +345,7 @@ |
338 | 346 | if (($os eq '..') && ($mobile eq 'M' || $mobile eq 'W')) |
339 | 347 | { |
340 | 348 | $os = "Mobile other" ; |
341 | | - $mobile_other {$agent2} ++ ; |
| 349 | + $mobile_other {$agent2} += $count_event ; |
342 | 350 | } |
343 | 351 | |
344 | 352 | if ($version =~ /(?:Ipod|Iphone)/io) |
— | — | @@ -557,21 +565,21 @@ |
558 | 566 | { |
559 | 567 | $engine =~ s/,/,/go ; |
560 | 568 | if ($gecko ne "") |
561 | | - { $engines {$gecko} ++ ; } |
| 569 | + { $engines {$gecko} += $count_event ; } |
562 | 570 | elsif ($applewebkit ne "") |
563 | 571 | { |
564 | 572 | $applewebkit =~ s/AppleWebKit\//AppleWebKit /o ; |
565 | | - $engines {$applewebkit} ++ ; |
| 573 | + $engines {$applewebkit} += $count_event ; ; |
566 | 574 | } |
567 | 575 | |
568 | 576 | $version =~ s/,/,/go ; |
569 | 577 | if ($os =~ /playstation/io) |
570 | 578 | { $version = "NetFront (PlayStation)" ; } |
571 | 579 | |
572 | | - $clients {"$mobile,$version,$mimecat"}++ ; |
| 580 | + $clients {"$mobile,$version,$mimecat"} += $count_event ; ; |
573 | 581 | |
574 | 582 | $operating_systems =~ s/,/,/go ; |
575 | | - $operating_systems {"$mobile,$os"} ++ ; |
| 583 | + $operating_systems {"$mobile,$os"} += $count_event ; ; |
576 | 584 | } |
577 | 585 | |
578 | 586 | if ($count_hits_per_ip_range) |
— | — | @@ -583,16 +591,16 @@ |
584 | 592 | |
585 | 593 | if ($status =~ /^TCP/) |
586 | 594 | { |
587 | | - $statusses {"$method:$status"}++ ; |
588 | | - $statusses {"$method:total"}++ ; |
| 595 | + $statusses {"$method:$status"} += $count_event ; |
| 596 | + $statusses {"$method:total"} += $count_event ; |
589 | 597 | } |
590 | 598 | else |
591 | | - { $statusses_non_tcp ++ ; } |
| 599 | + { $statusses_non_tcp += $count_event ; } |
592 | 600 | |
593 | 601 | if ($url =~ /org\/skins/o) |
594 | 602 | { |
595 | 603 | ($url2 = $url) =~ s/^.*?\/skins/skins/o ; |
596 | | - $skins {$url2} ++ ; |
| 604 | + $skins {$url2} += $count_event ; ; |
597 | 605 | } |
598 | 606 | |
599 | 607 | if ($url =~ /^upload\.wikimedia\.org\//o) # count image size if applicable |
— | — | @@ -604,7 +612,7 @@ |
605 | 613 | |
606 | 614 | # for diagnostics |
607 | 615 | if (($referer =~ /google/o) || ($agent =~ /google/io)) |
608 | | - { $googles++ ; } |
| 616 | + { $googles += $count_event ; } |
609 | 617 | |
610 | 618 | $referer =~ s/^http\w?\:\/\///o ; |
611 | 619 | $referer =~ s/\.php\?.*$/\.php\?../go ; |
— | — | @@ -622,7 +630,7 @@ |
623 | 631 | if (($domain =~ /\./o) || |
624 | 632 | ($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o)) |
625 | 633 | { |
626 | | - $unrecognized_domains {$domain_original} ++ ; |
| 634 | + $unrecognized_domains {$domain_original} += $count_event ; |
627 | 635 | $domain = 'other' ; |
628 | 636 | } |
629 | 637 | |
— | — | @@ -641,24 +649,24 @@ |
642 | 650 | |
643 | 651 | if ($referer_external) |
644 | 652 | { |
645 | | - $tot_referers_external++ ; |
| 653 | + $tot_referers_external += $count_event ; ; |
646 | 654 | |
647 | 655 | ($origin, $toplevel) = &DetectOrigin ($client_ip, $referer_original, $agent, $mime, $mimecat, $service, $ext) ; |
648 | 656 | |
649 | 657 | &CountOrigin ("external", $origin, $toplevel, $mimecat) ; |
650 | 658 | |
651 | 659 | if ($origin !~ /^\!/o) |
652 | | - { $origins_unsimplified {$referer_original} ++ ; } |
| 660 | + { $origins_unsimplified {$referer_original} += $count_event ; } |
653 | 661 | else |
654 | 662 | { |
655 | | - $origin_simplified {"$origin [$referer] <- $referer_original"} ++ ; |
656 | | - $origins_external {$origin} ++ ; |
| 663 | + $origin_simplified {"$origin [$referer] <- $referer_original"} += $count_event ; ; |
| 664 | + $origins_external {$origin} += $count_event ; |
657 | 665 | } |
658 | 666 | } |
659 | 667 | else |
660 | 668 | { |
661 | | - $tot_referers_internal ++ ; |
662 | | - $referers_internal {$referer} ++ ; |
| 669 | + $tot_referers_internal += $count_event ; |
| 670 | + $referers_internal {$referer} += $count_event ; |
663 | 671 | $referer =~ s/!//go ; # ! was marker to signal pattern was recognized as wikimedia project |
664 | 672 | &CountOrigin ("internal", $referer, "org" , $mimecat) ; |
665 | 673 | } |
— | — | @@ -670,10 +678,9 @@ |
671 | 679 | if ($domain =~ /!/o) |
672 | 680 | { print ERR "still ! in domain: '$domain' <- '$domain_original'\n" ; } |
673 | 681 | |
674 | | - $requests {"$domain|$referer|$ext|$mime|$parm"}++ ; |
| 682 | + $requests {"$domain|$referer|$ext|$mime|$parm"} += $count_event ; ; |
| 683 | + $clients_by_wiki {"$mobile,$version,$domain"} += $count_event ; ; |
675 | 684 | |
676 | | - $clients_by_wiki {"$mobile,$version,$domain"}++ ; |
677 | | - |
678 | 685 | # different output use either 'bot=N' or 'M'(anual) / 'bot=Y' or 'B'(ot) |
679 | 686 | if ($bot) |
680 | 687 | { |
— | — | @@ -689,13 +696,13 @@ |
690 | 697 | if (($domain =~ /^\@/) || ($domain =~ /^\*/)) |
691 | 698 | { |
692 | 699 | # print "Requests wap $domain | $ext | $mime | $parm | $country | $ind_bot\n" ; |
693 | | - $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ; |
| 700 | + $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += $count_event ; ; |
694 | 701 | } |
695 | 702 | |
696 | 703 | if ($domain =~ /^\%/) |
697 | 704 | { |
698 | 705 | # print "Requests m $domain | $ext | $mime | $parm | $country | $ind_bot\n" ; |
699 | | - $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ; |
| 706 | + $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += $count_event ; ; |
700 | 707 | } |
701 | 708 | # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name) |
702 | 709 | if (($url =~ /index.php\?/o) && ($title !~ /:/o) && ($mime eq "text/html") && (($url =~ /action=edit/o) || ($url =~ /action=submit/o))) |
— | — | @@ -708,34 +715,40 @@ |
709 | 716 | $key =~ s/,/,/go ; |
710 | 717 | $key =~ s/\|/,/go ; |
711 | 718 | |
712 | | - $index_php_raw {$key}++ ; |
| 719 | + $index_php_raw {$key} += $count_event ; ; |
713 | 720 | $client_ip_record_cnt {$client_ip}++ ; |
714 | 721 | } |
715 | 722 | |
716 | 723 | if ($mimecat eq "page") |
717 | 724 | { |
718 | | - $tot_mime_html2 ++ ; |
| 725 | + $tot_mime_html2 += $count_event ; |
719 | 726 | |
720 | 727 | if (($ind_bot =~ /N/) and ($ip_frequencies {$client_ip} > 2)) |
721 | 728 | { $ind_bot = 'bot=Y' ; } |
722 | 729 | |
723 | | - $countries_views {"$ind_bot,$domain,$country"} ++ ; |
724 | | - |
| 730 | + $countries_views {"$ind_bot,$domain,$country"} += $count_event ; ; |
725 | 731 | # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name) |
726 | 732 | if (($url =~ /index.php\?/o) && ($title !~ /:/) && ($mime eq "text/html") && ($url =~ /action=submit/o) && ($status =~ /302/o)) |
727 | | - { $countries_saves {"$ind_bot,$domain,$country"} ++ ; } |
| 733 | + { $countries_saves {"$ind_bot,$domain,$country"} += $count_event ; } |
728 | 734 | |
729 | 735 | $time_hh = substr ($time,11,2) ; |
730 | 736 | $time_mm = substr ($time,14,2) ; |
731 | 737 | $time_tt = $time_hh * 60 + $time_mm ; |
732 | 738 | $time_tt2 = $time_tt - $time_tt % 15 ; |
733 | | - $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} ++ ; |
| 739 | + $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} += $count_event ; |
734 | 740 | |
| 741 | + if (! $test) |
| 742 | + { |
| 743 | + $time2 = substr ($time,0,19) ; # omit msec |
| 744 | + $line = "$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ; |
| 745 | + $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to $file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ; |
| 746 | + } |
| 747 | + } |
735 | 748 | |
736 | | - $time2 = substr ($time,0,19) ; # omit msec |
737 | | - $line = "$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ; |
738 | | - $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to $file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ; |
739 | | - } |
| 749 | + $records {"$mobile,$mimecat"} += $count_event ; |
| 750 | + $records {"*,$mimecat"} += $count_event ; |
| 751 | + $records {"$mobile,*"} += $count_event ; |
| 752 | + $records {"*,*"} += $count_event ; |
740 | 753 | } |
741 | 754 | |
742 | 755 | sub ExtractLanguage |
— | — | @@ -747,10 +760,10 @@ |
748 | 761 | $regexp_lang = "[a-z]{2}(?:-[a-zA-Z]{2,3})?(?:-[a-zA-Z]{2,3})?" ; |
749 | 762 | ($language = $agent) =~ s/^.*?; ($regexp_lang)[\);].*$/$1/o ; |
750 | 763 | if ($language eq $agent) |
751 | | - { $languages_unrecognized {$agent} ++ ; } |
| 764 | + { $languages_unrecognized {$agent} += $count_event ; } |
752 | 765 | else |
753 | 766 | { |
754 | | - $languages {"$application,$language"} ++ ; |
| 767 | + $languages {"$application,$language"} += $count_event ; |
755 | 768 | $agent =~ s/ $language//o ; |
756 | 769 | } |
757 | 770 | return ($agent) ; |
— | — | @@ -823,7 +836,8 @@ |
824 | 837 | |
825 | 838 | foreach $parm (@parms) |
826 | 839 | { |
827 | | - next if $parm eq "" ; |
| 840 | + next if $parm eq '' ; |
| 841 | + next if $parm eq '*' ; |
828 | 842 | |
829 | 843 | if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o)) |
830 | 844 | { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid = $true ; last } |
— | — | @@ -901,7 +915,7 @@ |
902 | 916 | $domain =~ s/\.m\./.%/o ; |
903 | 917 | |
904 | 918 | if ($domain =~ /^error:/o) |
905 | | - { $domain_errors {$domain}++ ; } |
| 919 | + { $domain_errors {$domain} += $count_event ; } |
906 | 920 | $domain =~ s/error:.*$/!error:1/o ; |
907 | 921 | |
908 | 922 | $domain =~ s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/o ; |
— | — | @@ -1018,16 +1032,16 @@ |
1019 | 1033 | if (($googlematch eq "- - z") && ($service =~ /GoogleBot/io)) |
1020 | 1034 | { |
1021 | 1035 | $service = "GoogleBot?" ; |
1022 | | - $google_imposters {$agent}++ ; |
| 1036 | + $google_imposters {$agent} += $count_event ; |
1023 | 1037 | } |
1024 | 1038 | |
1025 | 1039 | # obsolete? to be considered ? |
1026 | 1040 | # if (($googlematch ne "- - z") || ($service =~ /(?:Earth|Desktop)/o)) |
1027 | | - # { $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ; } |
| 1041 | + # { $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} += $count_event ; } |
1028 | 1042 | # else |
1029 | 1043 | # { $accept = "not" ; } |
1030 | 1044 | |
1031 | | - $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ; |
| 1045 | + $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} += $count_event ; |
1032 | 1046 | |
1033 | 1047 | $googlebins2 {"$accept [$googlematch] " . sprintf ("%-14s",$service) . $referer} ++ ; |
1034 | 1048 | $googlebins {$googlematch}++ ; |
— | — | @@ -1055,7 +1069,7 @@ |
1056 | 1070 | # } |
1057 | 1071 | |
1058 | 1072 | if ($origin =~ /wiki/o) |
1059 | | - { $wikis {$origin} ++ ; } |
| 1073 | + { $wikis {$origin} += $count_event ; } |
1060 | 1074 | |
1061 | 1075 | if ($origin eq "wikipedia") |
1062 | 1076 | { |
— | — | @@ -1187,7 +1201,7 @@ |
1188 | 1202 | |
1189 | 1203 | if ($source eq "external") |
1190 | 1204 | { |
1191 | | - $tot_origins_external_counted ++ ; |
| 1205 | + $tot_origins_external_counted += $count_event ; |
1192 | 1206 | $origin =~ s/\:.*$//o ; |
1193 | 1207 | if (is_valid_ip_address ($origin)) |
1194 | 1208 | { $origin = "unmatched ip address" ; $toplevel = "" ; } |
— | — | @@ -1203,7 +1217,7 @@ |
1204 | 1218 | # print "$origin\n" ; |
1205 | 1219 | } |
1206 | 1220 | } |
1207 | | - $origins {"$source,$origin,$toplevel,$mimecat"} ++ ; |
| 1221 | + $origins {"$source,$origin,$toplevel,$mimecat"} += $count_event ; |
1208 | 1222 | } |
1209 | 1223 | |
1210 | 1224 | sub ProcessUploadPath |
— | — | @@ -1213,7 +1227,7 @@ |
1214 | 1228 | ($path = $url) =~ s/^.*?\.org\///o ; |
1215 | 1229 | ($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path |
1216 | 1230 | |
1217 | | - $binaries {$path} ++ ; # Jan 2012 store path, not file only |
| 1231 | + $binaries {$path} += $count_event ; # Jan 2012 store path, not file only |
1218 | 1232 | |
1219 | 1233 | if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io) |
1220 | 1234 | { |
— | — | @@ -1227,10 +1241,10 @@ |
1228 | 1242 | { |
1229 | 1243 | ($size = $file) =~ s/^.*?(\d+)px.*$/$1/o ; |
1230 | 1244 | $sizerange = sprintf ("%5d",(int ($size / 20)) * 20) . "-" . sprintf ("%5d",(((int ($size / 20))+1) * 20 - 1)) ; |
1231 | | - $imagesizes {$sizerange} ++ ; |
| 1245 | + $imagesizes {$sizerange} += $count_event ; |
1232 | 1246 | } |
1233 | 1247 | else |
1234 | | - { $imagesizes {"???"} ++ ; } |
| 1248 | + { $imagesizes {"???"} += $count_event ; } |
1235 | 1249 | } |
1236 | 1250 | } |
1237 | 1251 | |