r112317 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r112316‎ | r112317 | r112318 >
Date:12:20, 24 February 2012
Author:ezachte
Status:deferred
Tags:
Comment:
Double-count mobile records for a certain date range; use the cfg_ prefix for variables in the ..Config.pm files
Modified paths:
  • /trunk/wikistats/squids/SquidCountArchive.pl (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveReadInput.pm (modified) (history)
  • /trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm (modified) (history)
  • /trunk/wikistats/squids/SquidCountryScan.pl (modified) (history)
  • /trunk/wikistats/squids/SquidCountryScanConfig.pm (modified) (history)
  • /trunk/wikistats/squids/SquidReportArchive.pl (modified) (history)
  • /trunk/wikistats/squids/SquidReportArchiveConfig.pm (modified) (history)
  • /trunk/wikistats/squids/config.pm (deleted) (history)

Diff [purge]

Index: trunk/wikistats/squids/config.pm
@@ -1,6 +0,0 @@
2 - #!/usr/bin/perl
3 -
4 - $liblocation = "/home/ezachte/lib" ;
5 - $default_argv = "-d 2011/04/01" ;
6 - $dir_in = "/a/squid/archive" ;
7 - $logname = "sampled-1000.log" ;
\ No newline at end of file
Index: trunk/wikistats/squids/SquidCountryScanConfig.pm
@@ -1,7 +1,7 @@
22 #!/usr/bin/perl
33
4 - $liblocation = "/home/ezachte/lib" ;
 4+ $cfg_liblocation = "/home/ezachte/lib" ;
55
6 - $path_root_production = "/a/ezachte/" ;
7 - $path_root_test = "w:/! perl/squids/archive/" ; # Erik
8 -# $path_root_test = "?" ; # André
 6+ $cfg_path_root_production = "/a/ezachte/" ;
 7+ $cfg_path_root_test = "w:/! perl/squids/archive/" ; # Erik
 8+# $cfg_path_root_test = "?" ; # André
Index: trunk/wikistats/squids/SquidReportArchiveConfig.pm
@@ -1,15 +1,16 @@
22 #!/usr/bin/perl
33
4 - $liblocation = "/home/ezachte/lib" ;
 4+ $cfg_liblocation = "/home/ezachte/lib" ;
55
6 -# $path_in_local = "W:/# Out Locke" ; # Erik
7 -# $path_out_local = "W:/# Out Test/Locke" ; # Erik
 6+ $cfg_path_in_production = "/a/ezachte" ;
 7+ $cfg_path_out_production = "/a/ezachte" ;
 8+# $cfg_path_in_test = "W:/# Out Locke" ; # Erik
 9+# $cfg_path_out_test = "W:/# Out Test/Locke" ; # Erik
 10+ $cfg_path_in_test = "/srv/erik/" ; # André
 11+ $cfg_path_out_test = "/srv/erik/" ; # André
812
9 - $path_in = "/srv/erik/" ; # André
10 - $path_out = "/srv/erik/" ; # André
11 -
12 -# set defaults for tests on local machine
13 -# $default_argv = "-m 2011-07" ; # monthly report
14 -# $default_argv = "-w" ; # refresh country info from Wikipedia (population etc)
15 -# $default_argv = "-c" ; # country/regional reports
16 - $default_argv = "-c -q 2011Q4" ; # country/regional reports based on data for one quarter only
 13+# set default arguments for test on local machine
 14+# $cfg_default_argv = "-m 2011-07" ; # monthly report
 15+# $cfg_default_argv = "-w" ; # refresh country info from Wikipedia (population etc)
 16+# $cfg_default_argv = "-c" ; # country/regional reports
 17+ $cfg_default_argv = "-c -q 2011Q4" ; # country/regional reports based on data for one quarter only
Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
@@ -4,7 +4,7 @@
55 # test:
66 # echo 125.123.123.123 | /usr/local/bin/geoiplogtag 1
77 # refresh: bayes:/usr/share/GeoIP> wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
8 -use config ;
 8+use SquidCountArchiveConfig ;
99
1010 sub CollectFilesToProcess
1111 {
@@ -24,6 +24,8 @@
2525
2626 my ($date_archived) ;
2727
 28+ $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : $cfg_dir_in_test ;
 29+
2830 $some_files_found = $false ;
2931 $full_range_found = $false ;
3032
@@ -47,7 +49,7 @@
4850 $date_archived = sprintf ("%4d%02d%02d", $year+1900, $month+1, $day) ;
4951 print "\n- Inspect file saved $days_ago_inspect days ago: $logname-$date_archived.gz\n" ;
5052
51 - my $file = "$dir_in/$logname-$date_archived.gz" ;
 53+ my $file = "$dir_in/$cfg_logname-$date_archived.gz" ;
5254
5355 if (! -e $file)
5456 { print "- File not found: $file\n" ; }
@@ -173,8 +175,8 @@
174176 else
175177 {
176178 open IN, '<', $file_in ;
177 - # $fields_expected = 14 ;
178 - $fields_expected = 13 ;
 179+ $fields_expected = 14 ; # add fake country code
 180+ # $fields_expected = 13 ;
179181 }
180182
181183 $line = "" ;
@@ -192,6 +194,12 @@
193195 # ugly Q&D code to circumvent spaces in agent string
194196 # $line2 = $line ;
195197 chomp $line ;
 198+
 199+ if ($test)
 200+ { $line .= ' XX' ; }
 201+
 202+ $line =~ s/x-www-form-urlencoded; charset=UTF-8/x-www-form-urlencoded;%20charset=UTF-8/ ; # log lines are space delimited, other spaces should be encoded
 203+
196204 @fields = split (' ', $line) ;
197205 # next if $line =~ /upload/ ;
198206 # next if $line !~ /en\.m\.wikipedia/ ;
@@ -201,23 +209,46 @@
202210 #next if $fields [9] =~ /NONE/ ;
203211 if ($#fields > 14)
204212 {
 213+if (! $scan_ip_frequencies)
 214+{
205215 # print "line $line2\n" ;
206216 # print "fields " . $#fields . "\n$line\n" ;
 217+}
 218+
207219 $country_code = $fields [$#fields] ;
208220 $fields [$#fields] = '' ;
209221 $line = join (' ', @fields) ;
210 -# print "2 $line\n" ;
211222 @fields = split (' ', $line, 14) ;
212223 $fields [14] = $country_code ;
 224+ $fields [13] =~ s/ /%20/g ;
 225+
 226+if (! $scan_ip_frequencies)
 227+{
 228+# print "2 $line\n" ;
213229 # print "\n\n12: " . $fields [12] . "\n" ;
214230 # print "13: " . $fields [13] . "\n" ;
215231 # print "14: " . $fields [14] . "\n" ;
216232 # print "15: " . $fields [15] . "\n" ;
 233+}
217234 }
218235
219 - if ($#fields < $fields_expected) { $fields_too_few ++ ; print "invalid field count " . $#fields . "\n" ; next ; }
220 - if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid field count " . $#fields . "\n" ; next ; }
 236+ if ($#fields < $fields_expected)
 237+ {
 238+ $fields_too_few ++ ;
 239+ print "invalid field count " . $#fields . "\n" ;
 240+ print ERR $#fields . " fields: \"$line\"\n" ;
 241+ next ;
 242+ }
221243
 244+ if ($#fields > $fields_expected)
 245+ {
 246+ @a = @fields ;
 247+ $fields_too_many ++ ;
 248+ print "invalid field count " . $#fields . "\n" ;
 249+ print ERR $#fields . " fields: \"$line\"\n" ;
 250+ next ;
 251+ }
 252+
222253 $time = $fields [2] ;
223254
224255 if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
@@ -230,7 +261,7 @@
231262
232263 if ($time lt $time_to_start)
233264 {
234 - if (++ $times % 100000 == 0)
 265+ if (++ $times % 1000000 == 0)
235266 { print "[$time]\n" ; }
236267 next ;
237268 }
@@ -266,12 +297,23 @@
267298 #next if $line !~ /http:\/\/\w+\.m\./ ;
268299 #print "$line\n" ;
269300 &ProcessLine ($line) ;
270 - if (++ $lines_processed % 10000 == 0)
 301+ if (++ $lines_processed % 50000 == 0)
271302 {
 303+ if (! $scan_ip_frequencies) # phase 2
 304+ {
 305+ $perc_mobile_all = '-' ;
 306+ if ($records {"*,*"} > 0)
 307+ { $perc_mobile_all = sprintf ("%.1f", 100 * $records {"M,*"} / $records {"*,*"}) ; }
 308+ $perc_mobile_pages = '-' ;
 309+ if ($records {"*,page"} > 0)
 310+ { $perc_mobile_pages = sprintf ("%.1f", 100 * $records {"M,page"} / $records {"*,page"}) ; }
 311+ $perc_mobile = " (mobile: all $perc_mobile_all\%, pages $perc_mobile_pages\%)" ;
 312+ }
 313+
272314 if ($banner_requests_ignored == 0)
273 - { print "$time $lines_processed\n" ; }
 315+ { print "$time $lines_processed$perc_mobile\n" ; }
274316 else
275 - { print "$time $lines_processed ($banner_requests_ignored banner requests ignored)\n" ; }
 317+ { print "$time $lines_processed$perc_mobile ($banner_requests_ignored banner requests ignored)\n" ; }
276318 }
277319 if ($test and $lines_processed >= $test_maxlines)
278320 { last ; }
Index: trunk/wikistats/squids/SquidCountryScan.pl
@@ -4,7 +4,7 @@
55 ## sub ProcessRawData <- SquidDataCountries.csv -> ??
66
77 use SquidCountryScanConfig ;
8 - use lib $liblocation ;
 8+ use lib $cfg_liblocation ;
99 use EzLib ;
1010 $trace_on_exit = $true ;
1111
@@ -23,7 +23,7 @@
2424 # exit ;
2525 }
2626
27 - $path_root = $job_runs_on_production_server ? $path_root_production : $path_root_test ;
 27+ $path_root = $job_runs_on_production_server ? $cfg_path_root_production : $cfg_path_root_test ;
2828
2929 $file_raw_data_monthly_visits = "$path_root/SquidDataVisitsPerCountryMonthly.csv" ;
3030 $file_raw_data_daily_visits = "$path_root/SquidDataVisitsPerCountryDaily.csv" ;
Index: trunk/wikistats/squids/SquidReportArchive.pl
@@ -1,13 +1,13 @@
22 #!/usr/bin/perl
33
44 use SquidReportArchiveConfig ;
5 - use lib $liblocation ;
 5+ use lib $cfg_liblocation ;
66
77 use EzLib ;
88 $trace_on_exit = $true ;
99 ez_lib_version (2) ;
1010
11 - default_argv ($default_argv) ;
 11+ default_argv ($cfg_default_argv) ;
1212
1313 # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
1414 # ReportOrigin how to handle '!error <-> other
@@ -32,12 +32,12 @@
3333
3434 undef %country_code_not_specified_reported ;
3535
36 - if (-d "/a/squid")
37 - {
38 - &Log ("\n\nJob runs on server $hostname\n\n") ;
39 - $path_in = "/a/ezachte" ;
40 - $path_out = "/a/ezachte" ;
41 - }
 36+ $path_in = $job_runs_on_production_server ? $cfg_path_in_production : $cfg_path_in_test ;
 37+ $path_out = $job_runs_on_production_server ? $cfg_path_out_production : $cfg_path_out_test ;
 38+
 39+ &Log ("Path in = $path_in\n") ;
 40+ &Log ("Path out = $path_out\n") ;
 41+
4242 # following test needs to change -> remove server name dependency (new run argument ?)
4343 # elsif ($hostname eq 'bayes')
4444 # {
@@ -45,16 +45,7 @@
4646 # $path_in = "/home/ezachte/wikistats/animation" ;
4747 # $path_out = "/home/ezachte/wikistats/animation" ;
4848 # }
49 - else
50 - {
51 - print "Job runs local for tests\n\n" ;
52 - $path_in = $path_in_local ;
53 - $path_out = $path_out_local ;
54 - }
5549
56 - &Log ("Path in = $path_in\n") ;
57 - &Log ("Path out = $path_out\n") ;
58 -
5950 $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
6051
6152 # periodically harvest updated metrics from
Index: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
@@ -1,7 +1,7 @@
22 #!/usr/bin/perl
33
4 - use config ;
5 - use lib $liblocation ;
 4+ use SquidCountArchiveConfig ;
 5+ use lib $cfg_liblocation ;
66 use EzLib ;
77
88 sub WriteOutputIpFrequencies
Index: trunk/wikistats/squids/SquidCountArchive.pl
@@ -1,7 +1,7 @@
22 #!/usr/bin/perl
33
4 - use config ;
5 - use lib $liblocation ;
 4+ use SquidCountArchiveConfig ;
 5+ use lib $cfg_liblocation ;
66 use EzLib ;
77
88 $trace_on_exit = $true ;
@@ -13,8 +13,7 @@
1414 use SquidCountArchiveReadInput ;
1515 use SquidCountArchiveWriteOutput ;
1616
17 - # set defaults mainly for tests on local machine
18 - default_argv $default_argv;
 17+ default_argv $cfg_default_argv ;
1918
2019 # http://wikitech.wikimedia.org/view/Squid_log_format
2120 # 1. Hostname
@@ -43,25 +42,24 @@
4443 # todo: parm -r root folder
4544
4645 $test = $false ;
47 - $test_maxlines = 4000000 ;
 46+ $test_maxlines = $cfg_text_maxlines ;
 47+ $file_test = $cfg_file_test ;
4848
4949 if (! $job_runs_on_production_server)
5050 {
5151 $test = $true ;
52 - $file_test = "w:/# Out Locke/sampled-1000-log-20110401.txt" ;
53 - # $file_test = getcwd . "/SquidDataFilterFY.txt" ;
54 - if (! -e $file_test)
 52+ if (! -e $cfg_file_test)
5553 { abort "Test input file '$file_test' not found" ; }
5654 }
5755
5856 $time_start = time ;
5957
60 - $path_root = "/srv/erik" ;
 58+ $path_root = $job_runs_on_production_server ? $cfg_path_root_production : $cfg_path_root_test ;
6159
6260 $tags_wiki_mobile = "Wikiamo|Wikipanion|Wikimedia" ;
6361
6462 $tags_mobile = "Android|BlackBerry|Windows CE|DoCoMo|iPad|iPod|iPhone|HipTop|Kindle|LGE|Linux arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Opera Mobi|Palm|Playstation Portable|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|HTC_Touch|KDDI|FOMA|HTC_HD2|Polaris|Teleca" ;
65 - $tags_mobile_upd = "August 2011" ;
 63+ $tags_mobile_upd = "February 2012" ;
6664
6765 $pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ;
6866 $pattern_url_post = "\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$" ;
@@ -502,6 +500,7 @@
503501 undef %origins_external ;
504502 undef %origins_unsimplified ;
505503 undef %referers_internal ;
 504+ undef %records ;
506505 undef %requests ;
507506 undef %scripts ;
508507 undef %search ;
@@ -510,6 +509,7 @@
511510 undef %squid_events ;
512511 undef %squid_seqno ;
513512 undef %statusses ;
 513+ undef %total_clients ;
514514 undef %unrecognized_domains ;
515515 undef %wikis ;
516516 # undef @files ;
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
@@ -18,14 +18,22 @@
1919 { $mime = "text/html" ; }
2020 }
2121
 22+ $count_event = 1 ;
 23+ # from Oct 16, 2011 00:00 hrs till Nov 29, 2011 20:00 hrs one of the two servers which process requests to the mobile site did not send log lines
 24+ # since the two servers are load-balanced, selected stats (e.g. breakdown browser, OS) can be repaired by counting requests to mobile site twice in this period
 25+ # note: do not count twice for metrics where specific ip addresses are considered, this would inflate number of assumed bots (based on ip address freq.)
 26+ if ((($time ge '2011-10-16T00') && ($time lt '2011-11-29T20')) and
 27+ ($url =~ /\.m\./))
 28+ { $count_event = 2 ; }
 29+
2230 if ($scan_ip_frequencies) # phase 1
2331 {
2432 return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ;
2533
2634 if ($mime eq "text/html")
2735 {
28 - $ip_frequencies {$client_ip} ++ ;
29 - $html_pages_found ++ ;
 36+ $ip_frequencies {$client_ip} ++ ; # do not use $count_event here!
 37+ $html_pages_found += $count_event ;
3038 }
3139
3240 return ;
@@ -33,8 +41,8 @@
3442
3543
3644 # remember for each squid per hour lowest and highest sequence number and number of events
37 - # later calc per hour average distance between events = (higest - lowest sequence number) / events - 1
38 - # distance between consecutive events that lay in different hour bin are ignored, begligible
 45+ # later calc per hour average distance between events = (highest - lowest sequence number) / events - 1
 46+ # distance between consecutive events that lay in different hour bin are ignored, negligible
3947 $squid = $fields [0] ;
4048 $seqno = $fields [1] ;
4149 $hour = substr ($time, 11, 2) ;
@@ -64,12 +72,12 @@
6573
6674 if ($url =~ /\.m\.wikipedia.org/)
6775 {
68 - $url_wikipedia_mobile ++ ;
69 - $status_url_wikipedia_mobile {$status} ++ ;
70 - $status_mime_url_wikipedia_mobile {"$status,$mime"} ++ ;
 76+ $url_wikipedia_mobile += $count_event ;
 77+ $status_url_wikipedia_mobile {$status} += $count_event ;
 78+ $status_mime_url_wikipedia_mobile {"$status,$mime"} += $count_event ;
7179 if ($status eq "TCP_MISS/302")
7280 {
73 - $redirected_to_mobile ++ ;
 81+ $redirected_to_mobile += $count_event ;
7482 return ;
7583 }
7684 }
@@ -81,13 +89,13 @@
8290
8391 ($agent2 = $agent) =~ s/\%20/ /g ; # mainly to make line content more readable on debugging
8492 $agent2 =~ s/\%2F/\//g ; # mainly to make line content more readable on debugging
85 - $agents_raw {$agent2}++ ;
 93+ $agents_raw {$agent2} += $count_event ;
8694
8795 ($file,$ext) = &GetFileExt ($url) ;
88 - $exts {$ext}++ ;
 96+ $exts {$ext} += $count_event ;
8997
9098 if (($ext eq "js") || ($ext eq "css"))
91 - { $scripts {"$ext,$file,"} ++ ; }
 99+ { $scripts {"$ext,$file,"} += $count_event ; }
92100
93101 $title = "" ;
94102 $parm = "" ;
@@ -103,14 +111,14 @@
104112 if ($parm eq "?") { return ; } # error
105113 $file =~ s/,/&comma;/go ;
106114 $parm =~ s/,/&comma;/go ;
107 - $scripts {"php,$file,$parm"} ++ ;
 115+ $scripts {"php,$file,$parm"} += $count_event ;
108116 $ext .= "($file)" ; # add filename behind extension php
109117 }
110118
111119 if ($mime eq "text/html")
112120 {
113121 $mimecat = "page" ;
114 - $tot_mime_html ++ ;
 122+ $tot_mime_html += $count_event ;
115123 }
116124 elsif ($mime =~ /(?:gif|png|jpeg)/o)
117125 { $mimecat = "image" ; }
@@ -137,12 +145,12 @@
138146
139147 if ($line =~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/io)
140148 {
141 - $banners {"$country,$url"} ++ ;
142 - $banner_requests_ignored ++ ;
 149+ $banners {"$country,$url"} += $count_event ;
 150+ $banner_requests_ignored += $count_event ;
143151 return ;
144152 }
145153
146 - $countries {$country}++ ;
 154+ $countries {$country} += $count_event ; ;
147155
148156 $agent2 = $agent ;
149157 $agent2 =~ s/\%20/ /g ;
@@ -202,13 +210,13 @@
203211 if ($agent2 !~ /MSIE \d+\/\d+/o) # most likely false positives
204212 {
205213 $bot = $true ;
206 - @bots {"$mime,$agent2"} ++ ;
 214+ @bots {"$mime,$agent2"} += $count_event ;
207215 }
208216 }
209217 elsif (($agent2 =~ /bot/io) || (($agent2 =~ /crawl(?:er)?/io) && ($agent2 !~ /MSIEcrawler/io)) || ($agent2 =~ /spider/io) || ($agent2 =~ /parser/io))
210218 {
211219 $bot = $true ;
212 - @bots {"$mime,$agent2"} ++ ;
 220+ @bots {"$mime,$agent2"} += $count_event ;
213221 }
214222
215223 # GECKO
@@ -337,7 +345,7 @@
338346 if (($os eq '..') && ($mobile eq 'M' || $mobile eq 'W'))
339347 {
340348 $os = "Mobile other" ;
341 - $mobile_other {$agent2} ++ ;
 349+ $mobile_other {$agent2} += $count_event ;
342350 }
343351
344352 if ($version =~ /(?:Ipod|Iphone)/io)
@@ -557,21 +565,21 @@
558566 {
559567 $engine =~ s/,/&comma;/go ;
560568 if ($gecko ne "")
561 - { $engines {$gecko} ++ ; }
 569+ { $engines {$gecko} += $count_event ; }
562570 elsif ($applewebkit ne "")
563571 {
564572 $applewebkit =~ s/AppleWebKit\//AppleWebKit /o ;
565 - $engines {$applewebkit} ++ ;
 573+ $engines {$applewebkit} += $count_event ; ;
566574 }
567575
568576 $version =~ s/,/&comma;/go ;
569577 if ($os =~ /playstation/io)
570578 { $version = "NetFront (PlayStation)" ; }
571579
572 - $clients {"$mobile,$version,$mimecat"}++ ;
 580+ $clients {"$mobile,$version,$mimecat"} += $count_event ; ;
573581
574582 $operating_systems =~ s/,/&comma;/go ;
575 - $operating_systems {"$mobile,$os"} ++ ;
 583+ $operating_systems {"$mobile,$os"} += $count_event ; ;
576584 }
577585
578586 if ($count_hits_per_ip_range)
@@ -583,16 +591,16 @@
584592
585593 if ($status =~ /^TCP/)
586594 {
587 - $statusses {"$method:$status"}++ ;
588 - $statusses {"$method:total"}++ ;
 595+ $statusses {"$method:$status"} += $count_event ;
 596+ $statusses {"$method:total"} += $count_event ;
589597 }
590598 else
591 - { $statusses_non_tcp ++ ; }
 599+ { $statusses_non_tcp += $count_event ; }
592600
593601 if ($url =~ /org\/skins/o)
594602 {
595603 ($url2 = $url) =~ s/^.*?\/skins/skins/o ;
596 - $skins {$url2} ++ ;
 604+ $skins {$url2} += $count_event ; ;
597605 }
598606
599607 if ($url =~ /^upload\.wikimedia\.org\//o) # count image size if applicable
@@ -604,7 +612,7 @@
605613
606614 # for diagnostics
607615 if (($referer =~ /google/o) || ($agent =~ /google/io))
608 - { $googles++ ; }
 616+ { $googles += $count_event ; }
609617
610618 $referer =~ s/^http\w?\:\/\///o ;
611619 $referer =~ s/\.php\?.*$/\.php\?../go ;
@@ -622,7 +630,7 @@
623631 if (($domain =~ /\./o) ||
624632 ($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o))
625633 {
626 - $unrecognized_domains {$domain_original} ++ ;
 634+ $unrecognized_domains {$domain_original} += $count_event ;
627635 $domain = 'other' ;
628636 }
629637
@@ -641,24 +649,24 @@
642650
643651 if ($referer_external)
644652 {
645 - $tot_referers_external++ ;
 653+ $tot_referers_external += $count_event ; ;
646654
647655 ($origin, $toplevel) = &DetectOrigin ($client_ip, $referer_original, $agent, $mime, $mimecat, $service, $ext) ;
648656
649657 &CountOrigin ("external", $origin, $toplevel, $mimecat) ;
650658
651659 if ($origin !~ /^\!/o)
652 - { $origins_unsimplified {$referer_original} ++ ; }
 660+ { $origins_unsimplified {$referer_original} += $count_event ; }
653661 else
654662 {
655 - $origin_simplified {"$origin [$referer] <- $referer_original"} ++ ;
656 - $origins_external {$origin} ++ ;
 663+ $origin_simplified {"$origin [$referer] <- $referer_original"} += $count_event ; ;
 664+ $origins_external {$origin} += $count_event ;
657665 }
658666 }
659667 else
660668 {
661 - $tot_referers_internal ++ ;
662 - $referers_internal {$referer} ++ ;
 669+ $tot_referers_internal += $count_event ;
 670+ $referers_internal {$referer} += $count_event ;
663671 $referer =~ s/!//go ; # ! was marker to signal pattern was recognized as wikimedia project
664672 &CountOrigin ("internal", $referer, "org" , $mimecat) ;
665673 }
@@ -670,10 +678,9 @@
671679 if ($domain =~ /!/o)
672680 { print ERR "still ! in domain: '$domain' <- '$domain_original'\n" ; }
673681
674 - $requests {"$domain|$referer|$ext|$mime|$parm"}++ ;
 682+ $requests {"$domain|$referer|$ext|$mime|$parm"} += $count_event ; ;
 683+ $clients_by_wiki {"$mobile,$version,$domain"} += $count_event ; ;
675684
676 - $clients_by_wiki {"$mobile,$version,$domain"}++ ;
677 -
678685 # different output use either 'bot=N' or 'M'(anual) / 'bot=Y' or 'B'(ot)
679686 if ($bot)
680687 {
@@ -689,13 +696,13 @@
690697 if (($domain =~ /^\@/) || ($domain =~ /^\*/))
691698 {
692699 # print "Requests wap $domain | $ext | $mime | $parm | $country | $ind_bot\n" ;
693 - $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
 700+ $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += $count_event ; ;
694701 }
695702
696703 if ($domain =~ /^\%/)
697704 {
698705 # print "Requests m $domain | $ext | $mime | $parm | $country | $ind_bot\n" ;
699 - $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
 706+ $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += $count_event ; ;
700707 }
701708 # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name)
702709 if (($url =~ /index.php\?/o) && ($title !~ /:/o) && ($mime eq "text/html") && (($url =~ /action=edit/o) || ($url =~ /action=submit/o)))
@@ -708,34 +715,40 @@
709716 $key =~ s/,/&comma;/go ;
710717 $key =~ s/\|/,/go ;
711718
712 - $index_php_raw {$key}++ ;
 719+ $index_php_raw {$key} += $count_event ; ;
713720 $client_ip_record_cnt {$client_ip}++ ;
714721 }
715722
716723 if ($mimecat eq "page")
717724 {
718 - $tot_mime_html2 ++ ;
 725+ $tot_mime_html2 += $count_event ;
719726
720727 if (($ind_bot =~ /N/) and ($ip_frequencies {$client_ip} > 2))
721728 { $ind_bot = 'bot=Y' ; }
722729
723 - $countries_views {"$ind_bot,$domain,$country"} ++ ;
724 -
 730+ $countries_views {"$ind_bot,$domain,$country"} += $count_event ; ;
725731 # $title !~ /:/ -> only namespace 0 (minus few titles with colon in name)
726732 if (($url =~ /index.php\?/o) && ($title !~ /:/) && ($mime eq "text/html") && ($url =~ /action=submit/o) && ($status =~ /302/o))
727 - { $countries_saves {"$ind_bot,$domain,$country"} ++ ; }
 733+ { $countries_saves {"$ind_bot,$domain,$country"} += $count_event ; }
728734
729735 $time_hh = substr ($time,11,2) ;
730736 $time_mm = substr ($time,14,2) ;
731737 $time_tt = $time_hh * 60 + $time_mm ;
732738 $time_tt2 = $time_tt - $time_tt % 15 ;
733 - $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} ++ ;
 739+ $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} += $count_event ;
734740
 741+ if (! $test)
 742+ {
 743+ $time2 = substr ($time,0,19) ; # omit msec
 744+ $line = "$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ;
 745+ $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to $file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
 746+ }
 747+ }
735748
736 - $time2 = substr ($time,0,19) ; # omit msec
737 - $line = "$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ;
738 - $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to $file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
739 - }
 749+ $records {"$mobile,$mimecat"} += $count_event ;
 750+ $records {"*,$mimecat"} += $count_event ;
 751+ $records {"$mobile,*"} += $count_event ;
 752+ $records {"*,*"} += $count_event ;
740753 }
741754
742755 sub ExtractLanguage
@@ -747,10 +760,10 @@
748761 $regexp_lang = "[a-z]{2}(?:-[a-zA-Z]{2,3})?(?:-[a-zA-Z]{2,3})?" ;
749762 ($language = $agent) =~ s/^.*?; ($regexp_lang)[\);].*$/$1/o ;
750763 if ($language eq $agent)
751 - { $languages_unrecognized {$agent} ++ ; }
 764+ { $languages_unrecognized {$agent} += $count_event ; }
752765 else
753766 {
754 - $languages {"$application,$language"} ++ ;
 767+ $languages {"$application,$language"} += $count_event ;
755768 $agent =~ s/ $language//o ;
756769 }
757770 return ($agent) ;
@@ -823,7 +836,8 @@
824837
825838 foreach $parm (@parms)
826839 {
827 - next if $parm eq "" ;
 840+ next if $parm eq '' ;
 841+ next if $parm eq '*' ;
828842
829843 if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
830844 { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid = $true ; last }
@@ -901,7 +915,7 @@
902916 $domain =~ s/\.m\./.%/o ;
903917
904918 if ($domain =~ /^error:/o)
905 - { $domain_errors {$domain}++ ; }
 919+ { $domain_errors {$domain} += $count_event ; }
906920 $domain =~ s/error:.*$/!error:1/o ;
907921
908922 $domain =~ s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/o ;
@@ -1018,16 +1032,16 @@
10191033 if (($googlematch eq "- - z") && ($service =~ /GoogleBot/io))
10201034 {
10211035 $service = "GoogleBot?" ;
1022 - $google_imposters {$agent}++ ;
 1036+ $google_imposters {$agent} += $count_event ;
10231037 }
10241038
10251039 # obsolete? to be considered ?
10261040 # if (($googlematch ne "- - z") || ($service =~ /(?:Earth|Desktop)/o))
1027 - # { $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ; }
 1041+ # { $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} += $count_event ; }
10281042 # else
10291043 # { $accept = "not" ; }
10301044
1031 - $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} ++ ;
 1045+ $search {"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} += $count_event ;
10321046
10331047 $googlebins2 {"$accept [$googlematch] " . sprintf ("%-14s",$service) . $referer} ++ ;
10341048 $googlebins {$googlematch}++ ;
@@ -1055,7 +1069,7 @@
10561070 # }
10571071
10581072 if ($origin =~ /wiki/o)
1059 - { $wikis {$origin} ++ ; }
 1073+ { $wikis {$origin} += $count_event ; }
10601074
10611075 if ($origin eq "wikipedia")
10621076 {
@@ -1187,7 +1201,7 @@
11881202
11891203 if ($source eq "external")
11901204 {
1191 - $tot_origins_external_counted ++ ;
 1205+ $tot_origins_external_counted += $count_event ;
11921206 $origin =~ s/\:.*$//o ;
11931207 if (is_valid_ip_address ($origin))
11941208 { $origin = "unmatched ip address" ; $toplevel = "" ; }
@@ -1203,7 +1217,7 @@
12041218 # print "$origin\n" ;
12051219 }
12061220 }
1207 - $origins {"$source,$origin,$toplevel,$mimecat"} ++ ;
 1221+ $origins {"$source,$origin,$toplevel,$mimecat"} += $count_event ;
12081222 }
12091223
12101224 sub ProcessUploadPath
@@ -1213,7 +1227,7 @@
12141228 ($path = $url) =~ s/^.*?\.org\///o ;
12151229 ($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path
12161230
1217 - $binaries {$path} ++ ; # Jan 2012 store path, not file only
 1231+ $binaries {$path} += $count_event ; # Jan 2012 store path, not file only
12181232
12191233 if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io)
12201234 {
@@ -1227,10 +1241,10 @@
12281242 {
12291243 ($size = $file) =~ s/^.*?(\d+)px.*$/$1/o ;
12301244 $sizerange = sprintf ("%5d",(int ($size / 20)) * 20) . "-" . sprintf ("%5d",(((int ($size / 20))+1) * 20 - 1)) ;
1231 - $imagesizes {$sizerange} ++ ;
 1245+ $imagesizes {$sizerange} += $count_event ;
12321246 }
12331247 else
1234 - { $imagesizes {"???"} ++ ; }
 1248+ { $imagesizes {"???"} += $count_event ; }
12351249 }
12361250 }
12371251

Status & tagging log