r109171 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r109170‎ \| r109171 \| r109172 >
Date:	18:21, 17 January 2012
Author:	ezachte
Status:	deferred
Tags:
Comment:	missing file added + patches for incompatible varnish (mobile) log
Modified paths:	/trunk/wikistats/squids/EzLib.pm (added) (history) /trunk/wikistats/squids/SquidCountArchive.pl (modified) (history) /trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm (modified) (history) /trunk/wikistats/squids/SquidCountArchiveReadInput.pm (modified) (history) /trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm (added) (history) /trunk/wikistats/squids/SquidReportArchive.pl (modified) (history)

Diff [purge]

Index: trunk/wikistats/squids/EzLib.pm
—	—	@@ -0,0 +1,594 @@
	2	+# Erik Zachte - general purpose routines
	3	+# subroutines in this module have names in lowercase (I usually name my own routines in mixed case, though not consistently (yet))
	4	+
	5	+no warnings 'uninitialized';
	6	+
	7	+#use POSIX qw (locale_h);
	8	+#$old_locale = setlocale(LC_LANGUAGE) ;
	9	+#print "old locale LC_LANGUAGE $old_locale\n" ;
	10	+#$new_locale = setlocale(LC_LANGUAGE, "en_US.UTF-8");
	11	+#print "new locale LC_LANGUAGE $new_locale\n" ;
	12	+
	13	+$ez_lib_version = 14 ;
	14	+
	15	+sub ez_lib_version # exits the program when this library is older than the version the caller asks for
	16	+{
	17	+ ($ez_lib_version_required) = @_ ; # stays a global, as other code may read it
	18	+ unless ($ez_lib_version >= $ez_lib_version_required)
	19	+ { print "EzLib out of date: version " . $ez_lib_version_required . " required" ; exit ; }
	20	+}
	21	+
	22	+use lib "/home/ezachte/lib" ;
	23	+
	24	+use Time::HiRes ;
	25	+use Time::Local ;
	26	+use Getopt::Std ;
	27	+use Carp ;
	28	+use Net::Domain qw (hostname);
	29	+use Digest::MD5 qw (md5_hex);
	30	+use Cwd ;
	31	+use Benchmark qw (timesum timediff timestr timethis timethese cmpthese) ;
	32	+use POSIX ;
	33	+
	34	+sub date_time_english ($) ;
	35	+
	36	+$true = 1 ;
	37	+$false = 0 ;
	38	+
	39	+($app_start_user,$app_start_system) = times ;
	40	+
	41	+# Get host name
	42	+$hostname = `hostname` ;
	43	+chomp ($hostname) ;
	44	+
	45	+$os = $^O ;
	46	+$os_linux = $true if $os =~ /linux/i ;
	47	+$os_windows = $true if $os =~ /win32/i ;
	48	+
	49	+$path_program = $0 ;
	50	+$path_program = Win32::GetLongPathName ($path_program) if $os_windows ;
	51	+($path_program,$name_program) = split '[\\\/](?=[^\\\/]*$)', $path_program ;
	52	+
	53	+die "Operating system '$os' not supported" if (! $os_linux and ! $os_windows) ;
	54	+
	55	+if ($os_linux) # && (-d "/home/ezachte")) # runs on server, to be refined
	56	+{
	57	+ $job_runs_on_production_server = $true ;
	58	+ $path_home = "/home/ezachte" ;
	59	+}
	60	+else
	61	+{ $path_home = getcwd () ; }
	62	+
	63	+$trace_on_exit = $false ; # shorthand for $trace_on_exit_concise
	64	+$trace_on_exit_concise = $false ;
	65	+$trace_on_exit_verbose = $false ;
	66	+$trace_on_exit_libs = $false ;
	67	+
	68	+# emulate new perl 5.10 function
	69	+sub say # prints its first argument plus a newline; the text is also left in the global $msg
	70	+{ ($msg) = @_ ; print $msg . "\n" ; }
	71	+
	72	+# if no explicit parameters specified use these defaults (mainly for tests)
	73	+sub default_argv # Fills @ARGV from the given default string when no arguments were passed, then prints a start banner.
	74	+{
	75	+ my $argv = shift ; # default argument string
	76	+ if (($#ARGV == -1) && (! $job_runs_on_production_server)) # defaults apply only without arguments and off the production server
	77	+ {
	78	+ $argv =~ s/('[^'\|]+')/($a=$1,$a=~s# #``#g,$a)/ge ; # protect spaces inside '...' by replacing them with ``
	79	+ $argv =~ s/("[^'\|]+")/($a=$1,$a=~s# #``#g,$a)/ge ; # same for "..." - NOTE(review): the class excludes ' rather than ", confirm intent
	80	+ $argv =~ s/\s*\\|/ /g ; # NOTE(review): presumably meant to turn a '|' separator into a space - confirm the escaping against the repository copy
	81	+ $argv =~ s/\\|\s*/ /g ;
	82	+ # @ARGV = split '\\|', $argv ;
	83	+ @ARGV = split ' ', $argv ;
	84	+ foreach $arg (@ARGV)
	85	+ { $arg =~ s/``/ /g ; } # restore the protected spaces in each argument
	86	+ $argv =~ s/``/ /g ;
	87	+ }
	88	+ else
	89	+ { $argv = join ' \| ', @ARGV ; } # arguments were given: only build the text shown in the banner
	90	+ print "\nScript $name_program started at " . date_time_english (time) . "\n" ;
	91	+ print "Arguments: $argv\n" ;
	92	+ print "\n" . '=' x 80 . "\n\n" ;
	93	+ @ARGV_BAK = @ARGV ; # copy that the END block prints under "Args:"
	94	+}
	95	+
	96	+# Get file time
	97	+sub file_time ($)
	98	+{
	99	+ # Returns the last modification time of $path in epoch seconds, or '?' when the path does not exist.
	100	+ # The time comes from stat: -M counts from script start ($^T), so "time - (-M ...)" drifts by the run time.
	101	+ my $path = shift ;
	102	+
	103	+ return '?' unless -e $path ;
	104	+ return ((stat ($path)) [9]) ;
	105	+}
	106	+
	107	+# Get last modification of this file
	108	+sub trace_ez_lib # prints when the installed copy of EzLib.pm was last modified
	109	+{
	110	+ $file_pm = 'EzLib.pm' ;
	111	+ $path_pm = "/home/ezachte/lib/$file_pm" ;
	112	+ if (! -e $path_pm) { print "File $path_pm not found\n" ; return ; } # nothing to date: stop instead of reporting a bogus time
	113	+ $path_pm_age = (stat ($path_pm)) [9] ; # mtime in epoch seconds; -M is relative to script start and would drift
	114	+ print "\n$file_pm last modified: " . date_time_english ($path_pm_age) . "\n\n" ;
	115	+}
	116	+
	117	+# Print current file and line number
	118	+# print "File: ", __FILE__, " Line: ", __LINE__, "\n";
	119	+
	120	+# Flush output
	121	+$\| = 1;
	122	+
	123	+# prototype (\%) forces supplying one variable argument, which also is auto converted to reference
	124	+# Pro Perl page 226: Requiring Variables Rather Than Values
	125	+
	126	+# invocation: @array = keys_sorted_by_value_alpha_asc (%hash) ;
	127	+# replaces: @array = sort {$hash{$a} cmp $hash{$b}} keys %hash ;
	128	+sub keys_sorted_by_value_alpha_asc (\%)
	129	+{
	130	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	131	+ return sort { $lookup->{$a} cmp $lookup->{$b} } keys %{$lookup} ; # keys, ordered by value as strings, ascending
	132	+}
	133	+
	134	+# invocation: @array = keys_sorted_by_value_alpha_desc (%hash) ;
	135	+# replaces: @array = sort {$hash{$b} cmp $hash{$a}} keys %hash ;
	136	+sub keys_sorted_by_value_alpha_desc (\%)
	137	+{
	138	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	139	+ return sort { $lookup->{$b} cmp $lookup->{$a} } keys %{$lookup} ; # keys, ordered by value as strings, descending
	140	+}
	141	+
	142	+# invocation: @array = keys_sorted_by_value_num_asc (%hash) ;
	143	+# replaces: @array = sort {$hash{$a} <=> $hash{$b}} keys %hash ;
	144	+sub keys_sorted_by_value_num_asc (\%)
	145	+{
	146	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	147	+ return sort { $lookup->{$a} <=> $lookup->{$b} } keys %{$lookup} ; # keys, ordered by value as numbers, ascending
	148	+}
	149	+
	150	+# invocation: @array = keys_sorted_by_value_num_desc (%hash) ;
	151	+# replaces: @array = sort {$hash{$b} <=> $hash{$a}} keys %hash ;
	152	+sub keys_sorted_by_value_num_desc (\%)
	153	+{
	154	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	155	+ return sort { $lookup->{$b} <=> $lookup->{$a} } keys %{$lookup} ; # keys, ordered by value as numbers, descending
	156	+}
	157	+
	158	+# almost trivial but to match keys_sorted_by_value_... subroutines
	159	+# invocation: @array = keys_sorted_alpha_asc (%hash) ;
	160	+# replaces: @array = sort {$a cmp $b} keys %hash ;
	161	+sub keys_sorted_alpha_asc (\%)
	162	+{
	163	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	164	+ return sort { $a cmp $b } keys %{$lookup} ; # keys as strings, ascending
	165	+}
	166	+
	167	+# almost trivial but to match keys_sorted_by_value_... subroutines
	168	+# invocation: @array = keys_sorted_alpha_desc (%hash) ;
	169	+# replaces: @array = sort {$b cmp $a} keys %hash ;
	170	+sub keys_sorted_alpha_desc (\%)
	171	+{
	172	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	173	+ return sort { $b cmp $a } keys %{$lookup} ; # keys as strings, descending
	174	+}
	175	+
	176	+# almost trivial but to match keys_sorted_by_value_... subroutines
	177	+# invocation: @array = keys_sorted_num_asc (%hash) ;
	178	+# replaces: @array = sort {$a <=> $b} keys %hash ;
	179	+sub keys_sorted_num_asc (\%)
	180	+{
	181	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	182	+ return sort { $a <=> $b } keys %{$lookup} ; # keys as numbers, ascending
	183	+}
	184	+
	185	+# almost trivial but to match keys_sorted_by_value_... subroutines
	186	+# invocation: @array = keys_sorted_num_desc (%hash) ;
	187	+# replaces: @array = sort {$b <=> $a} keys %hash ;
	188	+sub keys_sorted_num_desc (\%)
	189	+{
	190	+ my ($lookup) = @_ ; # hash reference (the prototype turns the caller's %hash into one)
	191	+ return sort { $b <=> $a } keys %{$lookup} ; # keys as numbers, descending
	192	+}
	193	+
	194	+# for multilingual version see wikiReportsDate.pl / sub GetDate
	195	+sub date_time_english ($)
	196	+{
	197	+ # Formats an epoch time in the local time zone as e.g. "Tue, Jan 17, 2012 18:21" (hour space-padded to width 2).
	198	+ my $time = shift ;
	199	+ my @day_names = qw (Sunday Monday Tuesday Wednesday Thursday Friday Saturday);
	200	+ my @month_names = qw (January February March April May June July
	201	+ August September October November December);
	202	+ my @parts = localtime ($time) ; # (sec,min,hour,mday,mon,year,wday,...)
	203	+ my $day_abbr = substr ($day_names [$parts [6]],0,3) ;
	204	+ my $month_abbr = substr ($month_names [$parts [4]],0,3) ;
	205	+ my $clock = sprintf ("%2d:%02d", $parts [2], $parts [1]) ;
	206	+ return ($day_abbr . ", " . $month_abbr . " " . $parts [3] . ", " . (1900 + $parts [5]) . " " . $clock) ;
	207	+}
	208	+
	209	+# for multilingual version see wikiReportsDate.pl / sub GetMonthShort
	210	+sub month_english_short ($)
	211	+{
	212	+ # Maps a non-negative integer to a three-letter English month name by indexing Jan..Dec with (value modulo 12),
	213	+ # so 0 and 12 both give "Jan"; anything that is not all digits gives "?".
	214	+ my ($index) = @_ ;
	215	+ return ("?") unless $index =~ /^\d+$/ ;
	216	+
	217	+ my @names = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
	218	+ return ($names [$index % 12]) ;
	219	+}
	220	+
	221	+sub month_year_english_short ($)
	222	+{
	223	+ # Turns a month count into "Mon yyyy", where 1 is "Jan 2000" and 13 is "Jan 2001"; non-numeric input gives "?".
	224	+ my ($count) = @_ ;
	225	+ return ("?") if $count !~ /^\d+$/ ;
	226	+
	227	+ my @names = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
	228	+ my $offset = $count - 1 ; # zero-based number of months since January 2000
	229	+ my $year = 2000 + int ($offset / 12) ;
	230	+ return ($names [$offset % 12] . " " . $year) ;
	231	+}
	232	+
	233	+sub ddhhmmss # Formats a duration in seconds, either as text ("1 day, 2 hrs, 3 min, 4 sec") or through a caller-supplied sprintf format.
	234	+{
	235	+ my $seconds = shift ; # duration to format
	236	+ my $format = shift ; # optional sprintf format; when empty the text form is returned
	237	+
	238	+ $days = int ($seconds / (24*3600)) ; # $days, $hrs, $min and $sec are not declared with my, so they are package globals
	239	+ $seconds -= $days * 24*3600 ;
	240	+ $hrs = int ($seconds / 3600) ;
	241	+ $seconds -= $hrs * 3600 ;
	242	+ $min = int ($seconds / 60) ;
	243	+ $sec = $seconds % 60 ;
	244	+
	245	+ if ($format eq '')
	246	+ {
	247	+ $days = ($days > 0) ? (($days > 1) ? "$days days, " : "$days day, ") : "" ; # from here on $days holds text; the sums below use its leading number
	248	+ $hrs = (($days + $hrs > 0) ? (($hrs > 1) ? "$hrs hrs" : "$hrs hrs") : "") . ($days + $hrs > 0 ? ", " : ""); # 2 hrs/1 hr ?
	249	+ $min = ($days + $hrs + $min > 0) ? "$min min, " : "" ; # leading zero components are left out
	250	+ $sec = "$sec sec" ;
	251	+ return ("$days$hrs$min$sec") ;
	252	+ }
	253	+ else
	254	+ {
	255	+ return sprintf ($format,$days,$hrs,$min,$sec) if $format =~ /%.%.%.*%/ ; # these patterns on the '%' signs in $format decide how many components are passed
	256	+ return sprintf ($format, $hrs,$min,$sec) if $format =~ /%.%.%/ ;
	257	+ return sprintf ($format, $min,$sec) if $format =~ /%.*%/ ; # e.g. '%3d min, %2d sec' as used in the END block
	258	+ return sprintf ($format, $sec) ;
	259	+ }
	260	+}
	261	+
	262	+sub yyyymmddThhmmssDiff
	263	+{
	264	+ # Returns (first argument - second argument) in seconds; both are 'yyyy-mm-ddThh:mm:ss' strings converted with timegm.
	265	+ my @till = $_[0] =~ /(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)/ ;
	266	+ my @from = $_[1] =~ /(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)/ ;
	267	+ my $epoch_till = timegm ($till[5], $till[4], $till[3], $till[2], $till[1] - 1, $till[0]) ;
	268	+ my $epoch_from = timegm ($from[5], $from[4], $from[3], $from[2], $from[1] - 1, $from[0]) ;
	269	+ return ($epoch_till - $epoch_from) ;
	270	+}
	271	+
	272	+sub yyyymmddhhmmssDiff
	273	+{
	274	+ # Returns (first argument - second argument) in seconds; both are 'yyyymmddhhmmss' strings converted with timegm.
	275	+ my @till = $_[0] =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/ ;
	276	+ my @from = $_[1] =~ /(\d\d\d\d)(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)/ ;
	277	+ my $epoch_till = timegm ($till[5], $till[4], $till[3], $till[2], $till[1] - 1, $till[0]) ;
	278	+ my $epoch_from = timegm ($from[5], $from[4], $from[3], $from[2], $from[1] - 1, $from[0]) ;
	279	+ return ($epoch_till - $epoch_from) ;
	280	+}
	281	+
	282	+sub yyyymmddDiffDays
	283	+{
	284	+ # Returns (first argument - second argument) in days; both are 'yyyy-mm-dd' strings taken at midnight via timegm.
	285	+ my @till = $_[0] =~ /(\d\d\d\d)-(\d\d)-(\d\d)/ ;
	286	+ my @from = $_[1] =~ /(\d\d\d\d)-(\d\d)-(\d\d)/ ;
	287	+ my $epoch_till = timegm (0,0,0,$till[2], $till[1] - 1, $till[0]) ;
	288	+ my $epoch_from = timegm (0,0,0,$from[2], $from[1] - 1, $from[0]) ;
	289	+ return (($epoch_till - $epoch_from) / (24 * 60 * 60));
	290	+}
	291	+
	292	+sub yyyymmDiffDays
	293	+{
	294	+ # Returns the number of days from the first day of month $time_from up to the first day of the month after
	295	+ # $time_till (both given as 'yyyy-mm'), so ('2011-12','2011-12') is 31.
	296	+ my ($time_till, $time_from) = @_ ;
	297	+ my ($yy1,$mm1) = $time_till =~ /(\d\d\d\d)-(\d\d)/ ;
	298	+ my ($yy2,$mm2) = $time_from =~ /(\d\d\d\d)-(\d\d)/ ;
	299	+ $mm1++ ;
	300	+ if ($mm1 > 12) { $mm1 = 1 ; $yy1++ ; } # December rolls over into January of the next year (was $yy++, an unrelated variable)
	301	+ return ((timegm (0,0,0,1, $mm1-1, $yy1) - timegm (0,0,0,1, $mm2-1, $yy2)) / (24 * 60 * 60)) ;
	302	+}
	303	+
	304	+sub days_in_month
	305	+{
	306	+ # Returns the number of days in month $month (1-12) of four-digit year $year; results are remembered in the
	307	+ # global %days_in_month_cached under the key "$year $month".
	308	+ my $year = shift ;
	309	+ my $month = shift ;
	310	+ my $days = $days_in_month_cached {"$year $month"} ;
	311	+ return $days if $days > 0 ;
	312	+
	313	+ my $month2 = $month+1 ;
	314	+ my $year2 = $year ;
	315	+ if ($month2 > 12)
	316	+ { $month2 = 1 ; $year2++ }
	317	+
	318	+ # seconds between the first of this month and the first of the next, converted to days (divisor is 24*60*60)
	319	+ $days = (timegm (0,0,0,1,$month2-1,$year2-1900) - timegm (0,0,0,1,$month-1,$year-1900)) / (24 * 60 * 60) ;
	320	+ $days_in_month_cached {"$year $month"} = $days ;
	321	+ return ($days) ;
	322	+}
	323	+
	324	+
	325	+sub abort # Stores the message in the global $msg, then dies through confess (Carp) with a stack trace.
	326	+{
	327	+
	328	+ $msg = shift ; # global, not my: the END block prints $msg
	329	+ confess ("\nAbort: $msg\n\n") ;
	330	+ exit ; # NOTE(review): looks unreachable because confess dies first - confirm before removing
	331	+}
	332	+
	333	+
	334	+# test on each run of script whether message should still be displayed, e.g. "New feature"
	335	+sub blank_text_after
	336	+{
	337	+ # Returns $text as long as the current time is not past the given date, and "" afterwards.
	338	+ # The date is read as day, month, four-digit year (in that order) and taken at 00:00:00 via timegm.
	339	+ my ($date, $text) = @_ ;
	340	+ my @dmy = $date =~ /(\d+).?(\d+).?(\d+)/ ;
	341	+ my $expires = timegm (0,0,0,$dmy[0],$dmy[1]-1,$dmy[2]-1900) ;
	342	+
	343	+ return ("") if time > $expires ;
	344	+ return ($text) ;
	345	+}
	346	+
	347	+# test for four triplets and optional port number
	348	+sub is_valid_ip_address
	349	+{
	350	+ my ($candidate) = @_ ; # accepted form: four dot-separated groups of 1-3 digits, optionally followed by :port
	351	+ return ($candidate =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?:\:\d+)?$/) ; # only the shape is checked, not the 0-255 range
	352	+}
	353	+
	354	+# store elapsed high resolution time, for benchmarking
	355	+sub code_started # returns the current high resolution time, to be handed to code_complete later
	356	+{ my $started_at = Time::HiRes::time() ; return $started_at ; }
	357	+
	358	+sub code_complete
	359	+{
	360	+ my $label = shift ; my $start = shift ; # name of the timed section, and the start time obtained from code_started
	361	+ $code_passes {$label} += 1 ; # number of times this section ran
	362	+ $code_time_spent {$label} += Time::HiRes::time - $start ; # total seconds spent in this section
	363	+}
	364	+
	365	+# only protect division against runtime error
	366	+sub divide_if_allowed
	367	+{
	368	+ # Returns the first argument divided by the second, or an empty list when the second is zero,
	369	+ # so that callers never hit a division by zero runtime error.
	370	+ my ($numerator, $denominator) = @_ ;
	371	+
	372	+ return () if $denominator == 0 ;
	373	+ return ($numerator/$denominator) ;
	374	+}
	375	+
	376	+# use Encode qw(encode);
	377	+# $eckey=encode('utf8',$key);
	378	+sub encode_url # Percent-encodes every character of the url except ASCII letters, digits and the characters % : / .
	379	+{
	380	+ my $url = shift ;
	381	+ $url =~ s/([^0-9a-zA-Z\%\:\/\.])/sprintf ("%%%02X",ord($1))/ge ; # %02X: codes below 0x10 need the leading zero (tab is %09, not %9)
	382	+ return ($url) ;
	383	+}
	384	+
	385	+sub encode_non_ascii # Percent-encodes runs of two or more bytes in the range 0x80-0xFF; all other text is left as is.
	386	+{
	387	+ my $msg = shift ;
	388	+ $msg =~ s/([\x80-\xFF]{2,})/join ('', map { sprintf ("%%%02X", $_) } unpack ('C*', $1))/ge ; # encode every byte of the run, not just the first
	389	+ return ($msg) ;
	390	+}
	391	+
	392	+sub convert_unicode # Replaces runs of bytes >= 0x80 by HTML character codes, but only when the multi-byte pattern below matches.
	393	+{
	394	+ my $string = shift ;
	395	+ my $input_unicoded = ($string =~ m/[\xc0-\xdf][\x80-\xbf]\|
	396	+ [\xe0-\xef][\x80-\xbf]{2}\|
	397	+ [\xf0-\xf7][\x80-\xbf]{3}/x) ;
	398	+ # NOTE(review): as written, '\|' matches a literal '|'; presumably the three sequence shapes were meant as alternatives - confirm against the repository copy
	399	+
	400	+ # unicode -> html character codes &#nnnn;
	401	+ if ($input_unicoded)
	402	+ { $string =~ s/([\x80-\xFF]+)/unicode_to_html($1)/ge ; } # each run of high bytes is converted by unicode_to_html
	403	+ return ($string) ; # unchanged when the test above did not match
	404	+}
	405	+
	406	+sub unicode_to_html # Converts a string of multi-byte sequences into HTML character codes (&#nnnn;), returned as one string.
	407	+{
	408	+ my $text = shift ;
	409	+ my $html = "" ;
	410	+ my ($c, $len, $byte, $ord, $unicode, $bytes) ;
	411	+
	412	+ $len = length ($text) ;
	413	+ for ($c = 0 ; $c < $len ; $c++)
	414	+ {
	415	+ $byte = substr ($text,$c,1) ;
	416	+ $ord = ord ($byte) ;
	417	+ if ($ord < 128) # plain ascii character
	418	+ { $html .= $byte ; } # (will not occur in this script)
	419	+ else
	420	+ {
	421	+ # single byte left >= 0x80 ? should never occur but does a few times
	422	+ # treat as pre-unicode high ascii character
	423	+ if ($c == $len - 1)
	424	+ {
	425	+ $html .= "\&\#". $ord . ";" ; # append (was an assignment), so everything converted earlier in the string is kept
	426	+ # print FILE_ERR $title .":invalid unicode char ".$text. "\n"
	427	+ }
	428	+ else
	429	+ {
	430	+ if ($ord < 224) # the value of the first byte decides how many bytes the sequence has
	431	+ { $bytes = 2 ; }
	432	+ elsif ($ord < 240)
	433	+ { $bytes = 3 ; }
	434	+ elsif ($ord < 248)
	435	+ { $bytes = 4 ; }
	436	+ elsif ($ord < 252)
	437	+ { $bytes = 5 ; }
	438	+ else
	439	+ { $bytes = 6 ; }
	440	+ $unicode = substr ($text,$c,$bytes) ; # first byte plus the bytes that belong to it
	441	+ $html .= unicode_to_html_tag ($unicode) ;
	442	+ $c += $bytes - 1 ; # skip the bytes just consumed (the loop adds the last one)
	443	+ }
	444	+ }
	445	+ }
	446	+ return ($html) ;
	447	+}
	448	+
	449	+
	450	+sub unicode_to_html_tag
	451	+{
	452	+ # Converts one multi-byte sequence (first byte followed by its continuation bytes) into "&#<number>;".
	453	+ # A string that starts with a byte below 128 is returned unchanged.
	454	+ my ($sequence) = @_ ;
	455	+ my $lead = ord (substr ($sequence,0,1)) ;
	456	+
	457	+ return ($sequence) if $lead < 128 ; # plain ascii character (will not occur in this script)
	458	+
	459	+ # subtract the marker value of the first byte: highest marker that fits, 192 otherwise
	460	+ my $code = $lead - 192 ;
	461	+ foreach my $marker (252, 248, 240, 224)
	462	+ {
	463	+  if ($lead >= $marker)
	464	+  { $code = $lead - $marker ; last ; }
	465	+ }
	466	+
	467	+ # every following byte: multiply the running value by 64 and add (byte - 128)
	468	+ foreach my $position (1 .. length ($sequence) - 1)
	469	+ {
	470	+  my $continuation = ord (substr ($sequence,$position,1)) ;
	471	+  $code = $code * 64 + $continuation - 128 ;
	472	+ }
	473	+
	474	+ my $tag = "\&\#" . $code . ";" ;
	475	+
	476	+ return ($tag) ;
	477	+}
	478	+
	479	+
	480	+
	481	+
	482	+BEGIN
	483	+{
	484	+}
	485	+
	486	+# optionally print program meta data when program ends
	487	+END # Runs at program exit: optional run details (guarded by the $trace_on_exit... flags), timings collected by code_complete, closing rule.
	488	+{
	489	+# if ($os_windows)
	490	+# { use Win32 ; }
	491	+
	492	+ my ($time, $path,$program) ;
	493	+
	494	+ if ($trace_on_exit \|\| $trace_on_exit_verbose \|\| $trace_on_exit_concise)
	495	+ {
	496	+ $time_elapsed_total = time - $^T ; # $^T is program start time
	497	+ ($app_end_user,$app_end_system) = times ;
	498	+
	499	+ $time_active_user_processes = $app_end_user - $app_start_user ;
	500	+ $time_active_system_processes = $app_end_system - $app_start_system ;
	501	+ $time_active_total = $time_active_user_processes + $time_active_system_processes ;
	502	+
	503	+ # print "\n" . '=' x (length ($msg) -1) . "\n\n$msg\n\n" ;
	504	+ print "\n" . '=' x 80 . "\n\n$msg\n\n" ; # NOTE(review): $msg is whatever was last stored globally (say and abort set it); the "Ready in" text is built only at the end of this block and is not printed here - confirm intent
	505	+ }
	506	+
	507	+ if ($trace_on_exit \|\| $trace_on_exit_verbose \|\| $trace_on_exit_concise)
	508	+ {
	509	+ print "Prog: $name_program\n" ;
	510	+ print "Path: $path_program\n" ;
	511	+ if ($job_runs_on_production_server)
	512	+ { print "Host: $hostname (production)\n\n" ; }
	513	+ else
	514	+ { print "Host: $hostname (test run)\n\n" ; }
	515	+ print "Args:\n\n", map {" $_\n"} @ARGV_BAK ; # @ARGV_BAK is filled by default_argv
	516	+ # print "Host: $hostname\n" ;
	517	+ print "OS: $os\n" ;
	518	+ print "Perl: " . ($a = sprintf ("%.9f",$^V), $a =~ s/\_/_/g,$a) . "\n" ; # perl version
	519	+ print "Perl: $^X\n" ; # perl exe path
	520	+ print "EzLib: $ez_lib_version\n" ; # version of this library
	521	+ }
	522	+
	523	+ if ($trace_on_exit \|\| $trace_on_exit_verbose \|\| $trace_on_exit_libs)
	524	+ {
	525	+ # Get library paths
	526	+ print "\nLibs:\n", map {" $_\n"} @INC ;
	527	+
	528	+ $cwd = cwd () ;
	529	+ foreach (grep {$_ =~ /home\|wiki/i} values %INC) # own modules
	530	+ # foreach (values %INC) # all modules
	531	+ {
	532	+ $file = $_ ;
	533	+ if ($file !~ /[\\\/]/) # no directory part: treat as relative to the current directory
	534	+ { $file = "$cwd/$file" ; }
	535	+ $time = file_time ($file) ;
	536	+ # $file = Win32::GetLongPathName ($_) if $os_windows ;
	537	+ push @own_modules, "$time\|$file" ;
	538	+ }
	539	+
	540	+ @own_modules = sort {$b <=> $a} @own_modules ; # numeric compare uses the leading time of each entry: newest first
	541	+ print "\nOwn modules (d/m/y h:m):\n" ;
	542	+ foreach (@own_modules)
	543	+ {
	544	+ ($time,$path) = split '\\|', $_ ;
	545	+ my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time);
	546	+ print sprintf (" %2d/%02d/%4d %2d:%02d %s\n", $mday,$mon+1,$year+1900,$hour,$min, $path) ;
	547	+ }
	548	+ print "\n\n" ;
	549	+ }
	550	+
	551	+ $lines = 0 ;
	552	+ foreach $key (sort keys %code_passes) # one line per label recorded through code_complete
	553	+ {
	554	+ if ($lines++ == 0)
	555	+ { print "Executing times:\n" ; }
	556	+ print sprintf (" %-26s","$key:") . ddhhmmss($code_time_spent{$key},'%3d min, %2d sec') . " / " .
	557	+ sprintf ("%10d",$code_passes{$key}) . " calls = " . sprintf ("%6f", divide_if_allowed ($code_time_spent{$key}, $code_passes {$key})) . " sec/pass\n" ;
	558	+ }
	559	+ print "\n" ;
	560	+
	561	+# $locale = setlocale(LC_LANGUAGE, $old_locale);
	562	+# print "locale LC_LANGUAGE back to $locale\n" ;
	563	+
	564	+ if ($time_elapsed_total < 5) # $time_elapsed_total is only set above when one of the trace flags is on
	565	+ { $msg = "Ready in " . ddhhmmss ($time_elapsed_total) . "\n" ; }
	566	+ else
	567	+ {
	568	+ $perc_active_user_processes = sprintf ("%4.1f", 100 *$time_active_user_processes/$time_elapsed_total) ;
	569	+ $perc_active_system_processes = sprintf ("%4.1f", 100 *$time_active_system_processes/$time_elapsed_total) ;
	570	+ $perc_active_total = sprintf ("%4.1f", 100 *$time_active_total/$time_elapsed_total) ;
	571	+ $msg = "Ready in " . ddhhmmss ($time_elapsed_total) . "\n\nTime spent:\n" .
	572	+ "User: $perc_active_user_processes\% (" . ddhhmmss ($time_active_user_processes) . ")\n" .
	573	+ "System: $perc_active_system_processes\% (" . ddhhmmss ($time_active_system_processes) . ")\n" .
	574	+ "Total: $perc_active_total\% (" . ddhhmmss ($time_active_total) . ")\n" ;
	575	+ }
	576	+
	577	+ print "\n\n" . '=' x 80 . "\n" . '=' x 80 . "\n\n" ;
	578	+}
	579	+
	580	+sub trace
	581	+{
	582	+ # Prints an empty line, then the current local time (hh:mm:ss) followed by the given function name.
	583	+ my ($function_name) = @_ ;
	584	+
	585	+ my @now = localtime (time) ;
	586	+ my $stamp = join (':', map { sprintf ("%02d", $_) } @now[2,1,0]) ;
	587	+ print "\n" . $stamp . " " . $function_name . "\n" ;
	588	+}
	589	+
	590	+# only when perl compiled with malloc
	591	+# use Devel::Peek ;
	592	+# $ENV {PERL_DEBUG_MSTATS} = 2;
	593	+# mstat() ;
	594	+
	595	+1 ;
Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
—	—	@@ -1,5 +1,10 @@
2	2	#!/usr/bin/perl
3	3
	4	+# /usr/local/bin/geoiplogtag uses /usr/share/GeoIP/GeoIP.dat
	5	+# test:
	6	+# echo 125.123.123.123 \| /usr/local/bin/geoiplogtag 1
	7	+# refresh: bayes:/usr/share/GeoIP> wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
	8	+
4	9	sub CollectFilesToProcess
5	10	{
6	11	trace CollectFilesToProcess ;
—	—	@@ -161,7 +166,7 @@
162	167	if ($job_runs_on_production_server)
163	168	{
164	169	if ($file_in =~ /\.gz$/o)
165		~~- { open IN, "-\|", "gzip -dc $file_in \| /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html~~
	170	+ { open IN, "-\|", "gzip -dc $file_in \| sed s/\\ \\ */\\ /g \| /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
166	171	else
167	172	{ open IN, "-\|", "cat $file_in \| /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
168	173	$fields_expected = 14 ;
—	—	@@ -184,10 +189,36 @@
185	190	# print $line ;
186	191	# }
187	192
	193	+
	194	+# ugly Q&D code to circumvent spaces in agent string
	195	+# $line2 = $line ;
	196	+ chomp $line ;
188	197	@fields = split (' ', $line) ;
189		~~- if ($#fields < $fields_expected) { $fields_too_few ++ ; next ; }~~
190		~~- if ($#fields > $fields_expected) { $fields_too_many ++ ; next ; }~~
	198	+# next if $line =~ /upload/ ;
	199	+# next if $line !~ /en\.m\.wikipedia/ ;
	200	+# next if $fields[10] eq '-' ;
	201	+# print "mime " . $fields[10] . "\n" ;
	202	+#next if $fields [9] eq '-' ;
	203	+#next if $fields [9] =~ /NONE/ ;
	204	+ if ($#fields > 14)
	205	+ {
	206	+# print "line $line2\n" ;
	207	+# print "fields " . $#fields . "\n$line\n" ;
	208	+ $country_code = $fields [$#fields] ;
	209	+ $fields [$#fields] = '' ;
	210	+ $line = join (' ', @fields) ;
	211	+# print "2 $line\n" ;
	212	+ @fields = split (' ', $line, 14) ;
	213	+ $fields [14] = $country_code ;
	214	+# print "\n\n12: " . $fields [12] . "\n" ;
	215	+# print "13: " . $fields [13] . "\n" ;
	216	+# print "14: " . $fields [14] . "\n" ;
	217	+# print "15: " . $fields [15] . "\n" ;
	218	+ }
191	219
	220	+ if ($#fields < $fields_expected) { $fields_too_few ++ ; print "invalid field count " . $#fields . "\n" ; next ; }
	221	+ if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid field count " . $#fields . "\n" ; next ; }
	222	+
192	223	$time = $fields [2] ;
193	224
194	225	if (($oldest_time_read eq "") \|\| ($time lt $oldest_time_read))
Index: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
—	—	@@ -0,0 +1,922 @@
	2	+ #!/usr/bin/perl
	3	+
	4	+ use lib "/home/ezachte/lib" ;
	5	+ use EzLib ;
	6	+
	7	+sub WriteOutputIpFrequencies # Writes how often ip addresses occur (%ip_frequencies) to $file_ip_frequencies in directory $path_out.
	8	+{
	9	+ trace WriteOutputIpFrequencies ;
	10	+
	11	+ my $path_out = shift ; # output directory; becomes the current directory
	12	+ print "\ncd $path_out\n\n" ;
	13	+ chdir ($path_out) ;
	14	+
	15	+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ; # NOTE(review): set but not written anywhere in this sub
	16	+
	17	+ open CSV_MULTIPLE_ADDRESSES, '>', $file_ip_frequencies ; # NOTE(review): result of open is not checked
	18	+ print "# html pages found: $html_pages_found\n" ;
	19	+ print CSV_MULTIPLE_ADDRESSES "# html pages found: $html_pages_found\n" ;
	20	+ print CSV_MULTIPLE_ADDRESSES "#\n" ;
	21	+
	22	+ foreach $address (keys %ip_frequencies)
	23	+ {
	24	+ $ip_distribution {$ip_frequencies {$address}} ++ ; # frequency -> number of addresses with that frequency
	25	+ }
	26	+
	27	+ $ip_distribution_ge_2 = 0 ;
	28	+ $ip_distribution_ge_3 = 0 ;
	29	+ $ip_distribution_ge_4 = 0 ;
	30	+ $ip_distribution_ge_5 = 0 ;
	31	+ $ip_distribution_ge_10 = 0 ;
	32	+ $ip_distribution_ge_20 = 0 ;
	33	+ $ip_distribution_ge_50 = 0 ;
	34	+ $ip_distribution_ge_100 = 0 ;
	35	+ $ip_distribution_ge_250 = 0 ;
	36	+ $ip_distribution_ge_1000 = 0 ;
	37	+ $ip_distribution_ge_2500 = 0 ;
	38	+ $ip_distribution_ge_10000 = 0 ;
	39	+
	40	+ foreach $frequency (sort {$a <=> $b} keys %ip_distribution)
	41	+ {
	42	+ $metafreq = $ip_distribution {$frequency} ;
	43	+ if ($frequency >= 2) { $ip_distribution_ge_2 += $metafreq ; } # cumulative counts: addresses that occur at least N times
	44	+ if ($frequency >= 3) { $ip_distribution_ge_3 += $metafreq ; }
	45	+ if ($frequency >= 4) { $ip_distribution_ge_4 += $metafreq ; }
	46	+ if ($frequency >= 5) { $ip_distribution_ge_5 += $metafreq ; }
	47	+ if ($frequency >= 10) { $ip_distribution_ge_10 += $metafreq ; }
	48	+ if ($frequency >= 20) { $ip_distribution_ge_20 += $metafreq ; }
	49	+ if ($frequency >= 50) { $ip_distribution_ge_50 += $metafreq ; }
	50	+ if ($frequency >= 100) { $ip_distribution_ge_100 += $metafreq ; }
	51	+ if ($frequency >= 250) { $ip_distribution_ge_250 += $metafreq ; }
	52	+ if ($frequency >= 1000) { $ip_distribution_ge_1000 += $metafreq ; }
	53	+ if ($frequency >= 2500) { $ip_distribution_ge_2500 += $metafreq ; }
	54	+ if ($frequency >= 10000) { $ip_distribution_ge_10000 += $metafreq ; }
	55	+ if ($frequency > 20) { next ; } # only frequencies up to 20 get a line of their own
	56	+ print "# $metafreq addresses occur $frequency times\n" ;
	57	+ print CSV_MULTIPLE_ADDRESSES "# $metafreq addresses occur $frequency times\n" ;
	58	+ }
	59	+
	60	+ print CSV_MULTIPLE_ADDRESSES "#\n" ;
	61	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_2 addresses occur 2+ times\n" ;
	62	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_3 addresses occur 3+ times\n" ;
	63	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_4 addresses occur 4+ times\n" ;
	64	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_5 addresses occur 5+ times\n" ;
	65	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_10 addresses occur 10+ times\n" ;
	66	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_20 addresses occur 20+ times\n" ;
	67	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_50 addresses occur 50+ times\n" ;
	68	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_100 addresses occur 100+ times\n" ;
	69	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_250 addresses occur 250+ times\n" ;
	70	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_1000 addresses occur 1000+ times\n" ;
	71	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_2500 addresses occur 2500+ times\n" ;
	72	+ print CSV_MULTIPLE_ADDRESSES "# $ip_distribution_ge_10000 addresses occur 10000+ times\n" ;
	73	+
	74	+ foreach $address (sort {$ip_frequencies {$b} <=> $ip_frequencies {$a}} keys %ip_frequencies) # most frequent address first
	75	+ {
	76	+ $frequency = $ip_frequencies {$address} ;
	77	+ # print "$freq,$address\n" ;
	78	+ if ($frequency > 1) # addresses seen once are not listed
	79	+ { print CSV_MULTIPLE_ADDRESSES "$frequency,$address\n" ; }
	80	+ }
	81	+
	82	+ close CSV_MULTIPLE_ADDRESSES ;
	83	+
	84	+ if ($job_runs_on_production_server)
	85	+ {
	86	+ $cmd = "bzip2 -f $file_ip_frequencies" ; # on the production server the output file is run through bzip2 -f
	87	+ print "\ncmd = '$cmd'\n" ;
	88	+ `$cmd` ;
	89	+ }
	90	+}
	91	+
	92	+sub WriteOutputSquidSequenceGaps # Writes event counts and sequence number deltas per squid and hour, and per hour for all squids, as csv in $path_out.
	93	+{
	94	+ trace WriteOutputSquidSequenceGaps ;
	95	+
	96	+ my $path_out = shift ; # output directory; becomes the current directory
	97	+ print "\ncd $path_out\n\n" ;
	98	+ chdir ($path_out) ;
	99	+
	100	+ my ($tot_events_all_day, $tot_delta_all_day, %all_squids_events, %all_squids_delta, %squids, $tot_squids) ;
	101	+
	102	+ $yyyy = substr ($time_to_start,0,4) ; # date parts taken by position from $time_to_start
	103	+ $mm = substr ($time_to_start,5,2) ;
	104	+ $dd = substr ($time_to_start,8,2) ;
	105	+ $date = substr ($time_to_start,0,10) ;
	106	+ $date_excel = "\"=DATE($yyyy,$mm,$dd)\"" ;
	107	+
	108	+ open CSV_SEQNO_PER_SQUIDHOUR, '>', $file_seqno_per_squidhour ; # NOTE(review): none of the open calls in this sub check their result
	109	+ print CSV_SEQNO_PER_SQUIDHOUR "squid,hour,events,tot delta,avg delta\n" ;
	110	+
	111	+ $squid_hour = 0 ;
	112	+ foreach $squid_hour (sort keys %squid_events)
	113	+ {
	114	+ $events = $squid_events {$squid_hour} ;
	115	+ next if $events == 0 ; # also avoids dividing by zero below
	116	+
	117	+ $delta = $squid_delta {$squid_hour} ;
	118	+ $avg_delta = sprintf ("%.0f", $delta / $events) ;
	119	+
	120	+ print CSV_SEQNO_PER_SQUIDHOUR "$squid_hour,$events,$delta,$avg_delta\n" ;
	121	+ print "$squid_hour,$events,$delta,$avg_delta\n" ;
	122	+
	123	+ $tot_events_all_day += $events ;
	124	+ $tot_delta_all_day += $delta ;
	125	+ ($squid,$hour) = split (',', $squid_hour) ; # keys of %squid_events are split on ',' into squid and hour
	126	+ $squids {$squid} ++ ;
	127	+
	128	+ $all_squids_events {$hour} += $events ;
	129	+ $all_squids_delta {$hour} += $delta ;
	130	+ }
	131	+
	132	+ foreach $squid (keys %squids)
	133	+ { $tot_squids++ ; } # number of distinct squids that had events
	134	+
	135	+
	136	+ if ($tot_events_all_day > 0)
	137	+ {
	138	+ $avg_delta_all_day = sprintf ("%.0f", $tot_delta_all_day / $tot_events_all_day) ;
	139	+ $tot_events_all_day_corrected = sprintf ("%.0f", ($avg_delta_all_day / 1000) * $tot_events_all_day) ;
	140	+
	141	+ print CSV_SEQNO_PER_SQUIDHOUR "# Squids: $tot_squids Events: $tot_events_all_day Avg delta: $avg_delta_all_day\n\n" ;
	142	+ print "\nSquids: $tot_squids\nEvents: $tot_events_all_day\nAvg delta: $avg_delta_all_day\n\n" ;
	143	+ }
	144	+ else
	145	+ {
	146	+ print CSV_SEQNO_PER_SQUIDHOUR "# Squids: $tot_squids Events: 0\n\n" ;
	147	+ print "\nSquids: $tot_squids\nEvents: 0\n\n" ;
	148	+ }
	149	+ close CSV_SEQNO_PER_SQUIDHOUR ;
	150	+
	151	+ # now same thing for all squids combined, hourly
	152	+
	153	+ undef @csv ;
	154	+
	155	+ open CSV_SEQNO_ALL_SQUIDS_DAY, '>', $file_seqno_all_squids ;
	156	+ print CSV_SEQNO_ALL_SQUIDS_DAY "date,time,events,avg delta seqno\n" ;
	157	+
	158	+ open CSV_SEQNO_ALL_SQUIDS_MONTH, '<', "../$file_seqno_all_squids" ; # existing file one directory up is read first
	159	+ while ($line = <CSV_SEQNO_ALL_SQUIDS_MONTH>)
	160	+ {
	161	+ next if $line =~ /^$date/ ; # drop lines for this date, they are written again below
	162	+ next if $line =~ /^date/ ; # drop the header line
	163	+ push @csv, $line ;
	164	+ }
	165	+ close CSV_SEQNO_ALL_SQUIDS_MONTH ;
	166	+
	167	+ open CSV_SEQNO_ALL_SQUIDS_MONTH, '>', "../$file_seqno_all_squids" ; # then rewritten: header, kept lines sorted, lines for this date
	168	+ print CSV_SEQNO_ALL_SQUIDS_MONTH "date,time,events (x 1000),avg delta seqno,date excel,events corrected (x 1000)\n" ;
	169	+ foreach $line (sort @csv)
	170	+ { print CSV_SEQNO_ALL_SQUIDS_MONTH $line ; }
	171	+
	172	+ $hour = '' ;
	173	+ foreach $hour (sort keys %all_squids_events)
	174	+ {
	175	+ $avg_delta = 0 ;
	176	+ $events = $all_squids_events {$hour} ;
	177	+ $delta = $all_squids_delta {$hour} ;
	178	+ if ($events > 0)
	179	+ { $avg_delta = sprintf ("%.0f", $delta / $events) ; }
	180	+
	181	+ print CSV_SEQNO_ALL_SQUIDS_DAY "$date,$hour,$events,$avg_delta\n" ;
	182	+ print CSV_SEQNO_ALL_SQUIDS_MONTH "$date,$hour,$events,$avg_delta\n" ;
	183	+ print "$date,$hour,$events,$avg_delta\n" ;
	184	+ }
	185	+
	186	+ print CSV_SEQNO_ALL_SQUIDS_MONTH "$date,*,$tot_events_all_day,$avg_delta_all_day,$date_excel,$tot_events_all_day_corrected\n" ; # NOTE(review): without events, $avg_delta_all_day and $tot_events_all_day_corrected are not assigned in this sub
	187	+ print "$date,*,$tot_events_all_day,$avg_delta_all_day,$tot_events_all_day_corrected\n" ;
	188	+
	189	+ close CSV_SEQNO_ALL_SQUIDS_DAY ;
	190	+ close CSV_SEQNO_ALL_SQUIDS_MONTH ;
	191	+}
	192	+
	193	+sub WriteOutputSquidLogs
	194	+{
	195	+ trace WriteOutputSquidLogs ;
	196	+
	197	+ my $path_out = shift ;
	198	+ print "\ncd $path_out\n\n" ;
	199	+ chdir ($path_out) ;
	200	+
	201	+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
	202	+
	203	+ open CSV_METHODS, '>', $file_csv_methods ;
	204	+ open CSV_SKINS, '>', $file_csv_skins ;
	205	+ open CSV_SCRIPTS, '>', $file_csv_scripts ;
	206	+ open CSV_IMAGES, '>', $file_csv_images ;
	207	+ open CSV_BANNERS, '>', $file_csv_banners ;
	208	+ open CSV_BINARIES, '>', $file_csv_binaries ;
	209	+ open CSV_EXTENSIONS, '>', $file_csv_extensions ;
	210	+ open CSV_REQUESTS, '>', $file_csv_requests ;
	211	+ open CSV_REQUESTS_WAP, '>', $file_csv_requests_wap ;
	212	+ open CSV_REQUESTS_M, '>', $file_csv_requests_m ;
	213	+ open CSV_ORIGINS, '>', $file_csv_origins ;
	214	+ open CSV_SEARCH, '>', $file_csv_search ;
	215	+ open CSV_BOTS, '>', $file_csv_bots ;
	216	+ open CSV_GOOGLEBOTS, '>', $file_csv_googlebots ;
	217	+ open CSV_OPSYS, '>', $file_csv_opsys ;
	218	+ open CSV_CLIENTS, '>', $file_csv_clients ;
	219	+ open CSV_LANGUAGES, '>', $file_csv_languages ;
	220	+ open CSV_COUNTRIES_VIEWS, '>', $file_csv_countries_views ;
	221	+ open CSV_COUNTRIES_SAVES, '>', $file_csv_countries_saves ;
	222	+ open CSV_COUNTRIESTIMED, '>', $file_csv_countries_timed ;
	223	+ open OUT_REFERERS, '>', $file_out_referers ;
	224	+ open CSV_CLIENTS_BY_WIKI, '>', $file_csv_clients_by_wiki ;
	225	+ open CSV_AGENTS, '>', $file_csv_agents ;
	226	+
	227	+ print CSV_METHODS $comment ;
	228	+ print CSV_SKINS $comment ;
	229	+ print CSV_SCRIPTS $comment ;
	230	+ print CSV_IMAGES $comment ;
	231	+ print CSV_BANNERS $comment ;
	232	+ print CSV_BINARIES $comment ;
	233	+ print CSV_EXTENSIONS $comment ;
	234	+ print CSV_REQUESTS $comment ;
	235	+ print CSV_REQUESTS_WAP $comment ;
	236	+ print CSV_REQUESTS_M $comment ;
	237	+ print CSV_ORIGINS $comment ;
	238	+ print CSV_SEARCH $comment ;
	239	+ print CSV_BOTS $comment ;
	240	+ print CSV_GOOGLEBOTS $comment ;
	241	+ print CSV_OPSYS $comment . "# mobile: $tags_mobile ($tags_mobile_upd)\n" .
	242	+ "# pos 1: - = non mobile, M = mobile ('-'+'M'=100%), G = aggregated Group\n" ;
	243	+ print CSV_CLIENTS $comment ;
	244	+ print CSV_LANGUAGES $comment ;
	245	+ print CSV_COUNTRIES_VIEWS $comment ;
	246	+ print CSV_COUNTRIES_SAVES $comment ;
	247	+ print CSV_COUNTRIESTIMED $comment ;
	248	+ print OUT_REFERERS $comment ;
	249	+ print CSV_CLIENTS_BY_WIKI $comment ;
	250	+ print CSV_AGENTS $comment ;
	251	+
	252	+ # CSV_METHODS
	253	+ print OUT "\nMETHODS:\n\n" ;
	254	+ print "\nMethods:\n\n" ;
	255	+ $method_all = 0 ;
	256	+ foreach $key (sort keys %statusses)
	257	+ {
	258	+ if ($key =~ /:total/)
	259	+ {
	260	+ $total = $statusses {$key} ;
	261	+ $method_all += $total ;
	262	+ ($method = $key) =~ s/:.*$// ;
	263	+ print OUT sprintf ("%-8s", "$method:") . sprintf ("%6d",$total) . "\n" ;
	264	+ print sprintf ("%-8s", "$method:") . sprintf ("%6d",$total) . "\n" ;
	265	+ }
	266	+ }
	267	+ print OUT "TOTAL: " . sprintf ("%6d",$method_all) . "\n" ;
	268	+ print "TOTAL: " . sprintf ("%6d",$method_all) . "\n" ;
	269	+
	270	+ print CSV_METHODS ":method,status,count\n" ;
	271	+ foreach $key (sort keys %statusses)
	272	+ {
	273	+ if ($key =~ /:total/)
	274	+ {
	275	+ $total = $statusses {$key} ;
	276	+ ($method = $key) =~ s/:.*$// ;
	277	+ }
	278	+ else
	279	+ {
	280	+ $total = $statusses {$key} ;
	281	+
	282	+ print OUT sprintf ("%6d",$total) . " : " . $key . "\n" ;
	283	+ $key2 = $key ;
	284	+ $key2 =~ s/,/,/g ;
	285	+ $key2 =~ s/\:/,/g ;
	286	+ print CSV_METHODS "$key2,$total\n" ;
	287	+ }
	288	+ }
	289	+
	290	+ # CSV_SKINS
	291	+ print OUT "\nSKINS:\n\n" ;
	292	+ print CSV_SKINS ":scripts,parameters,count\n" ;
	293	+ $total_skins = 0 ;
	294	+ foreach $key (sort keys %skins)
	295	+ {
	296	+ $total = $skins{$key} ;
	297	+ $total_skins += $total ;
	298	+ print OUT sprintf ("%5d",$total) . " : " . $key . "\n" ;
	299	+ print CSV_SKINS "$key,$total\n" ;
	300	+ }
	301	+ print OUT sprintf ("%5d",$total_skins) . " : total\n" ;
	302	+
	303	+ # CSV_SCRIPTS
	304	+ print OUT "\nSCRIPTS:\n\n" ;
	305	+ print CSV_SCRIPTS ":scripts,parameters,count\n" ;
	306	+ foreach $key (sort keys %scripts)
	307	+ {
	308	+ print OUT sprintf ("%5d",$scripts{$key}) . " : " . $key . "\n" ;
	309	+ print CSV_SCRIPTS "$key,${scripts{$key}}\n" ;
	310	+ }
	311	+
	312	+ print OUT "\nSCRIPTS NO FURTHER PROCESSED:\n\n" ;
	313	+ foreach $key (sort keys %scripts_no_further_processing)
	314	+ {
	315	+ print OUT sprintf ("%5d",$scripts_no_further_processing{$key}) . " : " . $key . "\n" ;
	316	+ }
	317	+
	318	+ # CSV_IMAGES
	319	+ print OUT "\nIMAGE SIZES:\n\n" ;
	320	+ print CSV_IMAGES ":size range,count\n" ;
	321	+ foreach $range (sort keys %imagesizes)
	322	+ {
	323	+ ($range2 = $range) =~ s/ //g ;
	324	+ $count = $imagesizes {$range} ;
	325	+ print OUT sprintf ("%5d",$count) . " : $range\n" ;
	326	+ print CSV_IMAGES "$range2,$count\n" ;
	327	+ }
	328	+
	329	+ # CSV_BANNERS
	330	+ print OUT "\nBANNERS:\n\n" ;
	331	+ print CSV_BANNERS ":country,url\n" ;
	332	+ foreach $key (sort {$banners {$b} <=> $banners {$a}} keys %banners)
	333	+ {
	334	+ print OUT sprintf ("%5d",$banners{$key}) . " : " . $key . "\n" ;
	335	+ print CSV_BANNERS "$key,${banners{$key}}\n" ;
	336	+ }
	337	+
	338	+ # CSV_BINARIES
	339	+ print OUT "\nBINARIES:\n\n" ;
	340	+ print CSV_BINARIES ":file,count\n" ;
	341	+ $cnt_binaries = 0 ;
	342	+ foreach $key (sort {$binaries {$b} <=> $binaries {$a}} keys %binaries)
	343	+ {
	344	+ if (++$cnt_binaries <= 500)
	345	+ { print OUT sprintf ("%5d",$binaries{$key}) . " : " . $key . "\n" ; }
	346	+
	347	+ print CSV_BINARIES "$key,${binaries{$key}}\n" ;
	348	+ }
	349	+ # print OUT "\nImages:\n\n" ;
	350	+ # print CSV_IMAGES ":project,referer,ext,mime,parms,count\n" ;
	351	+
	352	+ foreach $key (sort keys %images_xref)
	353	+ {
	354	+ print OUT sprintf ("%5d",$images_xref{$key}) . " : " . $key . "\n" ;
	355	+ # $key2 = $key ;
	356	+ # $key2 =~ s/,/,/g ;
	357	+ # $key2 =~ s/\\|/,/g ;
	358	+ # push @csv, "$key2,${requests{$key}}" ;
	359	+ }
	360	+ #@csv =sort @csv ;
	361	+ #foreach $line (@csv)
	362	+ #{ print CSV_REQUESTS "$line\n" ; }
	363	+
	364	+ # CSV_EXTENSIONS
	365	+ print OUT "\nEXTENSIONS:\n\n" ;
	366	+ print "\nExtensions:\n\n" ;
	367	+ print CSV_EXTENSIONS ":extension,count\n" ;
	368	+ $total = 0 ;
	369	+ foreach $key (sort {$exts {$b} <=> $exts {$a}} keys %exts)
	370	+ {
	371	+ $count = $exts {$key} ;
	372	+ $total += $count ;
	373	+ print OUT sprintf ("%6d",$count) . " : $key\n" ;
	374	+ print sprintf ("%6d",$count) . " : $key\n" ;
	375	+ print CSV_EXTENSIONS "$key,$count\n" ;
	376	+ }
	377	+ print OUT sprintf ("%6d",$total) . " : total\n" ;
	378	+ print sprintf ("%6d",$total) . " : total\n" ;
	379	+
	380	+ # CSV_REQUESTS
	381	+ undef @csv ;
	382	+ print OUT "\nREQUESTS:\n\n" ;
	383	+ print CSV_REQUESTS $legend ;
	384	+ print CSV_REQUESTS ":project,referer,ext,mime,parms,count\n" ;
	385	+ foreach $key (sort keys %requests)
	386	+ {
	387	+ print OUT sprintf ("%5d",$requests{$key}) . " : " . $key . "\n" ;
	388	+ $key2 = $key ;
	389	+ $key2 =~ s/,/,/g ;
	390	+ $key2 =~ s/\\|/,/g ;
	391	+ push @csv, "$key2,${requests{$key}}" ;
	392	+ }
	393	+ @csv = sort @csv ;
	394	+ foreach $line (@csv)
	395	+ { print CSV_REQUESTS "$line\n" ; }
	396	+
	397	+ # CSV_REQUESTS_WAP
	398	+ undef @csv ;
	399	+ print OUT "\nREQUESTS_WAP:\n\n" ;
	400	+ print CSV_REQUESTS_WAP $legend ;
	401	+ print CSV_REQUESTS_WAP ":project,ext,mime,parms,country,count\n" ;
	402	+ foreach $key (sort keys %requests_wap)
	403	+ {
	404	+ print OUT sprintf ("%5d",$requests_wap{$key}) . " : " . $key . "\n" ;
	405	+ $key2 = $key ;
	406	+ $key2 =~ s/,/,/g ;
	407	+ $key2 =~ s/\\|/,/g ;
	408	+ push @csv, "$key2,${requests_wap{$key}}" ;
	409	+ }
	410	+ @csv = sort @csv ;
	411	+ foreach $line (@csv)
	412	+ { print CSV_REQUESTS_WAP "$line\n" ; }
	413	+
	414	+ # CSV_REQUESTS_M
	415	+ undef @csv ;
	416	+ print OUT "\nREQUESTS_M:\n\n" ;
	417	+ print CSV_REQUESTS_M $legend ;
	418	+ print CSV_REQUESTS_M ":project,ext,mime,parms,country,count\n" ;
	419	+ foreach $key (sort keys %requests_m)
	420	+ {
	421	+ print OUT sprintf ("%5d",$requests_m{$key}) . " : " . $key . "\n" ;
	422	+ $key2 = $key ;
	423	+ $key2 =~ s/,/,/g ;
	424	+ $key2 =~ s/\\|/,/g ;
	425	+ push @csv, "$key2,${requests_m{$key}}" ;
	426	+ }
	427	+ @csv = sort @csv ;
	428	+ foreach $line (@csv)
	429	+ { print CSV_REQUESTS_M "$line\n" ; }
	430	+
	431	+ # CSV_BOTS
	432	+ foreach $key (sort {$bots {$b} <=> $bots {$a}} keys %bots)
	433	+ { print CSV_BOTS $bots{$key} . ",$key\n" ; }
	434	+
	435	+ # CSV_GOOGLEBOTS
	436	+ print CSV_GOOGLEBOTS "# Hits for googlebot from Google ip address\n" ;
	437	+ print CSV_GOOGLEBOTS ":date,:ip range,:hits\n" ;
	438	+ foreach $key (sort {$a cmp $b} keys %google_bot_hits)
	439	+ {
	440	+ my $year = substr ($key,0,4) ;
	441	+ my $mon = substr ($key,5,2) ;
	442	+ my $mday = substr ($key,8,2) ;
	443	+ my $hour = substr ($key,11,2) ;
	444	+ my $date = "$year/$mon/$mday $hour:00:00" ;
	445	+ my $iprange = $key ;
	446	+ $iprange =~ s/^[^,]*,// ;
	447	+
	448	+ print CSV_GOOGLEBOTS "$date,$iprange,${google_bot_hits{$key}}\n" ;
	449	+ }
	450	+
	451	+ #print OUT "\nUrls:\n" ;
	452	+ #foreach $key (sort keys %urls)
	453	+ #{ print OUT sprintf ("%5d",$urls{$key}) . " : " . $key . "\n" ; }
	454	+
	455	+ # OUT_INTERWIKI
	456	+ print OUT "\nINTERWIKI:\n\n" ;
	457	+ foreach $key (sort keys %interwiki)
	458	+ { print OUT sprintf ("%5d",$interwiki{$key}) . " : " . $key . "\n" ; }
	459	+
	460	+ print OUT "\nREFERER UPLOAD:\n\n" ;
	461	+ foreach $key (sort keys %referer_upload)
	462	+ { print OUT sprintf ("%5d",$referer_upload{$key}) . " : " . $key . "\n" ; }
	463	+
	464	+ # OUT_REFERERS
	465	+ print OUT_REFERERS $legend ;
	466	+ print OUT_REFERERS "referer,count\n" ;
	467	+
	468	+ print OUT_REFERERS "# internal\n" ;
	469	+ foreach $key (sort keys %referers_internal)
	470	+ { print OUT_REFERERS sprintf ("%5d",$referers_internal{$key}) . " : " . $key . "\n" ; }
	471	+
	472	+ print OUT_REFERERS "# external\n" ;
	473	+ foreach $key (sort {$origins_external {$b} <=> $origins_external {$a} } keys %origins_external)
	474	+ { print OUT_REFERERS sprintf ("%5d",$origins_external{$key}) . " : " . $key . "\n" ; }
	475	+
	476	+ print OUT_REFERERS "# unsimplified\n" ;
	477	+ foreach $key (sort keys %origins_unsimplified)
	478	+ { print OUT_REFERERS sprintf ("%5d",$origins_unsimplified{$key}) . " : " . $key . "\n" ; }
	479	+
	480	+ print OUT_REFERERS "# simplified\n" ;
	481	+ foreach $key (sort keys %origin_simplified)
	482	+ { print OUT_REFERERS sprintf ("%5d",$origin_simplified{$key}) . " : " . $key . "\n" ; }
	483	+
	484	+ print "\nLook alikes:\n\n" ;
	485	+ print OUT_REFERERS "# look alikes\n" ;
	486	+ foreach $key (sort {$wikis {$b} <=> $wikis {$a}} keys %wikis)
	487	+ {
	488	+ print OUT_REFERERS sprintf ("%5d",$wikis{$key}) . " : " . $key . "\n" ;
	489	+ print sprintf ("%5d",$wikis{$key}) . " : " . $key . "\n" ;
	490	+ }
	491	+
	492	+ # CSV_ORIGINS
	493	+ print OUT "\nORIGINS:\n\n" ;
	494	+ print CSV_ORIGINS ":toplevel,count\n" ;
	495	+ foreach $key (sort keys %origins)
	496	+ {
	497	+ print OUT sprintf ("%8d",$origins{$key}) . " : " . $key . "\n" ;
	498	+ print CSV_ORIGINS "$key,${origins{$key}}\n" ;
	499	+ }
	500	+
	501	+ # CSV_SEARCH
	502	+ print OUT "\nSEARCHES:\n" ;
	503	+ print CSV_SEARCH ":matches (ip range\|referer\|agent string),site,referer group,bot,agent match,mime group,top level domain,count\n" ;
	504	+ foreach $key (sort keys %search)
	505	+ {
	506	+ print OUT sprintf ("%8d",$search{$key}) . " : " . $key . "\n" ;
	507	+ print CSV_SEARCH "$key,${search{$key}}\n" ;
	508	+ }
	509	+
	510	+ # CSV_LANGUAGES
	511	+ print OUT "\nLANGUAGES:\n\n" ;
	512	+ print CSV_LANGUAGES ":browser,:language,:count\n" ;
	513	+ foreach $key (sort keys %languages)
	514	+ {
	515	+ print OUT sprintf ("%8d",$languages{$key}) . " : " . $key . "\n" ;
	516	+ print CSV_LANGUAGES "$key,${languages{$key}}\n" ;
	517	+ }
	518	+
	519	+ #print OUT "\nSources:\n\n" ;
	520	+ #foreach $key (sort keys %srcs)
	521	+ #{ print OUT sprintf ("%5d",$srcs{$key}) . " : " . $key . "\n" ; }
	522	+
	523	+ print OUT "\nGOOGLE BOTS:\n\n" ;
	524	+ foreach $key (sort keys %googlebots)
	525	+ { print OUT sprintf ("%5d",$googlebots{$key}) . " : " . $key . "\n" ; }
	526	+
	527	+ print OUT "\nGOOGLE BINS:\n\n" ;
	528	+ print "\nGoogle bins:\n\n" ;
	529	+ foreach $key (sort {$googlebins {$b} <=> $googlebins {$a}} keys %googlebins)
	530	+ {
	531	+ print OUT sprintf ("%5d",$googlebins{$key}) . " : " . $key . "\n" ;
	532	+ print sprintf ("%5d",$googlebins{$key}) . " : " . $key . "\n" ;
	533	+ }
	534	+
	535	+ print OUT "\nGOOGLE BINS 2:\n\n" ;
	536	+ print "\nGoogle bins 2:\n\n" ;
	537	+ foreach $key (sort {$googlebins2 {$b} <=> $googlebins2 {$a}} keys %googlebins2)
	538	+ {
	539	+ print OUT sprintf ("%5d",$googlebins2{$key}) . " : " . $key . "\n" ;
	540	+ print sprintf ("%5d",$googlebins2{$key}) . " : " . $key . "\n" ;
	541	+ }
	542	+
	543	+ print OUT "\nDOMAIN ERRORS:\n\n" ;
	544	+ foreach $key (sort { $domain_errors {$b} <=> $domain_errors {$a}} keys %domain_errors)
	545	+ { print OUT sprintf ("%5d",$domain_errors{$key}) . " : " . $key . "\n" ; }
	546	+
	547	+ print OUT "\nUNRECOGNIZED GOOGLE AGENTS:\n\n" ;
	548	+ foreach $key (sort { $googleagents {$b} <=> $googleagents {$a}} keys %googleagents)
	549	+ { print OUT sprintf ("%5d",$googleagents{$key}) . " : " . $key . "\n" ; }
	550	+
	551	+ print OUT "\nGOOGLE LOOK ALIKES:\n\n" ;
	552	+ foreach $key (sort { $google_imposters {$b} <=> $google_imposters {$a}} keys %google_imposters)
	553	+ { print OUT sprintf ("%5d",$google_imposters{$key}) . " : " . $key . "\n" ; }
	554	+
	555	+ print OUT "\nYAHOO BOTS:\n\n" ;
	556	+ foreach $key (sort keys %yahoobots)
	557	+ { print OUT sprintf ("%5d",$yahoobots{$key}) . " : " . $key . "\n" ; }
	558	+
	559	+ if ($count_hits_per_ip_range)
	560	+ {
	561	+ print OUT "\nIP ACTIVITY BY COUNT:\n\n" ;
	562	+ foreach $key (sort {$cnt_ip_ranges {$b} <=> $cnt_ip_ranges {$a}}keys %cnt_ip_ranges)
	563	+ {
	564	+ if ($cnt_ip_ranges {$key} >= 10)
	565	+ { print OUT sprintf ("%5d",$cnt_ip_ranges{$key}) . " : " . $key . "\n" ; }
	566	+ }
	567	+ }
	568	+
	569	+ print OUT "\nIP ACTIVITY BY ADDRESS:\n\n" ;
	570	+ foreach $key (sort keys %cnt_ip_ranges)
	571	+ {
	572	+ if ($cnt_ip_ranges {$key} >= 10)
	573	+ { print OUT sprintf ("%5d",$cnt_ip_ranges{$key}) . " : " . $key . "\n" ; }
	574	+ }
	575	+
	576	+ print OUT2 "\nOPERATING SYSTEMS:\n\n" ;
	577	+ print CSV_OPSYS ":rectype,opsys,count\n" ;
	578	+ $total_operating_systems = 0 ;
	579	+
	580	+ foreach $key (keys %operating_systems)
	581	+ { $total_operating_systems += $operating_systems{$key} ; }
	582	+
	583	+ print OUT2 "\nTOTAL_OPERATING_SYSTEMS: $total_operating_systems\n\n" ;
	584	+ foreach $key (sort keys %operating_systems)
	585	+ {
	586	+ my $count = $operating_systems {$key} ;
	587	+ my $count2 = sprintf ("%5d",$count) ;
	588	+ my $perc1 = sprintf ("%6.2f",(100*$count/$total_operating_systems)) . "%" ;
	589	+ my $perc2 = sprintf ("%.2f",(100*$count/$total_operating_systems)) . "%" ;
	590	+
	591	+ if ($count >= 1)
	592	+ { print OUT2 "$count2 = $perc1: $key \n" ; }
	593	+
	594	+ print CSV_OPSYS "$key,$count,$perc2\n" ;
	595	+ }
	596	+ print OUT2 "\nOPERATING SYSTEMS GROUPED:\n\n" ;
	597	+ $total_operating_systems_printed = 0 ;
	598	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "BlackBerry") ;
	599	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "DoCoMo") ;
	600	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "FreeBSD") ;
	601	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "iPad") ;
	602	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "iPhone") ;
	603	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Linux") ;
	604	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac") ;
	605	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "OpenBSD") ;
	606	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "SunOS") ;
	607	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "SymbianOS") ;
	608	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Windows") ;
	609	+ print OUT2 sprintf ("%6d",$total_operating_systems_printed) . "=" . sprintf ("%5.2f",(100*$total_operating_systems_printed/$total_operating_systems)) . "% : Total\n\n" ;
	610	+
	611	+ @LinuxVersions = split (',', 'Android,Xubuntu,Kubuntu,Ubuntu,Gentoo,PCLinuxOS,CentOS,Oracle,Mandriva,Red Hat,Mandriva,openSUSE,SUSE,Fedora,Epiphany,Mint,Mips,Arch,Debian,Slackware,Motor,Other') ;
	612	+
	613	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac Intel") ;
	614	+ &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Mac PowerPC") ;
	615	+
	616	+ foreach $LinuxVersion (@LinuxVersions)
	617	+ { &WriteOutputLineToCsvSharePerOs ($total_operating_systems, "Linux $LinuxVersion") ; }
	618	+
	619	+
	620	+ print OUT2 "\nCLIENTS:\n\n" ;
	621	+ print CSV_CLIENTS ":mobile,engine,client,mime-cat\n" ;
	622	+ $total_clients = 0 ;
	623	+ foreach $key (keys %clients)
	624	+ {
	625	+ ($mobile,$version,$mimecat) = split (',', $key) ;
	626	+ print OUT2 "\%CLIENTS: '$mobile','$version','$mimecat': " . $clients{$key} . "\n" ;
	627	+ $total_clients {$mimecat} += $clients{$key} ;
	628	+ $version =~ s/ .*$// ;
	629	+ $version =~ s/\/.*$// ;
	630	+ $version =~ s/,/,/g ;
	631	+ $group = "$mobile,$version,$mimecat" ;
	632	+ $grouped_clients {$group} += $clients{$key} ;
	633	+ }
	634	+ foreach $key (sort keys %clients)
	635	+ {
	636	+ ($mobile,$version,$mimecat) = split (',', $key) ;
	637	+ my $count = $clients {$key} ;
	638	+ my $count2 = sprintf ("%5d",$count) ;
	639	+ my $perc1 = sprintf ("%6.2f",(100*$count/$total_clients {$mimecat})) . "%" ;
	640	+ my $perc2 = sprintf ("%.2f" ,(100*$count/$total_clients {$mimecat})) . "%" ;
	641	+
	642	+ if ($clients {$key} >= 3)
	643	+ { print OUT2 "$count2 = $perc1: $key\n" ; }
	644	+
	645	+ print CSV_CLIENTS "$key,$count,$perc2\n" ;
	646	+ }
	647	+ foreach $key (sort keys %engines)
	648	+ {
	649	+ my $count = $engines {$key} ;
	650	+ print CSV_CLIENTS "E,$key,$count\n" ;
	651	+ }
	652	+ foreach $key (sort keys %grouped_clients)
	653	+ {
	654	+ ($group,$version,$mimecat) = split (',', $key) ;
	655	+ my $count = $grouped_clients {$key} ;
	656	+ my $perc2 = sprintf ("%.2f",(100*$count/$total_clients {$mimecat})) . "%" ;
	657	+ print CSV_CLIENTS "G,$key,$count,$perc2\n" ;
	658	+ }
	659	+
	660	+ print OUT2 "\nCLIENTS BY WIKI:\n\n" ;
	661	+ print CSV_CLIENTS_BY_WIKI ":mobile,client,mime-cat\n" ;
	662	+ $total_clients = 0 ;
	663	+ foreach $key (keys %clients_by_wiki)
	664	+ { $total_clients += $clients_by_wiki{$key} ; }
	665	+
	666	+ foreach $key (sort keys %clients_by_wiki)
	667	+ {
	668	+ my $count = $clients_by_wiki {$key} ;
	669	+ my $count2 = sprintf ("%5d",$count) ;
	670	+ my $perc1 = sprintf ("%6.2f",(100*$count/$total_clients)) . "%" ;
	671	+ my $perc2 = sprintf ("%.2f",(100*$count/$total_clients)) . "%" ;
	672	+ if ($clients_by_wiki {$key} >= 3)
	673	+ { print OUT2 "$count2 = $perc1: $key\n" ; }
	674	+ ($mobile,$version,$domain,$mimecat) = split (',', $key) ;
	675	+ $domain = ExpandAbbreviation ($domain) ;
	676	+ $domain =~ s/:/,/ ;
	677	+ $domain =~ s/\ /--/ ;
	678	+ print CSV_CLIENTS_BY_WIKI "$mobile,$version,$domain,$mimecat,$count,$perc2\n" ;
	679	+ }
	680	+
	681	+ foreach $key (sort keys %grouped_clients_by_wiki)
	682	+ {
	683	+ my $count = $grouped_clients_by_wiki {$key} ;
	684	+ my $perc2 = sprintf ("%.2f",(100*$count/$total_clients)) . "%" ;
	685	+ print CSV_CLIENTS_BY_WIKI "G,$key,$count,$perc2\n" ;
	686	+ }
	687	+
	688	+ print OUT2 "\nGOOGLEBOT NOT FROM GOOGLE\n\n" ;
	689	+ foreach $key (sort keys %ip_bot_no_google)
	690	+ {
	691	+ if ($ip_bot_no_google {$key} >= 3)
	692	+ { print OUT2 sprintf ("%5d",$ip_bot_no_google{$key}) . " : " . $key . "\n" ; }
	693	+ }
	694	+
	695	+ print OUT2 "\nMOBILE OTHER\n\n" ;
	696	+ foreach $key (sort keys %mobile_other)
	697	+ { print OUT2 sprintf ("%5d",$mobile_other{$key}) . " : " . $key . "\n" ; }
	698	+
	699	+ foreach $key (sort keys %countries_views)
	700	+ {
	701	+ my $count = $countries_views {$key} ;
	702	+ print CSV_COUNTRIES_VIEWS "$key,$count\n" ;
	703	+ }
	704	+
	705	+ foreach $key (sort keys %countries_saves)
	706	+ {
	707	+ my $count = $countries_saves {$key} ;
	708	+ print CSV_COUNTRIES_SAVES "$key,$count\n" ;
	709	+ print "$key,$count\n" ;
	710	+ }
	711	+
	712	+ foreach $key (sort keys %countries_timed)
	713	+ {
	714	+ my $count = $countries_timed {$key} ;
	715	+ print CSV_COUNTRIESTIMED "$key,$count\n" ;
	716	+ }
	717	+
	718	+ foreach $key (keys_sorted_by_value_num_desc %agents_raw)
	719	+ {
	720	+ my $count = $agents_raw {$key} ;
	721	+ $key =~ s/,/;/g ;
	722	+ next if $count < 5 ;
	723	+ print CSV_AGENTS "$key,$count\n" ;
	724	+ }
	725	+
	726	+ close CSV_METHODS ;
	727	+ close CSV_SKINS ;
	728	+ close CSV_SCRIPTS ;
	729	+ close CSV_IMAGES ;
	730	+ close CSV_BANNERS ;
	731	+ close CSV_BINARIES ;
	732	+ close CSV_EXTENSIONS ;
	733	+ close CSV_REQUESTS ;
	734	+ close CSV_ORIGINS ;
	735	+ close CSV_SEARCH ;
	736	+ close CSV_BOTS ;
	737	+ close CSV_GOOGLEBOTS ;
	738	+ close CSV_OPSYS ;
	739	+ close CSV_LANGUAGES ;
	740	+ close CSV_COUNTRIES_VIEWS ;
	741	+ close CSV_COUNTRIES_SAVES ;
	742	+ close CSV_COUNTRIESTIMED ;
	743	+ close CSV_CLIENTS ;
	744	+ close CSV_CLIENTS_BY_WIKI ;
	745	+ close OUT_REFERERS ;
	746	+ close CSV_AGENTS ;
	747	+}
	748	+
	749	+sub WriteOutputEditsSavesFile
	750	+{
	751	+ trace WriteOutputEditsSavesFile ;
	752	+
	753	+ my $path_out = shift ;
	754	+ print "\ncd $path_out\n\n" ;
	755	+ chdir ($path_out) ;
	756	+
	757	+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
	758	+
	759	+# $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
	760	+
	761	+ # only keep edits/submits for ip addresses which occur only once in this hash (stands for avg. 2000 hits)
	762	+ foreach $key (keys %client_ip_record_cnt)
	763	+ { $client_ip_record_cnt_total {$client_ip_record_cnt {$key}}++ ; }
	764	+
	765	+ print "\n\nEdit submit lines:\n" ;
	766	+ foreach $key (sort {$b <=> $a} keys %client_ip_record_cnt_total)
	767	+ {
	768	+ print sprintf ("%5d", $client_ip_record_cnt_total {$key}) . " ip address(es) occur $key times\n" ;
	769	+ $lines_edit_submit_total += $key * $client_ip_record_cnt_total {$key} ;
	770	+ }
	771	+ print "Total edit submit lines: $lines_edit_submit_total\n\n" ;
	772	+
	773	+ foreach $key (keys %index_php_raw)
	774	+ {
	775	+ ($client_ip,$key2) = split (',', $key, 2) ;
	776	+ if ($client_ip_record_cnt {$client_ip} < 2)
	777	+ {
	778	+ $index_php {$key2} += $index_php_raw {$key} ;
	779	+ $edit_submit_filtered += $index_php_raw {$key} ;
	780	+ }
	781	+ }
	782	+ undef %index_php_raw ;
	783	+
	784	+ open CSV_INDEXPHP, '>', "$path_out/$file_csv_indexphp" ;
	785	+
	786	+ print CSV_INDEXPHP $comment ;
	787	+ foreach $key (sort {$index_php {$b} <=> $index_php {$a}} keys %index_php)
	788	+ {
	789	+ print CSV_INDEXPHP "$key,${index_php {$key}}\n" ;
	790	+ $lines_edit_submit_filtered ++ ;
	791	+ }
	792	+ print "Filtered edits+submits: $edit_submit_filtered in $lines_edit_submit_filtered lines\n\n" ;
	793	+
	794	+ close CSV_INDEXPHP ;
	795	+}
	796	+
	797	+sub WriteOutputCountriesSaves
	798	+{
	799	+ my $path_out = shift ;
	800	+
	801	+ $comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
	802	+
	803	+ open CSV_COUNTRIES_SAVES, '>', "$path_out/$file_csv_countries_saves" ;
	804	+ print CSV_COUNTRIES_SAVES $comment ;
	805	+
	806	+ foreach $key (sort keys %countries_saves)
	807	+ {
	808	+ my $count = $countries_saves {$key} ;
	809	+ print CSV_COUNTRIES_SAVES "$key,$count\n" ;
	810	+ }
	811	+ close CSV_COUNTRIES_SAVES ;
	812	+}
	813	+
	814	+sub WriteDiagnostics
	815	+{
	816	+ if ($statusses_non_tcp > 0)
	817	+ { print ERR "Statusses non 'TCP..' : $statusses_non_tcp\n" ; }
	818	+
	819	+ if ($fields_too_many > 0)
	820	+ { print ERR "Too many fields on $fields_too_many records. (space in article name?)\n" ; }
	821	+
	822	+ if ($fields_too_few > 0)
	823	+ { print ERR "Too few fields on $fields_too_few records.\n" ; }
	824	+
	825	+ print "\nLines read per date:\n" ;
	826	+ print OUT "\nLines read per date:\n" ;
	827	+ foreach $key (sort keys %lines_read)
	828	+ {
	829	+ print OUT "$key: " . sprintf ("%8d",$lines_read{$key}) . "\n" ;
	830	+ print "$key: " . sprintf ("%8d",$lines_read{$key}) . "\n" ;
	831	+ }
	832	+ print OUT "\n" ;
	833	+ print "\n" ;
	834	+
	835	+ print "Referers internal $tot_referers_internal\n" ;
	836	+ print "Referers external $tot_referers_external\n" ;
	837	+ print "Origins counted $tot_origins_external_counted\n" ;
	838	+
	839	+ print ERR "\nUnrecognized domains:\n\n" ;
	840	+ foreach $key (sort keys %unrecognized_domains)
	841	+ { print ERR sprintf ("%5d",$unrecognized_domains{$key}) . " : " . $key . "\n" ; }
	842	+
	843	+ print "\n$tot_mime_html html requests found.\n" ;
	844	+ print "country info stored for $tot_mime_html2 html requests.\n" ;
	845	+# # double check that yahoo is much more than 10% of google (even when google uses ip addresses)
	846	+# print "\ngoogle string in referer or agent: $googles\n" ;
	847	+}
	848	+
	849	+sub WriteOutputLineToCsvSharePerOs
	850	+{
	851	+ my $total_all = shift ;
	852	+ my $criteria = shift ;
	853	+ (my $criteria2 = $criteria) =~ s/ /.*/g ;
	854	+ my @criteria = split (' ', $criteria2) ;
	855	+
	856	+ my $total_operating_systems = 0 ;
	857	+ my $trace_count = 0 ;
	858	+
	859	+ print "WriteOutputLineToCsvSharePerOs $criteria\n" ;
	860	+ foreach $key (keys %operating_systems)
	861	+ {
	862	+ my $match = $true ;
	863	+ foreach $criterion (@criteria)
	864	+ {
	865	+ if ($key !~ /$criterion/)
	866	+ {
	867	+ if (($trace_count++ < 20) && ($criteria =~ /Linux/))
	868	+ { print "key $key criterion $criterion FALSE\n" ; }
	869	+ $match = $false ;
	870	+ last ;
	871	+ }
	872	+ else
	873	+ {
	874	+ if (($trace_count++ < 20) && ($criteria =~ /Linux/))
	875	+ { print "key $key criterion $criterion TRUE\n" ; }
	876	+ }
	877	+ }
	878	+ if ($match)
	879	+ { $total_operating_systems += $operating_systems {$key} ; }
	880	+ }
	881	+ $perc_operating_systems1 = ".." ;
	882	+ $perc_operating_systems2 = ".." ;
	883	+ if ($total_all > 0)
	884	+ {
	885	+ $perc_operating_systems1 = sprintf ("%5.2f",(100*$total_operating_systems/$total_all)) ;
	886	+ $perc_operating_systems2 = sprintf ("%.2f",(100*$total_operating_systems/$total_all)) ;
	887	+ }
	888	+ print OUT2 sprintf ("%6d",$total_operating_systems) . "= $perc_operating_systems1\% : $criteria\n" ;
	889	+ print CSV_OPSYS "G,$criteria,$total_operating_systems,$perc_operating_systems2\%\n" ; ;
	890	+ $total_operating_systems_printed += $total_operating_systems ;
	891	+}
	892	+
	893	+sub MoveAndCompressFiles
	894	+{
	895	+ trace MoveAndCompressFiles ;
	896	+
	897	+ my ($path_out, $path_out_month, $date_collect_files) = @_ ;
	898	+
	899	+ print "\ncd $path_out_month\n" ;
	900	+ chdir ($path_out_month) ;
	901	+
	902	+# $cmd = "mv $path_out/private/SquidDataEditsSavesDoNotPublish.txt $path_out/private/SquidDataEditsSavesDoNotPublish$date_collect_files.txt" ;
	903	+# print "\ncmd = '$cmd'\n" ;
	904	+#`$cmd` ;
	905	+
	906	+ $cmd = "bzip2 -f $path_out/$file_edits_saves" ;
	907	+ print "\ncmd = '$cmd'\n" ;
	908	+ `$cmd` ;
	909	+
	910	+ $cmd = "bzip2 -f $path_out/$file_csv_agents" ;
	911	+ print "\ncmd = '$cmd'\n" ;
	912	+ `$cmd` ;
	913	+
	914	+ # $cmd = "tar -cf $date_collect_files\-csv.tar $date_collect_files/*.csv" ;
	915	+ # print "\ncmd = '$cmd'\n" ;
	916	+ # `$cmd` ;
	917	+
	918	+ # $cmd = "bzip2 -f $date_collect_files\-csv.tar" ;
	919	+ # print "\ncmd = '$cmd'\n" ;
	920	+ # `$cmd` ;
	921	+}
	922	+
	923	+1 ;
Index: trunk/wikistats/squids/SquidCountArchive.pl
—	—	@@ -510,6 +510,7 @@
511	511	undef %squid_events ;
512	512	undef %squid_seqno ;
513	513	undef %statusses ;
	514	+ undef %total_clients ;
514	515	undef %unrecognized_domains ;
515	516	undef %wikis ;
516	517	# undef @files ;
Index: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
—	—	@@ -4,13 +4,20 @@
5	5	{
6	6	my $line = shift ;
7	7
8		~~- my @fields = split (' ', $line) ;~~
9	8	$time = $fields [2] ;
10	9	$date = substr ($time,0,10) ;
11	10
12	11	$client_ip = $fields [4] ;
13	12	$mime = $fields [10] ;
	13	+ $url = lc ($fields [8]) ;
14	14
	15	+ if ($mime eq '-')
	16	+ {
	17	+ # no mime type on log records from varnish, assume 'page request' on most, until that stream had been fixed
	18	+ if (($url =~ /\.m\..?\/wiki\//) \|\| ($url =~ /\.m\..?\/w\/index.php/))
	19	+ { $mime = "text/html" ; }
	20	+ }
	21	+
15	22	if ($scan_ip_frequencies) # phase 1
16	23	{
17	24	return if $line =~ /Banner(?:Cont\|List\|Load\|beheer)/io ;
—	—	@@ -49,8 +56,12 @@
50	57	$status = $fields [5] ;
51	58	$size = $fields [6] ;
52	59	$method = $fields [7] ;
53		~~- $url = lc ($fields [8]) ;~~
54	60
	61	+ $referer = lc ($fields [11]) ;
	62	+ $agent = $fields [13] ;
	63	+
	64	+# print "\ntime '$time', client_ip '$client_ip', mime '$mime', squid '$squid', seqno '$seqno', \nstatus '$status', size '$size', method '$method', referer '$referer',\nurl '$url', agent '$agent'\n" ;
	65	+
55	66	if ($url =~ /\.m\.wikipedia.org/)
56	67	{
57	68	$url_wikipedia_mobile ++ ;
—	—	@@ -63,9 +74,6 @@
64	75	}
65	76	}
66	77
67		~~- $referer = lc ($fields [11]) ;~~
68		~~- $agent = $fields [13] ;~~
69		-
70	78	$url =~ s/^http\w?\:\/\///o ;
71	79	$url =~ s/\%3A/:/gio ;
72	80	$url =~ s/\%3B/;/gio ;
—	—	@@ -355,7 +363,11 @@
356	364
357	365	if ($os =~ /Linux/o)
358	366	{
359		~~- ($osx = $agent2) =~ s/^.?((?:Android\|Ubuntu\|Gentoo\|PCLinuxOS\|CentOS\|Red Hat\|Mandriva\|SUSE\|Fedora\|Epiphany\|Debian\|Motor\w+)[^\s;\[\]]).*$/ucfirst($1)/ieo ;~~
	367	+ ($cpu = $agent2) =~ s/^.?(armv\d+\|i\d+\|x[0-9_]+).$/$1/o ;
	368	+ if ($cpu eq $agent2)
	369	+ { $cpu = '' ; }
	370	+
	371	+ ($osx = $agent2) =~ s/^.?((?:Android\|Xubuntu\|Kubuntu\|Ubuntu\|Gentoo\|PCLinuxOS\|CentOS\|Oracle\|Mandriva\|Red Hat\|Mandriva\|openSUSE\|SUSE\|Fedora\|Epiphany\|Mint\|Mips\|Arch\|Debian\|Slackware\|Motor\w+)[^\s;\[\]]).*$/ucfirst($1)/ieo ;
360	372	if ($osx ne $agent2)
361	373	{
362	374	$osx =~ s/(\d+\_\d+).*$/$1/o ;
—	—	@@ -363,8 +375,18 @@
364	376	$osx =~ s/_/\./o ;
365	377	$osx =~ s/(\d+\.\d+).*$/$1/o ;
366	378	$osx =~ s/^(Motor)(\w+).*$/ucfirst(lc($1)).uc($2)/ieo ;
367		~~- $os = "$os $osx" ;~~
368	379	}
	380	+ else
	381	+ { $osx = "Other" ; }
	382	+
	383	+ $os = "$os $cpu $osx" ;
	384	+ $os =~ s/\s\s+/ /g ;
	385	+
	386	+ # testing:
	387	+ # if ($osx eq $agent2)
	388	+ # { print "Linux ?? -> $agent2\n" ; }
	389	+ # elsif ($osx !~ /(?:Android\|Ubuntu)/i)
	390	+ # { print "Linux !! $cpu $osx -> $agent2\n" ; }
369	391	}
370	392
371	393	$os =~ s/(Windows NT \d+\.\d+).*$/$1/o ;
—	—	@@ -1189,7 +1211,7 @@
1190	1212	($path = $url) =~ s/^.*?\.org\///o ;
1191	1213	($file = $path) =~ s/^.\/([^\/])$/$1/go ; # remove path
1192	1214
1193		~~- $binaries {$file} ++ ;~~
	1215	+ $binaries {$path} ++ ; # Jan 2012 store path, not file only
1194	1216
1195	1217	if ($file =~ /(?:gif\|jpg\|jpeg\|png\|svg)$/io)
1196	1218	{
Index: trunk/wikistats/squids/SquidReportArchive.pl
—	—	@@ -6,8 +6,10 @@
7	7	ez_lib_version (2) ;
8	8
9	9	# set defaults mainly for tests on local machine
10		~~- default_argv "-m 2011-07 " ;~~
11		~~-# default_argv "-c -q 2010Q4" ;~~
	10	+# default_argv "-m 2011-07 " ;
	11	+# default_argv "-c -q 2010Q1" ;
	12	+# default_argv "-w" ; # refresh country info from Wikipedia (population etc)
	13	+ default_argv "-c" ;
12	14
13	15	# to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
14	16	# ReportOrigin how to handle '!error <-> other
—	—	@@ -30,6 +32,8 @@
31	33
32	34	getopt ("dmq", \%options) ;
33	35
	36	+ undef %country_code_not_specified_reported ;
	37	+
34	38	if (-d "/a/squid")
35	39	{
36	40	print "\n\nJob runs on server $hostname\n\n" ;
—	—	@@ -52,11 +56,13 @@
53	57	print "Path in = $path_in\n" ;
54	58	print "Path out = $path_out\n" ;
55	59
	60	+ $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
	61	+
56	62	# periodically harvest updated metrics from
57	63	# 'http://en.wikipedia.org/wiki/List_of_countries_by_population'
58	64	# 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users'
59	65	if (defined ($options {"w"}))
60		~~- { &ReadWikipedia ; exit ; }~~
	66	+ { &ReadWikipedia ; print "Ready\n" ; exit ; }
61	67
62	68	if (defined ($options {"c"}))
63	69	{ $reportcountries = $true ; }
—	—	@@ -77,7 +83,6 @@
78	84	&InitProjectNames ;
79	85
80	86	$file_csv_country_codes = "CountryCodes.csv" ;
81		~~- $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;~~
82	87
83	88	&ReadInputCountriesNames ;
84	89
—	—	@@ -384,7 +389,7 @@
385	390	$title = "$title_main - Wikipedia <font color=#008000>$views_edits Per Country</font> - Trends" ;
386	391	&WriteReportPerCountryTrends ($title, $views_edits, &UnLink ($links,$offset_links+3)) ;
387	392
388		~~- $links =~ s/,.*$// ;~~
	393	+# $links =~ s/,.*$// ;
389	394	$title = "$title_main - <font color=#008000>$views_edits Per Wikipedia Language</font> - Breakdown" ;
390	395	&WriteReportPerLanguageBreakDown ($title, $views_edits, &UnLink ($links,$offset_links+4)) ;
391	396	}
—	—	@@ -480,8 +485,23 @@
481	486	"<input type='button' value=' Archive ' onclick='window.location=\"http://stats.wikimedia.org/archive/squid_reports\"'> " .
482	487	"<input type='button' value=' Wikimedia Statistics ' onclick='window.location=\"http://stats.wikimedia.org\"'>" .
483	488	"</td></tr>\n</table><hr>" .
484		~~- " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ;~~
	489	+ # " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<p>" ;
	490	+ " This analysis is based on a 1:1000 sampled server log (squids) X1000\nALSO<br>" ;
485	491
	492	+ if ($reportcountries)
	493	+ {
	494	+ $header .= "<p> <font color=#900000>WMF traffic logging service suffered from server capacity problems from Nov 2009 till July 2010 and again in Aug/Sep/Oct 2011.<br>" .
	495	+ " Data loss only occurred during peak hours. It therefore may have had somewhat different impact for traffic from different parts of the world." ;
	496	+ }
	497	+ else
	498	+ {
	499	+ $header .= "<font color=#900000>WMF traffic logging service suffered from server capacity problems in Aug/Sep/Oct 2011.<br>" .
	500	+ "Absolute traffic counts for October 2011 are approximatly 7% too low.<br>" .
	501	+ "Data loss only occurred during peak hours. It therefore may have had somewhat different impact for traffic from different parts of the world.<br>" .
	502	+ "and may have also skewed relative figures like share of traffic per browser or operating system.</font><p>" ;
	503	+ $header .= "<font color=#900000>In a an unrelated server outage precisely half of traffic to WMF mobile sites was not counted from Oct 16 - Nov 29 (one of two load-balanced servers did not report traffic).<br>" .
	504	+ "WMF has since improved server monitoring, so that similar outages should be detected and fixed much faster from now on.</font><p>" ;
	505	+ }
486	506	# to be localized some day like any reports
487	507	$out_license = "All data and images on this page are in the public domain." ;
488	508	$out_generated = "Generated on " ;
—	—	@@ -619,7 +639,7 @@
620	640
621	641	$client =~ s/_/./g ;
622	642	$client =~ s/\.\./Other/g ;
623		~~- if ($client !=~ / \d/)~~
	643	+ if ($client !~ / \d/)
624	644	{ $client =~ s/\// / ; }
625	645	if ($rectype eq "-") { $total_clients_non_mobile += $count ; }
626	646	if ($rectype eq "M") { $total_clients_mobile += $count ; }
—	—	@@ -1332,6 +1352,7 @@
1333	1353	{
1334	1354	# http://en.wikipedia.org/wiki/List_of_countries_by_population
1335	1355	# http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
	1356	+ print "Read $path_in/$file_csv_country_meta_info\n" ;
1336	1357	open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
1337	1358	binmode COUNTRY_META_INFO ;
1338	1359	while ($line = <COUNTRY_META_INFO>)
—	—	@@ -1342,6 +1363,7 @@
1343	1364	$line =~ s/C..?te d'Ivoire/Côte d'Ivoire/g ;
1344	1365
1345	1366	($country,$link,$population,$connected,$icon) = split ',', $line ;
	1367	+ print "COUNTRY $country\nLINK $link\nPOPULATION $population\nCONNECTED $connected\n\n" ;
1346	1368	$country =~ s/&comma;/,/g ;
1347	1369
1348	1370	# use country names as given by MaxMind
—	—	@@ -1533,10 +1555,10 @@
1534	1556	$months_recently = keys %months_recently ;
1535	1557	if ($months_recently == 0) { die "\$months_recently == 0\n" ; }
1536	1558
1537		~~- $requests_recently_start = substr ($requests_recently_start,5,2) . "/" . substr ($requests_recently_start,2,2) ;~~
1538		~~- $requests_recently_stop = substr ($requests_recently_stop ,5,2) . "/" . substr ($requests_recently_stop ,2,2) ;~~
1539		~~- $requests_start = substr ($requests_start,5,2) . "/" . substr ($requests_start,2,2) ;~~
1540		~~- $requests_stop = substr ($requests_stop ,5,2) . "/" . substr ($requests_stop ,2,2) ;~~
	1559	+ $requests_recently_start = substr ($requests_recently_start,0,4) . '/' . substr ($requests_recently_start,5,2);
	1560	+ $requests_recently_stop = substr ($requests_recently_stop ,0,4) . '/' . substr ($requests_recently_stop ,5,2) ;
	1561	+ $requests_start = substr ($requests_start,0,4) . '/' . substr ($requests_start,5,2) ;
	1562	+ $requests_stop = substr ($requests_stop ,0,4) . '/' . substr ($requests_stop ,5,2) ;
1541	1563
1542	1564	foreach $yyyymm (keys %$yyyymm)
1543	1565	{
—	—	@@ -4644,9 +4666,6 @@
4645	4667	$html_total .= "<tr><td colspan=99> </td></tr>" ;
4646	4668
4647	4669
4648		~~- undef @keys_regions ;~~
4649		~~-# foreach $key (sort keys %population_per_hemisphere)~~
4650		~~-# { push @keys_regions, $key ; }~~
4651	4670	$html_regions = '' ;
4652	4671	foreach $key (qw (N S AF AS AU EU CA NA SA OC))
4653	4672	{
—	—	@@ -5150,19 +5169,19 @@
5151	5170	my $views_edits_lc = lc $views_edits ;
5152	5171
5153	5172	if ($show_logcount)
5154		~~- { $report_version = "<p>This is the extended version of this report, with even small percentages included (> $cutoff_percentage\%) (see also bottom of page). " .~~
5155		~~- "Switch to <a href='$file_html_per_country_breakdown'>regular version</a>" ; }~~
	5173	+ { $report_version = "<p>Showing even small percentages (> $cutoff_percentage\%) (read <a href='#more'>more</a>). " .
	5174	+ "Switch to <a href='$file_html_per_country_breakdown'>concise version</a>" ; }
5156	5175	else
5157		~~- { $report_version = "<p>This is the regular version of this report, with only major percentages (> $cutoff_percentage\%) included." .~~
5158		~~- " Switch to <a href='$file_html_per_country_breakdown_huge'>extended version</a>" ; }~~
	5176	+ { $report_version = "<p>Showing only only major percentages (> $cutoff_percentage\%) (read <a href='#more'>more</a>). " .
	5177	+ " Switch to <a href='$file_html_per_country_breakdown_huge'>detailed version</a>" ; }
5159	5178
5160	5179	$html = $header ;
5161	5180	$html =~ s/TITLE/$title/ ;
5162	5181	$html =~ s/HEADER/$title/ ;
5163	5182	$html =~ s/LINKS// ;
5164		~~- $html =~ s/ALSO/$links/ ;~~
	5183	+ $html =~ s/ALSO/$links$report_version/ ;
5165	5184	$html =~ s/NOTES// ;
5166		~~- $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b><br>$report_version/ ;~~
	5185	+ $html =~ s/X1000/. Period <b>$requests_recently_start - $requests_recently_stop<\/b>/ ;
5167	5186	$html =~ s/DATE// ;
5168	5187
5169	5188	$html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
—	—	@@ -5262,7 +5281,7 @@
5263	5282	# $html .= "<tr><td colspan=99> </td></tr>\n" ;
5264	5283	}
5265	5284	$html .= "</table>" ;
5266		~~- $html .= "<p><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .~~
	5285	+ $html .= "<p><a name='more' id='more'></a><b>Share<\/b> is the percentage of requesting ip addresses (out of the global total) which originated from this country" .
5267	5286	"<br> Further percentages show per country share of $views_edits_lc per Wikipedia visited" ;
5268	5287	$html .= "<p><b>Countries</b> are only included if the number of requests in the period exceeds $cutoff_requests,000 ($cutoff_requests matching records in 1:1000 sampled log)" ;
5269	5288	$html .= "<p><b>Wikipedia's</b> are only listed for some country if the share of visitors for that particular country exceeds $cutoff_percentage\%." ;
—	—	@@ -5309,6 +5328,12 @@
5310	5329	$html =~ s/X1000/. Period <b>$requests_start - $requests_stop<\/b>/ ;
5311	5330	$html =~ s/DATE// ;
5312	5331
	5332	+ if ($views_edits eq 'Page Views')
	5333	+ {
	5334	+ $html .= "<p><font color=#800000>Nov 2011: For some countries the share of page views on the English Wikipedia was significantly higher in 2010 than in 2009 and 2011,<br>" .
	5335	+ "especially in Q1 and Q2. We don't know yet what caused this, this might be an artifact. Please be cautious to draw conclusions from this.</font>" ;
	5336	+ }
	5337	+
5313	5338	$html .= "<p>Portal is <a href='http://www.wikipedia.org'>www.wikipedia.org</a>\n" ;
5314	5339
5315	5340	$html .= "<p><table border=1 width=800>INDEX\n" ;
—	—	@@ -5892,10 +5917,15 @@
5893	5918
5894	5919	sub ReadWikipedia
5895	5920	{
	5921	+ print "ReadWikipedia\n\n" ;
	5922	+
5896	5923	use LWP::Simple qw($ua get);
5897	5924
5898	5925	$ua->agent('Wikipedia Wikicounts job');
5899	5926	$ua->timeout(60);
	5927	+
	5928	+
	5929	+ print "Read List_of_countries_by_population\n\n" ;
5900	5930	my $url = 'http://en.wikipedia.org/wiki/List_of_countries_by_population';
5901	5931	my $html = get $url || die "Timed out!";
5902	5932
—	—	@@ -5955,9 +5985,12 @@
5956	5986	$link =~ s/,/&comma;/g ;
5957	5987	$icon =~ s/,/&comma;/g ;
5958	5988
	5989	+ print "country: $country\nlink: $link\npopulation: $population\nconnected: $connected\nicon: $icon\n\n" ;
5959	5990	$countries {$country} = "$country,$link,$population,connected,$icon\n" ;
5960	5991	}
5961	5992
	5993	+ print "List_of_countries_by_number_of_Internet_users\n\n" ;
	5994	+
5962	5995	$url = 'http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users';
5963	5996	$html = get $url || die "Timed out!";
5964	5997
—	—	@@ -5995,10 +6028,12 @@
5996	6029	$country =~ s/Timor Leste/Timor-Leste/ ;
5997	6030	$country =~ s/UAE/United Arab Emirates/ ;
5998	6031
	6032	+ print "country: $country\nconnected: $connected\n\n" ;
5999	6033	$countries {$country} =~ s/connected/$connected/ ;
6000	6034	}
6001	6035
6002		~~- open COUNTRY_META_INFO, '>', "$path_out/SquidReportCountryMetaInfo.csv" ;~~
	6036	+ print "Write $path_in/$file_csv_country_meta_info\n\n" ; # use $path_in, not $path_out so that next step picks up proper file
	6037	+ open COUNTRY_META_INFO, '>', "$path_in/$file_csv_country_meta_info" ;
6003	6038	foreach $country (sort keys %countries)
6004	6039	{ print COUNTRY_META_INFO $countries {$country} ; }
6005	6040	close COUNTRY_META_INFO ;
—	—	@@ -6086,11 +6121,11 @@
6087	6122	sub UnLink
6088	6123	{
6089	6124	my ($links,$index) = @_ ;
6090		~~-# print "\n\nUnLink $index\n\n" ;~~
	6125	+ # print "\n\nUnLink $index\n\n" ;
6091	6126	my @segments = split '(?=<a )', $links ;
6092		~~-# print "SEGMENT 1 $segments[$index]\n" ;~~
	6127	+ # print "SEGMENT 1 $segments[$index]\n" ;
6093	6128	$segments [$index] =~ s/^.?<a .?>([^<]*)<\/a>/<font color=#008000><b>$1<\/b><\/font>/ ;
6094		~~-# print "SEGMENT 2 $segments[$index]\n" ;~~
	6129	+ # print "SEGMENT 2 $segments[$index]\n" ;
6095	6130	$links = join '', @segments ;
6096	6131	return ($links) ;
6097	6132	}
—	—	@@ -6139,8 +6174,8 @@
6140	6175	id: "millions",
6141	6176	is: function(s) { return false; },
6142	6177	//failed so far to turn 1.2M into 1200000, so figures with decimal point are sorted out of place
6143		-//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/\\.(\d)M/,$1+"00000").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); },
6144		- format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,""). replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); },
	6178	+//format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,"").replace(/\\.(\\d)M/,$1+"00000").replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); },
	6179	+ format: function(s) { return \$.tablesorter.formatFloat(s.replace(/<[^>]*>/g,"").replace(/ /g,""). replace(/M/,"000000").replace(/М/,"000000").replace(/K/,"000").replace(/К/i,"000")); },
6145	6180	type: "numeric"
6146	6181	});
6147	6182
—	—	@@ -6211,7 +6246,7 @@
6212	6247	}
6213	6248	</style>
6214	6249	__HTML_SORT_TABLE__
6215		~~-return ($html) ;~~
	6250	+ return ($html) ;
6216	6251	}
6217	6252
6218	6253	sub HtmlSortTableColumns

Status & tagging log

19:30, 17 January 2012 Reedy (talk | contribs) changed the status of r109171 [removed: new added: deferred]