r86715 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86714 | r86715 | r86716 >
Date:16:14, 22 April 2011
Author:ezachte
Status:deferred
Tags:
Comment:
parse data bug fixed + new param for quarterly report
Modified paths:
  • /trunk/wikistats/squids/SquidCountArchiveReadInput.pm (added) (history)
  • /trunk/wikistats/squids/SquidReportArchive.pl (modified) (history)

Diff [purge]

Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
@@ -0,0 +1,353 @@
 2+ #!/usr/bin/perl
 3+
 # CollectFilesToProcess: populate global @files with archived squid sample logs
 # ("sampled-1000.log-yyyymmdd.gz" under /a/squid/archive) whose first/last record
 # timestamps (via GetLogRange) overlap [$time_to_start, $time_to_stop].
 # Returns $true when files covering the full range were found, $false otherwise.
 # NOTE(review): relies on globals ($job_runs_on_production_server, @files,
 # $file_test, $file_head_tail, $time_start, $true/$false) declared elsewhere;
 # no 'use strict' is visible in this file — TODO confirm the including script provides it.
 4+sub CollectFilesToProcess
 5+{
 6+ trace CollectFilesToProcess ;
 7+
 # Test/dev mode: skip the archive scan entirely and process the fixed test file.
 8+ if (! $job_runs_on_production_server)
 9+ {
 10+ push @files, $file_test ;
 11+ return $true ;
 12+ }
 13+
 14+ my ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month) = @_ ;
 15+
 16+ print "Collect files for date $date_collect_files: files with timestamps between $time_to_start and $time_to_stop\n\n" ;
 17+
 # NOTE(review): $all_files_found is assigned here but never used below
 # ($full_range_found is the flag actually consulted) — dead variable.
 18+ my $all_files_found = $true ;
 19+
 20+ my ($date_archived) ;
 21+
 22+ $dir_in = "/a/squid/archive" ;
 23+
 24+ $some_files_found = $false ;
 25+ $full_range_found = $false ;
 26+
 # Shared head/tail cache file, also read/written by GetLogRange.
 27+ $path_head_tail = "$path_out_month/$file_head_tail" ;
 28+
 29+ # file naming scheme on server: sampled-1000.log-yyyymmdd, does not mean on that day file sampled-1000.log was archived
 30+ # file can contain data for days(s) before and day (days?) after yyyymmdd, see e.g. sampled-10000.log-20090802 (days 0801-0803)
 31+ # this is confusing so start a few days earlier and check for each day:
 32+ # whether a file exists and whether it's 'head' and or 'tail' time (first last record) fall within range
 33+
 34+ # find first and last file to process, meaning all files that comprise log records within date range
 35+
 36+ $head_found = $false ;
 37+ $tail_found = $false ;
 38+
 # Walk backwards from ($days_ago + 2) down to ($days_ago - 5): newest candidate
 # first, so the loop can stop as soon as both range boundaries are covered.
 39+ for ($days_ago_inspect = $days_ago + 2 ; $days_ago_inspect >= $days_ago - 5 ; $days_ago_inspect--)
 40+ {
 41+ next if $days_ago_inspect < 0 ; # days ago can't be negative
 42+
 # Derive the yyyymmdd suffix of the candidate archive file from wall-clock time.
 43+ ($sec,$min,$hour,$day,$month,$year) = localtime ($time_start - $days_ago_inspect * 24 * 3600) ;
 44+ $date_archived = sprintf ("%4d%02d%02d", $year+1900, $month+1, $day) ;
 45+ print "\n- Inspect file saved $days_ago_inspect days ago: sampled-1000.log-$date_archived.gz\n" ;
 46+
 47+ my $file = "$dir_in/sampled-1000.log-$date_archived.gz" ;
 48+
 49+ if (! -e $file)
 50+ { print "- File not found: $file\n" ; }
 51+ else
 52+ {
 # First/last timestamp in the file (string compare works: timestamps are
 # fixed-width and lexically ordered).
 53+ ($timehead,$timetail) = &GetLogRange ($file, $path_head_tail) ;
 54+
 # Overlap test: file is relevant if its span intersects the requested range.
 55+ if (($timetail ge $time_to_start) && ($timehead le $time_to_stop))
 56+ {
 57+ print "- Include this file\n" ;
 58+
 59+ $some_files_found = $true ;
 60+ push @files, $file ;
 61+ if ($timehead le $time_to_start) { $head_found = $true ; print "- Head found\n" ; }
 62+ if ($timetail ge $time_to_stop) { $tail_found = $true ; print "- Tail found\n" ; }
 63+ }
 64+
 65+ # assuming only one file is archived per day !
 66+ if ($head_found && $tail_found)
 67+ {
 68+ $full_range_found = $true ;
 69+ last ;
 70+ }
 71+ }
 72+ }
 73+
 # Either failure mode skips the whole day rather than processing partial data.
 74+ if (! $some_files_found)
 75+ { print "Not any file was found which contains log records for $days_ago days ago. Skip processing for $date_collect_files.\n\n" ; return $false ; }
 76+ if (! $full_range_found)
 77+ { print "Not all files were found which contain log records for $days_ago days ago. Skip processing for $date_collect_files.\n\n" ; return $false ; }
 78+
 79+ print "\n" ;
 80+ foreach $file (sort @files)
 81+ { print "Process $file\n" ; }
 82+
 83+ return $true ;
 84+}
 85+
 # ReadIpFrequencies: load the per-day "frequency,address" csv (bz2-compressed on
 # the production server, plain locally) into global hash %ip_frequencies.
 # Arg: $path_out (directory holding the csv). Returns $true if any line was read.
 86+sub ReadIpFrequencies
 87+{
 88+ trace ReadIpFrequencies ;
 89+
 90+ my $path_out = shift ;
 91+
 92+ my $data_read = $false ;
 93+
 94+ if ($job_runs_on_production_server)
 95+ {
 96+ if (! -e "$path_out/$file_ip_frequencies_bz2")
 97+ { print "$path_out/$file_ip_frequencies_bz2 not found. Abort processing for this day." ; return $false ; }
 98+
 # NOTE(review): precedence bug — high-precedence '||' binds to the command
 # string (always true), so abort() can never fire; should be low-precedence
 # 'or' after the open. Same pattern on the plain-file open below. Also a
 # bareword filehandle and a shell-interpolated filename; modern style would
 # be a lexical handle with list-form pipe open.
 99+ open CSV_ADDRESSES, "-|", "bzip2 -dc \"$path_out/$file_ip_frequencies_bz2\"" || abort ("Input file $path_out/$file_ip_frequencies_bz2 could not be opened.") ;
 100+ }
 101+ else
 102+ {
 103+ if (! -e "$path_out/$file_ip_frequencies")
 104+ { print "$path_out/$file_ip_frequencies not found. Abort processing for this day." ; return $false ; }
 105+
 106+ open CSV_ADDRESSES, '<', "$path_out/$file_ip_frequencies" || abort ("Input file $path_out/$file_ip_frequencies could not be opened.") ;
 107+ }
 108+
 109+ while ($line = <CSV_ADDRESSES>)
 110+ {
 # NOTE(review): $data_read is set before the '#' comment-line filter, so a
 # file containing only comments still reports data read — TODO confirm intended.
 111+ $data_read = $true ;
 112+
 113+ if ($line =~ /^#/o) { next ; }
 114+ chomp ($line) ;
 115+ ($frequency, $address) = split (',', $line) ;
 116+ $ip_frequencies {$address} = $frequency ;
 117+ $addresses_stored++ ;
 118+ }
 119+
 # NOTE(review): CSV_ADDRESSES is never closed before returning.
 120+ print "\n$addresses_stored addresses stored that occur more than once\n\n" ;
 121+
 122+ return $data_read ;
 123+}
 124+
 # ReadSquidLogFiles: main ingest loop. For each file in @files, stream its
 # records (piped through geoiplogtag on the production server), keep only those
 # with timestamp in [$time_to_start, $time_to_stop), count lines per day into
 # global %lines_read, optionally tee edit/submit requests to FILE_EDITS_SAVES,
 # and hand each surviving record to &ProcessLine. Returns $true iff at least
 # one in-range record was processed.
 125+sub ReadSquidLogFiles
 126+{
 127+ trace ReadSquidLogFiles ;
 128+
 129+ my $data_read = $false ;
 130+
 131+ my ($path_out, $time_to_start, $time_to_stop, @files) = @_ ;
 132+
 133+ if ($#files == -1)
 134+ { print "ReadInput: No files to process.\n\n" ; }
 135+
 136+ print "Read log records in range $time_to_start till $time_to_stop\n\n" ;
 137+
 # NOTE(review): opened only when ($job_runs_on_production_server && $scan_all_fields),
 # but written below and closed at the end under $job_runs_on_production_server
 # alone — guard mismatch; prints to an unopened handle if $scan_all_fields is
 # false on production. Open is also unchecked. TODO confirm flags always agree.
 138+ if ($job_runs_on_production_server && $scan_all_fields)
 139+ { open FILE_EDITS_SAVES, '>', "$path_out/$file_edits_saves" ; }
 140+
 141+ my $lines = 0 ;
 142+ while ($#files > -1)
 143+ {
 144+ $file_in = shift (@files) ;
 145+
 146+ print "Process $file_in\n" ;
 147+ if (! -e $file_in)
 148+ { print "ReadInput: File not found: $file_in. Aborting...\n\n" ; exit ; }
 149+
 # On production: decompress and annotate each record with geo info; the
 # extra geoiplogtag column raises the expected field count to 14.
 150+ if ($job_runs_on_production_server)
 151+ {
 152+ if ($file_in =~ /\.gz$/o)
 153+ { open IN, "-|", "gzip -dc $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
 154+ else
 155+ { open IN, "-|", "cat $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
 156+ $fields_expected = 14 ;
 157+ }
 158+ else
 159+ {
 # NOTE(review): unchecked open; a missing/unreadable file silently yields
 # an empty read loop (the -e test above catches only nonexistence).
 160+ open IN, '<', $file_in ;
 161+ # $fields_expected = 14 ;
 162+ $fields_expected = 13 ;
 163+ }
 164+
 165+ $line = "" ;
 166+ while ($line = <IN>)
 167+ {
 168+ $lines_in_file ++ ;
 169+
 170+ # if ($line =~ /fy\.wikipedia\.org/o) # test/debug
 171+ # {
 172+ # print FILTER_FY $line ;
 173+ # print $line ;
 174+ # }
 175+
 # Malformed records (wrong field count) are counted and skipped, not fatal.
 176+ @fields = split (' ', $line) ;
 177+ if ($#fields < $fields_expected) { $fields_too_few ++ ; next ; }
 178+ if ($#fields > $fields_expected) { $fields_too_many ++ ; next ; }
 179+
 # Field 2 is the record timestamp; comparisons below are lexical ('lt'/'ge'),
 # which is valid for fixed-width sortable timestamp strings.
 180+ $time = $fields [2] ;
 181+
 182+ if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
 183+ { $oldest_time_read = $time ; }
 184+ if (($newest_time_read eq "") || ($time gt $newest_time_read))
 185+ { $newest_time_read = $time ; }
 186+
 # If even the oldest record seen starts at/after the stop time, this and
 # all later files are entirely out of range — stop reading this file.
 187+ if ($oldest_time_read ge $time_to_stop)
 188+ { last ; }
 189+
 # Records before the window: skip, with a progress marker every 100k skips.
 190+ if ($time lt $time_to_start)
 191+ {
 192+ if (++ $times % 100000 == 0)
 193+ { print "[$time]\n" ; }
 194+ next ;
 195+ }
 196+
 197+ if ($time ge $time_to_stop)
 198+ { last ; }
 199+
 200+ $date = substr ($time,0,10) ;
 201+ if ($date lt $date_prev) { next ; } # occasionally one record for previous day arrives late
 202+
 # Day rollover: flush the per-day line count for the previous day.
 203+ $data_read = $true ;
 204+ if ($date ne $date_prev)
 205+ {
 206+ print &ddhhmmss (time - $time_start) . " $date\n" ;
 207+ if ($date_prev ne "")
 208+ {
 209+ print "$date_prev: $lines_this_day\n" ;
 210+ $lines_read {$date_prev} = $lines_this_day ;
 211+ }
 212+ $lines_this_day = 0 ;
 213+ $date_prev = $date ;
 214+ }
 215+ $lines_this_day++ ;
 216+
 217+ if ($job_runs_on_production_server)
 218+ {
 219+ if (($line =~ /action=edit/o) || ($line =~ /action=submit/o))
 220+ { print FILE_EDITS_SAVES $line ; }
 221+ }
 222+
 223+ $lines++ ;
 224+
 225+#next if $line !~ /http:\/\/\w+\.m\./ ;
 226+#print "$line\n" ;
 227+ &ProcessLine ($line) ;
 228+ if (++ $lines_processed % 10000 == 0)
 229+ {
 230+ if ($banner_requests_ignored == 0)
 231+ { print "$time $lines_processed\n" ; }
 232+ else
 233+ { print "$time $lines_processed ($banner_requests_ignored banner requests ignored)\n" ; }
 234+ }
 # Test mode cap on processed lines.
 235+ if ($test and $lines_processed >= $test_maxlines)
 236+ { last ; }
 237+ }
 238+ close IN ;
 239+ }
 240+
 # NOTE(review): this early return skips the FILE_EDITS_SAVES close and the
 # final %lines_read flush below — TODO confirm that is intended for the
 # ip-frequency scan pass.
 241+ if ($scan_ip_frequencies)
 242+ { return ($data_read) ; }
 243+
 244+ if ($job_runs_on_production_server)
 245+ { close FILE_EDITS_SAVES ; }
 246+
 247+ $lines_read {$date_prev} = $lines_this_day ;
 248+
 249+ if ($lines == 0)
 250+ {
 251+ $data_read = $false ;
 252+ print "No data found for $time_to_start - $time_to_stop\n" ;
 253+ }
 254+ else
 255+ { print "$lines_this_day out $lines_in_file processed\n" ; }
 256+
 257+ return ($data_read) ;
 258+}
 259+
 # ReadInputEditsSavesFile: stream a bz2-compressed edits/saves capture file and
 # feed every line containing 'index.php' to &ProcessLine. No meaningful return value.
 260+sub ReadInputEditsSavesFile
 261+{
 262+ trace ReadInputEditsSavesFile ;
 263+
 264+ my $file_txt = shift ;
 265+
 266+ print "Process $file_txt\n" ;
 267+
 # NOTE(review): same '||' precedence bug as ReadIpFrequencies — '||' binds to
 # the command string, so abort() is unreachable; should be 'or'. The filename
 # is also interpolated into a shell command (quoting helps but list-form
 # open '-|', 'bzip2', '-dc', $file_txt would bypass the shell entirely).
 268+ open IN, "-|", "bzip2 -dc \"$file_txt\"" || abort ("Input file '" . $file_txt . "' could not be opened.") ;
 269+# open IN, '<', "2010-04/SquidDataEditsSaves2010-04-01.txt" || abort ("Input file '" . $file_txt . "' could not be opened.") ; # test
 270+
 271+ while ($line = <IN>)
 272+ {
 273+ if ($line =~ /index\.php/o)
 274+ { &ProcessLine ($line) ; }
 275+ }
 276+ close IN ;
 277+}
 278+
 # GetLogRange: return (first, last) record timestamps for a (possibly gzipped)
 # log file. Results are cached in $path_head_tail as "file,head,tail" csv lines
 # so the expensive decompress-and-head/tail is done once per file.
 279+sub GetLogRange # finding first and last timestamp ('head' and 'tail') in compressed file is costly, cache results for reuse
 280+{
 281+ my ($file,$path_head_tail) = @_ ;
 282+
 # Reload the cache into global %timeheads/%timetails.
 # NOTE(review): the whole cache file is re-read on EVERY call (hashes are
 # global, so once per run would suffice) — harmless but O(n^2) over a month.
 283+ if (-e $path_head_tail)
 284+ {
 285+ open CSV_HEAD_TAIL, '<', $path_head_tail ;
 286+ while ($line = <CSV_HEAD_TAIL>)
 287+ {
 288+ chomp $line ;
 289+ my ($logfile,$head,$tail) = split (',', $line) ;
 290+ $timeheads {$logfile} = $head ;
 291+ $timetails {$logfile} = $tail ;
 292+ }
 293+ close CSV_HEAD_TAIL ;
 294+ }
 295+
 # Cache hit: both head and tail known for this file.
 296+ $timehead = $timeheads {$file} ;
 297+ $timetail = $timetails {$file} ;
 298+
 299+ if (($timehead ne '') && ($timetail ne ''))
 300+ {
 301+ print "- HEAD $timehead TAIL $timetail (from head-tail cache)\n" ;
 302+ return ($timehead, $timetail) ;
 303+ }
 304+
 # Cache miss: compute head/tail from the file itself. These lexicals shadow
 # the globals assigned above (which were empty on this path).
 305+ my ($line, @fields, $timehead, $timetail) ;
 306+ print "$file: " ;
 307+ if (! -e $file)
 308+ {
 309+ print "- GetLogRange error: File not found: $file\n" ;
 310+ exit ;
 311+ }
 312+
 # First record: field 2 is the timestamp.
 # NOTE(review): $file is interpolated unquoted into a shell command (backticks);
 # safe only because archive paths are machine-generated — TODO confirm.
 313+ if ($file =~ /\.gz$/o)
 314+ { $line = `gzip -dc $file | head -n 1 ` ; }
 315+ else
 316+ { $line = `head -n 1 $file` ; }
 317+ # print "HEAD $line\n" ;
 318+ @fields = split (' ', $line) ;
 319+ # $timehead = substr ($fields [2],0,10) ;
 320+ $timehead = $fields [2] ;
 321+
 # Last record, same extraction. (gzip exits with SIGPIPE once tail has its
 # line; for .gz this still decompresses most of the file.)
 322+ if ($file =~ /\.gz$/o)
 323+ { $line = `gzip -dc $file | tail -n 1 ` ; }
 324+ else
 325+ { $line = `tail -n 1 $file` ; }
 326+
 327+ # print "TAIL $line\n" ;
 328+ @fields = split (' ', $line) ;
 329+ # $timetail = substr ($fields [2],0,10) ;
 330+ $timetail = $fields [2] ;
 331+
 332+ print "- HEAD $timehead TAIL $timetail\n" ;
 333+
 # Append the freshly computed entry to the cache file (open unchecked).
 334+ open CSV_HEAD_TAIL, '>>', $path_head_tail ;
 335+ print CSV_HEAD_TAIL "$file,$timehead,$timetail\n" ;
 336+ close CSV_HEAD_TAIL ;
 337+
 338+ return ($timehead, $timetail) ;
 339+}
 340+
 # GetTimeIso8601: parse a fixed-layout timestamp string "yyyy-mm-dd hh:mm:ss"
 # (ISO-8601-like, offsets hard-coded) into a local-time epoch via timelocal.
 # NOTE(review): name suggests it *produces* ISO-8601, but it converts FROM it.
 # Requires Time::Local to be imported by the including script — not visible here;
 # timelocal with ($year-1900) uses the legacy year convention — TODO confirm it
 # behaves as intended for dates >= 2000 with the Time::Local version in use.
 341+sub GetTimeIso8601
 342+{
 343+ my $time = shift ;
 344+ my $year = substr ($time,0,4) ;
 345+ my $mon = substr ($time,5,2) ;
 346+ my $mday = substr ($time,8,2) ;
 347+ my $hour = substr ($time,11,2) ;
 348+ my $min = substr ($time,14,2) ;
 349+ my $sec = substr ($time,17,2) ;
 350+ $time = timelocal($sec,$min,$hour,$mday,$mon-1,$year-1900);
 351+ return ($time) ;
 352+}
 353+
 # Module must return a true value for 'require'/'do'.
 354+1;
Index: trunk/wikistats/squids/SquidReportArchive.pl
@@ -5,11 +5,9 @@
66 $trace_on_exit = $true ;
77 ez_lib_version (2) ;
88
9 -# $quarter_only = '2010 Q3' ; # if not empty filter process for this quarter only
10 -
119 # set defaults mainly for tests on local machine
1210 # default_argv "-m 201010 " ;
13 - default_argv "-c " ;
 11+ default_argv "-c -q 2010Q4" ;
1412
1513 # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
1614 # ReportOrigin how to handle '!error <-> other
@@ -30,7 +28,7 @@
3129 $ratio_sqrt = $true ;
3230 $ratio_linear = $false ;
3331
34 - getopt ("dm", \%options) ;
 32+ getopt ("dmq", \%options) ;
3533
3634 if (-d "/a/squid")
3735 {
@@ -63,6 +61,15 @@
6462 if (defined ($options {"c"}))
6563 { $reportcountries = $true ; }
6664
 65+ if (defined ($options {"q"}))
 66+ {
 67+ $quarter_only = $options {"q"} ; # process for this quarter only
 68+ if ($quarter_only !~ /^2\d\d\dQ\d$/)
 69+ { abort ("Specify run for one single quarter as -q yyyyQ[1-4], e.g. -q 2011Q3, not '$quarter_only'\n") ; }
 70+ $quarter_only =~ s/^(\d\d\d\d)(Q\d)$/$1 $2/ ;
 71+ print "QUARTER ONLY $quarter_only\n" ;
 72+ }
 73+
6774 # date range used to be read from csv file with ReadDate, now there are daily csv files
6875 # if earlier methods still is useful it needs to be tweaked
6976 # if (($reportmonth ne "") && ($reportmonth !~ /^\d{6}$/))
@@ -83,7 +90,7 @@
8491 &CollectRegionCounts ;
8592
8693 &ReportCountries ('Saves');
87 -# &ReportCountries ('Views');
 94+ &ReportCountries ('Views');
8895
8996 exit ;
9097 }
@@ -1257,7 +1264,8 @@
12581265 $path_csv_country_codes = "$path_in/$file_csv_country_codes" ;
12591266 if (! -e $path_csv_country_codes) { abort ("Input file $path_csv_country_codes not found!") ; }
12601267
1261 - open CSV_COUNTRY_CODES, '<', $path_csv_country_codes ;
 1268+ open CSV_COUNTRY_CODES, '<', $path_csv_country_codes ;
 1269+ binmode CSV_COUNTRY_CODES ;
12621270 $country_names {"--"} = "Unknown" ;
12631271 while ($line = <CSV_COUNTRY_CODES>)
12641272 {
@@ -1265,6 +1273,7 @@
12661274
12671275 next if $line =~ /^#/ ;
12681276
 1277+ $line =~ s/[\x00-\x1f]//g ;
12691278 $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
12701279
12711280 ($country_code,$region_code,$north_south_code,$country_name) = split (',', $line,4) ;
@@ -1295,11 +1304,13 @@
12961305 {
12971306 # http://en.wikipedia.org/wiki/List_of_countries_by_population
12981307 # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
1299 - open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
 1308+ open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
 1309+ binmode COUNTRY_META_INFO ;
13001310 while ($line = <COUNTRY_META_INFO>)
13011311 {
13021312 chomp $line ;
13031313
 1314+ $line =~ s/[\x00-\x1f]//g ;
13041315 $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
13051316
13061317 ($country,$link,$population,$connected,$icon) = split ',', $line ;
@@ -1363,6 +1374,9 @@
13641375
13651376 # print "CODE $country_code NAME $country_name POP $population, $CONN $connected REGION $region_code NS $north_south_code PPR ${population_per_region {$region_code}}\n" ;
13661377 }
 1378+
 1379+ if ($population_tot == 0)
 1380+ { abort ("No valid data found: population_tot = 0 !") ; }
13671381 }
13681382
13691383 sub ReadInputCountriesMonthly
@@ -1424,7 +1438,6 @@
14251439
14261440 $year = substr ($yyyymm,0,4) ;
14271441 $month = substr ($yyyymm,5,2) ;
1428 - # print "year $year report_year month $month $report_year $report_month\n" ;
14291442
14301443 $recently = $false ;
14311444
@@ -1622,7 +1635,6 @@
16231636 $new = &CorrectForMissingDays ($week , ${$requests_per_week_per_country_code {$week }} {$country_code}) ;
16241637 $old = &CorrectForMissingDays ($week-1, ${$requests_per_week_per_country_code {$week-1}} {$country_code}) ;
16251638
1626 - # print "country_code $country_code\n" ;
16271639 if ($old == 0)
16281640 {
16291641 if ($new > 0)
@@ -4432,7 +4444,10 @@
44334445 $population2 = &i2KM2 ($population) ;
44344446 $connected2 = &i2KM2 ($connected) ;
44354447 $requests_this_country2 = &i2KM2 ($requests_this_country2) ;
4436 - $perc_population = &Percentage ($population / $population_tot) ;
 4448+
 4449+ if ($population_tot > 0)
 4450+ { $perc_population = &Percentage ($population / $population_tot) ; }
 4451+
44374452 if ($perc_population =~ /\.0\d/)
44384453 { $perc_population = "<small>$perc_population</small>" ; }
44394454

Status & tagging log