r86715 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86714 | r86715 | r86716 >
Date:16:14, 22 April 2011
Author:ezachte
Status:deferred
Tags:
Comment:
parse data bug fixed + new param for quarterly report
Modified paths:
  • /trunk/wikistats/squids/SquidCountArchiveReadInput.pm (added) (history)
  • /trunk/wikistats/squids/SquidReportArchive.pl (modified) (history)

Diff [purge]

Index: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
@@ -0,0 +1,353 @@
 2+ #!/usr/bin/perl
 3+
 # CollectFilesToProcess: populate global @files with archived squid sample logs
 # ("sampled-1000.log-yyyymmdd.gz" under /a/squid/archive) whose first/last record
 # timestamps (via GetLogRange) overlap [$time_to_start, $time_to_stop].
 # Returns $true when files covering the full range were found, $false otherwise.
 # NOTE(review): relies on globals ($job_runs_on_production_server, @files,
 # $file_test, $file_head_tail, $time_start, $true/$false) declared elsewhere;
 # no 'use strict' is visible in this file — TODO confirm the including script provides it.
 4+sub CollectFilesToProcess
 5+{
 6+ trace CollectFilesToProcess ;
 7+
 # Test/dev mode: skip the archive scan entirely and process the fixed test file.
 8+ if (! $job_runs_on_production_server)
 9+ {
 10+ push @files, $file_test ;
 11+ return $true ;
 12+ }
 13+
 14+ my ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month) = @_ ;
 15+
 16+ print "Collect files for date $date_collect_files: files with timestamps between $time_to_start and $time_to_stop\n\n" ;
 17+
 # NOTE(review): $all_files_found is assigned here but never used below
 # ($full_range_found is the flag actually consulted) — dead variable.
 18+ my $all_files_found = $true ;
 19+
 20+ my ($date_archived) ;
 21+
 22+ $dir_in = "/a/squid/archive" ;
 23+
 24+ $some_files_found = $false ;
 25+ $full_range_found = $false ;
 26+
 # Shared head/tail cache file, also read/written by GetLogRange.
 27+ $path_head_tail = "$path_out_month/$file_head_tail" ;
 28+
 29+ # file naming scheme on server: sampled-1000.log-yyyymmdd, does not mean on that day file sampled-1000.log was archived
 30+ # file can contain data for days(s) before and day (days?) after yyyymmdd, see e.g. sampled-10000.log-20090802 (days 0801-0803)
 31+ # this is confusing so start a few days earlier and check for each day:
 32+ # whether a file exists and whether it's 'head' and or 'tail' time (first last record) fall within range
 33+
 34+ # find first and last file to process, meaning all files that comprise log records within date range
 35+
 36+ $head_found = $false ;
 37+ $tail_found = $false ;
 38+
 # Walk backwards from ($days_ago + 2) down to ($days_ago - 5): newest candidate
 # first, so the loop can stop as soon as both range boundaries are covered.
 39+ for ($days_ago_inspect = $days_ago + 2 ; $days_ago_inspect >= $days_ago - 5 ; $days_ago_inspect--)
 40+ {
 41+ next if $days_ago_inspect < 0 ; # days ago can't be negative
 42+
 # Derive the yyyymmdd suffix of the candidate archive file from wall-clock time.
 43+ ($sec,$min,$hour,$day,$month,$year) = localtime ($time_start - $days_ago_inspect * 24 * 3600) ;
 44+ $date_archived = sprintf ("%4d%02d%02d", $year+1900, $month+1, $day) ;
 45+ print "\n- Inspect file saved $days_ago_inspect days ago: sampled-1000.log-$date_archived.gz\n" ;
 46+
 47+ my $file = "$dir_in/sampled-1000.log-$date_archived.gz" ;
 48+
 49+ if (! -e $file)
 50+ { print "- File not found: $file\n" ; }
 51+ else
 52+ {
 # First/last timestamp in the file (string compare works: timestamps are
 # fixed-width and lexically ordered).
 53+ ($timehead,$timetail) = &GetLogRange ($file, $path_head_tail) ;
 54+
 # Overlap test: file is relevant if its span intersects the requested range.
 55+ if (($timetail ge $time_to_start) && ($timehead le $time_to_stop))
 56+ {
 57+ print "- Include this file\n" ;
 58+
 59+ $some_files_found = $true ;
 60+ push @files, $file ;
 61+ if ($timehead le $time_to_start) { $head_found = $true ; print "- Head found\n" ; }
 62+ if ($timetail ge $time_to_stop) { $tail_found = $true ; print "- Tail found\n" ; }
 63+ }
 64+
 65+ # assuming only one file is archived per day !
 66+ if ($head_found && $tail_found)
 67+ {
 68+ $full_range_found = $true ;
 69+ last ;
 70+ }
 71+ }
 72+ }
 73+
 # Either failure mode skips the whole day rather than processing partial data.
 74+ if (! $some_files_found)
 75+ { print "Not any file was found which contains log records for $days_ago days ago. Skip processing for $date_collect_files.\n\n" ; return $false ; }
 76+ if (! $full_range_found)
 77+ { print "Not all files were found which contain log records for $days_ago days ago. Skip processing for $date_collect_files.\n\n" ; return $false ; }
 78+
 79+ print "\n" ;
 80+ foreach $file (sort @files)
 81+ { print "Process $file\n" ; }
 82+
 83+ return $true ;
 84+}
 85+
 # ReadIpFrequencies: load the per-day "frequency,address" csv (bz2-compressed on
 # the production server, plain locally) into global hash %ip_frequencies.
 # Arg: $path_out (directory holding the csv). Returns $true if any line was read.
 86+sub ReadIpFrequencies
 87+{
 88+ trace ReadIpFrequencies ;
 89+
 90+ my $path_out = shift ;
 91+
 92+ my $data_read = $false ;
 93+
 94+ if ($job_runs_on_production_server)
 95+ {
 96+ if (! -e "$path_out/$file_ip_frequencies_bz2")
 97+ { print "$path_out/$file_ip_frequencies_bz2 not found. Abort processing for this day." ; return $false ; }
 98+
 # NOTE(review): precedence bug — high-precedence '||' binds to the command
 # string (always true), so abort() can never fire; should be low-precedence
 # 'or' after the open. Same pattern on the plain-file open below. Also a
 # bareword filehandle and a shell-interpolated filename; modern style would
 # be a lexical handle with list-form pipe open.
 99+ open CSV_ADDRESSES, "-|", "bzip2 -dc \"$path_out/$file_ip_frequencies_bz2\"" || abort ("Input file $path_out/$file_ip_frequencies_bz2 could not be opened.") ;
 100+ }
 101+ else
 102+ {
 103+ if (! -e "$path_out/$file_ip_frequencies")
 104+ { print "$path_out/$file_ip_frequencies not found. Abort processing for this day." ; return $false ; }
 105+
 106+ open CSV_ADDRESSES, '<', "$path_out/$file_ip_frequencies" || abort ("Input file $path_out/$file_ip_frequencies could not be opened.") ;
 107+ }
 108+
 109+ while ($line = <CSV_ADDRESSES>)
 110+ {
 # NOTE(review): $data_read is set before the '#' comment-line filter, so a
 # file containing only comments still reports data read — TODO confirm intended.
 111+ $data_read = $true ;
 112+
 113+ if ($line =~ /^#/o) { next ; }
 114+ chomp ($line) ;
 115+ ($frequency, $address) = split (',', $line) ;
 116+ $ip_frequencies {$address} = $frequency ;
 117+ $addresses_stored++ ;
 118+ }
 119+
 # NOTE(review): CSV_ADDRESSES is never closed before returning.
 120+ print "\n$addresses_stored addresses stored that occur more than once\n\n" ;
 121+
 122+ return $data_read ;
 123+}
 124+
 # ReadSquidLogFiles: main ingest loop. For each file in @files, stream its
 # records (piped through geoiplogtag on the production server), keep only those
 # with timestamp in [$time_to_start, $time_to_stop), count lines per day into
 # global %lines_read, optionally tee edit/submit requests to FILE_EDITS_SAVES,
 # and hand each surviving record to &ProcessLine. Returns $true iff at least
 # one in-range record was processed.
 125+sub ReadSquidLogFiles
 126+{
 127+ trace ReadSquidLogFiles ;
 128+
 129+ my $data_read = $false ;
 130+
 131+ my ($path_out, $time_to_start, $time_to_stop, @files) = @_ ;
 132+
 133+ if ($#files == -1)
 134+ { print "ReadInput: No files to process.\n\n" ; }
 135+
 136+ print "Read log records in range $time_to_start till $time_to_stop\n\n" ;
 137+
 # NOTE(review): opened only when ($job_runs_on_production_server && $scan_all_fields),
 # but written below and closed at the end under $job_runs_on_production_server
 # alone — guard mismatch; prints to an unopened handle if $scan_all_fields is
 # false on production. Open is also unchecked. TODO confirm flags always agree.
 138+ if ($job_runs_on_production_server && $scan_all_fields)
 139+ { open FILE_EDITS_SAVES, '>', "$path_out/$file_edits_saves" ; }
 140+
 141+ my $lines = 0 ;
 142+ while ($#files > -1)
 143+ {
 144+ $file_in = shift (@files) ;
 145+
 146+ print "Process $file_in\n" ;
 147+ if (! -e $file_in)
 148+ { print "ReadInput: File not found: $file_in. Aborting...\n\n" ; exit ; }
 149+
 # On production: decompress and annotate each record with geo info; the
 # extra geoiplogtag column raises the expected field count to 14.
 150+ if ($job_runs_on_production_server)
 151+ {
 152+ if ($file_in =~ /\.gz$/o)
 153+ { open IN, "-|", "gzip -dc $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
 154+ else
 155+ { open IN, "-|", "cat $file_in | /usr/local/bin/geoiplogtag 5" ; } # http://perldoc.perl.org/functions/open.html
 156+ $fields_expected = 14 ;
 157+ }
 158+ else
 159+ {
 # NOTE(review): unchecked open; a missing/unreadable file silently yields
 # an empty read loop (the -e test above catches only nonexistence).
 160+ open IN, '<', $file_in ;
 161+ # $fields_expected = 14 ;
 162+ $fields_expected = 13 ;
 163+ }
 164+
 165+ $line = "" ;
 166+ while ($line = <IN>)
 167+ {
 168+ $lines_in_file ++ ;
 169+
 170+ # if ($line =~ /fy\.wikipedia\.org/o) # test/debug
 171+ # {
 172+ # print FILTER_FY $line ;
 173+ # print $line ;
 174+ # }
 175+
 # Malformed records (wrong field count) are counted and skipped, not fatal.
 176+ @fields = split (' ', $line) ;
 177+ if ($#fields < $fields_expected) { $fields_too_few ++ ; next ; }
 178+ if ($#fields > $fields_expected) { $fields_too_many ++ ; next ; }
 179+
 # Field 2 is the record timestamp; comparisons below are lexical ('lt'/'ge'),
 # which is valid for fixed-width sortable timestamp strings.
 180+ $time = $fields [2] ;
 181+
 182+ if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
 183+ { $oldest_time_read = $time ; }
 184+ if (($newest_time_read eq "") || ($time gt $newest_time_read))
 185+ { $newest_time_read = $time ; }
 186+
 # If even the oldest record seen starts at/after the stop time, this and
 # all later files are entirely out of range — stop reading this file.
 187+ if ($oldest_time_read ge $time_to_stop)
 188+ { last ; }
 189+
 # Records before the window: skip, with a progress marker every 100k skips.
 190+ if ($time lt $time_to_start)
 191+ {
 192+ if (++ $times % 100000 == 0)
 193+ { print "[$time]\n" ; }
 194+ next ;
 195+ }
 196+
 197+ if ($time ge $time_to_stop)
 198+ { last ; }
 199+
 200+ $date = substr ($time,0,10) ;
 201+ if ($date lt $date_prev) { next ; } # occasionally one record for previous day arrives late
 202+
 # Day rollover: flush the per-day line count for the previous day.
 203+ $data_read = $true ;
 204+ if ($date ne $date_prev)
 205+ {
 206+ print &ddhhmmss (time - $time_start) . " $date\n" ;
 207+ if ($date_prev ne "")
 208+ {
 209+ print "$date_prev: $lines_this_day\n" ;
 210+ $lines_read {$date_prev} = $lines_this_day ;
 211+ }
 212+ $lines_this_day = 0 ;
 213+ $date_prev = $date ;
 214+ }
 215+ $lines_this_day++ ;
 216+
 217+ if ($job_runs_on_production_server)
 218+ {
 219+ if (($line =~ /action=edit/o) || ($line =~ /action=submit/o))
 220+ { print FILE_EDITS_SAVES $line ; }
 221+ }
 222+
 223+ $lines++ ;
 224+
 225+#next if $line !~ /http:\/\/\w+\.m\./ ;
 226+#print "$line\n" ;
 227+ &ProcessLine ($line) ;
 228+ if (++ $lines_processed % 10000 == 0)
 229+ {
 230+ if ($banner_requests_ignored == 0)
 231+ { print "$time $lines_processed\n" ; }
 232+ else
 233+ { print "$time $lines_processed ($banner_requests_ignored banner requests ignored)\n" ; }
 234+ }
 # Test mode cap on processed lines.
 235+ if ($test and $lines_processed >= $test_maxlines)
 236+ { last ; }
 237+ }
 238+ close IN ;
 239+ }
 240+
 # NOTE(review): this early return skips the FILE_EDITS_SAVES close and the
 # final %lines_read flush below — TODO confirm that is intended for the
 # ip-frequency scan pass.
 241+ if ($scan_ip_frequencies)
 242+ { return ($data_read) ; }
 243+
 244+ if ($job_runs_on_production_server)
 245+ { close FILE_EDITS_SAVES ; }
 246+
 247+ $lines_read {$date_prev} = $lines_this_day ;
 248+
 249+ if ($lines == 0)
 250+ {
 251+ $data_read = $false ;
 252+ print "No data found for $time_to_start - $time_to_stop\n" ;
 253+ }
 254+ else
 255+ { print "$lines_this_day out $lines_in_file processed\n" ; }
 256+
 257+ return ($data_read) ;
 258+}
 259+
 # ReadInputEditsSavesFile: stream a bz2-compressed edits/saves capture file and
 # feed every line containing 'index.php' to &ProcessLine. No meaningful return value.
 260+sub ReadInputEditsSavesFile
 261+{
 262+ trace ReadInputEditsSavesFile ;
 263+
 264+ my $file_txt = shift ;
 265+
 266+ print "Process $file_txt\n" ;
 267+
 # NOTE(review): same '||' precedence bug as ReadIpFrequencies — '||' binds to
 # the command string, so abort() is unreachable; should be 'or'. The filename
 # is also interpolated into a shell command (quoting helps but list-form
 # open '-|', 'bzip2', '-dc', $file_txt would bypass the shell entirely).
 268+ open IN, "-|", "bzip2 -dc \"$file_txt\"" || abort ("Input file '" . $file_txt . "' could not be opened.") ;
 269+# open IN, '<', "2010-04/SquidDataEditsSaves2010-04-01.txt" || abort ("Input file '" . $file_txt . "' could not be opened.") ; # test
 270+
 271+ while ($line = <IN>)
 272+ {
 273+ if ($line =~ /index\.php/o)
 274+ { &ProcessLine ($line) ; }
 275+ }
 276+ close IN ;
 277+}
 278+
 # GetLogRange: return (first, last) record timestamps for a (possibly gzipped)
 # log file. Results are cached in $path_head_tail as "file,head,tail" csv lines
 # so the expensive decompress-and-head/tail is done once per file.
 279+sub GetLogRange # finding first and last timestamp ('head' and 'tail') in compressed file is costly, cache results for reuse
 280+{
 281+ my ($file,$path_head_tail) = @_ ;
 282+
 # Reload the cache into global %timeheads/%timetails.
 # NOTE(review): the whole cache file is re-read on EVERY call (hashes are
 # global, so once per run would suffice) — harmless but O(n^2) over a month.
 283+ if (-e $path_head_tail)
 284+ {
 285+ open CSV_HEAD_TAIL, '<', $path_head_tail ;
 286+ while ($line = <CSV_HEAD_TAIL>)
 287+ {
 288+ chomp $line ;
 289+ my ($logfile,$head,$tail) = split (',', $line) ;
 290+ $timeheads {$logfile} = $head ;
 291+ $timetails {$logfile} = $tail ;
 292+ }
 293+ close CSV_HEAD_TAIL ;
 294+ }
 295+
 # Cache hit: both head and tail known for this file.
 296+ $timehead = $timeheads {$file} ;
 297+ $timetail = $timetails {$file} ;
 298+
 299+ if (($timehead ne '') && ($timetail ne ''))
 300+ {
 301+ print "- HEAD $timehead TAIL $timetail (from head-tail cache)\n" ;
 302+ return ($timehead, $timetail) ;
 303+ }
 304+
 # Cache miss: compute head/tail from the file itself. These lexicals shadow
 # the globals assigned above (which were empty on this path).
 305+ my ($line, @fields, $timehead, $timetail) ;
 306+ print "$file: " ;
 307+ if (! -e $file)
 308+ {
 309+ print "- GetLogRange error: File not found: $file\n" ;
 310+ exit ;
 311+ }
 312+
 # First record: field 2 is the timestamp.
 # NOTE(review): $file is interpolated unquoted into a shell command (backticks);
 # safe only because archive paths are machine-generated — TODO confirm.
 313+ if ($file =~ /\.gz$/o)
 314+ { $line = `gzip -dc $file | head -n 1 ` ; }
 315+ else
 316+ { $line = `head -n 1 $file` ; }
 317+ # print "HEAD $line\n" ;
 318+ @fields = split (' ', $line) ;
 319+ # $timehead = substr ($fields [2],0,10) ;
 320+ $timehead = $fields [2] ;
 321+
 # Last record, same extraction. (gzip exits with SIGPIPE once tail has its
 # line; for .gz this still decompresses most of the file.)
 322+ if ($file =~ /\.gz$/o)
 323+ { $line = `gzip -dc $file | tail -n 1 ` ; }
 324+ else
 325+ { $line = `tail -n 1 $file` ; }
 326+
 327+ # print "TAIL $line\n" ;
 328+ @fields = split (' ', $line) ;
 329+ # $timetail = substr ($fields [2],0,10) ;
 330+ $timetail = $fields [2] ;
 331+
 332+ print "- HEAD $timehead TAIL $timetail\n" ;
 333+
 # Append the freshly computed entry to the cache file (open unchecked).
 334+ open CSV_HEAD_TAIL, '>>', $path_head_tail ;
 335+ print CSV_HEAD_TAIL "$file,$timehead,$timetail\n" ;
 336+ close CSV_HEAD_TAIL ;
 337+
 338+ return ($timehead, $timetail) ;
 339+}
 340+
 # GetTimeIso8601: parse a fixed-layout timestamp string "yyyy-mm-dd hh:mm:ss"
 # (ISO-8601-like, offsets hard-coded) into a local-time epoch via timelocal.
 # NOTE(review): name suggests it *produces* ISO-8601, but it converts FROM it.
 # Requires Time::Local to be imported by the including script — not visible here;
 # timelocal with ($year-1900) uses the legacy year convention — TODO confirm it
 # behaves as intended for dates >= 2000 with the Time::Local version in use.
 341+sub GetTimeIso8601
 342+{
 343+ my $time = shift ;
 344+ my $year = substr ($time,0,4) ;
 345+ my $mon = substr ($time,5,2) ;
 346+ my $mday = substr ($time,8,2) ;
 347+ my $hour = substr ($time,11,2) ;
 348+ my $min = substr ($time,14,2) ;
 349+ my $sec = substr ($time,17,2) ;
 350+ $time = timelocal($sec,$min,$hour,$mday,$mon-1,$year-1900);
 351+ return ($time) ;
 352+}
 353+
 # Module must return a true value for 'require'/'do'.
 354+1;
Index: trunk/wikistats/squids/SquidReportArchive.pl
@@ -5,11 +5,9 @@
66 $trace_on_exit = $true ;
77 ez_lib_version (2) ;
88
9 -# $quarter_only = '2010 Q3' ; # if not empty filter process for this quarter only
10 -
119 # set defaults mainly for tests on local machine
1210 # default_argv "-m 201010 " ;
13 - default_argv "-c " ;
 11+ default_argv "-c -q 2010Q4" ;
1412
1513 # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
1614 # ReportOrigin how to handle '!error <-> other
@@ -30,7 +28,7 @@
3129 $ratio_sqrt = $true ;
3230 $ratio_linear = $false ;
3331
34 - getopt ("dm", \%options) ;
 32+ getopt ("dmq", \%options) ;
3533
3634 if (-d "/a/squid")
3735 {
@@ -63,6 +61,15 @@
6462 if (defined ($options {"c"}))
6563 { $reportcountries = $true ; }
6664
 65+ if (defined ($options {"q"}))
 66+ {
 67+ $quarter_only = $options {"q"} ; # process for this quarter only
 68+ if ($quarter_only !~ /^2\d\d\dQ\d$/)
 69+ { abort ("Specify run for one single quarter as -q yyyyQ[1-4], e.g. -q 2011Q3, not '$quarter_only'\n") ; }
 70+ $quarter_only =~ s/^(\d\d\d\d)(Q\d)$/$1 $2/ ;
 71+ print "QUARTER ONLY $quarter_only\n" ;
 72+ }
 73+
6774 # date range used to be read from csv file with ReadDate, now there are daily csv files
6875 # if earlier methods still is useful it needs to be tweaked
6976 # if (($reportmonth ne "") && ($reportmonth !~ /^\d{6}$/))
@@ -83,7 +90,7 @@
8491 &CollectRegionCounts ;
8592
8693 &ReportCountries ('Saves');
87 -# &ReportCountries ('Views');
 94+ &ReportCountries ('Views');
8895
8996 exit ;
9097 }
@@ -1257,7 +1264,8 @@
12581265 $path_csv_country_codes = "$path_in/$file_csv_country_codes" ;
12591266 if (! -e $path_csv_country_codes) { abort ("Input file $path_csv_country_codes not found!") ; }
12601267
1261 - open CSV_COUNTRY_CODES, '<', $path_csv_country_codes ;
 1268+ open CSV_COUNTRY_CODES, '<', $path_csv_country_codes ;
 1269+ binmode CSV_COUNTRY_CODES ;
12621270 $country_names {"--"} = "Unknown" ;
12631271 while ($line = <CSV_COUNTRY_CODES>)
12641272 {
@@ -1265,6 +1273,7 @@
12661274
12671275 next if $line =~ /^#/ ;
12681276
 1277+ $line =~ s/[\x00-\x1f]//g ;
12691278 $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
12701279
12711280 ($country_code,$region_code,$north_south_code,$country_name) = split (',', $line,4) ;
@@ -1295,11 +1304,13 @@
12961305 {
12971306 # http://en.wikipedia.org/wiki/List_of_countries_by_population
12981307 # http://en.wikipedia.org/wiki/List_of_countries_by_number_of_Internet_users
1299 - open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
 1308+ open COUNTRY_META_INFO, '<', "$path_in/$file_csv_country_meta_info" ;
 1309+ binmode COUNTRY_META_INFO ;
13001310 while ($line = <COUNTRY_META_INFO>)
13011311 {
13021312 chomp $line ;
13031313
 1314+ $line =~ s/[\x00-\x1f]//g ;
13041315 $line =~ s/C..?te d'Ivoire/C&ocirc;te d'Ivoire/g ;
13051316
13061317 ($country,$link,$population,$connected,$icon) = split ',', $line ;
@@ -1363,6 +1374,9 @@
13641375
13651376 # print "CODE $country_code NAME $country_name POP $population, $CONN $connected REGION $region_code NS $north_south_code PPR ${population_per_region {$region_code}}\n" ;
13661377 }
 1378+
 1379+ if ($population_tot == 0)
 1380+ { abort ("No valid data found: population_tot = 0 !") ; }
13671381 }
13681382
13691383 sub ReadInputCountriesMonthly
@@ -1424,7 +1438,6 @@
14251439
14261440 $year = substr ($yyyymm,0,4) ;
14271441 $month = substr ($yyyymm,5,2) ;
1428 - # print "year $year report_year month $month $report_year $report_month\n" ;
14291442
14301443 $recently = $false ;
14311444
@@ -1622,7 +1635,6 @@
16231636 $new = &CorrectForMissingDays ($week , ${$requests_per_week_per_country_code {$week }} {$country_code}) ;
16241637 $old = &CorrectForMissingDays ($week-1, ${$requests_per_week_per_country_code {$week-1}} {$country_code}) ;
16251638
1626 - # print "country_code $country_code\n" ;
16271639 if ($old == 0)
16281640 {
16291641 if ($new > 0)
@@ -4432,7 +4444,10 @@
44334445 $population2 = &i2KM2 ($population) ;
44344446 $connected2 = &i2KM2 ($connected) ;
44354447 $requests_this_country2 = &i2KM2 ($requests_this_country2) ;
4436 - $perc_population = &Percentage ($population / $population_tot) ;
 4448+
 4449+ if ($population_tot > 0)
 4450+ { $perc_population = &Percentage ($population / $population_tot) ; }
 4451+
44374452 if ($perc_population =~ /\.0\d/)
44384453 { $perc_population = "<small>$perc_population</small>" ; }
44394454

Status & tagging log