Index: trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl |
— | — | @@ -1,18 +1,8 @@ |
2 | 2 | #!/usr/local/bin/perl
|
3 | 3 |
|
4 | | -# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
|
| 4 | +# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
|
| 5 | +# 11/23/2011 removed lots of dead (commented) code
|
5 | 6 |
|
6 | | -# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
|
7 | | -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
|
8 | | -# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
|
9 | | -# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
|
10 | | -
|
11 | | -# Ideas:
|
12 | | -# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
|
13 | | -# 2 frequency distribution hits per file per first letter _-> manifest crawler
|
14 | | -# assuming crawler collects articles in alphabetical order
|
15 | | -# 3 first letter uppercase -> sort (in sections per first two chars ?)
|
16 | | -
|
17 | 7 | use lib "/home/ezachte/lib" ;
|
18 | 8 | use EzLib ;
|
19 | 9 |
|
— | — | @@ -27,7 +17,6 @@ |
28 | 18 | use URI::Escape;
|
29 | 19 | use Cwd ;
|
30 | 20 | $bayes = -d "/a/dammit.lt" ;
|
31 | | -# $path_7za = "/usr/lib/p7zip/7za" ;
|
32 | 21 |
|
33 | 22 | use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error) ;
|
34 | 23 |
|
— | — | @@ -59,8 +48,6 @@ |
60 | 49 | print "Track: $track\n" ;
|
61 | 50 | $reg_exp_track = qr"$track" ;
|
62 | 51 |
|
63 | | -# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"
|
64 | | -
|
65 | 52 | my $options ;
|
66 | 53 | getopt ("iodft", \%options) ;
|
67 | 54 |
|
— | — | @@ -80,7 +67,6 @@ |
81 | 68 | if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymm, yyyy* or *") } ;
|
82 | 69 | }
|
83 | 70 |
|
84 | | -
|
85 | 71 | $dir_in = $options {"i"} ;
|
86 | 72 | $dir_out = $options {"o"} ;
|
87 | 73 | $dir_filtered = $options {"f"} ;
|
— | — | @@ -129,7 +115,6 @@ |
130 | 116 | $daterange =~ s/\*/\\d+/ ;
|
131 | 117 |
|
132 | 118 | &CompactVisitorStatsOneDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
|
133 | | - # &UncompactVisitorStats ; # test only, to see if process is revertible
|
134 | 119 | }
|
135 | 120 |
|
136 | 121 | if ($compactmonth)
|
— | — | @@ -199,7 +184,7 @@ |
200 | 185 | my $month = substr ($date,4,2) ;
|
201 | 186 | my $day = substr ($date,6,2) ;
|
202 | 187 |
|
203 | | - my ($file_out1, $file_out2, $file_out3, $out_day, $hours_missing) ;
|
| 188 | + my ($file_out2, $out_day, $hours_missing) ;
|
204 | 189 |
|
205 | 190 | $dir_out = "$dir_out/${year}-${month}" ;
|
206 | 191 | if (! -d $dir_out)
|
— | — | @@ -248,16 +233,9 @@ |
249 | 234 |
|
250 | 235 | undef @in_hour ;
|
251 | 236 |
|
252 | | - # $file_out = "pagecounts-$year$month$day_full_day" ;
|
253 | | - # open OUT, ">", $file_out ;
|
254 | | - # binmode $file_out ;
|
255 | | -
|
256 | | -# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
257 | 237 | if ($bayes)
|
258 | 238 | {
|
259 | | - # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
|
260 | 239 | $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
|
261 | | - # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
|
262 | 240 | if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
|
263 | 241 | {
|
264 | 242 | &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
|
— | — | @@ -270,24 +248,16 @@ |
271 | 249 | }
|
272 | 250 |
|
273 | 251 | open $out_day2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
|
274 | | - # open $out_day3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
|
275 | 252 | }
|
276 | 253 | else
|
277 | 254 | {
|
278 | | - # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
|
279 | 255 | $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, hourly data, count above threshold
|
280 | 256 | $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
281 | | - # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
|
282 | | - # $out_day3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
283 | 257 | }
|
284 | 258 |
|
285 | | -# binmode $out_day1 ;
|
286 | 259 | binmode $out_day2 ;
|
287 | | -# binmode $out_day3 ;
|
288 | 260 |
|
289 | | - # print "File_out1 $file_out1\n" ;
|
290 | 261 | print "File_out2 $file_out2\n" ;
|
291 | | - # print "File_out3 $file_out3\n" ;
|
292 | 262 |
|
293 | 263 | $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
|
294 | 264 | &Log ("\nFilter file: $file_filtered\n") ;
|
— | — | @@ -338,7 +308,6 @@ |
339 | 309 | $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
|
340 | 310 | $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n\n" ;
|
341 | 311 | print $out_day2 $comment ;
|
342 | | -# print $out_day3 $comment ;
|
343 | 312 |
|
344 | 313 | if ($files_in_found < 24)
|
345 | 314 | {
|
— | — | @@ -357,8 +326,6 @@ |
358 | 327 | {
|
359 | 328 | print $out_day2 "#\n" ;
|
360 | 329 | print $out_day2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
361 | | - # print $out_day3 "#\n" ;
|
362 | | - # print $out_day3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
363 | 330 | }
|
364 | 331 | $comment = "#\n" ;
|
365 | 332 | $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
|
— | — | @@ -369,7 +336,6 @@ |
370 | 337 | $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
|
371 | 338 | $comment .= "#\n" ;
|
372 | 339 | print $out_day2 $comment ;
|
373 | | -# print $out_day3 $comment ;
|
374 | 340 |
|
375 | 341 | $key_low_prev = "" ;
|
376 | 342 | while ($files_in_open > 0)
|
— | — | @@ -473,9 +439,6 @@ |
474 | 440 | &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
|
475 | 441 | }
|
476 | 442 |
|
477 | | - # print OUT "$key_low $total$counts\n" ;
|
478 | | -# print $out_day1 "$key_low $total$counts\n" ;
|
479 | | -
|
480 | 443 | ($lang,$title) = split (' ', $key_low) ;
|
481 | 444 |
|
482 | 445 | $title =~ s/\%20/_/g ;
|
— | — | @@ -496,7 +459,6 @@ |
497 | 460 | # { print "- $lang\n" ; }
|
498 | 461 |
|
499 | 462 | &WriteTotalsPerNamespace ($out_day2, $langprev) ;
|
500 | | - # &WriteTotalsPerNamespace ($out_day3, $langprev) ;
|
501 | 463 | undef %totals_per_namespace ;
|
502 | 464 | }
|
503 | 465 | $langprev = $lang ;
|
— | — | @@ -518,48 +480,22 @@ |
519 | 481 | }
|
520 | 482 |
|
521 | 483 | if ($total >= $threshold)
|
522 | | - { print $out_day2 "$key_low $total$counts\n" ;
|
523 | | - # print $out_day3 "$key_low $total\n" ;
|
524 | | - }
|
| 484 | + { print $out_day2 "$key_low $total$counts\n" ; }
|
525 | 485 |
|
526 | 486 | $key_low_prev = $key_low ;
|
527 | | - # print "OUT $key_low $counts\n" ;
|
528 | 487 | }
|
529 | 488 |
|
530 | 489 | &WriteTotalsPerNamespace ($out_day2, $langprev) ;
|
531 | | -# &WriteTotalsPerNamespace ($out_day3, $langprev) ;
|
532 | 490 |
|
533 | 491 | &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
|
534 | 492 |
|
535 | 493 | &Log ("[$lines, $files_in_open] $key_low\n") ;
|
536 | | -# close OUT ;
|
537 | 494 |
|
538 | 495 | if ($bayes)
|
539 | 496 | {
|
540 | | - # close $out_day1 ;
|
541 | 497 | close $out_day2 ;
|
542 | | - # close $out_day3 ;
|
543 | 498 | close $out_filtered ;
|
544 | 499 |
|
545 | | -# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
|
546 | | -# $result = `$cmd` ;
|
547 | | -# if ($result =~ /Everything is Ok/s)
|
548 | | -# {
|
549 | | -# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
|
550 | | -# unlink $file_out2 ;
|
551 | | -# foreach $file_in (@files_today)
|
552 | | -# {
|
553 | | -# print "unlink $dir_in/$file_in\n" ;
|
554 | | -# unlink "$dir_in/$file_in" ;
|
555 | | -# }
|
556 | | -# }
|
557 | | -# else
|
558 | | -# {
|
559 | | -# print "Delete $file_out2.7z\n" ;
|
560 | | -# unlink "$file_out2.7z" ;
|
561 | | -# }
|
562 | | -
|
563 | | -
|
564 | 500 | $time_start_compression = time ;
|
565 | 501 | $cmd = "bzip2 -9 -v $file_out2" ;
|
566 | 502 | &Log ("\n\n$cmd ->\n") ;
|
— | — | @@ -582,9 +518,7 @@ |
583 | 519 | }
|
584 | 520 | else
|
585 | 521 | {
|
586 | | - # $out_day1->close() ;
|
587 | 522 | $out_day2->close() ;
|
588 | | - # $out_day3->close() ;
|
589 | 523 | close $out_filtered ;
|
590 | 524 | }
|
591 | 525 |
|
— | — | @@ -740,7 +674,6 @@ |
741 | 675 | }
|
742 | 676 | }
|
743 | 677 |
|
744 | | -
|
745 | 678 | my $out_month_all = new IO::Compress::Bzip2 "$file_out.bz2" or die "bzip2 failed for $file_out.bz2: $Bzip2Error\n";
|
746 | 679 | my $out_month_ge5 = new IO::Compress::Bzip2 "${file_out}_ge5.bz2" or die "bzip2 failed for ${file_out}_ge5.bz2: $Bzip2Error\n";
|
747 | 680 |
|
— | — | @@ -1211,358 +1144,15 @@ |
1212 | 1145 |
|
1213 | 1146 | #=============================================================================================================
|
1214 | 1147 |
|
1215 | | -# snippets obsolete but revivable code / test code
|
| 1148 | +# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
|
| 1149 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
|
| 1150 | +# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
|
| 1151 | +# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
|
1216 | 1152 |
|
1217 | | -#sub Compact
|
1218 | | -#{
|
1219 | | -# my $day = shift ;
|
1220 | | -# &Log ("Compact files for $day\n") ;
|
| 1153 | +# Ideas:
|
| 1154 | +# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
|
| 1155 | +# 2 frequency distribution hits per file per first letter _-> manifest crawler
|
| 1156 | +# assuming crawler collects articles in alphabetical order
|
| 1157 | +# 3 always convert first letter after namespace string to uppercase, then sort and merge
|
1221 | 1158 |
|
1222 | | -# $file_in = "pagecounts-$day.out" ;
|
1223 | | -# $file_out1 = "pagecounts-${day}_all.gz" ;
|
1224 | | -# $file_out2 = "pagecounts-${day}_10plus.gz" ;
|
1225 | | -# open IN, "<", $file_in ;
|
1226 | | -# binmode $file_in ;
|
1227 | 1159 |
|
1228 | | -# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
1229 | | -# my $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
1230 | | -
|
1231 | | -# open OUT, ">", $file_out ;
|
1232 | | -# binmode $file_out ;
|
1233 | | -
|
1234 | | -# $lang_prev = "" ;
|
1235 | | -# while ($line = <IN>)
|
1236 | | -# {
|
1237 | | -# chomp ($line) ;
|
1238 | | -# ($lang, $title, $counts) = split (' ', $line) ;
|
1239 | | -# $title2 = $title ;
|
1240 | | -# $title =~ s/\%20/_/g ;
|
1241 | | -# $title =~ s/\%3A/:/g ;
|
1242 | | -# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
1243 | | -# # if ($title =~ /[\x00-\x1F]/)
|
1244 | | -# # { &Log ("> '$title2'\n") ; }
|
1245 | | -# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
|
1246 | | -# print $out_day1 "$lang $title $counts\n" ;
|
1247 | | -# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
|
1248 | | -# if ($counts2 >= $threshold)
|
1249 | | -# { print $out_day2 "$lang $title $counts\n" ; }
|
1250 | | -# $lang_prev = $lang ;
|
1251 | | -# }
|
1252 | | -#
|
1253 | | -# close IN ;
|
1254 | | -# $out_day1->close() ;
|
1255 | | -# $out_day2->close() ;
|
1256 | | -#}
|
1257 | | -
|
1258 | | -
|
1259 | | -#sub GetViewDistribution
|
1260 | | -#{
|
1261 | | -# open OUT, ">", "Views.csv" ;
|
1262 | | -# foreach $file_in (@files)
|
1263 | | -# {
|
1264 | | -# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
|
1265 | | -# $hour = chr(ord('A')+$hour) ;
|
1266 | | -# &Log ("Process $hour $file_in\n") ;
|
1267 | | -
|
1268 | | -# $in_hour1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
|
1269 | | -# while ($line = <$in_hour1>)
|
1270 | | -# {
|
1271 | | -# ($lang,$title,$count,$dummy) = split (' ', $line) ;
|
1272 | | -# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
|
1273 | | -# {
|
1274 | | -# $tot {$hour} += $count ;
|
1275 | | -# if ($count < 3)
|
1276 | | -# { @counts {$hour . substr ($title,0,1)}++ ; }
|
1277 | | -# }
|
1278 | | -# }
|
1279 | | -# $in_hour1->close () ;
|
1280 | | -# }
|
1281 | | -#
|
1282 | | -# print OUT "," ;
|
1283 | | -# foreach $hour ('A'..'X')
|
1284 | | -# { print OUT $hour . ", " ; }
|
1285 | | -# print OUT "\n" ;
|
1286 | | -#
|
1287 | | -# print OUT "," ;
|
1288 | | -# foreach $hour ('A'..'X')
|
1289 | | -# { print OUT $tot {$hour} . ", " ; }
|
1290 | | -# print OUT "\n" ;
|
1291 | | -#
|
1292 | | -# for ($c=0; $c < 256; $c++)
|
1293 | | -# {
|
1294 | | -# # do not print chars " and , as such: confuses csv format
|
1295 | | -# if ($c < 33)
|
1296 | | -# { print OUT "chr($c), " ; }
|
1297 | | -# elsif (chr($c) eq '"')
|
1298 | | -# { print OUT "dquote, " ; }
|
1299 | | -# elsif (chr($c) eq ',')
|
1300 | | -# { print OUT "comma, " ; }
|
1301 | | -# else
|
1302 | | -# { print OUT chr($c) . ", " ; }
|
1303 | | -#
|
1304 | | -# foreach $hour ('A'..'X')
|
1305 | | -# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
|
1306 | | -#
|
1307 | | -# if ($c < 255)
|
1308 | | -# { print OUT "\n" ; }
|
1309 | | -# }
|
1310 | | -# close OUT ;
|
1311 | | -#}
|
1312 | | -
|
1313 | | -
|
1314 | | -#sub RecompactVisitorStats
|
1315 | | -#{
|
1316 | | -# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
|
1317 | | -# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
|
1318 | | -# local (*DIR);
|
1319 | | -# opendir (DIR, ".");
|
1320 | | -# @files = () ;
|
1321 | | -# while ($file_in = readdir (DIR))
|
1322 | | -# {
|
1323 | | -# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
|
1324 | | -#
|
1325 | | -# push @files, $file_in ;
|
1326 | | -# }
|
1327 | | -
|
1328 | | -# $filecnt = $#files+1 ;
|
1329 | | -# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
|
1330 | | -
|
1331 | | -# foreach $file (@files)
|
1332 | | -# { &RecompactVisitorStats2 ($file) ; }
|
1333 | | -# closedir (DIR, ".");
|
1334 | | -#}
|
1335 | | -
|
1336 | | -#sub RecompactVisitorStats2
|
1337 | | -#{
|
1338 | | -## http://www.7-zip.org/7z.html
|
1339 | | -# my $file = shift ;
|
1340 | | -# my $time_start = time ;
|
1341 | | -# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
1342 | | -## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
1343 | | -# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
|
1344 | | -# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
1345 | | -# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
1346 | | -
|
1347 | | -# &Log ("Process $file_in\n") ;
|
1348 | | -
|
1349 | | -# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
1350 | | -# binmode $in_hour ;
|
1351 | | -# open OUT, ">", $file_out ;
|
1352 | | -# binmode OUT ;
|
1353 | | -
|
1354 | | -# my ($title, $title2) ;
|
1355 | | -# while ($line = <$in_hour>)
|
1356 | | -# {
|
1357 | | -# chomp ($line) ;
|
1358 | | -# ($lang,$title,$counts) = split (" ", $line) ;
|
1359 | | -
|
1360 | | -# if ($lang ne $lang_prev) { print "$lang " ; }
|
1361 | | -# $lang_prev = $lang ;
|
1362 | | -
|
1363 | | -# # test pagecounts-20080701_fd.gz
|
1364 | | -# # all records 424 Mib compressed (1984 uncompressed)
|
1365 | | -# # count > 1 212 Mib compressed ( 733 uncompressed)
|
1366 | | -# # count > 2 169 Mib compressed ( 551 uncompressed)
|
1367 | | -# next if $counts <= 1 ;
|
1368 | | -
|
1369 | | -# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
1370 | | -# $title =~ s/\s/_/g;
|
1371 | | -# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
1372 | | -# $lang =~ s/\.y/2/ ;
|
1373 | | -
|
1374 | | -# print OUT "$lang $title $counts\n" ;
|
1375 | | -# }
|
1376 | | -
|
1377 | | -# print "Close files\n" ;
|
1378 | | -# $in_hour -> close () ;
|
1379 | | -# close (OUT) ;
|
1380 | | -
|
1381 | | -# &Log ("Compress $file_out\n") ;
|
1382 | | -
|
1383 | | -# unlink $file_7z ;
|
1384 | | -# $result = `$path_7z a $file_7z $file_out` ;
|
1385 | | -# &Log ("Compressed\n") ;
|
1386 | | -# &Log ("Result " . ($result+0) . " \n") ;
|
1387 | | -# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
|
1388 | | -# { unlink $file_out ; }
|
1389 | | -
|
1390 | | -# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
1391 | | -## 0 No error
|
1392 | | -## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
|
1393 | | -## 2 Fatal error
|
1394 | | -## 7 Command line error
|
1395 | | -## 8 Not enough memory for operation
|
1396 | | -## 255 User stopped the process
|
1397 | | -#}
|
1398 | | -
|
1399 | | -
|
1400 | | -#sub RecompactVisitorStats3
|
1401 | | -#{
|
1402 | | -## http://www.7-zip.org/7z.html
|
1403 | | -# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
1404 | | -# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
1405 | | -# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
1406 | | -# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
1407 | | -## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
|
1408 | | -
|
1409 | | -# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
1410 | | -# binmode $in_hour ;
|
1411 | | -## $out_day = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
1412 | | -## binmode $out_day ;
|
1413 | | -# open OUT, ">", $file_out ;
|
1414 | | -# binmode OUT ;
|
1415 | | -## open LOG, ">", $file_log ;
|
1416 | | -## binmode LOG ;
|
1417 | | -
|
1418 | | -# my ($title, $title2) ;
|
1419 | | -# while ($line = <$in_hour>)
|
1420 | | -# {
|
1421 | | -# chomp ($line) ;
|
1422 | | -# ($lang,$title,$counts) = split (" ", $line) ;
|
1423 | | -
|
1424 | | -# if ($lang ne $lang_prev) { print "$lang\n" ; }
|
1425 | | -## last if $lang gt "fs" ;
|
1426 | | -# $lang_prev = $lang ;
|
1427 | | -
|
1428 | | -# # test pagecounts-20080701_fd.gz
|
1429 | | -# # all records 424 Mib compressed (1984 uncompressed)
|
1430 | | -# # count > 1 212 Mib compressed ( 733 uncompressed)
|
1431 | | -# # count > 2 169 Mib compressed ( 551 uncompressed)
|
1432 | | -# next if $counts <= 1 ;
|
1433 | | -
|
1434 | | -## next if $lang !~ /^(?:ar|fr)/ ;
|
1435 | | -
|
1436 | | -#if ($false)
|
1437 | | -#{
|
1438 | | -# $title1b = $title ;
|
1439 | | -# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
|
1440 | | -# $title1b =~ s/\%28/(/g ;
|
1441 | | -# $title1b =~ s/\%29/)/g ;
|
1442 | | -# $title1b =~ s/\%3A/:/g ;
|
1443 | | -# $title1b =~ s/\%2F/\//g ;
|
1444 | | -# $title1b =~ s/\%5C/\\/g ;
|
1445 | | -# $title1b =~ s/\%2A/*/g ;
|
1446 | | -# $title1b =~ s/\%21/!/g ;
|
1447 | | -# $title1b =~ s/\%5F/_/g ;
|
1448 | | -# $title1b =~ s/\%2C/,/g ;
|
1449 | | -# $title1b =~ s/\%2E/./g ;
|
1450 | | -# $title1b =~ s/\%2D/-/g ;
|
1451 | | -# $title1b =~ s/\%25/%/g ;
|
1452 | | -# $title1b =~ s/\%7E/~/g ;
|
1453 | | -# $title1b =~ s/\%27/'/g ;
|
1454 | | -# $title1b =~ s/\%3D/=/g ;
|
1455 | | -# $title1b =~ s/\%26/&/g ;
|
1456 | | -# $title1b =~ s/\%3B/;/g ;
|
1457 | | -# $title1b =~ s/\%3F/?/g ;
|
1458 | | -# $title1b =~ s/\%2B/+/g ;
|
1459 | | -# $title2 = $title1b ;
|
1460 | | -# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
|
1461 | | -
|
1462 | | -# if ($title1b ne $title2) # if changed anything at all
|
1463 | | -# {
|
1464 | | -# $title3 = uri_escape ($title2) ;
|
1465 | | -# $title3 =~ s/\%28/(/g ;
|
1466 | | -# $title3 =~ s/\%29/)/g ;
|
1467 | | -# $title3 =~ s/\%3A/:/g ;
|
1468 | | -# $title3 =~ s/\%2F/\//g ;
|
1469 | | -# $title3 =~ s/\%5C/\\/g ;
|
1470 | | -# $title3 =~ s/\%2A/\*/g ;
|
1471 | | -# $title3 =~ s/\%21/\!/g ;
|
1472 | | -# $title3 =~ s/\%5F/\_/g ;
|
1473 | | -# $title3 =~ s/\%2C/,/g ;
|
1474 | | -# $title3 =~ s/\%2E/./g ;
|
1475 | | -# $title3 =~ s/\%2D/-/g ;
|
1476 | | -# $title3 =~ s/\%25/%/g ;
|
1477 | | -# $title3 =~ s/\%7E/~/g ;
|
1478 | | -# $title3 =~ s/\%27/'/g ;
|
1479 | | -# $title3 =~ s/\%3D/=/g ;
|
1480 | | -# $title3 =~ s/\%26/&/g ;
|
1481 | | -# $title3 =~ s/\%3B/;/g ;
|
1482 | | -# $title3 =~ s/\%3F/?/g ;
|
1483 | | -# $title3 =~ s/\%2B/+/g ;
|
1484 | | -
|
1485 | | -# if ($title1b eq $title3) # process reversible ?
|
1486 | | -# {
|
1487 | | -# $y++ ;
|
1488 | | -# $title2 =~ s/\s/_/g;
|
1489 | | -# $title = $title2 ;
|
1490 | | -# }
|
1491 | | -# else
|
1492 | | -# {
|
1493 | | -# $n++ ;
|
1494 | | -# print "Y $y N $n\n$title\n$title3\n\n" ;
|
1495 | | -# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
|
1496 | | -# }
|
1497 | | -# }
|
1498 | | -#}
|
1499 | | -# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
1500 | | -# $title =~ s/\s/_/g;
|
1501 | | -# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
1502 | | -# $lang =~ s/\.y/2/ ;
|
1503 | | -
|
1504 | | -# # print $out_day "$lang $title $counts\n" ;
|
1505 | | -# print OUT "$lang $title $counts\n" ;
|
1506 | | -# }
|
1507 | | -
|
1508 | | -# print "Close files\n" ;
|
1509 | | -# $in_hour -> close () ;
|
1510 | | -## $out_day -> close () ;
|
1511 | | -# close (OUT) ;
|
1512 | | -# $result = `$path_7z a $file_out $file_txt` ;
|
1513 | | -# print $result ;
|
1514 | | -#}
|
1515 | | -
|
1516 | | -
|
1517 | | -
|
1518 | | -# test (partial) reversibility of process
|
1519 | | -#sub UncompactVisitorStats
|
1520 | | -#{
|
1521 | | -# my $file_in = "out/2009-03/pagecounts-20090301_fdt" ;
|
1522 | | -# my $dir_out = "out" ;
|
1523 | | -# # $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
1524 | | -# open $in_hour, '<', $file_in ;
|
1525 | | -# binmode $in_hour ;
|
1526 | | -
|
1527 | | -# for ($h=0 ; $h<=23 ; $h++)
|
1528 | | -# {
|
1529 | | -# $time = sprintf ("%02d",$h) . "0000" ;
|
1530 | | -## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
|
1531 | | -# $file_out = "$dir_out/pagecounts-20090301-$time" ;
|
1532 | | -# open $out_day [$h], '>', $file_out ;
|
1533 | | -## $out_day [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
|
1534 | | -# binmode $out_day [$h] ;
|
1535 | | -# }
|
1536 | | -
|
1537 | | -# while ($line = <$in_hour>)
|
1538 | | -# {
|
1539 | | -# next if $line =~ /^#/ ;
|
1540 | | -# next if $line =~ /^@/ ;
|
1541 | | -# chomp ($line) ;
|
1542 | | -## print "$line\n" ;
|
1543 | | -# if ($lines++ > 10000) { exit ; }
|
1544 | | -# ($lang,$title,$counts) = split (" ", $line) ;
|
1545 | | -# $lang =~ s/\.z// ;
|
1546 | | -# $lang =~ s/\.y/2/ ;
|
1547 | | -# $counts =~ s/^\d+// ; # remove (redundant) preceding total
|
1548 | | -# while ($counts ne "")
|
1549 | | -# {
|
1550 | | -# $letter = substr ($counts,0,1) ;
|
1551 | | -# $counts = substr ($counts,1) ;
|
1552 | | -# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
|
1553 | | -# $counts =~ s/^\d+(.*)$/$1/ ;
|
1554 | | -# $h = ord ($letter) - ord ('A') ;
|
1555 | | -# $file = $out_day [$h] ;
|
1556 | | -# $writes {$h} ++ ;
|
1557 | | -# print $file "$lang $title $count\n" ;
|
1558 | | -# }
|
1559 | | -
|
1560 | | -# }
|
1561 | | -
|
1562 | | -# for ($h=0 ; $h<=23 ; $h++)
|
1563 | | -# {
|
1564 | | -## $out_day [$h] -> close () ;
|
1565 | | -# close $out_day [$h] ;
|
1566 | | -# }
|
1567 | | -#}
|
1568 | | -
|
1569 | | -
|