r109176 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r109175 | r109176 | r109177
Date: 18:36, 17 January 2012
Author: ezachte
Status: deferred
Tags:
Comment:
intermediate release: cleanup before major restructuring
Modified paths:
  • /trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl (modified)

Diff

Index: trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl
@@ -1,18 +1,8 @@
22 #!/usr/local/bin/perl
33
4 -# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
 4+# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
 5+# 11/23/2011 renamed lots of dead (commented) code
56
6 -# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
7 -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
8 -# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
9 -# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
10 -
11 -# Ideas:
12 -# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
13 -# 2 frequency distribution hits per file per first letter _-> manifest crawler
14 -# assuming crawler collects articles in alphabetical order
15 -# 3 first letter uppercase -> sort (in sections per first two chars ?)
16 -
177 use lib "/home/ezachte/lib" ;
188 use EzLib ;
199
@@ -27,7 +17,6 @@
2818 use URI::Escape;
2919 use Cwd ;
3020 $bayes = -d "/a/dammit.lt" ;
31 -# $path_7za = "/usr/lib/p7zip/7za" ;
3221
3322 use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error) ;
3423
@@ -59,8 +48,6 @@
6049 print "Track: $track\n" ;
6150 $reg_exp_track = qr"$track" ;
6251
63 -# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"
64 -
6552 my $options ;
6653 getopt ("iodft", \%options) ;
6754
@@ -80,7 +67,6 @@
8168 if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymm, yyyy* or *") } ;
8269 }
8370
84 -
8571 $dir_in = $options {"i"} ;
8672 $dir_out = $options {"o"} ;
8773 $dir_filtered = $options {"f"} ;
@@ -129,7 +115,6 @@
130116 $daterange =~ s/\*/\\d+/ ;
131117
132118 &CompactVisitorStatsOneDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
133 - # &UncompactVisitorStats ; # test only, to see if process is revertible
134119 }
135120
136121 if ($compactmonth)
@@ -199,7 +184,7 @@
200185 my $month = substr ($date,4,2) ;
201186 my $day = substr ($date,6,2) ;
202187
203 - my ($file_out1, $file_out2, $file_out3, $out_day, $hours_missing) ;
 188+ my ($file_out2, $out_day, $hours_missing) ;
204189
205190 $dir_out = "$dir_out/${year}-${month}" ;
206191 if (! -d $dir_out)
@@ -248,16 +233,9 @@
249234
250235 undef @in_hour ;
251236
252 - # $file_out = "pagecounts-$year$month$day_full_day" ;
253 - # open OUT, ">", $file_out ;
254 - # binmode $file_out ;
255 -
256 -# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
257237 if ($bayes)
258238 {
259 - # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
260239 $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
261 - # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
262240 if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
263241 {
264242 &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
@@ -270,24 +248,16 @@
271249 }
272250
273251 open $out_day2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
274 - # open $out_day3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
275252 }
276253 else
277254 {
278 - # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
279255 $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, hourly data, count above threshold
280256 $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
281 - # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
282 - # $out_day3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
283257 }
284258
285 -# binmode $out_day1 ;
286259 binmode $out_day2 ;
287 -# binmode $out_day3 ;
288260
289 - # print "File_out1 $file_out1\n" ;
290261 print "File_out2 $file_out2\n" ;
291 - # print "File_out3 $file_out3\n" ;
292262
293263 $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
294264 &Log ("\nFilter file: $file_filtered\n") ;
@@ -338,7 +308,6 @@
339309 $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
340310 $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n\n" ;
341311 print $out_day2 $comment ;
342 -# print $out_day3 $comment ;
343312
344313 if ($files_in_found < 24)
345314 {
@@ -357,8 +326,6 @@
358327 {
359328 print $out_day2 "#\n" ;
360329 print $out_day2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
361 - # print $out_day3 "#\n" ;
362 - # print $out_day3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
363330 }
364331 $comment = "#\n" ;
365332 $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
@@ -369,7 +336,6 @@
370337 $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
371338 $comment .= "#\n" ;
372339 print $out_day2 $comment ;
373 -# print $out_day3 $comment ;
374340
375341 $key_low_prev = "" ;
376342 while ($files_in_open > 0)
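
The header comments above document the compact per-day format written to the "_h" files: each data line carries a daily total followed by letter-coded hourly counts (hours 0..23 shown as A..X), with '#' comment lines and '@' per-namespace totals interleaved. A minimal decoding sketch, in the same spirit as the (removed) UncompactVisitorStats test code; the sample lines are hypothetical and '?' markers for missing hours are not handled:

  use strict ;
  use warnings ;

  my @lines = (
    '# header comment',                 # file header, skipped
    '@ en.z Special_pages 42',          # hypothetical per-namespace total, skipped
    'en.z Main_Page 123A50B40X33',      # hypothetical data line: total 123, hour 0 -> 50, hour 1 -> 40, hour 23 -> 33
  ) ;

  foreach my $line (@lines)
  {
    next if $line =~ /^#/ ;             # comment lines
    next if $line =~ /^@/ ;             # totals per namespace
    my ($lang, $title, $counts) = split (' ', $line) ;
    my ($total) = $counts =~ /^(\d+)/ ;  # daily total precedes the hourly data
    $counts =~ s/^\d+// ;
    my %per_hour ;
    while ($counts =~ s/^([A-X])(\d+)//) # hour 0..23 encoded as letters A..X
    { $per_hour {ord ($1) - ord ('A')} = $2 ; }
    print "$lang $title: total $total, hours " .
          join (',', map { "$_=$per_hour{$_}" } sort { $a <=> $b } keys %per_hour) . "\n" ;
  }
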
@@ -473,9 +439,6 @@
474440 &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
475441 }
476442
477 - # print OUT "$key_low $total$counts\n" ;
478 -# print $out_day1 "$key_low $total$counts\n" ;
479 -
480443 ($lang,$title) = split (' ', $key_low) ;
481444
482445 $title =~ s/\%20/_/g ;
@@ -496,7 +459,6 @@
497460 # { print "- $lang\n" ; }
498461
499462 &WriteTotalsPerNamespace ($out_day2, $langprev) ;
500 - # &WriteTotalsPerNamespace ($out_day3, $langprev) ;
501463 undef %totals_per_namespace ;
502464 }
503465 $langprev = $lang ;
@@ -518,48 +480,22 @@
519481 }
520482
521483 if ($total >= $threshold)
522 - { print $out_day2 "$key_low $total$counts\n" ;
523 - # print $out_day3 "$key_low $total\n" ;
524 - }
 484+ { print $out_day2 "$key_low $total$counts\n" ; }
525485
526486 $key_low_prev = $key_low ;
527 - # print "OUT $key_low $counts\n" ;
528487 }
529488
530489 &WriteTotalsPerNamespace ($out_day2, $langprev) ;
531 -# &WriteTotalsPerNamespace ($out_day3, $langprev) ;
532490
533491 &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
534492
535493 &Log ("[$lines, $files_in_open] $key_low\n") ;
536 -# close OUT ;
537494
538495 if ($bayes)
539496 {
540 - # close $out_day1 ;
541497 close $out_day2 ;
542 - # close $out_day3 ;
543498 close $out_filtered ;
544499
545 -# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
546 -# $result = `$cmd` ;
547 -# if ($result =~ /Everything is Ok/s)
548 -# {
549 -# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
550 -# unlink $file_out2 ;
551 -# foreach $file_in (@files_today)
552 -# {
553 -# print "unlink $dir_in/$file_in\n" ;
554 -# unlink "$dir_in/$file_in" ;
555 -# }
556 -# }
557 -# else
558 -# {
559 -# print "Delete $file_out2.7z\n" ;
560 -# unlink "$file_out2.7z" ;
561 -# }
562 -
563 -
564500 $time_start_compression = time ;
565501 $cmd = "bzip2 -9 -v $file_out2" ;
566502 &Log ("\n\n$cmd ->\n") ;
@@ -582,9 +518,7 @@
583519 }
584520 else
585521 {
586 - # $out_day1->close() ;
587522 $out_day2->close() ;
588 - # $out_day3->close() ;
589523 close $out_filtered ;
590524 }
591525
@@ -740,7 +674,6 @@
741675 }
742676 }
743677
744 -
745678 my $out_month_all = new IO::Compress::Bzip2 "$file_out.bz2" or die "bzip2 failed for $file_out.bz2: $Bzip2Error\n";
746679 my $out_month_ge5 = new IO::Compress::Bzip2 "${file_out}_ge5.bz2" or die "bzip2 failed for ${file_out}_ge5.bz2: $Bzip2Error\n";
747680
@@ -1211,358 +1144,15 @@
12121145
12131146 #=============================================================================================================
12141147
1215 -# snippets obsolete but revivable code / test code
 1148+# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
 1149+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
 1150+# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
 1151+# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
12161152
1217 -#sub Compact
1218 -#{
1219 -# my $day = shift ;
1220 -# &Log ("Compact files for $day\n") ;
 1153+# Ideas:
 1154+# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
 1155+# 2 frequency distribution hits per file per first letter _-> manifest crawler
 1156+# assuming crawler collects articles in alphabetical order
 1157+# 3 always convert first letter after namespace string to uppercase, then sort and merge
12211158
1222 -# $file_in = "pagecounts-$day.out" ;
1223 -# $file_out1 = "pagecounts-${day}_all.gz" ;
1224 -# $file_out2 = "pagecounts-${day}_10plus.gz" ;
1225 -# open IN, "<", $file_in ;
1226 -# binmode $file_in ;
12271159
1228 -# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
1229 -# my $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
1230 -
1231 -# open OUT, ">", $file_out ;
1232 -# binmode $file_out ;
1233 -
1234 -# $lang_prev = "" ;
1235 -# while ($line = <IN>)
1236 -# {
1237 -# chomp ($line) ;
1238 -# ($lang, $title, $counts) = split (' ', $line) ;
1239 -# $title2 = $title ;
1240 -# $title =~ s/\%20/_/g ;
1241 -# $title =~ s/\%3A/:/g ;
1242 -# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
1243 -# # if ($title =~ /[\x00-\x1F]/)
1244 -# # { &Log ("> '$title2'\n") ; }
1245 -# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
1246 -# print $out_day1 "$lang $title $counts\n" ;
1247 -# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
1248 -# if ($counts2 >= $threshold)
1249 -# { print $out_day2 "$lang $title $counts\n" ; }
1250 -# $lang_prev = $lang ;
1251 -# }
1252 -#
1253 -# close IN ;
1254 -# $out_day1->close() ;
1255 -# $out_day2->close() ;
1256 -#}
1257 -
1258 -
1259 -#sub GetViewDistribution
1260 -#{
1261 -# open OUT, ">", "Views.csv" ;
1262 -# foreach $file_in (@files)
1263 -# {
1264 -# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
1265 -# $hour = chr(ord('A')+$hour) ;
1266 -# &Log ("Process $hour $file_in\n") ;
1267 -
1268 -# $in_hour1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
1269 -# while ($line = <$in_hour1>)
1270 -# {
1271 -# ($lang,$title,$count,$dummy) = split (' ', $line) ;
1272 -# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
1273 -# {
1274 -# $tot {$hour} += $count ;
1275 -# if ($count < 3)
1276 -# { @counts {$hour . substr ($title,0,1)}++ ; }
1277 -# }
1278 -# }
1279 -# $in_hour1->close () ;
1280 -# }
1281 -#
1282 -# print OUT "," ;
1283 -# foreach $hour ('A'..'X')
1284 -# { print OUT $hour . ", " ; }
1285 -# print OUT "\n" ;
1286 -#
1287 -# print OUT "," ;
1288 -# foreach $hour ('A'..'X')
1289 -# { print OUT $tot {$hour} . ", " ; }
1290 -# print OUT "\n" ;
1291 -#
1292 -# for ($c=0; $c < 256; $c++)
1293 -# {
1294 -# # do not print chars " and , as such: confuses csv format
1295 -# if ($c < 33)
1296 -# { print OUT "chr($c), " ; }
1297 -# elsif (chr($c) eq '"')
1298 -# { print OUT "dquote, " ; }
1299 -# elsif (chr($c) eq ',')
1300 -# { print OUT "comma, " ; }
1301 -# else
1302 -# { print OUT chr($c) . ", " ; }
1303 -#
1304 -# foreach $hour ('A'..'X')
1305 -# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
1306 -#
1307 -# if ($c < 255)
1308 -# { print OUT "\n" ; }
1309 -# }
1310 -# close OUT ;
1311 -#}
1312 -
1313 -
1314 -#sub RecompactVisitorStats
1315 -#{
1316 -# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
1317 -# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
1318 -# local (*DIR);
1319 -# opendir (DIR, ".");
1320 -# @files = () ;
1321 -# while ($file_in = readdir (DIR))
1322 -# {
1323 -# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
1324 -#
1325 -# push @files, $file_in ;
1326 -# }
1327 -
1328 -# $filecnt = $#files+1 ;
1329 -# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
1330 -
1331 -# foreach $file (@files)
1332 -# { &RecompactVisitorStats2 ($file) ; }
1333 -# closedir (DIR, ".");
1334 -#}
1335 -
1336 -#sub RecompactVisitorStats2
1337 -#{
1338 -## http://www.7-zip.org/7z.html
1339 -# my $file = shift ;
1340 -# my $time_start = time ;
1341 -# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
1342 -## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
1343 -# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
1344 -# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
1345 -# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
1346 -
1347 -# &Log ("Process $file_in\n") ;
1348 -
1349 -# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
1350 -# binmode $in_hour ;
1351 -# open OUT, ">", $file_out ;
1352 -# binmode OUT ;
1353 -
1354 -# my ($title, $title2) ;
1355 -# while ($line = <$in_hour>)
1356 -# {
1357 -# chomp ($line) ;
1358 -# ($lang,$title,$counts) = split (" ", $line) ;
1359 -
1360 -# if ($lang ne $lang_prev) { print "$lang " ; }
1361 -# $lang_prev = $lang ;
1362 -
1363 -# # test pagecounts-20080701_fd.gz
1364 -# # all records 424 Mib compressed (1984 uncompressed)
1365 -# # count > 1 212 Mib compressed ( 733 uncompressed)
1366 -# # count > 2 169 Mib compressed ( 551 uncompressed)
1367 -# next if $counts <= 1 ;
1368 -
1369 -# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
1370 -# $title =~ s/\s/_/g;
1371 -# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
1372 -# $lang =~ s/\.y/2/ ;
1373 -
1374 -# print OUT "$lang $title $counts\n" ;
1375 -# }
1376 -
1377 -# print "Close files\n" ;
1378 -# $in_hour -> close () ;
1379 -# close (OUT) ;
1380 -
1381 -# &Log ("Compress $file_out\n") ;
1382 -
1383 -# unlink $file_7z ;
1384 -# $result = `$path_7z a $file_7z $file_out` ;
1385 -# &Log ("Compressed\n") ;
1386 -# &Log ("Result " . ($result+0) . " \n") ;
1387 -# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
1388 -# { unlink $file_out ; }
1389 -
1390 -# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
1391 -## 0 No error
1392 -## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
1393 -## 2 Fatal error
1394 -## 7 Command line error
1395 -## 8 Not enough memory for operation
1396 -## 255 User stopped the process
1397 -#}
1398 -
1399 -
1400 -#sub RecompactVisitorStats3
1401 -#{
1402 -## http://www.7-zip.org/7z.html
1403 -# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
1404 -# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
1405 -# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
1406 -# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
1407 -## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
1408 -
1409 -# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
1410 -# binmode $in_hour ;
1411 -## $out_day = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
1412 -## binmode $out_day ;
1413 -# open OUT, ">", $file_out ;
1414 -# binmode OUT ;
1415 -## open LOG, ">", $file_log ;
1416 -## binmode LOG ;
1417 -
1418 -# my ($title, $title2) ;
1419 -# while ($line = <$in_hour>)
1420 -# {
1421 -# chomp ($line) ;
1422 -# ($lang,$title,$counts) = split (" ", $line) ;
1423 -
1424 -# if ($lang ne $lang_prev) { print "$lang\n" ; }
1425 -## last if $lang gt "fs" ;
1426 -# $lang_prev = $lang ;
1427 -
1428 -# # test pagecounts-20080701_fd.gz
1429 -# # all records 424 Mib compressed (1984 uncompressed)
1430 -# # count > 1 212 Mib compressed ( 733 uncompressed)
1431 -# # count > 2 169 Mib compressed ( 551 uncompressed)
1432 -# next if $counts <= 1 ;
1433 -
1434 -## next if $lang !~ /^(?:ar|fr)/ ;
1435 -
1436 -#if ($false)
1437 -#{
1438 -# $title1b = $title ;
1439 -# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
1440 -# $title1b =~ s/\%28/(/g ;
1441 -# $title1b =~ s/\%29/)/g ;
1442 -# $title1b =~ s/\%3A/:/g ;
1443 -# $title1b =~ s/\%2F/\//g ;
1444 -# $title1b =~ s/\%5C/\\/g ;
1445 -# $title1b =~ s/\%2A/*/g ;
1446 -# $title1b =~ s/\%21/!/g ;
1447 -# $title1b =~ s/\%5F/_/g ;
1448 -# $title1b =~ s/\%2C/,/g ;
1449 -# $title1b =~ s/\%2E/./g ;
1450 -# $title1b =~ s/\%2D/-/g ;
1451 -# $title1b =~ s/\%25/%/g ;
1452 -# $title1b =~ s/\%7E/~/g ;
1453 -# $title1b =~ s/\%27/'/g ;
1454 -# $title1b =~ s/\%3D/=/g ;
1455 -# $title1b =~ s/\%26/&/g ;
1456 -# $title1b =~ s/\%3B/;/g ;
1457 -# $title1b =~ s/\%3F/?/g ;
1458 -# $title1b =~ s/\%2B/+/g ;
1459 -# $title2 = $title1b ;
1460 -# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
1461 -
1462 -# if ($title1b ne $title2) # if changed anything at all
1463 -# {
1464 -# $title3 = uri_escape ($title2) ;
1465 -# $title3 =~ s/\%28/(/g ;
1466 -# $title3 =~ s/\%29/)/g ;
1467 -# $title3 =~ s/\%3A/:/g ;
1468 -# $title3 =~ s/\%2F/\//g ;
1469 -# $title3 =~ s/\%5C/\\/g ;
1470 -# $title3 =~ s/\%2A/\*/g ;
1471 -# $title3 =~ s/\%21/\!/g ;
1472 -# $title3 =~ s/\%5F/\_/g ;
1473 -# $title3 =~ s/\%2C/,/g ;
1474 -# $title3 =~ s/\%2E/./g ;
1475 -# $title3 =~ s/\%2D/-/g ;
1476 -# $title3 =~ s/\%25/%/g ;
1477 -# $title3 =~ s/\%7E/~/g ;
1478 -# $title3 =~ s/\%27/'/g ;
1479 -# $title3 =~ s/\%3D/=/g ;
1480 -# $title3 =~ s/\%26/&/g ;
1481 -# $title3 =~ s/\%3B/;/g ;
1482 -# $title3 =~ s/\%3F/?/g ;
1483 -# $title3 =~ s/\%2B/+/g ;
1484 -
1485 -# if ($title1b eq $title3) # process reversible ?
1486 -# {
1487 -# $y++ ;
1488 -# $title2 =~ s/\s/_/g;
1489 -# $title = $title2 ;
1490 -# }
1491 -# else
1492 -# {
1493 -# $n++ ;
1494 -# print "Y $y N $n\n$title\n$title3\n\n" ;
1495 -# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
1496 -# }
1497 -# }
1498 -#}
1499 -# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
1500 -# $title =~ s/\s/_/g;
1501 -# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
1502 -# $lang =~ s/\.y/2/ ;
1503 -
1504 -# # print $out_day "$lang $title $counts\n" ;
1505 -# print OUT "$lang $title $counts\n" ;
1506 -# }
1507 -
1508 -# print "Close files\n" ;
1509 -# $in_hour -> close () ;
1510 -## $out_day -> close () ;
1511 -# close (OUT) ;
1512 -# $result = `$path_7z a $file_out $file_txt` ;
1513 -# print $result ;
1514 -#}
1515 -
1516 -
1517 -
1518 -# test (partial) reversibility of process
1519 -#sub UncompactVisitorStats
1520 -#{
1521 -# my $file_in = "out/2009-03/pagecounts-20090301_fdt" ;
1522 -# my $dir_out = "out" ;
1523 -# # $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
1524 -# open $in_hour, '<', $file_in ;
1525 -# binmode $in_hour ;
1526 -
1527 -# for ($h=0 ; $h<=23 ; $h++)
1528 -# {
1529 -# $time = sprintf ("%02d",$h) . "0000" ;
1530 -## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
1531 -# $file_out = "$dir_out/pagecounts-20090301-$time" ;
1532 -# open $out_day [$h], '>', $file_out ;
1533 -## $out_day [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
1534 -# binmode $out_day [$h] ;
1535 -# }
1536 -
1537 -# while ($line = <$in_hour>)
1538 -# {
1539 -# next if $line =~ /^#/ ;
1540 -# next if $line =~ /^@/ ;
1541 -# chomp ($line) ;
1542 -## print "$line\n" ;
1543 -# if ($lines++ > 10000) { exit ; }
1544 -# ($lang,$title,$counts) = split (" ", $line) ;
1545 -# $lang =~ s/\.z// ;
1546 -# $lang =~ s/\.y/2/ ;
1547 -# $counts =~ s/^\d+// ; # remove (redundant) preceding total
1548 -# while ($counts ne "")
1549 -# {
1550 -# $letter = substr ($counts,0,1) ;
1551 -# $counts = substr ($counts,1) ;
1552 -# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
1553 -# $counts =~ s/^\d+(.*)$/$1/ ;
1554 -# $h = ord ($letter) - ord ('A') ;
1555 -# $file = $out_day [$h] ;
1556 -# $writes {$h} ++ ;
1557 -# print $file "$lang $title $count\n" ;
1558 -# }
1559 -
1560 -# }
1561 -
1562 -# for ($h=0 ; $h<=23 ; $h++)
1563 -# {
1564 -## $out_day [$h] -> close () ;
1565 -# close $out_day [$h] ;
1566 -# }
1567 -#}
1568 -
1569 -
