Index: trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl |
— | — | @@ -1,18 +1,8 @@ |
2 | 2 | #!/usr/local/bin/perl
|
3 | 3 |
|
4 | | -# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
|
| 4 | +# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
|
| 5 | +# 11/23/2011 removed lots of dead (commented) code
|
5 | 6 |
|
6 | | -# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
|
7 | | -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
|
8 | | -# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
|
9 | | -# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
|
10 | | -
|
11 | | -# Ideas:
|
12 | | -# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
|
13 | | -# 2 frequency distribution hits per file per first letter _-> manifest crawler
|
14 | | -# assuming crawler collects articles in alphabetical order
|
15 | | -# 3 first letter uppercase -> sort (in sections per first two chars ?)
|
16 | | -
|
17 | 7 | use lib "/home/ezachte/lib" ;
|
18 | 8 | use EzLib ;
|
19 | 9 |
|
— | — | @@ -27,7 +17,6 @@ |
28 | 18 | use URI::Escape;
|
29 | 19 | use Cwd ;
|
30 | 20 | $bayes = -d "/a/dammit.lt" ;
|
31 | | -# $path_7za = "/usr/lib/p7zip/7za" ;
|
32 | 21 |
|
33 | 22 | use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error) ;
|
34 | 23 |
|
— | — | @@ -59,8 +48,6 @@ |
60 | 49 | print "Track: $track\n" ;
|
61 | 50 | $reg_exp_track = qr"$track" ;
|
62 | 51 |
|
63 | | -# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"
|
64 | | -
|
65 | 52 | my $options ;
|
66 | 53 | getopt ("iodft", \%options) ;
|
67 | 54 |
|
— | — | @@ -80,7 +67,6 @@ |
81 | 68 | if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymm, yyyy* or *") } ;
|
82 | 69 | }
|
83 | 70 |
|
84 | | -
|
85 | 71 | $dir_in = $options {"i"} ;
|
86 | 72 | $dir_out = $options {"o"} ;
|
87 | 73 | $dir_filtered = $options {"f"} ;
|
— | — | @@ -129,7 +115,6 @@ |
130 | 116 | $daterange =~ s/\*/\\d+/ ;
|
131 | 117 |
|
132 | 118 | &CompactVisitorStatsOneDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
|
133 | | - # &UncompactVisitorStats ; # test only, to see if process is revertible
|
134 | 119 | }
|
135 | 120 |
|
136 | 121 | if ($compactmonth)
|
— | — | @@ -199,7 +184,7 @@ |
200 | 185 | my $month = substr ($date,4,2) ;
|
201 | 186 | my $day = substr ($date,6,2) ;
|
202 | 187 |
|
203 | | - my ($file_out1, $file_out2, $file_out3, $out_day, $hours_missing) ;
|
| 188 | + my ($file_out2, $out_day, $hours_missing) ;
|
204 | 189 |
|
205 | 190 | $dir_out = "$dir_out/${year}-${month}" ;
|
206 | 191 | if (! -d $dir_out)
|
— | — | @@ -248,16 +233,9 @@ |
249 | 234 |
|
250 | 235 | undef @in_hour ;
|
251 | 236 |
|
252 | | - # $file_out = "pagecounts-$year$month$day_full_day" ;
|
253 | | - # open OUT, ">", $file_out ;
|
254 | | - # binmode $file_out ;
|
255 | | -
|
256 | | -# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
257 | 237 | if ($bayes)
|
258 | 238 | {
|
259 | | - # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
|
260 | 239 | $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
|
261 | | - # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
|
262 | 240 | if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
|
263 | 241 | {
|
264 | 242 | &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
|
— | — | @@ -270,24 +248,16 @@ |
271 | 249 | }
|
272 | 250 |
|
273 | 251 | open $out_day2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
|
274 | | - # open $out_day3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
|
275 | 252 | }
|
276 | 253 | else
|
277 | 254 | {
|
278 | | - # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
|
279 | 255 | $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, hourly data, count above threshold
|
280 | 256 | $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
281 | | - # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
|
282 | | - # $out_day3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
283 | 257 | }
|
284 | 258 |
|
285 | | -# binmode $out_day1 ;
|
286 | 259 | binmode $out_day2 ;
|
287 | | -# binmode $out_day3 ;
|
288 | 260 |
|
289 | | - # print "File_out1 $file_out1\n" ;
|
290 | 261 | print "File_out2 $file_out2\n" ;
|
291 | | - # print "File_out3 $file_out3\n" ;
|
292 | 262 |
|
293 | 263 | $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
|
294 | 264 | &Log ("\nFilter file: $file_filtered\n") ;
|
— | — | @@ -338,7 +308,6 @@ |
339 | 309 | $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
|
340 | 310 | $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n\n" ;
|
341 | 311 | print $out_day2 $comment ;
|
342 | | -# print $out_day3 $comment ;
|
343 | 312 |
|
344 | 313 | if ($files_in_found < 24)
|
345 | 314 | {
|
— | — | @@ -357,8 +326,6 @@ |
358 | 327 | {
|
359 | 328 | print $out_day2 "#\n" ;
|
360 | 329 | print $out_day2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
361 | | - # print $out_day3 "#\n" ;
|
362 | | - # print $out_day3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
363 | 330 | }
|
364 | 331 | $comment = "#\n" ;
|
365 | 332 | $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
|
— | — | @@ -369,7 +336,6 @@ |
370 | 337 | $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
|
371 | 338 | $comment .= "#\n" ;
|
372 | 339 | print $out_day2 $comment ;
|
373 | | -# print $out_day3 $comment ;
|
374 | 340 |
|
375 | 341 | $key_low_prev = "" ;
|
376 | 342 | while ($files_in_open > 0)
|
— | — | @@ -473,9 +439,6 @@ |
474 | 440 | &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
|
475 | 441 | }
|
476 | 442 |
|
477 | | - # print OUT "$key_low $total$counts\n" ;
|
478 | | -# print $out_day1 "$key_low $total$counts\n" ;
|
479 | | -
|
480 | 443 | ($lang,$title) = split (' ', $key_low) ;
|
481 | 444 |
|
482 | 445 | $title =~ s/\%20/_/g ;
|
— | — | @@ -496,7 +459,6 @@ |
497 | 460 | # { print "- $lang\n" ; }
|
498 | 461 |
|
499 | 462 | &WriteTotalsPerNamespace ($out_day2, $langprev) ;
|
500 | | - # &WriteTotalsPerNamespace ($out_day3, $langprev) ;
|
501 | 463 | undef %totals_per_namespace ;
|
502 | 464 | }
|
503 | 465 | $langprev = $lang ;
|
— | — | @@ -518,48 +480,22 @@ |
519 | 481 | }
|
520 | 482 |
|
521 | 483 | if ($total >= $threshold)
|
522 | | - { print $out_day2 "$key_low $total$counts\n" ;
|
523 | | - # print $out_day3 "$key_low $total\n" ;
|
524 | | - }
|
| 484 | + { print $out_day2 "$key_low $total$counts\n" ; }
|
525 | 485 |
|
526 | 486 | $key_low_prev = $key_low ;
|
527 | | - # print "OUT $key_low $counts\n" ;
|
528 | 487 | }
|
529 | 488 |
|
530 | 489 | &WriteTotalsPerNamespace ($out_day2, $langprev) ;
|
531 | | -# &WriteTotalsPerNamespace ($out_day3, $langprev) ;
|
532 | 490 |
|
533 | 491 | &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
|
534 | 492 |
|
535 | 493 | &Log ("[$lines, $files_in_open] $key_low\n") ;
|
536 | | -# close OUT ;
|
537 | 494 |
|
538 | 495 | if ($bayes)
|
539 | 496 | {
|
540 | | - # close $out_day1 ;
|
541 | 497 | close $out_day2 ;
|
542 | | - # close $out_day3 ;
|
543 | 498 | close $out_filtered ;
|
544 | 499 |
|
545 | | -# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
|
546 | | -# $result = `$cmd` ;
|
547 | | -# if ($result =~ /Everything is Ok/s)
|
548 | | -# {
|
549 | | -# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
|
550 | | -# unlink $file_out2 ;
|
551 | | -# foreach $file_in (@files_today)
|
552 | | -# {
|
553 | | -# print "unlink $dir_in/$file_in\n" ;
|
554 | | -# unlink "$dir_in/$file_in" ;
|
555 | | -# }
|
556 | | -# }
|
557 | | -# else
|
558 | | -# {
|
559 | | -# print "Delete $file_out2.7z\n" ;
|
560 | | -# unlink "$file_out2.7z" ;
|
561 | | -# }
|
562 | | -
|
563 | | -
|
564 | 500 | $time_start_compression = time ;
|
565 | 501 | $cmd = "bzip2 -9 -v $file_out2" ;
|
566 | 502 | &Log ("\n\n$cmd ->\n") ;
|
— | — | @@ -582,9 +518,7 @@ |
583 | 519 | }
|
584 | 520 | else
|
585 | 521 | {
|
586 | | - # $out_day1->close() ;
|
587 | 522 | $out_day2->close() ;
|
588 | | - # $out_day3->close() ;
|
589 | 523 | close $out_filtered ;
|
590 | 524 | }
|
591 | 525 |
|
— | — | @@ -740,7 +674,6 @@ |
741 | 675 | }
|
742 | 676 | }
|
743 | 677 |
|
744 | | -
|
745 | 678 | my $out_month_all = new IO::Compress::Bzip2 "$file_out.bz2" or die "bzip2 failed for $file_out.bz2: $Bzip2Error\n";
|
746 | 679 | my $out_month_ge5 = new IO::Compress::Bzip2 "${file_out}_ge5.bz2" or die "bzip2 failed for ${file_out}_ge5.bz2: $Bzip2Error\n";
|
747 | 680 |
|
— | — | @@ -1211,358 +1144,15 @@ |
1212 | 1145 |
|
1213 | 1146 | #=============================================================================================================
|
1214 | 1147 |
|
1215 | | -# snippets obsolete but revivable code / test code
|
| 1148 | +# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
|
| 1149 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
|
| 1150 | +# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
|
| 1151 | +# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
|
1216 | 1152 |
|
1217 | | -#sub Compact
|
1218 | | -#{
|
1219 | | -# my $day = shift ;
|
1220 | | -# &Log ("Compact files for $day\n") ;
|
| 1153 | +# Ideas:
|
| 1154 | +# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
|
| 1155 | +# 2 frequency distribution hits per file per first letter _-> manifest crawler
|
| 1156 | +# assuming crawler collects articles in alphabetical order
|
| 1157 | +# 3 always convert first letter after namespace string to uppercase, then sort and merge
|
1221 | 1158 |
|
1222 | | -# $file_in = "pagecounts-$day.out" ;
|
1223 | | -# $file_out1 = "pagecounts-${day}_all.gz" ;
|
1224 | | -# $file_out2 = "pagecounts-${day}_10plus.gz" ;
|
1225 | | -# open IN, "<", $file_in ;
|
1226 | | -# binmode $file_in ;
|
1227 | 1159 |
|
1228 | | -# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
1229 | | -# my $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
1230 | | -
|
1231 | | -# open OUT, ">", $file_out ;
|
1232 | | -# binmode $file_out ;
|
1233 | | -
|
1234 | | -# $lang_prev = "" ;
|
1235 | | -# while ($line = <IN>)
|
1236 | | -# {
|
1237 | | -# chomp ($line) ;
|
1238 | | -# ($lang, $title, $counts) = split (' ', $line) ;
|
1239 | | -# $title2 = $title ;
|
1240 | | -# $title =~ s/\%20/_/g ;
|
1241 | | -# $title =~ s/\%3A/:/g ;
|
1242 | | -# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
1243 | | -# # if ($title =~ /[\x00-\x1F]/)
|
1244 | | -# # { &Log ("> '$title2'\n") ; }
|
1245 | | -# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
|
1246 | | -# print $out_day1 "$lang $title $counts\n" ;
|
1247 | | -# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
|
1248 | | -# if ($counts2 >= $threshold)
|
1249 | | -# { print $out_day2 "$lang $title $counts\n" ; }
|
1250 | | -# $lang_prev = $lang ;
|
1251 | | -# }
|
1252 | | -#
|
1253 | | -# close IN ;
|
1254 | | -# $out_day1->close() ;
|
1255 | | -# $out_day2->close() ;
|
1256 | | -#}
|
1257 | | -
|
1258 | | -
|
1259 | | -#sub GetViewDistribution
|
1260 | | -#{
|
1261 | | -# open OUT, ">", "Views.csv" ;
|
1262 | | -# foreach $file_in (@files)
|
1263 | | -# {
|
1264 | | -# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
|
1265 | | -# $hour = chr(ord('A')+$hour) ;
|
1266 | | -# &Log ("Process $hour $file_in\n") ;
|
1267 | | -
|
1268 | | -# $in_hour1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
|
1269 | | -# while ($line = <$in_hour1>)
|
1270 | | -# {
|
1271 | | -# ($lang,$title,$count,$dummy) = split (' ', $line) ;
|
1272 | | -# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
|
1273 | | -# {
|
1274 | | -# $tot {$hour} += $count ;
|
1275 | | -# if ($count < 3)
|
1276 | | -# { @counts {$hour . substr ($title,0,1)}++ ; }
|
1277 | | -# }
|
1278 | | -# }
|
1279 | | -# $in_hour1->close () ;
|
1280 | | -# }
|
1281 | | -#
|
1282 | | -# print OUT "," ;
|
1283 | | -# foreach $hour ('A'..'X')
|
1284 | | -# { print OUT $hour . ", " ; }
|
1285 | | -# print OUT "\n" ;
|
1286 | | -#
|
1287 | | -# print OUT "," ;
|
1288 | | -# foreach $hour ('A'..'X')
|
1289 | | -# { print OUT $tot {$hour} . ", " ; }
|
1290 | | -# print OUT "\n" ;
|
1291 | | -#
|
1292 | | -# for ($c=0; $c < 256; $c++)
|
1293 | | -# {
|
1294 | | -# # do not print chars " and , as such: confuses csv format
|
1295 | | -# if ($c < 33)
|
1296 | | -# { print OUT "chr($c), " ; }
|
1297 | | -# elsif (chr($c) eq '"')
|
1298 | | -# { print OUT "dquote, " ; }
|
1299 | | -# elsif (chr($c) eq ',')
|
1300 | | -# { print OUT "comma, " ; }
|
1301 | | -# else
|
1302 | | -# { print OUT chr($c) . ", " ; }
|
1303 | | -#
|
1304 | | -# foreach $hour ('A'..'X')
|
1305 | | -# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
|
1306 | | -#
|
1307 | | -# if ($c < 255)
|
1308 | | -# { print OUT "\n" ; }
|
1309 | | -# }
|
1310 | | -# close OUT ;
|
1311 | | -#}
|
1312 | | -
|
1313 | | -
|
1314 | | -#sub RecompactVisitorStats
|
1315 | | -#{
|
1316 | | -# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
|
1317 | | -# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
|
1318 | | -# local (*DIR);
|
1319 | | -# opendir (DIR, ".");
|
1320 | | -# @files = () ;
|
1321 | | -# while ($file_in = readdir (DIR))
|
1322 | | -# {
|
1323 | | -# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
|
1324 | | -#
|
1325 | | -# push @files, $file_in ;
|
1326 | | -# }
|
1327 | | -
|
1328 | | -# $filecnt = $#files+1 ;
|
1329 | | -# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
|
1330 | | -
|
1331 | | -# foreach $file (@files)
|
1332 | | -# { &RecompactVisitorStats2 ($file) ; }
|
1333 | | -# closedir (DIR, ".");
|
1334 | | -#}
|
1335 | | -
|
1336 | | -#sub RecompactVisitorStats2
|
1337 | | -#{
|
1338 | | -## http://www.7-zip.org/7z.html
|
1339 | | -# my $file = shift ;
|
1340 | | -# my $time_start = time ;
|
1341 | | -# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
1342 | | -## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
1343 | | -# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
|
1344 | | -# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
1345 | | -# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
1346 | | -
|
1347 | | -# &Log ("Process $file_in\n") ;
|
1348 | | -
|
1349 | | -# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
1350 | | -# binmode $in_hour ;
|
1351 | | -# open OUT, ">", $file_out ;
|
1352 | | -# binmode OUT ;
|
1353 | | -
|
1354 | | -# my ($title, $title2) ;
|
1355 | | -# while ($line = <$in_hour>)
|
1356 | | -# {
|
1357 | | -# chomp ($line) ;
|
1358 | | -# ($lang,$title,$counts) = split (" ", $line) ;
|
1359 | | -
|
1360 | | -# if ($lang ne $lang_prev) { print "$lang " ; }
|
1361 | | -# $lang_prev = $lang ;
|
1362 | | -
|
1363 | | -# # test pagecounts-20080701_fd.gz
|
1364 | | -# # all records 424 Mib compressed (1984 uncompressed)
|
1365 | | -# # count > 1 212 Mib compressed ( 733 uncompressed)
|
1366 | | -# # count > 2 169 Mib compressed ( 551 uncompressed)
|
1367 | | -# next if $counts <= 1 ;
|
1368 | | -
|
1369 | | -# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
1370 | | -# $title =~ s/\s/_/g;
|
1371 | | -# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
1372 | | -# $lang =~ s/\.y/2/ ;
|
1373 | | -
|
1374 | | -# print OUT "$lang $title $counts\n" ;
|
1375 | | -# }
|
1376 | | -
|
1377 | | -# print "Close files\n" ;
|
1378 | | -# $in_hour -> close () ;
|
1379 | | -# close (OUT) ;
|
1380 | | -
|
1381 | | -# &Log ("Compress $file_out\n") ;
|
1382 | | -
|
1383 | | -# unlink $file_7z ;
|
1384 | | -# $result = `$path_7z a $file_7z $file_out` ;
|
1385 | | -# &Log ("Compressed\n") ;
|
1386 | | -# &Log ("Result " . ($result+0) . " \n") ;
|
1387 | | -# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
|
1388 | | -# { unlink $file_out ; }
|
1389 | | -
|
1390 | | -# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
1391 | | -## 0 No error
|
1392 | | -## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
|
1393 | | -## 2 Fatal error
|
1394 | | -## 7 Command line error
|
1395 | | -## 8 Not enough memory for operation
|
1396 | | -## 255 User stopped the process
|
1397 | | -#}
|
1398 | | -
|
1399 | | -
|
1400 | | -#sub RecompactVisitorStats3
|
1401 | | -#{
|
1402 | | -## http://www.7-zip.org/7z.html
|
1403 | | -# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
1404 | | -# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
1405 | | -# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
1406 | | -# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
1407 | | -## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
|
1408 | | -
|
1409 | | -# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
1410 | | -# binmode $in_hour ;
|
1411 | | -## $out_day = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
1412 | | -## binmode $out_day ;
|
1413 | | -# open OUT, ">", $file_out ;
|
1414 | | -# binmode OUT ;
|
1415 | | -## open LOG, ">", $file_log ;
|
1416 | | -## binmode LOG ;
|
1417 | | -
|
1418 | | -# my ($title, $title2) ;
|
1419 | | -# while ($line = <$in_hour>)
|
1420 | | -# {
|
1421 | | -# chomp ($line) ;
|
1422 | | -# ($lang,$title,$counts) = split (" ", $line) ;
|
1423 | | -
|
1424 | | -# if ($lang ne $lang_prev) { print "$lang\n" ; }
|
1425 | | -## last if $lang gt "fs" ;
|
1426 | | -# $lang_prev = $lang ;
|
1427 | | -
|
1428 | | -# # test pagecounts-20080701_fd.gz
|
1429 | | -# # all records 424 Mib compressed (1984 uncompressed)
|
1430 | | -# # count > 1 212 Mib compressed ( 733 uncompressed)
|
1431 | | -# # count > 2 169 Mib compressed ( 551 uncompressed)
|
1432 | | -# next if $counts <= 1 ;
|
1433 | | -
|
1434 | | -## next if $lang !~ /^(?:ar|fr)/ ;
|
1435 | | -
|
1436 | | -#if ($false)
|
1437 | | -#{
|
1438 | | -# $title1b = $title ;
|
1439 | | -# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
|
1440 | | -# $title1b =~ s/\%28/(/g ;
|
1441 | | -# $title1b =~ s/\%29/)/g ;
|
1442 | | -# $title1b =~ s/\%3A/:/g ;
|
1443 | | -# $title1b =~ s/\%2F/\//g ;
|
1444 | | -# $title1b =~ s/\%5C/\\/g ;
|
1445 | | -# $title1b =~ s/\%2A/*/g ;
|
1446 | | -# $title1b =~ s/\%21/!/g ;
|
1447 | | -# $title1b =~ s/\%5F/_/g ;
|
1448 | | -# $title1b =~ s/\%2C/,/g ;
|
1449 | | -# $title1b =~ s/\%2E/./g ;
|
1450 | | -# $title1b =~ s/\%2D/-/g ;
|
1451 | | -# $title1b =~ s/\%25/%/g ;
|
1452 | | -# $title1b =~ s/\%7E/~/g ;
|
1453 | | -# $title1b =~ s/\%27/'/g ;
|
1454 | | -# $title1b =~ s/\%3D/=/g ;
|
1455 | | -# $title1b =~ s/\%26/&/g ;
|
1456 | | -# $title1b =~ s/\%3B/;/g ;
|
1457 | | -# $title1b =~ s/\%3F/?/g ;
|
1458 | | -# $title1b =~ s/\%2B/+/g ;
|
1459 | | -# $title2 = $title1b ;
|
1460 | | -# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
|
1461 | | -
|
1462 | | -# if ($title1b ne $title2) # if changed anything at all
|
1463 | | -# {
|
1464 | | -# $title3 = uri_escape ($title2) ;
|
1465 | | -# $title3 =~ s/\%28/(/g ;
|
1466 | | -# $title3 =~ s/\%29/)/g ;
|
1467 | | -# $title3 =~ s/\%3A/:/g ;
|
1468 | | -# $title3 =~ s/\%2F/\//g ;
|
1469 | | -# $title3 =~ s/\%5C/\\/g ;
|
1470 | | -# $title3 =~ s/\%2A/\*/g ;
|
1471 | | -# $title3 =~ s/\%21/\!/g ;
|
1472 | | -# $title3 =~ s/\%5F/\_/g ;
|
1473 | | -# $title3 =~ s/\%2C/,/g ;
|
1474 | | -# $title3 =~ s/\%2E/./g ;
|
1475 | | -# $title3 =~ s/\%2D/-/g ;
|
1476 | | -# $title3 =~ s/\%25/%/g ;
|
1477 | | -# $title3 =~ s/\%7E/~/g ;
|
1478 | | -# $title3 =~ s/\%27/'/g ;
|
1479 | | -# $title3 =~ s/\%3D/=/g ;
|
1480 | | -# $title3 =~ s/\%26/&/g ;
|
1481 | | -# $title3 =~ s/\%3B/;/g ;
|
1482 | | -# $title3 =~ s/\%3F/?/g ;
|
1483 | | -# $title3 =~ s/\%2B/+/g ;
|
1484 | | -
|
1485 | | -# if ($title1b eq $title3) # process reversible ?
|
1486 | | -# {
|
1487 | | -# $y++ ;
|
1488 | | -# $title2 =~ s/\s/_/g;
|
1489 | | -# $title = $title2 ;
|
1490 | | -# }
|
1491 | | -# else
|
1492 | | -# {
|
1493 | | -# $n++ ;
|
1494 | | -# print "Y $y N $n\n$title\n$title3\n\n" ;
|
1495 | | -# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
|
1496 | | -# }
|
1497 | | -# }
|
1498 | | -#}
|
1499 | | -# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
1500 | | -# $title =~ s/\s/_/g;
|
1501 | | -# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
1502 | | -# $lang =~ s/\.y/2/ ;
|
1503 | | -
|
1504 | | -# # print $out_day "$lang $title $counts\n" ;
|
1505 | | -# print OUT "$lang $title $counts\n" ;
|
1506 | | -# }
|
1507 | | -
|
1508 | | -# print "Close files\n" ;
|
1509 | | -# $in_hour -> close () ;
|
1510 | | -## $out_day -> close () ;
|
1511 | | -# close (OUT) ;
|
1512 | | -# $result = `$path_7z a $file_out $file_txt` ;
|
1513 | | -# print $result ;
|
1514 | | -#}
|
1515 | | -
|
1516 | | -
|
1517 | | -
|
1518 | | -# test (partial) reversibility of process
|
1519 | | -#sub UncompactVisitorStats
|
1520 | | -#{
|
1521 | | -# my $file_in = "out/2009-03/pagecounts-20090301_fdt" ;
|
1522 | | -# my $dir_out = "out" ;
|
1523 | | -# # $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
1524 | | -# open $in_hour, '<', $file_in ;
|
1525 | | -# binmode $in_hour ;
|
1526 | | -
|
1527 | | -# for ($h=0 ; $h<=23 ; $h++)
|
1528 | | -# {
|
1529 | | -# $time = sprintf ("%02d",$h) . "0000" ;
|
1530 | | -## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
|
1531 | | -# $file_out = "$dir_out/pagecounts-20090301-$time" ;
|
1532 | | -# open $out_day [$h], '>', $file_out ;
|
1533 | | -## $out_day [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
|
1534 | | -# binmode $out_day [$h] ;
|
1535 | | -# }
|
1536 | | -
|
1537 | | -# while ($line = <$in_hour>)
|
1538 | | -# {
|
1539 | | -# next if $line =~ /^#/ ;
|
1540 | | -# next if $line =~ /^@/ ;
|
1541 | | -# chomp ($line) ;
|
1542 | | -## print "$line\n" ;
|
1543 | | -# if ($lines++ > 10000) { exit ; }
|
1544 | | -# ($lang,$title,$counts) = split (" ", $line) ;
|
1545 | | -# $lang =~ s/\.z// ;
|
1546 | | -# $lang =~ s/\.y/2/ ;
|
1547 | | -# $counts =~ s/^\d+// ; # remove (redundant) preceding total
|
1548 | | -# while ($counts ne "")
|
1549 | | -# {
|
1550 | | -# $letter = substr ($counts,0,1) ;
|
1551 | | -# $counts = substr ($counts,1) ;
|
1552 | | -# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
|
1553 | | -# $counts =~ s/^\d+(.*)$/$1/ ;
|
1554 | | -# $h = ord ($letter) - ord ('A') ;
|
1555 | | -# $file = $out_day [$h] ;
|
1556 | | -# $writes {$h} ++ ;
|
1557 | | -# print $file "$lang $title $count\n" ;
|
1558 | | -# }
|
1559 | | -
|
1560 | | -# }
|
1561 | | -
|
1562 | | -# for ($h=0 ; $h<=23 ; $h++)
|
1563 | | -# {
|
1564 | | -## $out_day [$h] -> close () ;
|
1565 | | -# close $out_day [$h] ;
|
1566 | | -# }
|
1567 | | -#}
|
1568 | | -
|
1569 | | -
|