r82396 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r82395‎ | r82396 | r82397 >
Date:13:34, 18 February 2011
Author:ezachte
Status:deferred
Tags:
Comment:
Collect and process hourly page view files from http://dammit.lt/wikistats
The 'cellar' folder contains old special-purpose code, perhaps still of value (undocumented)
Modified paths:
  • /trunk/wikistats/dammit.lt (added) (history)
  • /trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitCompactHourlyPageCountFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitReportPageRequestsStaffWikis.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitSyncFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitCollectArticleNames.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitCollectViewsOneArticle.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitFilesFindMisses.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitFilterDailyPagecountFilesPerLanguage.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPageViewsPerSpecialSearch.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForBanners.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForFundraiser.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPrepCollectHarvestInterwikiLinks.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitScanCompactedFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_compact_monthly.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_filter.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_report.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_scan.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_sync.sh (added) (history)

Diff [purge]

Index: trunk/wikistats/dammit.lt/dammit_report.sh
@@ -0,0 +1 @@
 2+perl /a/dammit.lt/DammitReportPageRequestsStaffWikis.pl
Property changes on: trunk/wikistats/dammit.lt/dammit_report.sh
___________________________________________________________________
Added: svn:eol-style
13 + native
Index: trunk/wikistats/dammit.lt/dammit_sync.sh
@@ -0,0 +1,2 @@
 2+perl /a/dammit.lt/DammitSyncFiles.pl
 3+#perl /home/ezachte/wikistats/WikiCountsJobProgress.pl >> /a/dammit.lt/cron.txt
Property changes on: trunk/wikistats/dammit.lt/dammit_sync.sh
___________________________________________________________________
Added: svn:eol-style
14 + native
Index: trunk/wikistats/dammit.lt/cellar/!DammitPageViewsPerSpecialSearch.pl
@@ -0,0 +1,263 @@
#!/usr/bin/perl

# !DammitPageViewsPerSpecialSearch.pl
# Tallies Special:Search page views for the English Wikipedia from the
# compacted dammit.lt daily pagecount archives (see &ScanFiles below).
# Output: /a/dammit.lt/SpecialSearch.csv (one row per day) and
# SpecialSearch.txt (raw per-title counts for one sample day).

 use lib "/home/ezachte/lib" ;
 use EzLib ;
 $trace_on_exit = $true ;

 use CGI::Carp qw(fatalsToBrowser);
 use Time::Local ;
 use Net::Domain qw (hostname);

 # NOTE(review): $month and $language are not assigned anywhere before this
 # point, so the name interpolates empty strings; the variable is only used
 # in a trace message inside &ScanFiles -- verify before reuse.
 $file_csv_pagecounts = "pagecounts-$month-$language\_fdt" ;

 open CSV, '>', "/a/dammit.lt/SpecialSearch.csv" ;
 open TXT, '>', "/a/dammit.lt/SpecialSearch.txt" ;

 $timestart = time ;

 &ScanFiles ;
 print "\n\nReady\n\n" ;
 exit ;
 22+
# Walk the daily compacted pagecount archives from 2009-10 through 2010-05
# and tally Special:Search page views for the English Wikipedia ('en.z').
# Per day one csv row is written: date, project, generic / specific / other
# counts. For one sample day (2010-02-01) raw per-title counts also go to TXT.
sub ScanFiles
{
 print "ScanFiles\n" ;
 # NOTE(review): $language, $dir_out and $file_csv_pagecounts are never
 # assigned meaningful values in this script -- this trace line is stale.
 print "Filter view counts for $language to $dir_out/$file_csv_pagecounts\n\n" ;

 $year = 2009 ;
 $month = 10 ;
 while ($year == 2009 || ($year == 2010 && $month <= 5))
 {

 for ($day = 1 ; $day <= 31 ; $day++)
 {

 $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 $yyyymmdd = sprintf ("%04d%02d%02d", $year, $month, $day) ;

 $file_pagecounts = "/a/dammit.lt/pagecounts/$yyyymm/pagecounts-${yyyymmdd}_h.bz2" ;

 # Days past the month's end simply yield a missing file and are skipped.
 if (! -e $file_pagecounts)
 { print "Not found: $file_pagecounts\n" ; next ; }

 print ddhhmmss (time,"%d:%02d:%02d") . "\nRead $file_pagecounts\n\n" ;

 # NOTE(review): '|| die' binds to the command string (always true), so a
 # failed open is never detected here; low-precedence 'or die' was intended.
 if ($file_pagecounts =~ /.7z$/)
 { open IN, "-|", "./7za e -so \"$file_pagecounts\"" || die ("Input file '$file_pagecounts' could not be opened.") ; }
 elsif ($file_pagecounts =~ /.bz2$/)
 { open IN, "-|", "bzip2 -dc \"$file_pagecounts\"" || die ("Input file '$file_pagecounts' could not be opened.") ; }
 else
 { next ; } # open IN, '<', $file_pagecounts ; }

 $project = "" ;
 while ($line = <IN>)
 {
 next if $line =~ /^#/ ;
 next if $line =~ /^@/ ;

 # Input is sorted by project code; when a new project starts, flush the
 # accumulated totals (only ever non-zero for 'en.z').
 if ($line !~ /$project/)
 {
 if ($project eq 'en.z')
 {
 print CSV "\"=date($year,$month,$day)\",$project2,$generic,$specific,$other\n" ;
 print "\"=date($year,$month,$day)\",$project2,$generic,$specific,$other\n" ;
 $generic = 0 ;
 $specific = 0 ;
 $other = 0 ;
 }
 ($project) = split ' ', $line ; print "$project " ;
 }
 # Skip ahead to the en.z section; stop once past it.
 next if $line lt "en.z " ;
 last if $line gt "en.\xFF" ;

 if ($project eq 'en.z')
 {
 if ($line =~ /Special:Search/i)
 {
 ($project, $title, $counts) = split (' ', $line) ;
 ($project2 = $project) =~ s/\.z// ;
 # Compact counts field starts with the day total; drop the hourly suffix.
 $counts =~ s/^(\d+).*$/$1/ ;
 $title =~ s/,/&comma;/g ;

 if ($yyyymmdd eq '20100201')
 { print TXT "$yyyymmdd,$project2,$counts,$title\n" ; }

 # Special:Search/<subpage> = specific search, bare Special:Search =
 # generic, anything else (e.g. localized variants) = other.
 if ($title =~ /^Special:Search\//i)
 { $specific += $counts ; }
 elsif ($title =~ /^Special:Search/i)
 { $generic += $counts ; }
 else
 { $other += $counts ; }
 }
 }
 }
 close IN ;
 }
 # NOTE(review): OUT is never opened in this sub (CSV and TXT are the open
 # handles), so this close is a no-op -- possibly CSV was meant.
 close OUT ;
 $month ++ ;
 if ($month > 12)
 { $month = 1 ; $year ++ ; }

 }
}
 104+
# Build per-article and per-category page view reports from a filtered
# pagecount file plus an article/category list (WikiStatsArticles.csv).
# NOTE(review): this sub is not invoked from this script (main only calls
# &ScanFiles) and relies on globals ($dir_in, $dir_out, $wikipedia,
# $categoryroot, $month_out, $daysinmonth) that are never set here --
# apparently a leftover from the script this one was derived from.
sub CountArticles
{
 print "CountArticles\n" ;
 if (! -e "$dir_in/$file_csv_pagecounts")
 { print "File not found: $dir_in/$file_csv_pagecounts\n" ; exit ; }

 # Pass 1: total views per normalised, lower-cased title.
 open IN, '<', "$dir_in/$file_csv_pagecounts" ;
 while ($line = <IN>)
 {
 chomp ($line) ;

 ($count,$title) = split (' ', $line,2) ;
# if ($title !~ /Depardieu/) { next ; }
 # Decode %XX url escapes, optionally collapse multi-byte utf-8 to ascii,
 # then decode html numeric entities and common named entities.
 $title =~ s/%([0-9A-F]{2})/chr(hex($1))/ge ;
 if ($unicodetoascii)
 { $title =~ s/([\x80-\xFF]{2,})/&UnicodeToAscii($1)/ge ; }
 $title =~ s/(\&\#\d+\;)/&HtmlToAscii($1)/ge ;
 $title =~ s/\&quot;/'/g ;
 $title =~ s/\&amp;/&/g ;
 $title = lc ($title) ;
# print "X $count $title\n" ;
 $titles {$title} += $count ;
 }
 close IN ;

 # Pass 2: join the article/category list against the collected counts.
 # NOTE(review): this OUT file is reopened (and thus overwritten) under the
 # same name directly after this loop.
 open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByTitle.txt" ;
 open IN, '<', "$dir_out/WikiStatsArticles.csv" ;
 while ($line = <IN>)
 {
 chomp ($line) ;
 ($title,$category) = split (',',$line) ;

# next if $category !~ /politicus/ ;
# next if $category =~ /Nederlands/ ;
# $category =~ s/-politicus// ;

# if ($title !~ /Depardieu/) { next ; }
 $title =~ s/\%2C/,/g ;
 $category =~ s/\%2C/,/g ;
 $title =~ s/\s/_/g ;
 $title =~ s/(\&\#\d+\;)/&HtmlToAscii($1)/ge ;
 $title =~ s/\&quot;/'/g ;
 $title =~ s/\&amp;/&/g ;
 $title_lc = lc ($title) ;
 $count = ($titles {$title_lc}+0) ; # force numeric
# print "Y $count $title_lc\n" ;
 print OUT sprintf ("%5d",$count) . " " . $title . "\n" ;
 # Count each distinct title once even when listed under several categories.
 if ($title ne $title_prev)
 { $articles {$title} += $count ; }
 $title_prev = $title ;
 $categories {$category} += $count ;
 $titlecat {$title} = $category ;
 }
 close IN ;
 close OUT ;

 # Reports: articles by title / by views, categories by title / by views.
 open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByTitle.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $article (sort keys %articles)
# { print OUT sprintf ("%5d",$articles {$article}) . " " . $article . "\n" ; }
 { &Print ($articles {$article}, $article) ; }
 close OUT ;

 open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByViews.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $article (sort {$articles {$b} <=> $articles {$a}} keys %articles)
# { print OUT sprintf ("%5d",$articles {$article}) . " " . $article . "\n" ; }
 { &Print ($articles {$article}, $article) ; }
 close OUT ;

 open OUT, '>', "$dir_out/WikiStatsPageViewsPerCategorySortByTitle.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $category (sort keys %categories)
# { print OUT sprintf ("%5d",$categories {$category}) . " " . $category . "\n" ; }
 { &Print ($categories {$category}, $category) ; }
 close OUT ;

 open OUT, '>', "$dir_out/WikiStatsPageViewsPerCategorySortByViews.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $category (sort {$categories {$b} <=> $categories {$a}} keys %categories)
# { print OUT sprintf ("%5d",$categories {$category}) . " " . $category . "\n" ; }
 { &Print ($categories {$category}, $category) ; }
 close OUT ;

# open OUT, '>', "$dir_out/WikiStatsPageViewsPerPerArticleSortByViewsPvdA.csv" ;
# print OUT "politicus,partij,hits,kleur\n" ;
# foreach $article (sort {$articles {$b} <=> $articles {$a}} keys %articles)
# {
# last if $articles {$article} == 0 ;
# next if $titlecat {$article} !~ /pvda/i ;
# $color = int(rand(255)) ;
# print OUT "$article,${titlecat {$article}},${articles {$article}},$color\n" ;
# }
# close OUT ;

}
 201+
# Write one report line to the currently open OUT handle: monthly count,
# derived per-day average, and the title/category text.
# NOTE(review): relies on the global $daysinmonth being set by the caller;
# it is never assigned in this file, so the division would warn/fail --
# confirm before reviving this cellar script.
sub Print
{
 my $count = shift ;
 my $text = shift ;
 print OUT sprintf ("%5d",$count) . " p/m = " . sprintf ("%4.0f",$count/$daysinmonth) . " p/d : $text\n" ;
}
 208+
# Translates one UTF-8 encoded character into a plain one-byte (Latin-1)
# character when the decoded code point fits in a single byte; otherwise the
# original byte sequence is returned unchanged.
sub UnicodeToAscii {
 my $utf8 = shift ;

 my $lead_ord = ord (substr ($utf8,0,1)) ;

 # Plain ascii lead byte: nothing to decode (will not occur in this script).
 return ($utf8) if $lead_ord < 128 ;

 # Strip the UTF-8 sequence-length marker bits from the lead byte.
 my $codepoint ;
 if    ($lead_ord >= 252) { $codepoint = $lead_ord - 252 ; }
 elsif ($lead_ord >= 248) { $codepoint = $lead_ord - 248 ; }
 elsif ($lead_ord >= 240) { $codepoint = $lead_ord - 240 ; }
 elsif ($lead_ord >= 224) { $codepoint = $lead_ord - 224 ; }
 else                     { $codepoint = $lead_ord - 192 ; }

 # Fold in six payload bits from each continuation byte.
 foreach my $pos (1 .. length ($utf8) - 1)
 { $codepoint = $codepoint * 64 + ord (substr ($utf8, $pos, 1)) - 128 ; }

 return (chr ($codepoint)) if $codepoint < 256 ;

 # Beyond Latin-1: give back the original sequence untouched.
 return ($utf8) ;
}
 237+
# Translates an HTML numeric entity (e.g. '&#233;') into the corresponding
# one-byte character when its value fits in 0..255; larger entities are
# returned unchanged.
sub HtmlToAscii {
 my $entity = shift ;
 (my $digits = $entity) =~ s/[^\d]//g ;
 return ($digits <= 255) ? chr ($digits) : $entity ;
}
 247+
# Print a message to both the screen and the FILE_LOG handle.
# NOTE(review): FILE_LOG is never opened in this script (and &Abort below
# writes to LOG instead) -- the second print is silently discarded.
sub Log
{
 $msg = shift ;
 print $msg ;
 print FILE_LOG $msg ;
}
 254+
# Print an error message to screen and LOG, then terminate the script.
# NOTE(review): LOG is never opened in this script (and &Log above writes to
# FILE_LOG) -- the log print is silently discarded.
sub Abort
{
 $msg = shift ;
 print "Abort script\nError: $msg\n" ;
 print LOG "Abort script\nError: $msg\n" ;
 exit ;
}
 262+
 263+
 264+
Index: trunk/wikistats/dammit.lt/cellar/!DammitCollectViewsOneArticle.pl
@@ -0,0 +1,199 @@
#!/usr/local/bin/perl

# 27 April 2010 renamed from WikiStatsCollectViewsOneArticle.pl

# Collects view counts for one hard-coded article (currently the url-encoded
# Falun Gong title, see &ProcessFile) from hourly or daily pagecount dumps,
# then aggregates the collected lines into per-day totals (&ProcessSelection).

 use CGI qw(:all);
 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
 use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib

 $| = 1; # flush screen output
 $true = 1 ;
 $false = 0 ;
 $mode = "H" ; # "H" = hourly files, "D" = daily files

# $dir0 = "D:/Wikipedia_Visitors/full_day" ;
 $dir0 = "D:/Wikipedia_Visitors" ;
 chdir ($dir0) || die "Cannot chdir to $dir0\n";

# Earlier collection runs (disabled): scan two months of dumps into a raw
# per-hour text file, which &ProcessSelection below then aggregates.
# open TXT, ">", "JoeBiden.txt" ;
# open TXT, ">", "FalungGong.txt" ;
# &ProcessMonth (2008,7) ;
# &ProcessMonth (2008,8) ;
# close TXT ;

 &ProcessSelection ;

 exit ;
 28+
# Aggregate the raw per-hour lines collected earlier (FalungGong.txt) into
# per-day totals and write them as a csv time series (FalungGongTotals.csv).
sub ProcessSelection
{
 # NOTE(review): filehandles given as string literals work without 'use
 # strict', but bareword or lexical handles are the conventional form.
 open "IN", "<", "FalungGong.txt" ;
 open "OUT", ">", "FalungGongTotals.csv" ;
 while ($line = <IN>)
 {
 chomp ($line) ;
 $line =~ s/\s+/ /g ;
 ($timestamp, $project, $count, $title) = split (' ', $line) ;
 # $timestamp =~ s/\d\d\d\d$// ; # discard minutes and seconds
 $timestamp =~ s/\-\d\d\d\d\d\d$// ; # discard hours, minutes and seconds
 # NOTE(review): '@counts_zh {...}' is a one-element hash slice used as an
 # lvalue; it behaves like $counts_zh{...} here but warns under -w and is
 # better written with the $ sigil.
 if ($project eq "zh")
 { @counts_zh {$timestamp} += $count ; }
 else
 { @counts_other {$timestamp} += $count ; }
 }
 close IN ;

# One csv row per day (dd/mm/yyyy) for the Chinese Wikipedia counts;
# %counts_other is only used by the disabled hourly report below.
foreach $date (sort keys %counts_zh)
{
 $year = substr ($date,0,4) ;
 $month = substr ($date,4,2) ;
 $day = substr ($date,6,2) ;
 $timestamp = sprintf ("%02d/%02d/%04d", $day, $month, $year) ;
 print OUT $timestamp . "," . (@counts_zh {$date}) . "\n" ;
}

# Disabled: hourly csv report for Jul-Sep 2008, kept for reference.
if (0)
{
 $month = 7 ;
 for $day (1..31)
 {
 for $hour (0..23)
 {
 $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 7, $day, $hour) ;
 $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 7, 2008, $hour, 0) ;
 print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
 }
 }

 $month = 8 ;
 for $day (1..31)
 {
 for $hour (0..23)
 {
 $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 8, $day, $hour) ;
 $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 8, 2008, $hour, 0) ;
 print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
 }
 }

 $month = 9 ;
 for $day (1..14)
 {
 for $hour (0..23)
 {
 $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 9, $day, $hour) ;
 $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 9, 2008, $hour, 0) ;
 print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
 }
 }
}
 close OUT ;
}
 93+
# Process one month of pagecount dump files.
# Arguments: year (numeric), month (numeric, zero-padded internally).
# Selects files in "$dir0/$year-$month-pagecounts" matching the active global
# $mode ("H" = hourly pagecounts-YYYYMMDD-HHMMSS.gz, "D" = daily *_fd.gz),
# sorts them chronologically and feeds each to &ProcessFile.
sub ProcessMonth
{
 my $year = shift ;
 my $month = sprintf ("%02d", shift) ;

 $dir0 =~ s/[\\\/]$// ;

 $dir_in = "$dir0/$year-$month-pagecounts" ;
 &Log ("Process year $year month $month from '$dir_in'\n") ;
 chdir ($dir_in) || die "Cannot chdir to $dir_in\n";
 local (*DIR);

 opendir (DIR, ".") || die "Cannot open dir '$dir_in'\n";
 @files = () ;
 while ($file_in = readdir (DIR))
 {
 if ($mode eq "H")
 {
 # Hourly archives only; skip everything before 2008-08-16 00:00.
 next if $file_in !~ /^pagecounts-\d{8,8}-\d{6,6}.gz$/ ;
 next if $file_in lt "pagecounts-20080816-000000.gz" ;
 }
 if ($mode eq "D")
 {
 # Full-day archives only.
 next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
 }
 push @files, $file_in ;
 }
 # Bug fix: closedir takes a single argument; the original extra "."
 # argument was a compile-time error ("Too many arguments for closedir").
 closedir (DIR);

 @files = sort {$a cmp $b} @files ;

 foreach $file (@files)
 { &ProcessFile ($file) ; }
}
 137+
# Scan one gzipped pagecount file for the hard-coded article and append all
# matching count lines, tagged with "YYYYMMDD-HHMMSS", to screen and TXT.
# The date/time are sliced from the filename: pagecounts-YYYYMMDD-HHMMSS.gz.
sub ProcessFile
{
 my $file = shift ;
 $date = substr ($file, 11, 8) ;
 $time = substr ($file, 20, 6) ;
 print "ProcessFile ($file)\n" ;

 my $lines ;
 $in_gz = IO::Uncompress::Gunzip->new ($file) or die "IO::Uncompress::Gunzip failed for '$file': $GunzipError\n";
 binmode $in_gz ;
 while ($line = <$in_gz>)
 {
 # if ($line ge "eo")
 # { last ; }
 # if ($line !~ /^en /)
 # { next ; }
 # if ($lines ++ == 0) { print "$line" ; }

# if ($line =~ /sarah.*palin/i)
# if ($line =~ /joe.*biden/i)
 # Url-encoded utf-8 of the tracked article title.
 if ($line =~ / \%E6\%B3\%95\%E8\%BD\%AE\%E5\%8A\%9F /)
 {
 if ($mode eq "H")
 {
 # Hourly file line: 'wiki title views bytes'.
 ($wiki,$title,$views,$bytes) = split (' ', $line) ;
 $line = sprintf ("%-10s", $wiki) . " " . sprintf ("%8d",$views) . " $title\n" ;
 print "$date-$time $line" ;
 print TXT "$date-$time $line" ;
 }
 if ($mode eq "D")
 {
 chomp ($line) ;

 # Daily file: the counts field is the day total followed by per-hour
 # counts, each prefixed by a letter A..X encoding hour 00..23.
 ($wiki,$title,$views_all_day) = split (' ', $line) ;
 $wiki =~ s/\.z// ;
 $wiki =~ s/\.y/2/ ;
 $views_all_day =~ s/^\d+// ; # remove (redundant) preceding total
 while ($views_all_day ne "")
 {
 $letter = substr ($views_all_day,0,1) ;
 $views_all_day = substr ($views_all_day,1) ;
 ($views_one_hour = $views_all_day) =~ s/^(\d+).*$/$1/ ;
 $views_all_day =~ s/^\d+(.*)$/$1/ ;
 # Hour letter -> "HH0000" pseudo time of day.
 $time = sprintf ("%02d",ord ($letter) - ord ('A')) . "0000" ;

 $line = sprintf ("%-10s", $wiki) . " " . sprintf ("%8d",$views_one_hour) . " $title\n" ;
 print "$date-$time $line" ;
 print TXT "$date-$time $line" ;
 }
 }
 }
 }

 $in_gz->close() ;
}
 193+
# Print a message to screen and to the LOG handle.
# NOTE(review): LOG is never opened in this script, so the second print is
# silently discarded.
sub Log
{
 $msg = shift ;
 print $msg ;
 print LOG $msg ;
}
 200+
Index: trunk/wikistats/dammit.lt/cellar/!DammitFilterDailyPagecountFilesPerLanguage.pl
@@ -0,0 +1,156 @@
#!/usr/bin/perl

# 27 April 2010 renamed from WikiStatsFilterCompactedDammitFilesPerLanguage.pl

# Walks the monthly pagecount folders starting 2008-08 and, for each month
# that exists, writes a single-language filtered file via &FilterCounts
# (skipping months whose 7z archive already exists on the bayes server).

 use lib "/home/ezachte/lib" ;
 use EzLib ;
 $trace_on_exit = $true ;

 use CGI::Carp qw(fatalsToBrowser);
 use Time::Local ;
 use Net::Domain qw (hostname);

 $language = "nl" ;
 $wikipedia = "$language.wikipedia.org" ; # read from input

 # Defaults for local runs; on bayes use the shared pagecount tree.
 # NOTE(review): $hostname is not assigned in this file -- presumably EzLib
 # sets it (Net::Domain's hostname() is imported above but never called);
 # verify before relying on the bayes branch.
 $path_in = "." ;
 $path_out = "." ;
 if ($hostname eq "bayes")
 {
 $path_in = "/a/dammit.lt/pagecounts" ;
 $path_out = "/a/dammit.lt/pagecounts/languages/$language.z" ;
 if (! -d $path_out)
 { mkdir $path_out, 0777 ; }
 $path_7za = "/usr/lib/p7zip/7za" ;
 }

 # Iterate over consecutive months until the first missing monthly folder.
 $month = 8 ;
 $year = 2008 ;
 $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 $path_in_monthly = "$path_in/$yyyymm" ;
 while (-d $path_in_monthly)
 {
 print "\nCheck dir $path_in_monthly\n" ;

 $file_filtered = "$path_out/pagecounts-$yyyymm-$language-fdt.txt" ;

 if ($hostname eq "bayes")
 {
 $file_filtered_7z = "$file_filtered.7z" ;

 if (-e $file_filtered_7z)
 { print "File $file_filtered_7z already exists\n" ; }
 else
 { &FilterCounts ($yyyymm, $file_filtered) ; }
 }
 else
 { &FilterCounts ($yyyymm, $file_filtered) ; }

 $month++ ;
 if ($month > 12)
 { $month = 1 ; $year++ ; }
 $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 $path_in_monthly = "$path_in/$yyyymm" ;
 }

 print "\n\nReady\n\n" ;
 exit ;
 59+
# Filter one month of daily compacted pagecount files down to a single
# language project ("$language.z" = that language's Wikipedia) and write the
# result to $file_filtered; on bayes the result is then 7-zipped and the
# plain file removed on success.
# Arguments: $yyyymm ("YYYY-MM"), $file_filtered (output path).
sub FilterCounts
{
 my ($yyyymm, $file_filtered) = @_ ;
 ($yyyymm2 = $yyyymm) =~ s/-// ;

 # Robustness fix: the original open was unchecked.
 open OUT, '>', $file_filtered or die "Output file '$file_filtered' could not be opened." ;

 # File header: explains the compact counts format to downstream users.
 print OUT "# Counts for articles with less than a few requests per full day (before April 2010 five per day, from then on two per day) were not preserved in daily archives and hence are neither included here\n" ;
 print OUT "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
 print OUT "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
 print OUT "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
 print OUT "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
 print OUT "# Please reconcile with real namespace name strings later\n" ;
 print OUT "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
 print OUT "# Page titles are shown unmodified (preserves sort sequence)\n" ;


 for ($day = 1 ; $day <= 31 ; $day++)
 {
 $yyyymmdd = "$yyyymm-" . sprintf ("%02d", $day) ;

 $file_pagecounts = "$path_in/$yyyymm/pagecounts-$yyyymm2" . sprintf ("%02d", $day) . "_fdt" ;
 if ($hostname eq "bayes")
 { $file_pagecounts .= ".7z" ; }


 # Missing days are recorded in the output and skipped.
 if (! -e $file_pagecounts)
 {
 print "\nNot found: $file_pagecounts\n" ;
 print OUT "# $yyyymmdd missing!\n" ;
 next ;
 }

 print "Read $file_pagecounts\n" ;
 print OUT "# $yyyymmdd\n" ;

 # Bug fix: '|| die' bound to the command string (always true), so a
 # failed open was never detected; low-precedence 'or' checks open itself.
 if ($hostname eq "bayes")
 { open IN, "-|", "./7za e -so \"$file_pagecounts\"" or die ("Input file '" . $file_pagecounts . "' could not be opened.") ; }
 else
 { open IN, '<', $file_pagecounts ; }

 while ($line = <IN>)
 {
 $ch = substr ($line,0,1) ;

 next if $ch eq '#' ; # comments

 if ($ch eq '@') # summary per language project
 {
 if ($line =~ /^\@ $language\.z /o)
 { print OUT $line ; }
 next ;
 }

 # Input is sorted by project code: skip until "$language.z" is reached,
 # stop at the first line past that section.
 next if $line lt "$language.z" ;
 # Bug fix: the dot was unescaped and could match any character.
 last if $line !~ /$language\.z / ;

 ($project, $title, $counts) = split (' ', $line) ;
 print OUT "$title $counts\n" ;
 }
 close IN ;
 }
 close OUT ;

 # Compress the filtered file; remove the plain copy only when 7za reports
 # success, otherwise remove the (possibly partial) archive.
 $cmd = "$path_7za a $file_filtered.7z $file_filtered" ;
 $result = `$cmd` ;

 if ($result =~ /Everything is Ok/s)
 {
 $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
 unlink $file_filtered ;
 }
 else
 {
 print "Delete $file_filtered.7z\n" ;
 unlink "$file_filtered.7z" ;
 }

 print "$cmd -> $result\n" ;
}
 142+
# Print a trace message to the screen (this script keeps no log file).
sub Log
{
 $msg = shift ;
 print $msg ;
}
 148+
# Print an error message and terminate the script.
sub Abort
{
 $msg = shift ;
 print "Abort script\nError: $msg\n" ;
 exit ;
}
 155+
 156+
 157+
Index: trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForFundraiser.pl
@@ -0,0 +1,86 @@
#!/usr/bin/perl

# Patch hourly projectcounts files using per-hour average deltas read from
# AllSquids.csv (rows: date,hour,events,avg_delta). Rows from 2010 whose
# average delta exceeds 1005 are corrected via &Patch / &PatchFile; the
# factor applied there is avg_delta/1000.

$| = 1; # flush screen output

open IN, '<', 'DammitPatchProjectcountsForFundraiser/AllSquids.csv' ;
open LOG, '>', 'DammitPatchProjectcountsForFundraiser/Log.txt' ;

chdir ("DammitPatchProjectcountsForFundraiser") || die "Cannot chdir to DammitPatchProjectcountsForFundraiser\n" ;

while ($line = <IN>)
{
 chomp $line ;

 # Skip comment/summary rows and anything not from 2010.
 next if $line =~ /[*]/ ;
 next if $line !~ /^2010/ ;

 ($date,$hour,$events,$avg_delta) = split (',', $line) ;

 next if $avg_delta <= 1005 ; # normally projectcounts also miss a few hits, overcorrecting would skew trends
 &Patch ($date, $hour, $avg_delta) ;
}

print "\n\nReady\n\n" ;
exit ;
 26+
# Locate the projectcounts file for the given date and hour and patch it.
# Tries the -HH0000 timestamp suffix first, then -HH0001 as fallback
# (presumably for files stamped one second late -- TODO confirm); terminates
# the whole script when neither file exists.
sub Patch
{
 ($date,$hour,$avg_delta) = @_ ;

 $date =~ s/-//g ;
 $file = "projectcounts-$date-" . sprintf ("%02d",$hour) . "0000" ;

 if (! -e $file)
 {
 $file = "projectcounts-$date-" . sprintf ("%02d",$hour) . "0001" ;
 if (! -e $file)
 {
 print "File '$file' missing!\n" ;
 exit ;
 }
 }
 &PatchFile ($file, $avg_delta) ;
}
 45+
# Scale the per-project view counts in one projectcounts file by
# $avg_delta/1000, rewriting the file in place. Only lines with a positive
# bytes field are scaled; their bytes field is set to 1 afterwards.
# Arguments: $file (projectcounts filename), $avg_delta (average delta,
# 1000 = no correction).
sub PatchFile
{
 my ($file,$avg_delta) = @_ ;
 my $line ;
 $correction = $avg_delta / 1000 ;
 print "Patch file $file: avg delta $avg_delta -> correction $correction\n" ;

 # Bug fix: '||' bound to $file (always true), so a failed open went
 # undetected; low-precedence 'or' checks the return value of open itself.
 open PROJECTFILE, '<', $file or die "Could not open '$file'\n" ;

 undef @projectfile ;
 $file_changed = 0 ;
 while ($line = <PROJECTFILE>)
 {
 chomp $line ;
 # Line layout: 'project - count bytes'.
 ($project,$dash,$count,$bytes) = split (' ', $line) ;

 if ($bytes > 0)
 {
 # Round the corrected count; bytes field becomes 1 on patched lines.
 $count = sprintf ("%.0f", $correction * $count) ;
 $line = "$project $dash $count 1" ;
 }
 push @projectfile, "$line\n" ;
 }

 close PROJECTFILE ;

 # Write the patched content back over the original file.
 open PROJECTFILE, '>', $file or die "Could not open '$file'\n" ;
 print PROJECTFILE @projectfile ;
 close PROJECTFILE ;
}
 78+
# Print a message to both the screen and the LOG file.
sub Log
{
 my $msg = shift ;
 print $msg ;
 print LOG $msg ;
}
 85+
 86+
 87+
Index: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.pl
@@ -0,0 +1,44 @@
#!/usr/local/bin/perl
use CGI qw(:all);

# Rank help pages (Help:/Hilfe:/Aide: titles) by view count per project,
# reading one daily compacted pagecount file and writing the top ~50 titles
# per project to !DammitRankSpecialPages.txt.

 open IN, '<', "pagecounts-20090301_fdt" ;
 open OUT, '>', "!DammitRankSpecialPages.txt" ;
 $projprev = "" ;
 while ($line = <IN>)
 {
 if ($line =~ /^#/) { next ; }
 if ($line =~ /^@/) { next ; }
 # if (($line !~ / Wikipedia\:/) && ($line !~ / Help\:/) && ($line !~ / Hilfe\:/) && ($line !~ / Wikipédia\:/) && ($line !~ / Aide\:/) )
 if (($line !~ / Help\:/) && ($line !~ / Hilfe\:/) && ($line !~ / Aide\:/))
 { next ; }

 chomp ($line) ;
 ($project, $title, $counts) = split (' ', $line) ;
 # Map the dammit.lt project suffix to a readable 'family:language' label.
 $project =~ s/^([^\.]+)\.z/wikipedia:$1/ ;
 $project =~ s/^([^\.]+)\.b/wikibooks:$1/ ;
 $project =~ s/^([^\.]+)\.d/wiktionary:$1/ ; # dictionaire
 $project =~ s/^([^\.]+)\.m/wikimedia:$1/ ;
 $project =~ s/^([^\.]+)\.n/wikinews:$1/ ;
 $project =~ s/^([^\.]+)\.q/wikiquote:$1/ ;
 $project =~ s/^([^\.]+)\.s/wikisource:$1/ ;
 $project =~ s/^([^\.]+)\.v/wikiversity:$1/ ;
 $project =~ s/^([^\.]+)\.x/wikispecial:$1/ ;
 # When a new project starts, flush the ranked counts of the previous one.
 # NOTE(review): the counts of the very last project in the file are never
 # flushed -- a final flush after the loop is missing.
 if ($project ne $projprev)
 {
 $rows = 0 ;
 foreach $key (sort {$counts {$b} <=> $counts {$a}} keys %counts)
 {
 print OUT sprintf ("%8d", $counts {$key} ) . ": $key\n" ;
 if ($rows++ > 50)
 { last ;}
 }
 undef %counts ;
 }
 $projprev = $project ;

 # Keep only the day total from the compact counts field.
 $counts =~ s/^(\d+).*$/$1/ ;
 # NOTE(review): one-element hash slice; '$counts{...}' is meant.
 @counts {"$project $title"} += $counts ;
 }
 43+
 44+
 45+
Index: trunk/wikistats/dammit.lt/cellar/!DammitCollectArticleNames.pl
@@ -0,0 +1,152 @@
#!/usr/local/bin/perl

# 27 April 2010 renamed from WikiStatsCollectArticleNames.pl

# Extracts all page titles from the latest completed xml dump of a project.
# Usage: -i dump_root -o out_folder -p project [-m wp|wb|wk|wn|wq|ws|wx|wv]

use CGI qw(:all);
use Time::Local ;
use Getopt::Std ;

 &ParseArguments ;
 $dumpfile = &FindDumpFile ;
 &ProcessFile ($dumpfile, "$path_out/$mode\_$project.txt") ;
 print "\n\nReady\n\n" ;
 exit ;
 15+
# Parse command line options:
#   -i path   input folder with xml dump folders (required)
#   -o path   output folder (required)
#   -p name   project/database name, e.g. 'enwiki'
#   -m mode   project family code (wp/wb/wk/wn/wq/ws/wx/wv), default wp
# Derives global $language from $project by stripping the family suffix.
sub ParseArguments
{
 my %options ;
 getopt ("iomp", \%options) ;

 # Bug fix: hash elements are accessed as $options{...}, not as slices;
 # 'defined(@slice)' is deprecated and fatal in modern perls.
 &Abort ("Specify input folder for xml dump files as: -i path") if (! defined ($options {"i"})) ;
 &Abort ("Specify output folder as: -o path") if (! defined ($options {"o"})) ;

 $path_in = $options {"i"} ;
 $path_out = $options {"o"} ;
 $project = $options {"p"} ;
 $mode = $options {"m"} ;

 $language = $project ;
 $language_ = $language ;
 $language_ =~ s/-/_/g ;

 if ($mode eq "")
 { $mode = "wp" ; }
 # Bug fix: the original called lowercase 'abort', which is not defined
 # anywhere; the sub is &Abort.
 if ($mode !~ /^(?:wb|wk|wn|wp|wq|ws|wx|wv)$/)
 { &Abort ("Specify mode as: -m [wb|wk|wn|wp|wq|ws|wx|wv]\n(wp=wikipedia (default), wb=wikibooks, wk=wiktionary, wn=wikinews, wq=wikiquote, ws=wikisource, wx=wikispecial, wv=wikiversity)") ; }

 &Abort ("Project $project is skipped: 'mania' and/or 'team' in the name") if ($project =~ /(?:mania|team)/i) ;

 # Strip the project family suffix to obtain the bare language code.
 if ($project =~ /wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$/i)
 {
 $project_suffix = $project ;
 $project_suffix =~ s/wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$// ;
 }
 $language =~ s/wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$// ;

 if ($project =~ /wiki$/i)
 {
 $project_suffix = $project ;
 $project_suffix =~ s/wiki$// ;
 }
 $language =~ s/wiki$// ;

 &Log ("Project '$project' -> language '$language'\n\n") ;
}
 56+
# Find the most recent completed dump folder under $path_in (folders named
# YYYYMMDD) and derive the pages-meta-current dump file name from it.
# A folder qualifies when its status.html reports 'dump complete' and its
# index.html does not report a failed full-history pass.
# Side effects: appends the chosen folder to $path_in, sets $dumpdate.
# Returns the full path of the bz2 dump file.
sub FindDumpFile
{
 my ($dumpdir,$dir,$file,$scandir,$status) ;

 @files = glob "$path_in/*" ;

 &Log ("Find latest valid dump dir in $path_in ->\n\n") ;
 foreach $file (@files)
 {
 # Only 8-digit (date-named) sub folders qualify.
 if ($file !~ /\/\d{8,8}$/)
 { next ; }
 if (! -d $file)
 { next ; }

 ($dir = $file) =~ s/.*?\/(\d{8,8})/$1/ ;
 $scandir = "$path_in/$dir" ;
 if (! -e "$scandir/status.html")
 { &Log ("$scandir/status.html not found\n") ; }
 elsif (! -e "$scandir/index.html")
 { &Log ("$scandir/index.html not found\n") ; }
 else
 {
 # Classify the dump from the first line of status.html.
 open STATUS, '<', "$scandir/status.html" ;
 $line = <STATUS> ;
 chomp $line ;
 close STATUS ;
 $status = "undetermined: $line" ;
 if ($line =~ /dump complete/i)
 { $status = "dump complete" ; }
 elsif ($line =~ /dump aborted/i)
 { $status = "dump aborted" ; }
 elsif ($line =~ /dump in progress/i)
 { $status = "dump in progress" ; }
 # Keep the newest complete folder; string compare works for YYYYMMDD.
 if ($dumpdir lt $dir)
 {
 if ($status eq "dump complete")
 {
 # Demote dumps whose full-history pass failed.
 open INDEX, '<', "$scandir/index.html" ;
 while ($line = <INDEX>)
 {
 if ($line =~ /failed.*?All pages with complete.*?edit history/i)
 {
 $status = "dump aborted (dump failed)" ;
 last ;
 }
 }
 close INDEX ;
 }
 if ($status eq "dump complete")
 { $dumpdir = $dir ; }
 }
 &Log ("$dir: $status\n") ;
 }
 }
 if ($dumpdir eq "")
 { &Abort ("No valid dump dir found\n") ; }

 $path_in .= "/$dumpdir/" ;
 &Log ("\nDump dir -> $path_in\n") ;
 $dumpdate = $dumpdir ;

 $dumpfile = "$path_in/$project-$dumpdate-pages-meta-current.xml.bz2" ;
 &Log ("\nFile in $dumpfile\n") ;
 return ($dumpfile) ;
}
 122+
# Extract all page titles from a pages-meta-current xml dump.
# Arguments: $file_in (bzip2-compressed xml dump), $file_out (plain text,
# one title per line).
sub ProcessFile
{
 my $file_in = shift ;
 my $file_out = shift ;
 print "File out $file_out\n" ;
 # Bug fix: '||' bound to the last open argument (always true) so failures
 # went undetected, and the called sub is &Abort, not undefined 'abort';
 # low-precedence 'or' checks the result of open itself.
 open FILE_OUT, '>', $file_out or &Abort ("Output file '" . $file_out . "' could not be opened.") ;
 open FILE_IN, "-|", "bzip2 -dc \"$file_in\"" or &Abort ("Input file '" . $file_in . "' could not be opened.") ;
 while ($line = <FILE_IN>)
 {
 # The substitution is used only for its /e side effect: each
 # <title>...</title> payload is printed to FILE_OUT.
 # $line =~ s/<title>([^<]*)<\/title>/print FILE_OUT "$1\n", print "$1\n"/ge ;
 $line =~ s/<title>([^<]*)<\/title>/print FILE_OUT "$1\n"/ge ;
 }
 close FILE_IN ;
 # Bug fix: FILE_OUT was never closed, risking lost buffered output.
 close FILE_OUT ;
}
 137+
# Print a trace message to the screen (log file output disabled).
sub Log
{
 $msg = shift ;
 print $msg ;
# print LOG $msg ;
}
 144+
# Print an error message and terminate the script (log file output disabled).
sub Abort
{
 $msg = shift ;
 print "Abort script\nError: $msg\n" ;
# print LOG "Abort script\nError: $msg\n" ;
 exit ;
}
 152+
 153+
Index: trunk/wikistats/dammit.lt/cellar/!DammitPrepCollectHarvestInterwikiLinks.pl
@@ -0,0 +1,40 @@
#!/usr/bin/perl

# Prepare a grep pattern for harvesting interwiki links: parse a saved
# article page (index.php), collect the interwiki language links per title,
# then print one pattern line per (language, title) pair -- or a single
# catch-all line for titles linked from more than 10 languages.

open IN, '<', 'index.php' ;

while ($line = <IN>)
{
 if ($line =~ /class=\"interwiki/)
 {
 chomp ($line) ;
 # Language code from the 'interwiki-xx' class attribute.
 $lang = $line ;
 $lang =~ s/^.*?interwiki-(\w+).*$/$1/ ;
 # Title = last path component of the link target.
 $title = $line ;
 $title =~ s/^.*?href=\"([^\"]+)\".*$/$1/ ;
 $title =~ s/^.*\/([^\/]+)$/$1/ ;
# print "[$lang] $title\n" ;
 # NOTE(review): '@languages {...}' / '@langcnt {...}' are one-element
 # hash slices used as lvalues; they work here but the $ sigil is meant.
 @languages {$title} .= "$lang," ;
 @langcnt {$title}++ ;
 }
}
print "\n\n\n" ;

# Emit titles by descending number of linking languages.
foreach $title (sort {$langcnt {$b} <=> $langcnt {$a}} keys %langcnt)
{
 $count = $langcnt {$title} ;
 if ($count > 10)
 { $pattern .= "^$title\n" ; }
 else
 {
 $langlist = $languages {$title} ;
 @langs = split (',', $langlist) ;
 foreach $lang (@langs)
 {
 print "$lang $title\n" ;
 $pattern .= "^$lang\.z $title\n"
 }
 }
}

print "\n\nPATTERN:\n$pattern\n" ;
 41+
Index: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt
@@ -0,0 +1,576 @@
 2+ 14: wikipedia:als Hilfe:Neue_Seite_anlegen
 3+ 33: wikipedia:am Help:Contents
 4+ 68: wikipedia:ang Help:Innung
 5+ 10: wikipedia:arc Help:Contents
 6+ 31: wikipedia:ay Help:Contents
 7+ 12: wikipedia:bar Hilfe:Hilfe
 8+ 10: wikiversity:beta Help:Contents
 9+ 11: wikipedia:bo Help:Contents
 10+ 10: wikipedia:chr Help:Contents
 11+ 11: wikipedia:co Help:Contents
 12+ 993: wikimedia:commons Help:Contents
 13+ 86: wikimedia:commons Help:Inkscape
 14+ 30: wikimedia:commons Help:SVG
 15+ 23: wikimedia:commons Help:Creating_a_DjVu_file
 16+ 21: wikimedia:commons Help:Sommaire
 17+ 20: wikimedia:commons Help:Converting_video
 18+ 18: wikimedia:commons Hilfe:%C3%9Cbersicht
 19+ 17: wikimedia:commons Help:Scanning
 20+ 16: wikimedia:commons Help:%E7%9B%AE%E6%AC%A1
 21+ 15: wikimedia:commons Help:%C3%9Cbersicht
 22+ 14: wikimedia:commons Help:Mass_deletion_request
 23+ 13: wikimedia:commons Help:Zoomable_images
 24+ 12: wikimedia:commons Help:Logging_in
 25+ 11: wikimedia:commons Help:Mpeg2dv.sh
 26+ 36: wikibooks:de Hilfe:Sammlungen
 27+ 18: wikibooks:de Hilfe:Suche
 28+ 18: wikibooks:de Hilfe:So_schreibe_ich_gute_B%C3%BCcher
 29+ 11: wikibooks:de Hilfe:Erste_Schritte_auf_der_Spielwiese
 30+ 10: wikibooks:de Hilfe:Urheberrechte_beachten
 31+ 10: wikibooks:de Hilfe:Wikibook_lokal_speichern
 32+ 811: wiktionary:de Hilfe:H%C3%B6rbeispiele
 33+ 197: wiktionary:de Hilfe:Wortart
 34+ 164: wiktionary:de Hilfe:IPA
 35+ 63: wiktionary:de Hilfe:Nominativ
 36+ 61: wiktionary:de Hilfe:Sonderzeichen/Tabelle
 37+ 35: wiktionary:de Hilfe:H%C3%A4ufig_gestellte_Fragen
 38+ 35: wiktionary:de Hilfe:Genitiv
 39+ 34: wiktionary:de Hilfe:Plural
 40+ 33: wiktionary:de Hilfe:Singular
 41+ 26: wiktionary:de Hilfe:Pr%C3%A4sens
 42+ 26: wiktionary:de Hilfe:Akkusativ
 43+ 25: wiktionary:de Hilfe:Flexionstabellen_(Altgriechisch)
 44+ 25: wiktionary:de Hilfe:Dativ
 45+ 24: wiktionary:de Hilfe:Pr%C3%A4teritum
 46+ 23: wiktionary:de Hilfe:Suche
 47+ 21: wiktionary:de Hilfe:Konjunktiv
 48+ 19: wiktionary:de Hilfe:Sonderzeichen
 49+ 18: wiktionary:de Hilfe:Flexionstabellen_(Franz%C3%B6sisch)
 50+ 15: wiktionary:de Hilfe:H%C3%B6rbeispiele/Liste
 51+ 15: wiktionary:de Hilfe:Flexionstabellen_(Lateinisch)
 52+ 15: wiktionary:de Hilfe:Lautschrift
 53+ 14: wiktionary:de Hilfe:Positiv
 54+ 14: wiktionary:de Hilfe:Kasus
 55+ 13: wiktionary:de Hilfe:Imperativ
 56+ 13: wiktionary:de Hilfe:Flexionstabellen
 57+ 13: wiktionary:de Hilfe:Flexionstabellen_(Spanisch)
 58+ 12: wiktionary:de Hilfe:Partizip
 59+ 12: wiktionary:de Hilfe:Hinweise_f%C3%BCr_Leser
 60+ 10: wiktionary:de Hilfe:Komparativ
 61+ 19: wikinews:de Hilfe:Erste_Schritte
 62+ 18: wikinews:de Hilfe:%C3%9Cbersicht
 63+ 14: wikinews:de Hilfe:Zweite_Schritte
 64+ 10: wikinews:de Hilfe:Quellenverzeichnis
 65+ 11: wikiquote:de Hilfe:Erste_Schritte
 66+ 20: wikisource:de Hilfe:Bearbeitungsstand
 67+ 18: wikisource:de Hilfe:B%C3%BCcher
 68+ 16: wikisource:de Hilfe:Korrekturlesen
 69+ 10: wikisource:de Hilfe:Scannen_von_B%C3%BCchern
 70+ 1722: wikipedia:de Hilfe:Buchfunktion
 71+ 1154: wikipedia:de Hilfe:Sonderzeichen
 72+ 923: wikipedia:de Hilfe:Gesichtete_und_gepr%C3%BCfte_Versionen
 73+ 834: wikipedia:de Hilfe:Tutorial
 74+ 747: wikipedia:de Hilfe:TeX
 75+ 728: wikipedia:de Hilfe:Suche
 76+ 484: wikipedia:de Hilfe:Wikimedia_Commons
 77+ 417: wikipedia:de Hilfe:Neu_bei_Wikipedia
 78+ 338: wikipedia:de Hilfe:Bearbeitungshilfe
 79+ 272: wikipedia:de Hilfe:Tutorial/3
 80+ 242: wikipedia:de Hilfe:Spezialseiten
 81+ 241: wikipedia:de Hilfe:Tutorial/1
 82+ 233: wikipedia:de Hilfe:Bilder
 83+ 219: wikipedia:de Hilfe:Textgestaltung
 84+ 202: wikipedia:de Hilfe:Vorlagen
 85+ 191: wikipedia:de Hilfe:Seite_bearbeiten
 86+ 177: wikipedia:de Hilfe:Zusammenfassung_und_Quelle
 87+ 175: wikipedia:de Hilfe:Einzelnachweise
 88+ 173: wikipedia:de Hilfe:Tutorial/2
 89+ 169: wikipedia:de Hilfe:Tabellen
 90+ 167: wikipedia:de Hilfe:Audio
 91+ 166: wikipedia:de Hilfe:Neue_Seite_anlegen
 92+ 154: wikipedia:de Hilfe:Einstellungen
 93+ 141: wikipedia:de Hilfe:Formatvorlagen
 94+ 140: wikipedia:de Hilfe:Signatur
 95+ 139: wikipedia:de Hilfe:Tutorial/4
 96+ 126: wikipedia:de Hilfe:FAQ
 97+ 125: wikipedia:de Hilfe:Bausteine
 98+ 125: wikipedia:de Hilfe:Archivieren
 99+ 121: wikipedia:de Hilfe:Namensr%C3%A4ume
 100+ 117: wikipedia:de Hilfe:Links
 101+ 116: wikipedia:de Hilfe:Personendaten
 102+ 115: wikipedia:de Hilfe:Zusammenfassung_und_Quellen
 103+ 114: wikipedia:de Hilfe:Weiterleitung
 104+ 110: wikipedia:de Hilfe:Bearbeiten
 105+ 105: wikipedia:de Hilfe:Buchfunktion/Fragen_und_Antworten
 106+ 95: wikipedia:de Hilfe:Benutzerkonto_anlegen
 107+ 94: wikipedia:de Hilfe:Bild_und_Ton
 108+ 87: wikipedia:de Hilfe:Farben
 109+ 85: wikipedia:de Hilfe:Allgemeine_Textbausteine
 110+ 79: wikipedia:de Hilfe:Versionen
 111+ 79: wikipedia:de Hilfe:Bildertutorial
 112+ 78: wikipedia:de Hilfe:Navigation
 113+ 78: wikipedia:de Hilfe:Inhaltsverzeichnis
 114+ 76: wikipedia:de Hilfe:Benutzerkonto
 115+ 75: wikipedia:de Hilfe:Formatieren
 116+ 74: wikipedia:de Hilfe:Listen_und_Tabellen
 117+ 74: wikipedia:de Hilfe:Buchfunktion/Feedback_zur_Buchfunktion
 118+ 74: wikipedia:de Hilfe:Tutorial/6
 119+ 73: wikipedia:de Hilfe:Tutorial/5
 120+ 72: wikipedia:de Hilfe:Benutzernamensraum
 121+ 69: wikipedia:de Hilfe:Glossar
 122+ 345: wikibooks:en Help:Contents
 123+ 330: wikibooks:en Help:Page_validation
 124+ 125: wikibooks:en Help:Collections
 125+ 62: wikibooks:en Help:Starting_a_new_page_or_book
 126+ 42: wikibooks:en Help:Editing
 127+ 40: wikibooks:en Help:About
 128+ 36: wikibooks:en Help:Development_stages
 129+ 29: wikibooks:en Help:Searching
 130+ 25: wikibooks:en Help:Print_versions
 131+ 22: wikibooks:en Help:Contributing_FAQ
 132+ 18: wikibooks:en Help:Contents/editing_wikibooks_-_the_basics
 133+ 12: wikibooks:en Help:Why_contribute%3F
 134+ 12: wikibooks:en Help:How_to_start_a_book
 135+ 10: wikibooks:en Help:Images_and_other_uploaded_files
 136+ 10: wikibooks:en Help:FAQ
 137+ 869: wiktionary:en Help:Contents
 138+ 58: wiktionary:en Help:Searching
 139+ 45: wiktionary:en Help:How_to_check_translations
 140+ 44: wiktionary:en Help:Starting_a_new_page
 141+ 23: wiktionary:en Help:Example_sentences
 142+ 21: wiktionary:en Help:How_to_edit_a_page
 143+ 13: wiktionary:en Help:FAQ
 144+ 13: wiktionary:en Help:Edit_summary
 145+ 10: wiktionary:en Help:Audio_pronunciations
 146+ 10: wiktionary:en Help:Editing
 147+ 84: wikinews:en Help:Page_validation
 148+ 36: wikinews:en Help:Editing_http://schoolpapers.hostinginfive.com/bike.htm
 149+ 36: wikinews:en Help:Editing%20http://schoolpapers.hostinginfive.com/bike.htm
 150+ 22: wikinews:en Help:Contents
 151+ 14: wikinews:en Help:Editing
 152+ 10: wikinews:en Help:How_to_decorate_your_article
 153+ 271: wikiquote:en Help:Contents
 154+ 235: wikisource:en Help:Contents
 155+ 81: wikisource:en Help:Books
 156+ 56: wikisource:en Help:Public_domain
 157+ 38: wikisource:en Help:Adding_texts
 158+ 32: wikisource:en Help:Searching
 159+ 22: wikisource:en Help:DjVu_files
 160+ 15: wikisource:en Help:Introduction
 161+ 12: wikisource:en Help:DJVU_files
 162+ 11: wikisource:en Help:Editing_Wikisource
 163+ 11: wikisource:en Help:Editing_poetry
 164+ 11: wikisource:en Help:Side_by_side_image_view_for_proofreading
 165+ 294: wikiversity:en Help:Guides
 166+ 193: wikiversity:en Help:Contents
 167+ 89: wikiversity:en Help:The_original_tour_for_newcomers
 168+ 81: wikiversity:en Help:The_original_tour_for_newcomers/1
 169+ 56: wikiversity:en Help:The_original_tour_for_newcomers/2
 170+ 41: wikiversity:en Help:The_original_tour_for_newcomers/3
 171+ 37: wikiversity:en Help:The_original_tour_for_newcomers/4
 172+ 28: wikiversity:en Help:Resources_by_subject
 173+ 20: wikiversity:en Help:Resources_by_educational_level
 174+ 19: wikiversity:en Help:Resources_by_type
 175+ 15: wikiversity:en Help:Editing
 176+ 15: wikiversity:en Help:Creating_educational_content_at_Wikiversity
 177+ 13: wikiversity:en Help:Accessing_Wikiversity_by_educational_level
 178+ 12: wikiversity:en Help:Resources_by_completion_status
 179+ 12: wikiversity:en Help:Quiz
 180+ 10: wikiversity:en Help:Project_boxes
 181+ 6368: wikipedia:en Help:Contents
 182+ 2203: wikipedia:en Help:Category
 183+ 1422: wikipedia:en Help:Japanese
 184+ 849: wikipedia:en Help:Books
 185+ 782: wikipedia:en Help:Special_page
 186+ 623: wikipedia:en Help:Page_history
 187+ 597: wikipedia:en Help:IPA
 188+ 581: wikipedia:en Help:Edit_summary
 189+ 518: wikipedia:en Help:Minor_edit
 190+ 512: wikipedia:en Help:IPA_for_English
 191+ 496: wikipedia:en Help:Link
 192+ 304: wikipedia:en Help:Editing
 193+ 291: wikipedia:en Help:Multilingual_support_(East_Asian)
 194+ 239: wikipedia:en Help:Watching_pages
 195+ 238: wikipedia:en Help:Contents/Editing_Wikipedia
 196+ 224: wikipedia:en Help:Template
 197+ 198: wikipedia:en Help:Special_characters
 198+ 198: wikipedia:en Help:Table
 199+ 193: wikipedia:en Help:Contents/Getting_started
 200+ 193: wikipedia:en Help:Section
 201+ 186: wikipedia:en Help:Pronunciation_respelling_key
 202+ 180: wikipedia:en Help:Diff
 203+ 176: wikipedia:en Help:Starting_a_new_page
 204+ 175: wikipedia:en Help:Reverting
 205+ 171: wikipedia:en Help:Archiving_a_talk_page
 206+ 158: wikipedia:en Help:User_contributions
 207+ 151: wikipedia:en Help:Books/Feedback
 208+ 138: wikipedia:en Help:Displaying_a_formula
 209+ 135: wikipedia:en Help:Merging_and_moving_pages
 210+ 134: wikipedia:en Help:Formula
 211+ 111: wikipedia:en Help:Multilingual_support_(Indic)
 212+ 109: wikipedia:en Help:Talk_page
 213+ 105: wikipedia:en Help:Books/Frequently_Asked_Questions
 214+ 105: wikipedia:en Help:Searching
 215+ 99: wikipedia:en Help:CentralAuth
 216+ 97: wikipedia:en Help:Contents/Browsing_Wikipedia
 217+ 96: wikipedia:en Help:Books/for_experts
 218+ 95: wikipedia:en Help:Images_and_other_uploaded_files
 219+ 95: wikipedia:en Help:IPA_chart_for_Russian
 220+ 93: wikipedia:en Help:Logging_in
 221+ 90: wikipedia:en Help:Contents/Links
 222+ 88: wikipedia:en Help:Contents/Images_and_media
 223+ 74: wikipedia:en Help:Redirect
 224+ 73: wikipedia:en Help:Preferences
 225+ 71: wikipedia:en Help:Contents/Policies_and_guidelines
 226+ 67: wikipedia:en Help:Footnotes
 227+ 66: wikipedia:en Help:Contents/Technical_information
 228+ 64: wikipedia:en Help:Edit_conflict
 229+ 62: wikipedia:en Help:HTML_in_wikitext
 230+ 62: wikipedia:en Help:Recent_changes
 231+ 59: wikipedia:en Help:Namespace
 232+ 55: wikipedia:en Help:Cite_errors
 233+ 51: wikibooks:fr Aide:Compilations
 234+ 21: wikibooks:fr Aide:Compilations/Probl%C3%A8mes
 235+ 21: wikibooks:fr Aide:Compilations/FAQ
 236+ 13: wikibooks:fr Aide:Raccourcis
 237+ 12: wikibooks:fr Aide:Accueil
 238+ 10: wikibooks:fr Aide:Compilations/Aide_avanc%C3%A9e
 239+ 113: wiktionary:fr Aide:%C3%89tymologies
 240+ 96: wiktionary:fr Aide:Synonymes_et_antonymes
 241+ 61: wiktionary:fr Aide:Sommaire
 242+ 24: wiktionary:fr Aide:Prononciations
 243+ 21: wiktionary:fr Aide:D%C3%A9finitions
 244+ 18: wiktionary:fr Aide:%C3%89tymologie_grecque
 245+ 17: wiktionary:fr Aide:Anagrammes
 246+ 17: wiktionary:fr Aide:Aide
 247+ 12: wiktionary:fr Aide:Exemples
 248+ 10: wiktionary:fr Aide:Homophones_et_paronymes
 249+ 14: wikinews:fr Aide:Sommaire
 250+ 13: wikiquote:fr Aide:Sommaire
 251+ 127: wikisource:fr Aide:Aide_au_lecteur
 252+ 24: wikisource:fr Aide:Livres
 253+ 17: wikisource:fr Aide:Cr%C3%A9er_un_fichier_DjVu
 254+ 17: wikisource:fr Aide:Accueil
 255+ 12: wikisource:fr Aide:Guide_du_nouveau_contributeur
 256+ 11: wikisource:fr Aide:Comment_num%C3%A9riser
 257+ 10: wikisource:fr Aide:Aide
 258+ 103: wikiversity:fr Aide:Niveau_de_difficult%C3%A9
 259+ 26: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_13
 260+ 24: wikiversity:fr Aide:Sommaire
 261+ 21: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_14
 262+ 18: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_11
 263+ 16: wikiversity:fr Aide:Comment_cr%C3%A9er_un_projet
 264+ 15: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_10
 265+ 11: wikiversity:fr Aide:Frise_chronologique
 266+ 4860: wikipedia:fr Aide:Homonymie
 267+ 1742: wikipedia:fr Aide:Recherche
 268+ 1703: wikipedia:fr Aide:Sommaire
 269+ 1253: wikipedia:fr Aide:Importer_un_fichier
 270+ 945: wikipedia:fr Aide:%C3%89bauche
 271+ 901: wikipedia:fr Aide:Livres
 272+ 731: wikipedia:fr Aide:Comment_modifier_une_page
 273+ 359: wikipedia:fr Aide:Poser_une_question
 274+ 308: wikipedia:fr Aide:Tout_l%27indispensable...
 275+ 288: wikipedia:fr Aide:Consultation
 276+ 268: wikipedia:fr Aide:Premiers_pas
 277+ 217: wikipedia:fr Aide:Comment_cr%C3%A9er_une_page
 278+ 215: wikipedia:fr Aide:Redirection
 279+ 170: wikipedia:fr Aide:Importer_un_logo
 280+ 158: wikipedia:fr Aide:Syntaxe
 281+ 139: wikipedia:fr Aide:Importer_un_fichier_sur_Commons
 282+ 134: wikipedia:fr Aide:Unicode
 283+ 126: wikipedia:fr Aide:Note
 284+ 109: wikipedia:fr Aide:Importer_sur_Commons_un_fichier_dont_je_suis_l%27auteur
 285+ 106: wikipedia:fr Aide:Toujours_commenter_vos_modifications_dans_la_bo%C3%AEte_de_r%C3%A9sum%C3%A9
 286+ 106: wikipedia:fr Aide:Ins%C3%A9rer_une_image
 287+ 101: wikipedia:fr Aide:Comment_r%C3%A9diger_une_page
 288+ 101: wikipedia:fr Aide:%C3%89couter_des_sons_ogg
 289+ 94: wikipedia:fr Aide:Premiers_pas/2
 290+ 85: wikipedia:fr Aide:Mod%C3%A8le
 291+ 82: wikipedia:fr Aide:Japonais
 292+ 75: wikipedia:fr Aide:Comment_cr%C3%A9er_un_article
 293+ 74: wikipedia:fr Aide:Formules_TeX
 294+ 71: wikipedia:fr Aide:Caract%C3%A8res_sp%C3%A9ciaux
 295+ 68: wikipedia:fr Aide:Caract%C3%A8res_sp%C3%A9ciaux_probl%C3%A9matiques
 296+ 65: wikipedia:fr Aide:Premiers_pas/3
 297+ 63: wikipedia:fr Aide:Regarder_des_vid%C3%A9os_ogg
 298+ 63: wikipedia:fr Aide:Compte_utilisateur
 299+ 60: wikipedia:fr Aide:Sourcer
 300+ 60: wikipedia:fr Aide:Sommaire/D%C3%A9buter
 301+ 58: wikipedia:fr Aide:Sommaire/Modifier_Wikip%C3%A9dia
 302+ 55: wikipedia:fr Aide:Historique
 303+ 51: wikipedia:fr Aide:Espace_de_noms
 304+ 49: wikipedia:fr Aide:Republication
 305+ 49: wikipedia:fr Aide:Sommaire/Traduction
 306+ 44: wikipedia:fr Aide:Cat%C3%A9gorie
 307+ 42: wikipedia:fr Aide:Couleurs
 308+ 41: wikipedia:fr Aide:Accents
 309+ 39: wikipedia:fr Aide:Signature
 310+ 38: wikipedia:fr Aide:Liens_externes
 311+ 38: wikipedia:fr Aide:Les_diff%C3%A9rents_r%C3%B4les
 312+ 37: wikipedia:fr Aide:Acc%C3%A8s_%C3%A0_Wikip%C3%A9dia_avec_un_t%C3%A9l%C3%A9phone_portable_et_un_PDA
 313+ 35: wikipedia:fr Aide:Sommaire/Parcourir_Wikip%C3%A9dia
 314+ 33: wikipedia:fr Aide:Frise_chronologique
 315+ 33: wikipedia:fr Aide:Raccourci
 316+ 32: wikipedia:fr Aide:Page_utilisateur
 317+ 31: wikipedia:fr Aide:Page_Utilisateur
 318+ 14: wikipedia:gd Help:Cuideachadh
 319+ 25: wikipedia:gn Help:Contents
 320+ 10: wikipedia:ig Help:Contents
 321+ 48: wikipedia:ilo Help:Contents
 322+ 12: wikipedia:ilo Help:Dagiti_Linaon
 323+ 24: wikimedia:incubator Help:Manual
 324+ 15: wikimedia:incubator Help:Contents
 325+ 25: wikipedia:io Help:Helpo
 326+ 18: wikibooks:ja Help:%E9%80%B2%E6%8D%97%E7%8A%B6%E6%B3%81
 327+ 14: wiktionary:ja Help:%E7%9B%AE%E6%AC%A1
 328+ 1121: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1
 329+ 537: wikipedia:ja Help:%E6%A4%9C%E7%B4%A2
 330+ 188: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E7%B7%A8%E9%9B%86
 331+ 179: wikipedia:ja Help:%E9%9F%B3%E5%A3%B0%E3%83%BB%E5%8B%95%E7%94%BB%E3%81%AE%E5%86%8D%E7%94%9F
 332+ 147: wikipedia:ja Help:%25E7%259B%25AE%25E6%25AC%25A1
 333+ 132: wikipedia:ja Help:%E7%94%BB%E5%83%8F%E3%81%AA%E3%81%A9%E3%81%AE%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%81%AE%E3%82%A2%E3%83%83%E3%83%97%E3%83%AD%E3%83%BC%E3%83%89%E3%81%A8%E5%88%A9%E7%94%A8
 334+ 120: wikipedia:ja Help:%E8%84%9A%E6%B3%A8/%E8%AA%AD%E8%80%85%E5%90%91%E3%81%91
 335+ 98: wikipedia:ja Help:%E7%89%B9%E6%AE%8A%E6%96%87%E5%AD%97
 336+ 84: wikipedia:ja Help:%E6%96%B0%E8%A6%8F%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E4%BD%9C%E6%88%90
 337+ 69: wikipedia:ja Help:%E8%A8%98%E4%BA%8B%E3%81%A8%E3%81%AF%E4%BD%95%E3%81%8B
 338+ 68: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E6%96%B0%E3%81%97%E3%81%84%E8%A8%98%E4%BA%8B%E3%82%92%E6%9B%B8%E3%81%8F
 339+ 65: wikipedia:ja Help:%E7%94%BB%E5%83%8F%E3%81%AE%E8%A1%A8%E7%A4%BA
 340+ 64: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%9B%B8%E8%AB%87%E3%81%A8%E8%B3%AA%E5%95%8F
 341+ 57: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%B7%A8%E9%9B%86%E5%85%A5%E9%96%80
 342+ 56: wikipedia:ja Help:%E3%83%AD%E3%82%B0%E3%82%A4%E3%83%B3
 343+ 48: wikipedia:ja Help:%E3%83%86%E3%83%B3%E3%83%97%E3%83%AC%E3%83%BC%E3%83%88
 344+ 45: wikipedia:ja Help:%E3%83%8E%E3%83%BC%E3%83%88%E3%83%9A%E3%83%BC%E3%82%B8
 345+ 38: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E3%83%9E%E3%83%BC%E3%82%AF%E3%82%A2%E3%83%83%E3%83%97
 346+ 35: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%94%BB%E5%83%8F%E3%81%AA%E3%81%A9%E3%81%AE%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB
 347+ 34: wikipedia:ja Help:%E6%97%A9%E8%A6%8B%E8%A1%A8
 348+ 32: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E8%A8%98%E4%BA%8B%E3%82%92%E8%82%B2%E3%81%A6%E3%82%8B
 349+ 32: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E8%AA%AD%E8%80%85%E5%90%91%E3%81%91
 350+ 31: wikipedia:ja Help:%E8%84%9A%E6%B3%A8
 351+ 30: wikipedia:ja Help:%E7%B4%B0%E9%83%A8%E3%81%AE%E7%B7%A8%E9%9B%86
 352+ 30: wikipedia:ja Help:%E3%83%AA%E3%83%80%E3%82%A4%E3%83%AC%E3%82%AF%E3%83%88
 353+ 30: wikipedia:ja Help:JPEG%E7%94%BB%E5%83%8F%E3%82%92%E6%B8%9B%E8%89%B2%E3%81%97PNG%E7%94%BB%E5%83%8F%E3%81%A8%E3%81%97%E3%81%A6%E4%BF%9D%E5%AD%98%E3%81%99%E3%82%8B%E6%96%B9%E6%B3%95
 354+ 30: wikipedia:ja Help:%E9%81%8E%E5%8E%BB%E3%83%AD%E3%82%B0
 355+ 29: wikipedia:ja Help:%E5%B1%A5%E6%AD%B4
 356+ 28: wikipedia:ja Help:ISBN%E3%81%AE%E3%83%AA%E3%83%B3%E3%82%AF
 357+ 27: wikipedia:ja Help:%E3%83%8A%E3%83%93%E3%82%B2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%83%BB%E3%83%9D%E3%83%83%E3%83%97%E3%82%A2%E3%83%83%E3%83%97
 358+ 27: wikipedia:ja Help:%E5%A4%9A%E8%A8%80%E8%AA%9E%E5%AF%BE%E5%BF%9C_(%E3%82%A4%E3%83%B3%E3%83%89%E7%B3%BB%E6%96%87%E5%AD%97)
 359+ 26: wikipedia:ja Help:%E3%83%AA%E3%83%B3%E3%82%AF
 360+ 25: wikipedia:ja Help:%E7%AE%87%E6%9D%A1%E6%9B%B8%E3%81%8D
 361+ 24: wikipedia:ja Help:%E3%82%A6%E3%82%A3%E3%82%AD%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AB%E3%81%8A%E3%81%91%E3%82%8BHTML
 362+ 24: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E7%A7%BB%E5%8B%95
 363+ 23: wikipedia:ja Help:%E8%A1%A8%E3%81%AE%E4%BD%9C%E3%82%8A%E6%96%B9
 364+ 21: wikipedia:ja Help:%E8%A6%81%E7%B4%84%E6%AC%84
 365+ 21: wikipedia:ja Help:%E3%83%9E%E3%82%B8%E3%83%83%E3%82%AF%E3%83%AF%E3%83%BC%E3%83%89
 366+ 21: wikipedia:ja Help:Pywikipediabot
 367+ 20: wikipedia:ja Help:%25E6%25A4%259C%25E7%25B4%25A2
 368+ 20: wikipedia:ja Help:%E3%82%AB%E3%83%86%E3%82%B4%E3%83%AA
 369+ 20: wikipedia:ja Help:%E3%82%B5%E3%83%B3%E3%83%89%E3%83%9C%E3%83%83%E3%82%AF%E3%82%B9
 370+ 20: wikipedia:ja Help:%E6%A3%92%E3%82%B0%E3%83%A9%E3%83%95%E3%81%AE%E6%9B%B8%E3%81%8D%E6%96%B9
 371+ 18: wikipedia:ja Help:%E3%82%BB%E3%82%AF%E3%82%B7%E3%83%A7%E3%83%B3
 372+ 15: wikipedia:ja Help:%E3%82%A6%E3%82%A3%E3%82%AD%E3%83%A1%E3%83%BC%E3%83%AB
 373+ 15: wikipedia:ja Help:%E5%80%8B%E4%BA%BA%E8%A8%AD%E5%AE%9A
 374+ 15: wikipedia:ja Help:%E5%90%8D%E5%89%8D%E7%A9%BA%E9%96%93
 375+ 14: wikipedia:ja Help:%E3%83%86%E3%83%B3%E3%83%97%E3%83%AC%E3%83%BC%E3%83%88%E3%81%AE%E8%AA%AC%E6%98%8E%E6%96%87
 376+ 14: wikipedia:ja Help:%E4%BB%A5%E5%89%8D%E3%81%AE%E7%89%88%E3%81%AB%E3%83%9A%E3%83%BC%E3%82%B8%E3%82%92%E6%88%BB%E3%81%99%E6%96%B9%E6%B3%95
 377+ 13: wikipedia:ja Help:%E3%82%BD%E3%83%95%E3%83%88%E3%83%AA%E3%83%80%E3%82%A4%E3%83%AC%E3%82%AF%E3%83%88
 378+ 13: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E5%90%8D
 379+ 13: wikipedia:ja Help:%E6%9D%A1%E4%BB%B6%E6%96%87
 380+ 30: wikipedia:kg Help:Contents
 381+ 2343: wikimedia:meta Help:External_editors
 382+ 1266: wikimedia:meta Help:Contents
 383+ 353: wikimedia:meta Help:Editing
 384+ 246: wikimedia:meta Help:Help
 385+ 237: wikimedia:meta Help:Starting_a_new_page
 386+ 217: wikimedia:meta Help:Images_and_other_uploaded_files
 387+ 210: wikimedia:meta Help:Unified_login
 388+ 208: wikimedia:meta Help:Table
 389+ 169: wikimedia:meta Hilfe:Handbuch
 390+ 167: wikimedia:meta Help:Category
 391+ 156: wikimedia:meta Help:Template
 392+ 131: wikimedia:meta Help:User_style%20http://schoolpapers.hostinginfive.com/bike.htm
 393+ 131: wikimedia:meta Help:User_style_http://schoolpapers.hostinginfive.com/bike.htm
 394+ 118: wikimedia:meta Help:Link
 395+ 111: wikimedia:meta Help:Editor
 396+ 107: wikimedia:meta Help:Formula
 397+ 105: wikimedia:meta Help:Wikitext_examples
 398+ 97: wikimedia:meta Help:Reference_card
 399+ 95: wikimedia:meta Help:Section
 400+ 89: wikimedia:meta Help:Special_characters
 401+ 85: wikimedia:meta Help:Wikitext
 402+ 85: wikimedia:meta Help:HTML_in_wikitext
 403+ 78: wikimedia:meta Help:System_admin
 404+ 77: wikimedia:meta Help:Preferences
 405+ 72: wikimedia:meta Aide:Contenu
 406+ 69: wikimedia:meta Help:Displaying_a_formula
 407+ 65: wikimedia:meta Help:Page_name
 408+ 64: wikimedia:meta Help:Magic_words
 409+ 61: wikimedia:meta Help:Advanced_editing
 410+ 59: wikimedia:meta Help:Reader
 411+ 57: wikimedia:meta Help:List
 412+ 56: wikimedia:meta Help:Searching
 413+ 56: wikimedia:meta Help:Moderator
 414+ 54: wikimedia:meta Help:Interwiki_linking
 415+ 52: wikimedia:meta Help:Transwiki
 416+ 52: wikimedia:meta Help:Redirect
 417+ 52: wikimedia:meta Help:Namespace
 418+ 50: wikimedia:meta Hilfe:Externe_Editoren
 419+ 49: wikimedia:meta Help:Public_domain_image_resources
 420+ 48: wikimedia:meta Help:User_style
 421+ 48: wikimedia:meta Help:Variable
 422+ 48: wikimedia:meta Help:Introduction
 423+ 46: wikimedia:meta Help:Moving_a_page
 424+ 46: wikimedia:meta Help:ParserFunctions
 425+ 45: wikimedia:meta Help:Logging_in
 426+ 45: wikimedia:meta Help:Export
 427+ 43: wikimedia:meta Help:Editing_FAQ
 428+ 42: wikimedia:meta Help:Import
 429+ 41: wikimedia:meta Help:Special_page
 430+ 41: wikimedia:meta Hilfe:Textgestaltung
 431+ 40: wikimedia:meta Help:Job_queue
 432+ 40: wikimedia:meta Help:URL
 433+ 14: wikipedia:meta Help:Contents
 434+ 35: wikipedia:mi Help:Contents
 435+ 24: wikipedia:mr Help:Contents
 436+ 18: wikipedia:ne Help:Contents
 437+ 12: wikibooks:nl Help:Boeken
 438+ 312: wikipedia:nl Help:Boeken
 439+ 182: wikipedia:nl Help:Zoeken
 440+ 124: wikipedia:nl Help:Tips_voor_het_schrijven_van_een_goed_artikel
 441+ 105: wikipedia:nl Help:Nieuwe_pagina_aanmaken
 442+ 72: wikipedia:nl Help:Helpdesk
 443+ 70: wikipedia:nl Help:Beveiligde_pagina%27s
 444+ 64: wikipedia:nl Help:Ogg_Vorbis
 445+ 61: wikipedia:nl Help:Uitleg
 446+ 60: wikipedia:nl Help:Wikipedia
 447+ 56: wikipedia:nl Help:Veelgestelde_vragen
 448+ 53: wikipedia:nl Help:Veelvoorkomende_spelfouten
 449+ 51: wikipedia:nl Help:Samenvatting
 450+ 45: wikipedia:nl Help:Hoe_kan_ik_meedoen%3F
 451+ 39: wikipedia:nl Help:Installeer_je_eigen_Wiki
 452+ 36: wikipedia:nl Help:Gebruik_van_bestanden
 453+ 35: wikipedia:nl Help:Terminologie_op_Wikipedia
 454+ 34: wikipedia:nl Help:Gebruik_van_tabellen
 455+ 33: wikipedia:nl Help:Referenties_en_voetnoten
 456+ 33: wikipedia:nl Help:Afkortingen_op_Wikipedia_chat
 457+ 32: wikipedia:nl Help:Gebruik_van_categorie%C3%ABn
 458+ 31: wikipedia:nl Help:Tekstopmaak
 459+ 30: wikipedia:nl Help:Gebruik_van_sjablonen
 460+ 30: wikipedia:nl Help:Contact_met_Wikipedia
 461+ 29: wikipedia:nl Help:Speciale_tekens
 462+ 27: wikipedia:nl Help:Kleine_wijziging
 463+ 26: wikipedia:nl Help:Alfabetische_index
 464+ 25: wikipedia:nl Help:Spellinggids
 465+ 25: wikipedia:nl Help:TeX_in_Wikipedia
 466+ 24: wikipedia:nl Help:Standaardvorm_voor_biografie%C3%ABn
 467+ 24: wikipedia:nl Help:Gebruik_van_bots
 468+ 23: wikipedia:nl Help:Beginnetje
 469+ 23: wikipedia:nl Help:Tips_voor_het_vertalen_van_een_artikel_vanaf_een_andere_Wikipedia
 470+ 23: wikipedia:nl Help:Gebruik_van_links
 471+ 22: wikipedia:nl Help:Samenvoegen_van_artikelen
 472+ 20: wikipedia:nl Help:Hulpmiddelen
 473+ 19: wikipedia:nl Help:Auteursrechten
 474+ 18: wikipedia:nl Help:Gebruik_van_openbare_bronnen
 475+ 17: wikipedia:nl Help:Bronnensjabloon
 476+ 17: wikipedia:nl Help:Wikipediachat
 477+ 16: wikipedia:nl Help:Inhoud
 478+ 16: wikipedia:nl Help:Gebruik_van_geluid
 479+ 15: wikipedia:nl Help:Externe_kaarten
 480+ 15: wikipedia:nl Help:Waarom_zou_ik_meedoen%3F
 481+ 15: wikipedia:nl Help:Naamruimte
 482+ 14: wikipedia:nl Help:EasyTimeline
 483+ 14: wikipedia:nl Help:English
 484+ 13: wikipedia:nl Help:Media_uploaden_naar_commons
 485+ 13: wikipedia:nl Help:Overlegpagina
 486+ 13: wikipedia:nl Help:Unieke_van_Wikipedia
 487+ 12: wikipedia:nl Help:Gebruik_van_de_taxobox
 488+ 11: wikipedia:nl Help:Doorverwijzen
 489+ 11: wikipedia:nl Help:Huis-_tuin-_en_keukeninspiratie
 490+ 10: wikipedia:nrm Help:Contents
 491+ 15: wikipedia:pam Help:Kalamnan
 492+ 10: wikipedia:pdc Hilfe:Hilfe
 493+ 15: wikipedia:sc Help:Aiuto
 494+ 32: wikipedia:scn Help:Aiutu
 495+ 15: wikipedia:sco Help:Contents
 496+ 48: wikipedia:se Help:Contents
 497+ 65: wiktionary:simple Help:Contents
 498+ 266: wikipedia:simple Help:Contents
 499+ 241: wikipedia:simple Help:Books
 500+ 21: wikipedia:simple Help:How_to_use_images
 501+ 18: wikipedia:simple Help:How_to_change_pages
 502+ 13: wikipedia:simple Help:Editing
 503+ 11: wikipedia:simple Help:How_to_edit
 504+ 10: wikipedia:simple Help:Archiving_a_talk_page
 505+ 10: wikipedia:simple Help:Pronunciation_respelling_key
 506+ 50: wikimedia:species Help:Contents
 507+ 19: wikimedia:species Help:Image_Guidelines
 508+ 17: wikimedia:species Help:General_Wikispecies
 509+ 15: wikimedia:species Help:Author_Names
 510+ 22: wikipedia:sw Help:Contents
 511+ 28: wikipedia:te Help:Contents
 512+ 20: wikipedia:test Help:Books
 513+ 14: wikipedia:test Help:Page_validation
 514+ 11: wikipedia:to Help:Contents
 515+ 21: wikipedia:uz Help:Contents
 516+ 748: www.w Help:Contents
 517+ 417: www.w Help:Configuration_settings
 518+ 373: www.w Help:Editing_pages
 519+ 355: www.w Help:Formatting
 520+ 276: www.w Help:Magic_words
 521+ 261: www.w Help:Navigation
 522+ 253: www.w Help:Extension:ParserFunctions
 523+ 208: www.w Help:Images
 524+ 185: www.w Help:FAQ
 525+ 172: www.w Help:Links
 526+ 164: www.w Help:Starting_a_new_page
 527+ 153: www.w Help:Templates
 528+ 147: www.w Help:Tables
 529+ 66: www.w Help:Categories
 530+ 47: www.w Help:Redirects
 531+ 46: www.w Help:Assigning_permissions
 532+ 46: www.w Help:Editing
 533+ 45: www.w Help:Namespaces
 534+ 44: www.w Help:Skins
 535+ 40: www.w Help:Managing_files
 536+ 38: www.w Help:Contents/de
 537+ 36: www.w Help:Special_pages
 538+ 35: www.w Help:Subpages
 539+ 33: www.w Help:Preferences
 540+ 31: www.w Help:Variables
 541+ 30: www.w Help:Moving_a_page
 542+ 29: www.w Help:Editing_pages/de
 543+ 28: www.w Help:User_page
 544+ 27: www.w Help:Contents/ru
 545+ 27: www.w Help:Sysops_and_permissions
 546+ 25: www.w Help:Talk_pages
 547+ 25: www.w Help:Editing_pages/ja
 548+ 24: www.w Help:Searching
 549+ 24: www.w Help:Navigation/de
 550+ 22: www.w Help:User_rights
 551+ 21: www.w Help:Signatures
 552+ 21: www.w Help:Deleting_a_page
 553+ 21: www.w Help:Tracking_changes
 554+ 20: www.w Help:Linked_images
 555+ 20: www.w Help:ParserFunctions
 556+ 19: www.w Help:Navigation/ru
 557+ 18: www.w Help:Interwiki_linking
 558+ 18: www.w Help:User_rights/favicon.ico
 559+ 18: www.w Help:User_rights/favicon.gif
 560+ 17: www.w Help:Formatting/de
 561+ 17: www.w Help:Editing_pages/pt
 562+ 17: www.w Help:Patrolled_edits
 563+ 16: www.w Help:Contents/es
 564+ 15: www.w Help:Links/ru
 565+ 14: www.w Help:Sysop_deleting_and_undeleting
 566+ 14: www.w Help:Starting_a_new_page/de
 567+ 13: www.w Help:Protecting_and_unprotecting_pages
 568+ 117: wikipedia:www Help:Contents
 569+ 55: wikipedia:zh-classical Help:%E5%87%A1%E4%BE%8B
 570+ 14: wikipedia:zh-classical Help:Page_validation
 571+ 57: wikipedia:zh-min-nan Help:Bo%CC%8Dk-lio%CC%8Dk
 572+ 13: wikipedia:zh-min-nan Help:%E5%A6%82%E4%BD%95%E8%BC%B8%E5%85%A5%E7%99%BD%E8%A9%B1%E5%AD%97
 573+ 10: wikipedia:zh-min-nan Help:%E5%A6%82%E4%BD%95%E8%AE%80
 574+ 51: wikipedia:zh-yue Help:%E7%9B%AE%E9%8C%84
 575+ 19: wikisource:zh Help:%E7%9B%AE%E5%BD%95
 576+ 13: wikisource:zh Help:%E4%B9%A6
 577+ 12: wikisource:zh Help:%E5%85%A5%E9%97%A8%E6%8C%87%E5%8D%97
Property changes on: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt
___________________________________________________________________
Added: svn:eol-style
1578 + native
Index: trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForBanners.pl
@@ -0,0 +1,189 @@
#!/usr/bin/perl

# Read per-article banner-page view counts (PageViewsBannerPages.txt),
# split them into lines to use vs. discard, log totals, and then patch
# the hourly projectcounts files to subtract banner-driven views.

$| = 1; # flush screen output

open IN, '<', 'PageViewsBannerPages.txt' ;
open OUT1, '>', 'PageViewsBannerPagesUse.txt' ;
open OUT2, '>', 'PageViewsBannerPagesDiscard.txt' ;
open LOG, '>', 'PageViewsBannerPagesLog.txt' ;

while ($line = <IN>)
{
 # Input fields: date token, project (e.g. "wp:fy.z"), page title, and a
 # packed counts string (total followed by per-hour letter/count pairs).
 ($date,$project,$title,$counts) = split (' ', $line) ;

 $date =~ s/^.*?(\d{8}).*$/$1/ ;
 $project =~ s/^.*?:// ;
 $project =~ s/\.z// ;

 $projects {$project} ++ ;

 # Leading digits of the counts field are the (redundant) daily total.
 ($total = $counts) =~ s/\D.*//g ;

# next if $line !~ /20101001/ ;
# next if $line !~ /fy\.z/ ;

 # Only recognized banner page titles are subtracted; everything else is
 # written to the discard file for inspection.
 if ($line !~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/i)
 {
 print OUT2 $line ;
 $total_discard += $total ;
 $titles_discard {"$project $title"} += $total ;
 next ;
 }

 print OUT1 $line ;
 $titles_use {"$project $title"} += $total ;
 $total_use += $total ;

 # print "$counts: " ;
 # Decode the packed per-hour counts: a letter encodes the hour
 # ('A' = hour 0), followed by that hour's count.
 $counts =~ s/^\d+// ; # remove (redundant) preceding total
 while ($counts ne "")
 {
 $letter = substr ($counts,0,1) ;
 $counts = substr ($counts,1) ;
 ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 $counts =~ s/^\d+(.*)$/$1/ ;
 $hour = ord ($letter) - ord ('A') ;
 # print "[$hour] $count " ;

 # Accumulate the amount to subtract per project/date/hour; consumed
 # later by &Patch / &PatchFile.
 $substract {"$project,$date,$hour"} += $count ;
 # if (($project eq 'fy') && ($date eq '20101001'))
 # { print "$project,$date,$hour\n" ; }
 }
 # print "\n" ;

}
close IN ;

# Report the top discarded titles (screen shows ~12, LOG gets all).
&Log ("\n\nDiscard:\n") ;
foreach $title (sort {$titles_discard {$b} <=> $titles_discard {$a}} keys %titles_discard)
{
 print $titles_discard {$title} . " : $title\n" ;
 print LOG $titles_discard {$title} . " : $title\n" ;
 last if $lines_discard++ > 10 ;
}

# NOTE(review): $lines_use is incremented twice per iteration (once in
# 'next', once in 'last'), so the screen shows roughly every other title
# up to the cutoff — confirm whether that is intended.
&Log ("\n\nUse:\n") ;
foreach $title (sort {$titles_use {$b} <=> $titles_use {$a}} keys %titles_use)
{
 print LOG $titles_use {$title} . " : $title\n" ;
 next if $lines_use++ > 10 ;
 print $titles_use {$title} . " : $title\n" ;
 last if $lines_use++ > 1000 ;
}

# List all projects seen, 10 per line (the newline fires when the
# running counter is a multiple of 10, so the first line holds one name).
&Log ("\n\nProjects:\n") ;
foreach $project (sort keys %projects)
{
 &Log ("$project ") ;
 &Log ("\n") if $projects_printed++ %10 == 0 ;
}
close OUT1 ;
close OUT2 ;
# NOTE(review): LOG is closed here, but &Patch and the &Log calls below
# still print to LOG — those messages are silently lost; confirm whether
# the close should move after the final &Log calls.
close LOG ;

&Patch ;

&Log ("Use $total_use\n") ;
&Log ("Discard $total_discard\n") ;
&Log ("Substracted $counts_substracted\n") ;

print "\n\nReady\n\n" ;
exit ;
 93+
sub Patch
{
    # Patch all projectcounts files for Sep/Oct 2010: collect the matching
    # file names, patch each via &PatchFile, then report any
    # project/date/hour keys in %substract that no file line matched.
    &Log ("\n\nPatch\n\n") ;

    # Production path on the stats host, else local Windows test copy.
    if (-d "/a/dammit.lt/projectcounts")
    { $dir = "/a/dammit.lt/projectcounts" ; }
    else
    { $dir = "w:/# In Dammit.lt/projectcounts/t" ; }

    chdir ($dir) || die "Cannot chdir to $dir\n" ;

    local (*DIR);
    # Was unchecked: a failed opendir would silently patch nothing.
    opendir (DIR, ".") || die "Cannot open dir $dir\n" ;
    @files = () ;

    while ($file_in = readdir (DIR))
    {
        # Only the hourly files for Sep/Oct 2010 are patched.
        next if $file_in !~ /^projectcounts-2010(?:09|10)/ ;
        # next if $file_in !~ /^projectcounts-20101001/ ;

        push @files, $file_in ;
    }

    closedir (DIR);

    @files = sort @files ;

    foreach $file (@files)
    { &PatchFile ($file) ; }

    # Any subtraction key never seen in a file line is reported so the
    # missing hours can be investigated.
    &Log ("\n\nUnpatched\n\n") ;
    foreach $key (sort keys %substract)
    {
        if (! $substract_found {$key})
        { &Log ("$key\n") ; }
    }
}
 130+
sub PatchFile
{
    # Subtract banner views from one hourly projectcounts file.
    # Reads every "project - count bytes" line, lowers the count by the
    # amount stored in %substract for that project/date/hour, and rewrites
    # the file only when at least one line changed.
    my $file = shift ;
    my $line ;
    print "\nFile $file\n" ;

    # File name format: projectcounts-YYYYMMDD-HHMMSS.
    ($dummy,$date,$time) = split '-', $file ;
    $hour = substr ($time,0,2) + 0 ;

    # 'or' (not '||') so the check applies to open(); the original
    # "|| die" bound to the file name and could never trigger.
    open PROJECTFILE, '<', "$dir/$file" or die "Could not open '$dir/$file'\n" ;

    undef @projectfile ;
    $file_changed = 0 ;
    while ($line = <PROJECTFILE>)
    {
        chomp $line ;
        ($project,$dash,$count,$bytes) = split (' ', $line) ;

        # next if $project ne 'fy' ;
        # print "$line\n" ;
        next if $bytes eq '' ;     # skip malformed / short lines
        $count_substract = $substract {"$project,$date,$hour"} ;
        $substract_found {"$project,$date,$hour"} ++ ;

        if ($count_substract == 0)
        { push @projectfile, $line ; }
        else
        {
            $file_changed = 1 ;
            $count -= $count_substract ;
            &Log ("\n$line ->\n") ;
            $line = "$project $dash $count 1" ;
            push @projectfile, $line ;
            &Log ("$line\n") ;
        }
        # next if $count_substract eq '' ;
        $counts_substracted += $count_substract ;
        # print "$project $count minus $count_substract\n" ; # '$project,$date,$hour'\n" ;
    }

    close PROJECTFILE ;

    # Rewrite in place only when something was actually subtracted.
    if ($file_changed)
    {
        open PROJECTFILE, '>', "$dir/$file" or die "Could not open '$dir/$file'\n" ;
        foreach $line (@projectfile)
        { print PROJECTFILE "$line\n" ; }
        close PROJECTFILE ;
    }
}
 181+
sub Log
{
    # Write a message to both the screen and the LOG file handle.
    my ($message) = @_ ;
    print $message ;
    print LOG $message ;
}
 188+
 189+
 190+
Index: trunk/wikistats/dammit.lt/cellar/!DammitFilesFindMisses.pl
@@ -0,0 +1,185 @@
#!/usr/local/bin/perl

# !DammitFilesFindMisses.pl
# Compares a sorted list of article titles (from a database dump) with a
# sorted page view extract, and reports page view entries for which no
# article exists ("misses"). See ExtractMissingArticles below.

# to do
# titles can occur twice (because of ucfirst) , add those counts before pushing to table @data
# remove extra parameters e.g. "Gabriel_Andrade&limit=500"

 use CGI qw(:all);
 use URI::Escape;
 use Getopt::Std ;
 use Cwd ;

 # presence of this dir signals the script runs on the stats server — TODO confirm
 $bayes = -d "/a/dammit.lt/pagecounts" ;
 $path_7za = "/usr/lib/p7zip/7za" ;
 $path_grep = "/bin/grep" ;

 $| = 1; # flush screen output
 $true = 1 ;
 $false = 0 ;

 $jobstart = time ;

 # project prefix to filter page view lines on (here: German Wikipedia)
 $key = "de.z" ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/out" -f 20090429 -t 20090429 -p ''
 my $options ;
 getopt ("iop", \%options) ;

 # hard-coded input/output paths (Windows test environment); command line
 # option handling below is commented out
 $file_articles_in = "W:/# In Dumps/dewiki-20090917-all-titles-in-ns0" ;
 $file_articles_out = "W:/# In Dumps/dewiki-20090917-all-titles-in-ns0_b" ;
 $file_pageviews_in = "W:/pagecounts-20090801_fdt" ;
 $file_pageviews_out = "W:/pagecounts-20090801_fdt_b" ;
 $file_extract = "W:/! Perl/Dammit Log Files/Scan Log Files/PageViewsExtractArticlesDeWp.txt" ;
 $file_missing = "W:/! Perl/Dammit Log Files/Scan Log Files/PageViewsMissingArticlesDeWp.txt" ;

# if (! defined ($options {"i"})) { &Abort ("Specify input dir as -i dirname") } ;
# if (! defined ($options {"o"})) { &Abort ("Specify output dir as -o dirname") } ;
# if (! defined ($options {"p"})) { &Abort ("Specify project as -p \".....\"") } ;

# $dir_in = $options {"i"} ;
# $dir_out = $options {"o"} ;
# $project = $options {"p"} ;

# $work = cwd() ;
# print "Work dir $work\n" ;
# if ($dir_in !~ /[\/\\]/)
# { $dir_in = "$work/$dir_in" ; }
# if ($dir_out !~ /[\/\\]/)
# { $dir_out = "$work/$dir_out" ; }

# if (! -d $dir_in) { &Abort ("Input dir not found: $dir_in") } ;
# if (! -d $dir_out)
# {
# print "Create output dir $dir_out\n" ;
# mkdir $dir_out ;
# if (! -d $dir_out)
# { &Abort ("Output dir could not be created.") } ;
# }

 print "\nExtract missing articles\n" ; # Parm in: $dir_in\nParm out: $dir_out\n" ;

# &SortEncodedArticleTitles ;  # one-time preparation step, normally already done
 &ExtractMissingArticles ;

 &Log ("\nReady\n") ;
 exit ;
 67+
# One-time preparation: normalize and sort both the article titles dump and
# the page view file (restricted to project $key), so ExtractMissingArticles
# can merge-join them. Normalization: decode %XX escapes, re-encode
# control/8-bit bytes, and uppercase the first character (canonical form).
sub SortEncodedArticleTitles
{
 # 'or' (not '||') so the abort can actually trigger on open failure
 open IN, '<', $file_articles_in or &Abort ("$file_articles_in could not be opened") ;
 open OUT, '>', $file_articles_out or &Abort ("$file_articles_out could not be opened") ;

 while ($line = <IN>)
 {
 chomp ($line) ;
 $line =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 # NOTE(review): \x31 is digit '1' — the range was probably meant to end at a
 # control/space boundary (\x1F or \x20); confirm before changing, the same
 # range is used when the page view file was encoded
 $line =~ s/([\x00-\x31\x80-\xFF])/"%".sprintf("%X",ord ($1))/ge ;
 $line = ucfirst ($line) ;
 push @data, $line ;
 }
 close IN ;

 @data = sort @data ;

 foreach $line (@data)
 { print OUT "$line\n" ; }

 close OUT ;

 #--------------------------------------------------------------------------------------

 # same normalization for the page view lines of the selected project ($key)
 open IN, '<', $file_pageviews_in or &Abort ("$file_pageviews_in could not be opened") ;
 # was "$file_pageviews_tmp" (an undefined variable) in the error message
 open OUT, '>', $file_pageviews_out or &Abort ("$file_pageviews_out could not be opened") ;

 @data = () ;
 while ($line = <IN>)
 {
 if ($line !~ /^$key /) { next ; }

 chomp ($line) ;
 ($key2,$title,$counts) = split (' ', $line) ;
 $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 $title =~ s/([\x00-\x31\x80-\xFF])/"%".sprintf("%X",ord ($1))/ge ;
 $title = ucfirst ($title) ;
 push @data, "$title $counts" ;
 }
 close IN ;

 @data = sort @data ;

 foreach $line (@data)
 { print OUT "$line\n" ; }

 close OUT ;
}
 116+
# Merge-join the sorted article title list (ARTICLES) with the sorted page
# view extract (PAGEVIEWS). Page view titles without a matching article are
# written to $file_extract; main-namespace misses (no ':' in title) also go
# to $file_missing, ordered by view count, highest first.
# File names come from globals; the sub is called without arguments.
sub ExtractMissingArticles
{
 # 'or' (not '||') so the abort can actually trigger on open failure
 open ARTICLES, '<', $file_articles_out or &Abort ("$file_articles_out could not be opened") ;
 open PAGEVIEWS, '<', $file_pageviews_out or &Abort ("$file_pageviews_out could not be opened") ;
 open EXTRACT, '>', $file_extract or &Abort ("$file_extract could not be written") ;
 open MISSING, '>', $file_missing or &Abort ("$file_missing could not be written") ;

 $title_at = <ARTICLES> ; # at = article title
 chomp $title_at ;

 @data = () ;
 while ($line_pv = <PAGEVIEWS>) # pv = page view
 {
 chomp ($line_pv) ;
 ($title_pv,$counts) = split (' ', $line_pv) ;

 # advance the article list until it catches up with the current title
 while (($title_at ne "") && ($title_pv gt $title_at))
 {
 $title_at = <ARTICLES> ;
 $title_at = "" if ! defined $title_at ; # end of article list
 chomp ($title_at) ;
 }

 if ($title_pv ne $title_at) # no matching article -> a miss
 {
 $title_pv2 = $title_pv ;
 $title_pv2 =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 print EXTRACT "$title_pv2 $counts\n" ;

 if ($title_pv2 !~ /:/) # temp treat all titles with : as namespaces
 {
 $counts =~ s/^(\d+).*$/$1/ ; # keep daily total only
 push @data, "$counts $title_pv2" ;
 }
 }
 }

 # most viewed misses first
 @data = sort {$b <=> $a} @data ;
 foreach $line (@data)
 { print MISSING "$line\n" ; }

 close ARTICLES ;
 close PAGEVIEWS ;
 close EXTRACT ;
 close MISSING ;
}
 165+
# Print a message to both the console and the open LOG file.
sub Log
{
 $msg = shift ; # deliberately a package global, as elsewhere in this script
 for my $handle (\*STDOUT, \*LOG)
 { print {$handle} $msg ; }
}
 172+
# Report a fatal error on console and LOG file, then stop the script.
sub Abort
{
 $msg = shift ; # package global, matching the style of sub Log
 my $text = "Abort script\nError: $msg\n" ;
 print STDOUT $text ;
 print LOG $text ;
 exit ;
}
 180+
# Format a duration in seconds as "M min, S sec".
sub mmss
{
 my ($secs) = @_ ;
 my $minutes = int ($secs / 60) ;
 my $rest = $secs % 60 ;
 return "$minutes min, $rest sec" ;
}
 186+
Index: trunk/wikistats/dammit.lt/cellar/!DammitScanCompactedFiles.pl
@@ -0,0 +1,364 @@
#!/usr/local/bin/perl

# !DammitScanCompactedFiles.pl
# Scans compacted (7-zipped) daily page count files for a grep pattern and
# post-processes the matches into csv files with daily and hourly view counts
# per language and per language:title.

# 27 April 2010 renamed from WikiStatsScanCompactedDammitFiles.pl

 use CGI qw(:all);
 use URI::Escape;
 use Getopt::Std ;
 use Cwd ;

# grep pagecounts-20090428_fdt -f pandemic.txt > scan.txt
# utf-8 encoder for non western article titles: http://www.motobit.com/util/url-encoder.asp

# &UncompactVisitorStats ('.') ;
# exit ;

 # presence of this dir signals the script runs on the stats server — TODO confirm
 $bayes = -d "/a/dammit.lt/pagecounts" ;
 $path_7za = "/usr/lib/p7zip/7za" ;
 $path_grep = "/bin/grep" ;

# if (! $bayes)
# {
# print "Test on Windows\n" ;
# include IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
# include IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
# }

 $| = 1; # flush screen output
 $true = 1 ;
 $false = 0 ;
 $threshold = 5 ;
 $jobstart = time ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/out" -f 20090429 -t 20090429 -p ''
 my $options ;
 getopt ("ioftp", \%options) ;

 # all five command line arguments are mandatory
 if (! defined ($options {"i"})) { &Abort ("Specify input dir as -i dirname") } ;
 if (! defined ($options {"o"})) { &Abort ("Specify output dir as -o dirname") } ;
 if (! defined ($options {"f"})) { &Abort ("Specify from date as -f yyyymmdd") } ;
 if (! defined ($options {"t"})) { &Abort ("Specify till date as -t yyyymmdd") } ;
 if (! defined ($options {"p"})) { &Abort ("Specify pattern as -p \".....\"") } ;

 $dir_in = $options {"i"} ;
 $dir_out = $options {"o"} ;
 $datefrom = $options {"f"} ;
 $datetill = $options {"t"} ;
 $pattern = $options {"p"} ;

 print "Pattern '$pattern'\n" ;
 # special value 'html': derive the pattern from a saved interwiki links page
 if ($pattern eq "html")
 { $pattern = &GetPattern ; }

 $work = cwd() ;
 print "Work dir $work\n" ;
 # dirs without a path separator are taken relative to the work dir
 if ($dir_in !~ /[\/\\]/)
 { $dir_in = "$work/$dir_in" ; }
 if ($dir_out !~ /[\/\\]/)
 { $dir_out = "$work/$dir_out" ; }

 if (! -d $dir_in) { &Abort ("Input dir not found: $dir_in") } ;
 if (! -d $dir_out)
 {
 print "Create output dir $dir_out\n" ;
 mkdir $dir_out ;
 if (! -d $dir_out)
 { &Abort ("Output dir could not be created.") } ;
 }

 print "\nParm pattern: $pattern\n\n" ;
# $pattern = "^nl.z Amsterdam\n^de.z Leiden\n" ;
 # a pattern starting with '#' names an existing pattern file;
 # otherwise the pattern text is written to a file for grep -f
 if ($pattern =~ /^\#/)
 { $file_pattern = substr ($pattern,1) ; }
 else
 {
 $pattern =~ s/\\n/\n/gs ;
 $file_pattern = "$dir_out/pattern.txt" ;
 print "Write pattern to $file_pattern\n" ;
 open PATTERN, ">", $file_pattern ;
 print PATTERN $pattern ;
 close PATTERN ;
 }

 if (($datefrom !~ /^20\d{6}$/))
 { &Abort ("Specify from date: as -f yyyymmdd") ; }
 if (($datetill !~ /^20\d{6}$/))
 { &Abort ("Specify till date: as -t yyyymmdd") ; }

 # monthly folder names have form yyyy-mm
 $dirfrom = substr ($datefrom,0,4) . "-" . substr ($datefrom,4,2) ;
 $dirtill = substr ($datetill,0,4) . "-" . substr ($datetill,4,2) ;

 print "\nScan pagecount files\nParm in: $dir_in\nParm out: $dir_out\nParm from: $datefrom (in folder $dirfrom)\nParm till: $datetill (in folder $dirtill)\nParm pattern: $pattern\n\n" ;

 open LOG, ">>", "$work/WikiStatsScanVisitorstats.log" ;

 &ScanVisitorStats ($dir_in, $dir_out, $dirfrom, $dirtill, $datefrom, $datetill) ;
 &UncompactVisitorStats ($dir_out) ;

 &Log ("\nReady\n") ;
 close LOG ;
 exit ;
 102+
# Collect compacted daily page count files between $datefrom and $datetill
# (in monthly folders yyyy-mm) and grep each of them, via 7z decompression to
# stdout, for the patterns in $file_pattern; all matches are appended to
# "$dir_out/scan.txt", preceded by a '# yyyymmdd' date marker line.
# Globals used: $file_pattern, $jobstart.
sub ScanVisitorStats
{
 my $dir_in = shift ;
 my $dir_out = shift ;
 my $dirfrom = shift ;
 my $dirtill = shift ;
 my $datefrom = shift ;
 my $datetill = shift ;

 my @dirs ;
 my @files ;

 # collect monthly folders (yyyy-mm) within the requested range
 chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 local (*DIR);
 opendir (DIR, ".");
 @files = () ;
 while ($file_in = readdir (DIR))
 {
 if (! -d $file_in)
 { next ; }
 if ($file_in !~ /^20\d\d-\d\d$/)
 { next ; }
 if (($file_in lt $dirfrom) || ($file_in gt $dirtill))
 { next ; }
 &Log ("Store folder $file_in\n") ;
 push @dirs, $file_in ;
 }
 &Log ("\n") ;
 closedir (DIR); # was 'closedir (DIR, ".")': closedir takes only a dirhandle

 @dirs = sort @dirs ;

 # collect compacted daily files within the requested date range
 foreach $dir (@dirs)
 {
 chdir ("$dir_in/$dir") || &Abort ("Cannot chdir to $dir_in/$dir\n") ;
 local (*DIR);
 opendir (DIR, ".");
 while ($file_in = readdir (DIR))
 {
 if (-d $file_in)
 { next ; }
 if ($file_in !~ /^pagecounts-\d{8,8}_fdt.7z$/)
 { next ; }
 # "\xFF" makes the till-compare inclusive for any suffix
 if (($file_in lt "pagecounts-$datefrom") || ($file_in gt "pagecounts-$datetill\xFF"))
 { next ; }
 &Log ("Store file $file_in\n") ;
 push @files, "$dir/$file_in" ;
 }
 closedir (DIR); # was 'closedir (DIR, ".")': closedir takes only a dirhandle
 }
 &Log ("\n") ;

 if ($#files > -1)
 {
 @files = sort @files ;

 unlink "$dir_out/scan.txt" ;
 foreach $file (@files)
 {
 my $filestart = time ;
 my $date = $file ;
 $date =~ s/^.*?-(\d{8,8})_.*$/$1/ ;
 $size = -s "$dir_in/$file" ;
 print "Scan file '$file' ($size bytes)\n" ;

 # tag the matches that follow with the date they belong to
 $cmd = "echo \"\# $date\" >> $dir_out/scan.txt" ;
 print "Cmd: $cmd\n" ;
 $result = `$cmd` ;

 # NOTE(review): invokes plain '7z' although $path_7za is configured at the
 # top of the script — confirm which binary is intended
 $cmd = "7z -so e $dir_in/$file | grep -i -f $file_pattern >> $dir_out/scan.txt" ;
 print "Cmd: $cmd\n" ;
 $result = `$cmd` ;

 print "File done in " . &mmss(time - $filestart) . "\n\n" ;
 }

 print "Job done in " . &mmss(time - $jobstart) . "\n" ;
 print "Average file took " . &mmss(int (time - $jobstart)/($#files+1)) . "\n" ;
 }
 &Log ("\n\n") ;
}
 184+
# Expand "$dir_out/scan.txt" (written by ScanVisitorStats) into three csv
# files: daily totals per language:title, hourly counts per language:title,
# and hourly counts per language. Input lines are either '# yyyymmdd' date
# markers or 'lang title counts' match lines, where counts is a compacted
# string like 'A12B7' (letter = hour, 'A' is hour 00, digits = count).
sub UncompactVisitorStats
{
 &Log ("\nUncompact visitors stats\n\n") ;
 my $dir_out = shift ;

 my $file_in = "$dir_out/scan.txt" ;
 my $file_out1 = "$dir_out/CountsDailyPerLanguageTitles.csv" ; # totals for full day per language:title
 my $file_out2 = "$dir_out/CountsHourlyPerLanguageTitle.csv" ; # hourly counts per language:title (hours vertical)
 my $file_out3 = "$dir_out/CountsHourlyPerLanguage.csv" ; # hourly counts per language (hours vertical)
 my ($date,$time,$year,$month,$day) ; ;

 open IN, '<', $file_in ;
 binmode IN ;

 while ($line = <IN>)
 {
 # process timestamp: date marker lines apply to all match lines that follow;
 # the date is stored as a spreadsheet formula
 if ($line =~ /^#/)
 {
 $date = substr ($line,2,8) ;
 $year = substr ($date,0,4) ;
 $month = substr ($date,4,2) ;
 $day = substr ($date,6,2) ;
 $date = "=DATE($year,$month,$day)" ;
 next ;
 }

 chomp ($line) ;
 ($lang,$title,$counts) = split (" ", $line) ;
 $title =~ s/,/&comma;/g ; # titles may contain commas -> escape for csv
 $lang =~ s/\.z// ; # '.z' suffix marks wikipedia — TODO confirm
 $lang =~ s/\.y/2/ ;
 $counts =~ s/^\d+// ; # remove (redundant) preceding total

 # store hourly counts: peel off one letter (hour) plus digits (count) per pass
 while ($counts ne "")
 {
 $letter = substr ($counts,0,1) ;
 $counts = substr ($counts,1) ;
 ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 $counts =~ s/^\d+(.*)$/$1/ ;
 $h = sprintf ("%02d", ord ($letter) - ord ('A')) ; # 'A' -> hour 00
 $time = $date . "+TIME($h,0,0)" ;

 $hits1 {"$lang,$title,\"$date\""} += $count ;
 $key = "$lang:$title" ;
 $times {$time}++ ;
 $keys {$key} ++ ;
 $languages {$lang} ++ ;
 $hits2 {"$time,$key"} += $count ;
 $hits3 {"$time,$lang"} += $count ;
 }
 }

 close IN ;

 # file_out1: write totals for full day per language:title
 # quick way to see which titles are visited significantly
 @lines = sort @lines ; # NOTE(review): @lines is never filled — this looks like a leftover no-op
 open OUT, '>', $file_out1 ;
 binmode OUT ;
 foreach $key (sort keys %hits1)
 { print OUT "$key,${hits1{$key}}\n" ; }
 close OUT ;

 # file_out2: write hourly counts per language:title (hours vertical)
 open OUT, '>', $file_out2 ;
 binmode OUT ;

 # header line
 $line = "date / group" ;
 foreach $key (sort keys %keys)
 { $line .= ",$key" ; }
 $line .= "\n" ;
 print OUT $line ;

 # one row per hour, one column per language:title; missing cells become 0
 foreach $time (sort keys %times)
 {
 $line = "\"$time\"" ;
 foreach $key (sort keys %keys)
 {
 $count = $hits2 {"$time,$key"} ;
 if ($count eq "")
 { $count = 0 ; }
 $line .= ",$count" ;
 }
 $line .= "\n" ;
 print OUT $line ;
 }
 close OUT ;

 # file_out3: write hourly counts per language (hours vertical)
 open OUT, '>', $file_out3 ;
 binmode OUT ;

 # header line
 $line = "date / group" ;
 foreach $lang (sort keys %languages)
 { $line .= ",$lang" ; }
 $line .= "\n" ;
 print OUT $line ;

 # one row per hour, one column per language; missing cells become 0
 foreach $time (sort keys %times)
 {
 $line = "\"$time\"" ;
 foreach $lang (sort keys %languages)
 {
 $count = $hits3 {"$time,$lang"} ;
 if ($count eq "")
 { $count = 0 ; }
 $line .= ",$count" ;
 }
 $line .= "\n" ;
 print OUT $line ;
 }
 close OUT ;

}
 303+
# Build a grep pattern from 'wikilinks.html' (a saved page with interwiki
# links): titles linked from more than 10 languages match in any project
# ("title" line), others only in their specific language ("^lang\.z title").
# Returns the pattern text, one regexp per line, suitable for grep -f.
sub GetPattern
{
 print "GetPattern\n" ;
 open HTML, '<', 'wikilinks.html' or die "Could not open 'wikilinks.html'\n" ;
 $pattern = "" ;
 while ($line = <HTML>)
 {
 if ($line =~ /class=\"interwiki/)
 {
 chomp ($line) ;
 $lang = $line ;
 $lang =~ s/^.*?interwiki-(\w+).*$/$1/ ; # e.g. class="interwiki-de" -> 'de'
 $title = $line ;
 $title =~ s/^.*?href=\"([^\"]+)\".*$/$1/ ; # link target url
 $title =~ s/^.*\/([^\/]+)$/$1/ ; # last url segment = article title
 # was '@languages {..}' / '@langcnt {..}': single hash elements need the
 # '$' sigil — a hash slice cannot be target of '.=' or '++'
 $languages {$title} .= "$lang," ;
 $langcnt {$title}++ ;
 }
 }
 close HTML ;
 print "\n\n\n" ;

 # most-linked titles first
 foreach $title (sort {$langcnt {$b} <=> $langcnt {$a}} keys %langcnt)
 {
 $count = $langcnt {$title} ;
 if ($count > 10)
 { $pattern .= "$title\n" ; }
 else
 {
 $langlist = $languages {$title} ;
 @langs = split (',', $langlist) ;
 foreach $lang (@langs)
 {
 print "$lang $title\n" ;
 # double backslash so the emitted grep pattern contains a literal '\.'
 # (in a double-quoted string "\." loses the backslash)
 $pattern .= "^$lang\\.z $title\n" ;
 }
 }
 }
 return ($pattern) ;
}
 344+
# Print a message to both the console and the open LOG file.
sub Log
{
 $msg = shift ; # deliberately a package global, as elsewhere in this script
 for my $handle (\*STDOUT, \*LOG)
 { print {$handle} $msg ; }
}
 351+
# Report a fatal error on console and LOG file, then stop the script.
sub Abort
{
 $msg = shift ; # package global, matching the style of sub Log
 my $text = "Abort script\nError: $msg\n" ;
 print STDOUT $text ;
 print LOG $text ;
 exit ;
}
 359+
# Format a duration in seconds as "M min, S sec".
sub mmss
{
 my $total = shift ;
 return sprintf ("%d min, %d sec", int ($total / 60), $total % 60) ;
}
 365+
Index: trunk/wikistats/dammit.lt/dammit_compact_monthly.sh
@@ -0,0 +1,11 @@
 2+#!/bin/sh
 3+
 4+ulimit -v 8000000
 5+
 6+# dte=$(date +%Y%m)
 7+# dte=$(date --date "$dte -1 days" +%Y%m)
 8+# echo "Compact dammit.lt files for one day: $dte"
 9+
 10+echo "Compact dammit.lt files for one month"
 11+nice perl /a/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl -m -d 201001 -i /a/dammit.lt/pagecounts -o /a/dammit.lt/pagecounts/monthly >> /a/dammit.lt/pagecounts/monthly/compact_log.txt
 12+
Property changes on: trunk/wikistats/dammit.lt/dammit_compact_monthly.sh
___________________________________________________________________
Added: svn:eol-style
113 + native
Index: trunk/wikistats/dammit.lt/DammitReportPageRequestsStaffWikis.pl
@@ -0,0 +1,281 @@
#!/usr/bin/perl

# DammitReportPageRequestsStaffWikis.pl
# Generates monthly 'most viewed pages' reports (html/txt/csv) for the staff
# wikis from filtered daily page count files. Run daily; early in a month the
# previous month is regenerated as well.

# bash file for daily generation and copy
# blank article title ?!

# no warnings 'uninitialized';

 use lib "/home/ezachte/lib" ; # general routines
 use lib "/home/ezachte/wikistats" ; # WikiReports*.pm modules
 use lib "W:/! Perl/Wikistats" ; # test env

 use EzLib ;
 ez_lib_version (8) ;
 $trace_on_exit = $true ;

# use Time::Local ;
# use Net::Domain qw (hostname);

 use WikiReportsDate ;
 use WikiReportsLiterals ;
 use WikiReportsOutputMisc ;
 use WikiReportsScripts ;
 use WikiReportsNoWikimedia ;
 use WikiReportsLocalizations ;
 use WikiReportsHtml ;

 # note: $mon is 0-based, $year is years since 1900 (adjusted in SetMonths)
 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);

 &SetMonths ;
 &SetLiterals ;
 &SetScripts ;

 # always report the current month; in the first days of a month also
 # refresh the previous month's report
 &CountMostViewedPages ($this_month) ;
 if ($mday <= 5)
 { &CountMostViewedPages ($prev_month) ; }

 exit ;
 39+
# Derive $this_month, $prev_month and $prev_prev_month ("yyyy-mm") from the
# 0-based $mon and 1900-based $year that localtime left in the globals.
# Note: deliberately keeps stepping the global $mon/$year backwards,
# exactly like the original code did.
sub SetMonths
{
 $mon ++ ; # localtime months are 0-based
 $year += 1900 ;
 $this_month = sprintf ("%04d-%02d", $year, $mon) ;
 foreach my $slot (\$prev_month, \$prev_prev_month)
 {
 $mon -- ;
 if ($mon == 0) { $mon = 12 ; $year -- ; } # wrap into previous year
 ${$slot} = sprintf ("%04d-%02d", $year, $mon) ;
 }
}
 50+
# Produce the 'most viewed pages' report for one month ("yyyy-mm"):
# reads the filtered daily pagecount files for that month, accumulates views
# per project/article, and writes txt/csv files (sorted by title and by
# views) plus an html report per project.
# Relies on helpers from the WikiReports* modules: &LogT, &btn, &opt,
# &GenerateHtmlStart, month_english_short, date_time_english.
sub CountMostViewedPages
{
 my $month = shift ;
 ($month2 = $month) =~ s/-// ; # "yyyy-mm" -> "yyyymm", as used in file names

 undef %views ; # fresh accumulation per invocation

 &LogT ("Count pages for $month\n\n") ;

 if ($job_runs_on_production_server)
 {
 &LogT ("Job runs on production server\n") ;
 $path_in = "/a/dammit.lt/filtered" ;
 $path_out = "/mnt/htdocs/page_views" ;
 }
 else
 {
 &LogT ("Job runs on local test server\n") ;
 $path_in = "w:/! Perl/Dammit/Page Requests Staff Wikis" ;
 $path_out = "w:/! Perl/Dammit/Page Requests Staff Wikis" ;
 }

 &LogT ("Path in: $path_in\n") ;
 &LogT ("Path out: $path_out\n") ;
 chdir $path_in ;
 my @files = glob "*" ; # glob on qualified dir on Windows gives problems, hence chdir ??

 # track first and last day actually present, to label the reporting period
 $first = "" ;
 $last = "" ;
 foreach $file (sort @files)
 {
 next if $file !~ /pagecounts-$month2\d\d.txt/ ;
 &LogT ("$file\n") ;

 # derive "yyyy-mm-dd" for first and latest processed file
 if ($first eq "")
 { ($first = $file) =~ s/[^\d]//g ; }
 ($last = $file) =~ s/[^\d]//g ;
 $first =~ s/(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/ ;
 $last =~ s/(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/ ;
 $first_day = substr ($first,8,2) ;
 $last_day = substr ($last ,8,2) ;

 # NOTE(review): IN is never closed explicitly; reopening each iteration
 # closes the previous handle implicitly
 open IN, '<', $file ;
 while ($line = <IN>)
 {
 chomp $line ;
 ($project, $article, $counts) = split (' ', $line) ;

 # skip junk entries
 next if $article =~ /^\s*$/ ;
 next if $project eq "quality.m" ; # obsolete
 next if $article =~ /:\/\// ; # e.g. http://
 next if $article =~ /\.php/ ;

 $article =~ s/^[\/\\]*// ; # strip leading slashes/backslashes
 $article = ucfirst $article ;
 $project =~ s/\.m$// ;
 $project = ucfirst $project ;
 $projects {$project} ++ ;

 # counts field starts with the daily total; keep only that
 ($daytotal = $counts) =~ s/^(\d+).*$/$1/ ;
 $views {$project} {$article} += $daytotal ;

 # if ($article =~ /China/)
 # { print "$project $article + $daytotal -> " . $views {$project} {$article} . "\n" ; }
 }
 }

 $month_eng = month_english_short (substr($month,5,2) - 1) . ' ' . substr ($month,0,4) ;

 $period = 'day ' . (substr ($first,8,2)+0) . '-' . (substr ($last,8,2)+0) ;

 foreach $project (sort keys %projects)
 {
 &LogT ("\nWrite totals for project $project for month $month (day $first_day - $last_day)\n\n") ;

 # === Sort by title ===

 @articles = sort keys %{$views {$project}} ;
 next if $#articles == -1 ;

 open TXT, '>', "$path_out/PageViews${project}-$month-ByTitle.txt" ;
 open CSV, '>', "$path_out/PageViews${project}-$month-ByTitle.csv" ;

 print TXT "title,views (period: $first - $last)\n" ;
 print CSV "views,title,period: $first - $last\n" ;

 foreach $article (@articles)
 {
 # txt file gets the %XX-decoded title, csv the raw (quoted if needed)
 ($article2 = $article) =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 print TXT "$article2,${views {$project} {$article}}\n" ;

 $article2 = $article ;
 if ($article2 =~ /,/)
 { $article2 = "\"$article2\"" ; }
 print CSV "${views {$project} {$article}},$article2\n" ;
 }
 close TXT ;
 close CSV ;

 # === Sort by views ===

 # previous/next month navigation buttons for the html report
 if ($month eq $this_month)
 {
 $url_prev = "PageViews${project}-$prev_month-ByViews.html" ;
 $url_next = "" ;
 $out_button_prev = &btn (" < ", $url_prev) ;
 $out_button_next = "" ;
 }
 elsif ($month eq $prev_month)
 {
 $url_prev = "PageViews${project}-$prev_prev_month-ByViews.html" ;
 $url_next = "PageViews${project}-$this_month-ByViews.html" ;
 $out_button_prev = &btn (" < ", $url_prev) ;
 $out_button_next = &btn (" > ", $url_next) ;

 if (! -e $url_prev)
 { $out_button_prev = "" ; }
 }

 my $out_zoom = "" ;
 my $out_options = "" ;
 my $out_explanation = "" ; #Based on Domas' <a href='http://dammit.lt/wikistats/'>page view files</a>" ;
 my $out_page_subtitle = "" ;
 my $out_crossref = "" ;
 my $out_description = "" ;
 my $out_button_switch = "" ;
 my $out_msg = "<b>$month_eng ($period)</b>" ;
 my $lang = "en" ;

 my $out_html_title = "$project wiki page views" ;
 my $out_page_title = "$project wiki page views" ;

 $out_scriptfile = "<script language=\"javascript\" type=\"text/javascript\" src=\"WikipediaStatistics14.js\"></script>\n" ;
 $out_style =~ s/td/td {font-size:12px}\nth {font-size:12px}\ntd/ ; # script definition needs clean up

 # project selector: current project first, then the others
 $out_options = &opt ("PageViews${project}-$month-ByViews.html", $project) ;
 foreach $project2 (keys %projects)
 {
 if ($project2 ne $project)
 { $out_options .= &opt ("PageViews${project2}-$month-ByViews.html", $project2) ; }
 }

 $unicode = $true ;
 &GenerateHtmlStart ($out_html_title, $out_zoom, $out_options,
 $out_page_title, $out_page_subtitle, $out_explanation,
 $out_button_prev, $out_button_next, $out_button_switch,
 $out_crossref, $out_msg) ;

 $out_html =~ s/Sitemap.htm/http:\/\/stats.wikimedia.org/ ; # Q&D patch
 $out_html =~ s/ Home / stats.wikimedia.org / ; # Q&D patch

 @articles = sort {$views {$project}{$b} <=> $views {$project}{$a}} keys %{$views {$project}} ;

 open TXT, '>', "$path_out/PageViews${project}-$month-ByViews.txt" ;
 open CSV, '>', "$path_out/PageViews${project}-$month-ByViews.csv" ;

 print TXT "title,views (period: $first - $last)\n" ;
 print CSV "views,title,period: $first - $last\n" ;

 $out_html .= "<p><b>Other formats</b>: " ;
 $out_html .= "ordered by views: <a href='PageViews${project}-$month-ByViews.txt'>text file</a> / <a href='PageViews${project}-$month-ByViews.csv'>csv file</a>, " ;
 $out_html .= "ordered by title: <a href='PageViews${project}-$month-ByTitle.txt'>text file</a> / <a href='PageViews${project}-$month-ByTitle.csv'>csv file</a><p>" ;
 $out_html .= "<table border=1>\n" ;
 $out_html .= "<tr><th class=cb>Rank</th><th class=cb>Views</th><th class=lb>Title</th></tr>\n" ;

 # txt and csv get all articles; the html table only the top 1000
 $lines = 0 ;
 foreach $article (@articles)
 {
 ($article2 = $article) =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 print TXT "$article2,${views {$project} {$article}}\n" ;

 $article2 =~ s/_/ /g ;
 if (++$lines <= 1000)
 { $out_html .= "<tr><td class=c>$lines</td><td class=r>${views {$project} {$article}}</td><td class=l><a href='http://$project.wikimedia.org/wiki/$article'>$article2</a></td></tr>\n" ; }

 $article2 = $article ;
 if ($article2 =~ /,/)
 { $article2 = "\"$article2\"" ; }
 print CSV "${views {$project} {$article}},$article2\n" ;
 }

 $out_html .= "</table>\n" ;

 close TXT ;
 close CSV ;

 $out_html .= "<p><small>Counts based on <a href='http://dammit.lt/wikistats/'>Domas' hourly pagecount files</a><br>" .
 "File generated on " . date_time_english (time) . "<br>Author: Erik Zachte</small>" ;

 open HTML, '>', "$path_out/PageViews${project}-$month-ByViews.html" ;
 print HTML $out_html ;
 close HTML ;

 if ($month eq $this_month) # static url
 {
 open HTML, '>', "$path_out/PageViews${project}.html" ;
 print HTML $out_html ;
 close HTML ;
 }
 }
}
 252+
# Translates one UTF-8 encoded character into a single-byte (Latin-1)
# character when the code point fits in 0..255; otherwise the input is
# returned unchanged.
sub UnicodeToAscii {
    my $utf8_char = shift ;

    my $lead = ord (substr ($utf8_char, 0, 1)) ;

    # plain ascii lead byte: nothing to decode (will not occur in this script)
    return $utf8_char if $lead < 128 ;

    # strip the length marker bits from the lead byte
    my $code_point ;
    if    ($lead >= 252) { $code_point = $lead - 252 ; }
    elsif ($lead >= 248) { $code_point = $lead - 248 ; }
    elsif ($lead >= 240) { $code_point = $lead - 240 ; }
    elsif ($lead >= 224) { $code_point = $lead - 224 ; }
    else                 { $code_point = $lead - 192 ; }

    # fold in six payload bits from every continuation byte
    foreach my $pos (1 .. length ($utf8_char) - 1)
    { $code_point = $code_point * 64 + ord (substr ($utf8_char, $pos, 1)) - 128 ; }

    # $utf8_char =~ s/([\x80-\xFF])/("%".sprintf("%02X",$1))/gie ;
    return $code_point < 256 ? chr ($code_point) : $utf8_char ;
}
 281+
 282+
Index: trunk/wikistats/dammit.lt/DammitSyncFiles.pl
@@ -0,0 +1,197 @@
#!/usr/bin/perl

# DammitSyncFiles.pl
# Downloads new hourly pagecounts-/projectcounts- files from
# http://dammit.lt/wikistats/ by scraping its directory index page;
# projectcounts files are appended to a yearly tar archive.

# 27 April 2010 renamed from WikiStatsDammitSync.pl

 use Time::Local ;
 use Archive::Tar;

 $tar = Archive::Tar->new;

 $| = 1; # flush screen output

 $maxdaysago = 40; # do not download files more than this ago

 # a local copy of the index page signals a test run
 if (-e "a_dammit.lt_index.html") # test
 { $file_html = "a_dammit.lt_index.html" ; }
 else
 {
 open LOG, '>>', "/a/dammit.lt/WikiStatsDammitSync.log" ;

 $file_html = "/a/dammit.lt/index.html" ;
 unlink $file_html ;
 $cmd = "wget -O $file_html http://dammit.lt/wikistats/" ;
 $result = `$cmd` ;
 # NOTE(review): backticks return wget's stdout (a string), not its exit
 # status; '== 0' compares that string numerically — confirm intent
 if ($result == 0)
 { $result = "OK" ; }
 &Log ("Cmd '$cmd' -> $result \n\n") ;

 if (! -e $file_html) { &Abort ("File $file_html not found") ; }
 if (-s $file_html == 0) { &Abort ("File $file_html empty") ; }
 }

 $timestart = time ;

 chdir "/a/dammit.lt/projectcounts" ;
 $cmd = `pwd` ;
 &Log ("Cmd '$cmd'\n") ;
 # NOTE(review): this executes the OUTPUT of pwd (a directory path) as a
 # shell command — almost certainly unintended, confirm and remove
 $result = `$cmd` ;
 print "$result\n" ;

 # scrape the index page: <title> lines announce the (sub)directory being
 # listed, application/octet-stream lines are downloadable count files
 open HTML,'<',$file_html ;
 while ($line = <HTML>)
 {
 if ($line =~ /<title>/)
 {
 $subdir = "" ;
 if ($line =~ /archive/)
 {
 $line =~ s/^.*?\/wikistats\/// ;
 $line =~ s/<.*$// ;
 chomp $line ;
 $subdir = $line ;
 }
 &Log ("Subdir = '$subdir'\n") ;
 next ;
 }

 if ($line !~ /application\/octet-stream/) { next ; }

 # extract file name and its 'yyyy-Mon-dd hh:mm:ss' timestamp
 ($file = $line) =~ s/^.*?a href=\"([^"]+)\".*$/$1/s ;
 ($date = $line) =~ s/^.*?class=\"m\">([^<]+)<.*$/$1/s ;
 ($date,$time) = split (' ', $date) ;

 if ($file =~ /^pagecounts/)
 {
 # date embedded in the file name, e.g. pagecounts-20100131-...
 $yy = substr ($file,11,4) ;
 $mm = substr ($file,15,2) ;
 $dd = substr ($file,17,2) ;
 $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;

 print "$file: $daysago days ago\n" ;
 if ($daysago > $maxdaysago) { next ; }

 # $path_7z = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_fdt.7z" ;
 # if (-e $path_7z) { print "exists\n" ; next ; }

 # skip days that were already compacted (any compression flavour)
 $path = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_h" ;
 if ((-e "$path.7z") || (-e "$path.zip") || (-e "$path.bz2") || (-e "$path.gz"))
 { print "$path.[7z|zip|bz2|gz] exists\n" ; next ; }
 else
 { print "$path.[7z|zip|bz2|gz] new -> download\n" ; }
 }

 # if ($file =~ /^projectcounts/)
 # {
 # $yy = substr ($file,14,4) ;
 # $mm = substr ($file,18,2) ;
 # $dd = substr ($file,20,2) ;
 # $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;
 # if ($daysago > $maxdaysago) { next ; }
 # }


 # convert index page timestamp to a 'touch -t' argument (yymmddhhmm.ss)
 $yy = substr ($date,0,4) ;
 $mm = substr ($date,5,3) ;
 $dd = substr ($date,9,2) ;
 $hh = substr ($time,0,2) ;
 $nn = substr ($time,3,2) ;
 $ss = substr ($time,6,2) ;

 if ($mm eq 'Jan') { $mm = 1 ; }
 elsif ($mm eq 'Feb') { $mm = 2 ; }
 elsif ($mm eq 'Mar') { $mm = 3 ; }
 elsif ($mm eq 'Apr') { $mm = 4 ; }
 elsif ($mm eq 'May') { $mm = 5 ; }
 elsif ($mm eq 'Jun') { $mm = 6 ; }
 elsif ($mm eq 'Jul') { $mm = 7 ; }
 elsif ($mm eq 'Aug') { $mm = 8 ; }
 elsif ($mm eq 'Sep') { $mm = 9 ; }
 elsif ($mm eq 'Oct') { $mm = 10 ; }
 elsif ($mm eq 'Nov') { $mm = 11 ; }
 elsif ($mm eq 'Dec') { $mm = 12 ; }
 else { &Abort ("Invalid month '$mm' in file date $date $time") ; }

 $date2 = sprintf ("%02d%02d%02d%02d%02d.%02d", ($yy-2000), $mm, $dd, $hh, $nn, $ss) ;

 if ($file =~ /^(?:page|project)counts-2/)
 {

 if ($file =~ /^pagecounts/)
 { $path = "/a/dammit.lt/pagecounts/$file" ; }
 else
 { $path = "/a/dammit.lt/projectcounts/$file" ; }

 # skip files already downloaded, unless the copy is empty
 if (-e $path)
 {
 &Log ("File $path exists\n") ;
 if (-s $path == 0)
 {
 &Log ("File $path empty -> overwrite\n") ;
 unlink $path ;
 }
 else { next ; }
 }

 # projectcounts files may already live inside the yearly tar archive
 if ($file =~ /^projectcounts/)
 {
 $tar_file = "/a/dammit.lt/projectcounts/projectcounts-$yy.tar" ;
 if (-e $tar_file)
 {
 # cache: only (re)read the archive listing when the year changes
 if ($tar_file ne $tar_file_prev)
 {
 &Log ("\nRead tar file $tar_file\n") ;
 $tar->read($tar_file);
 $tar_file_prev = $tar_file ;
 }
 if ($tar->contains_file ($file))
 {
 &Log ("File $file exists in tar file $tar_file\n") ;
 next ;
 }
 }
 else
 { &Log ("Tar file $tar_file not found\n") ; }
 }

 &Log ("Write file $path, set date $date2\n") ;

 # NOTE(review): downloads from mituzas.lt although the index page came
 # from dammit.lt — confirm both hosts serve the same files
 $cmd = "wget -a /a/dammit.lt/wget.log -O $path http://mituzas.lt/wikistats/$subdir$file" ;
 $result = `$cmd` ;
 if ($result == 0)
 { $result = "OK" ; }
 &Log ("Cmd '$cmd' -> $result \n\n") ;

 # give the local copy the remote file's timestamp
 `touch $path -t $date2` ;

 # projectcounts files go into the yearly tar; the loose copy is removed
 if ($file =~ /^projectcounts/)
 {
 $cmd = "tar --append --file=$tar_file $file" ;
 &Log ("Cmd '$cmd'\n") ;
 $result = `$cmd` ;
 print "$result\n" ;
 unlink $path ;
 }
 }
 }

 &Log ("Ready in " . (time - $timestart) . " sec.\n") ;
 close HTML ;
 close LOG ;
 exit ;
 182+
# Prefix a message with the current local time (hh:mm:ss) and write it to
# both the console and the LOG file.
# Note: assigns the package global $msg, as the rest of this script does.
sub Log
{
 $msg = shift ;
 my ($sec, $min, $hour) = localtime (time) ;
 $msg = sprintf ("%02d:%02d:%02d ", $hour, $min, $sec) . $msg ;
 for my $handle (\*STDOUT, \*LOG)
 { print {$handle} $msg ; }
}
 192+
# Log a fatal error (timestamped, via &Log) and stop the script.
sub Abort
{
 &Log ($msg = shift) ; # keeps the package global $msg in sync, as before
 exit ;
}
Index: trunk/wikistats/dammit.lt/DammitCompactHourlyPageCountFiles.pl
@@ -0,0 +1,964 @@
#!/usr/local/bin/perl

# Merge hourly pagecounts-*.gz files from http://dammit.lt/wikistats into
# one compact per-day file (see CompactVisitorStats / MergeFilesFullDay).
#
# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl

# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases

# Ideas:
# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
# 2 frequency distribution hits per file per first letter _-> manifest crawler
# assuming crawler collects articles in alphabetical order
# 3 first letter uppercase -> sort (in sections per first two chars ?)

 use lib "/home/ezachte/lib" ;
 use EzLib ;

 # $true/$trace_on_exit appear to be provided by EzLib before the local
 # assignments below -- TODO confirm against EzLib.pm
 $trace_on_exit = $true ;
 ez_lib_version (13) ;

 # set defaults mainly for tests on local machine
 default_argv "-i C:/bayes_backup/a/dammit.lt/pagecounts|-t C:/bayes_backup/a/dammit.lt|-f C:/bayes_backup/a/dammit.lt|-o C:/bayes_backup/a/dammit.lt|-d 20101215" ;

 use CGI qw(:all);
 use URI::Escape;
 use Getopt::Std ;
 use Cwd ;
 # presence of /a/dammit.lt distinguishes the production host from a
 # local (Windows) test environment
 $bayes = -d "/a/dammit.lt" ;
 $path_7za = "/usr/lib/p7zip/7za" ;
 if (! $bayes)
 {
 print "Test on Windows\n" ;
 # NOTE(review): 'use' executes at compile time, so these modules are
 # loaded regardless of the runtime $bayes test
 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
 use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
 }

 $| = 1; # flush screen output

 $true = 1 ;
 $false = 0 ;
 $threshold = 0 ; # minimum daily total for a title to be written (0 = keep all)
 undef %totals_per_namespace ;

 # languages whose per-title counts are additionally copied to the filter file
 $filter = "^(?:outreach|quality|strategy|usability)\.m\$" ;
 print "Filter: $filter\n" ;
 $reg_exp_filter = qr"$filter" ;

 # sentinel page used to monitor squid log coverage
 $track = "NonExistingPageForSquidLogMonitoring" ;
 print "Track: $track\n" ;
 $reg_exp_track = qr"$track" ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"

 # getopt("iodft") marks each of -i -o -d -f -t as taking a value
 my $options ;
 getopt ("iodft", \%options) ;

 if (! defined ($options {"i"})) { &Abort ("Specify input dir: -i dirname") } ;
 if (! defined ($options {"o"})) { &Abort ("Specify output dir: -o dirname") } ;
 if (! defined ($options {"f"})) { &Abort ("Specify filter dir: -f dirname") } ;
 if (! defined ($options {"t"})) { &Abort ("Specify tracking dir: -t dirname") } ;
 if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") } ;

 $dir_in = $options {"i"} ;
 $dir_out = $options {"o"} ;
 $dir_filtered = $options {"f"} ;
 $dir_track = $options {"t"} ;
 $daterange = $options {"d"} ;

 $work = cwd() ;
 print "Work dir $work\n" ;

 # directories given without any path separator are taken relative to cwd
 if ($dir_in !~ /[\/\\]/)
 { $dir_in = "$work/$dir_in" ; }

 if ($dir_out !~ /[\/\\]/)
 { $dir_out = "$work/$dir_out" ; }

 if ($dir_filtered !~ /[\/\\]/)
 { $dir_filtered = "$work/$dir_filtered" ; }

 if ($dir_track !~ /[\/\\]/)
 { $dir_track = "$work/$dir_track" ; }

 if (! -d $dir_in)
 { &Abort ("Input dir not found: $dir_in") } ;

 if (! -d $dir_out)
 {
 print "Create output dir $dir_out\n" ;
 mkdir $dir_out ;
 if (! -d $dir_out)
 { &Abort ("Output dir could not be created.") } ;
 }

 # accept one day (yyyymmdd), one month (yyyymm*), one year (yyyy*) or all (*)
 if (($daterange !~ /^\d{8}$/) && ($daterange !~ /^\d{6}\*$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
 { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") ; }

 print "\nCompress pagecount files\nin: $dir_in\nout: $dir_out\nflt: $dir_filtered\ntrack: $dir_track\ndate range: $daterange" ;
 # turn the shell-style wildcard into a regex fragment used for file matching
 $daterange =~ s/\*/\\d+/ ;

 open LOG, ">>", "$work/WikiStatsCompactDammitFiles.log" ;

 &CompactVisitorStats ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
# &UncompactVisitorStats ; # test only, to see if process is revertible

 &Log ("\nReady\n") ;
 close LOG ;
 exit ;
 111+
sub CompactVisitorStats
{
 # Collect all hourly pagecounts-*.gz files in $dir_in that match the
 # requested date range, then merge them into one daily file per date.
 # Fills the globals @files and %process_dates used by MergeFilesFullDay.
 my $dir_in = shift ;
 my $dir_out = shift ;
 my $dir_filtered = shift ;
 my $dir_track = shift ;
 my $daterange = shift ;

 chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;

 local (*DIR);
 opendir (DIR, ".") || &Abort ("Cannot open dir $dir_in\n") ;
 @files = () ;

 while ($file_in = readdir (DIR))
 {
 # fix: dot before 'gz' was unescaped; also {6,6} simplified to {6}
 next if $file_in !~ /^pagecounts-$daterange-\d{6}\.gz$/ ;

 push @files, $file_in ;
 }

 closedir (DIR) ; # fix: closedir takes a single dirhandle argument

 @files = sort @files ;

 # for a single explicit day insist on a full set of 24 hourly files
 if (($daterange =~ /^\d{8}$/) and ($#files < 23))
 { &Abort ("Less than 24 files found for date $daterange (found " . scalar (@files) . ")\n") ; }

 foreach $file (@files)
 {
 $date = substr ($file,11,8) ; # yyyymmdd part of pagecounts-yyyymmdd-hhmmss.gz
 $process_dates {$date}++ ;
 }

 &Log ("\n\n") ;

 foreach $date (sort keys %process_dates)
 { &MergeFilesFullDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $date) ; }
}
 151+
sub MergeFilesFullDay
{
 # Merge the 24 hourly pagecounts files for one day (from global @files)
 # into a single "pagecounts-yyyymmdd_h" file: one line per title with the
 # daily total followed by per-hour counts encoded as A..X (hour 0..23).
 # Also writes per-namespace totals, a per-language filter file and a
 # tracking file for the squid-log sentinel page.
 #
 # Fixes in this revision (logic otherwise unchanged):
 # - 'open ... || &Abort' had a precedence bug: || bound to the filename,
 #   so the error check never fired; now uses low-precedence 'or'
 # - global $hours_missing is reset per day (it used to leak a stale
 #   "missing hours" header into every subsequent day)
 # - $time_start_compression is now actually set before compressing
 my $dir_in = shift ;
 my $dir_out = shift ;
 my $dir_filtered = shift ;
 my $dir_track = shift ;
 my $date = shift ;

 my $year = substr ($date,0,4) ;
 my $month = substr ($date,4,2) ;
 my $day = substr ($date,6,2) ;

 my ($file_out1, $file_out2, $file_out3, $out_gz) ;

 # one output subdirectory per month
 $dir_out = "$dir_out/${year}-${month}" ;
 if (! -d $dir_out)
 {
 mkdir $dir_out ;
 if (! -d $dir_out)
 { &Abort ("Output dir could not be created: $dir_out") } ;
 }

 my @files_today = () ;
 foreach $file (@files)
 {
 # fix: dot before 'gz' was unescaped
 next if $file !~ /^pagecounts-$date-\d{6}\.gz$/ ;

 push @files_today, $file ;
 }

 # very few times (nearly) duplicate files are found for same hour
 # keep the largest and presumably most complete one
 # (first 25 chars = 'pagecounts-yyyymmdd-hhmms', i.e. same hour)
 for ($i = 0 ; $i < $#files_today ; $i++)
 {
 for ($j = $i+1 ; $j <= $#files_today ; $j++)
 {
 if (substr ($files_today [$i],0,25) eq substr ($files_today [$j],0,25))
 {
 $size_i = -s $files_today [$i] ;
 $size_j = -s $files_today [$j] ;
 print "${files_today [$i]}: $size_i\n" ;
 print "${files_today [$j]}: $size_j\n" ;
 if ($size_i > $size_j)
 {
 print "Keep ${files_today [$i]}\n\n" ;
 $files_today [$j]= "" ;
 }
 else
 {
 print "Keep ${files_today [$j]}\n\n" ;
 $files_today [$i]= "" ;
 }
 }
 }
 }

 $lines = 0 ;
 $hours_missing = "" ; # fix: reset per day, was a leaking global

 undef @in_gz ;
 undef $file_open ;
 my $time_start = time ;

 if ($bayes)
 {
 # on the production host write plain text, compress with bzip2 afterwards
 $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
 if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
 {
 &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
 return ;
 }
 if ($#files_today < 23)
 {
 &Log ("\nLess than 24 files found for target file '$file_out2.7z'. Skip this date.\n") ;
 return ;
 }

 # fix: was 'open ... "$file_out2" || &Abort' -> || bound to the filename
 open ($out_gz2, ">", $file_out2) or &Abort ("Output file '$file_out2' could not be opened.") ;
 }
 else
 {
 # local test environment: write gzip directly
 $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, count above threshold
 $out_gz2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 }

 binmode $out_gz2 ;

 $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
 &Log ("\nFilter file: $file_filtered\n") ;
 open ($out_filtered, '>', $file_filtered) or &Abort ("Filter file '$file_filtered' could not be opened.") ;
 binmode $out_filtered ;

 $file_track = "$dir_track/_PageCountsForSquidLogTracking.txt" ;
 &Log ("Tracking file: $file_track\n\n") ;

 for ($hour = 0 ; $hour < 24 ; $hour++)
 { $file_in_found [$hour] = $false ; }

 # open all hourly files and read their first record;
 # language codes get a suffix (.y/.z) so that e.g. 'en2' sorts after 'en'
 $files_in_open = 0 ;
 $files_in_found = 0 ;
 $langprev = "" ;
 foreach $file_in (@files_today)
 {
 next if $file_in eq "" ; # removed as near-duplicate above

 ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 $hour = (0+$hour) ;

 if ($bayes)
 # fix: was 'open ... "gzip ..." || &Abort' -> || bound to the command string
 { open ($in_gz [$hour], "-|", "gzip -dc \"$file_in\"") or &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
 else
 { $in_gz [$hour] = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ; }
 binmode $in_gz [$hour] ;

 $files_in_open++ ;
 $file_in_found [$hour] = $true ;
 $file_in_open [$hour] = $true ;
 $files_in_found ++ ;
 $file = $in_gz [$hour] ;
 $line = <$file> ;
 $line =~ s/^(\w+)2 /$1.y /o ;
 $line =~ s/^(\w+) /$1.z /o ;

 ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 $key [$hour] = "$lang $title" ;
 }

 # header comments for the generated file
 $comment = "# Wikimedia page request counts for $date, each line shows 'subproject title counts'\n" ;
 if ($threshold > 0 )
 { $comment .= "# Count for articles with less than $threshold requests per full day are omitted\n" ; }
 $comment .= "# Subproject is language code, followed by project code\n" ;
 $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ;
 $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
 $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
 print $out_gz2 $comment ;

 if ($files_in_found < 24)
 {
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if (! $file_in_found [$hour])
 { $hours_missing .= "$hour," ; }
 }
 $hours_missing =~ s/,$// ;
 &Log ("Merge files: date = $date, only $files_in_found files found!\n") ;
 }
 else
 { &Log ("Merge files: date = $date\n") ; }

 if ($hours_missing ne '')
 {
 print $out_gz2 "#\n" ;
 print $out_gz2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
 }
 $comment = "#\n" ;
 $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
 $comment .= "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
 $comment .= "# Please reconcile with real namespace name strings later\n" ;
 $comment .= "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
 $comment .= "#\n" ;
 $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
 $comment .= "#\n" ;
 print $out_gz2 $comment ;

 # 24-way merge: all hourly files are sorted by 'lang title', so repeatedly
 # take the lowest key over all open files and advance every file at that key
 $key_low_prev = "" ;
 while ($files_in_open > 0)
 {
 $key_low = "\xFF\xFF"; # sentinel: sorts after any real key
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if (($files_in_open == 24) || ($file_in_found [$hour] && $file_in_open [$hour]))
 {
 if ($key [$hour] lt $key_low)
 { $key_low = $key [$hour] ; }
 }
 }

 # debug tracing for the 'nov' out-of-order anomaly documented below
 if (($key_low =~ /^nov/) || ($key_low_prev =~ /^nov/))
 { &Log ("key_low '$key_low' (key_low_prev '$key_low_prev')\n") ; }

 $counts = "" ;
 $total = 0 ;
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if (! $file_in_found [$hour])
 { $counts .= chr ($hour+ord('A')) . '?' ; } # hour letter + '?' = data missing
 elsif (($files_in_open == 24) || $file_in_open [$hour])
 {
 if ($key [$hour] eq $key_low)
 {
 $counts .= chr ($hour+ord('A')) . $count [$hour] ;
 $total += $count [$hour] ;
 $file = $in_gz [$hour] ;

 # read next record for this hour, skipping records whose language
 # code contains a digit (invalid, see Q&D note below)
 while ($true)
 {
 if ($line = <$file>)
 {
 $line =~ s/^([\w\-]+)2 /$1.y /o ;
 $line =~ s/^([\w\-]+) /$1.z /o ;
 ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 $key [$hour] = "$lang $title" ;

 last if $lang !~ /\d/ ;
 }
 else
 {
 # end of this hourly file: close it and retire its key
 if ($bayes)
 { close $in_gz [$hour] ; }
 else
 { $in_gz [$hour] -> close () ; }
 $files_in_open-- ;
 $file_in_open [$hour] = $false ;
 $key [$hour] = "\xFF\xFF";

 last ;
 }
 }
 }
 }
 }
 if ($lines == 0)
 { &Log ("\nlines: project key\n") ; }

 if (++$lines % 100000 == 0)
 { &Log ("$lines: $key_low\n") ; }

 last if $key_low eq "\xFF\xFF" ; # all files exhausted

 # Q&D fix for unexplained out of order error for what seems to be invalid language
 # remember : no suffix on language code gets replaced by .y or .z to fixed sort order
 # ^nov.mw nov1 1 8765
 # ^nov1.mw nov1 1 931 <--------------
 # ^nov 10_dw_oktobre 1 11421
 ($lang,$title) = split (' ', $key_low) ;
 if ($lang =~ /\d/)
 {
 $invalid_languages {$lang}++ ;
 &Log ("\nSkip invalid language '$lang'\n") ;
 next ;
 }

 # merged stream must be strictly ascending; anything else is fatal
 if ($key_low_prev gt $key_low)
 {
 for ($hour = 0 ; $hour < 24 ; $hour++)
 { &Log ("hour $hour: key ${key[$hour]}\n") ; }

 &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
 }

 if (($key_low_prev eq $key_low) && ($files_in_open > 0))
 {
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if ($file_in_open [$hour])
 { print "hour $hour: file open, key ${key [$hour]}\n" ; }
 else
 { print "hour $hour: file closed, key ${key [$hour]}\n" ; }
 }
 &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
 }

 ($lang,$title) = split (' ', $key_low) ;

 # normalize the most common escapes, then guess the namespace:
 # anything before the first colon (not in position 0) counts as namespace
 $title =~ s/\%20/_/g ;
 $title =~ s/\%3A/:/gi ;
 if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
 { $namespace = 'NamespaceArticles' ; }
 else
 { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }

 # language changed: flush namespace totals for the previous language
 # and decide whether the new language goes to the filter file
 # NOTE(review): $filter_matches is first set here, so records of the very
 # first language in the stream are never filtered -- confirm intended
 if (($lang ne $langprev) && ($langprev ne ""))
 {
 $filter_matches = $lang =~ $reg_exp_filter ;
 if ($filter_matches)
 { print "F $lang\n" ; }

 &WriteTotalsPerNamespace ($out_gz2, $langprev) ;
 undef %totals_per_namespace ;
 }
 $langprev = $lang ;

 # extrapolate the daily total when hours are missing
 if (($files_in_found < 24) && ($files_in_found > 0)) # always > 0 actually
 { $total = sprintf ("%.0f",($total / $files_in_found) * 24) ; }

 $totals_per_namespace {"$lang $namespace"} += $total ;

 if ($filter_matches)
 { print $out_filtered "$key_low $total$counts\n" ; }

 if ($key_low =~ $reg_exp_track) # track count for NonExistingPageForSquidLogMonitoring on en.z
 {
 open $out_track, '>>', $file_track ;
 binmode $out_track ;
 print $out_track "$key_low $total$counts\n" ;
 close $out_track ;
 }

 if ($total >= $threshold)
 { print $out_gz2 "$key_low $total$counts\n" ;
 }

 $key_low_prev = $key_low ;
 }

 # flush namespace totals for the last language
 &WriteTotalsPerNamespace ($out_gz2, $langprev) ;

 &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;

 &Log ("[$lines, $files_in_open] $key_low\n") ;

 if ($bayes)
 {
 close $out_gz2 ;
 close $out_filtered ;

 $time_start_compression = time ; # fix: was never set, elapsed time was bogus
 $cmd = "bzip2 -9 -v $file_out2" ;
 &Log ("\n\n$cmd ->\n") ;
 $result = `$cmd` ;
 &Log ("\n\n") ;

 # deletion of the merged hourly input files is currently disabled
 if ($false)
 {
 foreach $file_in (@files_today)
 {
 print "unlink $dir_in/$file_in\n" ;
 unlink "$dir_in/$file_in" ;
 }
 }

 &Log ("Compression took " . (time-$time_start_compression) . " seconds\n\n") ;
 }
 else
 {
 $out_gz2->close() ;
 close $out_filtered ;
 }

 &Log ("\nRecords skipped for invalid languages:\n") ;
 foreach $key (sort keys %invalid_languages)
 { &Log ("$key: ${invalid_languages {$key}}\n") ; }

 &Log ("\nTotals per namespace written: $lines_namespace_counts\n") ;
 &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
}
 568+
sub WriteTotalsPerNamespace
{
 # Emit the accumulated per-namespace totals (global %totals_per_namespace)
 # to the given output handle. Namespaces with fewer than 5 hits are pooled
 # into a single '-other-' bucket for the given language. Each emitted line
 # bumps the global counter $lines_namespace_counts.
 my ($handle, $language) = @_ ;
 my $small_bucket = 0 ;

 for my $ns_key (sort keys %totals_per_namespace)
 {
 my $hits = $totals_per_namespace{$ns_key} ;
 if ($hits >= 5)
 {
 print $handle "@ $ns_key $hits\n" ;
 $lines_namespace_counts++ ;
 }
 else
 { $small_bucket += $hits ; }
 }

 if ($small_bucket > 0)
 {
 print $handle "@ $language -other- $small_bucket\n" ;
 $lines_namespace_counts++ ;
 }
}
 595+
sub Log
{
 # Write a message to both the console and the LOG file.
 # Fix: $msg is now lexical; the original assigned the package global $msg,
 # silently clobbering the caller's value.
 my $msg = shift ;
 print $msg ;
 print LOG $msg ;
}
 602+
sub Abort
{
 # Announce the fatal error on the console and in the log file, then quit.
 my $reason = shift ;
 my $text = "Abort script\nError: $reason\n" ;
 print $text ;
 print LOG $text ;
 exit ;
}
 610+
 611+#=============================================================================================================
 612+
 613+#sub Compact
 614+#{
 615+# my $day = shift ;
 616+# &Log ("Compact files for $day\n") ;
 617+
 618+# $file_in = "pagecounts-$day.out" ;
 619+# $file_out1 = "pagecounts-${day}_all.gz" ;
 620+# $file_out2 = "pagecounts-${day}_10plus.gz" ;
 621+# open IN, "<", $file_in ;
 622+# binmode $file_in ;
 623+
 624+# my $out_gz1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 625+# my $out_gz2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 626+
 627+# open OUT, ">", $file_out ;
 628+# binmode $file_out ;
 629+
 630+# $lang_prev = "" ;
 631+# while ($line = <IN>)
 632+# {
 633+# chomp ($line) ;
 634+# ($lang, $title, $counts) = split (' ', $line) ;
 635+# $title2 = $title ;
 636+# $title =~ s/\%20/_/g ;
 637+# $title =~ s/\%3A/:/g ;
 638+# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 639+# # if ($title =~ /[\x00-\x1F]/)
 640+# # { &Log ("> '$title2'\n") ; }
 641+# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
 642+# print $out_gz1 "$lang $title $counts\n" ;
 643+# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
 644+# if ($counts2 >= $threshold)
 645+# { print $out_gz2 "$lang $title $counts\n" ; }
 646+# $lang_prev = $lang ;
 647+# }
 648+#
 649+# close IN ;
 650+# $out_gz1->close() ;
 651+# $out_gz2->close() ;
 652+#}
 653+
 654+
 655+#sub GetViewDistribution
 656+#{
 657+# open OUT, ">", "Views.csv" ;
 658+# foreach $file_in (@files)
 659+# {
 660+# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 661+# $hour = chr(ord('A')+$hour) ;
 662+# &Log ("Process $hour $file_in\n") ;
 663+
 664+# $in_gz1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
 665+# while ($line = <$in_gz1>)
 666+# {
 667+# ($lang,$title,$count,$dummy) = split (' ', $line) ;
 668+# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
 669+# {
 670+# $tot {$hour} += $count ;
 671+# if ($count < 3)
 672+# { @counts {$hour . substr ($title,0,1)}++ ; }
 673+# }
 674+# }
 675+# $in_gz1->close () ;
 676+# }
 677+#
 678+# print OUT "," ;
 679+# foreach $hour ('A'..'X')
 680+# { print OUT $hour . ", " ; }
 681+# print OUT "\n" ;
 682+#
 683+# print OUT "," ;
 684+# foreach $hour ('A'..'X')
 685+# { print OUT $tot {$hour} . ", " ; }
 686+# print OUT "\n" ;
 687+#
 688+# for ($c=0; $c < 256; $c++)
 689+# {
 690+# # do not print chars " and , as such: confuses csv format
 691+# if ($c < 33)
 692+# { print OUT "chr($c), " ; }
 693+# elsif (chr($c) eq '"')
 694+# { print OUT "dquote, " ; }
 695+# elsif (chr($c) eq ',')
 696+# { print OUT "comma, " ; }
 697+# else
 698+# { print OUT chr($c) . ", " ; }
 699+#
 700+# foreach $hour ('A'..'X')
 701+# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
 702+#
 703+# if ($c < 255)
 704+# { print OUT "\n" ; }
 705+# }
 706+# close OUT ;
 707+#}
 708+
 709+
 710+#sub RecompactVisitorStats
 711+#{
 712+# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
 713+# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 714+# local (*DIR);
 715+# opendir (DIR, ".");
 716+# @files = () ;
 717+# while ($file_in = readdir (DIR))
 718+# {
 719+# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
 720+#
 721+# push @files, $file_in ;
 722+# }
 723+
 724+# $filecnt = $#files+1 ;
 725+# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
 726+
 727+# foreach $file (@files)
 728+# { &RecompactVisitorStats2 ($file) ; }
 729+# closedir (DIR, ".");
 730+#}
 731+
 732+#sub RecompactVisitorStats2
 733+#{
 734+## http://www.7-zip.org/7z.html
 735+# my $file = shift ;
 736+# my $time_start = time ;
 737+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 738+## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 739+# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
 740+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 741+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 742+
 743+# &Log ("Process $file_in\n") ;
 744+
 745+# $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 746+# binmode $in_gz ;
 747+# open OUT, ">", $file_out ;
 748+# binmode OUT ;
 749+
 750+# my ($title, $title2) ;
 751+# while ($line = <$in_gz>)
 752+# {
 753+# chomp ($line) ;
 754+# ($lang,$title,$counts) = split (" ", $line) ;
 755+
 756+# if ($lang ne $lang_prev) { print "$lang " ; }
 757+# $lang_prev = $lang ;
 758+
 759+# # test pagecounts-20080701_fd.gz
 760+# # all records 424 Mib compressed (1984 uncompressed)
 761+# # count > 1 212 Mib compressed ( 733 uncompressed)
 762+# # count > 2 169 Mib compressed ( 551 uncompressed)
 763+# next if $counts <= 1 ;
 764+
 765+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 766+# $title =~ s/\s/_/g;
 767+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 768+# $lang =~ s/\.y/2/ ;
 769+
 770+# print OUT "$lang $title $counts\n" ;
 771+# }
 772+
 773+# print "Close files\n" ;
 774+# $in_gz -> close () ;
 775+# close (OUT) ;
 776+
 777+# &Log ("Compress $file_out\n") ;
 778+
 779+# unlink $file_7z ;
 780+# $result = `$path_7z a $file_7z $file_out` ;
 781+# &Log ("Compressed\n") ;
 782+# &Log ("Result " . ($result+0) . " \n") ;
 783+# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
 784+# { unlink $file_out ; }
 785+
 786+# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 787+## 0 No error
 788+## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
 789+## 2 Fatal error
 790+## 7 Command line error
 791+## 8 Not enough memory for operation
 792+## 255 User stopped the process
 793+#}
 794+
 795+
 796+#sub RecompactVisitorStats3
 797+#{
 798+## http://www.7-zip.org/7z.html
 799+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 800+# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 801+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 802+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 803+## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
 804+
 805+# $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 806+# binmode $in_gz ;
 807+## $out_gz = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 808+## binmode $out_gz ;
 809+# open OUT, ">", $file_out ;
 810+# binmode OUT ;
 811+## open LOG, ">", $file_log ;
 812+## binmode LOG ;
 813+
 814+# my ($title, $title2) ;
 815+# while ($line = <$in_gz>)
 816+# {
 817+# chomp ($line) ;
 818+# ($lang,$title,$counts) = split (" ", $line) ;
 819+
 820+# if ($lang ne $lang_prev) { print "$lang\n" ; }
 821+## last if $lang gt "fs" ;
 822+# $lang_prev = $lang ;
 823+
 824+# # test pagecounts-20080701_fd.gz
 825+# # all records 424 Mib compressed (1984 uncompressed)
 826+# # count > 1 212 Mib compressed ( 733 uncompressed)
 827+# # count > 2 169 Mib compressed ( 551 uncompressed)
 828+# next if $counts <= 1 ;
 829+
 830+## next if $lang !~ /^(?:ar|fr)/ ;
 831+
 832+#if ($false)
 833+#{
 834+# $title1b = $title ;
 835+# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
 836+# $title1b =~ s/\%28/(/g ;
 837+# $title1b =~ s/\%29/)/g ;
 838+# $title1b =~ s/\%3A/:/g ;
 839+# $title1b =~ s/\%2F/\//g ;
 840+# $title1b =~ s/\%5C/\\/g ;
 841+# $title1b =~ s/\%2A/*/g ;
 842+# $title1b =~ s/\%21/!/g ;
 843+# $title1b =~ s/\%5F/_/g ;
 844+# $title1b =~ s/\%2C/,/g ;
 845+# $title1b =~ s/\%2E/./g ;
 846+# $title1b =~ s/\%2D/-/g ;
 847+# $title1b =~ s/\%25/%/g ;
 848+# $title1b =~ s/\%7E/~/g ;
 849+# $title1b =~ s/\%27/'/g ;
 850+# $title1b =~ s/\%3D/=/g ;
 851+# $title1b =~ s/\%26/&/g ;
 852+# $title1b =~ s/\%3B/;/g ;
 853+# $title1b =~ s/\%3F/?/g ;
 854+# $title1b =~ s/\%2B/+/g ;
 855+# $title2 = $title1b ;
 856+# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
 857+
 858+# if ($title1b ne $title2) # if changed anything at all
 859+# {
 860+# $title3 = uri_escape ($title2) ;
 861+# $title3 =~ s/\%28/(/g ;
 862+# $title3 =~ s/\%29/)/g ;
 863+# $title3 =~ s/\%3A/:/g ;
 864+# $title3 =~ s/\%2F/\//g ;
 865+# $title3 =~ s/\%5C/\\/g ;
 866+# $title3 =~ s/\%2A/\*/g ;
 867+# $title3 =~ s/\%21/\!/g ;
 868+# $title3 =~ s/\%5F/\_/g ;
 869+# $title3 =~ s/\%2C/,/g ;
 870+# $title3 =~ s/\%2E/./g ;
 871+# $title3 =~ s/\%2D/-/g ;
 872+# $title3 =~ s/\%25/%/g ;
 873+# $title3 =~ s/\%7E/~/g ;
 874+# $title3 =~ s/\%27/'/g ;
 875+# $title3 =~ s/\%3D/=/g ;
 876+# $title3 =~ s/\%26/&/g ;
 877+# $title3 =~ s/\%3B/;/g ;
 878+# $title3 =~ s/\%3F/?/g ;
 879+# $title3 =~ s/\%2B/+/g ;
 880+
 881+# if ($title1b eq $title3) # process reversible ?
 882+# {
 883+# $y++ ;
 884+# $title2 =~ s/\s/_/g;
 885+# $title = $title2 ;
 886+# }
 887+# else
 888+# {
 889+# $n++ ;
 890+# print "Y $y N $n\n$title\n$title3\n\n" ;
 891+# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
 892+# }
 893+# }
 894+#}
 895+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 896+# $title =~ s/\s/_/g;
 897+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 898+# $lang =~ s/\.y/2/ ;
 899+
 900+# # print $out_gz "$lang $title $counts\n" ;
 901+# print OUT "$lang $title $counts\n" ;
 902+# }
 903+
 904+# print "Close files\n" ;
 905+# $in_gz -> close () ;
 906+## $out_gz -> close () ;
 907+# close (OUT) ;
 908+# $result = `$path_7z a $file_out $file_txt` ;
 909+# print $result ;
 910+#}
 911+
 912+
 913+
 914+# test (partial) reversibility of process
 915+#sub UncompactVisitorStats
 916+#{
 917+# my $file_in = "out/2009-03/pagecounts-20090301_fdt1" ;
 918+# my $dir_out = "out" ;
 919+# # $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 920+# open $in_gz, '<', $file_in ;
 921+# binmode $in_gz ;
 922+
 923+# for ($h=0 ; $h<=23 ; $h++)
 924+# {
 925+# $time = sprintf ("%02d",$h) . "0000" ;
 926+## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
 927+# $file_out = "$dir_out/pagecounts-20090301-$time" ;
 928+# open $out_gz [$h], '>', $file_out ;
 929+## $out_gz [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
 930+# binmode $out_gz [$h] ;
 931+# }
 932+
 933+# while ($line = <$in_gz>)
 934+# {
 935+# next if $line =~ /^#/ ;
 936+# next if $line =~ /^@/ ;
 937+# chomp ($line) ;
 938+## print "$line\n" ;
 939+# if ($lines++ > 10000) { exit ; }
 940+# ($lang,$title,$counts) = split (" ", $line) ;
 941+# $lang =~ s/\.z// ;
 942+# $lang =~ s/\.y/2/ ;
 943+# $counts =~ s/^\d+// ; # remove (redundant) preceding total
 944+# while ($counts ne "")
 945+# {
 946+# $letter = substr ($counts,0,1) ;
 947+# $counts = substr ($counts,1) ;
 948+# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 949+# $counts =~ s/^\d+(.*)$/$1/ ;
 950+# $h = ord ($letter) - ord ('A') ;
 951+# $file = $out_gz [$h] ;
 952+# $writes {$h} ++ ;
 953+# print $file "$lang $title $count\n" ;
 954+# }
 955+
 956+# }
 957+
 958+# for ($h=0 ; $h<=23 ; $h++)
 959+# {
 960+## $out_gz [$h] -> close () ;
 961+# close $out_gz [$h] ;
 962+# }
 963+#}
 964+
 965+
Index: trunk/wikistats/dammit.lt/dammit_scan.sh
@@ -0,0 +1,10 @@
 2+i='/a/dammit.lt/pagecounts' # input dir
 3+o='/home/ezachte/wikistats/scans' # output dir
 4+f=20090424 # from date
 5+t=20091110 # till date
 6+#="swine.*flu\nswine.*influenza\nflu.*outbreak\ninfluenza.*outbreak\ngripe.*porcina\npandem\n"
 7+p=".*influensa\n.*H1N1.*\npandemi\n"
 8+#p="#$o/pattern_influenza_en.txt" # file name
 9+#p="#$o/pattern_pandemic_shortlist.txt" # file name
 10+#p=html
 11+perl /a/dammit.lt/DammitScanCompactedFiles.pl -i $i -o $o -f $f -t $t -p $p
Property changes on: trunk/wikistats/dammit.lt/dammit_scan.sh
___________________________________________________________________
Added: svn:eol-style
112 + native
Index: trunk/wikistats/dammit.lt/dammit_filter.sh
@@ -0,0 +1,5 @@
 2+#i='/a/dammit.lt/pagecounts' # input dir
 3+#o='/home/ezachte/wikistats/scans' # output dir
 4+#f=20090424 # from date
 5+#t=20091110 # till date
 6+perl /a/dammit.lt/DammitFilterDailyPageCountsPerLanguage.pl
Property changes on: trunk/wikistats/dammit.lt/dammit_filter.sh
___________________________________________________________________
Added: svn:eol-style
17 + native
Index: trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl
@@ -0,0 +1,1568 @@
 2+#!/usr/local/bin/perl
 3+
 4+# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
 5+
 6+# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
 7+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
 8+# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
 9+# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
 10+
 11+# Ideas:
 12+# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
 13+# 2 frequency distribution hits per file per first letter -> manifest crawler
 14+# assuming crawler collects articles in alphabetical order
 15+# 3 first letter uppercase -> sort (in sections per first two chars ?)
 16+
 17+ use lib "/home/ezachte/lib" ;
 18+ use EzLib ;
 19+
 20+ $trace_on_exit = $true ;
 21+ ez_lib_version (13) ;
 22+
 23+ # set defaults mainly for tests on local machine
 24+# default_argv "-i C:/bayes_backup/a/dammit.lt/pagecounts|-t C:/bayes_backup/a/dammit.lt|-f C:/bayes_backup/a/dammit.lt|-o C:/bayes_backup/a/dammit.lt|-d 20101215" ;
 25+ default_argv "-m|-i C:/bayes_backup/a/dammit.lt/pagecounts|-o C:/bayes_backup/a/dammit.lt|-d 200812" ;
 26+
 27+ use CGI qw(:all);
 28+ use URI::Escape;
 29+ use Cwd ;
 30+ $bayes = -d "/a/dammit.lt" ;
 31+# $path_7za = "/usr/lib/p7zip/7za" ;
 32+
 33+ use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error) ;
 34+
 35+ if (! $bayes)
 36+ {
 37+ print "Test on Windows\n" ;
 38+ use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
 39+ use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
 40+ }
 41+
 42+ $| = 1; # flush screen output
 43+
 44+ $true = 1 ;
 45+ $false = 0 ;
 46+ $threshold = 0 ;
 47+ undef %totals_per_namespace ;
 48+
 49+ ($sec,$min,$hour,$mday,$month,$year,$wday,$yday,$isdst) = gmtime(time);
 50+ $year = $year + 1900;
 51+ $month++ ;
 52+ $month_run = sprintf ("%4d-%2d", $year, $month) ;
 53+ print "Current month: $month_run\n" ;
 54+
 55+ $filter = "^(?:outreach|quality|strategy|usability)\.m\$" ;
 56+ print "Filter: $filter\n" ;
 57+ $reg_exp_filter = qr"$filter" ;
 58+
 59+ $track = "NonExistingPageForSquidLogMonitoring" ;
 60+ print "Track: $track\n" ;
 61+ $reg_exp_track = qr"$track" ;
 62+
 63+# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"
 64+
 65+ my $options ;
 66+ getopt ("iodft", \%options) ;
 67+
 68+ $compactmonth = $options {"m"} ;
 69+ $compactday = ! $compactmonth ;
 70+
 71+ if (! defined ($options {"i"})) { &Abort ("Specify input dir: -i dirname") } ;
 72+ if ($compactday)
 73+ {
 74+ if (! defined ($options {"o"})) { &Abort ("Specify output dir: -o dirname") } ;
 75+ if (! defined ($options {"f"})) { &Abort ("Specify filter dir: -f dirname") } ;
 76+ if (! defined ($options {"t"})) { &Abort ("Specify tracking dir: -t dirname") } ;
 77+ if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") } ;
 78+ }
 79+ if ($compactmonth)
 80+ {
 81+ if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymm, yyyy* or *") } ;
 82+ }
 83+
 84+
 85+ $dir_in = $options {"i"} ;
 86+ $dir_out = $options {"o"} ;
 87+ $dir_filtered = $options {"f"} ;
 88+ $dir_track = $options {"t"} ;
 89+ $daterange = $options {"d"} ;
 90+
 91+ $work = cwd() ;
 92+ print "Work dir $work\n" ;
 93+
 94+ if ($dir_in !~ /[\/\\]/)
 95+ { $dir_in = "$work/$dir_in" ; }
 96+
 97+ if ($dir_out eq '')
 98+ { $dir_out = "$work" ; }
 99+ elsif ($dir_out !~ /[\/\\]/)
 100+ { $dir_out = "$work/$dir_out" ; }
 101+
 102+ if ($compactmonth && ($dir_out eq ''))
 103+ { $dir_out = $dir_in ; }
 104+
 105+ if ($dir_filtered !~ /[\/\\]/)
 106+ { $dir_filtered = "$work/$dir_filtered" ; }
 107+
 108+ if ($dir_track !~ /[\/\\]/)
 109+ { $dir_track = "$work/$dir_track" ; }
 110+
 111+ if (! -d $dir_in)
 112+ { &Abort ("Input dir not found: $dir_in") } ;
 113+
 114+ if (! -d $dir_out)
 115+ {
 116+ print "Create output dir $dir_out\n" ;
 117+ mkdir $dir_out ;
 118+ if (! -d $dir_out)
 119+ { &Abort ("Output dir could not be created.") } ;
 120+ }
 121+
 122+ open LOG, ">>", "$work/WikiStatsCompactDammitFiles.log" ;
 123+
 124+ if ($compactday)
 125+ {
 126+ if (($daterange !~ /^\d{8}$/) && ($daterange !~ /^\d{6}\*$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
 127+ { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") ; }
 128+
 129+ &Log ("\nCompress pagecount files\nin: $dir_in\nout: $dir_out\nflt: $dir_filtered\ntrack: $dir_track\ndate range: $daterange\n") ;
 130+ $daterange =~ s/\*/\\d+/ ;
 131+
 132+ &CompactVisitorStatsOneDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
 133+ # &UncompactVisitorStats ; # test only, to see if process is revertible
 134+ }
 135+
 136+ if ($compactmonth)
 137+ {
 138+ if (($daterange !~ /^\d{6}$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
 139+ { &Abort ("Specify date range: as yyyymm, yyyy* or *") ; }
 140+
 141+ ($daterange2 = $daterange) =~ s/\*/\\d+/ ;
 142+ &Log ("\nCompress pagecount files\nin: $dir_in\nout: $dir_out\ndate range: $daterange->$daterange2\n") ;
 143+
 144+ &CompactVisitorStatsOneMonth ($dir_in, $dir_out, $daterange2) ;
 145+ }
 146+
 147+ &Log ("\nReady\n") ;
 148+ close LOG ;
 149+ exit ;
 150+
 151+sub CompactVisitorStatsOneDay
 152+{
 153+ my $dir_in = shift ;
 154+ my $dir_out = shift ;
 155+ my $dir_filtered = shift ;
 156+ my $dir_track = shift ;
 157+ my $daterange = shift ;
 158+
 159+ chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 160+
 161+ local (*DIR);
 162+ opendir (DIR, ".");
 163+ @files = () ;
 164+
 165+ while ($file_in = readdir (DIR))
 166+ {
 167+ next if $file_in !~ /^pagecounts-$daterange-\d{6,6}.gz$/ ;
 168+
 169+ push @files, $file_in ;
 170+ }
 171+
 172+ closedir (DIR);
 173+
 174+ @files = sort @files ;
 175+
 176+# if (($daterange =~ /^\d{8}$/) and ($#files < 23))
 177+# { &Abort ("Less than 24 files found for date $daterange\n" . @files) ; }
 178+
 179+ foreach $file (@files)
 180+ {
 181+ $date = substr ($file,11,8) ;
 182+ $process_dates {$date}++ ;
 183+ }
 184+
 185+ &Log ("\n\n") ;
 186+
 187+ foreach $date (sort keys %process_dates)
 188+ { &MergeFilesFullDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $date) ; }
 189+}
 190+
 191+sub MergeFilesFullDay
 192+{
 193+ my $dir_in = shift ;
 194+ my $dir_out = shift ;
 195+ my $dir_filtered = shift ;
 196+ my $dir_track = shift ;
 197+ my $date = shift ;
 198+
 199+ my $year = substr ($date,0,4) ;
 200+ my $month = substr ($date,4,2) ;
 201+ my $day = substr ($date,6,2) ;
 202+
 203+ my ($file_out1, $file_out2, $file_out3, $out_day, $hours_missing) ;
 204+
 205+ $dir_out = "$dir_out/${year}-${month}" ;
 206+ if (! -d $dir_out)
 207+ {
 208+ mkdir $dir_out ;
 209+ if (! -d $dir_out)
 210+ { &Abort ("Output dir could not be created: $dir_out") } ;
 211+ }
 212+
 213+ my @files_today = () ;
 214+ foreach $file (@files)
 215+ {
 216+ next if $file !~ /^pagecounts-$date-\d{6,6}.gz$/ ;
 217+
 218+ push @files_today, $file ;
 219+ }
 220+
 221+ # very few times (nearly) duplicate files are found for same hour
 222+ # keep the largest and presumably most complete one
 223+ for ($i = 0 ; $i < $#files_today ; $i++)
 224+ {
 225+ for ($j = $i+1 ; $j <= $#files_today ; $j++)
 226+ {
 227+ if (substr ($files_today [$i],0,25) eq substr ($files_today [$j],0,25))
 228+ {
 229+ $size_i = -s $files_today [$i] ;
 230+ $size_j = -s $files_today [$j] ;
 231+ print "${files_today [$i]}: $size_i\n" ;
 232+ print "${files_today [$j]}: $size_j\n" ;
 233+ if ($size_i > $size_j)
 234+ {
 235+ print "Keep ${files_today [$i]}\n\n" ;
 236+ $files_today [$j]= "" ;
 237+ }
 238+ else
 239+ {
 240+ print "Keep ${files_today [$j]}\n\n" ;
 241+ $files_today [$i]= "" ;
 242+ }
 243+ }
 244+ }
 245+ }
 246+
 247+ $time_start = time ;
 248+ $lines = 0 ;
 249+
 250+ undef @in_hour ;
 251+
 252+ # $file_out = "pagecounts-$year$month$day_full_day" ;
 253+ # open OUT, ">", $file_out ;
 254+ # binmode $file_out ;
 255+
 256+# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 257+ if ($bayes)
 258+ {
 259+ # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
 260+ $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
 261+ # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
 262+ if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
 263+ {
 264+ &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
 265+ return ;
 266+ }
 267+ if ($#files_today < 23)
 268+ {
 269+ &Log ("\nLess than 24 files found for target file '$file_out2.7z'. Skip this date.\n") ;
 270+ return ;
 271+ }
 272+
 273+ open $out_day2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
 274+ # open $out_day3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
 275+ }
 276+ else
 277+ {
 278+ # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
 279+ $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, hourly data, count above threshold
 280+ $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 281+ # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
 282+ # $out_day3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 283+ }
 284+
 285+# binmode $out_day1 ;
 286+ binmode $out_day2 ;
 287+# binmode $out_day3 ;
 288+
 289+ # print "File_out1 $file_out1\n" ;
 290+ print "File_out2 $file_out2\n" ;
 291+ # print "File_out3 $file_out3\n" ;
 292+
 293+ $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
 294+ &Log ("\nFilter file: $file_filtered\n") ;
 295+ open $out_filtered, '>', $file_filtered ;
 296+ binmode $out_filtered ;
 297+
 298+ $file_track = "$dir_track/_PageCountsForSquidLogTracking.txt" ;
 299+ &Log ("Tracking file: $file_track\n\n") ;
 300+
 301+ for ($hour = 0 ; $hour < 24 ; $hour++)
 302+ { $file_in_found [$hour] = $false ; }
 303+
 304+ $files_in_open = 0 ;
 305+ $files_in_found = 0 ;
 306+ $langprev = "" ;
 307+ foreach $file_in (@files_today)
 308+ {
 309+ next if $file_in eq "" ;
 310+
 311+ ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 312+ $hour = (0+$hour) ;
 313+ # print " file found '$file_in'\n" ;
 314+
 315+ if ($bayes)
 316+ { open $in_hour [$hour], "-|", "gzip -dc \"$file_in\"" || &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
 317+ else
 318+ { $in_hour [$hour] = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ; }
 319+ binmode $in_hour [$hour] ;
 320+
 321+ $files_in_open++ ;
 322+ $file_in_found [$hour] = $true ;
 323+ $file_in_open [$hour] = $true ;
 324+ $files_in_found ++ ;
 325+ $file = $in_hour [$hour] ;
 326+ $line = <$file> ;
 327+ $line =~ s/^(\w+)2 /$1.y /o ;# project wikipedia comes without suffix -> out of sort order, make it fit by appending suffix
 328+ $line =~ s/^(\w+) /$1.z /o ;
 329+
 330+ ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 331+ $key [$hour] = "$lang $title" ;
 332+ }
 333+
 334+ $comment = "# Wikimedia page request counts for $date, each line shows 'subproject title counts'\n" ;
 335+ if ($threshold > 0 )
 336+ { $comment .= "# Count for articles with less than $threshold requests per full day are omitted\n" ; }
 337+ $comment .= "# Subproject is language code, followed by project code\n" ;
 338+ $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ;
 339+ $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
 340+ $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n\n" ;
 341+ print $out_day2 $comment ;
 342+# print $out_day3 $comment ;
 343+
 344+ if ($files_in_found < 24)
 345+ {
 346+ for ($hour = 0 ; $hour < 24 ; $hour++)
 347+ {
 348+ if (! $file_in_found [$hour])
 349+ { $hours_missing .= "$hour," ; }
 350+ }
 351+ $hours_missing =~ s/,$// ;
 352+ &Log ("Merge files: date = $date, only $files_in_found files found!\n\n") ;
 353+ }
 354+ else
 355+ { &Log ("Merge files: date = $date\n") ; }
 356+
 357+ if ($hours_missing ne '')
 358+ {
 359+ print $out_day2 "#\n" ;
 360+ print $out_day2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
 361+ # print $out_day3 "#\n" ;
 362+ # print $out_day3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
 363+ }
 364+ $comment = "#\n" ;
 365+ $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
 366+ $comment .= "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
 367+ $comment .= "# Please reconcile with real namespace name strings later\n" ;
 368+ $comment .= "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
 369+ $comment .= "#\n" ;
 370+ $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
 371+ $comment .= "#\n" ;
 372+ print $out_day2 $comment ;
 373+# print $out_day3 $comment ;
 374+
 375+ $key_low_prev = "" ;
 376+ while ($files_in_open > 0)
 377+ {
 378+ $key_low = "\xFF\xFF";
 379+ for ($hour = 0 ; $hour < 24 ; $hour++)
 380+ {
 381+ if (($files_in_open == 24) || ($file_in_found [$hour] && $file_in_open [$hour]))
 382+ {
 383+ if ($key [$hour] lt $key_low)
 384+ { $key_low = $key [$hour] ; }
 385+ }
 386+ }
 387+
 388+ if (($key_low =~ /^nov/) || ($key_low_prev =~ /^nov/))
 389+ { &Log ("key_low '$key_low' (key_low_prev '$key_low_prev')\n") ; }
 390+
 391+ $counts = "" ;
 392+ $total = 0 ;
 393+ for ($hour = 0 ; $hour < 24 ; $hour++)
 394+ {
 395+ if (! $file_in_found [$hour])
 396+ { $counts .= chr ($hour+ord('A')) . '?' ; }
 397+ elsif (($files_in_open == 24) || $file_in_open [$hour])
 398+ {
 399+ if ($key [$hour] eq $key_low)
 400+ {
 401+ $counts .= chr ($hour+ord('A')) . $count [$hour] ;
 402+ $total += $count [$hour] ;
 403+ $file = $in_hour [$hour] ;
 404+ # $line = <$file> ;
 405+
 406+ while ($true)
 407+ {
 408+ if ($line = <$file>) # =~ /^a/)
 409+ {
 410+ $line =~ s/^([\w\-]+)2 /$1.y /o ; # project wikipedia comes without suffix -> out of sort order, make it fit by appending suffix
 411+ $line =~ s/^([\w\-]+) /$1.z /o ;
 412+ ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 413+ $key [$hour] = "$lang $title" ;
 414+
 415+ last if $lang !~ /\d/ ;
 416+ }
 417+ else
 418+ {
 419+ if ($bayes)
 420+ { close $in_hour [$hour] ; }
 421+ else
 422+ { $in_hour [$hour] -> close () ; }
 423+ $files_in_open-- ;
 424+ $file_in_open [$hour] = $false ;
 425+ $key [$hour] = "\xFF\xFF";
 426+
 427+ last ;
 428+ }
 429+ }
 430+ }
 431+ }
 432+ }
 433+ if ($lines == 0)
 434+ { &Log ("\nlines: project key\n") ; }
 435+
 436+ if (++$lines % 100000 == 0)
 437+ { &Log ("$lines: $key_low\n") ; }
 438+
 439+ # last if $lines > 10000 ; # test
 440+
 441+ last if $key_low eq "\xFF\xFF" ;
 442+
 443+ # Q&D fix for unexplained out of order error for what seems to be invalid language
 444+ # remember : no suffix on language code gets replaced by .y or .z to fixed sort order
 445+ # ^nov.mw nov1 1 8765
 446+ # ^nov1.mw nov1 1 931 <--------------
 447+ # ^nov 10_dw_oktobre 1 11421
 448+ ($lang,$title) = split (' ', $key_low) ;
 449+ if ($lang =~ /\d/)
 450+ {
 451+ $invalid_languages {$lang}++ ;
 452+ &Log ("\nSkip invalid language '$lang'\n") ;
 453+ next ;
 454+ }
 455+
 456+
 457+ if ($key_low_prev gt $key_low)
 458+ {
 459+ for ($hour = 0 ; $hour < 24 ; $hour++)
 460+ { &Log ("hour $hour: key ${key[$hour]}\n") ; }
 461+
 462+ &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
 463+ }
 464+
 465+ if (($key_low_prev eq $key_low) && ($files_in_open > 0))
 466+ {
 467+ for ($hour = 0 ; $hour < 24 ; $hour++)
 468+ {
 469+ if ($file_in_open [$hour])
 470+ { print "hour $hour: file open, key ${key [$hour]}\n" ; }
 471+ else
 472+ { print "hour $hour: file closed, key ${key [$hour]}\n" ; }
 473+ }
 474+ &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
 475+ }
 476+
 477+ # print OUT "$key_low $total$counts\n" ;
 478+# print $out_day1 "$key_low $total$counts\n" ;
 479+
 480+ ($lang,$title) = split (' ', $key_low) ;
 481+
 482+ $title =~ s/\%20/_/g ;
 483+ $title =~ s/\%3A/:/gi ;
 484+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 485+ if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
 486+ { $namespace = 'NamespaceArticles' ; }
 487+ else
 488+ { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
 489+ # print "KEY $key_low -> $namespace\n" ;
 490+
 491+ if (($lang ne $langprev) && ($langprev ne ""))
 492+ {
 493+ $filter_matches = $lang =~ $reg_exp_filter ;
 494+ if ($filter_matches)
 495+ { print "F $lang\n" ; }
 496+ # else
 497+ # { print "- $lang\n" ; }
 498+
 499+ &WriteTotalsPerNamespace ($out_day2, $langprev) ;
 500+ # &WriteTotalsPerNamespace ($out_day3, $langprev) ;
 501+ undef %totals_per_namespace ;
 502+ }
 503+ $langprev = $lang ;
 504+
 505+ if (($files_in_found < 24) && ($files_in_found > 0)) # always > 0 actually
 506+ { $total = sprintf ("%.0f",($total / $files_in_found) * 24) ; }
 507+
 508+ $totals_per_namespace {"$lang $namespace"} += $total ;
 509+
 510+ if ($filter_matches)
 511+ { print $out_filtered "$key_low $total$counts\n" ; }
 512+
 513+ if ($key_low =~ $reg_exp_track) # track count for NonExistingPageForSquidLogMonitoring on en.z
 514+ {
 515+ open $out_track, '>>', $file_track ;
 516+ binmode $out_track ;
 517+ print $out_track "$key_low $total$counts\n" ;
 518+ close $out_track ;
 519+ }
 520+
 521+ if ($total >= $threshold)
 522+ { print $out_day2 "$key_low $total$counts\n" ;
 523+ # print $out_day3 "$key_low $total\n" ;
 524+ }
 525+
 526+ $key_low_prev = $key_low ;
 527+ # print "OUT $key_low $counts\n" ;
 528+ }
 529+
 530+ &WriteTotalsPerNamespace ($out_day2, $langprev) ;
 531+# &WriteTotalsPerNamespace ($out_day3, $langprev) ;
 532+
 533+ &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
 534+
 535+ &Log ("[$lines, $files_in_open] $key_low\n") ;
 536+# close OUT ;
 537+
 538+ if ($bayes)
 539+ {
 540+ # close $out_day1 ;
 541+ close $out_day2 ;
 542+ # close $out_day3 ;
 543+ close $out_filtered ;
 544+
 545+# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
 546+# $result = `$cmd` ;
 547+# if ($result =~ /Everything is Ok/s)
 548+# {
 549+# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
 550+# unlink $file_out2 ;
 551+# foreach $file_in (@files_today)
 552+# {
 553+# print "unlink $dir_in/$file_in\n" ;
 554+# unlink "$dir_in/$file_in" ;
 555+# }
 556+# }
 557+# else
 558+# {
 559+# print "Delete $file_out2.7z\n" ;
 560+# unlink "$file_out2.7z" ;
 561+# }
 562+
 563+
 564+ $time_start_compression = time ;
 565+ $cmd = "bzip2 -9 -v $file_out2" ;
 566+ &Log ("\n\n$cmd ->\n") ;
 567+ $result = `$cmd` ;
 568+ &Log ("\n\nCompression took " . (time-$time_start_compression) . " seconds\n$result\n") ;
 569+
 570+ if ($true)
 571+ {
 572+ foreach $file_in (@files_today)
 573+ {
 574+ print "unlink $dir_in/$file_in\n" ;
 575+ unlink "$dir_in/$file_in" ;
 576+ }
 577+ }
 578+ else
 579+ {
 580+ # print "Delete $file_out2.7z\n" ;
 581+ # unlink "$file_out2.7z" ;
 582+ }
 583+ }
 584+ else
 585+ {
 586+ # $out_day1->close() ;
 587+ $out_day2->close() ;
 588+ # $out_day3->close() ;
 589+ close $out_filtered ;
 590+ }
 591+
 592+ &Log ("\nRecords skipped for invalid languages:\n") ;
 593+ foreach $key (sort keys %invalid_languages)
 594+ { &Log ("$key: ${invalid_languages {$key}}\n") ; }
 595+
 596+ &Log ("\nTotals per namespace written: $lines_namespace_counts\n") ;
 597+ &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 598+}
 599+
 600+sub WriteTotalsPerNamespace
 601+{
 602+ my $out_day = shift ;
 603+ my $lang = shift ;
 604+ my $total ;
 605+ my $totals_per_namespace_other ;
 606+
 607+ foreach my $key (sort keys %totals_per_namespace)
 608+ {
 609+ $total = $totals_per_namespace {$key} ;
 610+ if ($total < 5)
 611+ { $totals_per_namespace_other += $total ; }
 612+ else
 613+ {
 614+ # print "@ $key $total\n" ;
 615+ print $out_day "@ $key $total\n" ;
 616+ $lines_namespace_counts ++ ;
 617+ }
 618+ }
 619+ if ($totals_per_namespace_other > 0 )
 620+ {
 621+ # print "@ $lang -other- $totals_per_namespace_other\n" ;
 622+ print $out_day "@ $lang -other- $totals_per_namespace_other\n" ;
 623+ $lines_namespace_counts ++ ;
 624+ }
 625+}
 626+
 627+sub CompactVisitorStatsOneMonth
 628+{
 629+ my $dir_in = shift ;
 630+ my $dir_out = shift ;
 631+ my $daterange = shift ;
 632+
 633+ &Log ("\nCompactVisitorStatsOneMonth\n\n") ;
 634+
 635+ chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 636+
 637+ local (*DIR);
 638+ opendir (DIR, ".");
 639+ @files = () ;
 640+
 641+ while ($dir = readdir (DIR))
 642+ {
 643+ next if ! -d $dir ;
 644+ next if $dir !~ /^\d\d\d\d-\d\d$/ ;
 645+
 646+ push @dirs, $dir ;
 647+ }
 648+
 649+ closedir (DIR);
 650+
 651+ @dirs = sort @dirs ;
 652+
 653+ foreach $dir (@dirs)
 654+ {
 655+ &Log ("\n\n" . '-' x 80 . "\n\nCompactVisitorStatsOneMonth:\nCheck dir $dir_in/$dir\n") ;
 656+
 657+ if (-e "$dir_in/$dir/a")
 658+ {
 659+ &Log ("Already done -> skip\n\n") ;
 660+ next ;
 661+ }
 662+
 663+ ($dir2 = $dir) =~ s/-//g ;
 664+ if ($dir2 !~ /^$daterange/)
 665+ {
 666+ &Log ("Directory out of date range ($daterange) -> skip\n\n") ;
 667+ next ;
 668+ }
 669+
 670+ local (*DIR2);
 671+ opendir (DIR2, "$dir_in/$dir");
 672+
 673+ undef @files ;
 674+ undef %process_dates ;
 675+
 676+ while ($file_in = readdir (DIR2))
 677+ {
 678+ if ($bayes)
 679+ { next if $file_in !~ /^pagecounts-\d{8}_(?:fdt\.7z|h\.bz2)$/ ; }
 680+ else
 681+ { next if $file_in !~ /^pagecounts-\d{8}_fdt$/ ; }
 682+
 683+ &Log ("File found: $file_in\n") ;
 684+
 685+ push @files, $file_in ;
 686+ }
 687+
 688+ closedir (DIR2);
 689+
 690+ @files = sort @files ;
 691+
 692+ foreach $file (@files)
 693+ {
 694+ $date = substr ($file,11,8) ;
 695+ $process_dates {$date}++ ;
 696+ }
 697+
 698+ &Log ("\n\n") ;
 699+
 700+ &MergeFilesFullMonth ($dir_in, $dir_out, $dir, @files) ;
 701+ }
 702+
 703+ exit ;
 704+}
 705+
 706+sub MergeFilesFullMonth
 707+{
 708+ my $dir_in = shift ;
 709+ my $dir_out = shift ;
 710+ my $dir = shift ;
 711+ my @files_this_month = @_ ;
 712+
 713+ my $year = substr ($dir,0,4) ;
 714+ my $month = substr ($dir,5,2) ;
 715+
 716+ my (@file_in_open, @file_in_found, @counts, $days_missing) ;
 717+ my $days_in_month = days_in_month ($year, $month) ;
 718+
 719+ my ($file_out2) ;
 720+
 721+ $lines = 0 ;
 722+
 723+ undef @in_day ;
 724+ my $time_start = time ;
 725+
 726+ if ($dir eq $month_run)
 727+ { $scope = "part" ; }
 728+ else
 729+ { $scope = "all" ; }
 730+
 731+ $file_out = "$dir_out/pagecounts-$year-$month-$scope" ;
 732+
 733+ &Log ("\nMergeFilesFullMonth\nIn: $dir_in/$dir\nOut: $dir_out/$file_out\nDays expected: $days_in_month\n\nProcess...\n") ;
 734+
 735+ if ($bayes)
 736+ {
 737+ if ((-e "$file_out.7z") || (-e "$file_out.bz2") || (-e "$file_out.zip") || (-e "$file_out.gz"))
 738+ {
 739+ &Log ("\nTarget file '$file_out.[7z|bz2|zip|gz]' exists already. Skip this month.\n") ;
 740+ return ;
 741+ }
 742+ }
 743+
 744+
 745+ my $out_month_all = new IO::Compress::Bzip2 "$file_out.bz2" or die "bzip2 failed for $file_out.bz2: $Bzip2Error\n";
 746+ my $out_month_ge5 = new IO::Compress::Bzip2 "${file_out}_ge5.bz2" or die "bzip2 failed for ${file_out}_ge5.bz2: $Bzip2Error\n";
 747+
 748+ $out_month_all->binmode() ;
 749+ $out_month_ge5->binmode() ;
 750+
 751+ for ($day = 0 ; $day < $days_in_month ; $day++)
 752+ { $file_in_found [$day] = $false ; }
 753+
 754+ $files_in_open = 0 ;
 755+ $files_in_found = 0 ;
 756+ $total_hours_missing = 0 ;
 757+ $langprev = "" ;
 758+ $lines_read_this_month = 0 ;
 759+ @hours_missing_per_day = () ;
 760+ $hours_missing_coded = '' ;
 761+ $lines_omitted_daily = 0 ;
 762+
 763+ foreach $file_in (@files_this_month)
 764+ {
 765+ next if $file_in eq "" ;
 766+
 767+ ($day = $file_in) =~ s/^pagecounts-\d{6}(\d+)_(?:fdt|fdt\.7z|h\.bz2)$/$1/ ;
 768+ $day = sprintf ("%2d", $day-1) ;
 769+
 770+ $file_in = "$dir_in/$year-$month/$file_in" ;
 771+ # print "File $file_in -> day $day\n" ;
 772+
 773+ &CheckHoursMissing ($year,$month,$day,$file_in) ;
 774+
 775+ if ($bayes)
 776+ {
 777+ if ($file_in =~ /\.bz2$/)
 778+ { open $in_day [$day], "-|", "bzip2 -dc \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; }
 779+ elsif ($file_in =~ /\.7z$/)
 780+ { open $in_day [$day], "-|", "7z e -so \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; }
 781+ else
 782+ { abort ("MergeFilesFullMonth: unexpected file name $file_in.") ; }
 783+ }
 784+ else
 785+ { open $in_day [$day], '<', $file_in || &Abort ("Open failed for '$file_in'\n") ; }
 786+
 787+ binmode $in_day [$day] ;
 788+
 789+ $files_in_open++ ;
 790+ $file_in_found [$day] = $true ;
 791+ $file_in_open [$day] = $true ;
 792+ $files_in_found ++ ;
 793+
 794+ $file = $in_day [$day] ;
 795+ $line = <$file> ;
 796+ while (($line =~ /^#/) || ($line =~ /^@/))
 797+ { $line = <$file> ; }
 798+
 799+ chomp $line ;
 800+ if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
 801+ {
 802+ ($lang,$title,$counts) = split (' ', $line) ;
 803+ }
 804+ else
 805+ {
 806+ ($lang,$title,$total,$counts) = split (' ', $line) ;
 807+ $counts = "$total$counts" ;
 808+ }
 809+
 810+ $key [$day] = "$lang $title" ;
 811+ $counts [$day] = $counts ;
 812+ # print "DAY " . ($day+1) . " KEY ${key [$day]} COUNTS $counts\n" ;
 813+ }
 814+ print "\n" ;
 815+
 816+ $comment = "# Wikimedia article requests (aka page views) for year $year, month $month\n" ;
 817+ if ($threshold > 0 )
 818+ { $comment .= "# Count for articles with less than $threshold requests per full month are omitted\n" ; }
 819+ $comment .= "#\n" ;
 820+ $comment .= "# Each line contains four fields separated by spaces\n" ;
 821+ $comment .= "# - wiki code (subproject.project, see below)\n" ;
 822+ $comment .= "# - article title (encoding from original hourly files is preserved to maintain proper sort sequence)\n" ;
 823+ $comment .= "# - monthly total (possibly extrapolated from available data when hours/days in input were missing)\n" ;
 824+ $comment .= "# - hourly counts (only for hours where indeed article requests occurred)\n" ;
 825+ $comment .= "#\n" ;
 826+ $comment .= "# Subproject is language code, followed by project code\n" ;
 827+ $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia\n" ;
 828+ $comment .= "# Note: suffix z added by compression script: project wikipedia happens to be sorted last in dammit.lt files, so add this suffix to fix sort order\n" ;
 829+ $comment .= "#\n" ;
 830+ $comment .= "# To keep hourly counts compact and tidy both day and hour are coded as one character each, as follows:\n" ;
 831+ $comment .= "# Hour 0..23 shown as A..X convert to number: ordinal (char) - ordinal ('A')\n" ;
 832+ $comment .= "# Day 1..31 shown as A.._ 27=[ 28=\\ 29=] 30=^ 31=_ convert to number: ordinal (char) - ordinal ('A') + 1\n" ;
 833+ $comment .= "#\n" ;
 834+ $comment .= "# Original data source: Wikimedia full (=unsampled) squid logs\n" ;
 835+ $comment .= "# These data have been aggregated from hourly pagecount files at http://dammit.lt/wikistats, originally produced by Domas Mituzas\n" ;
 836+ $comment .= "# Daily and monthly aggregator script built by Erik Zachte\n" ;
 837+ $comment .= "# Each day hourly files for previous day are downloaded and merged into one file per day\n" ;
 838+ $comment .= "# Each month daily files are merged into one file per month\n" ;
 839+# $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
 840+# $comment .= "# If data are missing for some day (file missing or corrupt) a question mark (?) is shown (and for each missing day the monthly total is incremented with daily average)\n" ;
 841+ $comment .= "#\n" ;
 842+
 843+ $out_month_all->print ($comment) ;
 844+ $comment .= "# This file contains only lines with monthly page request total greater/equal 5\n" ;
 845+ $comment .= "#\n" ;
 846+ $out_month_ge5->print ($comment) ;
 847+
 848+ if ($files_in_found < $days_in_month)
 849+ {
 850+ for ($day = 0 ; $day < $days_in_month ; $day++)
 851+ {
 852+ if (! $file_in_found [$day])
 853+ {
 854+ $days_missing .= ($day+1) . "," ;
 855+ $total_hours_missing += 24 ;
 856+ for (my $h = 0 ; $h <= 23 ; $h++)
 857+ { $hours_missing_coded .= chr ($day + ord ('A')) . chr ($h + ord ('A')) .',' ; }
 858+ }
 859+ }
 860+
 861+ $days_missing =~ s/,$// ;
 862+ &Log ("Merge files: year $year, month $month, only $files_in_found files found!\n\n") ;
 863+
 864+ if ($days_missing =~ /,/)
 865+ {
 866+ $out_month_all->print ("# No input files found for days $days_missing!\n#\n") ;
 867+ $out_month_ge5->print ("# No input files found for days $days_missing!\n#\n") ;
 868+ print "No input files found for days $days_missing!\n\n" ;
 869+ }
 870+ else
 871+ {
 872+ $out_month_all->print ("# No input file found for day $days_missing!\n#\n") ;
 873+ $out_month_ge5->print ("# No input file found for day $days_missing!\n#\n") ;
 874+ print "No input file found for day $days_missing!\n\n" ;
 875+ }
 876+ }
 877+ else
 878+ { &Log ("Merge files: year $year, month $month\n\n") ; }
 879+
 880+ if ($#hours_missing_per_day > -1)
 881+ {
 882+ $out_month_all->print (@hours_missing_per_day) ;
 883+ $out_month_ge5->print (@hours_missing_per_day) ;
 884+ }
 885+
 886+ if ($hours_missing_coded ne '')
 887+ {
 888+ $hours_missing_coded =~ s/,$// ;
 889+ $hours_missing_coded = join (',', sort {$a cmp $b} split (',', $hours_missing_coded)) ; # single hours and full days missing added out of sort order
 890+ $out_month_all->print ("#\n# Hours missing: $hours_missing_coded\n") ;
 891+ $out_month_ge5->print ("#\n# Hours missing: $hours_missing_coded\n") ;
 892+ print "Hours missing: $hours_missing_coded\n\n" ;
 893+ }
 894+
 895+ $monthly_correction = 1 ;
 896+ if ($total_hours_missing == 0)
 897+ {
 898+ $out_month_all->print ("# Data for all hours of each day were available in input\n#\n") ;
 899+ $out_month_ge5->print ("# Data for all hours of each day were available in input\n#\n") ;
 900+ print "Data for all hours of each day were available in input\n\n" ;
 901+ }
 902+ else
 903+ {
 904+ $monthly_correction = sprintf ("%.4f", ($days_in_month * 24) / ($days_in_month * 24 - $total_hours_missing)) ;
 905+ $out_month_all->print ("#\n# In this file data for $total_hours_missing hours were not encountered in input\n") ;
 906+ $out_month_ge5->print ("#\n# In this file data for $total_hours_missing hours were not encountered in input\n") ;
 907+ $out_month_all->print ("# Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n#\n") ;
 908+ $out_month_ge5->print ("# Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n#\n") ;
 909+ print "In this file data for $total_hours_missing hours were not encountered in input\n" ;
 910+ print "Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n\n" ;
 911+ }
 912+
 913+ if ($threshold_requests_omitted > 0)
 914+ {
 915+ $out_month_all->print ("# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n") ;
 916+ $out_month_ge5->print ("# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n") ;
 917+ print "# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n" ;
 918+ }
 919+
 920+ $key_low_prev = "" ;
 921+ while ($files_in_open > 0)
 922+ {
 923+ # last if $cycles ++ > 10000 ; # test code
 924+
 925+ $key_low = "\xFF\xFF";
 926+ for ($day = 0 ; $day < $days_in_month ; $day++)
 927+ {
 928+ if (($files_in_open == $days_in_month) || ($file_in_found [$day] && $file_in_open [$day]))
 929+ {
 930+ if ($key [$day] lt $key_low)
 931+ { $key_low = $key [$day] ; }
 932+ }
 933+ }
 934+
 935+ $counts_per_month = "" ;
 936+ $total_per_month = 0 ;
 937+
 938+ for ($day = 0 ; $day < $days_in_month ; $day++)
 939+ {
 940+ if (! $file_in_found [$day])
 941+ {
 942+ # $counts_per_month .= chr ($day+ord('A')) . '?' ;
 943+ }
 944+ elsif (($files_in_open == $days_in_month) || $file_in_open [$day]) # slight optimization
 945+ {
 946+ if ($key [$day] eq $key_low)
 947+ {
 948+ $ch_day = chr ($day+ord('A')) ;
 949+ $counts_per_day = $counts [$day] ;
 950+
 951+ ($total_per_day = $counts_per_day) =~ s/^(\d+).*$/$1/ ;
 952+ $counts_per_day =~ s/^\d+// ; # remove total
 953+
 954+ $counts_per_day =~ s/([A-Z]\d+)/$ch_day$1,/g ; # prefix each hourly count with char that represent day
 955+ $counts_per_month .= $counts_per_day ;
 956+
 957+ $total_per_month += $total_per_day ;
 958+ $file = $in_day [$day] ;
 959+ # $line = <$file> ;
 960+
 961+ while ($true)
 962+ {
 963+ # if (($line = <$file>) && ($lines_read_this_month++ < 10000)) # test code
 964+ if ($line = <$file>)
 965+ {
 966+ next if $line =~ /^#/ ;
 967+ next if $line =~ /^@/ ;
 968+
 969+ $line =~ s/^([\w\-]+)2 /$1.y /o ;
 970+ $line =~ s/^([\w\-]+) /$1.z /o ;
 971+
 972+ chomp $line ;
 973+
 974+ if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
 975+ {
 976+ ($lang,$title,$counts) = split (' ', $line) ;
 977+ }
 978+ else
 979+ {
 980+ ($lang,$title,$total,$counts) = split (' ', $line) ;
 981+ $counts = "$total$counts" ;
 982+ }
 983+
 984+ $key [$day] = "$lang $title" ;
 985+ $counts [$day] = $counts ;
 986+
 987+ last ;
 988+ }
 989+ else
 990+ {
 991+ close $in_day [$day] ;
 992+
 993+ $files_in_open-- ;
 994+ $file_in_open [$day] = $false ;
 995+ $key [$day] = "\xFF\xFF";
 996+
 997+ last ;
 998+ }
 999+ }
 1000+ }
 1001+ }
 1002+ }
 1003+ if ($lines == 0)
 1004+ { &Log ("\nlines: project key\n") ; }
 1005+
 1006+ if (++$lines % 100000 == 0)
 1007+ { &Log ("$lines: $key_low\n") ; }
 1008+
 1009+ # last if $lines > 10000 ; # test
 1010+
 1011+ last if $key_low eq "\xFF\xFF" ;
 1012+
 1013+ # Q&D fix for unexplained out of order error for what seems to be invalid language
 1014+ # remember : language code without suffix gets appended by .y or .z to fix sort order
 1015+ # ^nov.mw nov1 1 8765
 1016+ # ^nov1.mw nov1 1 931 <--------------
 1017+ # ^nov 10_dw_oktobre 1 11421
 1018+ ($lang,$title) = split (' ', $key_low) ;
 1019+ if ($lang =~ /\d/)
 1020+ {
 1021+ $invalid_languages {$lang}++ ;
 1022+ &Log ("\nSkip invalid language '$lang'\n") ;
 1023+ next ;
 1024+ }
 1025+
 1026+ if ($key_low_prev gt $key_low)
 1027+ {
 1028+ for ($day = 0 ; $day < $days_in_month ; $day++)
 1029+ { &Log ("day " . ($day+1) . ": key ${key[$day]}\n") ; }
 1030+
 1031+ &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
 1032+ }
 1033+
 1034+ if (($key_low_prev eq $key_low) && ($files_in_open > 0))
 1035+ {
 1036+ for ($day = 0 ; $day < $days_in_month ; $day++)
 1037+ {
 1038+ if ($file_in_open [$day])
 1039+ { print "day " . ($day+1) . ": file open, key ${key [$day]}\n" ; }
 1040+ else
 1041+ { print "day " . ($day+1) . ": file closed, key ${key [$day]}\n" ; }
 1042+ }
 1043+ &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
 1044+ }
 1045+
 1046+ ($lang,$title) = split (' ', $key_low) ;
 1047+
 1048+ if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
 1049+ { $namespace = 'NamespaceArticles' ; }
 1050+ else
 1051+ { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
 1052+
 1053+ if (($lang ne $langprev) && ($langprev ne ""))
 1054+ {
 1055+ $filter_matches = $lang =~ $reg_exp_filter ;
 1056+ if ($filter_matches)
 1057+ { print "F $lang\n" ; }
 1058+ }
 1059+ $langprev = $lang ;
 1060+
 1061+ if (($files_in_found < $days_in_month) && ($files_in_found > 0)) # always > 0 actually
 1062+ { $total = sprintf ("%.0f",($total / $files_in_found) * $days_in_month) ; }
 1063+
 1064+ $counts_per_month =~ s/,$// ;
 1065+ $total_per_month = sprintf ("%.0f", $monthly_correction * $total_per_month) ;
 1066+
 1067+ $out_month_all->print ("$key_low $total_per_month $counts_per_month\n") ;
 1068+ if ($total_per_month ge 5)
 1069+ { $out_month_ge5->print ("$key_low $total_per_month $counts_per_month\n") ; }
 1070+
 1071+ $key_low_prev = $key_low ;
 1072+ }
 1073+
 1074+ &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
 1075+
 1076+ &Log ("[$lines, $files_in_open] $key_low\n") ;
 1077+
 1078+ $out_month_all->close () ;
 1079+ $out_month_ge5->close () ;
 1080+
 1081+ if ($bayes)
 1082+ {
 1083+ foreach $file_in (@files_this_month)
 1084+ {
 1085+ print "unlink $dir_in/$file_in (dummy run, test only)\n" ;
 1086+ # unlink "$dir_in/$file_in" ;
 1087+ }
 1088+ }
 1089+
 1090+ &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 1091+}
 1092+
 1093+sub CheckHoursMissing # scan one day's pagecount file and record which of the 24 hourly counts (A..X) are absent; updates globals $threshold_requests_omitted, $hours_missing_coded, @hours_missing_per_day, $total_hours_missing
 1094+{
 1095+ my ($year,$month,$day,$file_in) = @_ ; # $day is a 0-based day-of-month index; $year/$month only used by (disabled) test code below
 1096+ my ($hour,%hours_seen,%hours_valid,$hours_seen,$hours_missing,%hours_missing) ; # scalar $hours_seen / $hours_missing are counters; same-named hashes track per-hour flags
 1097+
 1098+# &Log ("\nCheckHoursMissing for day " . ($day+1) . "\n") ;
 1099+
 1100+ if ($bayes) # on the 'bayes' host input files are compressed; decompress through an external pipe
 1101+ {
 1102+ if ($file_in =~ /\.bz2$/)
 1103+ { open FILE_CHECK, "-|", "bzip2 -dc \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; } # NOTE(review): '||' binds to the command string (always true), so open failures are never caught; also lowercase 'abort' is not defined here (file defines &Abort) — confirm
 1104+ elsif ($file_in =~ /\.7z$/)
 1105+ { open FILE_CHECK, "-|", "7z e -so \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; } # NOTE(review): same '||' precedence issue as above
 1106+ else
 1107+ { abort ("CheckHoursMissing: unexpected file name $file_in.") ; } # NOTE(review): lowercase 'abort' — presumably &Abort intended
 1108+ }
 1109+ else
 1110+ { open FILE_CHECK, '<', $file_in || &Abort ("Open failed for '$file_in'\n") ; } # NOTE(review): '||' binds to $file_in, not to open(); use 'or' to actually trap the failure
 1111+
 1112+ binmode FILE_CHECK ;
 1113+
 1114+ $lines_checked = 0 ; # sample counter: only the first ~10,000 data lines are examined (assumes every present hour shows up near the top of the file)
 1115+ while ($line = <FILE_CHECK>)
 1116+ {
 1117+ if ($line =~ /^#.*?requests per full day are omitted/) # header line announcing a low-count threshold applied upstream
 1118+ { ($threshold_requests_omitted = $line) =~ s/[^\d]//g ; } # keep only the digits from that header line (global, reported later by caller)
 1119+
 1120+ next if $line =~ /^#/ or $line =~ /^@/ ; # skip comment / metadata lines
 1121+
 1122+ last if $lines_checked ++ > 10000 ; # sampling cut-off
 1123+
 1124+ chomp $line ;
 1125+ if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
 1126+ {
 1127+ ($lang,$title,$counts) = split (' ', $line) ;
 1128+ }
 1129+ else
 1130+ {
 1131+ ($lang,$title,$total,$counts) = split (' ', $line) ; # 4-field variant: daily total is a separate field
 1132+ $counts = "$total$counts" ; # re-join so $counts always starts with the daily total
 1133+ }
 1134+ # &Log ("Counts 1 $counts\n") ; # test
 1135+
 1136+ undef @counts ;
 1137+ # $counts = "123A1B2C?D4" ; # test
 1138+ $counts =~ s/([A-X])(\d+|\?)/(push @counts,"$1$2"),""/ge ; # split packed counts into per-hour tokens; letters A..X encode hours 0..23, '?' marks an hour with no data
 1139+ foreach $key (@counts)
 1140+ {
 1141+ my $hour = ord (substr ($key,0,1)) - ord ('A') ; # decode hour index from leading letter
 1142+
 1143+ # test code
 1144+ # if ($month % 2 == 1)
 1145+ # {
 1146+ # if ($day % 3 == 0)
 1147+ # {
 1148+ # next if $hour == 2 ;
 1149+ # if ($hour % 3 == 0)
 1150+ # { $key = substr ($key,0,1,) . '?' ; }
 1151+ # }
 1152+ # }
 1153+ # else
 1154+ # { next if $hour == 2 ; }
 1155+
 1156+ next if $hours_seen {$hour} > 0 ; # classify each hour only once per day
 1157+ $hours_seen {$hour} = $true ;
 1158+ $hours_seen ++ ;
 1159+ if ($key =~ /\d/) # a digit means a real count was present for this hour
 1160+ { $hours_valid {$hour} ++ ; }
 1161+ else
 1162+ {
 1163+ $hours_missing {$hour} ++ ;
 1164+ $hours_missing ++ ;
 1165+ $hours_missing_coded .= chr ($day + ord ('A')) . chr ($hour + ord ('A')) .',' ; # append '<dayletter><hourletter>,' code to global summary string
 1166+ }
 1167+ }
 1168+ # &Log ("Counts 2 $counts, seen: $hours_seen, valid:". (join ',', sort {$a <=> $b} keys %hours_valid) . ", missing: " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n") ; # test
 1169+
 1170+ last if $hours_seen == 24 ; # all hours classified — no need to read further
 1171+ }
 1172+
 1173+ close FILE_CHECK ;
 1174+
 1175+ for ($hour = 0 ; $hour <= 23 ; $hour++) # hours never encountered in the sample are counted as missing too
 1176+ {
 1177+ if (! $hours_seen {$hour})
 1178+ {
 1179+ $hours_missing {$hour} ++ ;
 1180+ $hours_missing ++ ;
 1181+ $hours_missing_coded .= chr ($day + ord ('A')) . chr ($hour + ord ('A')) .',' ;
 1182+ }
 1183+ }
 1184+
 1185+ if ($lines_checked > 10000)
 1186+ { &Log ("\nDay " . ($day+1) . ": not all hours encountered after 10,000 lines !!! Seen (can be ?=missing) " . (join ',', sort {$a <=> $b} keys %hours_seen) . "\n") ; }
 1187+
 1188+ if ($hours_missing > 0) # record a human-readable per-day summary for the monthly output headers
 1189+ {
 1190+ $text_hour = $hours_missing > 1 ? 'hours' : 'hour' ;
 1191+ push @hours_missing_per_day, "# Day " . ($day+1) . ": $text_hour missing " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n" ;
 1192+ print "Day " . ($day+1) . ": $text_hour missing " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n" ;
 1193+ }
 1194+
 1195+ $total_hours_missing += $hours_missing ; # global monthly tally, used by caller to compute the extrapolation factor
 1196+}
 1197+
 1198+sub Log # echo a message to stdout and to the LOG filehandle (opened elsewhere in the file — not visible in this chunk)
 1199+{
 1200+ $msg = shift ; # NOTE(review): $msg is a package global (no 'my'); harmless here but clobbers any caller's $msg
 1201+ print $msg ;
 1202+ print LOG $msg ;
 1203+}
 1204+
 1205+sub Abort # log a fatal error to stdout and LOG, then terminate the script
 1206+{
 1207+ $msg = shift ; # package global, same caveat as in Log
 1208+ print "Abort script\nError: $msg\n" ;
 1209+ print LOG "Abort script\nError: $msg\n" ;
 1210+ exit ; # NOTE(review): bare exit returns status 0 (success) to the shell; wrapper scripts cannot detect the failure — consider exit 1
 1211+}
 1212+
 1213+#=============================================================================================================
 1214+
 1215+# Snippets of obsolete but possibly revivable code / test code (kept commented out for reference)
 1216+
 1217+#sub Compact
 1218+#{
 1219+# my $day = shift ;
 1220+# &Log ("Compact files for $day\n") ;
 1221+
 1222+# $file_in = "pagecounts-$day.out" ;
 1223+# $file_out1 = "pagecounts-${day}_all.gz" ;
 1224+# $file_out2 = "pagecounts-${day}_10plus.gz" ;
 1225+# open IN, "<", $file_in ;
 1226+# binmode $file_in ;
 1227+
 1228+# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 1229+# my $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 1230+
 1231+# open OUT, ">", $file_out ;
 1232+# binmode $file_out ;
 1233+
 1234+# $lang_prev = "" ;
 1235+# while ($line = <IN>)
 1236+# {
 1237+# chomp ($line) ;
 1238+# ($lang, $title, $counts) = split (' ', $line) ;
 1239+# $title2 = $title ;
 1240+# $title =~ s/\%20/_/g ;
 1241+# $title =~ s/\%3A/:/g ;
 1242+# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 1243+# # if ($title =~ /[\x00-\x1F]/)
 1244+# # { &Log ("> '$title2'\n") ; }
 1245+# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
 1246+# print $out_day1 "$lang $title $counts\n" ;
 1247+# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
 1248+# if ($counts2 >= $threshold)
 1249+# { print $out_day2 "$lang $title $counts\n" ; }
 1250+# $lang_prev = $lang ;
 1251+# }
 1252+#
 1253+# close IN ;
 1254+# $out_day1->close() ;
 1255+# $out_day2->close() ;
 1256+#}
 1257+
 1258+
 1259+#sub GetViewDistribution
 1260+#{
 1261+# open OUT, ">", "Views.csv" ;
 1262+# foreach $file_in (@files)
 1263+# {
 1264+# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 1265+# $hour = chr(ord('A')+$hour) ;
 1266+# &Log ("Process $hour $file_in\n") ;
 1267+
 1268+# $in_hour1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
 1269+# while ($line = <$in_hour1>)
 1270+# {
 1271+# ($lang,$title,$count,$dummy) = split (' ', $line) ;
 1272+# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
 1273+# {
 1274+# $tot {$hour} += $count ;
 1275+# if ($count < 3)
 1276+# { @counts {$hour . substr ($title,0,1)}++ ; }
 1277+# }
 1278+# }
 1279+# $in_hour1->close () ;
 1280+# }
 1281+#
 1282+# print OUT "," ;
 1283+# foreach $hour ('A'..'X')
 1284+# { print OUT $hour . ", " ; }
 1285+# print OUT "\n" ;
 1286+#
 1287+# print OUT "," ;
 1288+# foreach $hour ('A'..'X')
 1289+# { print OUT $tot {$hour} . ", " ; }
 1290+# print OUT "\n" ;
 1291+#
 1292+# for ($c=0; $c < 256; $c++)
 1293+# {
 1294+# # do not print chars " and , as such: confuses csv format
 1295+# if ($c < 33)
 1296+# { print OUT "chr($c), " ; }
 1297+# elsif (chr($c) eq '"')
 1298+# { print OUT "dquote, " ; }
 1299+# elsif (chr($c) eq ',')
 1300+# { print OUT "comma, " ; }
 1301+# else
 1302+# { print OUT chr($c) . ", " ; }
 1303+#
 1304+# foreach $hour ('A'..'X')
 1305+# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
 1306+#
 1307+# if ($c < 255)
 1308+# { print OUT "\n" ; }
 1309+# }
 1310+# close OUT ;
 1311+#}
 1312+
 1313+
 1314+#sub RecompactVisitorStats
 1315+#{
 1316+# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
 1317+# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 1318+# local (*DIR);
 1319+# opendir (DIR, ".");
 1320+# @files = () ;
 1321+# while ($file_in = readdir (DIR))
 1322+# {
 1323+# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
 1324+#
 1325+# push @files, $file_in ;
 1326+# }
 1327+
 1328+# $filecnt = $#files+1 ;
 1329+# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
 1330+
 1331+# foreach $file (@files)
 1332+# { &RecompactVisitorStats2 ($file) ; }
 1333+# closedir (DIR, ".");
 1334+#}
 1335+
 1336+#sub RecompactVisitorStats2
 1337+#{
 1338+## http://www.7-zip.org/7z.html
 1339+# my $file = shift ;
 1340+# my $time_start = time ;
 1341+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 1342+## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 1343+# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
 1344+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 1345+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 1346+
 1347+# &Log ("Process $file_in\n") ;
 1348+
 1349+# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 1350+# binmode $in_hour ;
 1351+# open OUT, ">", $file_out ;
 1352+# binmode OUT ;
 1353+
 1354+# my ($title, $title2) ;
 1355+# while ($line = <$in_hour>)
 1356+# {
 1357+# chomp ($line) ;
 1358+# ($lang,$title,$counts) = split (" ", $line) ;
 1359+
 1360+# if ($lang ne $lang_prev) { print "$lang " ; }
 1361+# $lang_prev = $lang ;
 1362+
 1363+# # test pagecounts-20080701_fd.gz
 1364+# # all records 424 Mib compressed (1984 uncompressed)
 1365+# # count > 1 212 Mib compressed ( 733 uncompressed)
 1366+# # count > 2 169 Mib compressed ( 551 uncompressed)
 1367+# next if $counts <= 1 ;
 1368+
 1369+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 1370+# $title =~ s/\s/_/g;
 1371+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 1372+# $lang =~ s/\.y/2/ ;
 1373+
 1374+# print OUT "$lang $title $counts\n" ;
 1375+# }
 1376+
 1377+# print "Close files\n" ;
 1378+# $in_hour -> close () ;
 1379+# close (OUT) ;
 1380+
 1381+# &Log ("Compress $file_out\n") ;
 1382+
 1383+# unlink $file_7z ;
 1384+# $result = `$path_7z a $file_7z $file_out` ;
 1385+# &Log ("Compressed\n") ;
 1386+# &Log ("Result " . ($result+0) . " \n") ;
 1387+# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
 1388+# { unlink $file_out ; }
 1389+
 1390+# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 1391+## 0 No error
 1392+## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
 1393+## 2 Fatal error
 1394+## 7 Command line error
 1395+## 8 Not enough memory for operation
 1396+## 255 User stopped the process
 1397+#}
 1398+
 1399+
 1400+#sub RecompactVisitorStats3
 1401+#{
 1402+## http://www.7-zip.org/7z.html
 1403+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 1404+# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 1405+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 1406+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 1407+## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
 1408+
 1409+# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 1410+# binmode $in_hour ;
 1411+## $out_day = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 1412+## binmode $out_day ;
 1413+# open OUT, ">", $file_out ;
 1414+# binmode OUT ;
 1415+## open LOG, ">", $file_log ;
 1416+## binmode LOG ;
 1417+
 1418+# my ($title, $title2) ;
 1419+# while ($line = <$in_hour>)
 1420+# {
 1421+# chomp ($line) ;
 1422+# ($lang,$title,$counts) = split (" ", $line) ;
 1423+
 1424+# if ($lang ne $lang_prev) { print "$lang\n" ; }
 1425+## last if $lang gt "fs" ;
 1426+# $lang_prev = $lang ;
 1427+
 1428+# # test pagecounts-20080701_fd.gz
 1429+# # all records 424 Mib compressed (1984 uncompressed)
 1430+# # count > 1 212 Mib compressed ( 733 uncompressed)
 1431+# # count > 2 169 Mib compressed ( 551 uncompressed)
 1432+# next if $counts <= 1 ;
 1433+
 1434+## next if $lang !~ /^(?:ar|fr)/ ;
 1435+
 1436+#if ($false)
 1437+#{
 1438+# $title1b = $title ;
 1439+# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
 1440+# $title1b =~ s/\%28/(/g ;
 1441+# $title1b =~ s/\%29/)/g ;
 1442+# $title1b =~ s/\%3A/:/g ;
 1443+# $title1b =~ s/\%2F/\//g ;
 1444+# $title1b =~ s/\%5C/\\/g ;
 1445+# $title1b =~ s/\%2A/*/g ;
 1446+# $title1b =~ s/\%21/!/g ;
 1447+# $title1b =~ s/\%5F/_/g ;
 1448+# $title1b =~ s/\%2C/,/g ;
 1449+# $title1b =~ s/\%2E/./g ;
 1450+# $title1b =~ s/\%2D/-/g ;
 1451+# $title1b =~ s/\%25/%/g ;
 1452+# $title1b =~ s/\%7E/~/g ;
 1453+# $title1b =~ s/\%27/'/g ;
 1454+# $title1b =~ s/\%3D/=/g ;
 1455+# $title1b =~ s/\%26/&/g ;
 1456+# $title1b =~ s/\%3B/;/g ;
 1457+# $title1b =~ s/\%3F/?/g ;
 1458+# $title1b =~ s/\%2B/+/g ;
 1459+# $title2 = $title1b ;
 1460+# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
 1461+
 1462+# if ($title1b ne $title2) # if changed anything at all
 1463+# {
 1464+# $title3 = uri_escape ($title2) ;
 1465+# $title3 =~ s/\%28/(/g ;
 1466+# $title3 =~ s/\%29/)/g ;
 1467+# $title3 =~ s/\%3A/:/g ;
 1468+# $title3 =~ s/\%2F/\//g ;
 1469+# $title3 =~ s/\%5C/\\/g ;
 1470+# $title3 =~ s/\%2A/\*/g ;
 1471+# $title3 =~ s/\%21/\!/g ;
 1472+# $title3 =~ s/\%5F/\_/g ;
 1473+# $title3 =~ s/\%2C/,/g ;
 1474+# $title3 =~ s/\%2E/./g ;
 1475+# $title3 =~ s/\%2D/-/g ;
 1476+# $title3 =~ s/\%25/%/g ;
 1477+# $title3 =~ s/\%7E/~/g ;
 1478+# $title3 =~ s/\%27/'/g ;
 1479+# $title3 =~ s/\%3D/=/g ;
 1480+# $title3 =~ s/\%26/&/g ;
 1481+# $title3 =~ s/\%3B/;/g ;
 1482+# $title3 =~ s/\%3F/?/g ;
 1483+# $title3 =~ s/\%2B/+/g ;
 1484+
 1485+# if ($title1b eq $title3) # process reversible ?
 1486+# {
 1487+# $y++ ;
 1488+# $title2 =~ s/\s/_/g;
 1489+# $title = $title2 ;
 1490+# }
 1491+# else
 1492+# {
 1493+# $n++ ;
 1494+# print "Y $y N $n\n$title\n$title3\n\n" ;
 1495+# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
 1496+# }
 1497+# }
 1498+#}
 1499+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 1500+# $title =~ s/\s/_/g;
 1501+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 1502+# $lang =~ s/\.y/2/ ;
 1503+
 1504+# # print $out_day "$lang $title $counts\n" ;
 1505+# print OUT "$lang $title $counts\n" ;
 1506+# }
 1507+
 1508+# print "Close files\n" ;
 1509+# $in_hour -> close () ;
 1510+## $out_day -> close () ;
 1511+# close (OUT) ;
 1512+# $result = `$path_7z a $file_out $file_txt` ;
 1513+# print $result ;
 1514+#}
 1515+
 1516+
 1517+
 1518+# test (partial) reversibility of process
 1519+#sub UncompactVisitorStats
 1520+#{
 1521+# my $file_in = "out/2009-03/pagecounts-20090301_fdt" ;
 1522+# my $dir_out = "out" ;
 1523+# # $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 1524+# open $in_hour, '<', $file_in ;
 1525+# binmode $in_hour ;
 1526+
 1527+# for ($h=0 ; $h<=23 ; $h++)
 1528+# {
 1529+# $time = sprintf ("%02d",$h) . "0000" ;
 1530+## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
 1531+# $file_out = "$dir_out/pagecounts-20090301-$time" ;
 1532+# open $out_day [$h], '>', $file_out ;
 1533+## $out_day [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
 1534+# binmode $out_day [$h] ;
 1535+# }
 1536+
 1537+# while ($line = <$in_hour>)
 1538+# {
 1539+# next if $line =~ /^#/ ;
 1540+# next if $line =~ /^@/ ;
 1541+# chomp ($line) ;
 1542+## print "$line\n" ;
 1543+# if ($lines++ > 10000) { exit ; }
 1544+# ($lang,$title,$counts) = split (" ", $line) ;
 1545+# $lang =~ s/\.z// ;
 1546+# $lang =~ s/\.y/2/ ;
 1547+# $counts =~ s/^\d+// ; # remove (redundant) preceding total
 1548+# while ($counts ne "")
 1549+# {
 1550+# $letter = substr ($counts,0,1) ;
 1551+# $counts = substr ($counts,1) ;
 1552+# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 1553+# $counts =~ s/^\d+(.*)$/$1/ ;
 1554+# $h = ord ($letter) - ord ('A') ;
 1555+# $file = $out_day [$h] ;
 1556+# $writes {$h} ++ ;
 1557+# print $file "$lang $title $count\n" ;
 1558+# }
 1559+
 1560+# }
 1561+
 1562+# for ($h=0 ; $h<=23 ; $h++)
 1563+# {
 1564+## $out_day [$h] -> close () ;
 1565+# close $out_day [$h] ;
 1566+# }
 1567+#}
 1568+
 1569+

Status & tagging log