Index: trunk/wikistats/dammit.lt/dammit_report.sh |
— | — | @@ -0,0 +1 @@ |
| 2 | +perl /a/dammit.lt/DammitReportPageRequestsStaffWikis.pl |
Property changes on: trunk/wikistats/dammit.lt/dammit_report.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 3 | + native |
Index: trunk/wikistats/dammit.lt/dammit_sync.sh |
— | — | @@ -0,0 +1,2 @@ |
| 2 | +perl /a/dammit.lt/DammitSyncFiles.pl |
| 3 | +#perl /home/ezachte/wikistats/WikiCountsJobProgress.pl >> /a/dammit.lt/cron.txt |
Property changes on: trunk/wikistats/dammit.lt/dammit_sync.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 4 | + native |
Index: trunk/wikistats/dammit.lt/cellar/!DammitPageViewsPerSpecialSearch.pl |
— | — | @@ -0,0 +1,263 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | + use lib "/home/ezachte/lib" ; |
| 5 | + use EzLib ; |
| 6 | + $trace_on_exit = $true ; |
| 7 | + |
| 8 | + use CGI::Carp qw(fatalsToBrowser); |
| 9 | + use Time::Local ; |
| 10 | + use Net::Domain qw (hostname); |
| 11 | + |
| 12 | + $file_csv_pagecounts = "pagecounts-$month-$language\_fdt" ; |
| 13 | + |
| 14 | + open CSV, '>', "/a/dammit.lt/SpecialSearch.csv" ; |
| 15 | + open TXT, '>', "/a/dammit.lt/SpecialSearch.txt" ; |
| 16 | + |
| 17 | + $timestart = time ; |
| 18 | + |
| 19 | + &ScanFiles ; |
| 20 | + print "\n\nReady\n\n" ; |
| 21 | + exit ; |
| 22 | + |
| 23 | +sub ScanFiles |
| 24 | +{ |
| 25 | + print "ScanFiles\n" ; |
| 26 | + print "Filter view counts for $language to $dir_out/$file_csv_pagecounts\n\n" ; |
| 27 | + |
| 28 | + $year = 2009 ; |
| 29 | + $month = 10 ; |
| 30 | + while ($year == 2009 || ($year == 2010 && $month <= 5)) |
| 31 | + { |
| 32 | + |
| 33 | + for ($day = 1 ; $day <= 31 ; $day++) |
| 34 | + { |
| 35 | + |
| 36 | + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; |
| 37 | + $yyyymmdd = sprintf ("%04d%02d%02d", $year, $month, $day) ; |
| 38 | + |
| 39 | + $file_pagecounts = "/a/dammit.lt/pagecounts/$yyyymm/pagecounts-${yyyymmdd}_h.bz2" ; |
| 40 | + |
| 41 | + if (! -e $file_pagecounts) |
| 42 | + { print "Not found: $file_pagecounts\n" ; next ; } |
| 43 | + |
| 44 | + print ddhhmmss (time,"%d:%02d:%02d") . "\nRead $file_pagecounts\n\n" ; |
| 45 | + |
| 46 | + if ($file_pagecounts =~ /.7z$/) |
| 47 | + { open IN, "-|", "./7za e -so \"$file_pagecounts\"" || die ("Input file '$file_pagecounts' could not be opened.") ; } |
| 48 | + elsif ($file_pagecounts =~ /.bz2$/) |
| 49 | + { open IN, "-|", "bzip2 -dc \"$file_pagecounts\"" || die ("Input file '$file_pagecounts' could not be opened.") ; } |
| 50 | + else |
| 51 | + { next ; } # open IN, '<', $file_pagecounts ; } |
| 52 | + |
| 53 | + $project = "" ; |
| 54 | + while ($line = <IN>) |
| 55 | + { |
| 56 | + next if $line =~ /^#/ ; |
| 57 | + next if $line =~ /^@/ ; |
| 58 | + |
| 59 | + if ($line !~ /$project/) |
| 60 | + { |
| 61 | + if ($project eq 'en.z') |
| 62 | + { |
| 63 | + print CSV "\"=date($year,$month,$day)\",$project2,$generic,$specific,$other\n" ; |
| 64 | + print "\"=date($year,$month,$day)\",$project2,$generic,$specific,$other\n" ; |
| 65 | + $generic = 0 ; |
| 66 | + $specific = 0 ; |
| 67 | + $other = 0 ; |
| 68 | + } |
| 69 | + ($project) = split ' ', $line ; print "$project " ; |
| 70 | + } |
| 71 | + next if $line lt "en.z " ; |
| 72 | + last if $line gt "en.\xFF" ; |
| 73 | + |
| 74 | + if ($project eq 'en.z') |
| 75 | + { |
| 76 | + if ($line =~ /Special:Search/i) |
| 77 | + { |
| 78 | + ($project, $title, $counts) = split (' ', $line) ; |
| 79 | + ($project2 = $project) =~ s/\.z// ; |
| 80 | + $counts =~ s/^(\d+).*$/$1/ ; |
| 81 | + $title =~ s/,/,/g ; |
| 82 | + |
| 83 | + if ($yyyymmdd eq '20100201') |
| 84 | + { print TXT "$yyyymmdd,$project2,$counts,$title\n" ; } |
| 85 | + |
| 86 | + if ($title =~ /^Special:Search\//i) |
| 87 | + { $specific += $counts ; } |
| 88 | + elsif ($title =~ /^Special:Search/i) |
| 89 | + { $generic += $counts ; } |
| 90 | + else |
| 91 | + { $other += $counts ; } |
| 92 | + } |
| 93 | + } |
| 94 | + } |
| 95 | + close IN ; |
| 96 | + } |
| 97 | + close OUT ; |
| 98 | + $month ++ ; |
| 99 | + if ($month > 12) |
| 100 | + { $month = 1 ; $year ++ ; } |
| 101 | + |
| 102 | + } |
| 103 | +} |
| 104 | + |
| 105 | +sub CountArticles |
| 106 | +{ |
| 107 | + print "CountArticles\n" ; |
| 108 | + if (! -e "$dir_in/$file_csv_pagecounts") |
| 109 | + { print "File not found: $dir_in/$file_csv_pagecounts\n" ; exit ; } |
| 110 | + |
| 111 | + open IN, '<', "$dir_in/$file_csv_pagecounts" ; |
| 112 | + while ($line = <IN>) |
| 113 | + { |
| 114 | + chomp ($line) ; |
| 115 | + |
| 116 | + ($count,$title) = split (' ', $line,2) ; |
| 117 | +# if ($title !~ /Depardieu/) { next ; } |
| 118 | + $title =~ s/%([0-9A-F]{2})/chr(hex($1))/ge ; |
| 119 | + if ($unicodetoascii) |
| 120 | + { $title =~ s/([\x80-\xFF]{2,})/&UnicodeToAscii($1)/ge ; } |
| 121 | + $title =~ s/(\&\#\d+\;)/&HtmlToAscii($1)/ge ; |
| 122 | + $title =~ s/\"/'/g ; |
| 123 | + $title =~ s/\&/&/g ; |
| 124 | + $title = lc ($title) ; |
| 125 | +# print "X $count $title\n" ; |
| 126 | + $titles {$title} += $count ; |
| 127 | + } |
| 128 | + close IN ; |
| 129 | + |
| 130 | + open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByTitle.txt" ; |
| 131 | + open IN, '<', "$dir_out/WikiStatsArticles.csv" ; |
| 132 | + while ($line = <IN>) |
| 133 | + { |
| 134 | + chomp ($line) ; |
| 135 | + ($title,$category) = split (',',$line) ; |
| 136 | + |
| 137 | +# next if $category !~ /politicus/ ; |
| 138 | +# next if $category =~ /Nederlands/ ; |
| 139 | +# $category =~ s/-politicus// ; |
| 140 | + |
| 141 | +# if ($title !~ /Depardieu/) { next ; } |
| 142 | + $title =~ s/\%2C/,/g ; |
| 143 | + $category =~ s/\%2C/,/g ; |
| 144 | + $title =~ s/\s/_/g ; |
| 145 | + $title =~ s/(\&\#\d+\;)/&HtmlToAscii($1)/ge ; |
| 146 | + $title =~ s/\"/'/g ; |
| 147 | + $title =~ s/\&/&/g ; |
| 148 | + $title_lc = lc ($title) ; |
| 149 | + $count = ($titles {$title_lc}+0) ; # force numeric |
| 150 | +# print "Y $count $title_lc\n" ; |
| 151 | + print OUT sprintf ("%5d",$count) . " " . $title . "\n" ; |
| 152 | + if ($title ne $title_prev) |
| 153 | + { $articles {$title} += $count ; } |
| 154 | + $title_prev = $title ; |
| 155 | + $categories {$category} += $count ; |
| 156 | + $titlecat {$title} = $category ; |
| 157 | + } |
| 158 | + close IN ; |
| 159 | + close OUT ; |
| 160 | + |
| 161 | + open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByTitle.txt" ; |
| 162 | + print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ; |
| 163 | + foreach $article (sort keys %articles) |
| 164 | +# { print OUT sprintf ("%5d",$articles {$article}) . " " . $article . "\n" ; } |
| 165 | + { &Print ($articles {$article}, $article) ; } |
| 166 | + close OUT ; |
| 167 | + |
| 168 | + open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByViews.txt" ; |
| 169 | + print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ; |
| 170 | + foreach $article (sort {$articles {$b} <=> $articles {$a}} keys %articles) |
| 171 | +# { print OUT sprintf ("%5d",$articles {$article}) . " " . $article . "\n" ; } |
| 172 | + { &Print ($articles {$article}, $article) ; } |
| 173 | + close OUT ; |
| 174 | + |
| 175 | + open OUT, '>', "$dir_out/WikiStatsPageViewsPerCategorySortByTitle.txt" ; |
| 176 | + print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ; |
| 177 | + foreach $category (sort keys %categories) |
| 178 | +# { print OUT sprintf ("%5d",$categories {$category}) . " " . $category . "\n" ; } |
| 179 | + { &Print ($categories {$category}, $category) ; } |
| 180 | + close OUT ; |
| 181 | + |
| 182 | + open OUT, '>', "$dir_out/WikiStatsPageViewsPerCategorySortByViews.txt" ; |
| 183 | + print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ; |
| 184 | + foreach $category (sort {$categories {$b} <=> $categories {$a}} keys %categories) |
| 185 | +# { print OUT sprintf ("%5d",$categories {$category}) . " " . $category . "\n" ; } |
| 186 | + { &Print ($categories {$category}, $category) ; } |
| 187 | + close OUT ; |
| 188 | + |
| 189 | +# open OUT, '>', "$dir_out/WikiStatsPageViewsPerPerArticleSortByViewsPvdA.csv" ; |
| 190 | +# print OUT "politicus,partij,hits,kleur\n" ; |
| 191 | +# foreach $article (sort {$articles {$b} <=> $articles {$a}} keys %articles) |
| 192 | +# { |
| 193 | +# last if $articles {$article} == 0 ; |
| 194 | +# next if $titlecat {$article} !~ /pvda/i ; |
| 195 | +# $color = int(rand(255)) ; |
| 196 | +# print OUT "$article,${titlecat {$article}},${articles {$article}},$color\n" ; |
| 197 | +# } |
| 198 | +# close OUT ; |
| 199 | + |
| 200 | +} |
| 201 | + |
| 202 | +sub Print |
| 203 | +{ |
| 204 | + my $count = shift ; |
| 205 | + my $text = shift ; |
| 206 | + print OUT sprintf ("%5d",$count) . " p/m = " . sprintf ("%4.0f",$count/$daysinmonth) . " p/d : $text\n" ; |
| 207 | +} |
| 208 | + |
| 209 | +# translates one unicode character into plain ascii |
| 210 | +sub UnicodeToAscii { |
| 211 | + my $unicode = shift ; |
| 212 | + |
| 213 | + my $char = substr ($unicode,0,1) ; |
| 214 | + my $ord = ord ($char) ; |
| 215 | + my ($c, $value, $html) ; |
| 216 | + |
| 217 | + if ($ord < 128) # plain ascii character |
| 218 | + { return ($unicode) ; } # (will not occur in this script) |
| 219 | + else |
| 220 | + { |
| 221 | + if ($ord >= 252) { $value = $ord - 252 ; } |
| 222 | + elsif ($ord >= 248) { $value = $ord - 248 ; } |
| 223 | + elsif ($ord >= 240) { $value = $ord - 240 ; } |
| 224 | + elsif ($ord >= 224) { $value = $ord - 224 ; } |
| 225 | + else { $value = $ord - 192 ; } |
| 226 | + |
| 227 | + for ($c = 1 ; $c < length ($unicode) ; $c++) |
| 228 | + { $value = $value * 64 + ord (substr ($unicode, $c,1)) - 128 ; } |
| 229 | + |
| 230 | + if ($value < 256) |
| 231 | + { return (chr ($value)) ; } |
| 232 | + |
| 233 | + # $unicode =~ s/([\x80-\xFF])/("%".sprintf("%02X",$1))/gie ; |
| 234 | + return ($unicode) ; |
| 235 | + } |
| 236 | +} |
| 237 | + |
| 238 | +sub HtmlToAscii { |
| 239 | + my $html = shift ; |
| 240 | + my $html2 = $html ; |
| 241 | + $html2 =~ s/[^\d]//g ; |
| 242 | + if ($html2 <= 255) |
| 243 | + { return (chr ($html2)) ; } |
| 244 | + else |
| 245 | + { return ($html) ; } |
| 246 | +} |
| 247 | + |
| 248 | +sub Log |
| 249 | +{ |
| 250 | + $msg = shift ; |
| 251 | + print $msg ; |
| 252 | + print FILE_LOG $msg ; |
| 253 | +} |
| 254 | + |
| 255 | +sub Abort |
| 256 | +{ |
| 257 | + $msg = shift ; |
| 258 | + print "Abort script\nError: $msg\n" ; |
| 259 | + print LOG "Abort script\nError: $msg\n" ; |
| 260 | + exit ; |
| 261 | +} |
| 262 | + |
| 263 | + |
| 264 | + |
Index: trunk/wikistats/dammit.lt/cellar/!DammitCollectViewsOneArticle.pl |
— | — | @@ -0,0 +1,199 @@ |
| 2 | +#!/usr/local/bin/perl
|
| 3 | +
|
| 4 | +# 27 April 2010 renamed from WikiStatsCollectViewsOneArticle.pl
|
| 5 | +
|
| 6 | + use CGI qw(:all);
|
| 7 | + use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
|
| 8 | + use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
|
| 9 | +
|
| 10 | + $| = 1; # flush screen output
|
| 11 | + $true = 1 ;
|
| 12 | + $false = 0 ;
|
| 13 | + $mode = "H" ; # daily files as opposed to H
|
| 14 | +
|
| 15 | +# $dir0 = "D:/Wikipedia_Visitors/full_day" ;
|
| 16 | + $dir0 = "D:/Wikipedia_Visitors" ;
|
| 17 | + chdir ($dir0) || die "Cannot chdir to $dir0\n";
|
| 18 | +
|
| 19 | +# open TXT, ">", "JoeBiden.txt" ;
|
| 20 | +# open TXT, ">", "FalungGong.txt" ;
|
| 21 | +# &ProcessMonth (2008,7) ;
|
| 22 | +# &ProcessMonth (2008,8) ;
|
| 23 | +# close TXT ;
|
| 24 | +
|
| 25 | + &ProcessSelection ;
|
| 26 | +
|
| 27 | + exit ;
|
| 28 | +
|
| 29 | +sub ProcessSelection
|
| 30 | +{
|
| 31 | + open "IN", "<", "FalungGong.txt" ;
|
| 32 | + open "OUT", ">", "FalungGongTotals.csv" ;
|
| 33 | + while ($line = <IN>)
|
| 34 | + {
|
| 35 | + chomp ($line) ;
|
| 36 | + $line =~ s/\s+/ /g ;
|
| 37 | + ($timestamp, $project, $count, $title) = split (' ', $line) ;
|
| 38 | + # $timestamp =~ s/\d\d\d\d$// ; # discard minutes and seconds
|
| 39 | + $timestamp =~ s/\-\d\d\d\d\d\d$// ; # discard hours, minutes and seconds
|
| 40 | + if ($project eq "zh")
|
| 41 | + { @counts_zh {$timestamp} += $count ; }
|
| 42 | + else
|
| 43 | + { @counts_other {$timestamp} += $count ; }
|
| 44 | + }
|
| 45 | + close IN ;
|
| 46 | +
|
| 47 | +foreach $date (sort keys %counts_zh)
|
| 48 | +{
|
| 49 | + $year = substr ($date,0,4) ;
|
| 50 | + $month = substr ($date,4,2) ;
|
| 51 | + $day = substr ($date,6,2) ;
|
| 52 | + $timestamp = sprintf ("%02d/%02d/%04d", $day, $month, $year) ;
|
| 53 | + print OUT $timestamp . "," . (@counts_zh {$date}) . "\n" ;
|
| 54 | +}
|
| 55 | +
|
| 56 | +if (0)
|
| 57 | +{
|
| 58 | + $month = 7 ;
|
| 59 | + for $day (1..31)
|
| 60 | + {
|
| 61 | + for $hour (0..23)
|
| 62 | + {
|
| 63 | + $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 7, $day, $hour) ;
|
| 64 | + $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 7, 2008, $hour, 0) ;
|
| 65 | + print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
|
| 66 | + }
|
| 67 | + }
|
| 68 | +
|
| 69 | + $month = 8 ;
|
| 70 | + for $day (1..31)
|
| 71 | + {
|
| 72 | + for $hour (0..23)
|
| 73 | + {
|
| 74 | + $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 8, $day, $hour) ;
|
| 75 | + $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 8, 2008, $hour, 0) ;
|
| 76 | + print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
|
| 77 | + }
|
| 78 | + }
|
| 79 | +
|
| 80 | + $month = 9 ;
|
| 81 | + for $day (1..14)
|
| 82 | + {
|
| 83 | + for $hour (0..23)
|
| 84 | + {
|
| 85 | + $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 9, $day, $hour) ;
|
| 86 | + $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 9, 2008, $hour, 0) ;
|
| 87 | + print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
|
| 88 | + }
|
| 89 | + }
|
| 90 | +}
|
| 91 | + close OUT ;
|
| 92 | +}
|
| 93 | +
|
| 94 | +sub ProcessMonth
|
| 95 | +{
|
| 96 | + my $year = shift ;
|
| 97 | + my $month = sprintf ("%02d", shift) ;
|
| 98 | +
|
| 99 | + $dir0 =~ s/[\\\/]$// ;
|
| 100 | +
|
| 101 | + $dir_in = "$dir0/$year-$month-pagecounts" ;
|
| 102 | + &Log ("Process year $year month $month from '$dir_in'\n") ;
|
| 103 | + chdir ($dir_in) || die "Cannot chdir to $dir_in\n";
|
| 104 | + local (*DIR);
|
| 105 | +
|
| 106 | + opendir (DIR, ".");
|
| 107 | + @files = () ;
|
| 108 | + while ($file_in = readdir (DIR))
|
| 109 | + {
|
| 110 | + if ($mode eq "H")
|
| 111 | + {
|
| 112 | + if ($file_in !~ /^pagecounts-\d{8,8}-\d{6,6}.gz$/)
|
| 113 | + { next ; }
|
| 114 | + if ($file_in lt "pagecounts-20080816-000000.gz")
|
| 115 | + { next ; }
|
| 116 | +# if ($file_in ge "pagecounts-20080831-000000.gz")
|
| 117 | +# { next ; }
|
| 118 | + }
|
| 119 | + if ($mode eq "D")
|
| 120 | + {
|
| 121 | + if ($file_in !~ /^pagecounts-\d{8,8}_fd.gz$/)
|
| 122 | + { next ; }
|
| 123 | +# if ($file_in lt "pagecounts-20080801_fd.gz")
|
| 124 | +# { next ; }
|
| 125 | +# if ($file_in ge "pagecounts-20080831_fd.gz")
|
| 126 | +# { next ; }
|
| 127 | + }
|
| 128 | + push @files, $file_in ;
|
| 129 | + }
|
| 130 | + closedir (DIR, ".");
|
| 131 | +
|
| 132 | + @files = sort {$a cmp $b} @files ;
|
| 133 | +
|
| 134 | + foreach $file (@files)
|
| 135 | + { &ProcessFile ($file) ; }
|
| 136 | +}
|
| 137 | +
|
| 138 | +sub ProcessFile
|
| 139 | +{
|
| 140 | + my $file = shift ;
|
| 141 | + $date = substr ($file, 11, 8) ;
|
| 142 | + $time = substr ($file, 20, 6) ;
|
| 143 | + print "ProcessFile ($file)\n" ;
|
| 144 | +
|
| 145 | + my $lines ;
|
| 146 | + $in_gz = IO::Uncompress::Gunzip->new ($file) or die "IO::Uncompress::Gunzip failed for '$file': $GunzipError\n";
|
| 147 | + binmode $in_gz ;
|
| 148 | + while ($line = <$in_gz>)
|
| 149 | + {
|
| 150 | + # if ($line ge "eo")
|
| 151 | + # { last ; }
|
| 152 | + # if ($line !~ /^en /)
|
| 153 | + # { next ; }
|
| 154 | + # if ($lines ++ == 0) { print "$line" ; }
|
| 155 | +
|
| 156 | +# if ($line =~ /sarah.*palin/i)
|
| 157 | +# if ($line =~ /joe.*biden/i)
|
| 158 | + if ($line =~ / \%E6\%B3\%95\%E8\%BD\%AE\%E5\%8A\%9F /)
|
| 159 | + {
|
| 160 | + if ($mode eq "H")
|
| 161 | + {
|
| 162 | + ($wiki,$title,$views,$bytes) = split (' ', $line) ;
|
| 163 | + $line = sprintf ("%-10s", $wiki) . " " . sprintf ("%8d",$views) . " $title\n" ;
|
| 164 | + print "$date-$time $line" ;
|
| 165 | + print TXT "$date-$time $line" ;
|
| 166 | + }
|
| 167 | + if ($mode eq "D")
|
| 168 | + {
|
| 169 | + chomp ($line) ;
|
| 170 | +
|
| 171 | + ($wiki,$title,$views_all_day) = split (' ', $line) ;
|
| 172 | + $wiki =~ s/\.z// ;
|
| 173 | + $wiki =~ s/\.y/2/ ;
|
| 174 | + $views_all_day =~ s/^\d+// ; # remove (redundant) preceding total
|
| 175 | + while ($views_all_day ne "")
|
| 176 | + {
|
| 177 | + $letter = substr ($views_all_day,0,1) ;
|
| 178 | + $views_all_day = substr ($views_all_day,1) ;
|
| 179 | + ($views_one_hour = $views_all_day) =~ s/^(\d+).*$/$1/ ;
|
| 180 | + $views_all_day =~ s/^\d+(.*)$/$1/ ;
|
| 181 | + $time = sprintf ("%02d",ord ($letter) - ord ('A')) . "0000" ;
|
| 182 | +
|
| 183 | + $line = sprintf ("%-10s", $wiki) . " " . sprintf ("%8d",$views_one_hour) . " $title\n" ;
|
| 184 | + print "$date-$time $line" ;
|
| 185 | + print TXT "$date-$time $line" ;
|
| 186 | + }
|
| 187 | + }
|
| 188 | + }
|
| 189 | + }
|
| 190 | +
|
| 191 | + $in_gz->close() ;
|
| 192 | +}
|
| 193 | +
|
| 194 | +sub Log
|
| 195 | +{
|
| 196 | + $msg = shift ;
|
| 197 | + print $msg ;
|
| 198 | + print LOG $msg ;
|
| 199 | +}
|
| 200 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitFilterDailyPagecountFilesPerLanguage.pl |
— | — | @@ -0,0 +1,156 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# 27 April 2010 renamed from WikiStatsFilterCompactedDammitFilesPerLanguage.pl |
| 5 | + |
| 6 | + use lib "/home/ezachte/lib" ; |
| 7 | + use EzLib ; |
| 8 | + $trace_on_exit = $true ; |
| 9 | + |
| 10 | + use CGI::Carp qw(fatalsToBrowser); |
| 11 | + use Time::Local ; |
| 12 | + use Net::Domain qw (hostname); |
| 13 | + |
| 14 | + $language = "nl" ; |
| 15 | + $wikipedia = "$language.wikipedia.org" ; # read from input |
| 16 | + |
| 17 | + $path_in = "." ; |
| 18 | + $path_out = "." ; |
| 19 | + if ($hostname eq "bayes") |
| 20 | + { |
| 21 | + $path_in = "/a/dammit.lt/pagecounts" ; |
| 22 | + $path_out = "/a/dammit.lt/pagecounts/languages/$language.z" ; |
| 23 | + if (! -d $path_out) |
| 24 | + { mkdir $path_out, 0777 ; } |
| 25 | + $path_7za = "/usr/lib/p7zip/7za" ; |
| 26 | + } |
| 27 | + |
| 28 | + $month = 8 ; |
| 29 | + $year = 2008 ; |
| 30 | + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; |
| 31 | + $path_in_monthly = "$path_in/$yyyymm" ; |
| 32 | + while (-d $path_in_monthly) |
| 33 | + { |
| 34 | + print "\nCheck dir $path_in_monthly\n" ; |
| 35 | + |
| 36 | + $file_filtered = "$path_out/pagecounts-$yyyymm-$language-fdt.txt" ; |
| 37 | + |
| 38 | + if ($hostname eq "bayes") |
| 39 | + { |
| 40 | + $file_filtered_7z = "$file_filtered.7z" ; |
| 41 | + |
| 42 | + if (-e $file_filtered_7z) |
| 43 | + { print "File $file_filtered_7z already exists\n" ; } |
| 44 | + else |
| 45 | + { &FilterCounts ($yyyymm, $file_filtered) ; } |
| 46 | + } |
| 47 | + else |
| 48 | + { &FilterCounts ($yyyymm, $file_filtered) ; } |
| 49 | + |
| 50 | + $month++ ; |
| 51 | + if ($month > 12) |
| 52 | + { $month = 1 ; $year++ ; } |
| 53 | + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; |
| 54 | + $path_in_monthly = "$path_in/$yyyymm" ; |
| 55 | + } |
| 56 | + |
| 57 | + print "\n\nReady\n\n" ; |
| 58 | + exit ; |
| 59 | + |
| 60 | +sub FilterCounts |
| 61 | +{ |
| 62 | + my ($yyyymm, $file_filtered) = @_ ; |
| 63 | + ($yyyymm2 = $yyyymm) =~ s/-// ; |
| 64 | + |
| 65 | + open OUT, '>', $file_filtered ; |
| 66 | + |
| 67 | + print OUT "# Counts for articles with less than a few requests per full day (before April 2010 five per day, from then on two per day) were not preserved in daily archives and hence are neither included here\n" ; |
| 68 | +# print OUT "# Subproject is language code, followed by project code\n" ; |
| 69 | +# print OUT "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ; |
| 70 | + print OUT "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ; |
| 71 | + print OUT "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ; |
| 72 | + print OUT "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ; |
| 73 | + print OUT "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ; |
| 74 | + print OUT "# Please reconcile with real namespace name strings later\n" ; |
| 75 | + print OUT "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ; |
| 76 | + print OUT "# Page titles are shown unmodified (preserves sort sequence)\n" ; |
| 77 | + |
| 78 | + |
| 79 | + for ($day = 1 ; $day <= 31 ; $day++) |
| 80 | + { |
| 81 | + $yyyymmdd = "$yyyymm-" . sprintf ("%02d", $day) ; |
| 82 | + |
| 83 | + $file_pagecounts = "$path_in/$yyyymm/pagecounts-$yyyymm2" . sprintf ("%02d", $day) . "_fdt" ; |
| 84 | + if ($hostname eq "bayes") |
| 85 | + { $file_pagecounts .= ".7z" ; } |
| 86 | + |
| 87 | + |
| 88 | + if (! -e $file_pagecounts) |
| 89 | + { |
| 90 | + print "\nNot found: $file_pagecounts\n" ; |
| 91 | + print OUT "# $yyyymmdd missing!\n" ; |
| 92 | + next ; |
| 93 | + } |
| 94 | + |
| 95 | + print "Read $file_pagecounts\n" ; |
| 96 | + print OUT "# $yyyymmdd\n" ; |
| 97 | + |
| 98 | + if ($hostname eq "bayes") |
| 99 | + { open IN, "-|", "./7za e -so \"$file_pagecounts\"" || die ("Input file '" . $file_pagecounts . "' could not be opened.") ; } |
| 100 | + else |
| 101 | + { open IN, '<', $file_pagecounts ; } |
| 102 | + |
| 103 | + while ($line = <IN>) |
| 104 | + { |
| 105 | + $ch = substr ($line,0,1) ; |
| 106 | + |
| 107 | + next if $ch eq '#' ; # comments |
| 108 | + |
| 109 | + if ($ch eq '@') # summary per language project |
| 110 | + { |
| 111 | + if ($line =~ /^\@ $language\.z /o) |
| 112 | + { print OUT $line ; } |
| 113 | + next ; |
| 114 | + } |
| 115 | + |
| 116 | + next if $line lt "$language.z" ; |
| 117 | + last if $line !~ /$language.z / ; |
| 118 | + |
| 119 | + ($project, $title, $counts) = split (' ', $line) ; |
| 120 | + print OUT "$title $counts\n" ; |
| 121 | + } |
| 122 | + close IN ; |
| 123 | + } |
| 124 | + close OUT ; |
| 125 | + |
| 126 | + $cmd = "$path_7za a $file_filtered.7z $file_filtered" ; |
| 127 | + $result = `$cmd` ; |
| 128 | + |
| 129 | + if ($result =~ /Everything is Ok/s) |
| 130 | + { |
| 131 | + $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ; |
| 132 | + unlink $file_filtered ; |
| 133 | + } |
| 134 | + else |
| 135 | + { |
| 136 | + print "Delete $file_filtered.7z\n" ; |
| 137 | + unlink "$file_filtered.7z" ; |
| 138 | + } |
| 139 | + |
| 140 | + print "$cmd -> $result\n" ; |
| 141 | +} |
| 142 | + |
| 143 | +sub Log |
| 144 | +{ |
| 145 | + $msg = shift ; |
| 146 | + print $msg ; |
| 147 | +} |
| 148 | + |
| 149 | +sub Abort |
| 150 | +{ |
| 151 | + $msg = shift ; |
| 152 | + print "Abort script\nError: $msg\n" ; |
| 153 | + exit ; |
| 154 | +} |
| 155 | + |
| 156 | + |
| 157 | + |
Index: trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForFundraiser.pl |
— | — | @@ -0,0 +1,86 @@ |
| 2 | +#!/usr/bin/perl
|
| 3 | +
|
| 4 | +$| = 1; # flush screen output
|
| 5 | +
|
| 6 | +open IN, '<', 'DammitPatchProjectcountsForFundraiser/AllSquids.csv' ;
|
| 7 | +open LOG, '>', 'DammitPatchProjectcountsForFundraiser/Log.txt' ;
|
| 8 | +
|
| 9 | +chdir ("DammitPatchProjectcountsForFundraiser") || die "Cannot chdir to DammitPatchProjectcountsForFundraiser\n" ;
|
| 10 | +
|
| 11 | +while ($line = <IN>)
|
| 12 | +{
|
| 13 | + chomp $line ;
|
| 14 | +
|
| 15 | + next if $line =~ /[*]/ ;
|
| 16 | + next if $line !~ /^2010/ ;
|
| 17 | +
|
| 18 | + ($date,$hour,$events,$avg_delta) = split (',', $line) ;
|
| 19 | +
|
| 20 | + next if $avg_delta <= 1005 ; # normally projectcounts also miss a few hits, overcorrecting would skew trends
|
| 21 | + &Patch ($date, $hour, $avg_delta) ;
|
| 22 | +}
|
| 23 | +
|
| 24 | +print "\n\nReady\n\n" ;
|
| 25 | +exit ;
|
| 26 | +
|
| 27 | +sub Patch
|
| 28 | +{
|
| 29 | + ($date,$hour,$avg_delta) = @_ ;
|
| 30 | +
|
| 31 | + $date =~ s/-//g ;
|
| 32 | + $file = "projectcounts-$date-" . sprintf ("%02d",$hour) . "0000" ;
|
| 33 | +
|
| 34 | + if (! -e $file)
|
| 35 | + {
|
| 36 | + $file = "projectcounts-$date-" . sprintf ("%02d",$hour) . "0001" ;
|
| 37 | + if (! -e $file)
|
| 38 | + {
|
| 39 | + print "File '$file' missing!\n" ;
|
| 40 | + exit ;
|
| 41 | + }
|
| 42 | + }
|
| 43 | + &PatchFile ($file, $avg_delta) ;
|
| 44 | +}
|
| 45 | +
|
| 46 | +sub PatchFile
|
| 47 | +{
|
| 48 | + my ($file,$avg_delta) = @_ ;
|
| 49 | + my $line ;
|
| 50 | + $correction = $avg_delta / 1000 ;
|
| 51 | + print "Patch file $file: avg delta $avg_delta -> correction $correction\n" ;
|
| 52 | +
|
| 53 | + open PROJECTFILE, '<', $file || die "Could not open '$file'\n" ;
|
| 54 | +
|
| 55 | + undef @projectfile ;
|
| 56 | + $file_changed = 0 ;
|
| 57 | + while ($line = <PROJECTFILE>)
|
| 58 | + {
|
| 59 | + chomp $line ;
|
| 60 | + ($project,$dash,$count,$bytes) = split (' ', $line) ;
|
| 61 | +
|
| 62 | + if ($bytes > 0)
|
| 63 | + {
|
| 64 | + $count = sprintf ("%.0f", $correction * $count) ;
|
| 65 | + # &Log ("\n$line ->\n") ;
|
| 66 | + $line = "$project $dash $count 1" ;
|
| 67 | + # &Log ("$line\n") ;
|
| 68 | + }
|
| 69 | + push @projectfile, "$line\n" ;
|
| 70 | + }
|
| 71 | +
|
| 72 | + close PROJECTFILE ;
|
| 73 | +
|
| 74 | + open PROJECTFILE, '>', $file || die "Could not open '$file'\n" ;
|
| 75 | + print PROJECTFILE @projectfile ;
|
| 76 | + close PROJECTFILE ;
|
| 77 | +}
|
| 78 | +
|
| 79 | +sub Log
|
| 80 | +{
|
| 81 | + my $msg = shift ;
|
| 82 | + print $msg ;
|
| 83 | + print LOG $msg ;
|
| 84 | +}
|
| 85 | +
|
| 86 | +
|
| 87 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.pl |
— | — | @@ -0,0 +1,44 @@ |
| 2 | +#!/usr/local/bin/perl
|
| 3 | +use CGI qw(:all);
|
| 4 | +
|
| 5 | + open IN, '<', "pagecounts-20090301_fdt" ;
|
| 6 | + open OUT, '>', "!DammitRankSpecialPages.txt" ;
|
| 7 | + $projprev = "" ;
|
| 8 | + while ($line = <IN>)
|
| 9 | + {
|
| 10 | + if ($line =~ /^#/) { next ; }
|
| 11 | + if ($line =~ /^@/) { next ; }
|
| 12 | + # if (($line !~ / Wikipedia\:/) && ($line !~ / Help\:/) && ($line !~ / Hilfe\:/) && ($line !~ / Wikip�dia\:/) && ($line !~ / Aide\:/) )
|
| 13 | + if (($line !~ / Help\:/) && ($line !~ / Hilfe\:/) && ($line !~ / Aide\:/))
|
| 14 | + { next ; }
|
| 15 | +
|
| 16 | + chomp ($line) ;
|
| 17 | + ($project, $title, $counts) = split (' ', $line) ;
|
| 18 | + $project =~ s/^([^\.]+)\.z/wikipedia:$1/ ;
|
| 19 | + $project =~ s/^([^\.]+)\.b/wikibooks:$1/ ;
|
| 20 | + $project =~ s/^([^\.]+)\.d/wiktionary:$1/ ; # dictionaire
|
| 21 | + $project =~ s/^([^\.]+)\.m/wikimedia:$1/ ;
|
| 22 | + $project =~ s/^([^\.]+)\.n/wikinews:$1/ ;
|
| 23 | + $project =~ s/^([^\.]+)\.q/wikiquote:$1/ ;
|
| 24 | + $project =~ s/^([^\.]+)\.s/wikisource:$1/ ;
|
| 25 | + $project =~ s/^([^\.]+)\.v/wikiversity:$1/ ;
|
| 26 | + $project =~ s/^([^\.]+)\.x/wikispecial:$1/ ;
|
| 27 | + if ($project ne $projprev)
|
| 28 | + {
|
| 29 | + $rows = 0 ;
|
| 30 | + foreach $key (sort {$counts {$b} <=> $counts {$a}} keys %counts)
|
| 31 | + {
|
| 32 | + print OUT sprintf ("%8d", $counts {$key} ) . ": $key\n" ;
|
| 33 | + if ($rows++ > 50)
|
| 34 | + { last ;}
|
| 35 | + }
|
| 36 | + undef %counts ;
|
| 37 | + }
|
| 38 | + $projprev = $project ;
|
| 39 | +
|
| 40 | + $counts =~ s/^(\d+).*$/$1/ ;
|
| 41 | + @counts {"$project $title"} += $counts ;
|
| 42 | + }
|
| 43 | +
|
| 44 | +
|
| 45 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitCollectArticleNames.pl |
— | — | @@ -0,0 +1,152 @@ |
| 2 | +#!/usr/local/bin/perl
|
| 3 | +
|
| 4 | +# 27 April 2010 renamed from WikiStatsCollectArticleNames.pl
|
| 5 | +
|
| 6 | +use CGI qw(:all);
|
| 7 | +use Time::Local ;
|
| 8 | +use Getopt::Std ;
|
| 9 | +
|
| 10 | + &ParseArguments ;
|
| 11 | + $dumpfile = &FindDumpFile ;
|
| 12 | + &ProcessFile ($dumpfile, "$path_out/$mode\_$project.txt") ;
|
| 13 | + print "\n\nReady\n\n" ;
|
| 14 | + exit ;
|
| 15 | +
|
| 16 | +sub ParseArguments
|
| 17 | +{
|
| 18 | + my $options ;
|
| 19 | + getopt ("iomp", \%options) ;
|
| 20 | +
|
| 21 | + &Abort ("Specify input folder for xml dump files as: -i path") if (! defined (@options {"i"})) ;
|
| 22 | + &Abort ("Specify output folder as: -o path") if (! defined (@options {"o"})) ;
|
| 23 | +
|
| 24 | + $path_in = @options {"i"} ;
|
| 25 | + $path_out = @options {"o"} ;
|
| 26 | + $project = @options {"p"} ;
|
| 27 | + $mode = @options {"m"} ;
|
| 28 | +
|
| 29 | + $language = $project ;
|
| 30 | + $language_ = $language ;
|
| 31 | + $language_ =~ s/-/_/g ;
|
| 32 | +
|
| 33 | + if ($mode eq "")
|
| 34 | + { $mode = "wp" ; }
|
| 35 | + if ($mode !~ /^(?:wb|wk|wn|wp|wq|ws|wx|wv)$/)
|
| 36 | + { abort ("Specify mode as: -m [wb|wk|wn|wp|wq|ws|wx|wv]\n(wp=wikipedia (default), wb=wikibooks, wk=wiktionary, wn=wikinews, wq=wikiquote, ws=wikisource, wx=wikispecial, wv=wikiversity)") ; }
|
| 37 | +
|
| 38 | + &Abort ("Project $project is skipped: 'mania' and/or 'team' in the name") if ($project =~ /(?:mania|team)/i) ;
|
| 39 | +
|
| 40 | + if ($project =~ /wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$/i)
|
| 41 | + {
|
| 42 | + $project_suffix = $project ;
|
| 43 | + $project_suffix =~ s/wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$// ;
|
| 44 | + }
|
| 45 | + $language =~ s/wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$// ;
|
| 46 | +
|
| 47 | + if ($project =~ /wiki$/i)
|
| 48 | + {
|
| 49 | + $project_suffix = $project ;
|
| 50 | + $project_suffix =~ s/wiki$// ;
|
| 51 | + }
|
| 52 | + $language =~ s/wiki$// ;
|
| 53 | +
|
| 54 | + &Log ("Project '$project' -> language '$language'\n\n") ;
|
| 55 | +}
|
| 56 | +
|
# Locate the most recent completed dump directory under $path_in (package
# global) and return the path of its pages-meta-current dump file.
# A candidate dir must be named YYYYMMDD, contain status.html + index.html,
# report "dump complete" in status.html, and index.html must not flag a
# failed full-history pass. Side effects: appends the chosen dir to the
# global $path_in, sets the global $dumpdate, logs progress via &Log,
# and aborts via &Abort when no valid dump dir exists.
sub FindDumpFile
{
  my ($dumpdir,$dir,$file,$scandir,$status) ;

  $dumpdir = '' ; # initialize: avoids 'lt' comparison against undef below

  @files = glob "$path_in/*" ;

  &Log ("Find latest valid dump dir in $path_in ->\n\n") ;
  foreach $file (@files)
  {
    # only consider directories named exactly eight digits (YYYYMMDD)
    if ($file !~ /\/\d{8}$/)
    { next ; }
    if (! -d $file)
    { next ; }

    ($dir = $file) =~ s/.*?\/(\d{8})/$1/ ;
    $scandir = "$path_in/$dir" ;
    if (! -e "$scandir/status.html")
    { &Log ("$scandir/status.html not found\n") ; }
    elsif (! -e "$scandir/index.html")
    { &Log ("$scandir/index.html not found\n") ; }
    else
    {
      # low-precedence 'or' so the error handler actually guards the open
      open STATUS, '<', "$scandir/status.html" or &Abort ("Status file '$scandir/status.html' could not be opened.") ;
      $line = <STATUS> ;
      close STATUS ;
      $line = '' if ! defined $line ; # empty status file: avoid chomp/match on undef
      chomp $line ;
      $status = "undetermined: $line" ;
      if ($line =~ /dump complete/i)
      { $status = "dump complete" ; }
      elsif ($line =~ /dump aborted/i)
      { $status = "dump aborted" ; }
      elsif ($line =~ /dump in progress/i)
      { $status = "dump in progress" ; }
      # string compare works here because both are fixed-width YYYYMMDD
      if ($dumpdir lt $dir)
      {
        if ($status eq "dump complete")
        {
          # status.html can say "complete" even when the full-history pass
          # failed; index.html is the authoritative per-step report
          open INDEX, '<', "$scandir/index.html" or &Abort ("Index file '$scandir/index.html' could not be opened.") ;
          while ($line = <INDEX>)
          {
            if ($line =~ /failed.*?All pages with complete.*?edit history/i)
            {
              $status = "dump aborted (dump failed)" ;
              last ;
            }
          }
          close INDEX ;
        }
        if ($status eq "dump complete")
        { $dumpdir = $dir ; }
      }
      &Log ("$dir: $status\n") ;
    }
  }
  if ($dumpdir eq "")
  { &Abort ("No valid dump dir found\n") ; }

  $path_in .= "/$dumpdir/" ;
  &Log ("\nDump dir -> $path_in\n") ;
  $dumpdate = $dumpdir ;

  # $project is a package global set elsewhere in this script
  $dumpfile = "$path_in/$project-$dumpdate-pages-meta-current.xml.bz2" ;
  &Log ("\nFile in $dumpfile\n") ;
  return ($dumpfile) ;
}
|
| 122 | +
|
# Stream a bzip2-compressed XML dump ($file_in) through 'bzip2 -dc' and
# write every <title>...</title> value, one bare title per line, to
# $file_out. Aborts (via &Abort, which exits) when either file cannot be
# opened or the output cannot be flushed on close.
sub ProcessFile
{
  my $file_in  = shift ;
  my $file_out = shift ;

  print "File out $file_out\n" ;

  # NOTE: the previous "open ... || abort (...)" form was doubly broken:
  # '||' binds tighter than the list comma (it applied to the filename, not
  # to open's return value), and 'abort' (lowercase) is not defined — the
  # sub is &Abort. Use low-precedence 'or' with the correct sub name.
  open FILE_OUT, '>', $file_out or &Abort ("Output file '" . $file_out . "' could not be opened.") ;
  open FILE_IN, "-|", "bzip2 -dc \"$file_in\"" or &Abort ("Input file '" . $file_in . "' could not be opened.") ;

  while ($line = <FILE_IN>)
  {
    # /e replacement prints the captured title as a side effect
    $line =~ s/<title>([^<]*)<\/title>/print FILE_OUT "$1\n"/ge ;
  }

  close FILE_IN ;
  # buffered write errors (e.g. disk full) only surface at close
  close FILE_OUT or &Abort ("Output file '" . $file_out . "' could not be closed.") ;
}
|
| 137 | +
|
# Echo a progress/diagnostic message to stdout.
# $msg is deliberately a package global (the same convention &Abort uses);
# logging to a LOG file handle is currently disabled.
sub Log
{
  ($msg) = @_ ;
  print $msg ;
# print LOG $msg ;
}
|
| 144 | +
|
# Print an error message to stdout and terminate the script.
# $msg is a package global, matching the convention used by &Log;
# file logging is currently disabled.
sub Abort
{
  ($msg) = @_ ;
  print "Abort script\nError: $msg\n" ;
# print LOG "Abort script\nError: $msg\n" ;
  exit ;
}
|
| 152 | +
|
| 153 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitPrepCollectHarvestInterwikiLinks.pl |
— | — | @@ -0,0 +1,40 @@ |
#!/usr/bin/perl

# Harvest interwiki links from a locally saved MediaWiki page ('index.php'):
# for every line carrying class="interwiki-XX", record which languages link
# to each title. Then emit the per-language pairs plus a filter PATTERN:
# titles linked from more than 10 languages get one broad "^title" entry,
# others one "^lang.z title" entry per linking language.

# checked open: a missing input file previously produced a silently empty run
open IN, '<', 'index.php' or die "Input file 'index.php' could not be opened: $!" ;

while ($line = <IN>)
{
  if ($line =~ /class=\"interwiki/)
  {
    chomp ($line) ;
    # language code from the CSS class, e.g. interwiki-de -> de
    $lang = $line ;
    $lang =~ s/^.*?interwiki-(\w+).*$/$1/ ;
    # title = last path component of the href target
    $title = $line ;
    $title =~ s/^.*?href=\"([^\"]+)\".*$/$1/ ;
    $title =~ s/^.*\/([^\/]+)$/$1/ ;
#   print "[$lang] $title\n" ;
    # hash elements take the '$' sigil; '@languages {$title}' was a
    # one-element slice used as an lvalue (warns under 'use warnings')
    $languages {$title} .= "$lang," ;
    $langcnt {$title}++ ;
  }
}
close IN ;

print "\n\n\n" ;

# most-linked titles first
foreach $title (sort {$langcnt {$b} <=> $langcnt {$a}} keys %langcnt)
{
  $count = $langcnt {$title} ;
  if ($count > 10)
  { $pattern .= "^$title\n" ; }
  else
  {
    $langlist = $languages {$title} ;
    @langs = split (',', $langlist) ;
    foreach $lang (@langs)
    {
      print "$lang $title\n" ;
      $pattern .= "^$lang\.z $title\n" ;
    }
  }
}

print "\n\nPATTERN:\n$pattern\n" ;
|
| 41 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt |
— | — | @@ -0,0 +1,576 @@ |
| 2 | + 14: wikipedia:als Hilfe:Neue_Seite_anlegen |
| 3 | + 33: wikipedia:am Help:Contents |
| 4 | + 68: wikipedia:ang Help:Innung |
| 5 | + 10: wikipedia:arc Help:Contents |
| 6 | + 31: wikipedia:ay Help:Contents |
| 7 | + 12: wikipedia:bar Hilfe:Hilfe |
| 8 | + 10: wikiversity:beta Help:Contents |
| 9 | + 11: wikipedia:bo Help:Contents |
| 10 | + 10: wikipedia:chr Help:Contents |
| 11 | + 11: wikipedia:co Help:Contents |
| 12 | + 993: wikimedia:commons Help:Contents |
| 13 | + 86: wikimedia:commons Help:Inkscape |
| 14 | + 30: wikimedia:commons Help:SVG |
| 15 | + 23: wikimedia:commons Help:Creating_a_DjVu_file |
| 16 | + 21: wikimedia:commons Help:Sommaire |
| 17 | + 20: wikimedia:commons Help:Converting_video |
| 18 | + 18: wikimedia:commons Hilfe:%C3%9Cbersicht |
| 19 | + 17: wikimedia:commons Help:Scanning |
| 20 | + 16: wikimedia:commons Help:%E7%9B%AE%E6%AC%A1 |
| 21 | + 15: wikimedia:commons Help:%C3%9Cbersicht |
| 22 | + 14: wikimedia:commons Help:Mass_deletion_request |
| 23 | + 13: wikimedia:commons Help:Zoomable_images |
| 24 | + 12: wikimedia:commons Help:Logging_in |
| 25 | + 11: wikimedia:commons Help:Mpeg2dv.sh |
| 26 | + 36: wikibooks:de Hilfe:Sammlungen |
| 27 | + 18: wikibooks:de Hilfe:Suche |
| 28 | + 18: wikibooks:de Hilfe:So_schreibe_ich_gute_B%C3%BCcher |
| 29 | + 11: wikibooks:de Hilfe:Erste_Schritte_auf_der_Spielwiese |
| 30 | + 10: wikibooks:de Hilfe:Urheberrechte_beachten |
| 31 | + 10: wikibooks:de Hilfe:Wikibook_lokal_speichern |
| 32 | + 811: wiktionary:de Hilfe:H%C3%B6rbeispiele |
| 33 | + 197: wiktionary:de Hilfe:Wortart |
| 34 | + 164: wiktionary:de Hilfe:IPA |
| 35 | + 63: wiktionary:de Hilfe:Nominativ |
| 36 | + 61: wiktionary:de Hilfe:Sonderzeichen/Tabelle |
| 37 | + 35: wiktionary:de Hilfe:H%C3%A4ufig_gestellte_Fragen |
| 38 | + 35: wiktionary:de Hilfe:Genitiv |
| 39 | + 34: wiktionary:de Hilfe:Plural |
| 40 | + 33: wiktionary:de Hilfe:Singular |
| 41 | + 26: wiktionary:de Hilfe:Pr%C3%A4sens |
| 42 | + 26: wiktionary:de Hilfe:Akkusativ |
| 43 | + 25: wiktionary:de Hilfe:Flexionstabellen_(Altgriechisch) |
| 44 | + 25: wiktionary:de Hilfe:Dativ |
| 45 | + 24: wiktionary:de Hilfe:Pr%C3%A4teritum |
| 46 | + 23: wiktionary:de Hilfe:Suche |
| 47 | + 21: wiktionary:de Hilfe:Konjunktiv |
| 48 | + 19: wiktionary:de Hilfe:Sonderzeichen |
| 49 | + 18: wiktionary:de Hilfe:Flexionstabellen_(Franz%C3%B6sisch) |
| 50 | + 15: wiktionary:de Hilfe:H%C3%B6rbeispiele/Liste |
| 51 | + 15: wiktionary:de Hilfe:Flexionstabellen_(Lateinisch) |
| 52 | + 15: wiktionary:de Hilfe:Lautschrift |
| 53 | + 14: wiktionary:de Hilfe:Positiv |
| 54 | + 14: wiktionary:de Hilfe:Kasus |
| 55 | + 13: wiktionary:de Hilfe:Imperativ |
| 56 | + 13: wiktionary:de Hilfe:Flexionstabellen |
| 57 | + 13: wiktionary:de Hilfe:Flexionstabellen_(Spanisch) |
| 58 | + 12: wiktionary:de Hilfe:Partizip |
| 59 | + 12: wiktionary:de Hilfe:Hinweise_f%C3%BCr_Leser |
| 60 | + 10: wiktionary:de Hilfe:Komparativ |
| 61 | + 19: wikinews:de Hilfe:Erste_Schritte |
| 62 | + 18: wikinews:de Hilfe:%C3%9Cbersicht |
| 63 | + 14: wikinews:de Hilfe:Zweite_Schritte |
| 64 | + 10: wikinews:de Hilfe:Quellenverzeichnis |
| 65 | + 11: wikiquote:de Hilfe:Erste_Schritte |
| 66 | + 20: wikisource:de Hilfe:Bearbeitungsstand |
| 67 | + 18: wikisource:de Hilfe:B%C3%BCcher |
| 68 | + 16: wikisource:de Hilfe:Korrekturlesen |
| 69 | + 10: wikisource:de Hilfe:Scannen_von_B%C3%BCchern |
| 70 | + 1722: wikipedia:de Hilfe:Buchfunktion |
| 71 | + 1154: wikipedia:de Hilfe:Sonderzeichen |
| 72 | + 923: wikipedia:de Hilfe:Gesichtete_und_gepr%C3%BCfte_Versionen |
| 73 | + 834: wikipedia:de Hilfe:Tutorial |
| 74 | + 747: wikipedia:de Hilfe:TeX |
| 75 | + 728: wikipedia:de Hilfe:Suche |
| 76 | + 484: wikipedia:de Hilfe:Wikimedia_Commons |
| 77 | + 417: wikipedia:de Hilfe:Neu_bei_Wikipedia |
| 78 | + 338: wikipedia:de Hilfe:Bearbeitungshilfe |
| 79 | + 272: wikipedia:de Hilfe:Tutorial/3 |
| 80 | + 242: wikipedia:de Hilfe:Spezialseiten |
| 81 | + 241: wikipedia:de Hilfe:Tutorial/1 |
| 82 | + 233: wikipedia:de Hilfe:Bilder |
| 83 | + 219: wikipedia:de Hilfe:Textgestaltung |
| 84 | + 202: wikipedia:de Hilfe:Vorlagen |
| 85 | + 191: wikipedia:de Hilfe:Seite_bearbeiten |
| 86 | + 177: wikipedia:de Hilfe:Zusammenfassung_und_Quelle |
| 87 | + 175: wikipedia:de Hilfe:Einzelnachweise |
| 88 | + 173: wikipedia:de Hilfe:Tutorial/2 |
| 89 | + 169: wikipedia:de Hilfe:Tabellen |
| 90 | + 167: wikipedia:de Hilfe:Audio |
| 91 | + 166: wikipedia:de Hilfe:Neue_Seite_anlegen |
| 92 | + 154: wikipedia:de Hilfe:Einstellungen |
| 93 | + 141: wikipedia:de Hilfe:Formatvorlagen |
| 94 | + 140: wikipedia:de Hilfe:Signatur |
| 95 | + 139: wikipedia:de Hilfe:Tutorial/4 |
| 96 | + 126: wikipedia:de Hilfe:FAQ |
| 97 | + 125: wikipedia:de Hilfe:Bausteine |
| 98 | + 125: wikipedia:de Hilfe:Archivieren |
| 99 | + 121: wikipedia:de Hilfe:Namensr%C3%A4ume |
| 100 | + 117: wikipedia:de Hilfe:Links |
| 101 | + 116: wikipedia:de Hilfe:Personendaten |
| 102 | + 115: wikipedia:de Hilfe:Zusammenfassung_und_Quellen |
| 103 | + 114: wikipedia:de Hilfe:Weiterleitung |
| 104 | + 110: wikipedia:de Hilfe:Bearbeiten |
| 105 | + 105: wikipedia:de Hilfe:Buchfunktion/Fragen_und_Antworten |
| 106 | + 95: wikipedia:de Hilfe:Benutzerkonto_anlegen |
| 107 | + 94: wikipedia:de Hilfe:Bild_und_Ton |
| 108 | + 87: wikipedia:de Hilfe:Farben |
| 109 | + 85: wikipedia:de Hilfe:Allgemeine_Textbausteine |
| 110 | + 79: wikipedia:de Hilfe:Versionen |
| 111 | + 79: wikipedia:de Hilfe:Bildertutorial |
| 112 | + 78: wikipedia:de Hilfe:Navigation |
| 113 | + 78: wikipedia:de Hilfe:Inhaltsverzeichnis |
| 114 | + 76: wikipedia:de Hilfe:Benutzerkonto |
| 115 | + 75: wikipedia:de Hilfe:Formatieren |
| 116 | + 74: wikipedia:de Hilfe:Listen_und_Tabellen |
| 117 | + 74: wikipedia:de Hilfe:Buchfunktion/Feedback_zur_Buchfunktion |
| 118 | + 74: wikipedia:de Hilfe:Tutorial/6 |
| 119 | + 73: wikipedia:de Hilfe:Tutorial/5 |
| 120 | + 72: wikipedia:de Hilfe:Benutzernamensraum |
| 121 | + 69: wikipedia:de Hilfe:Glossar |
| 122 | + 345: wikibooks:en Help:Contents |
| 123 | + 330: wikibooks:en Help:Page_validation |
| 124 | + 125: wikibooks:en Help:Collections |
| 125 | + 62: wikibooks:en Help:Starting_a_new_page_or_book |
| 126 | + 42: wikibooks:en Help:Editing |
| 127 | + 40: wikibooks:en Help:About |
| 128 | + 36: wikibooks:en Help:Development_stages |
| 129 | + 29: wikibooks:en Help:Searching |
| 130 | + 25: wikibooks:en Help:Print_versions |
| 131 | + 22: wikibooks:en Help:Contributing_FAQ |
| 132 | + 18: wikibooks:en Help:Contents/editing_wikibooks_-_the_basics |
| 133 | + 12: wikibooks:en Help:Why_contribute%3F |
| 134 | + 12: wikibooks:en Help:How_to_start_a_book |
| 135 | + 10: wikibooks:en Help:Images_and_other_uploaded_files |
| 136 | + 10: wikibooks:en Help:FAQ |
| 137 | + 869: wiktionary:en Help:Contents |
| 138 | + 58: wiktionary:en Help:Searching |
| 139 | + 45: wiktionary:en Help:How_to_check_translations |
| 140 | + 44: wiktionary:en Help:Starting_a_new_page |
| 141 | + 23: wiktionary:en Help:Example_sentences |
| 142 | + 21: wiktionary:en Help:How_to_edit_a_page |
| 143 | + 13: wiktionary:en Help:FAQ |
| 144 | + 13: wiktionary:en Help:Edit_summary |
| 145 | + 10: wiktionary:en Help:Audio_pronunciations |
| 146 | + 10: wiktionary:en Help:Editing |
| 147 | + 84: wikinews:en Help:Page_validation |
| 148 | + 36: wikinews:en Help:Editing_http://schoolpapers.hostinginfive.com/bike.htm |
| 149 | + 36: wikinews:en Help:Editing%20http://schoolpapers.hostinginfive.com/bike.htm |
| 150 | + 22: wikinews:en Help:Contents |
| 151 | + 14: wikinews:en Help:Editing |
| 152 | + 10: wikinews:en Help:How_to_decorate_your_article |
| 153 | + 271: wikiquote:en Help:Contents |
| 154 | + 235: wikisource:en Help:Contents |
| 155 | + 81: wikisource:en Help:Books |
| 156 | + 56: wikisource:en Help:Public_domain |
| 157 | + 38: wikisource:en Help:Adding_texts |
| 158 | + 32: wikisource:en Help:Searching |
| 159 | + 22: wikisource:en Help:DjVu_files |
| 160 | + 15: wikisource:en Help:Introduction |
| 161 | + 12: wikisource:en Help:DJVU_files |
| 162 | + 11: wikisource:en Help:Editing_Wikisource |
| 163 | + 11: wikisource:en Help:Editing_poetry |
| 164 | + 11: wikisource:en Help:Side_by_side_image_view_for_proofreading |
| 165 | + 294: wikiversity:en Help:Guides |
| 166 | + 193: wikiversity:en Help:Contents |
| 167 | + 89: wikiversity:en Help:The_original_tour_for_newcomers |
| 168 | + 81: wikiversity:en Help:The_original_tour_for_newcomers/1 |
| 169 | + 56: wikiversity:en Help:The_original_tour_for_newcomers/2 |
| 170 | + 41: wikiversity:en Help:The_original_tour_for_newcomers/3 |
| 171 | + 37: wikiversity:en Help:The_original_tour_for_newcomers/4 |
| 172 | + 28: wikiversity:en Help:Resources_by_subject |
| 173 | + 20: wikiversity:en Help:Resources_by_educational_level |
| 174 | + 19: wikiversity:en Help:Resources_by_type |
| 175 | + 15: wikiversity:en Help:Editing |
| 176 | + 15: wikiversity:en Help:Creating_educational_content_at_Wikiversity |
| 177 | + 13: wikiversity:en Help:Accessing_Wikiversity_by_educational_level |
| 178 | + 12: wikiversity:en Help:Resources_by_completion_status |
| 179 | + 12: wikiversity:en Help:Quiz |
| 180 | + 10: wikiversity:en Help:Project_boxes |
| 181 | + 6368: wikipedia:en Help:Contents |
| 182 | + 2203: wikipedia:en Help:Category |
| 183 | + 1422: wikipedia:en Help:Japanese |
| 184 | + 849: wikipedia:en Help:Books |
| 185 | + 782: wikipedia:en Help:Special_page |
| 186 | + 623: wikipedia:en Help:Page_history |
| 187 | + 597: wikipedia:en Help:IPA |
| 188 | + 581: wikipedia:en Help:Edit_summary |
| 189 | + 518: wikipedia:en Help:Minor_edit |
| 190 | + 512: wikipedia:en Help:IPA_for_English |
| 191 | + 496: wikipedia:en Help:Link |
| 192 | + 304: wikipedia:en Help:Editing |
| 193 | + 291: wikipedia:en Help:Multilingual_support_(East_Asian) |
| 194 | + 239: wikipedia:en Help:Watching_pages |
| 195 | + 238: wikipedia:en Help:Contents/Editing_Wikipedia |
| 196 | + 224: wikipedia:en Help:Template |
| 197 | + 198: wikipedia:en Help:Special_characters |
| 198 | + 198: wikipedia:en Help:Table |
| 199 | + 193: wikipedia:en Help:Contents/Getting_started |
| 200 | + 193: wikipedia:en Help:Section |
| 201 | + 186: wikipedia:en Help:Pronunciation_respelling_key |
| 202 | + 180: wikipedia:en Help:Diff |
| 203 | + 176: wikipedia:en Help:Starting_a_new_page |
| 204 | + 175: wikipedia:en Help:Reverting |
| 205 | + 171: wikipedia:en Help:Archiving_a_talk_page |
| 206 | + 158: wikipedia:en Help:User_contributions |
| 207 | + 151: wikipedia:en Help:Books/Feedback |
| 208 | + 138: wikipedia:en Help:Displaying_a_formula |
| 209 | + 135: wikipedia:en Help:Merging_and_moving_pages |
| 210 | + 134: wikipedia:en Help:Formula |
| 211 | + 111: wikipedia:en Help:Multilingual_support_(Indic) |
| 212 | + 109: wikipedia:en Help:Talk_page |
| 213 | + 105: wikipedia:en Help:Books/Frequently_Asked_Questions |
| 214 | + 105: wikipedia:en Help:Searching |
| 215 | + 99: wikipedia:en Help:CentralAuth |
| 216 | + 97: wikipedia:en Help:Contents/Browsing_Wikipedia |
| 217 | + 96: wikipedia:en Help:Books/for_experts |
| 218 | + 95: wikipedia:en Help:Images_and_other_uploaded_files |
| 219 | + 95: wikipedia:en Help:IPA_chart_for_Russian |
| 220 | + 93: wikipedia:en Help:Logging_in |
| 221 | + 90: wikipedia:en Help:Contents/Links |
| 222 | + 88: wikipedia:en Help:Contents/Images_and_media |
| 223 | + 74: wikipedia:en Help:Redirect |
| 224 | + 73: wikipedia:en Help:Preferences |
| 225 | + 71: wikipedia:en Help:Contents/Policies_and_guidelines |
| 226 | + 67: wikipedia:en Help:Footnotes |
| 227 | + 66: wikipedia:en Help:Contents/Technical_information |
| 228 | + 64: wikipedia:en Help:Edit_conflict |
| 229 | + 62: wikipedia:en Help:HTML_in_wikitext |
| 230 | + 62: wikipedia:en Help:Recent_changes |
| 231 | + 59: wikipedia:en Help:Namespace |
| 232 | + 55: wikipedia:en Help:Cite_errors |
| 233 | + 51: wikibooks:fr Aide:Compilations |
| 234 | + 21: wikibooks:fr Aide:Compilations/Probl%C3%A8mes |
| 235 | + 21: wikibooks:fr Aide:Compilations/FAQ |
| 236 | + 13: wikibooks:fr Aide:Raccourcis |
| 237 | + 12: wikibooks:fr Aide:Accueil |
| 238 | + 10: wikibooks:fr Aide:Compilations/Aide_avanc%C3%A9e |
| 239 | + 113: wiktionary:fr Aide:%C3%89tymologies |
| 240 | + 96: wiktionary:fr Aide:Synonymes_et_antonymes |
| 241 | + 61: wiktionary:fr Aide:Sommaire |
| 242 | + 24: wiktionary:fr Aide:Prononciations |
| 243 | + 21: wiktionary:fr Aide:D%C3%A9finitions |
| 244 | + 18: wiktionary:fr Aide:%C3%89tymologie_grecque |
| 245 | + 17: wiktionary:fr Aide:Anagrammes |
| 246 | + 17: wiktionary:fr Aide:Aide |
| 247 | + 12: wiktionary:fr Aide:Exemples |
| 248 | + 10: wiktionary:fr Aide:Homophones_et_paronymes |
| 249 | + 14: wikinews:fr Aide:Sommaire |
| 250 | + 13: wikiquote:fr Aide:Sommaire |
| 251 | + 127: wikisource:fr Aide:Aide_au_lecteur |
| 252 | + 24: wikisource:fr Aide:Livres |
| 253 | + 17: wikisource:fr Aide:Cr%C3%A9er_un_fichier_DjVu |
| 254 | + 17: wikisource:fr Aide:Accueil |
| 255 | + 12: wikisource:fr Aide:Guide_du_nouveau_contributeur |
| 256 | + 11: wikisource:fr Aide:Comment_num%C3%A9riser |
| 257 | + 10: wikisource:fr Aide:Aide |
| 258 | + 103: wikiversity:fr Aide:Niveau_de_difficult%C3%A9 |
| 259 | + 26: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_13 |
| 260 | + 24: wikiversity:fr Aide:Sommaire |
| 261 | + 21: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_14 |
| 262 | + 18: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_11 |
| 263 | + 16: wikiversity:fr Aide:Comment_cr%C3%A9er_un_projet |
| 264 | + 15: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_10 |
| 265 | + 11: wikiversity:fr Aide:Frise_chronologique |
| 266 | + 4860: wikipedia:fr Aide:Homonymie |
| 267 | + 1742: wikipedia:fr Aide:Recherche |
| 268 | + 1703: wikipedia:fr Aide:Sommaire |
| 269 | + 1253: wikipedia:fr Aide:Importer_un_fichier |
| 270 | + 945: wikipedia:fr Aide:%C3%89bauche |
| 271 | + 901: wikipedia:fr Aide:Livres |
| 272 | + 731: wikipedia:fr Aide:Comment_modifier_une_page |
| 273 | + 359: wikipedia:fr Aide:Poser_une_question |
| 274 | + 308: wikipedia:fr Aide:Tout_l%27indispensable... |
| 275 | + 288: wikipedia:fr Aide:Consultation |
| 276 | + 268: wikipedia:fr Aide:Premiers_pas |
| 277 | + 217: wikipedia:fr Aide:Comment_cr%C3%A9er_une_page |
| 278 | + 215: wikipedia:fr Aide:Redirection |
| 279 | + 170: wikipedia:fr Aide:Importer_un_logo |
| 280 | + 158: wikipedia:fr Aide:Syntaxe |
| 281 | + 139: wikipedia:fr Aide:Importer_un_fichier_sur_Commons |
| 282 | + 134: wikipedia:fr Aide:Unicode |
| 283 | + 126: wikipedia:fr Aide:Note |
| 284 | + 109: wikipedia:fr Aide:Importer_sur_Commons_un_fichier_dont_je_suis_l%27auteur |
| 285 | + 106: wikipedia:fr Aide:Toujours_commenter_vos_modifications_dans_la_bo%C3%AEte_de_r%C3%A9sum%C3%A9 |
| 286 | + 106: wikipedia:fr Aide:Ins%C3%A9rer_une_image |
| 287 | + 101: wikipedia:fr Aide:Comment_r%C3%A9diger_une_page |
| 288 | + 101: wikipedia:fr Aide:%C3%89couter_des_sons_ogg |
| 289 | + 94: wikipedia:fr Aide:Premiers_pas/2 |
| 290 | + 85: wikipedia:fr Aide:Mod%C3%A8le |
| 291 | + 82: wikipedia:fr Aide:Japonais |
| 292 | + 75: wikipedia:fr Aide:Comment_cr%C3%A9er_un_article |
| 293 | + 74: wikipedia:fr Aide:Formules_TeX |
| 294 | + 71: wikipedia:fr Aide:Caract%C3%A8res_sp%C3%A9ciaux |
| 295 | + 68: wikipedia:fr Aide:Caract%C3%A8res_sp%C3%A9ciaux_probl%C3%A9matiques |
| 296 | + 65: wikipedia:fr Aide:Premiers_pas/3 |
| 297 | + 63: wikipedia:fr Aide:Regarder_des_vid%C3%A9os_ogg |
| 298 | + 63: wikipedia:fr Aide:Compte_utilisateur |
| 299 | + 60: wikipedia:fr Aide:Sourcer |
| 300 | + 60: wikipedia:fr Aide:Sommaire/D%C3%A9buter |
| 301 | + 58: wikipedia:fr Aide:Sommaire/Modifier_Wikip%C3%A9dia |
| 302 | + 55: wikipedia:fr Aide:Historique |
| 303 | + 51: wikipedia:fr Aide:Espace_de_noms |
| 304 | + 49: wikipedia:fr Aide:Republication |
| 305 | + 49: wikipedia:fr Aide:Sommaire/Traduction |
| 306 | + 44: wikipedia:fr Aide:Cat%C3%A9gorie |
| 307 | + 42: wikipedia:fr Aide:Couleurs |
| 308 | + 41: wikipedia:fr Aide:Accents |
| 309 | + 39: wikipedia:fr Aide:Signature |
| 310 | + 38: wikipedia:fr Aide:Liens_externes |
| 311 | + 38: wikipedia:fr Aide:Les_diff%C3%A9rents_r%C3%B4les |
| 312 | + 37: wikipedia:fr Aide:Acc%C3%A8s_%C3%A0_Wikip%C3%A9dia_avec_un_t%C3%A9l%C3%A9phone_portable_et_un_PDA |
| 313 | + 35: wikipedia:fr Aide:Sommaire/Parcourir_Wikip%C3%A9dia |
| 314 | + 33: wikipedia:fr Aide:Frise_chronologique |
| 315 | + 33: wikipedia:fr Aide:Raccourci |
| 316 | + 32: wikipedia:fr Aide:Page_utilisateur |
| 317 | + 31: wikipedia:fr Aide:Page_Utilisateur |
| 318 | + 14: wikipedia:gd Help:Cuideachadh |
| 319 | + 25: wikipedia:gn Help:Contents |
| 320 | + 10: wikipedia:ig Help:Contents |
| 321 | + 48: wikipedia:ilo Help:Contents |
| 322 | + 12: wikipedia:ilo Help:Dagiti_Linaon |
| 323 | + 24: wikimedia:incubator Help:Manual |
| 324 | + 15: wikimedia:incubator Help:Contents |
| 325 | + 25: wikipedia:io Help:Helpo |
| 326 | + 18: wikibooks:ja Help:%E9%80%B2%E6%8D%97%E7%8A%B6%E6%B3%81 |
| 327 | + 14: wiktionary:ja Help:%E7%9B%AE%E6%AC%A1 |
| 328 | + 1121: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1 |
| 329 | + 537: wikipedia:ja Help:%E6%A4%9C%E7%B4%A2 |
| 330 | + 188: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E7%B7%A8%E9%9B%86 |
| 331 | + 179: wikipedia:ja Help:%E9%9F%B3%E5%A3%B0%E3%83%BB%E5%8B%95%E7%94%BB%E3%81%AE%E5%86%8D%E7%94%9F |
| 332 | + 147: wikipedia:ja Help:%25E7%259B%25AE%25E6%25AC%25A1 |
| 333 | + 132: wikipedia:ja Help:%E7%94%BB%E5%83%8F%E3%81%AA%E3%81%A9%E3%81%AE%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%81%AE%E3%82%A2%E3%83%83%E3%83%97%E3%83%AD%E3%83%BC%E3%83%89%E3%81%A8%E5%88%A9%E7%94%A8 |
| 334 | + 120: wikipedia:ja Help:%E8%84%9A%E6%B3%A8/%E8%AA%AD%E8%80%85%E5%90%91%E3%81%91 |
| 335 | + 98: wikipedia:ja Help:%E7%89%B9%E6%AE%8A%E6%96%87%E5%AD%97 |
| 336 | + 84: wikipedia:ja Help:%E6%96%B0%E8%A6%8F%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E4%BD%9C%E6%88%90 |
| 337 | + 69: wikipedia:ja Help:%E8%A8%98%E4%BA%8B%E3%81%A8%E3%81%AF%E4%BD%95%E3%81%8B |
| 338 | + 68: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E6%96%B0%E3%81%97%E3%81%84%E8%A8%98%E4%BA%8B%E3%82%92%E6%9B%B8%E3%81%8F |
| 339 | + 65: wikipedia:ja Help:%E7%94%BB%E5%83%8F%E3%81%AE%E8%A1%A8%E7%A4%BA |
| 340 | + 64: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%9B%B8%E8%AB%87%E3%81%A8%E8%B3%AA%E5%95%8F |
| 341 | + 57: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%B7%A8%E9%9B%86%E5%85%A5%E9%96%80 |
| 342 | + 56: wikipedia:ja Help:%E3%83%AD%E3%82%B0%E3%82%A4%E3%83%B3 |
| 343 | + 48: wikipedia:ja Help:%E3%83%86%E3%83%B3%E3%83%97%E3%83%AC%E3%83%BC%E3%83%88 |
| 344 | + 45: wikipedia:ja Help:%E3%83%8E%E3%83%BC%E3%83%88%E3%83%9A%E3%83%BC%E3%82%B8 |
| 345 | + 38: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E3%83%9E%E3%83%BC%E3%82%AF%E3%82%A2%E3%83%83%E3%83%97 |
| 346 | + 35: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%94%BB%E5%83%8F%E3%81%AA%E3%81%A9%E3%81%AE%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB |
| 347 | + 34: wikipedia:ja Help:%E6%97%A9%E8%A6%8B%E8%A1%A8 |
| 348 | + 32: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E8%A8%98%E4%BA%8B%E3%82%92%E8%82%B2%E3%81%A6%E3%82%8B |
| 349 | + 32: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E8%AA%AD%E8%80%85%E5%90%91%E3%81%91 |
| 350 | + 31: wikipedia:ja Help:%E8%84%9A%E6%B3%A8 |
| 351 | + 30: wikipedia:ja Help:%E7%B4%B0%E9%83%A8%E3%81%AE%E7%B7%A8%E9%9B%86 |
| 352 | + 30: wikipedia:ja Help:%E3%83%AA%E3%83%80%E3%82%A4%E3%83%AC%E3%82%AF%E3%83%88 |
| 353 | + 30: wikipedia:ja Help:JPEG%E7%94%BB%E5%83%8F%E3%82%92%E6%B8%9B%E8%89%B2%E3%81%97PNG%E7%94%BB%E5%83%8F%E3%81%A8%E3%81%97%E3%81%A6%E4%BF%9D%E5%AD%98%E3%81%99%E3%82%8B%E6%96%B9%E6%B3%95 |
| 354 | + 30: wikipedia:ja Help:%E9%81%8E%E5%8E%BB%E3%83%AD%E3%82%B0 |
| 355 | + 29: wikipedia:ja Help:%E5%B1%A5%E6%AD%B4 |
| 356 | + 28: wikipedia:ja Help:ISBN%E3%81%AE%E3%83%AA%E3%83%B3%E3%82%AF |
| 357 | + 27: wikipedia:ja Help:%E3%83%8A%E3%83%93%E3%82%B2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%83%BB%E3%83%9D%E3%83%83%E3%83%97%E3%82%A2%E3%83%83%E3%83%97 |
| 358 | + 27: wikipedia:ja Help:%E5%A4%9A%E8%A8%80%E8%AA%9E%E5%AF%BE%E5%BF%9C_(%E3%82%A4%E3%83%B3%E3%83%89%E7%B3%BB%E6%96%87%E5%AD%97) |
| 359 | + 26: wikipedia:ja Help:%E3%83%AA%E3%83%B3%E3%82%AF |
| 360 | + 25: wikipedia:ja Help:%E7%AE%87%E6%9D%A1%E6%9B%B8%E3%81%8D |
| 361 | + 24: wikipedia:ja Help:%E3%82%A6%E3%82%A3%E3%82%AD%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AB%E3%81%8A%E3%81%91%E3%82%8BHTML |
| 362 | + 24: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E7%A7%BB%E5%8B%95 |
| 363 | + 23: wikipedia:ja Help:%E8%A1%A8%E3%81%AE%E4%BD%9C%E3%82%8A%E6%96%B9 |
| 364 | + 21: wikipedia:ja Help:%E8%A6%81%E7%B4%84%E6%AC%84 |
| 365 | + 21: wikipedia:ja Help:%E3%83%9E%E3%82%B8%E3%83%83%E3%82%AF%E3%83%AF%E3%83%BC%E3%83%89 |
| 366 | + 21: wikipedia:ja Help:Pywikipediabot |
| 367 | + 20: wikipedia:ja Help:%25E6%25A4%259C%25E7%25B4%25A2 |
| 368 | + 20: wikipedia:ja Help:%E3%82%AB%E3%83%86%E3%82%B4%E3%83%AA |
| 369 | + 20: wikipedia:ja Help:%E3%82%B5%E3%83%B3%E3%83%89%E3%83%9C%E3%83%83%E3%82%AF%E3%82%B9 |
| 370 | + 20: wikipedia:ja Help:%E6%A3%92%E3%82%B0%E3%83%A9%E3%83%95%E3%81%AE%E6%9B%B8%E3%81%8D%E6%96%B9 |
| 371 | + 18: wikipedia:ja Help:%E3%82%BB%E3%82%AF%E3%82%B7%E3%83%A7%E3%83%B3 |
| 372 | + 15: wikipedia:ja Help:%E3%82%A6%E3%82%A3%E3%82%AD%E3%83%A1%E3%83%BC%E3%83%AB |
| 373 | + 15: wikipedia:ja Help:%E5%80%8B%E4%BA%BA%E8%A8%AD%E5%AE%9A |
| 374 | + 15: wikipedia:ja Help:%E5%90%8D%E5%89%8D%E7%A9%BA%E9%96%93 |
| 375 | + 14: wikipedia:ja Help:%E3%83%86%E3%83%B3%E3%83%97%E3%83%AC%E3%83%BC%E3%83%88%E3%81%AE%E8%AA%AC%E6%98%8E%E6%96%87 |
| 376 | + 14: wikipedia:ja Help:%E4%BB%A5%E5%89%8D%E3%81%AE%E7%89%88%E3%81%AB%E3%83%9A%E3%83%BC%E3%82%B8%E3%82%92%E6%88%BB%E3%81%99%E6%96%B9%E6%B3%95 |
| 377 | + 13: wikipedia:ja Help:%E3%82%BD%E3%83%95%E3%83%88%E3%83%AA%E3%83%80%E3%82%A4%E3%83%AC%E3%82%AF%E3%83%88 |
| 378 | + 13: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E5%90%8D |
| 379 | + 13: wikipedia:ja Help:%E6%9D%A1%E4%BB%B6%E6%96%87 |
| 380 | + 30: wikipedia:kg Help:Contents |
| 381 | + 2343: wikimedia:meta Help:External_editors |
| 382 | + 1266: wikimedia:meta Help:Contents |
| 383 | + 353: wikimedia:meta Help:Editing |
| 384 | + 246: wikimedia:meta Help:Help |
| 385 | + 237: wikimedia:meta Help:Starting_a_new_page |
| 386 | + 217: wikimedia:meta Help:Images_and_other_uploaded_files |
| 387 | + 210: wikimedia:meta Help:Unified_login |
| 388 | + 208: wikimedia:meta Help:Table |
| 389 | + 169: wikimedia:meta Hilfe:Handbuch |
| 390 | + 167: wikimedia:meta Help:Category |
| 391 | + 156: wikimedia:meta Help:Template |
| 392 | + 131: wikimedia:meta Help:User_style%20http://schoolpapers.hostinginfive.com/bike.htm |
| 393 | + 131: wikimedia:meta Help:User_style_http://schoolpapers.hostinginfive.com/bike.htm |
| 394 | + 118: wikimedia:meta Help:Link |
| 395 | + 111: wikimedia:meta Help:Editor |
| 396 | + 107: wikimedia:meta Help:Formula |
| 397 | + 105: wikimedia:meta Help:Wikitext_examples |
| 398 | + 97: wikimedia:meta Help:Reference_card |
| 399 | + 95: wikimedia:meta Help:Section |
| 400 | + 89: wikimedia:meta Help:Special_characters |
| 401 | + 85: wikimedia:meta Help:Wikitext |
| 402 | + 85: wikimedia:meta Help:HTML_in_wikitext |
| 403 | + 78: wikimedia:meta Help:System_admin |
| 404 | + 77: wikimedia:meta Help:Preferences |
| 405 | + 72: wikimedia:meta Aide:Contenu |
| 406 | + 69: wikimedia:meta Help:Displaying_a_formula |
| 407 | + 65: wikimedia:meta Help:Page_name |
| 408 | + 64: wikimedia:meta Help:Magic_words |
| 409 | + 61: wikimedia:meta Help:Advanced_editing |
| 410 | + 59: wikimedia:meta Help:Reader |
| 411 | + 57: wikimedia:meta Help:List |
| 412 | + 56: wikimedia:meta Help:Searching |
| 413 | + 56: wikimedia:meta Help:Moderator |
| 414 | + 54: wikimedia:meta Help:Interwiki_linking |
| 415 | + 52: wikimedia:meta Help:Transwiki |
| 416 | + 52: wikimedia:meta Help:Redirect |
| 417 | + 52: wikimedia:meta Help:Namespace |
| 418 | + 50: wikimedia:meta Hilfe:Externe_Editoren |
| 419 | + 49: wikimedia:meta Help:Public_domain_image_resources |
| 420 | + 48: wikimedia:meta Help:User_style |
| 421 | + 48: wikimedia:meta Help:Variable |
| 422 | + 48: wikimedia:meta Help:Introduction |
| 423 | + 46: wikimedia:meta Help:Moving_a_page |
| 424 | + 46: wikimedia:meta Help:ParserFunctions |
| 425 | + 45: wikimedia:meta Help:Logging_in |
| 426 | + 45: wikimedia:meta Help:Export |
| 427 | + 43: wikimedia:meta Help:Editing_FAQ |
| 428 | + 42: wikimedia:meta Help:Import |
| 429 | + 41: wikimedia:meta Help:Special_page |
| 430 | + 41: wikimedia:meta Hilfe:Textgestaltung |
| 431 | + 40: wikimedia:meta Help:Job_queue |
| 432 | + 40: wikimedia:meta Help:URL |
| 433 | + 14: wikipedia:meta Help:Contents |
| 434 | + 35: wikipedia:mi Help:Contents |
| 435 | + 24: wikipedia:mr Help:Contents |
| 436 | + 18: wikipedia:ne Help:Contents |
| 437 | + 12: wikibooks:nl Help:Boeken |
| 438 | + 312: wikipedia:nl Help:Boeken |
| 439 | + 182: wikipedia:nl Help:Zoeken |
| 440 | + 124: wikipedia:nl Help:Tips_voor_het_schrijven_van_een_goed_artikel |
| 441 | + 105: wikipedia:nl Help:Nieuwe_pagina_aanmaken |
| 442 | + 72: wikipedia:nl Help:Helpdesk |
| 443 | + 70: wikipedia:nl Help:Beveiligde_pagina%27s |
| 444 | + 64: wikipedia:nl Help:Ogg_Vorbis |
| 445 | + 61: wikipedia:nl Help:Uitleg |
| 446 | + 60: wikipedia:nl Help:Wikipedia |
| 447 | + 56: wikipedia:nl Help:Veelgestelde_vragen |
| 448 | + 53: wikipedia:nl Help:Veelvoorkomende_spelfouten |
| 449 | + 51: wikipedia:nl Help:Samenvatting |
| 450 | + 45: wikipedia:nl Help:Hoe_kan_ik_meedoen%3F |
| 451 | + 39: wikipedia:nl Help:Installeer_je_eigen_Wiki |
| 452 | + 36: wikipedia:nl Help:Gebruik_van_bestanden |
| 453 | + 35: wikipedia:nl Help:Terminologie_op_Wikipedia |
| 454 | + 34: wikipedia:nl Help:Gebruik_van_tabellen |
| 455 | + 33: wikipedia:nl Help:Referenties_en_voetnoten |
| 456 | + 33: wikipedia:nl Help:Afkortingen_op_Wikipedia_chat |
| 457 | + 32: wikipedia:nl Help:Gebruik_van_categorie%C3%ABn |
| 458 | + 31: wikipedia:nl Help:Tekstopmaak |
| 459 | + 30: wikipedia:nl Help:Gebruik_van_sjablonen |
| 460 | + 30: wikipedia:nl Help:Contact_met_Wikipedia |
| 461 | + 29: wikipedia:nl Help:Speciale_tekens |
| 462 | + 27: wikipedia:nl Help:Kleine_wijziging |
| 463 | + 26: wikipedia:nl Help:Alfabetische_index |
| 464 | + 25: wikipedia:nl Help:Spellinggids |
| 465 | + 25: wikipedia:nl Help:TeX_in_Wikipedia |
| 466 | + 24: wikipedia:nl Help:Standaardvorm_voor_biografie%C3%ABn |
| 467 | + 24: wikipedia:nl Help:Gebruik_van_bots |
| 468 | + 23: wikipedia:nl Help:Beginnetje |
| 469 | + 23: wikipedia:nl Help:Tips_voor_het_vertalen_van_een_artikel_vanaf_een_andere_Wikipedia |
| 470 | + 23: wikipedia:nl Help:Gebruik_van_links |
| 471 | + 22: wikipedia:nl Help:Samenvoegen_van_artikelen |
| 472 | + 20: wikipedia:nl Help:Hulpmiddelen |
| 473 | + 19: wikipedia:nl Help:Auteursrechten |
| 474 | + 18: wikipedia:nl Help:Gebruik_van_openbare_bronnen |
| 475 | + 17: wikipedia:nl Help:Bronnensjabloon |
| 476 | + 17: wikipedia:nl Help:Wikipediachat |
| 477 | + 16: wikipedia:nl Help:Inhoud |
| 478 | + 16: wikipedia:nl Help:Gebruik_van_geluid |
| 479 | + 15: wikipedia:nl Help:Externe_kaarten |
| 480 | + 15: wikipedia:nl Help:Waarom_zou_ik_meedoen%3F |
| 481 | + 15: wikipedia:nl Help:Naamruimte |
| 482 | + 14: wikipedia:nl Help:EasyTimeline |
| 483 | + 14: wikipedia:nl Help:English |
| 484 | + 13: wikipedia:nl Help:Media_uploaden_naar_commons |
| 485 | + 13: wikipedia:nl Help:Overlegpagina |
| 486 | + 13: wikipedia:nl Help:Unieke_van_Wikipedia |
| 487 | + 12: wikipedia:nl Help:Gebruik_van_de_taxobox |
| 488 | + 11: wikipedia:nl Help:Doorverwijzen |
| 489 | + 11: wikipedia:nl Help:Huis-_tuin-_en_keukeninspiratie |
| 490 | + 10: wikipedia:nrm Help:Contents |
| 491 | + 15: wikipedia:pam Help:Kalamnan |
| 492 | + 10: wikipedia:pdc Hilfe:Hilfe |
| 493 | + 15: wikipedia:sc Help:Aiuto |
| 494 | + 32: wikipedia:scn Help:Aiutu |
| 495 | + 15: wikipedia:sco Help:Contents |
| 496 | + 48: wikipedia:se Help:Contents |
| 497 | + 65: wiktionary:simple Help:Contents |
| 498 | + 266: wikipedia:simple Help:Contents |
| 499 | + 241: wikipedia:simple Help:Books |
| 500 | + 21: wikipedia:simple Help:How_to_use_images |
| 501 | + 18: wikipedia:simple Help:How_to_change_pages |
| 502 | + 13: wikipedia:simple Help:Editing |
| 503 | + 11: wikipedia:simple Help:How_to_edit |
| 504 | + 10: wikipedia:simple Help:Archiving_a_talk_page |
| 505 | + 10: wikipedia:simple Help:Pronunciation_respelling_key |
| 506 | + 50: wikimedia:species Help:Contents |
| 507 | + 19: wikimedia:species Help:Image_Guidelines |
| 508 | + 17: wikimedia:species Help:General_Wikispecies |
| 509 | + 15: wikimedia:species Help:Author_Names |
| 510 | + 22: wikipedia:sw Help:Contents |
| 511 | + 28: wikipedia:te Help:Contents |
| 512 | + 20: wikipedia:test Help:Books |
| 513 | + 14: wikipedia:test Help:Page_validation |
| 514 | + 11: wikipedia:to Help:Contents |
| 515 | + 21: wikipedia:uz Help:Contents |
| 516 | + 748: www.w Help:Contents |
| 517 | + 417: www.w Help:Configuration_settings |
| 518 | + 373: www.w Help:Editing_pages |
| 519 | + 355: www.w Help:Formatting |
| 520 | + 276: www.w Help:Magic_words |
| 521 | + 261: www.w Help:Navigation |
| 522 | + 253: www.w Help:Extension:ParserFunctions |
| 523 | + 208: www.w Help:Images |
| 524 | + 185: www.w Help:FAQ |
| 525 | + 172: www.w Help:Links |
| 526 | + 164: www.w Help:Starting_a_new_page |
| 527 | + 153: www.w Help:Templates |
| 528 | + 147: www.w Help:Tables |
| 529 | + 66: www.w Help:Categories |
| 530 | + 47: www.w Help:Redirects |
| 531 | + 46: www.w Help:Assigning_permissions |
| 532 | + 46: www.w Help:Editing |
| 533 | + 45: www.w Help:Namespaces |
| 534 | + 44: www.w Help:Skins |
| 535 | + 40: www.w Help:Managing_files |
| 536 | + 38: www.w Help:Contents/de |
| 537 | + 36: www.w Help:Special_pages |
| 538 | + 35: www.w Help:Subpages |
| 539 | + 33: www.w Help:Preferences |
| 540 | + 31: www.w Help:Variables |
| 541 | + 30: www.w Help:Moving_a_page |
| 542 | + 29: www.w Help:Editing_pages/de |
| 543 | + 28: www.w Help:User_page |
| 544 | + 27: www.w Help:Contents/ru |
| 545 | + 27: www.w Help:Sysops_and_permissions |
| 546 | + 25: www.w Help:Talk_pages |
| 547 | + 25: www.w Help:Editing_pages/ja |
| 548 | + 24: www.w Help:Searching |
| 549 | + 24: www.w Help:Navigation/de |
| 550 | + 22: www.w Help:User_rights |
| 551 | + 21: www.w Help:Signatures |
| 552 | + 21: www.w Help:Deleting_a_page |
| 553 | + 21: www.w Help:Tracking_changes |
| 554 | + 20: www.w Help:Linked_images |
| 555 | + 20: www.w Help:ParserFunctions |
| 556 | + 19: www.w Help:Navigation/ru |
| 557 | + 18: www.w Help:Interwiki_linking |
| 558 | + 18: www.w Help:User_rights/favicon.ico |
| 559 | + 18: www.w Help:User_rights/favicon.gif |
| 560 | + 17: www.w Help:Formatting/de |
| 561 | + 17: www.w Help:Editing_pages/pt |
| 562 | + 17: www.w Help:Patrolled_edits |
| 563 | + 16: www.w Help:Contents/es |
| 564 | + 15: www.w Help:Links/ru |
| 565 | + 14: www.w Help:Sysop_deleting_and_undeleting |
| 566 | + 14: www.w Help:Starting_a_new_page/de |
| 567 | + 13: www.w Help:Protecting_and_unprotecting_pages |
| 568 | + 117: wikipedia:www Help:Contents |
| 569 | + 55: wikipedia:zh-classical Help:%E5%87%A1%E4%BE%8B |
| 570 | + 14: wikipedia:zh-classical Help:Page_validation |
| 571 | + 57: wikipedia:zh-min-nan Help:Bo%CC%8Dk-lio%CC%8Dk |
| 572 | + 13: wikipedia:zh-min-nan Help:%E5%A6%82%E4%BD%95%E8%BC%B8%E5%85%A5%E7%99%BD%E8%A9%B1%E5%AD%97 |
| 573 | + 10: wikipedia:zh-min-nan Help:%E5%A6%82%E4%BD%95%E8%AE%80 |
| 574 | + 51: wikipedia:zh-yue Help:%E7%9B%AE%E9%8C%84 |
| 575 | + 19: wikisource:zh Help:%E7%9B%AE%E5%BD%95 |
| 576 | + 13: wikisource:zh Help:%E4%B9%A6 |
| 577 | + 12: wikisource:zh Help:%E5%85%A5%E9%97%A8%E6%8C%87%E5%8D%97 |
Property changes on: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 578 | + native |
Index: trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForBanners.pl |
— | — | @@ -0,0 +1,189 @@ |
| 2 | +#!/usr/bin/perl
|
| 3 | +
|
| 4 | +$| = 1; # flush screen output
|
| 5 | +
|
| 6 | +open IN, '<', 'PageViewsBannerPages.txt' ;
|
| 7 | +open OUT1, '>', 'PageViewsBannerPagesUse.txt' ;
|
| 8 | +open OUT2, '>', 'PageViewsBannerPagesDiscard.txt' ;
|
| 9 | +open LOG, '>', 'PageViewsBannerPagesLog.txt' ;
|
| 10 | +
|
| 11 | +while ($line = <IN>)
|
| 12 | +{
|
| 13 | + ($date,$project,$title,$counts) = split (' ', $line) ;
|
| 14 | +
|
| 15 | + $date =~ s/^.*?(\d{8}).*$/$1/ ;
|
| 16 | + $project =~ s/^.*?:// ;
|
| 17 | + $project =~ s/\.z// ;
|
| 18 | +
|
| 19 | + $projects {$project} ++ ;
|
| 20 | +
|
| 21 | + ($total = $counts) =~ s/\D.*//g ;
|
| 22 | +
|
| 23 | +# next if $line !~ /20101001/ ;
|
| 24 | +# next if $line !~ /fy\.z/ ;
|
| 25 | +
|
| 26 | + if ($line !~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/i)
|
| 27 | + {
|
| 28 | + print OUT2 $line ;
|
| 29 | + $total_discard += $total ;
|
| 30 | + $titles_discard {"$project $title"} += $total ;
|
| 31 | + next ;
|
| 32 | + }
|
| 33 | +
|
| 34 | + print OUT1 $line ;
|
| 35 | + $titles_use {"$project $title"} += $total ;
|
| 36 | + $total_use += $total ;
|
| 37 | +
|
| 38 | + # print "$counts: " ;
|
| 39 | + $counts =~ s/^\d+// ; # remove (redundant) preceding total
|
| 40 | + while ($counts ne "")
|
| 41 | + {
|
| 42 | + $letter = substr ($counts,0,1) ;
|
| 43 | + $counts = substr ($counts,1) ;
|
| 44 | + ($count = $counts) =~ s/^(\d+).*$/$1/ ;
|
| 45 | + $counts =~ s/^\d+(.*)$/$1/ ;
|
| 46 | + $hour = ord ($letter) - ord ('A') ;
|
| 47 | + # print "[$hour] $count " ;
|
| 48 | +
|
| 49 | + $substract {"$project,$date,$hour"} += $count ;
|
| 50 | + # if (($project eq 'fy') && ($date eq '20101001'))
|
| 51 | + # { print "$project,$date,$hour\n" ; }
|
| 52 | + }
|
| 53 | + # print "\n" ;
|
| 54 | +
|
| 55 | +}
|
| 56 | +close IN ;
|
| 57 | +
|
| 58 | +&Log ("\n\nDiscard:\n") ;
|
| 59 | +foreach $title (sort {$titles_discard {$b} <=> $titles_discard {$a}} keys %titles_discard)
|
| 60 | +{
|
| 61 | + print $titles_discard {$title} . " : $title\n" ;
|
| 62 | + print LOG $titles_discard {$title} . " : $title\n" ;
|
| 63 | + last if $lines_discard++ > 10 ;
|
| 64 | +}
|
| 65 | +
|
| 66 | +&Log ("\n\nUse:\n") ;
|
| 67 | +foreach $title (sort {$titles_use {$b} <=> $titles_use {$a}} keys %titles_use)
|
| 68 | +{
|
| 69 | + print LOG $titles_use {$title} . " : $title\n" ;
|
| 70 | + next if $lines_use++ > 10 ;
|
| 71 | + print $titles_use {$title} . " : $title\n" ;
|
| 72 | + last if $lines_use++ > 1000 ;
|
| 73 | +}
|
| 74 | +
|
| 75 | +&Log ("\n\nProjects:\n") ;
|
| 76 | +foreach $project (sort keys %projects)
|
| 77 | +{
|
| 78 | + &Log ("$project ") ;
|
| 79 | + &Log ("\n") if $projects_printed++ %10 == 0 ;
|
| 80 | +}
|
| 81 | +close OUT1 ;
|
| 82 | +close OUT2 ;
|
| 83 | +close LOG ;
|
| 84 | +
|
| 85 | +&Patch ;
|
| 86 | +
|
| 87 | +&Log ("Use $total_use\n") ;
|
| 88 | +&Log ("Discard $total_discard\n") ;
|
| 89 | +&Log ("Substracted $counts_substracted\n") ;
|
| 90 | +
|
| 91 | +print "\n\nReady\n\n" ;
|
| 92 | +exit ;
|
| 93 | +
|
| 94 | +sub Patch
|
| 95 | +{
|
| 96 | + &Log ("\n\nPatch\n\n") ;
|
| 97 | + if (-d "/a/dammit.lt/projectcounts")
|
| 98 | + { $dir = "/a/dammit.lt/projectcounts" ; }
|
| 99 | + else
|
| 100 | + { $dir = "w:/# In Dammit.lt/projectcounts/t" ; }
|
| 101 | +
|
| 102 | + chdir ($dir) || die "Cannot chdir to $dir\n" ;
|
| 103 | +
|
| 104 | + local (*DIR);
|
| 105 | + opendir (DIR, ".");
|
| 106 | + @files = () ;
|
| 107 | +
|
| 108 | + while ($file_in = readdir (DIR))
|
| 109 | + {
|
| 110 | + next if $file_in !~ /^projectcounts-2010(?:09|10)/ ;
|
| 111 | + # next if $file_in !~ /^projectcounts-20101001/ ;
|
| 112 | +
|
| 113 | + push @files, $file_in ;
|
| 114 | + }
|
| 115 | +
|
| 116 | + closedir (DIR);
|
| 117 | +
|
| 118 | + @files = sort @files ;
|
| 119 | +
|
| 120 | + foreach $file (@files)
|
| 121 | + { &PatchFile ($file) ; }
|
| 122 | +
|
| 123 | + &Log ("\n\nUnpatched\n\n") ;
|
| 124 | + foreach $key (sort keys %substract)
|
| 125 | + {
|
| 126 | + if (! $substract_found {$key})
|
| 127 | + { &Log ("$key\n") ; }
|
| 128 | + }
|
| 129 | +}
|
| 130 | +
|
| 131 | +sub PatchFile
|
| 132 | +{
|
| 133 | + my $file = shift ;
|
| 134 | + my $line ;
|
| 135 | + print "\nFile $file\n" ;
|
| 136 | +
|
| 137 | + ($dummy,$date,$time) = split '-', $file ;
|
| 138 | + $hour = substr ($time,0,2) + 0 ;
|
| 139 | +
|
| 140 | + open PROJECTFILE, '<', "$dir/$file" || die "Could not open '$dir/$file'\n" ;
|
| 141 | +
|
| 142 | + undef @projectfile ;
|
| 143 | + $file_changed = 0 ;
|
| 144 | + while ($line = <PROJECTFILE>)
|
| 145 | + {
|
| 146 | + chomp $line ;
|
| 147 | + ($project,$dash,$count,$bytes) = split (' ', $line) ;
|
| 148 | +
|
| 149 | + # next if $project ne 'fy' ;
|
| 150 | + # print "$line\n" ;
|
| 151 | + next if $bytes eq '' ;
|
| 152 | + $count_substract = $substract {"$project,$date,$hour"} ;
|
| 153 | + $substract_found {"$project,$date,$hour"} ++ ;
|
| 154 | +
|
| 155 | + if ($count_substract == 0)
|
| 156 | + { push @projectfile, $line ; }
|
| 157 | + else
|
| 158 | + {
|
| 159 | + $file_changed = 1 ;
|
| 160 | + $count -= $count_substract ;
|
| 161 | + &Log ("\n$line ->\n") ;
|
| 162 | + $line = "$project $dash $count 1" ;
|
| 163 | + push @projectfile, $line ;
|
| 164 | + &Log ("$line\n") ;
|
| 165 | + }
|
| 166 | + # next if $count_substract eq '' ;
|
| 167 | + $counts_substracted += $count_substract ;
|
| 168 | + # print "$project $count minus $count_substract\n" ; # '$project,$date,$hour'\n" ;
|
| 169 | + }
|
| 170 | +
|
| 171 | + close PROJECTFILE ;
|
| 172 | +
|
| 173 | + if ($file_changed)
|
| 174 | + {
|
| 175 | + open PROJECTFILE, '>', "$dir/$file" || die "Could not open '$dir/$file'\n" ;
|
| 176 | + foreach $line (@projectfile)
|
| 177 | + { print PROJECTFILE "$line\n" ; }
|
| 178 | + close PROJECTFILE ;
|
| 179 | + }
|
| 180 | +}
|
| 181 | +
|
| 182 | +sub Log
|
| 183 | +{
|
| 184 | + my $msg = shift ;
|
| 185 | + print $msg ;
|
| 186 | + print LOG $msg ;
|
| 187 | +}
|
| 188 | +
|
| 189 | +
|
| 190 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitFilesFindMisses.pl |
— | — | @@ -0,0 +1,185 @@ |
| 2 | +#!/usr/local/bin/perl
|
| 3 | +
|
| 4 | +# to do
|
| 5 | +# titles can occur twice (because of ucfirst) , add those counts before pushing to table @data
|
| 6 | +# remove extra parameters e.g. "Gabriel_Andrade&limit=500"
|
| 7 | +
|
| 8 | + use CGI qw(:all);
|
| 9 | + use URI::Escape;
|
| 10 | + use Getopt::Std ;
|
| 11 | + use Cwd ;
|
| 12 | +
|
| 13 | + $bayes = -d "/a/dammit.lt/pagecounts" ;
|
| 14 | + $path_7za = "/usr/lib/p7zip/7za" ;
|
| 15 | + $path_grep = "/bin/grep" ;
|
| 16 | +
|
| 17 | + $| = 1; # flush screen output
|
| 18 | + $true = 1 ;
|
| 19 | + $false = 0 ;
|
| 20 | +
|
| 21 | + $jobstart = time ;
|
| 22 | +
|
| 23 | + $key = "de.z" ;
|
| 24 | +
|
| 25 | +# -i "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/out" -f 20090429 -t 20090429 -p ''
|
| 26 | + my $options ;
|
| 27 | + getopt ("iop", \%options) ;
|
| 28 | +
|
| 29 | + $file_articles_in = "W:/# In Dumps/dewiki-20090917-all-titles-in-ns0" ;
|
| 30 | + $file_articles_out = "W:/# In Dumps/dewiki-20090917-all-titles-in-ns0_b" ;
|
| 31 | + $file_pageviews_in = "W:/pagecounts-20090801_fdt" ;
|
| 32 | + $file_pageviews_out = "W:/pagecounts-20090801_fdt_b" ;
|
| 33 | + $file_extract = "W:/! Perl/Dammit Log Files/Scan Log Files/PageViewsExtractArticlesDeWp.txt" ;
|
| 34 | + $file_missing = "W:/! Perl/Dammit Log Files/Scan Log Files/PageViewsMissingArticlesDeWp.txt" ;
|
| 35 | +
|
| 36 | +# if (! defined ($options {"i"})) { &Abort ("Specify input dir as -i dirname") } ;
|
| 37 | +# if (! defined ($options {"o"})) { &Abort ("Specify output dir as -o dirname") } ;
|
| 38 | +# if (! defined ($options {"p"})) { &Abort ("Specify project as -p \".....\"") } ;
|
| 39 | +
|
| 40 | +# $dir_in = $options {"i"} ;
|
| 41 | +# $dir_out = $options {"o"} ;
|
| 42 | +# $project = $options {"p"} ;
|
| 43 | +
|
| 44 | +# $work = cwd() ;
|
| 45 | +# print "Work dir $work\n" ;
|
| 46 | +# if ($dir_in !~ /[\/\\]/)
|
| 47 | +# { $dir_in = "$work/$dir_in" ; }
|
| 48 | +# if ($dir_out !~ /[\/\\]/)
|
| 49 | +# { $dir_out = "$work/$dir_out" ; }
|
| 50 | +
|
| 51 | +# if (! -d $dir_in) { &Abort ("Input dir not found: $dir_in") } ;
|
| 52 | +# if (! -d $dir_out)
|
| 53 | +# {
|
| 54 | +# print "Create output dir $dir_out\n" ;
|
| 55 | +# mkdir $dir_out ;
|
| 56 | +# if (! -d $dir_out)
|
| 57 | +# { &Abort ("Output dir could not be created.") } ;
|
| 58 | +# }
|
| 59 | +
|
| 60 | + print "\nExtract missing articles\n" ; # Parm in: $dir_in\nParm out: $dir_out\n" ;
|
| 61 | +
|
| 62 | +# &SortEncodedArticleTitles ;
|
| 63 | + &ExtractMissingArticles ;
|
| 64 | +
|
| 65 | + &Log ("\nReady\n") ;
|
| 66 | + exit ;
|
| 67 | +
|
| 68 | +sub SortEncodedArticleTitles
|
| 69 | +{
|
| 70 | + open IN, '<', $file_articles_in || &Abort ("$file_articles_in could not be opened") ;
|
| 71 | + open OUT, '>', $file_articles_out || &Abort ("$file_articles_out could not be opened") ;
|
| 72 | +
|
| 73 | + while ($line = <IN>)
|
| 74 | + {
|
| 75 | + chomp ($line) ;
|
| 76 | + $line =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
| 77 | + $line =~ s/([\x00-\x31\x80-\xFF])/"%".sprintf("%X",ord ($1))/ge ;
|
| 78 | + $line = ucfirst ($line) ;
|
| 79 | + push @data, $line ;
|
| 80 | + }
|
| 81 | + close IN ;
|
| 82 | +
|
| 83 | + @data = sort @data ;
|
| 84 | +
|
| 85 | + foreach $line (@data)
|
| 86 | + { print OUT "$line\n" ; }
|
| 87 | +
|
| 88 | + close OUT ;
|
| 89 | +
|
| 90 | + #--------------------------------------------------------------------------------------
|
| 91 | +
|
| 92 | + open IN, '<', $file_pageviews_in || &Abort ("$file_pageviews_in could not be opened") ;
|
| 93 | + open OUT, '>', $file_pageviews_out || &Abort ("$file_pageviews_tmp could not be opened") ;
|
| 94 | +
|
| 95 | + @data = () ;
|
| 96 | + while ($line = <IN>)
|
| 97 | + {
|
| 98 | + if ($line !~ /^$key /) { next ; }
|
| 99 | +
|
| 100 | + chomp ($line) ;
|
| 101 | + ($key2,$title,$counts) = split (' ', $line) ;
|
| 102 | + $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
| 103 | + $title =~ s/([\x00-\x31\x80-\xFF])/"%".sprintf("%X",ord ($1))/ge ;
|
| 104 | + $title = ucfirst ($title) ;
|
| 105 | + push @data, "$title $counts" ;
|
| 106 | + }
|
| 107 | + close IN ;
|
| 108 | +
|
| 109 | + @data = sort @data ;
|
| 110 | +
|
| 111 | + foreach $line (@data)
|
| 112 | + { print OUT "$line\n" ; }
|
| 113 | +
|
| 114 | + close OUT ;
|
| 115 | +}
|
| 116 | +
|
| 117 | +sub ExtractMissingArticles
|
| 118 | +{
|
| 119 | + my $dir_in = shift ;
|
| 120 | + my $dir_out = shift ;
|
| 121 | +
|
| 122 | + open ARTICLES, '<', $file_articles_out || &Abort ("$file_articles_out could not be opened") ;
|
| 123 | + open PAGEVIEWS, '<', $file_pageviews_out || &Abort ("$file_pageviews_out could not be opened") ;
|
| 124 | + open EXTRACT, '>', $file_extract || &Abort ("$file_extract could not be written") ;
|
| 125 | + open MISSING, '>', $file_missing || &Abort ("$file_missing could not be written") ;
|
| 126 | +
|
| 127 | + $title_at = <ARTICLES> ; # at = article title
|
| 128 | + chomp $title_at ;
|
| 129 | +
|
| 130 | + @data = () ;
|
| 131 | + while ($line_pv = <PAGEVIEWS>) # pv = page view
|
| 132 | + {
|
| 133 | + chomp ($line_pv) ;
|
| 134 | + ($title_pv,$counts) = split (' ', $line_pv) ;
|
| 135 | +
|
| 136 | + while (($title_at ne "") && ($title_pv gt $title_at))
|
| 137 | + {
|
| 138 | + # print EXTRACT " PV '$title_pv' gt AT $title_at\n" ;
|
| 139 | + $title_at = <ARTICLES> ;
|
| 140 | + chomp ($title_at) ;
|
| 141 | + }
|
| 142 | +
|
| 143 | + chomp ($line_articles) ;
|
| 144 | + # if ($title_pv eq $title_at)
|
| 145 | + # { print EXTRACT " PV '$title_pv' EQ AT '$title_at'\n" ; }
|
| 146 | + # else
|
| 147 | + # { print EXTRACT " PV '$title_pv' NE AT '$title_at'\n" ; }
|
| 148 | + if ($title_pv ne $title_at)
|
| 149 | + {
|
| 150 | + $title_pv2 = $title_pv ;
|
| 151 | + $title_pv2 =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
| 152 | + print EXTRACT "$title_pv2 $counts\n" ;
|
| 153 | +
|
| 154 | + if ($title_pv2 !~ /:/) # temp treat all titles with : as namespaces
|
| 155 | + {
|
| 156 | + $counts =~ s/^(\d+).*$/$1/ ;
|
| 157 | + push @data, "$counts $title_pv2" ;
|
| 158 | + }
|
| 159 | + }
|
| 160 | + }
|
| 161 | + @data = sort {$b <=> $a} @data ;
|
| 162 | + foreach $line (@data)
|
| 163 | + { print MISSING "$line\n" ; }
|
| 164 | +}
|
| 165 | +
|
| 166 | +sub Log
|
| 167 | +{
|
| 168 | + $msg = shift ;
|
| 169 | + print $msg ;
|
| 170 | + print LOG $msg ;
|
| 171 | +}
|
| 172 | +
|
| 173 | +sub Abort
|
| 174 | +{
|
| 175 | + $msg = shift ;
|
| 176 | + print "Abort script\nError: $msg\n" ;
|
| 177 | + print LOG "Abort script\nError: $msg\n" ;
|
| 178 | + exit ;
|
| 179 | +}
|
| 180 | +
|
| 181 | +sub mmss
|
| 182 | +{
|
| 183 | + my $seconds = shift ;
|
| 184 | + return (int ($seconds / 60) . " min, " . ($seconds % 60) . " sec") ;
|
| 185 | +}
|
| 186 | +
|
Index: trunk/wikistats/dammit.lt/cellar/!DammitScanCompactedFiles.pl |
— | — | @@ -0,0 +1,364 @@ |
| 2 | +#!/usr/local/bin/perl
|
| 3 | +
|
| 4 | +# 27 April 2010 renamed from WikiStatsScanCompactedDammitFiles.pl
|
| 5 | +
|
| 6 | + use CGI qw(:all);
|
| 7 | + use URI::Escape;
|
| 8 | + use Getopt::Std ;
|
| 9 | + use Cwd ;
|
| 10 | +
|
| 11 | +# grep pagecounts-20090428_fdt -f pandemic.txt > scan.txt
|
| 12 | +# utf-8 encoder for non western article titles: http://www.motobit.com/util/url-encoder.asp
|
| 13 | +
|
| 14 | +# &UncompactVisitorStats ('.') ;
|
| 15 | +# exit ;
|
| 16 | +
|
| 17 | + $bayes = -d "/a/dammit.lt/pagecounts" ;
|
| 18 | + $path_7za = "/usr/lib/p7zip/7za" ;
|
| 19 | + $path_grep = "/bin/grep" ;
|
| 20 | +
|
| 21 | +# if (! $bayes)
|
| 22 | +# {
|
| 23 | +# print "Test on Windows\n" ;
|
| 24 | +# include IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
|
| 25 | +# include IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
|
| 26 | +# }
|
| 27 | +
|
| 28 | + $| = 1; # flush screen output
|
| 29 | + $true = 1 ;
|
| 30 | + $false = 0 ;
|
| 31 | + $threshold = 5 ;
|
| 32 | + $jobstart = time ;
|
| 33 | +
|
| 34 | +# -i "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/out" -f 20090429 -t 20090429 -p ''
|
| 35 | + my $options ;
|
| 36 | + getopt ("ioftp", \%options) ;
|
| 37 | +
|
| 38 | + if (! defined ($options {"i"})) { &Abort ("Specify input dir as -i dirname") } ;
|
| 39 | + if (! defined ($options {"o"})) { &Abort ("Specify output dir as -o dirname") } ;
|
| 40 | + if (! defined ($options {"f"})) { &Abort ("Specify from date as -f yyyymmdd") } ;
|
| 41 | + if (! defined ($options {"t"})) { &Abort ("Specify till date as -t yyyymmdd") } ;
|
| 42 | + if (! defined ($options {"p"})) { &Abort ("Specify pattern as -p \".....\"") } ;
|
| 43 | +
|
| 44 | + $dir_in = $options {"i"} ;
|
| 45 | + $dir_out = $options {"o"} ;
|
| 46 | + $datefrom = $options {"f"} ;
|
| 47 | + $datetill = $options {"t"} ;
|
| 48 | + $pattern = $options {"p"} ;
|
| 49 | +
|
| 50 | + print "Pattern '$pattern'\n" ;
|
| 51 | + if ($pattern eq "html")
|
| 52 | + { $pattern = &GetPattern ; }
|
| 53 | +
|
| 54 | + $work = cwd() ;
|
| 55 | + print "Work dir $work\n" ;
|
| 56 | + if ($dir_in !~ /[\/\\]/)
|
| 57 | + { $dir_in = "$work/$dir_in" ; }
|
| 58 | + if ($dir_out !~ /[\/\\]/)
|
| 59 | + { $dir_out = "$work/$dir_out" ; }
|
| 60 | +
|
| 61 | + if (! -d $dir_in) { &Abort ("Input dir not found: $dir_in") } ;
|
| 62 | + if (! -d $dir_out)
|
| 63 | + {
|
| 64 | + print "Create output dir $dir_out\n" ;
|
| 65 | + mkdir $dir_out ;
|
| 66 | + if (! -d $dir_out)
|
| 67 | + { &Abort ("Output dir could not be created.") } ;
|
| 68 | + }
|
| 69 | +
|
| 70 | + print "\nParm pattern: $pattern\n\n" ;
|
| 71 | +# $pattern = "^nl.z Amsterdam\n^de.z Leiden\n" ;
|
| 72 | + if ($pattern =~ /^\#/)
|
| 73 | + { $file_pattern = substr ($pattern,1) ; }
|
| 74 | + else
|
| 75 | + {
|
| 76 | + $pattern =~ s/\\n/\n/gs ;
|
| 77 | + $file_pattern = "$dir_out/pattern.txt" ;
|
| 78 | + print "Write pattern to $file_pattern\n" ;
|
| 79 | + open PATTERN, ">", $file_pattern ;
|
| 80 | + print PATTERN $pattern ;
|
| 81 | + close PATTERN ;
|
| 82 | + }
|
| 83 | +
|
| 84 | + if (($datefrom !~ /^20\d{6}$/))
|
| 85 | + { &Abort ("Specify from date: as -f yyyymmdd") ; }
|
| 86 | + if (($datetill !~ /^20\d{6}$/))
|
| 87 | + { &Abort ("Specify till date: as -t yyyymmdd") ; }
|
| 88 | +
|
| 89 | + $dirfrom = substr ($datefrom,0,4) . "-" . substr ($datefrom,4,2) ;
|
| 90 | + $dirtill = substr ($datetill,0,4) . "-" . substr ($datetill,4,2) ;
|
| 91 | +
|
| 92 | + print "\nScan pagecount files\nParm in: $dir_in\nParm out: $dir_out\nParm from: $datefrom (in folder $dirfrom)\nParm till: $datetill (in folder $dirtill)\nParm pattern: $pattern\n\n" ;
|
| 93 | +
|
| 94 | + open LOG, ">>", "$work/WikiStatsScanVisitorstats.log" ;
|
| 95 | +
|
| 96 | + &ScanVisitorStats ($dir_in, $dir_out, $dirfrom, $dirtill, $datefrom, $datetill) ;
|
| 97 | + &UncompactVisitorStats ($dir_out) ;
|
| 98 | +
|
| 99 | + &Log ("\nReady\n") ;
|
| 100 | + close LOG ;
|
| 101 | + exit ;
|
| 102 | +
|
| 103 | +sub ScanVisitorStats
|
| 104 | +{
|
| 105 | + my $dir_in = shift ;
|
| 106 | + my $dir_out = shift ;
|
| 107 | + my $dirfrom = shift ;
|
| 108 | + my $dirtill = shift ;
|
| 109 | + my $datefrom = shift ;
|
| 110 | + my $datetill = shift ;
|
| 111 | +
|
| 112 | + my @dirs ;
|
| 113 | + my @files ;
|
| 114 | +
|
| 115 | + chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
|
| 116 | + local (*DIR);
|
| 117 | + opendir (DIR, ".");
|
| 118 | + @files = () ;
|
| 119 | + while ($file_in = readdir (DIR))
|
| 120 | + {
|
| 121 | + if (! -d $file_in)
|
| 122 | + { next ; }
|
| 123 | + if ($file_in !~ /^20\d\d-\d\d$/)
|
| 124 | + { next ; }
|
| 125 | + if (($file_in lt $dirfrom) || ($file_in gt $dirtill))
|
| 126 | + { next ; }
|
| 127 | + &Log ("Store folder $file_in\n") ;
|
| 128 | + push @dirs, $file_in ;
|
| 129 | + }
|
| 130 | + &Log ("\n") ;
|
| 131 | + closedir (DIR, ".");
|
| 132 | +
|
| 133 | + @dirs = sort @dirs ;
|
| 134 | +
|
| 135 | + foreach $dir (@dirs)
|
| 136 | + {
|
| 137 | + chdir ("$dir_in/$dir") || &Abort ("Cannot chdir to $dir_in/$dir\n") ;
|
| 138 | + local (*DIR);
|
| 139 | + opendir (DIR, ".");
|
| 140 | + while ($file_in = readdir (DIR))
|
| 141 | + {
|
| 142 | + if (-d $file_in)
|
| 143 | + { next ; }
|
| 144 | + if ($file_in !~ /^pagecounts-\d{8,8}_fdt.7z$/)
|
| 145 | + { next ; }
|
| 146 | + if (($file_in lt "pagecounts-$datefrom") || ($file_in gt "pagecounts-$datetill\xFF"))
|
| 147 | + { next ; }
|
| 148 | + &Log ("Store file $file_in\n") ;
|
| 149 | + push @files, "$dir/$file_in" ;
|
| 150 | + }
|
| 151 | + closedir (DIR, ".");
|
| 152 | + }
|
| 153 | + &Log ("\n") ;
|
| 154 | +
|
| 155 | + if ($#files > -1)
|
| 156 | + {
|
| 157 | + @files = sort @files ;
|
| 158 | +
|
| 159 | + unlink "$dir_out/scan.txt" ;
|
| 160 | + foreach $file (@files)
|
| 161 | + {
|
| 162 | + my $filestart = time ;
|
| 163 | + my $date = $file ;
|
| 164 | + $date =~ s/^.*?-(\d{8,8})_.*$/$1/ ;
|
| 165 | + $size = -s "$dir_in/$file" ;
|
| 166 | + print "Scan file '$file' ($size bytes)\n" ;
|
| 167 | +
|
| 168 | + $cmd = "echo \"\# $date\" >> $dir_out/scan.txt" ;
|
| 169 | + print "Cmd: $cmd\n" ;
|
| 170 | + $result = `$cmd` ;
|
| 171 | +
|
| 172 | + $cmd = "7z -so e $dir_in/$file | grep -i -f $file_pattern >> $dir_out/scan.txt" ;
|
| 173 | + print "Cmd: $cmd\n" ;
|
| 174 | + $result = `$cmd` ;
|
| 175 | +
|
| 176 | + print "File done in " . &mmss(time - $filestart) . "\n\n" ;
|
| 177 | + }
|
| 178 | +
|
| 179 | + print "Job done in " . &mmss(time - $jobstart) . "\n" ;
|
| 180 | + print "Average file took " . &mmss(int (time - $jobstart)/($#files+1)) . "\n" ;
|
| 181 | + }
|
| 182 | + &Log ("\n\n") ;
|
| 183 | +}
|
| 184 | +
|
| 185 | +sub UncompactVisitorStats
|
| 186 | +{
|
| 187 | + &Log ("\nUncompact visitors stats\n\n") ;
|
| 188 | + my $dir_out = shift ;
|
| 189 | +
|
| 190 | + my $file_in = "$dir_out/scan.txt" ;
|
| 191 | + my $file_out1 = "$dir_out/CountsDailyPerLanguageTitles.csv" ; # totals for full day per language:title
|
| 192 | + my $file_out2 = "$dir_out/CountsHourlyPerLanguageTitle.csv" ; # hourly counts per language:title (hours vertical)
|
| 193 | + my $file_out3 = "$dir_out/CountsHourlyPerLanguage.csv" ; # hourly counts per language (hours vertical)
|
| 194 | + my ($date,$time,$year,$month,$day) ; ;
|
| 195 | +
|
| 196 | + open IN, '<', $file_in ;
|
| 197 | + binmode IN ;
|
| 198 | +
|
| 199 | + while ($line = <IN>)
|
| 200 | + {
|
| 201 | + # process timestamp
|
| 202 | + if ($line =~ /^#/)
|
| 203 | + {
|
| 204 | + $date = substr ($line,2,8) ;
|
| 205 | + $year = substr ($date,0,4) ;
|
| 206 | + $month = substr ($date,4,2) ;
|
| 207 | + $day = substr ($date,6,2) ;
|
| 208 | + $date = "=DATE($year,$month,$day)" ;
|
| 209 | + next ;
|
| 210 | + }
|
| 211 | +
|
| 212 | + chomp ($line) ;
|
| 213 | + ($lang,$title,$counts) = split (" ", $line) ;
|
| 214 | + $title =~ s/,/,/g ;
|
| 215 | + $lang =~ s/\.z// ;
|
| 216 | + $lang =~ s/\.y/2/ ;
|
| 217 | + $counts =~ s/^\d+// ; # remove (redundant) preceding total
|
| 218 | +
|
| 219 | + # store hourly counts
|
| 220 | + while ($counts ne "")
|
| 221 | + {
|
| 222 | + $letter = substr ($counts,0,1) ;
|
| 223 | + $counts = substr ($counts,1) ;
|
| 224 | + ($count = $counts) =~ s/^(\d+).*$/$1/ ;
|
| 225 | + $counts =~ s/^\d+(.*)$/$1/ ;
|
| 226 | + $h = sprintf ("%02d", ord ($letter) - ord ('A')) ;
|
| 227 | + $time = $date . "+TIME($h,0,0)" ;
|
| 228 | +
|
| 229 | + $hits1 {"$lang,$title,\"$date\""} += $count ;
|
| 230 | + $key = "$lang:$title" ;
|
| 231 | + $times {$time}++ ;
|
| 232 | + $keys {$key} ++ ;
|
| 233 | + $languages {$lang} ++ ;
|
| 234 | + $hits2 {"$time,$key"} += $count ;
|
| 235 | + $hits3 {"$time,$lang"} += $count ;
|
| 236 | + }
|
| 237 | + }
|
| 238 | +
|
| 239 | + close IN ;
|
| 240 | +
|
| 241 | + # file_out1: write totals for full day per language:title
|
| 242 | + # quick way to see which titles are visisted significantly
|
| 243 | + @lines = sort @lines ;
|
| 244 | + open OUT, '>', $file_out1 ;
|
| 245 | + binmode OUT ;
|
| 246 | + foreach $key (sort keys %hits1)
|
| 247 | + { print OUT "$key,${hits1{$key}}\n" ; }
|
| 248 | + close OUT ;
|
| 249 | +
|
| 250 | + # file_out2: write hourly counts per language:title (hours vertical)
|
| 251 | + open OUT, '>', $file_out2 ;
|
| 252 | + binmode OUT ;
|
| 253 | +
|
| 254 | + # header line
|
| 255 | + $line = "date / group" ;
|
| 256 | + foreach $key (sort keys %keys)
|
| 257 | + { $line .= ",$key" ; }
|
| 258 | + $line .= "\n" ;
|
| 259 | + print OUT $line ;
|
| 260 | +
|
| 261 | + foreach $time (sort keys %times)
|
| 262 | + {
|
| 263 | + $line = "\"$time\"" ;
|
| 264 | + foreach $key (sort keys %keys)
|
| 265 | + {
|
| 266 | + $count = $hits2 {"$time,$key"} ;
|
| 267 | + if ($count eq "")
|
| 268 | + { $count = 0 ; }
|
| 269 | + $line .= ",$count" ;
|
| 270 | + }
|
| 271 | + $line .= "\n" ;
|
| 272 | + print OUT $line ;
|
| 273 | + }
|
| 274 | + close OUT ;
|
| 275 | +
|
| 276 | + # file_out3: write hourly counts per language (hours vertical)
|
| 277 | + open OUT, '>', $file_out3 ;
|
| 278 | + binmode OUT ;
|
| 279 | +
|
| 280 | + # header line
|
| 281 | + $line = "date / group" ;
|
| 282 | + foreach $lang (sort keys %languages)
|
| 283 | + { $line .= ",$lang" ; }
|
| 284 | + $line .= "\n" ;
|
| 285 | + print OUT $line ;
|
| 286 | +
|
| 287 | + foreach $time (sort keys %times)
|
| 288 | + {
|
| 289 | + $line = "\"$time\"" ;
|
| 290 | + foreach $lang (sort keys %languages)
|
| 291 | + {
|
| 292 | + $count = $hits3 {"$time,$lang"} ;
|
| 293 | + if ($count eq "")
|
| 294 | + { $count = 0 ; }
|
| 295 | + $line .= ",$count" ;
|
| 296 | + }
|
| 297 | + $line .= "\n" ;
|
| 298 | + print OUT $line ;
|
| 299 | + }
|
| 300 | + close OUT ;
|
| 301 | +
|
| 302 | +}
|
| 303 | +
|
| 304 | +sub GetPattern
|
| 305 | +{
|
| 306 | + print "GetPattern\n" ;
|
| 307 | + open HTML, '<', 'wikilinks.html' ;
|
| 308 | + $pattern = "" ;
|
| 309 | + while ($line = <HTML>)
|
| 310 | + {
|
| 311 | + if ($line =~ /class=\"interwiki/)
|
| 312 | + {
|
| 313 | + chomp ($line) ;
|
| 314 | + $lang = $line ;
|
| 315 | + $lang =~ s/^.*?interwiki-(\w+).*$/$1/ ;
|
| 316 | + $title = $line ;
|
| 317 | + $title =~ s/^.*?href=\"([^\"]+)\".*$/$1/ ;
|
| 318 | + $title =~ s/^.*\/([^\/]+)$/$1/ ;
|
| 319 | + # print "[$lang] $title\n" ;
|
| 320 | + @languages {$title} .= "$lang," ;
|
| 321 | + @langcnt {$title}++ ;
|
| 322 | + }
|
| 323 | + }
|
| 324 | + print "\n\n\n" ;
|
| 325 | +
|
| 326 | + foreach $title (sort {$langcnt {$b} <=> $langcnt {$a}} keys %langcnt)
|
| 327 | + {
|
| 328 | + $count = $langcnt {$title} ;
|
| 329 | + if ($count > 10)
|
| 330 | + { $pattern .= "$title\n" ; }
|
| 331 | + else
|
| 332 | + {
|
| 333 | + $langlist = $languages {$title} ;
|
| 334 | + @langs = split (',', $langlist) ;
|
| 335 | + foreach $lang (@langs)
|
| 336 | + {
|
| 337 | + print "$lang $title\n" ;
|
| 338 | + $pattern .= "^$lang\.z $title\n"
|
| 339 | + }
|
| 340 | + }
|
| 341 | + }
|
| 342 | + return ($pattern) ;
|
| 343 | +}
|
| 344 | +
|
| 345 | +sub Log
|
| 346 | +{
|
| 347 | + $msg = shift ;
|
| 348 | + print $msg ;
|
| 349 | + print LOG $msg ;
|
| 350 | +}
|
| 351 | +
|
| 352 | +sub Abort
|
| 353 | +{
|
| 354 | + $msg = shift ;
|
| 355 | + print "Abort script\nError: $msg\n" ;
|
| 356 | + print LOG "Abort script\nError: $msg\n" ;
|
| 357 | + exit ;
|
| 358 | +}
|
| 359 | +
|
| 360 | +sub mmss
|
| 361 | +{
|
| 362 | + my $seconds = shift ;
|
| 363 | + return (int ($seconds / 60) . " min, " . ($seconds % 60) . " sec") ;
|
| 364 | +}
|
| 365 | +
|
Index: trunk/wikistats/dammit.lt/dammit_compact_monthly.sh |
— | — | @@ -0,0 +1,11 @@ |
| 2 | +#!/bin/sh |
| 3 | + |
| 4 | +ulimit -v 8000000 |
| 5 | + |
| 6 | +# dte=$(date +%Y%m) |
| 7 | +# dte=$(date --date "$dte -1 days" +%Y%m) |
| 8 | +# echo "Compact dammit.lt files for one day: $dte" |
| 9 | + |
| 10 | +echo "Compact dammit.lt files for one month" |
| 11 | +nice perl /a/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl -m -d 201001 -i /a/dammit.lt/pagecounts -o /a/dammit.lt/pagecounts/monthly >> /a/dammit.lt/pagecounts/monthly/compact_log.txt |
| 12 | + |
Property changes on: trunk/wikistats/dammit.lt/dammit_compact_monthly.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 13 | + native |
Index: trunk/wikistats/dammit.lt/DammitReportPageRequestsStaffWikis.pl |
— | — | @@ -0,0 +1,281 @@ |
| 2 | +#!/usr/bin/perl |
| 3 | + |
| 4 | +# bash file for daily generation and copy |
| 5 | +# blank article title ?! |
| 6 | + |
| 7 | +# no warnings 'uninitialized'; |
| 8 | + |
| 9 | + use lib "/home/ezachte/lib" ; # general routines |
| 10 | + use lib "/home/ezachte/wikistats" ; # WikiReports*.pm modules |
| 11 | + use lib "W:/! Perl/Wikistats" ; # test env |
| 12 | + |
| 13 | + use EzLib ; |
| 14 | + ez_lib_version (8) ; |
| 15 | + $trace_on_exit = $true ; |
| 16 | + |
| 17 | +# use Time::Local ; |
| 18 | +# use Net::Domain qw (hostname); |
| 19 | + |
| 20 | + use WikiReportsDate ; |
| 21 | + use WikiReportsLiterals ; |
| 22 | + use WikiReportsOutputMisc ; |
| 23 | + use WikiReportsScripts ; |
| 24 | + use WikiReportsNoWikimedia ; |
| 25 | + use WikiReportsLocalizations ; |
| 26 | + use WikiReportsHtml ; |
| 27 | + |
| 28 | + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); |
| 29 | + |
| 30 | + &SetMonths ; |
| 31 | + &SetLiterals ; |
| 32 | + &SetScripts ; |
| 33 | + |
| 34 | + &CountMostViewedPages ($this_month) ; |
| 35 | + if ($mday <= 5) |
| 36 | + { &CountMostViewedPages ($prev_month) ; } |
| 37 | + |
| 38 | + exit ; |
| 39 | + |
| 40 | +sub SetMonths |
| 41 | +{ |
| 42 | + $mon ++ ; |
| 43 | + $year += 1900 ; |
| 44 | + $this_month = sprintf ("%04d-%02d", $year, $mon) ; |
| 45 | + if (-- $mon == 0) { $mon = 12 ; $year-- ; } |
| 46 | + $prev_month = sprintf ("%04d-%02d", $year, $mon) ; |
| 47 | + if (-- $mon == 0) { $mon = 12 ; $year-- ; } |
| 48 | + $prev_prev_month = sprintf ("%04d-%02d", $year, $mon) ; |
| 49 | +} |
| 50 | + |
| 51 | +# Aggregate per-project, per-article page view totals for one month ('yyyy-mm')
| 51 | +# from the daily filtered pagecount files in $path_in, then write per-project
| 51 | +# reports to $path_out: ByTitle and ByViews text/csv files plus a ranked HTML
| 51 | +# page (top 1000). Uses globals throughout (%views, %projects, $out_html, ...).
| 51 | +sub CountMostViewedPages 
| 52 | +{ 
| 53 | + my $month = shift ; 
| 54 | + # $month2 = 'yyyymm' (hyphen stripped), used to match input file names below.
| 54 | + ($month2 = $month) =~ s/-// ; 
| 55 | + 
| 56 | + undef %views ; 
| 57 | + 
| 58 | + &LogT ("Count pages for $month\n\n") ; 
| 59 | + 
| 60 | + if ($job_runs_on_production_server) 
| 61 | + { 
| 62 | + &LogT ("Job runs on production server\n") ; 
| 63 | + $path_in = "/a/dammit.lt/filtered" ; 
| 64 | + $path_out = "/mnt/htdocs/page_views" ; 
| 65 | + } 
| 66 | + else 
| 67 | + { 
| 68 | + &LogT ("Job runs on local test server\n") ; 
| 69 | + $path_in = "w:/! Perl/Dammit/Page Requests Staff Wikis" ; 
| 70 | + $path_out = "w:/! Perl/Dammit/Page Requests Staff Wikis" ; 
| 71 | + } 
| 72 | + 
| 73 | + &LogT ("Path in: $path_in\n") ; 
| 74 | + &LogT ("Path out: $path_out\n") ; 
| 75 | + chdir $path_in ; 
| 76 | + my @files = glob "*" ; # glob on qualified dir on Windows gives problems, hence chdir ?? 
| 77 | + 
| 78 | + # $first/$last track the first and last day ('yyyy-mm-dd') actually present.
| 78 | + $first = "" ; 
| 79 | + $last = "" ; 
| 80 | + foreach $file (sort @files) 
| 81 | + { 
| 82 | + next if $file !~ /pagecounts-$month2\d\d.txt/ ; 
| 83 | + &LogT ("$file\n") ; 
| 84 | + 
| 85 | + # Reduce file name to its digits (yyyymmdd), then reformat as yyyy-mm-dd.
| 85 | + if ($first eq "") 
| 86 | + { ($first = $file) =~ s/[^\d]//g ; } 
| 87 | + ($last = $file) =~ s/[^\d]//g ; 
| 88 | + $first =~ s/(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/ ; 
| 89 | + $last =~ s/(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/ ; 
| 90 | + $first_day = substr ($first,8,2) ; 
| 91 | + $last_day = substr ($last ,8,2) ; 
| 92 | + 
| 93 | + # Each line: 'project article counts' (counts = daily total, then hourly).
| 93 | + open IN, '<', $file ; 
| 94 | + while ($line = <IN>) 
| 95 | + { 
| 96 | + chomp $line ; 
| 97 | + ($project, $article, $counts) = split (' ', $line) ; 
| 98 | + 
| 99 | + next if $article =~ /^\s*$/ ; 
| 100 | + next if $project eq "quality.m" ; # obsolete 
| 101 | + next if $article =~ /:\/\// ; # e.g. http:// 
| 102 | + next if $article =~ /\.php/ ; 
| 103 | + 
| 104 | + # Normalize: strip leading slashes, capitalize, drop '.m' project suffix.
| 104 | + $article =~ s/^[\/\\]*// ; 
| 105 | + $article = ucfirst $article ; 
| 106 | + $project =~ s/\.m$// ; 
| 107 | + $project = ucfirst $project ; 
| 108 | + $projects {$project} ++ ; 
| 109 | + 
| 110 | + # Keep only the leading daily total from the counts field.
| 110 | + ($daytotal = $counts) =~ s/^(\d+).*$/$1/ ; 
| 111 | + $views {$project} {$article} += $daytotal ; 
| 112 | + 
| 113 | + # if ($article =~ /China/) 
| 114 | + # { print "$project $article + $daytotal -> " . $views {$project} {$article} . "\n" ; } 
| 115 | + } 
| 116 | + } 
| 117 | + 
| 118 | + $month_eng = month_english_short (substr($month,5,2) - 1) . ' ' . substr ($month,0,4) ; 
| 119 | + 
| 120 | + $period = 'day ' . (substr ($first,8,2)+0) . '-' . (substr ($last,8,2)+0) ; 
| 121 | + 
| 122 | + foreach $project (sort keys %projects) 
| 123 | + { 
| 124 | + &LogT ("\nWrite totals for project $project for month $month (day $first_day - $last_day)\n\n") ; 
| 125 | + 
| 126 | + # === Sort by title === 
| 127 | + 
| 128 | + @articles = sort keys %{$views {$project}} ; 
| 129 | + next if $#articles == -1 ; 
| 130 | + 
| 131 | + open TXT, '>', "$path_out/PageViews${project}-$month-ByTitle.txt" ; 
| 132 | + open CSV, '>', "$path_out/PageViews${project}-$month-ByTitle.csv" ; 
| 133 | + 
| 134 | + print TXT "title,views (period: $first - $last)\n" ; 
| 135 | + print CSV "views,title,period: $first - $last\n" ; 
| 136 | + 
| 137 | + foreach $article (@articles) 
| 138 | + { 
| 139 | + # Decode %XX url escapes for the text output only.
| 139 | + ($article2 = $article) =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ; 
| 140 | + print TXT "$article2,${views {$project} {$article}}\n" ; 
| 141 | + 
| 142 | + # CSV keeps the raw title, quoted when it contains a comma.
| 142 | + $article2 = $article ; 
| 143 | + if ($article2 =~ /,/) 
| 144 | + { $article2 = "\"$article2\"" ; } 
| 145 | + print CSV "${views {$project} {$article}},$article2\n" ; 
| 146 | + } 
| 147 | + close TXT ; 
| 148 | + close CSV ; 
| 149 | + 
| 150 | + # === Sort by views === 
| 151 | + 
| 152 | + # Prev/next navigation buttons between this month's and last month's report.
| 152 | + if ($month eq $this_month) 
| 153 | + { 
| 154 | + $url_prev = "PageViews${project}-$prev_month-ByViews.html" ; 
| 155 | + $url_next = "" ; 
| 156 | + $out_button_prev = &btn (" < ", $url_prev) ; 
| 157 | + $out_button_next = "" ; 
| 158 | + } 
| 159 | + elsif ($month eq $prev_month) 
| 160 | + { 
| 161 | + $url_prev = "PageViews${project}-$prev_prev_month-ByViews.html" ; 
| 162 | + $url_next = "PageViews${project}-$this_month-ByViews.html" ; 
| 163 | + $out_button_prev = &btn (" < ", $url_prev) ; 
| 164 | + $out_button_next = &btn (" > ", $url_next) ; 
| 165 | + 
| 166 | + # NOTE(review): relative path, so this tests against the cwd ($path_in
| 166 | + # after the chdir above), not against $path_out where the html files are
| 166 | + # written -- looks like it should be "$path_out/$url_prev"; confirm.
| 166 | + if (! -e $url_prev) 
| 167 | + { $out_button_prev = "" ; } 
| 168 | + } 
| 169 | + 
| 170 | + my $out_zoom = "" ; 
| 171 | + my $out_options = "" ; 
| 172 | + my $out_explanation = "" ; #Based on Domas' <a href='http://dammit.lt/wikistats/'>page view files</a>" ; 
| 173 | + my $out_page_subtitle = "" ; 
| 174 | + my $out_crossref = "" ; 
| 175 | + my $out_description = "" ; 
| 176 | + my $out_button_switch = "" ; 
| 177 | + my $out_msg = "<b>$month_eng ($period)</b>" ; 
| 178 | + my $lang = "en" ; 
| 179 | + 
| 180 | + my $out_html_title = "$project wiki page views" ; 
| 181 | + my $out_page_title = "$project wiki page views" ; 
| 182 | + 
| 183 | + $out_scriptfile = "<script language=\"javascript\" type=\"text/javascript\" src=\"WikipediaStatistics14.js\"></script>\n" ; 
| 184 | + $out_style =~ s/td/td {font-size:12px}\nth {font-size:12px}\ntd/ ; # script definition needs clean up 
| 185 | + 
| 186 | + # Drop-down options: current project first, then the other projects seen.
| 186 | + $out_options = &opt ("PageViews${project}-$month-ByViews.html", $project) ; 
| 187 | + foreach $project2 (keys %projects) 
| 188 | + { 
| 189 | + if ($project2 ne $project) 
| 190 | + { $out_options .= &opt ("PageViews${project2}-$month-ByViews.html", $project2) ; } 
| 191 | + } 
| 192 | + 
| 193 | + $unicode = $true ; 
| 194 | + &GenerateHtmlStart ($out_html_title, $out_zoom, $out_options, 
| 195 | + $out_page_title, $out_page_subtitle, $out_explanation, 
| 196 | + $out_button_prev, $out_button_next, $out_button_switch, 
| 197 | + $out_crossref, $out_msg) ; 
| 198 | + 
| 199 | + $out_html =~ s/Sitemap.htm/http:\/\/stats.wikimedia.org/ ; # Q&D patch 
| 200 | + $out_html =~ s/ Home / stats.wikimedia.org / ; # Q&D patch 
| 201 | + 
| 202 | + # Rank articles by descending view count for the HTML / ByViews outputs.
| 202 | + @articles = sort {$views {$project}{$b} <=> $views {$project}{$a}} keys %{$views {$project}} ; 
| 203 | + 
| 204 | + open TXT, '>', "$path_out/PageViews${project}-$month-ByViews.txt" ; 
| 205 | + open CSV, '>', "$path_out/PageViews${project}-$month-ByViews.csv" ; 
| 206 | + 
| 207 | + print TXT "title,views (period: $first - $last)\n" ; 
| 208 | + print CSV "views,title,period: $first - $last\n" ; 
| 209 | + 
| 210 | + $out_html .= "<p><b>Other formats</b>: " ; 
| 211 | + $out_html .= "ordered by views: <a href='PageViews${project}-$month-ByViews.txt'>text file</a> / <a href='PageViews${project}-$month-ByViews.csv'>csv file</a>, " ; 
| 212 | + $out_html .= "ordered by title: <a href='PageViews${project}-$month-ByTitle.txt'>text file</a> / <a href='PageViews${project}-$month-ByTitle.csv'>csv file</a><p>" ; 
| 213 | + $out_html .= "<table border=1>\n" ; 
| 214 | + $out_html .= "<tr><th class=cb>Rank</th><th class=cb>Views</th><th class=lb>Title</th></tr>\n" ; 
| 215 | + 
| 216 | + # Only the top 1000 titles go into the HTML table; txt/csv get everything.
| 216 | + $lines = 0 ; 
| 217 | + foreach $article (@articles) 
| 218 | + { 
| 219 | + ($article2 = $article) =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ; 
| 220 | + print TXT "$article2,${views {$project} {$article}}\n" ; 
| 221 | + 
| 222 | + $article2 =~ s/_/ /g ; 
| 223 | + if (++$lines <= 1000) 
| 224 | + { $out_html .= "<tr><td class=c>$lines</td><td class=r>${views {$project} {$article}}</td><td class=l><a href='http://$project.wikimedia.org/wiki/$article'>$article2</a></td></tr>\n" ; } 
| 225 | + 
| 226 | + $article2 = $article ; 
| 227 | + if ($article2 =~ /,/) 
| 228 | + { $article2 = "\"$article2\"" ; } 
| 229 | + print CSV "${views {$project} {$article}},$article2\n" ; 
| 230 | + } 
| 231 | + 
| 232 | + $out_html .= "</table>\n" ; 
| 233 | + 
| 234 | + close TXT ; 
| 235 | + close CSV ; 
| 236 | + 
| 237 | + $out_html .= "<p><small>Counts based on <a href='http://dammit.lt/wikistats/'>Domas' hourly pagecount files</a><br>" . 
| 238 | + "File generated on " . date_time_english (time) . "<br>Author: Erik Zachte</small>" ; 
| 239 | + 
| 240 | + open HTML, '>', "$path_out/PageViews${project}-$month-ByViews.html" ; 
| 241 | + print HTML $out_html ; 
| 242 | + close HTML ; 
| 243 | + 
| 244 | + # Also publish the current month under a month-less name as a stable URL.
| 244 | + if ($month eq $this_month) # static url 
| 245 | + { 
| 246 | + open HTML, '>', "$path_out/PageViews${project}.html" ; 
| 247 | + print HTML $out_html ; 
| 248 | + close HTML ; 
| 249 | + } 
| 250 | + } 
| 251 | +} 
| 252 | + |
| 253 | +# translates one unicode character into plain ascii:
| 253 | +# decodes a single UTF-8 multi-byte sequence into its code point; returns the
| 253 | +# corresponding Latin-1 character when the code point is < 256, otherwise (or
| 253 | +# for plain ascii input) returns the input unchanged.
| 254 | +sub UnicodeToAscii { 
| 255 | + my $unicode = shift ; 
| 256 | + 
| 257 | + my $char = substr ($unicode,0,1) ; 
| 258 | + my $ord = ord ($char) ; 
| 259 | + my ($c, $value, $html) ; 
| 260 | + 
| 261 | + if ($ord < 128) # plain ascii character 
| 262 | + { return ($unicode) ; } # (will not occur in this script) 
| 263 | + else 
| 264 | + { 
| 265 | + # Strip the UTF-8 length-marker bits from the lead byte (0xFC/0xF8/0xF0/0xE0/0xC0).
| 265 | + if ($ord >= 252) { $value = $ord - 252 ; } 
| 266 | + elsif ($ord >= 248) { $value = $ord - 248 ; } 
| 267 | + elsif ($ord >= 240) { $value = $ord - 240 ; } 
| 268 | + elsif ($ord >= 224) { $value = $ord - 224 ; } 
| 269 | + else { $value = $ord - 192 ; } 
| 270 | + 
| 271 | + # Fold in 6 bits from each continuation byte (10xxxxxx).
| 271 | + for ($c = 1 ; $c < length ($unicode) ; $c++) 
| 272 | + { $value = $value * 64 + ord (substr ($unicode, $c,1)) - 128 ; } 
| 273 | + 
| 274 | + if ($value < 256) 
| 275 | + { return (chr ($value)) ; } 
| 276 | + 
| 277 | + # $unicode =~ s/([\x80-\xFF])/("%".sprintf("%02X",$1))/gie ; 
| 278 | + return ($unicode) ;  
| 279 | + } 
| 280 | +} 
| 281 | + |
| 282 | + |
Index: trunk/wikistats/dammit.lt/DammitSyncFiles.pl |
— | — | @@ -0,0 +1,197 @@ |
| 2 | +#!/usr/bin/perl 
| 3 | + 
| 4 | +# 27 April 2010 renamed from WikiStatsDammitSync.pl 
| 5 | + 
| 5 | +# Mirror Domas' hourly pagecounts / daily projectcounts files by scraping the
| 5 | +# directory index at http://dammit.lt/wikistats/ and wget-ing new entries.
| 5 | +# Test mode: if a local copy of the index page exists, it is parsed instead.
| 6 | + use Time::Local ; 
| 7 | + use Archive::Tar; 
| 8 | + 
| 9 | + $tar = Archive::Tar->new; 
| 10 | + 
| 11 | + $| = 1; # flush screen output 
| 12 | + 
| 13 | + $maxdaysago = 40; # do not download files more than this ago 
| 14 | + 
| 15 | + if (-e "a_dammit.lt_index.html") # test 
| 16 | + { $file_html = "a_dammit.lt_index.html" ; } 
| 17 | + else 
| 18 | + { 
| 19 | + open LOG, '>>', "/a/dammit.lt/WikiStatsDammitSync.log" ; 
| 20 | + 
| 21 | + $file_html = "/a/dammit.lt/index.html" ; 
| 22 | + unlink $file_html ; 
| 23 | + $cmd = "wget -O $file_html http://dammit.lt/wikistats/" ; 
| 24 | + $result = `$cmd` ; 
| 25 | + # NOTE(review): backticks return captured stdout, not the exit status; an
| 25 | + # empty capture compares numerically as 0, which is presumably how this
| 25 | + # "success" test works -- confirm intent ($? would hold the real status).
| 25 | + if ($result == 0) 
| 26 | + { $result = "OK" ; } 
| 27 | + &Log ("Cmd '$cmd' -> $result \n\n") ; 
| 28 | + 
| 29 | + if (! -e $file_html) { &Abort ("File $file_html not found") ; } 
| 30 | + if (-s $file_html == 0) { &Abort ("File $file_html empty") ; } 
| 31 | + } 
| 32 | + 
| 33 | + $timestart = time ; 
| 34 | + 
| 35 | + # Work from the projectcounts dir (relative tar appends below rely on this)
| 35 | + # and log the effective working directory.
| 35 | + # Bug fix: the old code captured `pwd` into $cmd and then *executed* that
| 35 | + # captured directory path as a shell command ($result = `$cmd`), which can
| 35 | + # only fail (a directory is not a command) and served no purpose.
| 35 | + chdir "/a/dammit.lt/projectcounts" ; 
| 36 | + $cwd = `pwd` ; 
| 37 | + chomp $cwd ; 
| 38 | + &Log ("Work dir '$cwd'\n") ; 
| 40 | + 
| 40 | + # Scrape the apache-style directory listing: <title> rows carry the current
| 40 | + # (archive) subdir, 'application/octet-stream' rows are downloadable files.
| 41 | + open HTML,'<',$file_html ; 
| 42 | + while ($line = <HTML>) 
| 43 | + { 
| 44 | + if ($line =~ /<title>/) 
| 45 | + { 
| 46 | + $subdir = "" ; 
| 47 | + if ($line =~ /archive/) 
| 48 | + { 
| 49 | + $line =~ s/^.*?\/wikistats\/// ; 
| 50 | + $line =~ s/<.*$// ; 
| 51 | + chomp $line ; 
| 52 | + $subdir = $line ; 
| 53 | + } 
| 54 | + &Log ("Subdir = '$subdir'\n") ; 
| 55 | + next ; 
| 56 | + } 
| 57 | + 
| 58 | + if ($line !~ /application\/octet-stream/) { next ; } 
| 59 | + 
| 60 | + # Pull file name from the href and its timestamp from the 'm' column.
| 60 | + ($file = $line) =~ s/^.*?a href=\"([^"]+)\".*$/$1/s ; 
| 61 | + ($date = $line) =~ s/^.*?class=\"m\">([^<]+)<.*$/$1/s ; 
| 62 | + ($date,$time) = split (' ', $date) ; 
| 63 | + 
| 64 | + # pagecounts-yyyymmdd-...: skip files older than $maxdaysago or already
| 64 | + # compacted into a daily archive in any supported compression format.
| 64 | + if ($file =~ /^pagecounts/) 
| 65 | + { 
| 66 | + $yy = substr ($file,11,4) ; 
| 67 | + $mm = substr ($file,15,2) ; 
| 68 | + $dd = substr ($file,17,2) ; 
| 69 | + $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ; 
| 70 | + 
| 71 | + print "$file: $daysago days ago\n" ; 
| 72 | + if ($daysago > $maxdaysago) { next ; } 
| 73 | + 
| 74 | + # $path_7z = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_fdt.7z" ; 
| 75 | + # if (-e $path_7z) { print "exists\n" ; next ; } 
| 76 | + 
| 77 | + $path = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_h" ; 
| 78 | + if ((-e "$path.7z") || (-e "$path.zip") || (-e "$path.bz2") || (-e "$path.gz")) 
| 79 | + { print "$path.[7z|zip|bz2|gz] exists\n" ; next ; } 
| 80 | + else 
| 81 | + { print "$path.[7z|zip|bz2|gz] new -> download\n" ; } 
| 82 | + } 
| 83 | + 
| 84 | + # if ($file =~ /^projectcounts/) 
| 85 | + # { 
| 86 | + # $yy = substr ($file,14,4) ; 
| 87 | + # $mm = substr ($file,18,2) ; 
| 88 | + # $dd = substr ($file,20,2) ; 
| 89 | + # $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ; 
| 90 | + # if ($daysago > $maxdaysago) { next ; } 
| 91 | + # } 
| 92 | + 
| 93 | + 
| 93 | + # Parse the listing timestamp ('yyyy-Mon-dd hh:mm:ss' layout, by offsets).
| 94 | + $yy = substr ($date,0,4) ; 
| 95 | + $mm = substr ($date,5,3) ; 
| 96 | + $dd = substr ($date,9,2) ; 
| 97 | + $hh = substr ($time,0,2) ; 
| 98 | + $nn = substr ($time,3,2) ; 
| 99 | + $ss = substr ($time,6,2) ; 
| 100 | + 
| 101 | + if ($mm eq 'Jan') { $mm = 1 ; } 
| 102 | + elsif ($mm eq 'Feb') { $mm = 2 ; } 
| 103 | + elsif ($mm eq 'Mar') { $mm = 3 ; } 
| 104 | + elsif ($mm eq 'Apr') { $mm = 4 ; } 
| 105 | + elsif ($mm eq 'May') { $mm = 5 ; } 
| 106 | + elsif ($mm eq 'Jun') { $mm = 6 ; } 
| 107 | + elsif ($mm eq 'Jul') { $mm = 7 ; } 
| 108 | + elsif ($mm eq 'Aug') { $mm = 8 ; } 
| 109 | + elsif ($mm eq 'Sep') { $mm = 9 ; } 
| 110 | + elsif ($mm eq 'Oct') { $mm = 10 ; } 
| 111 | + elsif ($mm eq 'Nov') { $mm = 11 ; } 
| 112 | + elsif ($mm eq 'Dec') { $mm = 12 ; } 
| 113 | + else { &Abort ("Invalid month '$mm' in file date $date $time") ; } 
| 114 | + 
| 114 | + # touch -t format: [yy]yymmddhhmm.ss (two-digit year here).
| 115 | + $date2 = sprintf ("%02d%02d%02d%02d%02d.%02d", ($yy-2000), $mm, $dd, $hh, $nn, $ss) ; 
| 116 | + 
| 117 | + if ($file =~ /^(?:page|project)counts-2/) 
| 118 | + { 
| 119 | + 
| 120 | + if ($file =~ /^pagecounts/) 
| 121 | + { $path = "/a/dammit.lt/pagecounts/$file" ; } 
| 122 | + else 
| 123 | + { $path = "/a/dammit.lt/projectcounts/$file" ; } 
| 124 | + 
| 125 | + if (-e $path) 
| 126 | + { 
| 127 | + &Log ("File $path exists\n") ; 
| 128 | + if (-s $path == 0) 
| 129 | + { 
| 130 | + &Log ("File $path empty -> overwrite\n") ; 
| 131 | + unlink $path ; 
| 132 | + } 
| 133 | + else { next ; } 
| 134 | + } 
| 135 | + 
| 136 | + # projectcounts files are bundled per year; skip files already in the tar.
| 136 | + # NOTE(review): $tar_file_prev is undefined on the first pass -- the 'ne'
| 136 | + # comparison still works (warns under 'use warnings'), confirm acceptable.
| 136 | + if ($file =~ /^projectcounts/) 
| 137 | + { 
| 138 | + $tar_file = "/a/dammit.lt/projectcounts/projectcounts-$yy.tar" ; 
| 139 | + if (-e $tar_file) 
| 140 | + { 
| 141 | + if ($tar_file ne $tar_file_prev) 
| 142 | + { 
| 143 | + &Log ("\nRead tar file $tar_file\n") ; 
| 144 | + $tar->read($tar_file); 
| 145 | + $tar_file_prev = $tar_file ; 
| 146 | + } 
| 147 | + if ($tar->contains_file ($file)) 
| 148 | + { 
| 149 | + &Log ("File $file exists in tar file $tar_file\n") ; 
| 150 | + next ; 
| 151 | + } 
| 152 | + } 
| 153 | + else 
| 154 | + { &Log ("Tar file $tar_file not found\n") ; } 
| 155 | + } 
| 156 | + 
| 157 | + &Log ("Write file $path, set date $date2\n") ; 
| 158 | + 
| 159 | + $cmd = "wget -a /a/dammit.lt/wget.log -O $path http://mituzas.lt/wikistats/$subdir$file" ; 
| 160 | + $result = `$cmd` ; 
| 161 | + if ($result == 0) 
| 162 | + { $result = "OK" ; } 
| 163 | + &Log ("Cmd '$cmd' -> $result \n\n") ; 
| 164 | + 
| 164 | + # Preserve the remote file's timestamp on the local copy.
| 165 | + `touch $path -t $date2` ; 
| 166 | + 
| 167 | + # Append via external tar (relative name, cwd is projectcounts), then drop
| 167 | + # the loose file; membership checking above uses Archive::Tar reads.
| 167 | + if ($file =~ /^projectcounts/) 
| 168 | + { 
| 169 | + $cmd = "tar --append --file=$tar_file $file" ; 
| 170 | + &Log ("Cmd '$cmd'\n") ; 
| 171 | + $result = `$cmd` ; 
| 172 | + print "$result\n" ; 
| 173 | + unlink $path ; 
| 174 | + } 
| 175 | + } 
| 176 | + } 
| 177 | + 
| 178 | + &Log ("Ready in " . (time - $timestart) . " sec.\n") ; 
| 179 | + close HTML ; 
| 180 | + close LOG ; 
| 181 | + exit ; 
| 182 | + |
| 183 | +# Print a message prefixed with the current hh:mm:ss to both the screen and
| 183 | +# the LOG file handle (opened at script start).
| 183 | +sub Log 
| 184 | +{ 
| 185 | + $msg = shift ; 
| 186 | + my ($ss, $nn, $hh) = (localtime(time))[0,1,2] ; 
| 187 | + my $time = sprintf ("%02d:%02d:%02d", $hh, $nn, $ss) ; 
| 188 | + $msg = "$time $msg" ; 
| 189 | + print $msg ; 
| 190 | + print LOG $msg ; 
| 191 | +} 
| 192 | + |
| 193 | +# Log a fatal message and terminate the script immediately.
| 193 | +sub Abort 
| 194 | +{ 
| 195 | + $msg = shift ; 
| 196 | + &Log ($msg) ; 
| 197 | + exit ; 
| 198 | +} 
Index: trunk/wikistats/dammit.lt/DammitCompactHourlyPageCountFiles.pl |
— | — | @@ -0,0 +1,964 @@ |
| 2 | + #!/usr/local/bin/perl
|
| 3 | +
|
| 4 | +# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
|
| 5 | +
|
| 6 | +# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
|
| 7 | +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
|
| 8 | +# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
|
| 9 | +# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
|
| 10 | +
|
| 11 | +# Ideas:
|
| 12 | +# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
|
| 13 | +# 2 frequenty distribution hits per file per first letter _-> manifest crawler
|
| 14 | +# assuming crawler collects articles in alphabetical order
|
| 15 | +# 3 first letter uppercase -> sort (in sections per first two chars ?)
|
| 16 | +
|
| 16 | +# Merge 24 hourly pagecounts-*.gz files into one compact per-day file, plus a
| 16 | +# filtered extract and a squid-log tracking extract. Runs on the production
| 16 | +# host ($bayes true) or on a local Windows test machine.
| 17 | + use lib "/home/ezachte/lib" ;
|
| 18 | + use EzLib ;
|
| 19 | +
|
| 20 | + $trace_on_exit = $true ;
|
| 21 | + ez_lib_version (13) ;
|
| 22 | +
|
| 23 | + # set defaults mainly for tests on local machine
|
| 24 | + default_argv "-i C:/bayes_backup/a/dammit.lt/pagecounts|-t C:/bayes_backup/a/dammit.lt|-f C:/bayes_backup/a/dammit.lt|-o C:/bayes_backup/a/dammit.lt|-d 20101215" ;
|
| 25 | +
|
| 26 | + use CGI qw(:all);
|
| 27 | + use URI::Escape;
|
| 28 | + use Getopt::Std ;
|
| 29 | + use Cwd ;
|
| 30 | + $bayes = -d "/a/dammit.lt" ;
|
| 31 | + $path_7za = "/usr/lib/p7zip/7za" ;
|
| 32 | + if (! $bayes)
|
| 33 | + {
|
| 34 | + print "Test on Windows\n" ;
|
| 35 | + use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
|
| 36 | + use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
|
| 37 | + }
|
| 38 | +
|
| 39 | + $| = 1; # flush screen output
|
| 40 | +
|
| 41 | + $true = 1 ;
|
| 42 | + $false = 0 ;
|
| 43 | + $threshold = 0 ;
|
| 44 | + undef %totals_per_namespace ;
|
| 45 | +
|
| 45 | + # Projects whose lines are copied to the per-day 'filtered' extract.
| 46 | + $filter = "^(?:outreach|quality|strategy|usability)\.m\$" ;
|
| 47 | + print "Filter: $filter\n" ;
|
| 48 | + $reg_exp_filter = qr"$filter" ;
|
| 49 | +
|
| 50 | + $track = "NonExistingPageForSquidLogMonitoring" ;
|
| 51 | + print "Track: $track\n" ;
|
| 52 | + $reg_exp_track = qr"$track" ;
|
| 53 | +
|
| 54 | +# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"
|
| 55 | +
|
| 55 | + # Command line: -i input dir, -o output dir, -f filter dir, -t tracking dir,
| 55 | + # -d date selector (yyyymmdd, yyyymm*, yyyy* or *).
| 56 | + my $options ;
|
| 57 | + getopt ("iodft", \%options) ;
|
| 58 | +
|
| 59 | + if (! defined ($options {"i"})) { &Abort ("Specify input dir: -i dirname") } ;
|
| 60 | + if (! defined ($options {"o"})) { &Abort ("Specify output dir: -o dirname") } ;
|
| 61 | + if (! defined ($options {"f"})) { &Abort ("Specify filter dir: -f dirname") } ;
|
| 62 | + if (! defined ($options {"t"})) { &Abort ("Specify tracking dir: -t dirname") } ;
|
| 63 | + if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") } ;
|
| 64 | +
|
| 65 | + $dir_in = $options {"i"} ;
|
| 66 | + $dir_out = $options {"o"} ;
|
| 67 | + $dir_filtered = $options {"f"} ;
|
| 68 | + $dir_track = $options {"t"} ;
|
| 69 | + $daterange = $options {"d"} ;
|
| 70 | +
|
| 71 | + $work = cwd() ;
|
| 72 | + print "Work dir $work\n" ;
|
| 73 | +
|
| 73 | + # Make bare dir names absolute, relative to the current working dir.
| 74 | + if ($dir_in !~ /[\/\\]/)
|
| 75 | + { $dir_in = "$work/$dir_in" ; }
|
| 76 | +
|
| 77 | + if ($dir_out !~ /[\/\\]/)
|
| 78 | + { $dir_out = "$work/$dir_out" ; }
|
| 79 | +
|
| 80 | + if ($dir_filtered !~ /[\/\\]/)
|
| 81 | + { $dir_filtered = "$work/$dir_filtered" ; }
|
| 82 | +
|
| 83 | + if ($dir_track !~ /[\/\\]/)
|
| 84 | + { $dir_track = "$work/$dir_track" ; }
|
| 85 | +
|
| 86 | + if (! -d $dir_in)
|
| 87 | + { &Abort ("Input dir not found: $dir_in") } ;
|
| 88 | +
|
| 89 | + if (! -d $dir_out)
|
| 90 | + {
|
| 91 | + print "Create output dir $dir_out\n" ;
|
| 92 | + mkdir $dir_out ;
|
| 93 | + if (! -d $dir_out)
|
| 94 | + { &Abort ("Output dir could not be created.") } ;
|
| 95 | + }
|
| 96 | +
|
| 97 | + if (($daterange !~ /^\d{8}$/) && ($daterange !~ /^\d{6}\*$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
|
| 98 | + { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") ; }
|
| 99 | +
|
| 100 | + print "\nCompress pagecount files\nin: $dir_in\nout: $dir_out\nflt: $dir_filtered\ntrack: $dir_track\ndate range: $daterange" ;
|
| 100 | + # Turn the '*' wildcard into a regex fragment for file name matching.
| 101 | + $daterange =~ s/\*/\\d+/ ;
|
| 102 | +
|
| 103 | + open LOG, ">>", "$work/WikiStatsCompactDammitFiles.log" ;
|
| 104 | +
|
| 105 | + &CompactVisitorStats ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
|
| 106 | +# &UncompactVisitorStats ; # test only, to see if process is revertible
|
| 107 | +
|
| 108 | + &Log ("\nReady\n") ;
|
| 109 | + close LOG ;
|
| 110 | + exit ;
|
| 111 | +
|
| 112 | +# Collect the hourly pagecounts-yyyymmdd-hhmmss.gz files in $dir_in that match
| 112 | +# the date selector regex, group them by day, and merge each complete day via
| 112 | +# MergeFilesFullDay. For a single-day selector (yyyymmdd) all 24 hourly files
| 112 | +# must be present, else the run is aborted. Fills globals @files and
| 112 | +# %process_dates as a side effect.
| 112 | +sub CompactVisitorStats
|
| 113 | +{
|
| 114 | + my $dir_in = shift ;
|
| 115 | + my $dir_out = shift ;
|
| 116 | + my $dir_filtered = shift ;
|
| 117 | + my $dir_track = shift ;
|
| 118 | + my $daterange = shift ;
|
| 119 | +
|
| 120 | + chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
|
| 121 | +
|
| 122 | + local (*DIR);
|
| 123 | + opendir (DIR, ".");
|
| 124 | + @files = () ;
|
| 125 | +
|
| 126 | + while ($file_in = readdir (DIR))
|
| 127 | + {
|
| 128 | + next if $file_in !~ /^pagecounts-$daterange-\d{6,6}.gz$/ ;
|
| 129 | +
|
| 130 | + push @files, $file_in ;
|
| 131 | + }
|
| 132 | +
|
| 132 | + # NOTE(review): closedir takes a single dirhandle argument; the extra "."
| 132 | + # is ignored (or warns) -- harmless but worth cleaning up.
| 133 | + closedir (DIR, ".");
|
| 134 | +
|
| 135 | + @files = sort @files ;
|
| 136 | +
|
| 137 | + if (($daterange =~ /^\d{8}$/) and ($#files < 23))
|
| 138 | + { &Abort ("Less than 24 files found for date $daterange\n" . @files) ; }
|
| 139 | +
|
| 139 | + # Date is embedded at a fixed offset: pagecounts-YYYYMMDD-...
| 140 | + foreach $file (@files)
|
| 141 | + {
|
| 142 | + $date = substr ($file,11,8) ;
|
| 143 | + $process_dates {$date}++ ;
|
| 144 | + }
|
| 145 | +
|
| 146 | + &Log ("\n\n") ;
|
| 147 | +
|
| 148 | + foreach $date (sort keys %process_dates)
|
| 149 | + { &MergeFilesFullDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $date) ; }
|
| 150 | +}
|
| 151 | +
|
| 152 | +sub MergeFilesFullDay
|
| 153 | +{
|
| 154 | + my $dir_in = shift ;
|
| 155 | + my $dir_out = shift ;
|
| 156 | + my $dir_filtered = shift ;
|
| 157 | + my $dir_track = shift ;
|
| 158 | + my $date = shift ;
|
| 159 | +
|
| 160 | + my $year = substr ($date,0,4) ;
|
| 161 | + my $month = substr ($date,4,2) ;
|
| 162 | + my $day = substr ($date,6,2) ;
|
| 163 | +
|
| 164 | + my ($file_out1, $file_out2, $file_out3, $out_gz) ;
|
| 165 | +
|
| 166 | + $dir_out = "$dir_out/${year}-${month}" ;
|
| 167 | + if (! -d $dir_out)
|
| 168 | + {
|
| 169 | + mkdir $dir_out ;
|
| 170 | + if (! -d $dir_out)
|
| 171 | + { &Abort ("Output dir could not be created: $dir_out") } ;
|
| 172 | + }
|
| 173 | +
|
| 174 | + my @files_today = () ;
|
| 175 | + foreach $file (@files)
|
| 176 | + {
|
| 177 | + next if $file !~ /^pagecounts-$date-\d{6,6}.gz$/ ;
|
| 178 | +
|
| 179 | + push @files_today, $file ;
|
| 180 | + }
|
| 181 | +
|
 | 182 | + # very few times (nearly) duplicate files are found for same hour
|
| 183 | + # keep the largest and presumably most complete one
|
| 184 | + for ($i = 0 ; $i < $#files_today ; $i++)
|
| 185 | + {
|
| 186 | + for ($j = $i+1 ; $j <= $#files_today ; $j++)
|
| 187 | + {
|
| 188 | + if (substr ($files_today [$i],0,25) eq substr ($files_today [$j],0,25))
|
| 189 | + {
|
| 190 | + $size_i = -s $files_today [$i] ;
|
| 191 | + $size_j = -s $files_today [$j] ;
|
| 192 | + print "${files_today [$i]}: $size_i\n" ;
|
| 193 | + print "${files_today [$j]}: $size_j\n" ;
|
| 194 | + if ($size_i > $size_j)
|
| 195 | + {
|
| 196 | + print "Keep ${files_today [$i]}\n\n" ;
|
| 197 | + $files_today [$j]= "" ;
|
| 198 | + }
|
| 199 | + else
|
| 200 | + {
|
| 201 | + print "Keep ${files_today [$j]}\n\n" ;
|
| 202 | + $files_today [$i]= "" ;
|
| 203 | + }
|
| 204 | + }
|
| 205 | + }
|
| 206 | + }
|
| 207 | +
|
| 208 | + $time_start = time ;
|
| 209 | + $lines = 0 ;
|
| 210 | +
|
| 211 | + undef @in_gz ;
|
| 212 | + undef $file_open ;
|
| 213 | + my $time_start = time ;
|
| 214 | +
|
| 215 | + # $file_out = "pagecounts-$year$month$day_full_day" ;
|
| 216 | + # open OUT, ">", $file_out ;
|
| 217 | + # binmode $file_out ;
|
| 218 | +
|
| 219 | + # print "File_out1 $file_out1\n" ;
|
| 220 | + # print "File_out2 $file_out2\n" ;
|
| 221 | + # print "File_out3 $file_out3\n" ;
|
| 222 | +
|
| 223 | +# my $out_gz1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 224 | + if ($bayes)
|
| 225 | + {
|
| 226 | + # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
|
| 227 | + $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
|
| 228 | + # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
|
| 229 | + if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
|
| 230 | + {
|
| 231 | + &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
|
| 232 | + return ;
|
| 233 | + }
|
| 234 | + if ($#files_today < 23)
|
| 235 | + {
|
| 236 | + &Log ("\nLess than 24 files found for target file '$file_out2.7z'. Skip this date.\n") ;
|
| 237 | + return ;
|
| 238 | + }
|
| 239 | +
|
| 240 | + open $out_gz2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
|
| 241 | + # open $out_gz3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
|
| 242 | + }
|
| 243 | + else
|
| 244 | + {
|
| 245 | + # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
|
| 246 | + $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, count above threshold
|
| 247 | + $out_gz2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 248 | + # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
|
| 249 | + # $out_gz3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 250 | + }
|
| 251 | +
|
| 252 | +# binmode $out_gz1 ;
|
| 253 | + binmode $out_gz2 ;
|
| 254 | +# binmode $out_gz3 ;
|
| 255 | +
|
| 256 | + $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
|
| 257 | + &Log ("\nFilter file: $file_filtered\n") ;
|
| 258 | + open $out_filtered, '>', $file_filtered ;
|
| 259 | + binmode $out_filtered ;
|
| 260 | +
|
| 261 | + $file_track = "$dir_track/_PageCountsForSquidLogTracking.txt" ;
|
| 262 | + &Log ("Tracking file: $file_track\n\n") ;
|
| 263 | +
|
| 264 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 265 | + { $file_in_found [$hour] = $false ; }
|
| 266 | +
|
| 267 | + $files_in_open = 0 ;
|
| 268 | + $files_in_found = 0 ;
|
| 269 | + $langprev = "" ;
|
| 270 | + foreach $file_in (@files_today)
|
| 271 | + {
|
| 272 | + next if $file_in eq "" ;
|
| 273 | +
|
| 274 | + ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
|
| 275 | + $hour = (0+$hour) ;
|
| 276 | + # print " file found '$file_in'\n" ;
|
| 277 | +
|
| 278 | + if ($bayes)
|
| 279 | + { open $in_gz [$hour], "-|", "gzip -dc \"$file_in\"" || &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
|
| 280 | + else
|
| 281 | + { $in_gz [$hour] = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ; }
|
| 282 | + binmode $in_gz [$hour] ;
|
| 283 | +
|
| 284 | + $files_in_open++ ;
|
| 285 | + $file_in_found [$hour] = $true ;
|
| 286 | + $file_in_open [$hour] = $true ;
|
| 287 | + $files_in_found ++ ;
|
| 288 | + $file = $in_gz [$hour] ;
|
| 289 | + $line = <$file> ;
|
| 290 | + $line =~ s/^(\w+)2 /$1.y /o ;
|
| 291 | + $line =~ s/^(\w+) /$1.z /o ;
|
| 292 | +
|
| 293 | + ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
|
| 294 | + $key [$hour] = "$lang $title" ;
|
| 295 | + }
|
| 296 | +
|
| 297 | + $comment = "# Wikimedia page request counts for $date, each line shows 'subproject title counts'\n" ;
|
| 298 | + if ($threshold > 0 )
|
| 299 | + { $comment .= "# Count for articles with less than $threshold requests per full day are omitted\n" ; }
|
| 300 | + $comment .= "# Subproject is language code, followed by project code\n" ;
|
| 301 | + $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ;
|
| 302 | + $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
|
| 303 | + $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
|
| 304 | + print $out_gz2 $comment ;
|
| 305 | +# print $out_gz3 $comment ;
|
| 306 | +
|
| 307 | + if ($files_in_found < 24)
|
| 308 | + {
|
| 309 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 310 | + {
|
| 311 | + if (! $file_in_found [$hour])
|
| 312 | + { $hours_missing .= "$hour," ; }
|
| 313 | + }
|
| 314 | + $hours_missing =~ s/,$// ;
|
| 315 | + &Log ("Merge files: date = $date, only $files_in_found files found!\n") ;
|
| 316 | + }
|
| 317 | + else
|
| 318 | + { &Log ("Merge files: date = $date\n") ; }
|
| 319 | +
|
| 320 | + if ($hours_missing ne '')
|
| 321 | + {
|
| 322 | + print $out_gz2 "#\n" ;
|
| 323 | + print $out_gz2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
| 324 | + # print $out_gz3 "#\n" ;
|
| 325 | + # print $out_gz3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
| 326 | + }
|
| 327 | + $comment = "#\n" ;
|
| 328 | + $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
|
| 329 | + $comment .= "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
|
| 330 | + $comment .= "# Please reconcile with real namespace name strings later\n" ;
|
| 331 | + $comment .= "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
|
| 332 | + $comment .= "#\n" ;
|
| 333 | + $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
|
| 334 | + $comment .= "#\n" ;
|
| 335 | + print $out_gz2 $comment ;
|
| 336 | +# print $out_gz3 $comment ;
|
| 337 | +
|
| 338 | + $key_low_prev = "" ;
|
| 339 | + while ($files_in_open > 0)
|
| 340 | + {
|
| 341 | + $key_low = "\xFF\xFF";
|
| 342 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 343 | + {
|
| 344 | + if (($files_in_open == 24) || ($file_in_found [$hour] && $file_in_open [$hour]))
|
| 345 | + {
|
| 346 | + if ($key [$hour] lt $key_low)
|
| 347 | + { $key_low = $key [$hour] ; }
|
| 348 | + }
|
| 349 | + }
|
| 350 | +
|
| 351 | + if (($key_low =~ /^nov/) || ($key_low_prev =~ /^nov/))
|
| 352 | + { &Log ("key_low '$key_low' (key_low_prev '$key_low_prev')\n") ; }
|
| 353 | +
|
| 354 | + $counts = "" ;
|
| 355 | + $total = 0 ;
|
| 356 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 357 | + {
|
| 358 | + if (! $file_in_found [$hour])
|
| 359 | + { $counts .= chr ($hour+ord('A')) . '?' ; }
|
| 360 | + elsif (($files_in_open == 24) || $file_in_open [$hour])
|
| 361 | + {
|
| 362 | + if ($key [$hour] eq $key_low)
|
| 363 | + {
|
| 364 | + $counts .= chr ($hour+ord('A')) . $count [$hour] ;
|
| 365 | + $total += $count [$hour] ;
|
| 366 | + $file = $in_gz [$hour] ;
|
| 367 | + # $line = <$file> ;
|
| 368 | +
|
| 369 | + while ($true)
|
| 370 | + {
|
| 371 | + if ($line = <$file>) # =~ /^a/)
|
| 372 | + {
|
| 373 | + $line =~ s/^([\w\-]+)2 /$1.y /o ;
|
| 374 | + $line =~ s/^([\w\-]+) /$1.z /o ;
|
| 375 | + ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
|
| 376 | + $key [$hour] = "$lang $title" ;
|
| 377 | +
|
| 378 | + last if $lang !~ /\d/ ;
|
| 379 | + }
|
| 380 | + else
|
| 381 | + {
|
| 382 | + if ($bayes)
|
| 383 | + { close $in_gz [$hour] ; }
|
| 384 | + else
|
| 385 | + { $in_gz [$hour] -> close () ; }
|
| 386 | + $files_in_open-- ;
|
| 387 | + $file_in_open [$hour] = $false ;
|
| 388 | + $key [$hour] = "\xFF\xFF";
|
| 389 | +
|
| 390 | + last ;
|
| 391 | + }
|
| 392 | + }
|
| 393 | + }
|
| 394 | + }
|
| 395 | + }
|
| 396 | + if ($lines == 0)
|
| 397 | + { &Log ("\nlines: project key\n") ; }
|
| 398 | +
|
| 399 | + if (++$lines % 100000 == 0)
|
| 400 | + { &Log ("$lines: $key_low\n") ; }
|
| 401 | +
|
| 402 | + # last if $lines > 10000 ; # test
|
| 403 | +
|
| 404 | + last if $key_low eq "\xFF\xFF" ;
|
| 405 | +
|
| 406 | + # Q&D fix for unexplained out of order error for what seems to be invalid language
|
| 407 | + # remember : no suffix on language code gets replaced by .y or .z to fixed sort order
|
| 408 | + # ^nov.mw nov1 1 8765
|
| 409 | + # ^nov1.mw nov1 1 931 <--------------
|
| 410 | + # ^nov 10_dw_oktobre 1 11421
|
| 411 | + ($lang,$title) = split (' ', $key_low) ;
|
| 412 | + if ($lang =~ /\d/)
|
| 413 | + {
|
| 414 | + $invalid_languages {$lang}++ ;
|
| 415 | + &Log ("\nSkip invalid language '$lang'\n") ;
|
| 416 | + next ;
|
| 417 | + }
|
| 418 | +
|
| 419 | +
|
| 420 | + if ($key_low_prev gt $key_low)
|
| 421 | + {
|
| 422 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 423 | + { &Log ("hour $hour: key ${key[$hour]}\n") ; }
|
| 424 | +
|
| 425 | + &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
|
| 426 | + }
|
| 427 | +
|
| 428 | + if (($key_low_prev eq $key_low) && ($files_in_open > 0))
|
| 429 | + {
|
| 430 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 431 | + {
|
| 432 | + if ($file_in_open [$hour])
|
| 433 | + { print "hour $hour: file open, key ${key [$hour]}\n" ; }
|
| 434 | + else
|
| 435 | + { print "hour $hour: file closed, key ${key [$hour]}\n" ; }
|
| 436 | + }
|
| 437 | + &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
|
| 438 | + }
|
| 439 | +
|
| 440 | + # print OUT "$key_low $total$counts\n" ;
|
| 441 | +# print $out_gz1 "$key_low $total$counts\n" ;
|
| 442 | +
|
| 443 | + ($lang,$title) = split (' ', $key_low) ;
|
| 444 | +
|
| 445 | + $title =~ s/\%20/_/g ;
|
| 446 | + $title =~ s/\%3A/:/gi ;
|
| 447 | +# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
| 448 | + if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
|
| 449 | + { $namespace = 'NamespaceArticles' ; }
|
| 450 | + else
|
| 451 | + { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
|
| 452 | + # print "KEY $key_low -> $namespace\n" ;
|
| 453 | +
|
| 454 | + if (($lang ne $langprev) && ($langprev ne ""))
|
| 455 | + {
|
| 456 | + $filter_matches = $lang =~ $reg_exp_filter ;
|
| 457 | + if ($filter_matches)
|
| 458 | + { print "F $lang\n" ; }
|
| 459 | + # else
|
| 460 | + # { print "- $lang\n" ; }
|
| 461 | +
|
| 462 | + &WriteTotalsPerNamespace ($out_gz2, $langprev) ;
|
| 463 | + # &WriteTotalsPerNamespace ($out_gz3, $langprev) ;
|
| 464 | + undef %totals_per_namespace ;
|
| 465 | + }
|
| 466 | + $langprev = $lang ;
|
| 467 | +
|
| 468 | + if (($files_in_found < 24) && ($files_in_found > 0)) # always > 0 actually
|
| 469 | + { $total = sprintf ("%.0f",($total / $files_in_found) * 24) ; }
|
| 470 | +
|
| 471 | + $totals_per_namespace {"$lang $namespace"} += $total ;
|
| 472 | +
|
| 473 | + if ($filter_matches)
|
| 474 | + { print $out_filtered "$key_low $total$counts\n" ; }
|
| 475 | +
|
| 476 | + if ($key_low =~ $reg_exp_track) # track count for NonExistingPageForSquidLogMonitoring on en.z
|
| 477 | + {
|
| 478 | + open $out_track, '>>', $file_track ;
|
| 479 | + binmode $out_track ;
|
| 480 | + print $out_track "$key_low $total$counts\n" ;
|
| 481 | + close $out_track ;
|
| 482 | + }
|
| 483 | +
|
| 484 | + if ($total >= $threshold)
|
| 485 | + { print $out_gz2 "$key_low $total$counts\n" ;
|
| 486 | + # print $out_gz3 "$key_low $total\n" ;
|
| 487 | + }
|
| 488 | +
|
| 489 | + $key_low_prev = $key_low ;
|
| 490 | + # print "OUT $key_low $counts\n" ;
|
| 491 | + }
|
| 492 | +
|
| 493 | + &WriteTotalsPerNamespace ($out_gz2, $langprev) ;
|
| 494 | +# &WriteTotalsPerNamespace ($out_gz3, $langprev) ;
|
| 495 | +
|
| 496 | + &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
|
| 497 | +
|
| 498 | + &Log ("[$lines, $files_in_open] $key_low\n") ;
|
| 499 | +# close OUT ;
|
| 500 | +
|
| 501 | + if ($bayes)
|
| 502 | + {
|
| 503 | + # close $out_gz1 ;
|
| 504 | + close $out_gz2 ;
|
| 505 | + # close $out_gz3 ;
|
| 506 | + close $out_filtered ;
|
| 507 | +
|
| 508 | +# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
|
| 509 | +# $result = `$cmd` ;
|
| 510 | +# if ($result =~ /Everything is Ok/s)
|
| 511 | +# {
|
| 512 | +# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
|
| 513 | +# unlink $file_out2 ;
|
| 514 | +# foreach $file_in (@files_today)
|
| 515 | +# {
|
| 516 | +# print "unlink $dir_in/$file_in\n" ;
|
| 517 | +# unlink "$dir_in/$file_in" ;
|
| 518 | +# }
|
| 519 | +# }
|
| 520 | +# else
|
| 521 | +# {
|
| 522 | +# print "Delete $file_out2.7z\n" ;
|
| 523 | +# unlink "$file_out2.7z" ;
|
| 524 | +# }
|
| 525 | +
|
| 526 | +
|
| 527 | + $cmd = "bzip2 -9 -v $file_out2" ;
|
| 528 | + &Log ("\n\n$cmd ->\n") ;
|
| 529 | + $result = `$cmd` ;
|
| 530 | + &Log ("\n\n") ;
|
| 531 | +
|
| 532 | + # if ($true) # qqq
|
| 533 | + if ($false)
|
| 534 | + {
|
| 535 | + foreach $file_in (@files_today)
|
| 536 | + {
|
| 537 | + print "unlink $dir_in/$file_in\n" ;
|
| 538 | + unlink "$dir_in/$file_in" ;
|
| 539 | + }
|
| 540 | + }
|
| 541 | + else
|
| 542 | + {
|
| 543 | + # print "Delete $file_out2.7z\n" ;
|
| 544 | + # unlink "$file_out2.7z" ;
|
| 545 | + }
|
| 546 | +
|
| 547 | + # $cmd = "bzip2 -9 -v $file_out3" ;
|
| 548 | + # &Log ("\n$cmd ->\n") ;
|
| 549 | + # $result = `$cmd` ;
|
| 550 | + # &Log ("\n\n") ;
|
| 551 | + &Log ("Compression took " . (time-$time_start_compression) . " seconds\n\n") ;
|
| 552 | + }
|
| 553 | + else
|
| 554 | + {
|
| 555 | + # $out_gz1->close() ;
|
| 556 | + $out_gz2->close() ;
|
| 557 | + # $out_gz3->close() ;
|
| 558 | + close $out_filtered ;
|
| 559 | + }
|
| 560 | +
|
| 561 | + &Log ("\nRecords skipped for invalid languages:\n") ;
|
| 562 | + foreach $key (sort keys %invalid_languages)
|
| 563 | + { &Log ("$key: ${invalid_languages {$key}}\n") ; }
|
| 564 | +
|
| 565 | + &Log ("\nTotals per namespace written: $lines_namespace_counts\n") ;
|
| 566 | + &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
| 567 | +}
|
| 568 | +
|
sub WriteTotalsPerNamespace
{
# Flush the per-namespace page view totals collected in the package global
# %totals_per_namespace to the already-opened output handle, one line per
# namespace in the form '@ <lang> <namespace> <total>'.
# Namespaces with fewer than 5 views are lumped into a single '-other-'
# bucket per language to keep the output compact.
# Increments the package global counter $lines_namespace_counts per line written.
  my $out_gz = shift ; # open output file handle (gzip stream or plain file)
  my $lang   = shift ; # language code, used only for the '-other-' bucket line
  my $total ;
  my $totals_per_namespace_other = 0 ; # start at 0 (was undef: warned in '>' test below)

  foreach my $key (sort keys %totals_per_namespace)
  {
    $total = $totals_per_namespace {$key} ;
    if ($total < 5) # too small to list separately
    { $totals_per_namespace_other += $total ; }
    else
    {
      print $out_gz "@ $key $total\n" ;
      $lines_namespace_counts ++ ;
    }
  }
  if ($totals_per_namespace_other > 0 )
  {
    print $out_gz "@ $lang -other- $totals_per_namespace_other\n" ;
    $lines_namespace_counts ++ ;
  }
}
|
| 595 | +
|
sub Log
{
# Write a message both to the console and to the shared LOG file
# (bareword handle LOG, opened in append mode by the main program).
  my $msg = shift ; # lexical now: original assigned the package global $msg,
                    # which &Abort also uses — avoid accidental clobbering
  print $msg ;
  print LOG $msg ;
}
|
| 602 | +
|
sub Abort
{
# Report a fatal error to both console and log file, then terminate.
# Exits with status 1 so that calling shell scripts and cron jobs can
# detect the failure (the original bare 'exit' returned status 0,
# which signals success to the caller).
  my $msg = shift ;
  print "Abort script\nError: $msg\n" ;
  print LOG "Abort script\nError: $msg\n" ;
  exit 1 ;
}
|
| 610 | +
|
| 611 | +#=============================================================================================================
|
| 612 | +
|
| 613 | +#sub Compact
|
| 614 | +#{
|
| 615 | +# my $day = shift ;
|
| 616 | +# &Log ("Compact files for $day\n") ;
|
| 617 | +
|
| 618 | +# $file_in = "pagecounts-$day.out" ;
|
| 619 | +# $file_out1 = "pagecounts-${day}_all.gz" ;
|
| 620 | +# $file_out2 = "pagecounts-${day}_10plus.gz" ;
|
| 621 | +# open IN, "<", $file_in ;
|
| 622 | +# binmode $file_in ;
|
| 623 | +
|
| 624 | +# my $out_gz1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 625 | +# my $out_gz2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 626 | +
|
| 627 | +# open OUT, ">", $file_out ;
|
| 628 | +# binmode $file_out ;
|
| 629 | +
|
| 630 | +# $lang_prev = "" ;
|
| 631 | +# while ($line = <IN>)
|
| 632 | +# {
|
| 633 | +# chomp ($line) ;
|
| 634 | +# ($lang, $title, $counts) = split (' ', $line) ;
|
| 635 | +# $title2 = $title ;
|
| 636 | +# $title =~ s/\%20/_/g ;
|
| 637 | +# $title =~ s/\%3A/:/g ;
|
| 638 | +# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
| 639 | +# # if ($title =~ /[\x00-\x1F]/)
|
| 640 | +# # { &Log ("> '$title2'\n") ; }
|
| 641 | +# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
|
| 642 | +# print $out_gz1 "$lang $title $counts\n" ;
|
| 643 | +# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
|
| 644 | +# if ($counts2 >= $threshold)
|
| 645 | +# { print $out_gz2 "$lang $title $counts\n" ; }
|
| 646 | +# $lang_prev = $lang ;
|
| 647 | +# }
|
| 648 | +#
|
| 649 | +# close IN ;
|
| 650 | +# $out_gz1->close() ;
|
| 651 | +# $out_gz2->close() ;
|
| 652 | +#}
|
| 653 | +
|
| 654 | +
|
| 655 | +#sub GetViewDistribution
|
| 656 | +#{
|
| 657 | +# open OUT, ">", "Views.csv" ;
|
| 658 | +# foreach $file_in (@files)
|
| 659 | +# {
|
| 660 | +# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
|
| 661 | +# $hour = chr(ord('A')+$hour) ;
|
| 662 | +# &Log ("Process $hour $file_in\n") ;
|
| 663 | +
|
| 664 | +# $in_gz1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
|
| 665 | +# while ($line = <$in_gz1>)
|
| 666 | +# {
|
| 667 | +# ($lang,$title,$count,$dummy) = split (' ', $line) ;
|
| 668 | +# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
|
| 669 | +# {
|
| 670 | +# $tot {$hour} += $count ;
|
| 671 | +# if ($count < 3)
|
| 672 | +# { @counts {$hour . substr ($title,0,1)}++ ; }
|
| 673 | +# }
|
| 674 | +# }
|
| 675 | +# $in_gz1->close () ;
|
| 676 | +# }
|
| 677 | +#
|
| 678 | +# print OUT "," ;
|
| 679 | +# foreach $hour ('A'..'X')
|
| 680 | +# { print OUT $hour . ", " ; }
|
| 681 | +# print OUT "\n" ;
|
| 682 | +#
|
| 683 | +# print OUT "," ;
|
| 684 | +# foreach $hour ('A'..'X')
|
| 685 | +# { print OUT $tot {$hour} . ", " ; }
|
| 686 | +# print OUT "\n" ;
|
| 687 | +#
|
| 688 | +# for ($c=0; $c < 256; $c++)
|
| 689 | +# {
|
| 690 | +# # do not print chars " and , as such: confuses csv format
|
| 691 | +# if ($c < 33)
|
| 692 | +# { print OUT "chr($c), " ; }
|
| 693 | +# elsif (chr($c) eq '"')
|
| 694 | +# { print OUT "dquote, " ; }
|
| 695 | +# elsif (chr($c) eq ',')
|
| 696 | +# { print OUT "comma, " ; }
|
| 697 | +# else
|
| 698 | +# { print OUT chr($c) . ", " ; }
|
| 699 | +#
|
| 700 | +# foreach $hour ('A'..'X')
|
| 701 | +# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
|
| 702 | +#
|
| 703 | +# if ($c < 255)
|
| 704 | +# { print OUT "\n" ; }
|
| 705 | +# }
|
| 706 | +# close OUT ;
|
| 707 | +#}
|
| 708 | +
|
| 709 | +
|
| 710 | +#sub RecompactVisitorStats
|
| 711 | +#{
|
| 712 | +# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
|
| 713 | +# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
|
| 714 | +# local (*DIR);
|
| 715 | +# opendir (DIR, ".");
|
| 716 | +# @files = () ;
|
| 717 | +# while ($file_in = readdir (DIR))
|
| 718 | +# {
|
| 719 | +# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
|
| 720 | +#
|
| 721 | +# push @files, $file_in ;
|
| 722 | +# }
|
| 723 | +
|
| 724 | +# $filecnt = $#files+1 ;
|
| 725 | +# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
|
| 726 | +
|
| 727 | +# foreach $file (@files)
|
| 728 | +# { &RecompactVisitorStats2 ($file) ; }
|
| 729 | +# closedir (DIR, ".");
|
| 730 | +#}
|
| 731 | +
|
| 732 | +#sub RecompactVisitorStats2
|
| 733 | +#{
|
| 734 | +## http://www.7-zip.org/7z.html
|
| 735 | +# my $file = shift ;
|
| 736 | +# my $time_start = time ;
|
| 737 | +# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
| 738 | +## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
| 739 | +# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
|
| 740 | +# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
| 741 | +# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
| 742 | +
|
| 743 | +# &Log ("Process $file_in\n") ;
|
| 744 | +
|
| 745 | +# $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
| 746 | +# binmode $in_gz ;
|
| 747 | +# open OUT, ">", $file_out ;
|
| 748 | +# binmode OUT ;
|
| 749 | +
|
| 750 | +# my ($title, $title2) ;
|
| 751 | +# while ($line = <$in_gz>)
|
| 752 | +# {
|
| 753 | +# chomp ($line) ;
|
| 754 | +# ($lang,$title,$counts) = split (" ", $line) ;
|
| 755 | +
|
| 756 | +# if ($lang ne $lang_prev) { print "$lang " ; }
|
| 757 | +# $lang_prev = $lang ;
|
| 758 | +
|
| 759 | +# # test pagecounts-20080701_fd.gz
|
| 760 | +# # all records 424 Mib compressed (1984 uncompressed)
|
| 761 | +# # count > 1 212 Mib compressed ( 733 uncompressed)
|
| 762 | +# # count > 2 169 Mib compressed ( 551 uncompressed)
|
| 763 | +# next if $counts <= 1 ;
|
| 764 | +
|
| 765 | +# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
| 766 | +# $title =~ s/\s/_/g;
|
| 767 | +# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
| 768 | +# $lang =~ s/\.y/2/ ;
|
| 769 | +
|
| 770 | +# print OUT "$lang $title $counts\n" ;
|
| 771 | +# }
|
| 772 | +
|
| 773 | +# print "Close files\n" ;
|
| 774 | +# $in_gz -> close () ;
|
| 775 | +# close (OUT) ;
|
| 776 | +
|
| 777 | +# &Log ("Compress $file_out\n") ;
|
| 778 | +
|
| 779 | +# unlink $file_7z ;
|
| 780 | +# $result = `$path_7z a $file_7z $file_out` ;
|
| 781 | +# &Log ("Compressed\n") ;
|
| 782 | +# &Log ("Result " . ($result+0) . " \n") ;
|
| 783 | +# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
|
| 784 | +# { unlink $file_out ; }
|
| 785 | +
|
| 786 | +# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
| 787 | +## 0 No error
|
| 788 | +## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
|
| 789 | +## 2 Fatal error
|
| 790 | +## 7 Command line error
|
| 791 | +## 8 Not enough memory for operation
|
| 792 | +## 255 User stopped the process
|
| 793 | +#}
|
| 794 | +
|
| 795 | +
|
| 796 | +#sub RecompactVisitorStats3
|
| 797 | +#{
|
| 798 | +## http://www.7-zip.org/7z.html
|
| 799 | +# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
| 800 | +# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
| 801 | +# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
| 802 | +# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
| 803 | +## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
|
| 804 | +
|
| 805 | +# $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
| 806 | +# binmode $in_gz ;
|
| 807 | +## $out_gz = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 808 | +## binmode $out_gz ;
|
| 809 | +# open OUT, ">", $file_out ;
|
| 810 | +# binmode OUT ;
|
| 811 | +## open LOG, ">", $file_log ;
|
| 812 | +## binmode LOG ;
|
| 813 | +
|
| 814 | +# my ($title, $title2) ;
|
| 815 | +# while ($line = <$in_gz>)
|
| 816 | +# {
|
| 817 | +# chomp ($line) ;
|
| 818 | +# ($lang,$title,$counts) = split (" ", $line) ;
|
| 819 | +
|
| 820 | +# if ($lang ne $lang_prev) { print "$lang\n" ; }
|
| 821 | +## last if $lang gt "fs" ;
|
| 822 | +# $lang_prev = $lang ;
|
| 823 | +
|
| 824 | +# # test pagecounts-20080701_fd.gz
|
| 825 | +# # all records 424 Mib compressed (1984 uncompressed)
|
| 826 | +# # count > 1 212 Mib compressed ( 733 uncompressed)
|
| 827 | +# # count > 2 169 Mib compressed ( 551 uncompressed)
|
| 828 | +# next if $counts <= 1 ;
|
| 829 | +
|
| 830 | +## next if $lang !~ /^(?:ar|fr)/ ;
|
| 831 | +
|
| 832 | +#if ($false)
|
| 833 | +#{
|
| 834 | +# $title1b = $title ;
|
| 835 | +# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
|
| 836 | +# $title1b =~ s/\%28/(/g ;
|
| 837 | +# $title1b =~ s/\%29/)/g ;
|
| 838 | +# $title1b =~ s/\%3A/:/g ;
|
| 839 | +# $title1b =~ s/\%2F/\//g ;
|
| 840 | +# $title1b =~ s/\%5C/\\/g ;
|
| 841 | +# $title1b =~ s/\%2A/*/g ;
|
| 842 | +# $title1b =~ s/\%21/!/g ;
|
| 843 | +# $title1b =~ s/\%5F/_/g ;
|
| 844 | +# $title1b =~ s/\%2C/,/g ;
|
| 845 | +# $title1b =~ s/\%2E/./g ;
|
| 846 | +# $title1b =~ s/\%2D/-/g ;
|
| 847 | +# $title1b =~ s/\%25/%/g ;
|
| 848 | +# $title1b =~ s/\%7E/~/g ;
|
| 849 | +# $title1b =~ s/\%27/'/g ;
|
| 850 | +# $title1b =~ s/\%3D/=/g ;
|
| 851 | +# $title1b =~ s/\%26/&/g ;
|
| 852 | +# $title1b =~ s/\%3B/;/g ;
|
| 853 | +# $title1b =~ s/\%3F/?/g ;
|
| 854 | +# $title1b =~ s/\%2B/+/g ;
|
| 855 | +# $title2 = $title1b ;
|
| 856 | +# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
|
| 857 | +
|
| 858 | +# if ($title1b ne $title2) # if changed anything at all
|
| 859 | +# {
|
| 860 | +# $title3 = uri_escape ($title2) ;
|
| 861 | +# $title3 =~ s/\%28/(/g ;
|
| 862 | +# $title3 =~ s/\%29/)/g ;
|
| 863 | +# $title3 =~ s/\%3A/:/g ;
|
| 864 | +# $title3 =~ s/\%2F/\//g ;
|
| 865 | +# $title3 =~ s/\%5C/\\/g ;
|
| 866 | +# $title3 =~ s/\%2A/\*/g ;
|
| 867 | +# $title3 =~ s/\%21/\!/g ;
|
| 868 | +# $title3 =~ s/\%5F/\_/g ;
|
| 869 | +# $title3 =~ s/\%2C/,/g ;
|
| 870 | +# $title3 =~ s/\%2E/./g ;
|
| 871 | +# $title3 =~ s/\%2D/-/g ;
|
| 872 | +# $title3 =~ s/\%25/%/g ;
|
| 873 | +# $title3 =~ s/\%7E/~/g ;
|
| 874 | +# $title3 =~ s/\%27/'/g ;
|
| 875 | +# $title3 =~ s/\%3D/=/g ;
|
| 876 | +# $title3 =~ s/\%26/&/g ;
|
| 877 | +# $title3 =~ s/\%3B/;/g ;
|
| 878 | +# $title3 =~ s/\%3F/?/g ;
|
| 879 | +# $title3 =~ s/\%2B/+/g ;
|
| 880 | +
|
| 881 | +# if ($title1b eq $title3) # process reversible ?
|
| 882 | +# {
|
| 883 | +# $y++ ;
|
| 884 | +# $title2 =~ s/\s/_/g;
|
| 885 | +# $title = $title2 ;
|
| 886 | +# }
|
| 887 | +# else
|
| 888 | +# {
|
| 889 | +# $n++ ;
|
| 890 | +# print "Y $y N $n\n$title\n$title3\n\n" ;
|
| 891 | +# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
|
| 892 | +# }
|
| 893 | +# }
|
| 894 | +#}
|
| 895 | +# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
| 896 | +# $title =~ s/\s/_/g;
|
| 897 | +# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
| 898 | +# $lang =~ s/\.y/2/ ;
|
| 899 | +
|
| 900 | +# # print $out_gz "$lang $title $counts\n" ;
|
| 901 | +# print OUT "$lang $title $counts\n" ;
|
| 902 | +# }
|
| 903 | +
|
| 904 | +# print "Close files\n" ;
|
| 905 | +# $in_gz -> close () ;
|
| 906 | +## $out_gz -> close () ;
|
| 907 | +# close (OUT) ;
|
| 908 | +# $result = `$path_7z a $file_out $file_txt` ;
|
| 909 | +# print $result ;
|
| 910 | +#}
|
| 911 | +
|
| 912 | +
|
| 913 | +
|
| 914 | +# test (partial) reversibility of process
|
| 915 | +#sub UncompactVisitorStats
|
| 916 | +#{
|
| 917 | +# my $file_in = "out/2009-03/pagecounts-20090301_fdt1" ;
|
| 918 | +# my $dir_out = "out" ;
|
| 919 | +# # $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
| 920 | +# open $in_gz, '<', $file_in ;
|
| 921 | +# binmode $in_gz ;
|
| 922 | +
|
| 923 | +# for ($h=0 ; $h<=23 ; $h++)
|
| 924 | +# {
|
| 925 | +# $time = sprintf ("%02d",$h) . "0000" ;
|
| 926 | +## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
|
| 927 | +# $file_out = "$dir_out/pagecounts-20090301-$time" ;
|
| 928 | +# open $out_gz [$h], '>', $file_out ;
|
| 929 | +## $out_gz [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
|
| 930 | +# binmode $out_gz [$h] ;
|
| 931 | +# }
|
| 932 | +
|
| 933 | +# while ($line = <$in_gz>)
|
| 934 | +# {
|
| 935 | +# next if $line =~ /^#/ ;
|
| 936 | +# next if $line =~ /^@/ ;
|
| 937 | +# chomp ($line) ;
|
| 938 | +## print "$line\n" ;
|
| 939 | +# if ($lines++ > 10000) { exit ; }
|
| 940 | +# ($lang,$title,$counts) = split (" ", $line) ;
|
| 941 | +# $lang =~ s/\.z// ;
|
| 942 | +# $lang =~ s/\.y/2/ ;
|
| 943 | +# $counts =~ s/^\d+// ; # remove (redundant) preceding total
|
| 944 | +# while ($counts ne "")
|
| 945 | +# {
|
| 946 | +# $letter = substr ($counts,0,1) ;
|
| 947 | +# $counts = substr ($counts,1) ;
|
| 948 | +# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
|
| 949 | +# $counts =~ s/^\d+(.*)$/$1/ ;
|
| 950 | +# $h = ord ($letter) - ord ('A') ;
|
| 951 | +# $file = $out_gz [$h] ;
|
| 952 | +# $writes {$h} ++ ;
|
| 953 | +# print $file "$lang $title $count\n" ;
|
| 954 | +# }
|
| 955 | +
|
| 956 | +# }
|
| 957 | +
|
| 958 | +# for ($h=0 ; $h<=23 ; $h++)
|
| 959 | +# {
|
| 960 | +## $out_gz [$h] -> close () ;
|
| 961 | +# close $out_gz [$h] ;
|
| 962 | +# }
|
| 963 | +#}
|
| 964 | +
|
| 965 | +
|
Index: trunk/wikistats/dammit.lt/dammit_scan.sh |
— | — | @@ -0,0 +1,10 @@ |
#!/bin/sh
# Scan compacted dammit.lt page count files for page titles matching a pattern.
i='/a/dammit.lt/pagecounts'        # input dir
o='/home/ezachte/wikistats/scans'  # output dir
f=20090424                         # from date
t=20091110                         # till date
#p="swine.*flu\nswine.*influenza\nflu.*outbreak\ninfluenza.*outbreak\ngripe.*porcina\npandem\n"
p=".*influensa\n.*H1N1.*\npandemi\n"
#p="#$o/pattern_influenza_en.txt"        # file name
#p="#$o/pattern_pandemic_shortlist.txt"  # file name
#p=html
# Quote all expansions: $p contains '*' and would otherwise be subject to
# shell glob expansion and word splitting.
perl /a/dammit.lt/DammitScanCompactedFiles.pl -i "$i" -o "$o" -f "$f" -t "$t" -p "$p"
Property changes on: trunk/wikistats/dammit.lt/dammit_scan.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 12 | + native |
Index: trunk/wikistats/dammit.lt/dammit_filter.sh |
— | — | @@ -0,0 +1,5 @@ |
# Run the dammit.lt daily page count filter. The Perl script takes no
# command line arguments; directories are hard-coded inside it.
# The assignments below are kept, commented out, for reference only.
#i='/a/dammit.lt/pagecounts' # input dir
#o='/home/ezachte/wikistats/scans' # output dir
#f=20090424 # from date
#t=20091110 # till date
perl /a/dammit.lt/DammitFilterDailyPageCountsPerLanguage.pl
Property changes on: trunk/wikistats/dammit.lt/dammit_filter.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 7 | + native |
Index: trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl |
— | — | @@ -0,0 +1,1568 @@ |
#!/usr/local/bin/perl

# Compact hourly (or daily) webstatscollector pagecount dumps into one file
# per day (mode -d) or per month (mode -m).
# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl

# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases

# Ideas:
# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
# 2 frequency distribution hits per file per first letter _-> manifest crawler
#   assuming crawler collects articles in alphabetical order
# 3 first letter uppercase -> sort (in sections per first two chars ?)

  use lib "/home/ezachte/lib" ;
  use EzLib ;

  # NOTE(review): $true is presumably exported by EzLib (it is also
  # (re)assigned further down) — confirm, otherwise this sets undef.
  $trace_on_exit = $true ;
  ez_lib_version (13) ;

  # set defaults mainly for tests on local machine
# default_argv "-i C:/bayes_backup/a/dammit.lt/pagecounts|-t C:/bayes_backup/a/dammit.lt|-f C:/bayes_backup/a/dammit.lt|-o C:/bayes_backup/a/dammit.lt|-d 20101215" ;
  default_argv "-m|-i C:/bayes_backup/a/dammit.lt/pagecounts|-o C:/bayes_backup/a/dammit.lt|-d 200812" ;

  use CGI qw(:all);
  use URI::Escape;
  use Cwd ;
  $bayes = -d "/a/dammit.lt" ; # true when running on production host
# $path_7za = "/usr/lib/p7zip/7za" ;

  use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error) ;

  if (! $bayes)
  {
    print "Test on Windows\n" ;
    use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
    use IO::Compress::Gzip qw(gzip $GzipError) ;         # install IO-Compress-Zlib
  }

  $| = 1; # flush screen output

  $true = 1 ;
  $false = 0 ;
  $threshold = 0 ;       # minimum daily total for a title to be written out
  undef %totals_per_namespace ;

  ($sec,$min,$hour,$mday,$month,$year,$wday,$yday,$isdst) = gmtime(time);
  $year = $year + 1900;
  $month++ ;
  # was "%4d-%2d": space-padded month ("2010- 5"); zero-padding is what the
  # date formats elsewhere in this script use
  $month_run = sprintf ("%04d-%02d", $year, $month) ;
  print "Current month: $month_run\n" ;

  # projects whose counts are copied to a separate 'filtered' file
  $filter = "^(?:outreach|quality|strategy|usability)\.m\$" ;
  print "Filter: $filter\n" ;
  $reg_exp_filter = qr"$filter" ;

  # artificial page used to monitor squid log coverage
  $track = "NonExistingPageForSquidLogMonitoring" ;
  print "Track: $track\n" ;
  $reg_exp_track = qr"$track" ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"

  # options land in package hash %options (the unused 'my $options' scalar
  # from the original has been removed)
  getopt ("iodft", \%options) ;

  $compactmonth = $options {"m"} ; # -m flag: compact a whole month
  $compactday = ! $compactmonth ;  # default: compact per day

  if (! defined ($options {"i"})) { &Abort ("Specify input dir: -i dirname") } ;
  if ($compactday)
  {
    if (! defined ($options {"o"})) { &Abort ("Specify output dir: -o dirname") } ;
    if (! defined ($options {"f"})) { &Abort ("Specify filter dir: -f dirname") } ;
    if (! defined ($options {"t"})) { &Abort ("Specify tracking dir: -t dirname") } ;
    if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") } ;
  }
  if ($compactmonth)
  {
    if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymm, yyyy* or *") } ;
  }


  $dir_in       = $options {"i"} ;
  $dir_out      = $options {"o"} ;
  $dir_filtered = $options {"f"} ;
  $dir_track    = $options {"t"} ;
  $daterange    = $options {"d"} ;

  $work = cwd() ;
  print "Work dir $work\n" ;

  # relative names (no slash or backslash) are taken relative to the work dir
  if ($dir_in !~ /[\/\\]/)
  { $dir_in = "$work/$dir_in" ; }

  if ($dir_out eq '')
  { $dir_out = "$work" ; }
  elsif ($dir_out !~ /[\/\\]/)
  { $dir_out = "$work/$dir_out" ; }

  # NOTE(review): dead branch — $dir_out was already defaulted to $work above,
  # so it can never be '' here; intent was probably to default to $dir_in for
  # month mode. Confirm before reordering.
  if ($compactmonth && ($dir_out eq ''))
  { $dir_out = $dir_in ; }

  if ($dir_filtered !~ /[\/\\]/)
  { $dir_filtered = "$work/$dir_filtered" ; }

  if ($dir_track !~ /[\/\\]/)
  { $dir_track = "$work/$dir_track" ; }

  if (! -d $dir_in)
  { &Abort ("Input dir not found: $dir_in") } ;

  if (! -d $dir_out)
  {
    print "Create output dir $dir_out\n" ;
    mkdir $dir_out ;
    if (! -d $dir_out)
    { &Abort ("Output dir could not be created.") } ;
  }

  open LOG, ">>", "$work/WikiStatsCompactDammitFiles.log" ;

  if ($compactday)
  {
    if (($daterange !~ /^\d{8}$/) && ($daterange !~ /^\d{6}\*$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
    { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") ; }

    &Log ("\nCompress pagecount files\nin: $dir_in\nout: $dir_out\nflt: $dir_filtered\ntrack: $dir_track\ndate range: $daterange\n") ;
    $daterange =~ s/\*/\\d+/ ; # turn shell-style wildcard into regex snippet

    &CompactVisitorStatsOneDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
    # &UncompactVisitorStats ; # test only, to see if process is revertible
  }

  if ($compactmonth)
  {
    if (($daterange !~ /^\d{6}$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
    { &Abort ("Specify date range: as yyyymm, yyyy* or *") ; }

    ($daterange2 = $daterange) =~ s/\*/\\d+/ ; # regex version of the range
    &Log ("\nCompress pagecount files\nin: $dir_in\nout: $dir_out\ndate range: $daterange->$daterange2\n") ;

    &CompactVisitorStatsOneMonth ($dir_in, $dir_out, $daterange2) ;
  }

  &Log ("\nReady\n") ;
  close LOG ;
  exit ;
|
| 150 | +
|
sub CompactVisitorStatsOneDay
{
# Collect all hourly pagecount dump files in $dir_in whose name matches
# $daterange (already converted by the caller into a regex snippet),
# determine which calendar dates they cover, and merge each date's files
# into one daily file via &MergeFilesFullDay.
# Fills the package globals @files (sorted file list, also iterated by
# &MergeFilesFullDay) and %process_dates (date -> number of hourly files).
  my $dir_in       = shift ;
  my $dir_out      = shift ;
  my $dir_filtered = shift ;
  my $dir_track    = shift ;
  my $daterange    = shift ;

  chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;

  # lexical dir handle, and check the result (original used an unchecked
  # bareword handle)
  opendir (my $dh, ".") || &Abort ("Cannot open dir $dir_in\n") ;
  # keep @files a package global: &MergeFilesFullDay reads it too;
  # '\.gz' fixes the unescaped dot ('.gz') in the original pattern
  @files = sort grep { /^pagecounts-$daterange-\d{6,6}\.gz$/ } readdir ($dh) ;
  closedir ($dh) ;

# if (($daterange =~ /^\d{8}$/) and ($#files < 23))
# { &Abort ("Less than 24 files found for date $daterange\n" . @files) ; }

  # count how many hourly files exist per calendar day
  foreach my $file (@files)
  {
    my $date = substr ($file,11,8) ; # 'pagecounts-YYYYMMDD-HHMMSS.gz'
    $process_dates {$date}++ ;
  }

  &Log ("\n\n") ;

  foreach my $date (sort keys %process_dates)
  { &MergeFilesFullDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $date) ; }
}
|
| 190 | +
|
| 191 | +sub MergeFilesFullDay
|
| 192 | +{
|
| 193 | + my $dir_in = shift ;
|
| 194 | + my $dir_out = shift ;
|
| 195 | + my $dir_filtered = shift ;
|
| 196 | + my $dir_track = shift ;
|
| 197 | + my $date = shift ;
|
| 198 | +
|
| 199 | + my $year = substr ($date,0,4) ;
|
| 200 | + my $month = substr ($date,4,2) ;
|
| 201 | + my $day = substr ($date,6,2) ;
|
| 202 | +
|
| 203 | + my ($file_out1, $file_out2, $file_out3, $out_day, $hours_missing) ;
|
| 204 | +
|
| 205 | + $dir_out = "$dir_out/${year}-${month}" ;
|
| 206 | + if (! -d $dir_out)
|
| 207 | + {
|
| 208 | + mkdir $dir_out ;
|
| 209 | + if (! -d $dir_out)
|
| 210 | + { &Abort ("Output dir could not be created: $dir_out") } ;
|
| 211 | + }
|
| 212 | +
|
| 213 | + my @files_today = () ;
|
| 214 | + foreach $file (@files)
|
| 215 | + {
|
| 216 | + next if $file !~ /^pagecounts-$date-\d{6,6}.gz$/ ;
|
| 217 | +
|
| 218 | + push @files_today, $file ;
|
| 219 | + }
|
| 220 | +
|
| 221 | + # very few times (nearly) duplicate files are found for same hour
|
| 222 | + # keep the largest and presumably most complete one
|
| 223 | + for ($i = 0 ; $i < $#files_today ; $i++)
|
| 224 | + {
|
| 225 | + for ($j = $i+1 ; $j <= $#files_today ; $j++)
|
| 226 | + {
|
| 227 | + if (substr ($files_today [$i],0,25) eq substr ($files_today [$j],0,25))
|
| 228 | + {
|
| 229 | + $size_i = -s $files_today [$i] ;
|
| 230 | + $size_j = -s $files_today [$j] ;
|
| 231 | + print "${files_today [$i]}: $size_i\n" ;
|
| 232 | + print "${files_today [$j]}: $size_j\n" ;
|
| 233 | + if ($size_i > $size_j)
|
| 234 | + {
|
| 235 | + print "Keep ${files_today [$i]}\n\n" ;
|
| 236 | + $files_today [$j]= "" ;
|
| 237 | + }
|
| 238 | + else
|
| 239 | + {
|
| 240 | + print "Keep ${files_today [$j]}\n\n" ;
|
| 241 | + $files_today [$i]= "" ;
|
| 242 | + }
|
| 243 | + }
|
| 244 | + }
|
| 245 | + }
|
| 246 | +
|
| 247 | + $time_start = time ;
|
| 248 | + $lines = 0 ;
|
| 249 | +
|
| 250 | + undef @in_hour ;
|
| 251 | +
|
| 252 | + # $file_out = "pagecounts-$year$month$day_full_day" ;
|
| 253 | + # open OUT, ">", $file_out ;
|
| 254 | + # binmode $file_out ;
|
| 255 | +
|
| 256 | +# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 257 | + if ($bayes)
|
| 258 | + {
|
| 259 | + # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
|
| 260 | + $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
|
| 261 | + # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
|
| 262 | + if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
|
| 263 | + {
|
| 264 | + &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
|
| 265 | + return ;
|
| 266 | + }
|
| 267 | + if ($#files_today < 23)
|
| 268 | + {
|
| 269 | + &Log ("\nLess than 24 files found for target file '$file_out2.7z'. Skip this date.\n") ;
|
| 270 | + return ;
|
| 271 | + }
|
| 272 | +
|
| 273 | + open $out_day2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
|
| 274 | + # open $out_day3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
|
| 275 | + }
|
| 276 | + else
|
| 277 | + {
|
| 278 | + # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
|
| 279 | + $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, hourly data, count above threshold
|
| 280 | + $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 281 | + # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
|
| 282 | + # $out_day3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 283 | + }
|
| 284 | +
|
| 285 | +# binmode $out_day1 ;
|
| 286 | + binmode $out_day2 ;
|
| 287 | +# binmode $out_day3 ;
|
| 288 | +
|
| 289 | + # print "File_out1 $file_out1\n" ;
|
| 290 | + print "File_out2 $file_out2\n" ;
|
| 291 | + # print "File_out3 $file_out3\n" ;
|
| 292 | +
|
| 293 | + $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
|
| 294 | + &Log ("\nFilter file: $file_filtered\n") ;
|
| 295 | + open $out_filtered, '>', $file_filtered ;
|
| 296 | + binmode $out_filtered ;
|
| 297 | +
|
| 298 | + $file_track = "$dir_track/_PageCountsForSquidLogTracking.txt" ;
|
| 299 | + &Log ("Tracking file: $file_track\n\n") ;
|
| 300 | +
|
| 301 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 302 | + { $file_in_found [$hour] = $false ; }
|
| 303 | +
|
| 304 | + $files_in_open = 0 ;
|
| 305 | + $files_in_found = 0 ;
|
| 306 | + $langprev = "" ;
|
| 307 | + foreach $file_in (@files_today)
|
| 308 | + {
|
| 309 | + next if $file_in eq "" ;
|
| 310 | +
|
| 311 | + ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
|
| 312 | + $hour = (0+$hour) ;
|
| 313 | + # print " file found '$file_in'\n" ;
|
| 314 | +
|
| 315 | + if ($bayes)
|
| 316 | + { open $in_hour [$hour], "-|", "gzip -dc \"$file_in\"" || &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
|
| 317 | + else
|
| 318 | + { $in_hour [$hour] = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ; }
|
| 319 | + binmode $in_hour [$hour] ;
|
| 320 | +
|
| 321 | + $files_in_open++ ;
|
| 322 | + $file_in_found [$hour] = $true ;
|
| 323 | + $file_in_open [$hour] = $true ;
|
| 324 | + $files_in_found ++ ;
|
| 325 | + $file = $in_hour [$hour] ;
|
| 326 | + $line = <$file> ;
|
| 327 | + $line =~ s/^(\w+)2 /$1.y /o ;# project wikipedia comes without suffix -> out of sort order, make it fit by appending suffix
|
| 328 | + $line =~ s/^(\w+) /$1.z /o ;
|
| 329 | +
|
| 330 | + ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
|
| 331 | + $key [$hour] = "$lang $title" ;
|
| 332 | + }
|
| 333 | +
|
| 334 | + $comment = "# Wikimedia page request counts for $date, each line shows 'subproject title counts'\n" ;
|
| 335 | + if ($threshold > 0 )
|
| 336 | + { $comment .= "# Count for articles with less than $threshold requests per full day are omitted\n" ; }
|
| 337 | + $comment .= "# Subproject is language code, followed by project code\n" ;
|
| 338 | + $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ;
|
| 339 | + $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
|
| 340 | + $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n\n" ;
|
| 341 | + print $out_day2 $comment ;
|
| 342 | +# print $out_day3 $comment ;
|
| 343 | +
|
| 344 | + if ($files_in_found < 24)
|
| 345 | + {
|
| 346 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 347 | + {
|
| 348 | + if (! $file_in_found [$hour])
|
| 349 | + { $hours_missing .= "$hour," ; }
|
| 350 | + }
|
| 351 | + $hours_missing =~ s/,$// ;
|
| 352 | + &Log ("Merge files: date = $date, only $files_in_found files found!\n\n") ;
|
| 353 | + }
|
| 354 | + else
|
| 355 | + { &Log ("Merge files: date = $date\n") ; }
|
| 356 | +
|
| 357 | + if ($hours_missing ne '')
|
| 358 | + {
|
| 359 | + print $out_day2 "#\n" ;
|
| 360 | + print $out_day2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
| 361 | + # print $out_day3 "#\n" ;
|
| 362 | + # print $out_day3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
|
| 363 | + }
|
| 364 | + $comment = "#\n" ;
|
| 365 | + $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
|
| 366 | + $comment .= "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
|
| 367 | + $comment .= "# Please reconcile with real namespace name strings later\n" ;
|
| 368 | + $comment .= "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
|
| 369 | + $comment .= "#\n" ;
|
| 370 | + $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
|
| 371 | + $comment .= "#\n" ;
|
| 372 | + print $out_day2 $comment ;
|
| 373 | +# print $out_day3 $comment ;
|
| 374 | +
|
| 375 | + $key_low_prev = "" ;
|
| 376 | + while ($files_in_open > 0)
|
| 377 | + {
|
| 378 | + $key_low = "\xFF\xFF";
|
| 379 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 380 | + {
|
| 381 | + if (($files_in_open == 24) || ($file_in_found [$hour] && $file_in_open [$hour]))
|
| 382 | + {
|
| 383 | + if ($key [$hour] lt $key_low)
|
| 384 | + { $key_low = $key [$hour] ; }
|
| 385 | + }
|
| 386 | + }
|
| 387 | +
|
| 388 | + if (($key_low =~ /^nov/) || ($key_low_prev =~ /^nov/))
|
| 389 | + { &Log ("key_low '$key_low' (key_low_prev '$key_low_prev')\n") ; }
|
| 390 | +
|
| 391 | + $counts = "" ;
|
| 392 | + $total = 0 ;
|
| 393 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 394 | + {
|
| 395 | + if (! $file_in_found [$hour])
|
| 396 | + { $counts .= chr ($hour+ord('A')) . '?' ; }
|
| 397 | + elsif (($files_in_open == 24) || $file_in_open [$hour])
|
| 398 | + {
|
| 399 | + if ($key [$hour] eq $key_low)
|
| 400 | + {
|
| 401 | + $counts .= chr ($hour+ord('A')) . $count [$hour] ;
|
| 402 | + $total += $count [$hour] ;
|
| 403 | + $file = $in_hour [$hour] ;
|
| 404 | + # $line = <$file> ;
|
| 405 | +
|
| 406 | + while ($true)
|
| 407 | + {
|
| 408 | + if ($line = <$file>) # =~ /^a/)
|
| 409 | + {
|
| 410 | + $line =~ s/^([\w\-]+)2 /$1.y /o ; # project wikipedia comes without suffix -> out of sort order, make it fit by appending suffix
|
| 411 | + $line =~ s/^([\w\-]+) /$1.z /o ;
|
| 412 | + ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
|
| 413 | + $key [$hour] = "$lang $title" ;
|
| 414 | +
|
| 415 | + last if $lang !~ /\d/ ;
|
| 416 | + }
|
| 417 | + else
|
| 418 | + {
|
| 419 | + if ($bayes)
|
| 420 | + { close $in_hour [$hour] ; }
|
| 421 | + else
|
| 422 | + { $in_hour [$hour] -> close () ; }
|
| 423 | + $files_in_open-- ;
|
| 424 | + $file_in_open [$hour] = $false ;
|
| 425 | + $key [$hour] = "\xFF\xFF";
|
| 426 | +
|
| 427 | + last ;
|
| 428 | + }
|
| 429 | + }
|
| 430 | + }
|
| 431 | + }
|
| 432 | + }
|
| 433 | + if ($lines == 0)
|
| 434 | + { &Log ("\nlines: project key\n") ; }
|
| 435 | +
|
| 436 | + if (++$lines % 100000 == 0)
|
| 437 | + { &Log ("$lines: $key_low\n") ; }
|
| 438 | +
|
| 439 | + # last if $lines > 10000 ; # test
|
| 440 | +
|
| 441 | + last if $key_low eq "\xFF\xFF" ;
|
| 442 | +
|
| 443 | + # Q&D fix for unexplained out of order error for what seems to be invalid language
|
| 444 | + # remember : no suffix on language code gets replaced by .y or .z to fixed sort order
|
| 445 | + # ^nov.mw nov1 1 8765
|
| 446 | + # ^nov1.mw nov1 1 931 <--------------
|
| 447 | + # ^nov 10_dw_oktobre 1 11421
|
| 448 | + ($lang,$title) = split (' ', $key_low) ;
|
| 449 | + if ($lang =~ /\d/)
|
| 450 | + {
|
| 451 | + $invalid_languages {$lang}++ ;
|
| 452 | + &Log ("\nSkip invalid language '$lang'\n") ;
|
| 453 | + next ;
|
| 454 | + }
|
| 455 | +
|
| 456 | +
|
| 457 | + if ($key_low_prev gt $key_low)
|
| 458 | + {
|
| 459 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 460 | + { &Log ("hour $hour: key ${key[$hour]}\n") ; }
|
| 461 | +
|
| 462 | + &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
|
| 463 | + }
|
| 464 | +
|
| 465 | + if (($key_low_prev eq $key_low) && ($files_in_open > 0))
|
| 466 | + {
|
| 467 | + for ($hour = 0 ; $hour < 24 ; $hour++)
|
| 468 | + {
|
| 469 | + if ($file_in_open [$hour])
|
| 470 | + { print "hour $hour: file open, key ${key [$hour]}\n" ; }
|
| 471 | + else
|
| 472 | + { print "hour $hour: file closed, key ${key [$hour]}\n" ; }
|
| 473 | + }
|
| 474 | + &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
|
| 475 | + }
|
| 476 | +
|
| 477 | + # print OUT "$key_low $total$counts\n" ;
|
| 478 | +# print $out_day1 "$key_low $total$counts\n" ;
|
| 479 | +
|
| 480 | + ($lang,$title) = split (' ', $key_low) ;
|
| 481 | +
|
| 482 | + $title =~ s/\%20/_/g ;
|
| 483 | + $title =~ s/\%3A/:/gi ;
|
| 484 | +# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
| 485 | + if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
|
| 486 | + { $namespace = 'NamespaceArticles' ; }
|
| 487 | + else
|
| 488 | + { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
|
| 489 | + # print "KEY $key_low -> $namespace\n" ;
|
| 490 | +
|
| 491 | + if (($lang ne $langprev) && ($langprev ne ""))
|
| 492 | + {
|
| 493 | + $filter_matches = $lang =~ $reg_exp_filter ;
|
| 494 | + if ($filter_matches)
|
| 495 | + { print "F $lang\n" ; }
|
| 496 | + # else
|
| 497 | + # { print "- $lang\n" ; }
|
| 498 | +
|
| 499 | + &WriteTotalsPerNamespace ($out_day2, $langprev) ;
|
| 500 | + # &WriteTotalsPerNamespace ($out_day3, $langprev) ;
|
| 501 | + undef %totals_per_namespace ;
|
| 502 | + }
|
| 503 | + $langprev = $lang ;
|
| 504 | +
|
| 505 | + if (($files_in_found < 24) && ($files_in_found > 0)) # always > 0 actually
|
| 506 | + { $total = sprintf ("%.0f",($total / $files_in_found) * 24) ; }
|
| 507 | +
|
| 508 | + $totals_per_namespace {"$lang $namespace"} += $total ;
|
| 509 | +
|
| 510 | + if ($filter_matches)
|
| 511 | + { print $out_filtered "$key_low $total$counts\n" ; }
|
| 512 | +
|
| 513 | + if ($key_low =~ $reg_exp_track) # track count for NonExistingPageForSquidLogMonitoring on en.z
|
| 514 | + {
|
| 515 | + open $out_track, '>>', $file_track ;
|
| 516 | + binmode $out_track ;
|
| 517 | + print $out_track "$key_low $total$counts\n" ;
|
| 518 | + close $out_track ;
|
| 519 | + }
|
| 520 | +
|
| 521 | + if ($total >= $threshold)
|
| 522 | + { print $out_day2 "$key_low $total$counts\n" ;
|
| 523 | + # print $out_day3 "$key_low $total\n" ;
|
| 524 | + }
|
| 525 | +
|
| 526 | + $key_low_prev = $key_low ;
|
| 527 | + # print "OUT $key_low $counts\n" ;
|
| 528 | + }
|
| 529 | +
|
| 530 | + &WriteTotalsPerNamespace ($out_day2, $langprev) ;
|
| 531 | +# &WriteTotalsPerNamespace ($out_day3, $langprev) ;
|
| 532 | +
|
| 533 | + &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
|
| 534 | +
|
| 535 | + &Log ("[$lines, $files_in_open] $key_low\n") ;
|
| 536 | +# close OUT ;
|
| 537 | +
|
| 538 | + if ($bayes)
|
| 539 | + {
|
| 540 | + # close $out_day1 ;
|
| 541 | + close $out_day2 ;
|
| 542 | + # close $out_day3 ;
|
| 543 | + close $out_filtered ;
|
| 544 | +
|
| 545 | +# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
|
| 546 | +# $result = `$cmd` ;
|
| 547 | +# if ($result =~ /Everything is Ok/s)
|
| 548 | +# {
|
| 549 | +# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
|
| 550 | +# unlink $file_out2 ;
|
| 551 | +# foreach $file_in (@files_today)
|
| 552 | +# {
|
| 553 | +# print "unlink $dir_in/$file_in\n" ;
|
| 554 | +# unlink "$dir_in/$file_in" ;
|
| 555 | +# }
|
| 556 | +# }
|
| 557 | +# else
|
| 558 | +# {
|
| 559 | +# print "Delete $file_out2.7z\n" ;
|
| 560 | +# unlink "$file_out2.7z" ;
|
| 561 | +# }
|
| 562 | +
|
| 563 | +
|
| 564 | + $time_start_compression = time ;
|
| 565 | + $cmd = "bzip2 -9 -v $file_out2" ;
|
| 566 | + &Log ("\n\n$cmd ->\n") ;
|
| 567 | + $result = `$cmd` ;
|
| 568 | + &Log ("\n\nCompression took " . (time-$time_start_compression) . " seconds\n$result\n") ;
|
| 569 | +
|
| 570 | + if ($true)
|
| 571 | + {
|
| 572 | + foreach $file_in (@files_today)
|
| 573 | + {
|
| 574 | + print "unlink $dir_in/$file_in\n" ;
|
| 575 | + unlink "$dir_in/$file_in" ;
|
| 576 | + }
|
| 577 | + }
|
| 578 | + else
|
| 579 | + {
|
| 580 | + # print "Delete $file_out2.7z\n" ;
|
| 581 | + # unlink "$file_out2.7z" ;
|
| 582 | + }
|
| 583 | + }
|
| 584 | + else
|
| 585 | + {
|
| 586 | + # $out_day1->close() ;
|
| 587 | + $out_day2->close() ;
|
| 588 | + # $out_day3->close() ;
|
| 589 | + close $out_filtered ;
|
| 590 | + }
|
| 591 | +
|
| 592 | + &Log ("\nRecords skipped for invalid languages:\n") ;
|
| 593 | + foreach $key (sort keys %invalid_languages)
|
| 594 | + { &Log ("$key: ${invalid_languages {$key}}\n") ; }
|
| 595 | +
|
| 596 | + &Log ("\nTotals per namespace written: $lines_namespace_counts\n") ;
|
| 597 | + &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
| 598 | +}
|
| 599 | +
|
| 600 | +sub WriteTotalsPerNamespace
|
| 601 | +{
|
| 602 | + my $out_day = shift ;
|
| 603 | + my $lang = shift ;
|
| 604 | + my $total ;
|
| 605 | + my $totals_per_namespace_other ;
|
| 606 | +
|
| 607 | + foreach my $key (sort keys %totals_per_namespace)
|
| 608 | + {
|
| 609 | + $total = $totals_per_namespace {$key} ;
|
| 610 | + if ($total < 5)
|
| 611 | + { $totals_per_namespace_other += $total ; }
|
| 612 | + else
|
| 613 | + {
|
| 614 | + # print "@ $key $total\n" ;
|
| 615 | + print $out_day "@ $key $total\n" ;
|
| 616 | + $lines_namespace_counts ++ ;
|
| 617 | + }
|
| 618 | + }
|
| 619 | + if ($totals_per_namespace_other > 0 )
|
| 620 | + {
|
| 621 | + # print "@ $lang -other- $totals_per_namespace_other\n" ;
|
| 622 | + print $out_day "@ $lang -other- $totals_per_namespace_other\n" ;
|
| 623 | + $lines_namespace_counts ++ ;
|
| 624 | + }
|
| 625 | +}
|
| 626 | +
|
sub CompactVisitorStatsOneMonth
{
    # Walk $dir_in looking for per-month subdirectories named 'YYYY-MM'; for every
    # month that falls inside the requested date range, collect its daily pagecount
    # files and hand them to &MergeFilesFullMonth to build one monthly file.
    #
    # Arguments:
    #   $dir_in    - root directory holding 'YYYY-MM' subdirectories with daily files
    #   $dir_out   - directory where monthly output files are written
    #   $daterange - regexp prefix matched against the month as 'YYYYMM' (dashes stripped)
    #
    # NOTE(review): fills package globals @dirs, @files and %process_dates as a side
    # effect, and ends with exit — control never returns to the caller.
    my $dir_in    = shift ;
    my $dir_out   = shift ;
    my $daterange = shift ;

    &Log ("\nCompactVisitorStatsOneMonth\n\n") ;

    chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;

    local (*DIR) ;
    opendir (DIR, ".") ;
    @files = () ;

    # Collect every month directory ('YYYY-MM') below $dir_in.
    while ($dir = readdir (DIR))
    { push @dirs, $dir  if (-d $dir) && ($dir =~ /^\d\d\d\d-\d\d$/) ; }

    closedir (DIR) ;

    @dirs = sort @dirs ;

    # Daily file name pattern depends on host: compressed on bayes, plain elsewhere.
    # Hoisted out of the readdir loop below (same matches, chosen once).
    my $file_pattern = $bayes ? qr/^pagecounts-\d{8}_(?:fdt\.7z|h\.bz2)$/
                              : qr/^pagecounts-\d{8}_fdt$/ ;

    foreach $dir (@dirs)
    {
        &Log ("\n\n" . '-' x 80 . "\n\nCompactVisitorStatsOneMonth:\nCheck dir $dir_in/$dir\n") ;

        # A file named 'a' in the month directory marks it as already processed.
        if (-e "$dir_in/$dir/a")
        {
            &Log ("Already done -> skip\n\n") ;
            next ;
        }

        ($dir2 = $dir) =~ s/-//g ;   # 'YYYY-MM' -> 'YYYYMM' for the range test
        if ($dir2 !~ /^$daterange/)
        {
            &Log ("Directory out of date range ($daterange) -> skip\n\n") ;
            next ;
        }

        local (*DIR2) ;
        opendir (DIR2, "$dir_in/$dir") ;

        undef @files ;
        undef %process_dates ;

        while ($file_in = readdir (DIR2))
        {
            next if $file_in !~ $file_pattern ;

            &Log ("File found: $file_in\n") ;
            push @files, $file_in ;
        }

        closedir (DIR2) ;

        @files = sort @files ;

        # Record which dates will be processed ('YYYYMMDD' at offset 11 of the name).
        foreach $file (@files)
        { $process_dates {substr ($file,11,8)}++ ; }

        &Log ("\n\n") ;

        &MergeFilesFullMonth ($dir_in, $dir_out, $dir, @files) ;
    }

    exit ;
}
|
| 705 | +
|
| 706 | +sub MergeFilesFullMonth
|
| 707 | +{
|
| 708 | + my $dir_in = shift ;
|
| 709 | + my $dir_out = shift ;
|
| 710 | + my $dir = shift ;
|
| 711 | + my @files_this_month = @_ ;
|
| 712 | +
|
| 713 | + my $year = substr ($dir,0,4) ;
|
| 714 | + my $month = substr ($dir,5,2) ;
|
| 715 | +
|
| 716 | + my (@file_in_open, @file_in_found, @counts, $days_missing) ;
|
| 717 | + my $days_in_month = days_in_month ($year, $month) ;
|
| 718 | +
|
| 719 | + my ($file_out2) ;
|
| 720 | +
|
| 721 | + $lines = 0 ;
|
| 722 | +
|
| 723 | + undef @in_day ;
|
| 724 | + my $time_start = time ;
|
| 725 | +
|
| 726 | + if ($dir eq $month_run)
|
| 727 | + { $scope = "part" ; }
|
| 728 | + else
|
| 729 | + { $scope = "all" ; }
|
| 730 | +
|
| 731 | + $file_out = "$dir_out/pagecounts-$year-$month-$scope" ;
|
| 732 | +
|
| 733 | + &Log ("\nMergeFilesFullMonth\nIn: $dir_in/$dir\nOut: $dir_out/$file_out\nDays expected: $days_in_month\n\nProcess...\n") ;
|
| 734 | +
|
| 735 | + if ($bayes)
|
| 736 | + {
|
| 737 | + if ((-e "$file_out.7z") || (-e "$file_out.bz2") || (-e "$file_out.zip") || (-e "$file_out.gz"))
|
| 738 | + {
|
| 739 | + &Log ("\nTarget file '$file_out.[7z|bz2|zip|gz]' exists already. Skip this month.\n") ;
|
| 740 | + return ;
|
| 741 | + }
|
| 742 | + }
|
| 743 | +
|
| 744 | +
|
| 745 | + my $out_month_all = new IO::Compress::Bzip2 "$file_out.bz2" or die "bzip2 failed for $file_out.bz2: $Bzip2Error\n";
|
| 746 | + my $out_month_ge5 = new IO::Compress::Bzip2 "${file_out}_ge5.bz2" or die "bzip2 failed for ${file_out}_ge5.bz2: $Bzip2Error\n";
|
| 747 | +
|
| 748 | + $out_month_all->binmode() ;
|
| 749 | + $out_month_ge5->binmode() ;
|
| 750 | +
|
| 751 | + for ($day = 0 ; $day < $days_in_month ; $day++)
|
| 752 | + { $file_in_found [$day] = $false ; }
|
| 753 | +
|
| 754 | + $files_in_open = 0 ;
|
| 755 | + $files_in_found = 0 ;
|
| 756 | + $total_hours_missing = 0 ;
|
| 757 | + $langprev = "" ;
|
| 758 | + $lines_read_this_month = 0 ;
|
| 759 | + @hours_missing_per_day = () ;
|
| 760 | + $hours_missing_coded = '' ;
|
| 761 | + $lines_omitted_daily = 0 ;
|
| 762 | +
|
| 763 | + foreach $file_in (@files_this_month)
|
| 764 | + {
|
| 765 | + next if $file_in eq "" ;
|
| 766 | +
|
| 767 | + ($day = $file_in) =~ s/^pagecounts-\d{6}(\d+)_(?:fdt|fdt\.7z|h\.bz2)$/$1/ ;
|
| 768 | + $day = sprintf ("%2d", $day-1) ;
|
| 769 | +
|
| 770 | + $file_in = "$dir_in/$year-$month/$file_in" ;
|
| 771 | + # print "File $file_in -> day $day\n" ;
|
| 772 | +
|
| 773 | + &CheckHoursMissing ($year,$month,$day,$file_in) ;
|
| 774 | +
|
| 775 | + if ($bayes)
|
| 776 | + {
|
| 777 | + if ($file_in =~ /\.bz2$/)
|
| 778 | + { open $in_day [$day], "-|", "bzip2 -dc \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; }
|
| 779 | + elsif ($file_in =~ /\.7z$/)
|
| 780 | + { open $in_day [$day], "-|", "7z e -so \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; }
|
| 781 | + else
|
| 782 | + { abort ("MergeFilesFullMonth: unexpected file name $file_in.") ; }
|
| 783 | + }
|
| 784 | + else
|
| 785 | + { open $in_day [$day], '<', $file_in || &Abort ("Open failed for '$file_in'\n") ; }
|
| 786 | +
|
| 787 | + binmode $in_day [$day] ;
|
| 788 | +
|
| 789 | + $files_in_open++ ;
|
| 790 | + $file_in_found [$day] = $true ;
|
| 791 | + $file_in_open [$day] = $true ;
|
| 792 | + $files_in_found ++ ;
|
| 793 | +
|
| 794 | + $file = $in_day [$day] ;
|
| 795 | + $line = <$file> ;
|
| 796 | + while (($line =~ /^#/) || ($line =~ /^@/))
|
| 797 | + { $line = <$file> ; }
|
| 798 | +
|
| 799 | + chomp $line ;
|
| 800 | + if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
|
| 801 | + {
|
| 802 | + ($lang,$title,$counts) = split (' ', $line) ;
|
| 803 | + }
|
| 804 | + else
|
| 805 | + {
|
| 806 | + ($lang,$title,$total,$counts) = split (' ', $line) ;
|
| 807 | + $counts = "$total$counts" ;
|
| 808 | + }
|
| 809 | +
|
| 810 | + $key [$day] = "$lang $title" ;
|
| 811 | + $counts [$day] = $counts ;
|
| 812 | + # print "DAY " . ($day+1) . " KEY ${key [$day]} COUNTS $counts\n" ;
|
| 813 | + }
|
| 814 | + print "\n" ;
|
| 815 | +
|
| 816 | + $comment = "# Wikimedia article requests (aka page views) for year $year, month $month\n" ;
|
| 817 | + if ($threshold > 0 )
|
| 818 | + { $comment .= "# Count for articles with less than $threshold requests per full month are omitted\n" ; }
|
| 819 | + $comment .= "#\n" ;
|
| 820 | + $comment .= "# Each line contains four fields separated by spaces\n" ;
|
| 821 | + $comment .= "# - wiki code (subproject.project, see below)\n" ;
|
| 822 | + $comment .= "# - article title (encoding from original hourly files is preserved to maintain proper sort sequence)\n" ;
|
| 823 | + $comment .= "# - monthly total (possibly extrapolated from available data when hours/days in input were missing)\n" ;
|
| 824 | + $comment .= "# - hourly counts (only for hours where indeed article requests occurred)\n" ;
|
| 825 | + $comment .= "#\n" ;
|
| 826 | + $comment .= "# Subproject is language code, followed by project code\n" ;
|
| 827 | + $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia\n" ;
|
| 828 | + $comment .= "# Note: suffix z added by compression script: project wikipedia happens to be sorted last in dammit.lt files, so add this suffix to fix sort order\n" ;
|
| 829 | + $comment .= "#\n" ;
|
| 830 | + $comment .= "# To keep hourly counts compact and tidy both day and hour are coded as one character each, as follows:\n" ;
|
| 831 | + $comment .= "# Hour 0..23 shown as A..X convert to number: ordinal (char) - ordinal ('A')\n" ;
|
| 832 | + $comment .= "# Day 1..31 shown as A.._ 27=[ 28=\\ 29=] 30=^ 31=_ convert to number: ordinal (char) - ordinal ('A') + 1\n" ;
|
| 833 | + $comment .= "#\n" ;
|
| 834 | + $comment .= "# Original data source: Wikimedia full (=unsampled) squid logs\n" ;
|
| 835 | + $comment .= "# These data have been aggregated from hourly pagecount files at http://dammit.lt/wikistats, originally produced by Domas Mituzas\n" ;
|
| 836 | + $comment .= "# Daily and monthly aggregator script built by Erik Zachte\n" ;
|
| 837 | + $comment .= "# Each day hourly files for previous day are downloaded and merged into one file per day\n" ;
|
| 838 | + $comment .= "# Each month daily files are merged into one file per month\n" ;
|
| 839 | +# $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
|
| 840 | +# $comment .= "# If data are missing for some day (file missing or corrupt) a question mark (?) is shown (and for each missing day the monthly total is incremented with daily average)\n" ;
|
| 841 | + $comment .= "#\n" ;
|
| 842 | +
|
| 843 | + $out_month_all->print ($comment) ;
|
| 844 | + $comment .= "# This file contains only lines with monthly page request total greater/equal 5\n" ;
|
| 845 | + $comment .= "#\n" ;
|
| 846 | + $out_month_ge5->print ($comment) ;
|
| 847 | +
|
| 848 | + if ($files_in_found < $days_in_month)
|
| 849 | + {
|
| 850 | + for ($day = 0 ; $day < $days_in_month ; $day++)
|
| 851 | + {
|
| 852 | + if (! $file_in_found [$day])
|
| 853 | + {
|
| 854 | + $days_missing .= ($day+1) . "," ;
|
| 855 | + $total_hours_missing += 24 ;
|
| 856 | + for (my $h = 0 ; $h <= 23 ; $h++)
|
| 857 | + { $hours_missing_coded .= chr ($day + ord ('A')) . chr ($h + ord ('A')) .',' ; }
|
| 858 | + }
|
| 859 | + }
|
| 860 | +
|
| 861 | + $days_missing =~ s/,$// ;
|
| 862 | + &Log ("Merge files: year $year, month $month, only $files_in_found files found!\n\n") ;
|
| 863 | +
|
| 864 | + if ($days_missing =~ /,/)
|
| 865 | + {
|
| 866 | + $out_month_all->print ("# No input files found for days $days_missing!\n#\n") ;
|
| 867 | + $out_month_ge5->print ("# No input files found for days $days_missing!\n#\n") ;
|
| 868 | + print "No input files found for days $days_missing!\n\n" ;
|
| 869 | + }
|
| 870 | + else
|
| 871 | + {
|
| 872 | + $out_month_all->print ("# No input file found for day $days_missing!\n#\n") ;
|
| 873 | + $out_month_ge5->print ("# No input file found for day $days_missing!\n#\n") ;
|
| 874 | + print "No input file found for day $days_missing!\n\n" ;
|
| 875 | + }
|
| 876 | + }
|
| 877 | + else
|
| 878 | + { &Log ("Merge files: year $year, month $month\n\n") ; }
|
| 879 | +
|
| 880 | + if ($#hours_missing_per_day > -1)
|
| 881 | + {
|
| 882 | + $out_month_all->print (@hours_missing_per_day) ;
|
| 883 | + $out_month_ge5->print (@hours_missing_per_day) ;
|
| 884 | + }
|
| 885 | +
|
| 886 | + if ($hours_missing_coded ne '')
|
| 887 | + {
|
| 888 | + $hours_missing_coded =~ s/,$// ;
|
| 889 | + $hours_missing_coded = join (',', sort {$a cmp $b} split (',', $hours_missing_coded)) ; # single hours and full days missing added out of sort order
|
| 890 | + $out_month_all->print ("#\n# Hours missing: $hours_missing_coded\n") ;
|
| 891 | + $out_month_ge5->print ("#\n# Hours missing: $hours_missing_coded\n") ;
|
| 892 | + print "Hours missing: $hours_missing_coded\n\n" ;
|
| 893 | + }
|
| 894 | +
|
| 895 | + $monthly_correction = 1 ;
|
| 896 | + if ($total_hours_missing == 0)
|
| 897 | + {
|
| 898 | + $out_month_all->print ("# Data for all hours of each day were available in input\n#\n") ;
|
| 899 | + $out_month_ge5->print ("# Data for all hours of each day were available in input\n#\n") ;
|
| 900 | + print "Data for all hours of each day were available in input\n\n" ;
|
| 901 | + }
|
| 902 | + else
|
| 903 | + {
|
| 904 | + $monthly_correction = sprintf ("%.4f", ($days_in_month * 24) / ($days_in_month * 24 - $total_hours_missing)) ;
|
| 905 | + $out_month_all->print ("#\n# In this file data for $total_hours_missing hours were not encountered in input\n") ;
|
| 906 | + $out_month_ge5->print ("#\n# In this file data for $total_hours_missing hours were not encountered in input\n") ;
|
| 907 | + $out_month_all->print ("# Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n#\n") ;
|
| 908 | + $out_month_ge5->print ("# Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n#\n") ;
|
| 909 | + print "In this file data for $total_hours_missing hours were not encountered in input\n" ;
|
| 910 | + print "Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n\n" ;
|
| 911 | + }
|
| 912 | +
|
| 913 | + if ($threshold_requests_omitted > 0)
|
| 914 | + {
|
| 915 | + $out_month_all->print ("# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n") ;
|
| 916 | + $out_month_ge5->print ("# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n") ;
|
| 917 | + print "# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n" ;
|
| 918 | + }
|
| 919 | +
|
| 920 | + $key_low_prev = "" ;
|
| 921 | + while ($files_in_open > 0)
|
| 922 | + {
|
| 923 | + # last if $cycles ++ > 10000 ; # test code
|
| 924 | +
|
| 925 | + $key_low = "\xFF\xFF";
|
| 926 | + for ($day = 0 ; $day < $days_in_month ; $day++)
|
| 927 | + {
|
| 928 | + if (($files_in_open == $days_in_month) || ($file_in_found [$day] && $file_in_open [$day]))
|
| 929 | + {
|
| 930 | + if ($key [$day] lt $key_low)
|
| 931 | + { $key_low = $key [$day] ; }
|
| 932 | + }
|
| 933 | + }
|
| 934 | +
|
| 935 | + $counts_per_month = "" ;
|
| 936 | + $total_per_month = 0 ;
|
| 937 | +
|
| 938 | + for ($day = 0 ; $day < $days_in_month ; $day++)
|
| 939 | + {
|
| 940 | + if (! $file_in_found [$day])
|
| 941 | + {
|
| 942 | + # $counts_per_month .= chr ($day+ord('A')) . '?' ;
|
| 943 | + }
|
| 944 | + elsif (($files_in_open == $days_in_month) || $file_in_open [$day]) # slight optimization
|
| 945 | + {
|
| 946 | + if ($key [$day] eq $key_low)
|
| 947 | + {
|
| 948 | + $ch_day = chr ($day+ord('A')) ;
|
| 949 | + $counts_per_day = $counts [$day] ;
|
| 950 | +
|
| 951 | + ($total_per_day = $counts_per_day) =~ s/^(\d+).*$/$1/ ;
|
| 952 | + $counts_per_day =~ s/^\d+// ; # remove total
|
| 953 | +
|
| 954 | + $counts_per_day =~ s/([A-Z]\d+)/$ch_day$1,/g ; # prefix each hourly count with char that represent day
|
| 955 | + $counts_per_month .= $counts_per_day ;
|
| 956 | +
|
| 957 | + $total_per_month += $total_per_day ;
|
| 958 | + $file = $in_day [$day] ;
|
| 959 | + # $line = <$file> ;
|
| 960 | +
|
| 961 | + while ($true)
|
| 962 | + {
|
| 963 | + # if (($line = <$file>) && ($lines_read_this_month++ < 10000)) # test code
|
| 964 | + if ($line = <$file>)
|
| 965 | + {
|
| 966 | + next if $line =~ /^#/ ;
|
| 967 | + next if $line =~ /^@/ ;
|
| 968 | +
|
| 969 | + $line =~ s/^([\w\-]+)2 /$1.y /o ;
|
| 970 | + $line =~ s/^([\w\-]+) /$1.z /o ;
|
| 971 | +
|
| 972 | + chomp $line ;
|
| 973 | +
|
| 974 | + if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
|
| 975 | + {
|
| 976 | + ($lang,$title,$counts) = split (' ', $line) ;
|
| 977 | + }
|
| 978 | + else
|
| 979 | + {
|
| 980 | + ($lang,$title,$total,$counts) = split (' ', $line) ;
|
| 981 | + $counts = "$total$counts" ;
|
| 982 | + }
|
| 983 | +
|
| 984 | + $key [$day] = "$lang $title" ;
|
| 985 | + $counts [$day] = $counts ;
|
| 986 | +
|
| 987 | + last ;
|
| 988 | + }
|
| 989 | + else
|
| 990 | + {
|
| 991 | + close $in_day [$day] ;
|
| 992 | +
|
| 993 | + $files_in_open-- ;
|
| 994 | + $file_in_open [$day] = $false ;
|
| 995 | + $key [$day] = "\xFF\xFF";
|
| 996 | +
|
| 997 | + last ;
|
| 998 | + }
|
| 999 | + }
|
| 1000 | + }
|
| 1001 | + }
|
| 1002 | + }
|
| 1003 | + if ($lines == 0)
|
| 1004 | + { &Log ("\nlines: project key\n") ; }
|
| 1005 | +
|
| 1006 | + if (++$lines % 100000 == 0)
|
| 1007 | + { &Log ("$lines: $key_low\n") ; }
|
| 1008 | +
|
| 1009 | + # last if $lines > 10000 ; # test
|
| 1010 | +
|
| 1011 | + last if $key_low eq "\xFF\xFF" ;
|
| 1012 | +
|
| 1013 | + # Q&D fix for unexplained out of order error for what seems to be invalid language
|
| 1014 | + # remember : language code without suffix gets appended by .y or .z to fix sort order
|
| 1015 | + # ^nov.mw nov1 1 8765
|
| 1016 | + # ^nov1.mw nov1 1 931 <--------------
|
| 1017 | + # ^nov 10_dw_oktobre 1 11421
|
| 1018 | + ($lang,$title) = split (' ', $key_low) ;
|
| 1019 | + if ($lang =~ /\d/)
|
| 1020 | + {
|
| 1021 | + $invalid_languages {$lang}++ ;
|
| 1022 | + &Log ("\nSkip invalid language '$lang'\n") ;
|
| 1023 | + next ;
|
| 1024 | + }
|
| 1025 | +
|
| 1026 | + if ($key_low_prev gt $key_low)
|
| 1027 | + {
|
| 1028 | + for ($day = 0 ; $day < $days_in_month ; $day++)
|
| 1029 | + { &Log ("day " . ($day+1) . ": key ${key[$day]}\n") ; }
|
| 1030 | +
|
| 1031 | + &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
|
| 1032 | + }
|
| 1033 | +
|
| 1034 | + if (($key_low_prev eq $key_low) && ($files_in_open > 0))
|
| 1035 | + {
|
| 1036 | + for ($day = 0 ; $day < $days_in_month ; $day++)
|
| 1037 | + {
|
| 1038 | + if ($file_in_open [$day])
|
| 1039 | + { print "day " . ($day+1) . ": file open, key ${key [$day]}\n" ; }
|
| 1040 | + else
|
| 1041 | + { print "day " . ($day+1) . ": file closed, key ${key [$day]}\n" ; }
|
| 1042 | + }
|
| 1043 | + &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
|
| 1044 | + }
|
| 1045 | +
|
| 1046 | + ($lang,$title) = split (' ', $key_low) ;
|
| 1047 | +
|
| 1048 | + if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
|
| 1049 | + { $namespace = 'NamespaceArticles' ; }
|
| 1050 | + else
|
| 1051 | + { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
|
| 1052 | +
|
| 1053 | + if (($lang ne $langprev) && ($langprev ne ""))
|
| 1054 | + {
|
| 1055 | + $filter_matches = $lang =~ $reg_exp_filter ;
|
| 1056 | + if ($filter_matches)
|
| 1057 | + { print "F $lang\n" ; }
|
| 1058 | + }
|
| 1059 | + $langprev = $lang ;
|
| 1060 | +
|
| 1061 | + if (($files_in_found < $days_in_month) && ($files_in_found > 0)) # always > 0 actually
|
| 1062 | + { $total = sprintf ("%.0f",($total / $files_in_found) * $days_in_month) ; }
|
| 1063 | +
|
| 1064 | + $counts_per_month =~ s/,$// ;
|
| 1065 | + $total_per_month = sprintf ("%.0f", $monthly_correction * $total_per_month) ;
|
| 1066 | +
|
| 1067 | + $out_month_all->print ("$key_low $total_per_month $counts_per_month\n") ;
|
| 1068 | + if ($total_per_month ge 5)
|
| 1069 | + { $out_month_ge5->print ("$key_low $total_per_month $counts_per_month\n") ; }
|
| 1070 | +
|
| 1071 | + $key_low_prev = $key_low ;
|
| 1072 | + }
|
| 1073 | +
|
| 1074 | + &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
|
| 1075 | +
|
| 1076 | + &Log ("[$lines, $files_in_open] $key_low\n") ;
|
| 1077 | +
|
| 1078 | + $out_month_all->close () ;
|
| 1079 | + $out_month_ge5->close () ;
|
| 1080 | +
|
| 1081 | + if ($bayes)
|
| 1082 | + {
|
| 1083 | + foreach $file_in (@files_this_month)
|
| 1084 | + {
|
| 1085 | + print "unlink $dir_in/$file_in (dummy run, test only)\n" ;
|
| 1086 | + # unlink "$dir_in/$file_in" ;
|
| 1087 | + }
|
| 1088 | + }
|
| 1089 | +
|
| 1090 | + &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
| 1091 | +}
|
| 1092 | +
|
sub CheckHoursMissing
{
  # Scan (the first part of) one day's full-day pagecounts file and determine
  # which of the 24 hourly buckets are absent or explicitly marked '?'.
  # Hours are coded A..X in the per-title count string (A = hour 0).
  #
  # Parameters: $year, $month (unused here beyond the signature), $day
  # (0-based day index, used for the coded report), $file_in (path).
  #
  # Side effects on globals (callers read these):
  #   $hours_missing_coded       — appended with "<dayletter><hourletter>," per missing hour
  #   @hours_missing_per_day     — appended with one human-readable report line
  #   $total_hours_missing       — incremented by this day's missing-hour count
  #   $threshold_requests_omitted — set from the '#' header line, if present
  my ($year,$month,$day,$file_in) = @_ ;
  my ($hour,%hours_seen,%hours_valid,$hours_seen,$hours_missing,%hours_missing) ;

  if ($bayes)
  {
    # Decompress via an external tool. Low-precedence 'or' is required here:
    # with '||' the test bound to the (always true) command string, so open
    # failures were silently ignored; it also called undefined lowercase abort().
    if ($file_in =~ /\.bz2$/)
    { open FILE_CHECK, "-|", "bzip2 -dc \"$file_in\"" or &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
    elsif ($file_in =~ /\.7z$/)
    { open FILE_CHECK, "-|", "7z e -so \"$file_in\"" or &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
    else
    { &Abort ("CheckHoursMissing: unexpected file name $file_in.") ; }
  }
  else
  { open FILE_CHECK, '<', $file_in or &Abort ("Open failed for '$file_in'\n") ; }

  binmode FILE_CHECK ;

  $lines_checked = 0 ;
  while ($line = <FILE_CHECK>)
  {
    # Header line documents the threshold below which titles were omitted.
    if ($line =~ /^#.*?requests per full day are omitted/)
    { ($threshold_requests_omitted = $line) =~ s/[^\d]//g ; }

    next if $line =~ /^#/ or $line =~ /^@/ ;

    # All 24 hour codes normally show up within the first few data lines;
    # cap the scan so a degenerate file cannot stall the run.
    last if $lines_checked ++ > 10000 ;

    chomp $line ;
    if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
    {
      ($lang,$title,$counts) = split (' ', $line) ;
    }
    else
    {
      ($lang,$title,$total,$counts) = split (' ', $line) ;
      $counts = "$total$counts" ;
    }

    # Split the packed count string into per-hour tokens like "B17" or "C?"
    # ('?' marks an hour whose data is missing).
    undef @counts ;
    $counts =~ s/([A-X])(\d+|\?)/(push @counts,"$1$2"),""/ge ;
    foreach $key (@counts)
    {
      my $hour = ord (substr ($key,0,1)) - ord ('A') ;

      next if $hours_seen {$hour} > 0 ;
      $hours_seen {$hour} = 1 ;
      $hours_seen ++ ;
      if ($key =~ /\d/)
      { $hours_valid {$hour} ++ ; }
      else
      {
        $hours_missing {$hour} ++ ;
        $hours_missing ++ ;
        $hours_missing_coded .= chr ($day + ord ('A')) . chr ($hour + ord ('A')) .',' ;
      }
    }

    last if $hours_seen == 24 ;
  }

  close FILE_CHECK ;

  # Hours never encountered at all (not even as '?') are also missing.
  for ($hour = 0 ; $hour <= 23 ; $hour++)
  {
    if (! $hours_seen {$hour})
    {
      $hours_missing {$hour} ++ ;
      $hours_missing ++ ;
      $hours_missing_coded .= chr ($day + ord ('A')) . chr ($hour + ord ('A')) .',' ;
    }
  }

  if ($lines_checked > 10000)
  { &Log ("\nDay " . ($day+1) . ": not all hours encountered after 10,000 lines !!! Seen (can be ?=missing) " . (join ',', sort {$a <=> $b} keys %hours_seen) . "\n") ; }

  if ($hours_missing > 0)
  {
    $text_hour = $hours_missing > 1 ? 'hours' : 'hour' ;
    push @hours_missing_per_day, "# Day " . ($day+1) . ": $text_hour missing " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n" ;
    print "Day " . ($day+1) . ": $text_hour missing " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n" ;
  }

  $total_hours_missing += $hours_missing ;
}
|
| 1197 | +
|
sub Log
{
  # Write a message to both stdout and the LOG file handle.
  # Lexical $msg: the original assigned to the package global $msg,
  # clobbering any caller that also used $msg (this file runs without
  # 'use strict', so such collisions are silent).
  my $msg = shift ;
  print $msg ;
  print LOG $msg ;
}
|
| 1204 | +
|
sub Abort
{
  # Report a fatal error to stdout and the log, then terminate the script.
  # Lexical $msg avoids clobbering the package global (see Log).
  my $msg = shift ;
  print "Abort script\nError: $msg\n" ;
  print LOG "Abort script\nError: $msg\n" ;
  # Exit non-zero so shell wrappers / cron can detect the failure;
  # the original bare 'exit' returned status 0 even on fatal errors.
  exit 1 ;
}
|
| 1212 | +
|
| 1213 | +#=============================================================================================================
|
| 1214 | +
|
| 1215 | +# snippets obsolete but revivable code / test code
|
| 1216 | +
|
| 1217 | +#sub Compact
|
| 1218 | +#{
|
| 1219 | +# my $day = shift ;
|
| 1220 | +# &Log ("Compact files for $day\n") ;
|
| 1221 | +
|
| 1222 | +# $file_in = "pagecounts-$day.out" ;
|
| 1223 | +# $file_out1 = "pagecounts-${day}_all.gz" ;
|
| 1224 | +# $file_out2 = "pagecounts-${day}_10plus.gz" ;
|
| 1225 | +# open IN, "<", $file_in ;
|
| 1226 | +# binmode $file_in ;
|
| 1227 | +
|
| 1228 | +# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 1229 | +# my $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 1230 | +
|
| 1231 | +# open OUT, ">", $file_out ;
|
| 1232 | +# binmode $file_out ;
|
| 1233 | +
|
| 1234 | +# $lang_prev = "" ;
|
| 1235 | +# while ($line = <IN>)
|
| 1236 | +# {
|
| 1237 | +# chomp ($line) ;
|
| 1238 | +# ($lang, $title, $counts) = split (' ', $line) ;
|
| 1239 | +# $title2 = $title ;
|
| 1240 | +# $title =~ s/\%20/_/g ;
|
| 1241 | +# $title =~ s/\%3A/:/g ;
|
| 1242 | +# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
|
| 1243 | +# # if ($title =~ /[\x00-\x1F]/)
|
| 1244 | +# # { &Log ("> '$title2'\n") ; }
|
| 1245 | +# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
|
| 1246 | +# print $out_day1 "$lang $title $counts\n" ;
|
| 1247 | +# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
|
| 1248 | +# if ($counts2 >= $threshold)
|
| 1249 | +# { print $out_day2 "$lang $title $counts\n" ; }
|
| 1250 | +# $lang_prev = $lang ;
|
| 1251 | +# }
|
| 1252 | +#
|
| 1253 | +# close IN ;
|
| 1254 | +# $out_day1->close() ;
|
| 1255 | +# $out_day2->close() ;
|
| 1256 | +#}
|
| 1257 | +
|
| 1258 | +
|
| 1259 | +#sub GetViewDistribution
|
| 1260 | +#{
|
| 1261 | +# open OUT, ">", "Views.csv" ;
|
| 1262 | +# foreach $file_in (@files)
|
| 1263 | +# {
|
| 1264 | +# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
|
| 1265 | +# $hour = chr(ord('A')+$hour) ;
|
| 1266 | +# &Log ("Process $hour $file_in\n") ;
|
| 1267 | +
|
| 1268 | +# $in_hour1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
|
| 1269 | +# while ($line = <$in_hour1>)
|
| 1270 | +# {
|
| 1271 | +# ($lang,$title,$count,$dummy) = split (' ', $line) ;
|
| 1272 | +# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
|
| 1273 | +# {
|
| 1274 | +# $tot {$hour} += $count ;
|
| 1275 | +# if ($count < 3)
|
| 1276 | +# { @counts {$hour . substr ($title,0,1)}++ ; }
|
| 1277 | +# }
|
| 1278 | +# }
|
| 1279 | +# $in_hour1->close () ;
|
| 1280 | +# }
|
| 1281 | +#
|
| 1282 | +# print OUT "," ;
|
| 1283 | +# foreach $hour ('A'..'X')
|
| 1284 | +# { print OUT $hour . ", " ; }
|
| 1285 | +# print OUT "\n" ;
|
| 1286 | +#
|
| 1287 | +# print OUT "," ;
|
| 1288 | +# foreach $hour ('A'..'X')
|
| 1289 | +# { print OUT $tot {$hour} . ", " ; }
|
| 1290 | +# print OUT "\n" ;
|
| 1291 | +#
|
| 1292 | +# for ($c=0; $c < 256; $c++)
|
| 1293 | +# {
|
| 1294 | +# # do not print chars " and , as such: confuses csv format
|
| 1295 | +# if ($c < 33)
|
| 1296 | +# { print OUT "chr($c), " ; }
|
| 1297 | +# elsif (chr($c) eq '"')
|
| 1298 | +# { print OUT "dquote, " ; }
|
| 1299 | +# elsif (chr($c) eq ',')
|
| 1300 | +# { print OUT "comma, " ; }
|
| 1301 | +# else
|
| 1302 | +# { print OUT chr($c) . ", " ; }
|
| 1303 | +#
|
| 1304 | +# foreach $hour ('A'..'X')
|
| 1305 | +# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
|
| 1306 | +#
|
| 1307 | +# if ($c < 255)
|
| 1308 | +# { print OUT "\n" ; }
|
| 1309 | +# }
|
| 1310 | +# close OUT ;
|
| 1311 | +#}
|
| 1312 | +
|
| 1313 | +
|
| 1314 | +#sub RecompactVisitorStats
|
| 1315 | +#{
|
| 1316 | +# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
|
| 1317 | +# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
|
| 1318 | +# local (*DIR);
|
| 1319 | +# opendir (DIR, ".");
|
| 1320 | +# @files = () ;
|
| 1321 | +# while ($file_in = readdir (DIR))
|
| 1322 | +# {
|
| 1323 | +# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
|
| 1324 | +#
|
| 1325 | +# push @files, $file_in ;
|
| 1326 | +# }
|
| 1327 | +
|
| 1328 | +# $filecnt = $#files+1 ;
|
| 1329 | +# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
|
| 1330 | +
|
| 1331 | +# foreach $file (@files)
|
| 1332 | +# { &RecompactVisitorStats2 ($file) ; }
|
| 1333 | +# closedir (DIR, ".");
|
| 1334 | +#}
|
| 1335 | +
|
| 1336 | +#sub RecompactVisitorStats2
|
| 1337 | +#{
|
| 1338 | +## http://www.7-zip.org/7z.html
|
| 1339 | +# my $file = shift ;
|
| 1340 | +# my $time_start = time ;
|
| 1341 | +# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
| 1342 | +## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
| 1343 | +# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
|
| 1344 | +# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
| 1345 | +# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
| 1346 | +
|
| 1347 | +# &Log ("Process $file_in\n") ;
|
| 1348 | +
|
| 1349 | +# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
| 1350 | +# binmode $in_hour ;
|
| 1351 | +# open OUT, ">", $file_out ;
|
| 1352 | +# binmode OUT ;
|
| 1353 | +
|
| 1354 | +# my ($title, $title2) ;
|
| 1355 | +# while ($line = <$in_hour>)
|
| 1356 | +# {
|
| 1357 | +# chomp ($line) ;
|
| 1358 | +# ($lang,$title,$counts) = split (" ", $line) ;
|
| 1359 | +
|
| 1360 | +# if ($lang ne $lang_prev) { print "$lang " ; }
|
| 1361 | +# $lang_prev = $lang ;
|
| 1362 | +
|
| 1363 | +# # test pagecounts-20080701_fd.gz
|
| 1364 | +# # all records 424 Mib compressed (1984 uncompressed)
|
| 1365 | +# # count > 1 212 Mib compressed ( 733 uncompressed)
|
| 1366 | +# # count > 2 169 Mib compressed ( 551 uncompressed)
|
| 1367 | +# next if $counts <= 1 ;
|
| 1368 | +
|
| 1369 | +# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
| 1370 | +# $title =~ s/\s/_/g;
|
| 1371 | +# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
| 1372 | +# $lang =~ s/\.y/2/ ;
|
| 1373 | +
|
| 1374 | +# print OUT "$lang $title $counts\n" ;
|
| 1375 | +# }
|
| 1376 | +
|
| 1377 | +# print "Close files\n" ;
|
| 1378 | +# $in_hour -> close () ;
|
| 1379 | +# close (OUT) ;
|
| 1380 | +
|
| 1381 | +# &Log ("Compress $file_out\n") ;
|
| 1382 | +
|
| 1383 | +# unlink $file_7z ;
|
| 1384 | +# $result = `$path_7z a $file_7z $file_out` ;
|
| 1385 | +# &Log ("Compressed\n") ;
|
| 1386 | +# &Log ("Result " . ($result+0) . " \n") ;
|
| 1387 | +# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
|
| 1388 | +# { unlink $file_out ; }
|
| 1389 | +
|
| 1390 | +# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
|
| 1391 | +## 0 No error
|
| 1392 | +## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
|
| 1393 | +## 2 Fatal error
|
| 1394 | +## 7 Command line error
|
| 1395 | +## 8 Not enough memory for operation
|
| 1396 | +## 255 User stopped the process
|
| 1397 | +#}
|
| 1398 | +
|
| 1399 | +
|
| 1400 | +#sub RecompactVisitorStats3
|
| 1401 | +#{
|
| 1402 | +## http://www.7-zip.org/7z.html
|
| 1403 | +# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
|
| 1404 | +# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
|
| 1405 | +# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
|
| 1406 | +# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
|
| 1407 | +## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
|
| 1408 | +
|
| 1409 | +# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
| 1410 | +# binmode $in_hour ;
|
| 1411 | +## $out_day = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
|
| 1412 | +## binmode $out_day ;
|
| 1413 | +# open OUT, ">", $file_out ;
|
| 1414 | +# binmode OUT ;
|
| 1415 | +## open LOG, ">", $file_log ;
|
| 1416 | +## binmode LOG ;
|
| 1417 | +
|
| 1418 | +# my ($title, $title2) ;
|
| 1419 | +# while ($line = <$in_hour>)
|
| 1420 | +# {
|
| 1421 | +# chomp ($line) ;
|
| 1422 | +# ($lang,$title,$counts) = split (" ", $line) ;
|
| 1423 | +
|
| 1424 | +# if ($lang ne $lang_prev) { print "$lang\n" ; }
|
| 1425 | +## last if $lang gt "fs" ;
|
| 1426 | +# $lang_prev = $lang ;
|
| 1427 | +
|
| 1428 | +# # test pagecounts-20080701_fd.gz
|
| 1429 | +# # all records 424 Mib compressed (1984 uncompressed)
|
| 1430 | +# # count > 1 212 Mib compressed ( 733 uncompressed)
|
| 1431 | +# # count > 2 169 Mib compressed ( 551 uncompressed)
|
| 1432 | +# next if $counts <= 1 ;
|
| 1433 | +
|
| 1434 | +## next if $lang !~ /^(?:ar|fr)/ ;
|
| 1435 | +
|
| 1436 | +#if ($false)
|
| 1437 | +#{
|
| 1438 | +# $title1b = $title ;
|
| 1439 | +# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
|
| 1440 | +# $title1b =~ s/\%28/(/g ;
|
| 1441 | +# $title1b =~ s/\%29/)/g ;
|
| 1442 | +# $title1b =~ s/\%3A/:/g ;
|
| 1443 | +# $title1b =~ s/\%2F/\//g ;
|
| 1444 | +# $title1b =~ s/\%5C/\\/g ;
|
| 1445 | +# $title1b =~ s/\%2A/*/g ;
|
| 1446 | +# $title1b =~ s/\%21/!/g ;
|
| 1447 | +# $title1b =~ s/\%5F/_/g ;
|
| 1448 | +# $title1b =~ s/\%2C/,/g ;
|
| 1449 | +# $title1b =~ s/\%2E/./g ;
|
| 1450 | +# $title1b =~ s/\%2D/-/g ;
|
| 1451 | +# $title1b =~ s/\%25/%/g ;
|
| 1452 | +# $title1b =~ s/\%7E/~/g ;
|
| 1453 | +# $title1b =~ s/\%27/'/g ;
|
| 1454 | +# $title1b =~ s/\%3D/=/g ;
|
| 1455 | +# $title1b =~ s/\%26/&/g ;
|
| 1456 | +# $title1b =~ s/\%3B/;/g ;
|
| 1457 | +# $title1b =~ s/\%3F/?/g ;
|
| 1458 | +# $title1b =~ s/\%2B/+/g ;
|
| 1459 | +# $title2 = $title1b ;
|
| 1460 | +# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
|
| 1461 | +
|
| 1462 | +# if ($title1b ne $title2) # if changed anything at all
|
| 1463 | +# {
|
| 1464 | +# $title3 = uri_escape ($title2) ;
|
| 1465 | +# $title3 =~ s/\%28/(/g ;
|
| 1466 | +# $title3 =~ s/\%29/)/g ;
|
| 1467 | +# $title3 =~ s/\%3A/:/g ;
|
| 1468 | +# $title3 =~ s/\%2F/\//g ;
|
| 1469 | +# $title3 =~ s/\%5C/\\/g ;
|
| 1470 | +# $title3 =~ s/\%2A/\*/g ;
|
| 1471 | +# $title3 =~ s/\%21/\!/g ;
|
| 1472 | +# $title3 =~ s/\%5F/\_/g ;
|
| 1473 | +# $title3 =~ s/\%2C/,/g ;
|
| 1474 | +# $title3 =~ s/\%2E/./g ;
|
| 1475 | +# $title3 =~ s/\%2D/-/g ;
|
| 1476 | +# $title3 =~ s/\%25/%/g ;
|
| 1477 | +# $title3 =~ s/\%7E/~/g ;
|
| 1478 | +# $title3 =~ s/\%27/'/g ;
|
| 1479 | +# $title3 =~ s/\%3D/=/g ;
|
| 1480 | +# $title3 =~ s/\%26/&/g ;
|
| 1481 | +# $title3 =~ s/\%3B/;/g ;
|
| 1482 | +# $title3 =~ s/\%3F/?/g ;
|
| 1483 | +# $title3 =~ s/\%2B/+/g ;
|
| 1484 | +
|
| 1485 | +# if ($title1b eq $title3) # process reversible ?
|
| 1486 | +# {
|
| 1487 | +# $y++ ;
|
| 1488 | +# $title2 =~ s/\s/_/g;
|
| 1489 | +# $title = $title2 ;
|
| 1490 | +# }
|
| 1491 | +# else
|
| 1492 | +# {
|
| 1493 | +# $n++ ;
|
| 1494 | +# print "Y $y N $n\n$title\n$title3\n\n" ;
|
| 1495 | +# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
|
| 1496 | +# }
|
| 1497 | +# }
|
| 1498 | +#}
|
| 1499 | +# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
|
| 1500 | +# $title =~ s/\s/_/g;
|
| 1501 | +# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
|
| 1502 | +# $lang =~ s/\.y/2/ ;
|
| 1503 | +
|
| 1504 | +# # print $out_day "$lang $title $counts\n" ;
|
| 1505 | +# print OUT "$lang $title $counts\n" ;
|
| 1506 | +# }
|
| 1507 | +
|
| 1508 | +# print "Close files\n" ;
|
| 1509 | +# $in_hour -> close () ;
|
| 1510 | +## $out_day -> close () ;
|
| 1511 | +# close (OUT) ;
|
| 1512 | +# $result = `$path_7z a $file_out $file_txt` ;
|
| 1513 | +# print $result ;
|
| 1514 | +#}
|
| 1515 | +
|
| 1516 | +
|
| 1517 | +
|
| 1518 | +# test (partial) reversibility of process
|
| 1519 | +#sub UncompactVisitorStats
|
| 1520 | +#{
|
| 1521 | +# my $file_in = "out/2009-03/pagecounts-20090301_fdt" ;
|
| 1522 | +# my $dir_out = "out" ;
|
| 1523 | +# # $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
|
| 1524 | +# open $in_hour, '<', $file_in ;
|
| 1525 | +# binmode $in_hour ;
|
| 1526 | +
|
| 1527 | +# for ($h=0 ; $h<=23 ; $h++)
|
| 1528 | +# {
|
| 1529 | +# $time = sprintf ("%02d",$h) . "0000" ;
|
| 1530 | +## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
|
| 1531 | +# $file_out = "$dir_out/pagecounts-20090301-$time" ;
|
| 1532 | +# open $out_day [$h], '>', $file_out ;
|
| 1533 | +## $out_day [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
|
| 1534 | +# binmode $out_day [$h] ;
|
| 1535 | +# }
|
| 1536 | +
|
| 1537 | +# while ($line = <$in_hour>)
|
| 1538 | +# {
|
| 1539 | +# next if $line =~ /^#/ ;
|
| 1540 | +# next if $line =~ /^@/ ;
|
| 1541 | +# chomp ($line) ;
|
| 1542 | +## print "$line\n" ;
|
| 1543 | +# if ($lines++ > 10000) { exit ; }
|
| 1544 | +# ($lang,$title,$counts) = split (" ", $line) ;
|
| 1545 | +# $lang =~ s/\.z// ;
|
| 1546 | +# $lang =~ s/\.y/2/ ;
|
| 1547 | +# $counts =~ s/^\d+// ; # remove (redundant) preceding total
|
| 1548 | +# while ($counts ne "")
|
| 1549 | +# {
|
| 1550 | +# $letter = substr ($counts,0,1) ;
|
| 1551 | +# $counts = substr ($counts,1) ;
|
| 1552 | +# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
|
| 1553 | +# $counts =~ s/^\d+(.*)$/$1/ ;
|
| 1554 | +# $h = ord ($letter) - ord ('A') ;
|
| 1555 | +# $file = $out_day [$h] ;
|
| 1556 | +# $writes {$h} ++ ;
|
| 1557 | +# print $file "$lang $title $count\n" ;
|
| 1558 | +# }
|
| 1559 | +
|
| 1560 | +# }
|
| 1561 | +
|
| 1562 | +# for ($h=0 ; $h<=23 ; $h++)
|
| 1563 | +# {
|
| 1564 | +## $out_day [$h] -> close () ;
|
| 1565 | +# close $out_day [$h] ;
|
| 1566 | +# }
|
| 1567 | +#}
|
| 1568 | +
|
| 1569 | +
|