r82396 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r82395‎ | r82396 | r82397 >
Date:13:34, 18 February 2011
Author:ezachte
Status:deferred
Tags:
Comment:
Collect and process hourly page view files from http://dammit.lt/wikistats
The 'cellar' folder contains old special-purpose code, perhaps still of value (undocumented)
Modified paths:
  • /trunk/wikistats/dammit.lt (added) (history)
  • /trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitCompactHourlyPageCountFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitReportPageRequestsStaffWikis.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitSyncFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitCollectArticleNames.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitCollectViewsOneArticle.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitFilesFindMisses.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitFilterDailyPagecountFilesPerLanguage.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPageViewsPerSpecialSearch.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForBanners.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForFundraiser.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitPrepCollectHarvestInterwikiLinks.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.pl (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt (added) (history)
  • /trunk/wikistats/dammit.lt/cellar/!DammitScanCompactedFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_compact_monthly.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_filter.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_report.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_scan.sh (added) (history)
  • /trunk/wikistats/dammit.lt/dammit_sync.sh (added) (history)

Diff [purge]

Index: trunk/wikistats/dammit.lt/dammit_report.sh
@@ -0,0 +1 @@
 2+perl /a/dammit.lt/DammitReportPageRequestsStaffWikis.pl
Property changes on: trunk/wikistats/dammit.lt/dammit_report.sh
___________________________________________________________________
Added: svn:eol-style
13 + native
Index: trunk/wikistats/dammit.lt/dammit_sync.sh
@@ -0,0 +1,2 @@
 2+perl /a/dammit.lt/DammitSyncFiles.pl
 3+#perl /home/ezachte/wikistats/WikiCountsJobProgress.pl >> /a/dammit.lt/cron.txt
Property changes on: trunk/wikistats/dammit.lt/dammit_sync.sh
___________________________________________________________________
Added: svn:eol-style
14 + native
Index: trunk/wikistats/dammit.lt/cellar/!DammitPageViewsPerSpecialSearch.pl
@@ -0,0 +1,263 @@
#!/usr/bin/perl

# !DammitPageViewsPerSpecialSearch.pl
# Tallies Special:Search page views for the English Wikipedia from the
# compacted dammit.lt daily pagecount archives (see &ScanFiles below).
# Output: /a/dammit.lt/SpecialSearch.csv (one row per day) and
# SpecialSearch.txt (raw per-title counts for one sample day).

 use lib "/home/ezachte/lib" ;
 use EzLib ;
 $trace_on_exit = $true ;

 use CGI::Carp qw(fatalsToBrowser);
 use Time::Local ;
 use Net::Domain qw (hostname);

 # NOTE(review): $month and $language are not assigned anywhere before this
 # point, so the name interpolates empty strings; the variable is only used
 # in a trace message inside &ScanFiles -- verify before reuse.
 $file_csv_pagecounts = "pagecounts-$month-$language\_fdt" ;

 open CSV, '>', "/a/dammit.lt/SpecialSearch.csv" ;
 open TXT, '>', "/a/dammit.lt/SpecialSearch.txt" ;

 $timestart = time ;

 &ScanFiles ;
 print "\n\nReady\n\n" ;
 exit ;
 22+
# Walk the daily compacted pagecount archives from 2009-10 through 2010-05
# and tally Special:Search page views for the English Wikipedia ('en.z').
# Per day one csv row is written: date, project, generic / specific / other
# counts. For one sample day (2010-02-01) raw per-title counts also go to TXT.
sub ScanFiles
{
 print "ScanFiles\n" ;
 # NOTE(review): $language, $dir_out and $file_csv_pagecounts are never
 # assigned meaningful values in this script -- this trace line is stale.
 print "Filter view counts for $language to $dir_out/$file_csv_pagecounts\n\n" ;

 $year = 2009 ;
 $month = 10 ;
 while ($year == 2009 || ($year == 2010 && $month <= 5))
 {

 for ($day = 1 ; $day <= 31 ; $day++)
 {

 $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 $yyyymmdd = sprintf ("%04d%02d%02d", $year, $month, $day) ;

 $file_pagecounts = "/a/dammit.lt/pagecounts/$yyyymm/pagecounts-${yyyymmdd}_h.bz2" ;

 # Days past the month's end simply yield a missing file and are skipped.
 if (! -e $file_pagecounts)
 { print "Not found: $file_pagecounts\n" ; next ; }

 print ddhhmmss (time,"%d:%02d:%02d") . "\nRead $file_pagecounts\n\n" ;

 # NOTE(review): '|| die' binds to the command string (always true), so a
 # failed open is never detected here; low-precedence 'or die' was intended.
 if ($file_pagecounts =~ /.7z$/)
 { open IN, "-|", "./7za e -so \"$file_pagecounts\"" || die ("Input file '$file_pagecounts' could not be opened.") ; }
 elsif ($file_pagecounts =~ /.bz2$/)
 { open IN, "-|", "bzip2 -dc \"$file_pagecounts\"" || die ("Input file '$file_pagecounts' could not be opened.") ; }
 else
 { next ; } # open IN, '<', $file_pagecounts ; }

 $project = "" ;
 while ($line = <IN>)
 {
 next if $line =~ /^#/ ;
 next if $line =~ /^@/ ;

 # Input is sorted by project code; when a new project starts, flush the
 # accumulated totals (only ever non-zero for 'en.z').
 if ($line !~ /$project/)
 {
 if ($project eq 'en.z')
 {
 print CSV "\"=date($year,$month,$day)\",$project2,$generic,$specific,$other\n" ;
 print "\"=date($year,$month,$day)\",$project2,$generic,$specific,$other\n" ;
 $generic = 0 ;
 $specific = 0 ;
 $other = 0 ;
 }
 ($project) = split ' ', $line ; print "$project " ;
 }
 # Skip ahead to the en.z section; stop once past it.
 next if $line lt "en.z " ;
 last if $line gt "en.\xFF" ;

 if ($project eq 'en.z')
 {
 if ($line =~ /Special:Search/i)
 {
 ($project, $title, $counts) = split (' ', $line) ;
 ($project2 = $project) =~ s/\.z// ;
 # Compact counts field starts with the day total; drop the hourly suffix.
 $counts =~ s/^(\d+).*$/$1/ ;
 $title =~ s/,/&comma;/g ;

 if ($yyyymmdd eq '20100201')
 { print TXT "$yyyymmdd,$project2,$counts,$title\n" ; }

 # Special:Search/<subpage> = specific search, bare Special:Search =
 # generic, anything else (e.g. localized variants) = other.
 if ($title =~ /^Special:Search\//i)
 { $specific += $counts ; }
 elsif ($title =~ /^Special:Search/i)
 { $generic += $counts ; }
 else
 { $other += $counts ; }
 }
 }
 }
 close IN ;
 }
 # NOTE(review): OUT is never opened in this sub (CSV and TXT are the open
 # handles), so this close is a no-op -- possibly CSV was meant.
 close OUT ;
 $month ++ ;
 if ($month > 12)
 { $month = 1 ; $year ++ ; }

 }
}
 104+
# Build per-article and per-category page view reports from a filtered
# pagecount file plus an article/category list (WikiStatsArticles.csv).
# NOTE(review): this sub is not invoked from this script (main only calls
# &ScanFiles) and relies on globals ($dir_in, $dir_out, $wikipedia,
# $categoryroot, $month_out, $daysinmonth) that are never set here --
# apparently a leftover from the script this one was derived from.
sub CountArticles
{
 print "CountArticles\n" ;
 if (! -e "$dir_in/$file_csv_pagecounts")
 { print "File not found: $dir_in/$file_csv_pagecounts\n" ; exit ; }

 # Pass 1: total views per normalised, lower-cased title.
 open IN, '<', "$dir_in/$file_csv_pagecounts" ;
 while ($line = <IN>)
 {
 chomp ($line) ;

 ($count,$title) = split (' ', $line,2) ;
# if ($title !~ /Depardieu/) { next ; }
 # Decode %XX url escapes, optionally collapse multi-byte utf-8 to ascii,
 # then decode html numeric entities and common named entities.
 $title =~ s/%([0-9A-F]{2})/chr(hex($1))/ge ;
 if ($unicodetoascii)
 { $title =~ s/([\x80-\xFF]{2,})/&UnicodeToAscii($1)/ge ; }
 $title =~ s/(\&\#\d+\;)/&HtmlToAscii($1)/ge ;
 $title =~ s/\&quot;/'/g ;
 $title =~ s/\&amp;/&/g ;
 $title = lc ($title) ;
# print "X $count $title\n" ;
 $titles {$title} += $count ;
 }
 close IN ;

 # Pass 2: join the article/category list against the collected counts.
 # NOTE(review): this OUT file is reopened (and thus overwritten) under the
 # same name directly after this loop.
 open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByTitle.txt" ;
 open IN, '<', "$dir_out/WikiStatsArticles.csv" ;
 while ($line = <IN>)
 {
 chomp ($line) ;
 ($title,$category) = split (',',$line) ;

# next if $category !~ /politicus/ ;
# next if $category =~ /Nederlands/ ;
# $category =~ s/-politicus// ;

# if ($title !~ /Depardieu/) { next ; }
 $title =~ s/\%2C/,/g ;
 $category =~ s/\%2C/,/g ;
 $title =~ s/\s/_/g ;
 $title =~ s/(\&\#\d+\;)/&HtmlToAscii($1)/ge ;
 $title =~ s/\&quot;/'/g ;
 $title =~ s/\&amp;/&/g ;
 $title_lc = lc ($title) ;
 $count = ($titles {$title_lc}+0) ; # force numeric
# print "Y $count $title_lc\n" ;
 print OUT sprintf ("%5d",$count) . " " . $title . "\n" ;
 # Count each distinct title once even when listed under several categories.
 if ($title ne $title_prev)
 { $articles {$title} += $count ; }
 $title_prev = $title ;
 $categories {$category} += $count ;
 $titlecat {$title} = $category ;
 }
 close IN ;
 close OUT ;

 # Reports: articles by title / by views, categories by title / by views.
 open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByTitle.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $article (sort keys %articles)
# { print OUT sprintf ("%5d",$articles {$article}) . " " . $article . "\n" ; }
 { &Print ($articles {$article}, $article) ; }
 close OUT ;

 open OUT, '>', "$dir_out/WikiStatsPageViewsPerArticleSortByViews.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $article (sort {$articles {$b} <=> $articles {$a}} keys %articles)
# { print OUT sprintf ("%5d",$articles {$article}) . " " . $article . "\n" ; }
 { &Print ($articles {$article}, $article) ; }
 close OUT ;

 open OUT, '>', "$dir_out/WikiStatsPageViewsPerCategorySortByTitle.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $category (sort keys %categories)
# { print OUT sprintf ("%5d",$categories {$category}) . " " . $category . "\n" ; }
 { &Print ($categories {$category}, $category) ; }
 close OUT ;

 open OUT, '>', "$dir_out/WikiStatsPageViewsPerCategorySortByViews.txt" ;
 print OUT "Wikipedia '$wikipedia', Category: '$categoryroot', Month: '$month_out'\n" ;
 foreach $category (sort {$categories {$b} <=> $categories {$a}} keys %categories)
# { print OUT sprintf ("%5d",$categories {$category}) . " " . $category . "\n" ; }
 { &Print ($categories {$category}, $category) ; }
 close OUT ;

# open OUT, '>', "$dir_out/WikiStatsPageViewsPerPerArticleSortByViewsPvdA.csv" ;
# print OUT "politicus,partij,hits,kleur\n" ;
# foreach $article (sort {$articles {$b} <=> $articles {$a}} keys %articles)
# {
# last if $articles {$article} == 0 ;
# next if $titlecat {$article} !~ /pvda/i ;
# $color = int(rand(255)) ;
# print OUT "$article,${titlecat {$article}},${articles {$article}},$color\n" ;
# }
# close OUT ;

}
 201+
# Write one report line to the currently open OUT handle: monthly count,
# derived per-day average, and the title/category text.
# NOTE(review): relies on the global $daysinmonth being set by the caller;
# it is never assigned in this file, so the division would warn/fail --
# confirm before reviving this cellar script.
sub Print
{
 my $count = shift ;
 my $text = shift ;
 print OUT sprintf ("%5d",$count) . " p/m = " . sprintf ("%4.0f",$count/$daysinmonth) . " p/d : $text\n" ;
}
 208+
# Translates one UTF-8 encoded character into a plain one-byte (Latin-1)
# character when the decoded code point fits in a single byte; otherwise the
# original byte sequence is returned unchanged.
sub UnicodeToAscii {
 my $utf8 = shift ;

 my $lead_ord = ord (substr ($utf8,0,1)) ;

 # Plain ascii lead byte: nothing to decode (will not occur in this script).
 return ($utf8) if $lead_ord < 128 ;

 # Strip the UTF-8 sequence-length marker bits from the lead byte.
 my $codepoint ;
 if    ($lead_ord >= 252) { $codepoint = $lead_ord - 252 ; }
 elsif ($lead_ord >= 248) { $codepoint = $lead_ord - 248 ; }
 elsif ($lead_ord >= 240) { $codepoint = $lead_ord - 240 ; }
 elsif ($lead_ord >= 224) { $codepoint = $lead_ord - 224 ; }
 else                     { $codepoint = $lead_ord - 192 ; }

 # Fold in six payload bits from each continuation byte.
 foreach my $pos (1 .. length ($utf8) - 1)
 { $codepoint = $codepoint * 64 + ord (substr ($utf8, $pos, 1)) - 128 ; }

 return (chr ($codepoint)) if $codepoint < 256 ;

 # Beyond Latin-1: give back the original sequence untouched.
 return ($utf8) ;
}
 237+
# Translates an HTML numeric entity (e.g. '&#233;') into the corresponding
# one-byte character when its value fits in 0..255; larger entities are
# returned unchanged.
sub HtmlToAscii {
 my $entity = shift ;
 (my $digits = $entity) =~ s/[^\d]//g ;
 return ($digits <= 255) ? chr ($digits) : $entity ;
}
 247+
# Print a message to both the screen and the FILE_LOG handle.
# NOTE(review): FILE_LOG is never opened in this script (and &Abort below
# writes to LOG instead) -- the second print is silently discarded.
sub Log
{
 $msg = shift ;
 print $msg ;
 print FILE_LOG $msg ;
}
 254+
# Print an error message to screen and LOG, then terminate the script.
# NOTE(review): LOG is never opened in this script (and &Log above writes to
# FILE_LOG) -- the log print is silently discarded.
sub Abort
{
 $msg = shift ;
 print "Abort script\nError: $msg\n" ;
 print LOG "Abort script\nError: $msg\n" ;
 exit ;
}
 262+
 263+
 264+
Index: trunk/wikistats/dammit.lt/cellar/!DammitCollectViewsOneArticle.pl
@@ -0,0 +1,199 @@
#!/usr/local/bin/perl

# 27 April 2010 renamed from WikiStatsCollectViewsOneArticle.pl

# Collects view counts for one hard-coded article (currently the url-encoded
# Falun Gong title, see &ProcessFile) from hourly or daily pagecount dumps,
# then aggregates the collected lines into per-day totals (&ProcessSelection).

 use CGI qw(:all);
 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
 use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib

 $| = 1; # flush screen output
 $true = 1 ;
 $false = 0 ;
 $mode = "H" ; # "H" = hourly files, "D" = daily files

# $dir0 = "D:/Wikipedia_Visitors/full_day" ;
 $dir0 = "D:/Wikipedia_Visitors" ;
 chdir ($dir0) || die "Cannot chdir to $dir0\n";

# Earlier collection runs (disabled): scan two months of dumps into a raw
# per-hour text file, which &ProcessSelection below then aggregates.
# open TXT, ">", "JoeBiden.txt" ;
# open TXT, ">", "FalungGong.txt" ;
# &ProcessMonth (2008,7) ;
# &ProcessMonth (2008,8) ;
# close TXT ;

 &ProcessSelection ;

 exit ;
 28+
# Aggregate the raw per-hour lines collected earlier (FalungGong.txt) into
# per-day totals and write them as a csv time series (FalungGongTotals.csv).
sub ProcessSelection
{
 # NOTE(review): filehandles given as string literals work without 'use
 # strict', but bareword or lexical handles are the conventional form.
 open "IN", "<", "FalungGong.txt" ;
 open "OUT", ">", "FalungGongTotals.csv" ;
 while ($line = <IN>)
 {
 chomp ($line) ;
 $line =~ s/\s+/ /g ;
 ($timestamp, $project, $count, $title) = split (' ', $line) ;
 # $timestamp =~ s/\d\d\d\d$// ; # discard minutes and seconds
 $timestamp =~ s/\-\d\d\d\d\d\d$// ; # discard hours, minutes and seconds
 # NOTE(review): '@counts_zh {...}' is a one-element hash slice used as an
 # lvalue; it behaves like $counts_zh{...} here but warns under -w and is
 # better written with the $ sigil.
 if ($project eq "zh")
 { @counts_zh {$timestamp} += $count ; }
 else
 { @counts_other {$timestamp} += $count ; }
 }
 close IN ;

# One csv row per day (dd/mm/yyyy) for the Chinese Wikipedia counts;
# %counts_other is only used by the disabled hourly report below.
foreach $date (sort keys %counts_zh)
{
 $year = substr ($date,0,4) ;
 $month = substr ($date,4,2) ;
 $day = substr ($date,6,2) ;
 $timestamp = sprintf ("%02d/%02d/%04d", $day, $month, $year) ;
 print OUT $timestamp . "," . (@counts_zh {$date}) . "\n" ;
}

# Disabled: hourly csv report for Jul-Sep 2008, kept for reference.
if (0)
{
 $month = 7 ;
 for $day (1..31)
 {
 for $hour (0..23)
 {
 $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 7, $day, $hour) ;
 $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 7, 2008, $hour, 0) ;
 print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
 }
 }

 $month = 8 ;
 for $day (1..31)
 {
 for $hour (0..23)
 {
 $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 8, $day, $hour) ;
 $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 8, 2008, $hour, 0) ;
 print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
 }
 }

 $month = 9 ;
 for $day (1..14)
 {
 for $hour (0..23)
 {
 $timestamp = sprintf ("%04d%02d%02d-%02d", 2008, 9, $day, $hour) ;
 $timestamp2 = sprintf ("%02d/%02d/%04d %02d:%02d", $day, 9, 2008, $hour, 0) ;
 print OUT $timestamp2 . "," . (@counts_zh {$timestamp}+0) . "," . (@counts_other {$timestamp}+0) . "\n" ;
 }
 }
}
 close OUT ;
}
 93+
# Process one month of pagecount dump files.
# Arguments: year (numeric), month (numeric, zero-padded internally).
# Selects files in "$dir0/$year-$month-pagecounts" matching the active global
# $mode ("H" = hourly pagecounts-YYYYMMDD-HHMMSS.gz, "D" = daily *_fd.gz),
# sorts them chronologically and feeds each to &ProcessFile.
sub ProcessMonth
{
 my $year = shift ;
 my $month = sprintf ("%02d", shift) ;

 $dir0 =~ s/[\\\/]$// ;

 $dir_in = "$dir0/$year-$month-pagecounts" ;
 &Log ("Process year $year month $month from '$dir_in'\n") ;
 chdir ($dir_in) || die "Cannot chdir to $dir_in\n";
 local (*DIR);

 opendir (DIR, ".") || die "Cannot open dir '$dir_in'\n";
 @files = () ;
 while ($file_in = readdir (DIR))
 {
 if ($mode eq "H")
 {
 # Hourly archives only; skip everything before 2008-08-16 00:00.
 next if $file_in !~ /^pagecounts-\d{8,8}-\d{6,6}.gz$/ ;
 next if $file_in lt "pagecounts-20080816-000000.gz" ;
 }
 if ($mode eq "D")
 {
 # Full-day archives only.
 next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
 }
 push @files, $file_in ;
 }
 # Bug fix: closedir takes a single argument; the original extra "."
 # argument was a compile-time error ("Too many arguments for closedir").
 closedir (DIR);

 @files = sort {$a cmp $b} @files ;

 foreach $file (@files)
 { &ProcessFile ($file) ; }
}
 137+
# Scan one gzipped pagecount file for the hard-coded article and append all
# matching count lines, tagged with "YYYYMMDD-HHMMSS", to screen and TXT.
# The date/time are sliced from the filename: pagecounts-YYYYMMDD-HHMMSS.gz.
sub ProcessFile
{
 my $file = shift ;
 $date = substr ($file, 11, 8) ;
 $time = substr ($file, 20, 6) ;
 print "ProcessFile ($file)\n" ;

 my $lines ;
 $in_gz = IO::Uncompress::Gunzip->new ($file) or die "IO::Uncompress::Gunzip failed for '$file': $GunzipError\n";
 binmode $in_gz ;
 while ($line = <$in_gz>)
 {
 # if ($line ge "eo")
 # { last ; }
 # if ($line !~ /^en /)
 # { next ; }
 # if ($lines ++ == 0) { print "$line" ; }

# if ($line =~ /sarah.*palin/i)
# if ($line =~ /joe.*biden/i)
 # Url-encoded utf-8 of the tracked article title.
 if ($line =~ / \%E6\%B3\%95\%E8\%BD\%AE\%E5\%8A\%9F /)
 {
 if ($mode eq "H")
 {
 # Hourly file line: 'wiki title views bytes'.
 ($wiki,$title,$views,$bytes) = split (' ', $line) ;
 $line = sprintf ("%-10s", $wiki) . " " . sprintf ("%8d",$views) . " $title\n" ;
 print "$date-$time $line" ;
 print TXT "$date-$time $line" ;
 }
 if ($mode eq "D")
 {
 chomp ($line) ;

 # Daily file: the counts field is the day total followed by per-hour
 # counts, each prefixed by a letter A..X encoding hour 00..23.
 ($wiki,$title,$views_all_day) = split (' ', $line) ;
 $wiki =~ s/\.z// ;
 $wiki =~ s/\.y/2/ ;
 $views_all_day =~ s/^\d+// ; # remove (redundant) preceding total
 while ($views_all_day ne "")
 {
 $letter = substr ($views_all_day,0,1) ;
 $views_all_day = substr ($views_all_day,1) ;
 ($views_one_hour = $views_all_day) =~ s/^(\d+).*$/$1/ ;
 $views_all_day =~ s/^\d+(.*)$/$1/ ;
 # Hour letter -> "HH0000" pseudo time of day.
 $time = sprintf ("%02d",ord ($letter) - ord ('A')) . "0000" ;

 $line = sprintf ("%-10s", $wiki) . " " . sprintf ("%8d",$views_one_hour) . " $title\n" ;
 print "$date-$time $line" ;
 print TXT "$date-$time $line" ;
 }
 }
 }
 }

 $in_gz->close() ;
}
 193+
# Print a message to screen and to the LOG handle.
# NOTE(review): LOG is never opened in this script, so the second print is
# silently discarded.
sub Log
{
 $msg = shift ;
 print $msg ;
 print LOG $msg ;
}
 200+
Index: trunk/wikistats/dammit.lt/cellar/!DammitFilterDailyPagecountFilesPerLanguage.pl
@@ -0,0 +1,156 @@
#!/usr/bin/perl

# 27 April 2010 renamed from WikiStatsFilterCompactedDammitFilesPerLanguage.pl

# Walks the monthly pagecount folders starting 2008-08 and, for each month
# that exists, writes a single-language filtered file via &FilterCounts
# (skipping months whose 7z archive already exists on the bayes server).

 use lib "/home/ezachte/lib" ;
 use EzLib ;
 $trace_on_exit = $true ;

 use CGI::Carp qw(fatalsToBrowser);
 use Time::Local ;
 use Net::Domain qw (hostname);

 $language = "nl" ;
 $wikipedia = "$language.wikipedia.org" ; # read from input

 # Defaults for local runs; on bayes use the shared pagecount tree.
 # NOTE(review): $hostname is not assigned in this file -- presumably EzLib
 # sets it (Net::Domain's hostname() is imported above but never called);
 # verify before relying on the bayes branch.
 $path_in = "." ;
 $path_out = "." ;
 if ($hostname eq "bayes")
 {
 $path_in = "/a/dammit.lt/pagecounts" ;
 $path_out = "/a/dammit.lt/pagecounts/languages/$language.z" ;
 if (! -d $path_out)
 { mkdir $path_out, 0777 ; }
 $path_7za = "/usr/lib/p7zip/7za" ;
 }

 # Iterate over consecutive months until the first missing monthly folder.
 $month = 8 ;
 $year = 2008 ;
 $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 $path_in_monthly = "$path_in/$yyyymm" ;
 while (-d $path_in_monthly)
 {
 print "\nCheck dir $path_in_monthly\n" ;

 $file_filtered = "$path_out/pagecounts-$yyyymm-$language-fdt.txt" ;

 if ($hostname eq "bayes")
 {
 $file_filtered_7z = "$file_filtered.7z" ;

 if (-e $file_filtered_7z)
 { print "File $file_filtered_7z already exists\n" ; }
 else
 { &FilterCounts ($yyyymm, $file_filtered) ; }
 }
 else
 { &FilterCounts ($yyyymm, $file_filtered) ; }

 $month++ ;
 if ($month > 12)
 { $month = 1 ; $year++ ; }
 $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
 $path_in_monthly = "$path_in/$yyyymm" ;
 }

 print "\n\nReady\n\n" ;
 exit ;
 59+
# Filter one month of daily compacted pagecount files down to a single
# language project ("$language.z" = that language's Wikipedia) and write the
# result to $file_filtered; on bayes the result is then 7-zipped and the
# plain file removed on success.
# Arguments: $yyyymm ("YYYY-MM"), $file_filtered (output path).
sub FilterCounts
{
 my ($yyyymm, $file_filtered) = @_ ;
 ($yyyymm2 = $yyyymm) =~ s/-// ;

 # Robustness fix: the original open was unchecked.
 open OUT, '>', $file_filtered or die "Output file '$file_filtered' could not be opened." ;

 # File header: explains the compact counts format to downstream users.
 print OUT "# Counts for articles with less than a few requests per full day (before April 2010 five per day, from then on two per day) were not preserved in daily archives and hence are neither included here\n" ;
 print OUT "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
 print OUT "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
 print OUT "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
 print OUT "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
 print OUT "# Please reconcile with real namespace name strings later\n" ;
 print OUT "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
 print OUT "# Page titles are shown unmodified (preserves sort sequence)\n" ;


 for ($day = 1 ; $day <= 31 ; $day++)
 {
 $yyyymmdd = "$yyyymm-" . sprintf ("%02d", $day) ;

 $file_pagecounts = "$path_in/$yyyymm/pagecounts-$yyyymm2" . sprintf ("%02d", $day) . "_fdt" ;
 if ($hostname eq "bayes")
 { $file_pagecounts .= ".7z" ; }


 # Missing days are recorded in the output and skipped.
 if (! -e $file_pagecounts)
 {
 print "\nNot found: $file_pagecounts\n" ;
 print OUT "# $yyyymmdd missing!\n" ;
 next ;
 }

 print "Read $file_pagecounts\n" ;
 print OUT "# $yyyymmdd\n" ;

 # Bug fix: '|| die' bound to the command string (always true), so a
 # failed open was never detected; low-precedence 'or' checks open itself.
 if ($hostname eq "bayes")
 { open IN, "-|", "./7za e -so \"$file_pagecounts\"" or die ("Input file '" . $file_pagecounts . "' could not be opened.") ; }
 else
 { open IN, '<', $file_pagecounts ; }

 while ($line = <IN>)
 {
 $ch = substr ($line,0,1) ;

 next if $ch eq '#' ; # comments

 if ($ch eq '@') # summary per language project
 {
 if ($line =~ /^\@ $language\.z /o)
 { print OUT $line ; }
 next ;
 }

 # Input is sorted by project code: skip until "$language.z" is reached,
 # stop at the first line past that section.
 next if $line lt "$language.z" ;
 # Bug fix: the dot was unescaped and could match any character.
 last if $line !~ /$language\.z / ;

 ($project, $title, $counts) = split (' ', $line) ;
 print OUT "$title $counts\n" ;
 }
 close IN ;
 }
 close OUT ;

 # Compress the filtered file; remove the plain copy only when 7za reports
 # success, otherwise remove the (possibly partial) archive.
 $cmd = "$path_7za a $file_filtered.7z $file_filtered" ;
 $result = `$cmd` ;

 if ($result =~ /Everything is Ok/s)
 {
 $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
 unlink $file_filtered ;
 }
 else
 {
 print "Delete $file_filtered.7z\n" ;
 unlink "$file_filtered.7z" ;
 }

 print "$cmd -> $result\n" ;
}
 142+
# Print a trace message to the screen (this script keeps no log file).
sub Log
{
 $msg = shift ;
 print $msg ;
}
 148+
# Print an error message and terminate the script.
sub Abort
{
 $msg = shift ;
 print "Abort script\nError: $msg\n" ;
 exit ;
}
 155+
 156+
 157+
Index: trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForFundraiser.pl
@@ -0,0 +1,86 @@
#!/usr/bin/perl

# Patch hourly projectcounts files using per-hour average deltas read from
# AllSquids.csv (rows: date,hour,events,avg_delta). Rows from 2010 whose
# average delta exceeds 1005 are corrected via &Patch / &PatchFile; the
# factor applied there is avg_delta/1000.

$| = 1; # flush screen output

open IN, '<', 'DammitPatchProjectcountsForFundraiser/AllSquids.csv' ;
open LOG, '>', 'DammitPatchProjectcountsForFundraiser/Log.txt' ;

chdir ("DammitPatchProjectcountsForFundraiser") || die "Cannot chdir to DammitPatchProjectcountsForFundraiser\n" ;

while ($line = <IN>)
{
 chomp $line ;

 # Skip comment/summary rows and anything not from 2010.
 next if $line =~ /[*]/ ;
 next if $line !~ /^2010/ ;

 ($date,$hour,$events,$avg_delta) = split (',', $line) ;

 next if $avg_delta <= 1005 ; # normally projectcounts also miss a few hits, overcorrecting would skew trends
 &Patch ($date, $hour, $avg_delta) ;
}

print "\n\nReady\n\n" ;
exit ;
 26+
# Locate the projectcounts file for the given date and hour and patch it.
# Tries the -HH0000 timestamp suffix first, then -HH0001 as fallback
# (presumably for files stamped one second late -- TODO confirm); terminates
# the whole script when neither file exists.
sub Patch
{
 ($date,$hour,$avg_delta) = @_ ;

 $date =~ s/-//g ;
 $file = "projectcounts-$date-" . sprintf ("%02d",$hour) . "0000" ;

 if (! -e $file)
 {
 $file = "projectcounts-$date-" . sprintf ("%02d",$hour) . "0001" ;
 if (! -e $file)
 {
 print "File '$file' missing!\n" ;
 exit ;
 }
 }
 &PatchFile ($file, $avg_delta) ;
}
 45+
# Scale the per-project view counts in one projectcounts file by
# $avg_delta/1000, rewriting the file in place. Only lines with a positive
# bytes field are scaled; their bytes field is set to 1 afterwards.
# Arguments: $file (projectcounts filename), $avg_delta (average delta,
# 1000 = no correction).
sub PatchFile
{
 my ($file,$avg_delta) = @_ ;
 my $line ;
 $correction = $avg_delta / 1000 ;
 print "Patch file $file: avg delta $avg_delta -> correction $correction\n" ;

 # Bug fix: '||' bound to $file (always true), so a failed open went
 # undetected; low-precedence 'or' checks the return value of open itself.
 open PROJECTFILE, '<', $file or die "Could not open '$file'\n" ;

 undef @projectfile ;
 $file_changed = 0 ;
 while ($line = <PROJECTFILE>)
 {
 chomp $line ;
 # Line layout: 'project - count bytes'.
 ($project,$dash,$count,$bytes) = split (' ', $line) ;

 if ($bytes > 0)
 {
 # Round the corrected count; bytes field becomes 1 on patched lines.
 $count = sprintf ("%.0f", $correction * $count) ;
 $line = "$project $dash $count 1" ;
 }
 push @projectfile, "$line\n" ;
 }

 close PROJECTFILE ;

 # Write the patched content back over the original file.
 open PROJECTFILE, '>', $file or die "Could not open '$file'\n" ;
 print PROJECTFILE @projectfile ;
 close PROJECTFILE ;
}
 78+
# Print a message to both the screen and the LOG file.
sub Log
{
 my $msg = shift ;
 print $msg ;
 print LOG $msg ;
}
 85+
 86+
 87+
Index: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.pl
@@ -0,0 +1,44 @@
#!/usr/local/bin/perl
use CGI qw(:all);

# Rank help pages (Help:/Hilfe:/Aide: titles) by view count per project,
# reading one daily compacted pagecount file and writing the top ~50 titles
# per project to !DammitRankSpecialPages.txt.

 open IN, '<', "pagecounts-20090301_fdt" ;
 open OUT, '>', "!DammitRankSpecialPages.txt" ;
 $projprev = "" ;
 while ($line = <IN>)
 {
 if ($line =~ /^#/) { next ; }
 if ($line =~ /^@/) { next ; }
 # if (($line !~ / Wikipedia\:/) && ($line !~ / Help\:/) && ($line !~ / Hilfe\:/) && ($line !~ / Wikipédia\:/) && ($line !~ / Aide\:/) )
 if (($line !~ / Help\:/) && ($line !~ / Hilfe\:/) && ($line !~ / Aide\:/))
 { next ; }

 chomp ($line) ;
 ($project, $title, $counts) = split (' ', $line) ;
 # Map the dammit.lt project suffix to a readable 'family:language' label.
 $project =~ s/^([^\.]+)\.z/wikipedia:$1/ ;
 $project =~ s/^([^\.]+)\.b/wikibooks:$1/ ;
 $project =~ s/^([^\.]+)\.d/wiktionary:$1/ ; # dictionaire
 $project =~ s/^([^\.]+)\.m/wikimedia:$1/ ;
 $project =~ s/^([^\.]+)\.n/wikinews:$1/ ;
 $project =~ s/^([^\.]+)\.q/wikiquote:$1/ ;
 $project =~ s/^([^\.]+)\.s/wikisource:$1/ ;
 $project =~ s/^([^\.]+)\.v/wikiversity:$1/ ;
 $project =~ s/^([^\.]+)\.x/wikispecial:$1/ ;
 # When a new project starts, flush the ranked counts of the previous one.
 # NOTE(review): the counts of the very last project in the file are never
 # flushed -- a final flush after the loop is missing.
 if ($project ne $projprev)
 {
 $rows = 0 ;
 foreach $key (sort {$counts {$b} <=> $counts {$a}} keys %counts)
 {
 print OUT sprintf ("%8d", $counts {$key} ) . ": $key\n" ;
 if ($rows++ > 50)
 { last ;}
 }
 undef %counts ;
 }
 $projprev = $project ;

 # Keep only the day total from the compact counts field.
 $counts =~ s/^(\d+).*$/$1/ ;
 # NOTE(review): one-element hash slice; '$counts{...}' is meant.
 @counts {"$project $title"} += $counts ;
 }
 43+
 44+
 45+
Index: trunk/wikistats/dammit.lt/cellar/!DammitCollectArticleNames.pl
@@ -0,0 +1,152 @@
#!/usr/local/bin/perl

# 27 April 2010 renamed from WikiStatsCollectArticleNames.pl

# Extracts all page titles from the latest completed xml dump of a project.
# Usage: -i dump_root -o out_folder -p project [-m wp|wb|wk|wn|wq|ws|wx|wv]

use CGI qw(:all);
use Time::Local ;
use Getopt::Std ;

 &ParseArguments ;
 $dumpfile = &FindDumpFile ;
 &ProcessFile ($dumpfile, "$path_out/$mode\_$project.txt") ;
 print "\n\nReady\n\n" ;
 exit ;
 15+
# Parse command line options:
#   -i path   input folder with xml dump folders (required)
#   -o path   output folder (required)
#   -p name   project/database name, e.g. 'enwiki'
#   -m mode   project family code (wp/wb/wk/wn/wq/ws/wx/wv), default wp
# Derives global $language from $project by stripping the family suffix.
sub ParseArguments
{
 my %options ;
 getopt ("iomp", \%options) ;

 # Bug fix: hash elements are accessed as $options{...}, not as slices;
 # 'defined(@slice)' is deprecated and fatal in modern perls.
 &Abort ("Specify input folder for xml dump files as: -i path") if (! defined ($options {"i"})) ;
 &Abort ("Specify output folder as: -o path") if (! defined ($options {"o"})) ;

 $path_in = $options {"i"} ;
 $path_out = $options {"o"} ;
 $project = $options {"p"} ;
 $mode = $options {"m"} ;

 $language = $project ;
 $language_ = $language ;
 $language_ =~ s/-/_/g ;

 if ($mode eq "")
 { $mode = "wp" ; }
 # Bug fix: the original called lowercase 'abort', which is not defined
 # anywhere; the sub is &Abort.
 if ($mode !~ /^(?:wb|wk|wn|wp|wq|ws|wx|wv)$/)
 { &Abort ("Specify mode as: -m [wb|wk|wn|wp|wq|ws|wx|wv]\n(wp=wikipedia (default), wb=wikibooks, wk=wiktionary, wn=wikinews, wq=wikiquote, ws=wikisource, wx=wikispecial, wv=wikiversity)") ; }

 &Abort ("Project $project is skipped: 'mania' and/or 'team' in the name") if ($project =~ /(?:mania|team)/i) ;

 # Strip the project family suffix to obtain the bare language code.
 if ($project =~ /wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$/i)
 {
 $project_suffix = $project ;
 $project_suffix =~ s/wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$// ;
 }
 $language =~ s/wik(?:|ibooks|inews|iquote|isource|tionary|iversity)$// ;

 if ($project =~ /wiki$/i)
 {
 $project_suffix = $project ;
 $project_suffix =~ s/wiki$// ;
 }
 $language =~ s/wiki$// ;

 &Log ("Project '$project' -> language '$language'\n\n") ;
}
 56+
# Find the most recent completed dump folder under $path_in (folders named
# YYYYMMDD) and derive the pages-meta-current dump file name from it.
# A folder qualifies when its status.html reports 'dump complete' and its
# index.html does not report a failed full-history pass.
# Side effects: appends the chosen folder to $path_in, sets $dumpdate.
# Returns the full path of the bz2 dump file.
sub FindDumpFile
{
 my ($dumpdir,$dir,$file,$scandir,$status) ;

 @files = glob "$path_in/*" ;

 &Log ("Find latest valid dump dir in $path_in ->\n\n") ;
 foreach $file (@files)
 {
 # Only 8-digit (date-named) sub folders qualify.
 if ($file !~ /\/\d{8,8}$/)
 { next ; }
 if (! -d $file)
 { next ; }

 ($dir = $file) =~ s/.*?\/(\d{8,8})/$1/ ;
 $scandir = "$path_in/$dir" ;
 if (! -e "$scandir/status.html")
 { &Log ("$scandir/status.html not found\n") ; }
 elsif (! -e "$scandir/index.html")
 { &Log ("$scandir/index.html not found\n") ; }
 else
 {
 # Classify the dump from the first line of status.html.
 open STATUS, '<', "$scandir/status.html" ;
 $line = <STATUS> ;
 chomp $line ;
 close STATUS ;
 $status = "undetermined: $line" ;
 if ($line =~ /dump complete/i)
 { $status = "dump complete" ; }
 elsif ($line =~ /dump aborted/i)
 { $status = "dump aborted" ; }
 elsif ($line =~ /dump in progress/i)
 { $status = "dump in progress" ; }
 # Keep the newest complete folder; string compare works for YYYYMMDD.
 if ($dumpdir lt $dir)
 {
 if ($status eq "dump complete")
 {
 # Demote dumps whose full-history pass failed.
 open INDEX, '<', "$scandir/index.html" ;
 while ($line = <INDEX>)
 {
 if ($line =~ /failed.*?All pages with complete.*?edit history/i)
 {
 $status = "dump aborted (dump failed)" ;
 last ;
 }
 }
 close INDEX ;
 }
 if ($status eq "dump complete")
 { $dumpdir = $dir ; }
 }
 &Log ("$dir: $status\n") ;
 }
 }
 if ($dumpdir eq "")
 { &Abort ("No valid dump dir found\n") ; }

 $path_in .= "/$dumpdir/" ;
 &Log ("\nDump dir -> $path_in\n") ;
 $dumpdate = $dumpdir ;

 $dumpfile = "$path_in/$project-$dumpdate-pages-meta-current.xml.bz2" ;
 &Log ("\nFile in $dumpfile\n") ;
 return ($dumpfile) ;
}
 122+
# Extract all page titles from a pages-meta-current xml dump.
# Arguments: $file_in (bzip2-compressed xml dump), $file_out (plain text,
# one title per line).
sub ProcessFile
{
 my $file_in = shift ;
 my $file_out = shift ;
 print "File out $file_out\n" ;
 # Bug fix: '||' bound to the last open argument (always true) so failures
 # went undetected, and the called sub is &Abort, not undefined 'abort';
 # low-precedence 'or' checks the result of open itself.
 open FILE_OUT, '>', $file_out or &Abort ("Output file '" . $file_out . "' could not be opened.") ;
 open FILE_IN, "-|", "bzip2 -dc \"$file_in\"" or &Abort ("Input file '" . $file_in . "' could not be opened.") ;
 while ($line = <FILE_IN>)
 {
 # The substitution is used only for its /e side effect: each
 # <title>...</title> payload is printed to FILE_OUT.
 # $line =~ s/<title>([^<]*)<\/title>/print FILE_OUT "$1\n", print "$1\n"/ge ;
 $line =~ s/<title>([^<]*)<\/title>/print FILE_OUT "$1\n"/ge ;
 }
 close FILE_IN ;
 # Bug fix: FILE_OUT was never closed, risking lost buffered output.
 close FILE_OUT ;
}
 137+
# Print a trace message to the screen (log file output disabled).
sub Log
{
 $msg = shift ;
 print $msg ;
# print LOG $msg ;
}
 144+
# Print an error message and terminate the script (log file output disabled).
sub Abort
{
 $msg = shift ;
 print "Abort script\nError: $msg\n" ;
# print LOG "Abort script\nError: $msg\n" ;
 exit ;
}
 152+
 153+
Index: trunk/wikistats/dammit.lt/cellar/!DammitPrepCollectHarvestInterwikiLinks.pl
@@ -0,0 +1,40 @@
#!/usr/bin/perl

# Prepare a grep pattern for harvesting interwiki links: parse a saved
# article page (index.php), collect the interwiki language links per title,
# then print one pattern line per (language, title) pair -- or a single
# catch-all line for titles linked from more than 10 languages.

open IN, '<', 'index.php' ;

while ($line = <IN>)
{
 if ($line =~ /class=\"interwiki/)
 {
 chomp ($line) ;
 # Language code from the 'interwiki-xx' class attribute.
 $lang = $line ;
 $lang =~ s/^.*?interwiki-(\w+).*$/$1/ ;
 # Title = last path component of the link target.
 $title = $line ;
 $title =~ s/^.*?href=\"([^\"]+)\".*$/$1/ ;
 $title =~ s/^.*\/([^\/]+)$/$1/ ;
# print "[$lang] $title\n" ;
 # NOTE(review): '@languages {...}' / '@langcnt {...}' are one-element
 # hash slices used as lvalues; they work here but the $ sigil is meant.
 @languages {$title} .= "$lang," ;
 @langcnt {$title}++ ;
 }
}
print "\n\n\n" ;

# Emit titles by descending number of linking languages.
foreach $title (sort {$langcnt {$b} <=> $langcnt {$a}} keys %langcnt)
{
 $count = $langcnt {$title} ;
 if ($count > 10)
 { $pattern .= "^$title\n" ; }
 else
 {
 $langlist = $languages {$title} ;
 @langs = split (',', $langlist) ;
 foreach $lang (@langs)
 {
 print "$lang $title\n" ;
 $pattern .= "^$lang\.z $title\n"
 }
 }
}

print "\n\nPATTERN:\n$pattern\n" ;
 41+
Index: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt
@@ -0,0 +1,576 @@
 2+ 14: wikipedia:als Hilfe:Neue_Seite_anlegen
 3+ 33: wikipedia:am Help:Contents
 4+ 68: wikipedia:ang Help:Innung
 5+ 10: wikipedia:arc Help:Contents
 6+ 31: wikipedia:ay Help:Contents
 7+ 12: wikipedia:bar Hilfe:Hilfe
 8+ 10: wikiversity:beta Help:Contents
 9+ 11: wikipedia:bo Help:Contents
 10+ 10: wikipedia:chr Help:Contents
 11+ 11: wikipedia:co Help:Contents
 12+ 993: wikimedia:commons Help:Contents
 13+ 86: wikimedia:commons Help:Inkscape
 14+ 30: wikimedia:commons Help:SVG
 15+ 23: wikimedia:commons Help:Creating_a_DjVu_file
 16+ 21: wikimedia:commons Help:Sommaire
 17+ 20: wikimedia:commons Help:Converting_video
 18+ 18: wikimedia:commons Hilfe:%C3%9Cbersicht
 19+ 17: wikimedia:commons Help:Scanning
 20+ 16: wikimedia:commons Help:%E7%9B%AE%E6%AC%A1
 21+ 15: wikimedia:commons Help:%C3%9Cbersicht
 22+ 14: wikimedia:commons Help:Mass_deletion_request
 23+ 13: wikimedia:commons Help:Zoomable_images
 24+ 12: wikimedia:commons Help:Logging_in
 25+ 11: wikimedia:commons Help:Mpeg2dv.sh
 26+ 36: wikibooks:de Hilfe:Sammlungen
 27+ 18: wikibooks:de Hilfe:Suche
 28+ 18: wikibooks:de Hilfe:So_schreibe_ich_gute_B%C3%BCcher
 29+ 11: wikibooks:de Hilfe:Erste_Schritte_auf_der_Spielwiese
 30+ 10: wikibooks:de Hilfe:Urheberrechte_beachten
 31+ 10: wikibooks:de Hilfe:Wikibook_lokal_speichern
 32+ 811: wiktionary:de Hilfe:H%C3%B6rbeispiele
 33+ 197: wiktionary:de Hilfe:Wortart
 34+ 164: wiktionary:de Hilfe:IPA
 35+ 63: wiktionary:de Hilfe:Nominativ
 36+ 61: wiktionary:de Hilfe:Sonderzeichen/Tabelle
 37+ 35: wiktionary:de Hilfe:H%C3%A4ufig_gestellte_Fragen
 38+ 35: wiktionary:de Hilfe:Genitiv
 39+ 34: wiktionary:de Hilfe:Plural
 40+ 33: wiktionary:de Hilfe:Singular
 41+ 26: wiktionary:de Hilfe:Pr%C3%A4sens
 42+ 26: wiktionary:de Hilfe:Akkusativ
 43+ 25: wiktionary:de Hilfe:Flexionstabellen_(Altgriechisch)
 44+ 25: wiktionary:de Hilfe:Dativ
 45+ 24: wiktionary:de Hilfe:Pr%C3%A4teritum
 46+ 23: wiktionary:de Hilfe:Suche
 47+ 21: wiktionary:de Hilfe:Konjunktiv
 48+ 19: wiktionary:de Hilfe:Sonderzeichen
 49+ 18: wiktionary:de Hilfe:Flexionstabellen_(Franz%C3%B6sisch)
 50+ 15: wiktionary:de Hilfe:H%C3%B6rbeispiele/Liste
 51+ 15: wiktionary:de Hilfe:Flexionstabellen_(Lateinisch)
 52+ 15: wiktionary:de Hilfe:Lautschrift
 53+ 14: wiktionary:de Hilfe:Positiv
 54+ 14: wiktionary:de Hilfe:Kasus
 55+ 13: wiktionary:de Hilfe:Imperativ
 56+ 13: wiktionary:de Hilfe:Flexionstabellen
 57+ 13: wiktionary:de Hilfe:Flexionstabellen_(Spanisch)
 58+ 12: wiktionary:de Hilfe:Partizip
 59+ 12: wiktionary:de Hilfe:Hinweise_f%C3%BCr_Leser
 60+ 10: wiktionary:de Hilfe:Komparativ
 61+ 19: wikinews:de Hilfe:Erste_Schritte
 62+ 18: wikinews:de Hilfe:%C3%9Cbersicht
 63+ 14: wikinews:de Hilfe:Zweite_Schritte
 64+ 10: wikinews:de Hilfe:Quellenverzeichnis
 65+ 11: wikiquote:de Hilfe:Erste_Schritte
 66+ 20: wikisource:de Hilfe:Bearbeitungsstand
 67+ 18: wikisource:de Hilfe:B%C3%BCcher
 68+ 16: wikisource:de Hilfe:Korrekturlesen
 69+ 10: wikisource:de Hilfe:Scannen_von_B%C3%BCchern
 70+ 1722: wikipedia:de Hilfe:Buchfunktion
 71+ 1154: wikipedia:de Hilfe:Sonderzeichen
 72+ 923: wikipedia:de Hilfe:Gesichtete_und_gepr%C3%BCfte_Versionen
 73+ 834: wikipedia:de Hilfe:Tutorial
 74+ 747: wikipedia:de Hilfe:TeX
 75+ 728: wikipedia:de Hilfe:Suche
 76+ 484: wikipedia:de Hilfe:Wikimedia_Commons
 77+ 417: wikipedia:de Hilfe:Neu_bei_Wikipedia
 78+ 338: wikipedia:de Hilfe:Bearbeitungshilfe
 79+ 272: wikipedia:de Hilfe:Tutorial/3
 80+ 242: wikipedia:de Hilfe:Spezialseiten
 81+ 241: wikipedia:de Hilfe:Tutorial/1
 82+ 233: wikipedia:de Hilfe:Bilder
 83+ 219: wikipedia:de Hilfe:Textgestaltung
 84+ 202: wikipedia:de Hilfe:Vorlagen
 85+ 191: wikipedia:de Hilfe:Seite_bearbeiten
 86+ 177: wikipedia:de Hilfe:Zusammenfassung_und_Quelle
 87+ 175: wikipedia:de Hilfe:Einzelnachweise
 88+ 173: wikipedia:de Hilfe:Tutorial/2
 89+ 169: wikipedia:de Hilfe:Tabellen
 90+ 167: wikipedia:de Hilfe:Audio
 91+ 166: wikipedia:de Hilfe:Neue_Seite_anlegen
 92+ 154: wikipedia:de Hilfe:Einstellungen
 93+ 141: wikipedia:de Hilfe:Formatvorlagen
 94+ 140: wikipedia:de Hilfe:Signatur
 95+ 139: wikipedia:de Hilfe:Tutorial/4
 96+ 126: wikipedia:de Hilfe:FAQ
 97+ 125: wikipedia:de Hilfe:Bausteine
 98+ 125: wikipedia:de Hilfe:Archivieren
 99+ 121: wikipedia:de Hilfe:Namensr%C3%A4ume
 100+ 117: wikipedia:de Hilfe:Links
 101+ 116: wikipedia:de Hilfe:Personendaten
 102+ 115: wikipedia:de Hilfe:Zusammenfassung_und_Quellen
 103+ 114: wikipedia:de Hilfe:Weiterleitung
 104+ 110: wikipedia:de Hilfe:Bearbeiten
 105+ 105: wikipedia:de Hilfe:Buchfunktion/Fragen_und_Antworten
 106+ 95: wikipedia:de Hilfe:Benutzerkonto_anlegen
 107+ 94: wikipedia:de Hilfe:Bild_und_Ton
 108+ 87: wikipedia:de Hilfe:Farben
 109+ 85: wikipedia:de Hilfe:Allgemeine_Textbausteine
 110+ 79: wikipedia:de Hilfe:Versionen
 111+ 79: wikipedia:de Hilfe:Bildertutorial
 112+ 78: wikipedia:de Hilfe:Navigation
 113+ 78: wikipedia:de Hilfe:Inhaltsverzeichnis
 114+ 76: wikipedia:de Hilfe:Benutzerkonto
 115+ 75: wikipedia:de Hilfe:Formatieren
 116+ 74: wikipedia:de Hilfe:Listen_und_Tabellen
 117+ 74: wikipedia:de Hilfe:Buchfunktion/Feedback_zur_Buchfunktion
 118+ 74: wikipedia:de Hilfe:Tutorial/6
 119+ 73: wikipedia:de Hilfe:Tutorial/5
 120+ 72: wikipedia:de Hilfe:Benutzernamensraum
 121+ 69: wikipedia:de Hilfe:Glossar
 122+ 345: wikibooks:en Help:Contents
 123+ 330: wikibooks:en Help:Page_validation
 124+ 125: wikibooks:en Help:Collections
 125+ 62: wikibooks:en Help:Starting_a_new_page_or_book
 126+ 42: wikibooks:en Help:Editing
 127+ 40: wikibooks:en Help:About
 128+ 36: wikibooks:en Help:Development_stages
 129+ 29: wikibooks:en Help:Searching
 130+ 25: wikibooks:en Help:Print_versions
 131+ 22: wikibooks:en Help:Contributing_FAQ
 132+ 18: wikibooks:en Help:Contents/editing_wikibooks_-_the_basics
 133+ 12: wikibooks:en Help:Why_contribute%3F
 134+ 12: wikibooks:en Help:How_to_start_a_book
 135+ 10: wikibooks:en Help:Images_and_other_uploaded_files
 136+ 10: wikibooks:en Help:FAQ
 137+ 869: wiktionary:en Help:Contents
 138+ 58: wiktionary:en Help:Searching
 139+ 45: wiktionary:en Help:How_to_check_translations
 140+ 44: wiktionary:en Help:Starting_a_new_page
 141+ 23: wiktionary:en Help:Example_sentences
 142+ 21: wiktionary:en Help:How_to_edit_a_page
 143+ 13: wiktionary:en Help:FAQ
 144+ 13: wiktionary:en Help:Edit_summary
 145+ 10: wiktionary:en Help:Audio_pronunciations
 146+ 10: wiktionary:en Help:Editing
 147+ 84: wikinews:en Help:Page_validation
 148+ 36: wikinews:en Help:Editing_http://schoolpapers.hostinginfive.com/bike.htm
 149+ 36: wikinews:en Help:Editing%20http://schoolpapers.hostinginfive.com/bike.htm
 150+ 22: wikinews:en Help:Contents
 151+ 14: wikinews:en Help:Editing
 152+ 10: wikinews:en Help:How_to_decorate_your_article
 153+ 271: wikiquote:en Help:Contents
 154+ 235: wikisource:en Help:Contents
 155+ 81: wikisource:en Help:Books
 156+ 56: wikisource:en Help:Public_domain
 157+ 38: wikisource:en Help:Adding_texts
 158+ 32: wikisource:en Help:Searching
 159+ 22: wikisource:en Help:DjVu_files
 160+ 15: wikisource:en Help:Introduction
 161+ 12: wikisource:en Help:DJVU_files
 162+ 11: wikisource:en Help:Editing_Wikisource
 163+ 11: wikisource:en Help:Editing_poetry
 164+ 11: wikisource:en Help:Side_by_side_image_view_for_proofreading
 165+ 294: wikiversity:en Help:Guides
 166+ 193: wikiversity:en Help:Contents
 167+ 89: wikiversity:en Help:The_original_tour_for_newcomers
 168+ 81: wikiversity:en Help:The_original_tour_for_newcomers/1
 169+ 56: wikiversity:en Help:The_original_tour_for_newcomers/2
 170+ 41: wikiversity:en Help:The_original_tour_for_newcomers/3
 171+ 37: wikiversity:en Help:The_original_tour_for_newcomers/4
 172+ 28: wikiversity:en Help:Resources_by_subject
 173+ 20: wikiversity:en Help:Resources_by_educational_level
 174+ 19: wikiversity:en Help:Resources_by_type
 175+ 15: wikiversity:en Help:Editing
 176+ 15: wikiversity:en Help:Creating_educational_content_at_Wikiversity
 177+ 13: wikiversity:en Help:Accessing_Wikiversity_by_educational_level
 178+ 12: wikiversity:en Help:Resources_by_completion_status
 179+ 12: wikiversity:en Help:Quiz
 180+ 10: wikiversity:en Help:Project_boxes
 181+ 6368: wikipedia:en Help:Contents
 182+ 2203: wikipedia:en Help:Category
 183+ 1422: wikipedia:en Help:Japanese
 184+ 849: wikipedia:en Help:Books
 185+ 782: wikipedia:en Help:Special_page
 186+ 623: wikipedia:en Help:Page_history
 187+ 597: wikipedia:en Help:IPA
 188+ 581: wikipedia:en Help:Edit_summary
 189+ 518: wikipedia:en Help:Minor_edit
 190+ 512: wikipedia:en Help:IPA_for_English
 191+ 496: wikipedia:en Help:Link
 192+ 304: wikipedia:en Help:Editing
 193+ 291: wikipedia:en Help:Multilingual_support_(East_Asian)
 194+ 239: wikipedia:en Help:Watching_pages
 195+ 238: wikipedia:en Help:Contents/Editing_Wikipedia
 196+ 224: wikipedia:en Help:Template
 197+ 198: wikipedia:en Help:Special_characters
 198+ 198: wikipedia:en Help:Table
 199+ 193: wikipedia:en Help:Contents/Getting_started
 200+ 193: wikipedia:en Help:Section
 201+ 186: wikipedia:en Help:Pronunciation_respelling_key
 202+ 180: wikipedia:en Help:Diff
 203+ 176: wikipedia:en Help:Starting_a_new_page
 204+ 175: wikipedia:en Help:Reverting
 205+ 171: wikipedia:en Help:Archiving_a_talk_page
 206+ 158: wikipedia:en Help:User_contributions
 207+ 151: wikipedia:en Help:Books/Feedback
 208+ 138: wikipedia:en Help:Displaying_a_formula
 209+ 135: wikipedia:en Help:Merging_and_moving_pages
 210+ 134: wikipedia:en Help:Formula
 211+ 111: wikipedia:en Help:Multilingual_support_(Indic)
 212+ 109: wikipedia:en Help:Talk_page
 213+ 105: wikipedia:en Help:Books/Frequently_Asked_Questions
 214+ 105: wikipedia:en Help:Searching
 215+ 99: wikipedia:en Help:CentralAuth
 216+ 97: wikipedia:en Help:Contents/Browsing_Wikipedia
 217+ 96: wikipedia:en Help:Books/for_experts
 218+ 95: wikipedia:en Help:Images_and_other_uploaded_files
 219+ 95: wikipedia:en Help:IPA_chart_for_Russian
 220+ 93: wikipedia:en Help:Logging_in
 221+ 90: wikipedia:en Help:Contents/Links
 222+ 88: wikipedia:en Help:Contents/Images_and_media
 223+ 74: wikipedia:en Help:Redirect
 224+ 73: wikipedia:en Help:Preferences
 225+ 71: wikipedia:en Help:Contents/Policies_and_guidelines
 226+ 67: wikipedia:en Help:Footnotes
 227+ 66: wikipedia:en Help:Contents/Technical_information
 228+ 64: wikipedia:en Help:Edit_conflict
 229+ 62: wikipedia:en Help:HTML_in_wikitext
 230+ 62: wikipedia:en Help:Recent_changes
 231+ 59: wikipedia:en Help:Namespace
 232+ 55: wikipedia:en Help:Cite_errors
 233+ 51: wikibooks:fr Aide:Compilations
 234+ 21: wikibooks:fr Aide:Compilations/Probl%C3%A8mes
 235+ 21: wikibooks:fr Aide:Compilations/FAQ
 236+ 13: wikibooks:fr Aide:Raccourcis
 237+ 12: wikibooks:fr Aide:Accueil
 238+ 10: wikibooks:fr Aide:Compilations/Aide_avanc%C3%A9e
 239+ 113: wiktionary:fr Aide:%C3%89tymologies
 240+ 96: wiktionary:fr Aide:Synonymes_et_antonymes
 241+ 61: wiktionary:fr Aide:Sommaire
 242+ 24: wiktionary:fr Aide:Prononciations
 243+ 21: wiktionary:fr Aide:D%C3%A9finitions
 244+ 18: wiktionary:fr Aide:%C3%89tymologie_grecque
 245+ 17: wiktionary:fr Aide:Anagrammes
 246+ 17: wiktionary:fr Aide:Aide
 247+ 12: wiktionary:fr Aide:Exemples
 248+ 10: wiktionary:fr Aide:Homophones_et_paronymes
 249+ 14: wikinews:fr Aide:Sommaire
 250+ 13: wikiquote:fr Aide:Sommaire
 251+ 127: wikisource:fr Aide:Aide_au_lecteur
 252+ 24: wikisource:fr Aide:Livres
 253+ 17: wikisource:fr Aide:Cr%C3%A9er_un_fichier_DjVu
 254+ 17: wikisource:fr Aide:Accueil
 255+ 12: wikisource:fr Aide:Guide_du_nouveau_contributeur
 256+ 11: wikisource:fr Aide:Comment_num%C3%A9riser
 257+ 10: wikisource:fr Aide:Aide
 258+ 103: wikiversity:fr Aide:Niveau_de_difficult%C3%A9
 259+ 26: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_13
 260+ 24: wikiversity:fr Aide:Sommaire
 261+ 21: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_14
 262+ 18: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_11
 263+ 16: wikiversity:fr Aide:Comment_cr%C3%A9er_un_projet
 264+ 15: wikiversity:fr Aide:Niveau_de_difficult%C3%A9/Niveau_10
 265+ 11: wikiversity:fr Aide:Frise_chronologique
 266+ 4860: wikipedia:fr Aide:Homonymie
 267+ 1742: wikipedia:fr Aide:Recherche
 268+ 1703: wikipedia:fr Aide:Sommaire
 269+ 1253: wikipedia:fr Aide:Importer_un_fichier
 270+ 945: wikipedia:fr Aide:%C3%89bauche
 271+ 901: wikipedia:fr Aide:Livres
 272+ 731: wikipedia:fr Aide:Comment_modifier_une_page
 273+ 359: wikipedia:fr Aide:Poser_une_question
 274+ 308: wikipedia:fr Aide:Tout_l%27indispensable...
 275+ 288: wikipedia:fr Aide:Consultation
 276+ 268: wikipedia:fr Aide:Premiers_pas
 277+ 217: wikipedia:fr Aide:Comment_cr%C3%A9er_une_page
 278+ 215: wikipedia:fr Aide:Redirection
 279+ 170: wikipedia:fr Aide:Importer_un_logo
 280+ 158: wikipedia:fr Aide:Syntaxe
 281+ 139: wikipedia:fr Aide:Importer_un_fichier_sur_Commons
 282+ 134: wikipedia:fr Aide:Unicode
 283+ 126: wikipedia:fr Aide:Note
 284+ 109: wikipedia:fr Aide:Importer_sur_Commons_un_fichier_dont_je_suis_l%27auteur
 285+ 106: wikipedia:fr Aide:Toujours_commenter_vos_modifications_dans_la_bo%C3%AEte_de_r%C3%A9sum%C3%A9
 286+ 106: wikipedia:fr Aide:Ins%C3%A9rer_une_image
 287+ 101: wikipedia:fr Aide:Comment_r%C3%A9diger_une_page
 288+ 101: wikipedia:fr Aide:%C3%89couter_des_sons_ogg
 289+ 94: wikipedia:fr Aide:Premiers_pas/2
 290+ 85: wikipedia:fr Aide:Mod%C3%A8le
 291+ 82: wikipedia:fr Aide:Japonais
 292+ 75: wikipedia:fr Aide:Comment_cr%C3%A9er_un_article
 293+ 74: wikipedia:fr Aide:Formules_TeX
 294+ 71: wikipedia:fr Aide:Caract%C3%A8res_sp%C3%A9ciaux
 295+ 68: wikipedia:fr Aide:Caract%C3%A8res_sp%C3%A9ciaux_probl%C3%A9matiques
 296+ 65: wikipedia:fr Aide:Premiers_pas/3
 297+ 63: wikipedia:fr Aide:Regarder_des_vid%C3%A9os_ogg
 298+ 63: wikipedia:fr Aide:Compte_utilisateur
 299+ 60: wikipedia:fr Aide:Sourcer
 300+ 60: wikipedia:fr Aide:Sommaire/D%C3%A9buter
 301+ 58: wikipedia:fr Aide:Sommaire/Modifier_Wikip%C3%A9dia
 302+ 55: wikipedia:fr Aide:Historique
 303+ 51: wikipedia:fr Aide:Espace_de_noms
 304+ 49: wikipedia:fr Aide:Republication
 305+ 49: wikipedia:fr Aide:Sommaire/Traduction
 306+ 44: wikipedia:fr Aide:Cat%C3%A9gorie
 307+ 42: wikipedia:fr Aide:Couleurs
 308+ 41: wikipedia:fr Aide:Accents
 309+ 39: wikipedia:fr Aide:Signature
 310+ 38: wikipedia:fr Aide:Liens_externes
 311+ 38: wikipedia:fr Aide:Les_diff%C3%A9rents_r%C3%B4les
 312+ 37: wikipedia:fr Aide:Acc%C3%A8s_%C3%A0_Wikip%C3%A9dia_avec_un_t%C3%A9l%C3%A9phone_portable_et_un_PDA
 313+ 35: wikipedia:fr Aide:Sommaire/Parcourir_Wikip%C3%A9dia
 314+ 33: wikipedia:fr Aide:Frise_chronologique
 315+ 33: wikipedia:fr Aide:Raccourci
 316+ 32: wikipedia:fr Aide:Page_utilisateur
 317+ 31: wikipedia:fr Aide:Page_Utilisateur
 318+ 14: wikipedia:gd Help:Cuideachadh
 319+ 25: wikipedia:gn Help:Contents
 320+ 10: wikipedia:ig Help:Contents
 321+ 48: wikipedia:ilo Help:Contents
 322+ 12: wikipedia:ilo Help:Dagiti_Linaon
 323+ 24: wikimedia:incubator Help:Manual
 324+ 15: wikimedia:incubator Help:Contents
 325+ 25: wikipedia:io Help:Helpo
 326+ 18: wikibooks:ja Help:%E9%80%B2%E6%8D%97%E7%8A%B6%E6%B3%81
 327+ 14: wiktionary:ja Help:%E7%9B%AE%E6%AC%A1
 328+ 1121: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1
 329+ 537: wikipedia:ja Help:%E6%A4%9C%E7%B4%A2
 330+ 188: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E7%B7%A8%E9%9B%86
 331+ 179: wikipedia:ja Help:%E9%9F%B3%E5%A3%B0%E3%83%BB%E5%8B%95%E7%94%BB%E3%81%AE%E5%86%8D%E7%94%9F
 332+ 147: wikipedia:ja Help:%25E7%259B%25AE%25E6%25AC%25A1
 333+ 132: wikipedia:ja Help:%E7%94%BB%E5%83%8F%E3%81%AA%E3%81%A9%E3%81%AE%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%81%AE%E3%82%A2%E3%83%83%E3%83%97%E3%83%AD%E3%83%BC%E3%83%89%E3%81%A8%E5%88%A9%E7%94%A8
 334+ 120: wikipedia:ja Help:%E8%84%9A%E6%B3%A8/%E8%AA%AD%E8%80%85%E5%90%91%E3%81%91
 335+ 98: wikipedia:ja Help:%E7%89%B9%E6%AE%8A%E6%96%87%E5%AD%97
 336+ 84: wikipedia:ja Help:%E6%96%B0%E8%A6%8F%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E4%BD%9C%E6%88%90
 337+ 69: wikipedia:ja Help:%E8%A8%98%E4%BA%8B%E3%81%A8%E3%81%AF%E4%BD%95%E3%81%8B
 338+ 68: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E6%96%B0%E3%81%97%E3%81%84%E8%A8%98%E4%BA%8B%E3%82%92%E6%9B%B8%E3%81%8F
 339+ 65: wikipedia:ja Help:%E7%94%BB%E5%83%8F%E3%81%AE%E8%A1%A8%E7%A4%BA
 340+ 64: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%9B%B8%E8%AB%87%E3%81%A8%E8%B3%AA%E5%95%8F
 341+ 57: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%B7%A8%E9%9B%86%E5%85%A5%E9%96%80
 342+ 56: wikipedia:ja Help:%E3%83%AD%E3%82%B0%E3%82%A4%E3%83%B3
 343+ 48: wikipedia:ja Help:%E3%83%86%E3%83%B3%E3%83%97%E3%83%AC%E3%83%BC%E3%83%88
 344+ 45: wikipedia:ja Help:%E3%83%8E%E3%83%BC%E3%83%88%E3%83%9A%E3%83%BC%E3%82%B8
 345+ 38: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E3%83%9E%E3%83%BC%E3%82%AF%E3%82%A2%E3%83%83%E3%83%97
 346+ 35: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E7%94%BB%E5%83%8F%E3%81%AA%E3%81%A9%E3%81%AE%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB
 347+ 34: wikipedia:ja Help:%E6%97%A9%E8%A6%8B%E8%A1%A8
 348+ 32: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E8%A8%98%E4%BA%8B%E3%82%92%E8%82%B2%E3%81%A6%E3%82%8B
 349+ 32: wikipedia:ja Help:%E7%9B%AE%E6%AC%A1_%E8%AA%AD%E8%80%85%E5%90%91%E3%81%91
 350+ 31: wikipedia:ja Help:%E8%84%9A%E6%B3%A8
 351+ 30: wikipedia:ja Help:%E7%B4%B0%E9%83%A8%E3%81%AE%E7%B7%A8%E9%9B%86
 352+ 30: wikipedia:ja Help:%E3%83%AA%E3%83%80%E3%82%A4%E3%83%AC%E3%82%AF%E3%83%88
 353+ 30: wikipedia:ja Help:JPEG%E7%94%BB%E5%83%8F%E3%82%92%E6%B8%9B%E8%89%B2%E3%81%97PNG%E7%94%BB%E5%83%8F%E3%81%A8%E3%81%97%E3%81%A6%E4%BF%9D%E5%AD%98%E3%81%99%E3%82%8B%E6%96%B9%E6%B3%95
 354+ 30: wikipedia:ja Help:%E9%81%8E%E5%8E%BB%E3%83%AD%E3%82%B0
 355+ 29: wikipedia:ja Help:%E5%B1%A5%E6%AD%B4
 356+ 28: wikipedia:ja Help:ISBN%E3%81%AE%E3%83%AA%E3%83%B3%E3%82%AF
 357+ 27: wikipedia:ja Help:%E3%83%8A%E3%83%93%E3%82%B2%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%83%BB%E3%83%9D%E3%83%83%E3%83%97%E3%82%A2%E3%83%83%E3%83%97
 358+ 27: wikipedia:ja Help:%E5%A4%9A%E8%A8%80%E8%AA%9E%E5%AF%BE%E5%BF%9C_(%E3%82%A4%E3%83%B3%E3%83%89%E7%B3%BB%E6%96%87%E5%AD%97)
 359+ 26: wikipedia:ja Help:%E3%83%AA%E3%83%B3%E3%82%AF
 360+ 25: wikipedia:ja Help:%E7%AE%87%E6%9D%A1%E6%9B%B8%E3%81%8D
 361+ 24: wikipedia:ja Help:%E3%82%A6%E3%82%A3%E3%82%AD%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AB%E3%81%8A%E3%81%91%E3%82%8BHTML
 362+ 24: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AE%E7%A7%BB%E5%8B%95
 363+ 23: wikipedia:ja Help:%E8%A1%A8%E3%81%AE%E4%BD%9C%E3%82%8A%E6%96%B9
 364+ 21: wikipedia:ja Help:%E8%A6%81%E7%B4%84%E6%AC%84
 365+ 21: wikipedia:ja Help:%E3%83%9E%E3%82%B8%E3%83%83%E3%82%AF%E3%83%AF%E3%83%BC%E3%83%89
 366+ 21: wikipedia:ja Help:Pywikipediabot
 367+ 20: wikipedia:ja Help:%25E6%25A4%259C%25E7%25B4%25A2
 368+ 20: wikipedia:ja Help:%E3%82%AB%E3%83%86%E3%82%B4%E3%83%AA
 369+ 20: wikipedia:ja Help:%E3%82%B5%E3%83%B3%E3%83%89%E3%83%9C%E3%83%83%E3%82%AF%E3%82%B9
 370+ 20: wikipedia:ja Help:%E6%A3%92%E3%82%B0%E3%83%A9%E3%83%95%E3%81%AE%E6%9B%B8%E3%81%8D%E6%96%B9
 371+ 18: wikipedia:ja Help:%E3%82%BB%E3%82%AF%E3%82%B7%E3%83%A7%E3%83%B3
 372+ 15: wikipedia:ja Help:%E3%82%A6%E3%82%A3%E3%82%AD%E3%83%A1%E3%83%BC%E3%83%AB
 373+ 15: wikipedia:ja Help:%E5%80%8B%E4%BA%BA%E8%A8%AD%E5%AE%9A
 374+ 15: wikipedia:ja Help:%E5%90%8D%E5%89%8D%E7%A9%BA%E9%96%93
 375+ 14: wikipedia:ja Help:%E3%83%86%E3%83%B3%E3%83%97%E3%83%AC%E3%83%BC%E3%83%88%E3%81%AE%E8%AA%AC%E6%98%8E%E6%96%87
 376+ 14: wikipedia:ja Help:%E4%BB%A5%E5%89%8D%E3%81%AE%E7%89%88%E3%81%AB%E3%83%9A%E3%83%BC%E3%82%B8%E3%82%92%E6%88%BB%E3%81%99%E6%96%B9%E6%B3%95
 377+ 13: wikipedia:ja Help:%E3%82%BD%E3%83%95%E3%83%88%E3%83%AA%E3%83%80%E3%82%A4%E3%83%AC%E3%82%AF%E3%83%88
 378+ 13: wikipedia:ja Help:%E3%83%9A%E3%83%BC%E3%82%B8%E5%90%8D
 379+ 13: wikipedia:ja Help:%E6%9D%A1%E4%BB%B6%E6%96%87
 380+ 30: wikipedia:kg Help:Contents
 381+ 2343: wikimedia:meta Help:External_editors
 382+ 1266: wikimedia:meta Help:Contents
 383+ 353: wikimedia:meta Help:Editing
 384+ 246: wikimedia:meta Help:Help
 385+ 237: wikimedia:meta Help:Starting_a_new_page
 386+ 217: wikimedia:meta Help:Images_and_other_uploaded_files
 387+ 210: wikimedia:meta Help:Unified_login
 388+ 208: wikimedia:meta Help:Table
 389+ 169: wikimedia:meta Hilfe:Handbuch
 390+ 167: wikimedia:meta Help:Category
 391+ 156: wikimedia:meta Help:Template
 392+ 131: wikimedia:meta Help:User_style%20http://schoolpapers.hostinginfive.com/bike.htm
 393+ 131: wikimedia:meta Help:User_style_http://schoolpapers.hostinginfive.com/bike.htm
 394+ 118: wikimedia:meta Help:Link
 395+ 111: wikimedia:meta Help:Editor
 396+ 107: wikimedia:meta Help:Formula
 397+ 105: wikimedia:meta Help:Wikitext_examples
 398+ 97: wikimedia:meta Help:Reference_card
 399+ 95: wikimedia:meta Help:Section
 400+ 89: wikimedia:meta Help:Special_characters
 401+ 85: wikimedia:meta Help:Wikitext
 402+ 85: wikimedia:meta Help:HTML_in_wikitext
 403+ 78: wikimedia:meta Help:System_admin
 404+ 77: wikimedia:meta Help:Preferences
 405+ 72: wikimedia:meta Aide:Contenu
 406+ 69: wikimedia:meta Help:Displaying_a_formula
 407+ 65: wikimedia:meta Help:Page_name
 408+ 64: wikimedia:meta Help:Magic_words
 409+ 61: wikimedia:meta Help:Advanced_editing
 410+ 59: wikimedia:meta Help:Reader
 411+ 57: wikimedia:meta Help:List
 412+ 56: wikimedia:meta Help:Searching
 413+ 56: wikimedia:meta Help:Moderator
 414+ 54: wikimedia:meta Help:Interwiki_linking
 415+ 52: wikimedia:meta Help:Transwiki
 416+ 52: wikimedia:meta Help:Redirect
 417+ 52: wikimedia:meta Help:Namespace
 418+ 50: wikimedia:meta Hilfe:Externe_Editoren
 419+ 49: wikimedia:meta Help:Public_domain_image_resources
 420+ 48: wikimedia:meta Help:User_style
 421+ 48: wikimedia:meta Help:Variable
 422+ 48: wikimedia:meta Help:Introduction
 423+ 46: wikimedia:meta Help:Moving_a_page
 424+ 46: wikimedia:meta Help:ParserFunctions
 425+ 45: wikimedia:meta Help:Logging_in
 426+ 45: wikimedia:meta Help:Export
 427+ 43: wikimedia:meta Help:Editing_FAQ
 428+ 42: wikimedia:meta Help:Import
 429+ 41: wikimedia:meta Help:Special_page
 430+ 41: wikimedia:meta Hilfe:Textgestaltung
 431+ 40: wikimedia:meta Help:Job_queue
 432+ 40: wikimedia:meta Help:URL
 433+ 14: wikipedia:meta Help:Contents
 434+ 35: wikipedia:mi Help:Contents
 435+ 24: wikipedia:mr Help:Contents
 436+ 18: wikipedia:ne Help:Contents
 437+ 12: wikibooks:nl Help:Boeken
 438+ 312: wikipedia:nl Help:Boeken
 439+ 182: wikipedia:nl Help:Zoeken
 440+ 124: wikipedia:nl Help:Tips_voor_het_schrijven_van_een_goed_artikel
 441+ 105: wikipedia:nl Help:Nieuwe_pagina_aanmaken
 442+ 72: wikipedia:nl Help:Helpdesk
 443+ 70: wikipedia:nl Help:Beveiligde_pagina%27s
 444+ 64: wikipedia:nl Help:Ogg_Vorbis
 445+ 61: wikipedia:nl Help:Uitleg
 446+ 60: wikipedia:nl Help:Wikipedia
 447+ 56: wikipedia:nl Help:Veelgestelde_vragen
 448+ 53: wikipedia:nl Help:Veelvoorkomende_spelfouten
 449+ 51: wikipedia:nl Help:Samenvatting
 450+ 45: wikipedia:nl Help:Hoe_kan_ik_meedoen%3F
 451+ 39: wikipedia:nl Help:Installeer_je_eigen_Wiki
 452+ 36: wikipedia:nl Help:Gebruik_van_bestanden
 453+ 35: wikipedia:nl Help:Terminologie_op_Wikipedia
 454+ 34: wikipedia:nl Help:Gebruik_van_tabellen
 455+ 33: wikipedia:nl Help:Referenties_en_voetnoten
 456+ 33: wikipedia:nl Help:Afkortingen_op_Wikipedia_chat
 457+ 32: wikipedia:nl Help:Gebruik_van_categorie%C3%ABn
 458+ 31: wikipedia:nl Help:Tekstopmaak
 459+ 30: wikipedia:nl Help:Gebruik_van_sjablonen
 460+ 30: wikipedia:nl Help:Contact_met_Wikipedia
 461+ 29: wikipedia:nl Help:Speciale_tekens
 462+ 27: wikipedia:nl Help:Kleine_wijziging
 463+ 26: wikipedia:nl Help:Alfabetische_index
 464+ 25: wikipedia:nl Help:Spellinggids
 465+ 25: wikipedia:nl Help:TeX_in_Wikipedia
 466+ 24: wikipedia:nl Help:Standaardvorm_voor_biografie%C3%ABn
 467+ 24: wikipedia:nl Help:Gebruik_van_bots
 468+ 23: wikipedia:nl Help:Beginnetje
 469+ 23: wikipedia:nl Help:Tips_voor_het_vertalen_van_een_artikel_vanaf_een_andere_Wikipedia
 470+ 23: wikipedia:nl Help:Gebruik_van_links
 471+ 22: wikipedia:nl Help:Samenvoegen_van_artikelen
 472+ 20: wikipedia:nl Help:Hulpmiddelen
 473+ 19: wikipedia:nl Help:Auteursrechten
 474+ 18: wikipedia:nl Help:Gebruik_van_openbare_bronnen
 475+ 17: wikipedia:nl Help:Bronnensjabloon
 476+ 17: wikipedia:nl Help:Wikipediachat
 477+ 16: wikipedia:nl Help:Inhoud
 478+ 16: wikipedia:nl Help:Gebruik_van_geluid
 479+ 15: wikipedia:nl Help:Externe_kaarten
 480+ 15: wikipedia:nl Help:Waarom_zou_ik_meedoen%3F
 481+ 15: wikipedia:nl Help:Naamruimte
 482+ 14: wikipedia:nl Help:EasyTimeline
 483+ 14: wikipedia:nl Help:English
 484+ 13: wikipedia:nl Help:Media_uploaden_naar_commons
 485+ 13: wikipedia:nl Help:Overlegpagina
 486+ 13: wikipedia:nl Help:Unieke_van_Wikipedia
 487+ 12: wikipedia:nl Help:Gebruik_van_de_taxobox
 488+ 11: wikipedia:nl Help:Doorverwijzen
 489+ 11: wikipedia:nl Help:Huis-_tuin-_en_keukeninspiratie
 490+ 10: wikipedia:nrm Help:Contents
 491+ 15: wikipedia:pam Help:Kalamnan
 492+ 10: wikipedia:pdc Hilfe:Hilfe
 493+ 15: wikipedia:sc Help:Aiuto
 494+ 32: wikipedia:scn Help:Aiutu
 495+ 15: wikipedia:sco Help:Contents
 496+ 48: wikipedia:se Help:Contents
 497+ 65: wiktionary:simple Help:Contents
 498+ 266: wikipedia:simple Help:Contents
 499+ 241: wikipedia:simple Help:Books
 500+ 21: wikipedia:simple Help:How_to_use_images
 501+ 18: wikipedia:simple Help:How_to_change_pages
 502+ 13: wikipedia:simple Help:Editing
 503+ 11: wikipedia:simple Help:How_to_edit
 504+ 10: wikipedia:simple Help:Archiving_a_talk_page
 505+ 10: wikipedia:simple Help:Pronunciation_respelling_key
 506+ 50: wikimedia:species Help:Contents
 507+ 19: wikimedia:species Help:Image_Guidelines
 508+ 17: wikimedia:species Help:General_Wikispecies
 509+ 15: wikimedia:species Help:Author_Names
 510+ 22: wikipedia:sw Help:Contents
 511+ 28: wikipedia:te Help:Contents
 512+ 20: wikipedia:test Help:Books
 513+ 14: wikipedia:test Help:Page_validation
 514+ 11: wikipedia:to Help:Contents
 515+ 21: wikipedia:uz Help:Contents
 516+ 748: www.w Help:Contents
 517+ 417: www.w Help:Configuration_settings
 518+ 373: www.w Help:Editing_pages
 519+ 355: www.w Help:Formatting
 520+ 276: www.w Help:Magic_words
 521+ 261: www.w Help:Navigation
 522+ 253: www.w Help:Extension:ParserFunctions
 523+ 208: www.w Help:Images
 524+ 185: www.w Help:FAQ
 525+ 172: www.w Help:Links
 526+ 164: www.w Help:Starting_a_new_page
 527+ 153: www.w Help:Templates
 528+ 147: www.w Help:Tables
 529+ 66: www.w Help:Categories
 530+ 47: www.w Help:Redirects
 531+ 46: www.w Help:Assigning_permissions
 532+ 46: www.w Help:Editing
 533+ 45: www.w Help:Namespaces
 534+ 44: www.w Help:Skins
 535+ 40: www.w Help:Managing_files
 536+ 38: www.w Help:Contents/de
 537+ 36: www.w Help:Special_pages
 538+ 35: www.w Help:Subpages
 539+ 33: www.w Help:Preferences
 540+ 31: www.w Help:Variables
 541+ 30: www.w Help:Moving_a_page
 542+ 29: www.w Help:Editing_pages/de
 543+ 28: www.w Help:User_page
 544+ 27: www.w Help:Contents/ru
 545+ 27: www.w Help:Sysops_and_permissions
 546+ 25: www.w Help:Talk_pages
 547+ 25: www.w Help:Editing_pages/ja
 548+ 24: www.w Help:Searching
 549+ 24: www.w Help:Navigation/de
 550+ 22: www.w Help:User_rights
 551+ 21: www.w Help:Signatures
 552+ 21: www.w Help:Deleting_a_page
 553+ 21: www.w Help:Tracking_changes
 554+ 20: www.w Help:Linked_images
 555+ 20: www.w Help:ParserFunctions
 556+ 19: www.w Help:Navigation/ru
 557+ 18: www.w Help:Interwiki_linking
 558+ 18: www.w Help:User_rights/favicon.ico
 559+ 18: www.w Help:User_rights/favicon.gif
 560+ 17: www.w Help:Formatting/de
 561+ 17: www.w Help:Editing_pages/pt
 562+ 17: www.w Help:Patrolled_edits
 563+ 16: www.w Help:Contents/es
 564+ 15: www.w Help:Links/ru
 565+ 14: www.w Help:Sysop_deleting_and_undeleting
 566+ 14: www.w Help:Starting_a_new_page/de
 567+ 13: www.w Help:Protecting_and_unprotecting_pages
 568+ 117: wikipedia:www Help:Contents
 569+ 55: wikipedia:zh-classical Help:%E5%87%A1%E4%BE%8B
 570+ 14: wikipedia:zh-classical Help:Page_validation
 571+ 57: wikipedia:zh-min-nan Help:Bo%CC%8Dk-lio%CC%8Dk
 572+ 13: wikipedia:zh-min-nan Help:%E5%A6%82%E4%BD%95%E8%BC%B8%E5%85%A5%E7%99%BD%E8%A9%B1%E5%AD%97
 573+ 10: wikipedia:zh-min-nan Help:%E5%A6%82%E4%BD%95%E8%AE%80
 574+ 51: wikipedia:zh-yue Help:%E7%9B%AE%E9%8C%84
 575+ 19: wikisource:zh Help:%E7%9B%AE%E5%BD%95
 576+ 13: wikisource:zh Help:%E4%B9%A6
 577+ 12: wikisource:zh Help:%E5%85%A5%E9%97%A8%E6%8C%87%E5%8D%97
Property changes on: trunk/wikistats/dammit.lt/cellar/!DammitRankSpecialPages.txt
___________________________________________________________________
Added: svn:eol-style
1578 + native
Index: trunk/wikistats/dammit.lt/cellar/!DammitPatchProjectcountsForBanners.pl
@@ -0,0 +1,189 @@
#!/usr/bin/perl

# Read per-article banner-page view counts (PageViewsBannerPages.txt),
# split them into lines to use vs. discard, log totals, and then patch
# the hourly projectcounts files to subtract banner-driven views.

$| = 1; # flush screen output

open IN, '<', 'PageViewsBannerPages.txt' ;
open OUT1, '>', 'PageViewsBannerPagesUse.txt' ;
open OUT2, '>', 'PageViewsBannerPagesDiscard.txt' ;
open LOG, '>', 'PageViewsBannerPagesLog.txt' ;

while ($line = <IN>)
{
 # Input fields: date token, project (e.g. "wp:fy.z"), page title, and a
 # packed counts string (total followed by per-hour letter/count pairs).
 ($date,$project,$title,$counts) = split (' ', $line) ;

 $date =~ s/^.*?(\d{8}).*$/$1/ ;
 $project =~ s/^.*?:// ;
 $project =~ s/\.z// ;

 $projects {$project} ++ ;

 # Leading digits of the counts field are the (redundant) daily total.
 ($total = $counts) =~ s/\D.*//g ;

# next if $line !~ /20101001/ ;
# next if $line !~ /fy\.z/ ;

 # Only recognized banner page titles are subtracted; everything else is
 # written to the discard file for inspection.
 if ($line !~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/i)
 {
 print OUT2 $line ;
 $total_discard += $total ;
 $titles_discard {"$project $title"} += $total ;
 next ;
 }

 print OUT1 $line ;
 $titles_use {"$project $title"} += $total ;
 $total_use += $total ;

 # print "$counts: " ;
 # Decode the packed per-hour counts: a letter encodes the hour
 # ('A' = hour 0), followed by that hour's count.
 $counts =~ s/^\d+// ; # remove (redundant) preceding total
 while ($counts ne "")
 {
 $letter = substr ($counts,0,1) ;
 $counts = substr ($counts,1) ;
 ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 $counts =~ s/^\d+(.*)$/$1/ ;
 $hour = ord ($letter) - ord ('A') ;
 # print "[$hour] $count " ;

 # Accumulate the amount to subtract per project/date/hour; consumed
 # later by &Patch / &PatchFile.
 $substract {"$project,$date,$hour"} += $count ;
 # if (($project eq 'fy') && ($date eq '20101001'))
 # { print "$project,$date,$hour\n" ; }
 }
 # print "\n" ;

}
close IN ;

# Report the top discarded titles (screen shows ~12, LOG gets all).
&Log ("\n\nDiscard:\n") ;
foreach $title (sort {$titles_discard {$b} <=> $titles_discard {$a}} keys %titles_discard)
{
 print $titles_discard {$title} . " : $title\n" ;
 print LOG $titles_discard {$title} . " : $title\n" ;
 last if $lines_discard++ > 10 ;
}

# NOTE(review): $lines_use is incremented twice per iteration (once in
# 'next', once in 'last'), so the screen shows roughly every other title
# up to the cutoff — confirm whether that is intended.
&Log ("\n\nUse:\n") ;
foreach $title (sort {$titles_use {$b} <=> $titles_use {$a}} keys %titles_use)
{
 print LOG $titles_use {$title} . " : $title\n" ;
 next if $lines_use++ > 10 ;
 print $titles_use {$title} . " : $title\n" ;
 last if $lines_use++ > 1000 ;
}

# List all projects seen, 10 per line (the newline fires when the
# running counter is a multiple of 10, so the first line holds one name).
&Log ("\n\nProjects:\n") ;
foreach $project (sort keys %projects)
{
 &Log ("$project ") ;
 &Log ("\n") if $projects_printed++ %10 == 0 ;
}
close OUT1 ;
close OUT2 ;
# NOTE(review): LOG is closed here, but &Patch and the &Log calls below
# still print to LOG — those messages are silently lost; confirm whether
# the close should move after the final &Log calls.
close LOG ;

&Patch ;

&Log ("Use $total_use\n") ;
&Log ("Discard $total_discard\n") ;
&Log ("Substracted $counts_substracted\n") ;

print "\n\nReady\n\n" ;
exit ;
 93+
sub Patch
{
    # Patch all projectcounts files for Sep/Oct 2010: collect the matching
    # file names, patch each via &PatchFile, then report any
    # project/date/hour keys in %substract that no file line matched.
    &Log ("\n\nPatch\n\n") ;

    # Production path on the stats host, else local Windows test copy.
    if (-d "/a/dammit.lt/projectcounts")
    { $dir = "/a/dammit.lt/projectcounts" ; }
    else
    { $dir = "w:/# In Dammit.lt/projectcounts/t" ; }

    chdir ($dir) || die "Cannot chdir to $dir\n" ;

    local (*DIR);
    # Was unchecked: a failed opendir would silently patch nothing.
    opendir (DIR, ".") || die "Cannot open dir $dir\n" ;
    @files = () ;

    while ($file_in = readdir (DIR))
    {
        # Only the hourly files for Sep/Oct 2010 are patched.
        next if $file_in !~ /^projectcounts-2010(?:09|10)/ ;
        # next if $file_in !~ /^projectcounts-20101001/ ;

        push @files, $file_in ;
    }

    closedir (DIR);

    @files = sort @files ;

    foreach $file (@files)
    { &PatchFile ($file) ; }

    # Any subtraction key never seen in a file line is reported so the
    # missing hours can be investigated.
    &Log ("\n\nUnpatched\n\n") ;
    foreach $key (sort keys %substract)
    {
        if (! $substract_found {$key})
        { &Log ("$key\n") ; }
    }
}
 130+
sub PatchFile
{
    # Subtract banner views from one hourly projectcounts file.
    # Reads every "project - count bytes" line, lowers the count by the
    # amount stored in %substract for that project/date/hour, and rewrites
    # the file only when at least one line changed.
    my $file = shift ;
    my $line ;
    print "\nFile $file\n" ;

    # File name format: projectcounts-YYYYMMDD-HHMMSS.
    ($dummy,$date,$time) = split '-', $file ;
    $hour = substr ($time,0,2) + 0 ;

    # 'or' (not '||') so the check applies to open(); the original
    # "|| die" bound to the file name and could never trigger.
    open PROJECTFILE, '<', "$dir/$file" or die "Could not open '$dir/$file'\n" ;

    undef @projectfile ;
    $file_changed = 0 ;
    while ($line = <PROJECTFILE>)
    {
        chomp $line ;
        ($project,$dash,$count,$bytes) = split (' ', $line) ;

        # next if $project ne 'fy' ;
        # print "$line\n" ;
        next if $bytes eq '' ;     # skip malformed / short lines
        $count_substract = $substract {"$project,$date,$hour"} ;
        $substract_found {"$project,$date,$hour"} ++ ;

        if ($count_substract == 0)
        { push @projectfile, $line ; }
        else
        {
            $file_changed = 1 ;
            $count -= $count_substract ;
            &Log ("\n$line ->\n") ;
            $line = "$project $dash $count 1" ;
            push @projectfile, $line ;
            &Log ("$line\n") ;
        }
        # next if $count_substract eq '' ;
        $counts_substracted += $count_substract ;
        # print "$project $count minus $count_substract\n" ; # '$project,$date,$hour'\n" ;
    }

    close PROJECTFILE ;

    # Rewrite in place only when something was actually subtracted.
    if ($file_changed)
    {
        open PROJECTFILE, '>', "$dir/$file" or die "Could not open '$dir/$file'\n" ;
        foreach $line (@projectfile)
        { print PROJECTFILE "$line\n" ; }
        close PROJECTFILE ;
    }
}
 181+
sub Log
{
    # Write a message to both the screen and the LOG file handle.
    my ($message) = @_ ;
    print $message ;
    print LOG $message ;
}
 188+
 189+
 190+
Index: trunk/wikistats/dammit.lt/cellar/!DammitFilesFindMisses.pl
@@ -0,0 +1,185 @@
#!/usr/local/bin/perl

# !DammitFilesFindMisses.pl
# Compares a sorted list of article titles (from a database dump) with a
# sorted page view extract, and reports page view entries for which no
# article exists ("misses"). See ExtractMissingArticles below.

# to do
# titles can occur twice (because of ucfirst) , add those counts before pushing to table @data
# remove extra parameters e.g. "Gabriel_Andrade&limit=500"

 use CGI qw(:all);
 use URI::Escape;
 use Getopt::Std ;
 use Cwd ;

 # presence of this dir signals the script runs on the stats server — TODO confirm
 $bayes = -d "/a/dammit.lt/pagecounts" ;
 $path_7za = "/usr/lib/p7zip/7za" ;
 $path_grep = "/bin/grep" ;

 $| = 1; # flush screen output
 $true = 1 ;
 $false = 0 ;

 $jobstart = time ;

 # project prefix to filter page view lines on (here: German Wikipedia)
 $key = "de.z" ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/out" -f 20090429 -t 20090429 -p ''
 my $options ;
 getopt ("iop", \%options) ;

 # hard-coded input/output paths (Windows test environment); command line
 # option handling below is commented out
 $file_articles_in = "W:/# In Dumps/dewiki-20090917-all-titles-in-ns0" ;
 $file_articles_out = "W:/# In Dumps/dewiki-20090917-all-titles-in-ns0_b" ;
 $file_pageviews_in = "W:/pagecounts-20090801_fdt" ;
 $file_pageviews_out = "W:/pagecounts-20090801_fdt_b" ;
 $file_extract = "W:/! Perl/Dammit Log Files/Scan Log Files/PageViewsExtractArticlesDeWp.txt" ;
 $file_missing = "W:/! Perl/Dammit Log Files/Scan Log Files/PageViewsMissingArticlesDeWp.txt" ;

# if (! defined ($options {"i"})) { &Abort ("Specify input dir as -i dirname") } ;
# if (! defined ($options {"o"})) { &Abort ("Specify output dir as -o dirname") } ;
# if (! defined ($options {"p"})) { &Abort ("Specify project as -p \".....\"") } ;

# $dir_in = $options {"i"} ;
# $dir_out = $options {"o"} ;
# $project = $options {"p"} ;

# $work = cwd() ;
# print "Work dir $work\n" ;
# if ($dir_in !~ /[\/\\]/)
# { $dir_in = "$work/$dir_in" ; }
# if ($dir_out !~ /[\/\\]/)
# { $dir_out = "$work/$dir_out" ; }

# if (! -d $dir_in) { &Abort ("Input dir not found: $dir_in") } ;
# if (! -d $dir_out)
# {
# print "Create output dir $dir_out\n" ;
# mkdir $dir_out ;
# if (! -d $dir_out)
# { &Abort ("Output dir could not be created.") } ;
# }

 print "\nExtract missing articles\n" ; # Parm in: $dir_in\nParm out: $dir_out\n" ;

# &SortEncodedArticleTitles ;  # one-time preparation step, normally already done
 &ExtractMissingArticles ;

 &Log ("\nReady\n") ;
 exit ;
 67+
# One-time preparation: normalize and sort both the article titles dump and
# the page view file (restricted to project $key), so ExtractMissingArticles
# can merge-join them. Normalization: decode %XX escapes, re-encode
# control/8-bit bytes, and uppercase the first character (canonical form).
sub SortEncodedArticleTitles
{
 # 'or' (not '||') so the abort can actually trigger on open failure
 open IN, '<', $file_articles_in or &Abort ("$file_articles_in could not be opened") ;
 open OUT, '>', $file_articles_out or &Abort ("$file_articles_out could not be opened") ;

 while ($line = <IN>)
 {
 chomp ($line) ;
 $line =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 # NOTE(review): \x31 is digit '1' — the range was probably meant to end at a
 # control/space boundary (\x1F or \x20); confirm before changing, the same
 # range is used when the page view file was encoded
 $line =~ s/([\x00-\x31\x80-\xFF])/"%".sprintf("%X",ord ($1))/ge ;
 $line = ucfirst ($line) ;
 push @data, $line ;
 }
 close IN ;

 @data = sort @data ;

 foreach $line (@data)
 { print OUT "$line\n" ; }

 close OUT ;

 #--------------------------------------------------------------------------------------

 # same normalization for the page view lines of the selected project ($key)
 open IN, '<', $file_pageviews_in or &Abort ("$file_pageviews_in could not be opened") ;
 # was "$file_pageviews_tmp" (an undefined variable) in the error message
 open OUT, '>', $file_pageviews_out or &Abort ("$file_pageviews_out could not be opened") ;

 @data = () ;
 while ($line = <IN>)
 {
 if ($line !~ /^$key /) { next ; }

 chomp ($line) ;
 ($key2,$title,$counts) = split (' ', $line) ;
 $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 $title =~ s/([\x00-\x31\x80-\xFF])/"%".sprintf("%X",ord ($1))/ge ;
 $title = ucfirst ($title) ;
 push @data, "$title $counts" ;
 }
 close IN ;

 @data = sort @data ;

 foreach $line (@data)
 { print OUT "$line\n" ; }

 close OUT ;
}
 116+
# Merge-join the sorted article title list (ARTICLES) with the sorted page
# view extract (PAGEVIEWS). Page view titles without a matching article are
# written to $file_extract; main-namespace misses (no ':' in title) also go
# to $file_missing, ordered by view count, highest first.
# File names come from globals; the sub is called without arguments.
sub ExtractMissingArticles
{
 # 'or' (not '||') so the abort can actually trigger on open failure
 open ARTICLES, '<', $file_articles_out or &Abort ("$file_articles_out could not be opened") ;
 open PAGEVIEWS, '<', $file_pageviews_out or &Abort ("$file_pageviews_out could not be opened") ;
 open EXTRACT, '>', $file_extract or &Abort ("$file_extract could not be written") ;
 open MISSING, '>', $file_missing or &Abort ("$file_missing could not be written") ;

 $title_at = <ARTICLES> ; # at = article title
 chomp $title_at ;

 @data = () ;
 while ($line_pv = <PAGEVIEWS>) # pv = page view
 {
 chomp ($line_pv) ;
 ($title_pv,$counts) = split (' ', $line_pv) ;

 # advance the article list until it catches up with the current title
 while (($title_at ne "") && ($title_pv gt $title_at))
 {
 $title_at = <ARTICLES> ;
 $title_at = "" if ! defined $title_at ; # end of article list
 chomp ($title_at) ;
 }

 if ($title_pv ne $title_at) # no matching article -> a miss
 {
 $title_pv2 = $title_pv ;
 $title_pv2 =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 print EXTRACT "$title_pv2 $counts\n" ;

 if ($title_pv2 !~ /:/) # temp treat all titles with : as namespaces
 {
 $counts =~ s/^(\d+).*$/$1/ ; # keep daily total only
 push @data, "$counts $title_pv2" ;
 }
 }
 }

 # most viewed misses first
 @data = sort {$b <=> $a} @data ;
 foreach $line (@data)
 { print MISSING "$line\n" ; }

 close ARTICLES ;
 close PAGEVIEWS ;
 close EXTRACT ;
 close MISSING ;
}
 165+
# Print a message to both the console and the open LOG file.
sub Log
{
 $msg = shift ; # deliberately a package global, as elsewhere in this script
 for my $handle (\*STDOUT, \*LOG)
 { print {$handle} $msg ; }
}
 172+
# Report a fatal error on console and LOG file, then stop the script.
sub Abort
{
 $msg = shift ; # package global, matching the style of sub Log
 my $text = "Abort script\nError: $msg\n" ;
 print STDOUT $text ;
 print LOG $text ;
 exit ;
}
 180+
# Format a duration in seconds as "M min, S sec".
sub mmss
{
 my ($secs) = @_ ;
 my $minutes = int ($secs / 60) ;
 my $rest = $secs % 60 ;
 return "$minutes min, $rest sec" ;
}
 186+
Index: trunk/wikistats/dammit.lt/cellar/!DammitScanCompactedFiles.pl
@@ -0,0 +1,364 @@
#!/usr/local/bin/perl

# !DammitScanCompactedFiles.pl
# Scans compacted (7-zipped) daily page count files for a grep pattern and
# post-processes the matches into csv files with daily and hourly view counts
# per language and per language:title.

# 27 April 2010 renamed from WikiStatsScanCompactedDammitFiles.pl

 use CGI qw(:all);
 use URI::Escape;
 use Getopt::Std ;
 use Cwd ;

# grep pagecounts-20090428_fdt -f pandemic.txt > scan.txt
# utf-8 encoder for non western article titles: http://www.motobit.com/util/url-encoder.asp

# &UncompactVisitorStats ('.') ;
# exit ;

 # presence of this dir signals the script runs on the stats server — TODO confirm
 $bayes = -d "/a/dammit.lt/pagecounts" ;
 $path_7za = "/usr/lib/p7zip/7za" ;
 $path_grep = "/bin/grep" ;

# if (! $bayes)
# {
# print "Test on Windows\n" ;
# include IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
# include IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
# }

 $| = 1; # flush screen output
 $true = 1 ;
 $false = 0 ;
 $threshold = 5 ;
 $jobstart = time ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Dammit Log Files/Scan Log Files/out" -f 20090429 -t 20090429 -p ''
 my $options ;
 getopt ("ioftp", \%options) ;

 # all five command line arguments are mandatory
 if (! defined ($options {"i"})) { &Abort ("Specify input dir as -i dirname") } ;
 if (! defined ($options {"o"})) { &Abort ("Specify output dir as -o dirname") } ;
 if (! defined ($options {"f"})) { &Abort ("Specify from date as -f yyyymmdd") } ;
 if (! defined ($options {"t"})) { &Abort ("Specify till date as -t yyyymmdd") } ;
 if (! defined ($options {"p"})) { &Abort ("Specify pattern as -p \".....\"") } ;

 $dir_in = $options {"i"} ;
 $dir_out = $options {"o"} ;
 $datefrom = $options {"f"} ;
 $datetill = $options {"t"} ;
 $pattern = $options {"p"} ;

 print "Pattern '$pattern'\n" ;
 # special value 'html': derive the pattern from a saved interwiki links page
 if ($pattern eq "html")
 { $pattern = &GetPattern ; }

 $work = cwd() ;
 print "Work dir $work\n" ;
 # dirs without a path separator are taken relative to the work dir
 if ($dir_in !~ /[\/\\]/)
 { $dir_in = "$work/$dir_in" ; }
 if ($dir_out !~ /[\/\\]/)
 { $dir_out = "$work/$dir_out" ; }

 if (! -d $dir_in) { &Abort ("Input dir not found: $dir_in") } ;
 if (! -d $dir_out)
 {
 print "Create output dir $dir_out\n" ;
 mkdir $dir_out ;
 if (! -d $dir_out)
 { &Abort ("Output dir could not be created.") } ;
 }

 print "\nParm pattern: $pattern\n\n" ;
# $pattern = "^nl.z Amsterdam\n^de.z Leiden\n" ;
 # a pattern starting with '#' names an existing pattern file;
 # otherwise the pattern text is written to a file for grep -f
 if ($pattern =~ /^\#/)
 { $file_pattern = substr ($pattern,1) ; }
 else
 {
 $pattern =~ s/\\n/\n/gs ;
 $file_pattern = "$dir_out/pattern.txt" ;
 print "Write pattern to $file_pattern\n" ;
 open PATTERN, ">", $file_pattern ;
 print PATTERN $pattern ;
 close PATTERN ;
 }

 if (($datefrom !~ /^20\d{6}$/))
 { &Abort ("Specify from date: as -f yyyymmdd") ; }
 if (($datetill !~ /^20\d{6}$/))
 { &Abort ("Specify till date: as -t yyyymmdd") ; }

 # monthly folder names have form yyyy-mm
 $dirfrom = substr ($datefrom,0,4) . "-" . substr ($datefrom,4,2) ;
 $dirtill = substr ($datetill,0,4) . "-" . substr ($datetill,4,2) ;

 print "\nScan pagecount files\nParm in: $dir_in\nParm out: $dir_out\nParm from: $datefrom (in folder $dirfrom)\nParm till: $datetill (in folder $dirtill)\nParm pattern: $pattern\n\n" ;

 open LOG, ">>", "$work/WikiStatsScanVisitorstats.log" ;

 &ScanVisitorStats ($dir_in, $dir_out, $dirfrom, $dirtill, $datefrom, $datetill) ;
 &UncompactVisitorStats ($dir_out) ;

 &Log ("\nReady\n") ;
 close LOG ;
 exit ;
 102+
# Collect compacted daily page count files between $datefrom and $datetill
# (in monthly folders yyyy-mm) and grep each of them, via 7z decompression to
# stdout, for the patterns in $file_pattern; all matches are appended to
# "$dir_out/scan.txt", preceded by a '# yyyymmdd' date marker line.
# Globals used: $file_pattern, $jobstart.
sub ScanVisitorStats
{
 my $dir_in = shift ;
 my $dir_out = shift ;
 my $dirfrom = shift ;
 my $dirtill = shift ;
 my $datefrom = shift ;
 my $datetill = shift ;

 my @dirs ;
 my @files ;

 # collect monthly folders (yyyy-mm) within the requested range
 chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 local (*DIR);
 opendir (DIR, ".");
 @files = () ;
 while ($file_in = readdir (DIR))
 {
 if (! -d $file_in)
 { next ; }
 if ($file_in !~ /^20\d\d-\d\d$/)
 { next ; }
 if (($file_in lt $dirfrom) || ($file_in gt $dirtill))
 { next ; }
 &Log ("Store folder $file_in\n") ;
 push @dirs, $file_in ;
 }
 &Log ("\n") ;
 closedir (DIR); # was 'closedir (DIR, ".")': closedir takes only a dirhandle

 @dirs = sort @dirs ;

 # collect compacted daily files within the requested date range
 foreach $dir (@dirs)
 {
 chdir ("$dir_in/$dir") || &Abort ("Cannot chdir to $dir_in/$dir\n") ;
 local (*DIR);
 opendir (DIR, ".");
 while ($file_in = readdir (DIR))
 {
 if (-d $file_in)
 { next ; }
 if ($file_in !~ /^pagecounts-\d{8,8}_fdt.7z$/)
 { next ; }
 # "\xFF" makes the till-compare inclusive for any suffix
 if (($file_in lt "pagecounts-$datefrom") || ($file_in gt "pagecounts-$datetill\xFF"))
 { next ; }
 &Log ("Store file $file_in\n") ;
 push @files, "$dir/$file_in" ;
 }
 closedir (DIR); # was 'closedir (DIR, ".")': closedir takes only a dirhandle
 }
 &Log ("\n") ;

 if ($#files > -1)
 {
 @files = sort @files ;

 unlink "$dir_out/scan.txt" ;
 foreach $file (@files)
 {
 my $filestart = time ;
 my $date = $file ;
 $date =~ s/^.*?-(\d{8,8})_.*$/$1/ ;
 $size = -s "$dir_in/$file" ;
 print "Scan file '$file' ($size bytes)\n" ;

 # tag the matches that follow with the date they belong to
 $cmd = "echo \"\# $date\" >> $dir_out/scan.txt" ;
 print "Cmd: $cmd\n" ;
 $result = `$cmd` ;

 # NOTE(review): invokes plain '7z' although $path_7za is configured at the
 # top of the script — confirm which binary is intended
 $cmd = "7z -so e $dir_in/$file | grep -i -f $file_pattern >> $dir_out/scan.txt" ;
 print "Cmd: $cmd\n" ;
 $result = `$cmd` ;

 print "File done in " . &mmss(time - $filestart) . "\n\n" ;
 }

 print "Job done in " . &mmss(time - $jobstart) . "\n" ;
 print "Average file took " . &mmss(int (time - $jobstart)/($#files+1)) . "\n" ;
 }
 &Log ("\n\n") ;
}
 184+
# Expand "$dir_out/scan.txt" (written by ScanVisitorStats) into three csv
# files: daily totals per language:title, hourly counts per language:title,
# and hourly counts per language. Input lines are either '# yyyymmdd' date
# markers or 'lang title counts' match lines, where counts is a compacted
# string like 'A12B7' (letter = hour, 'A' is hour 00, digits = count).
sub UncompactVisitorStats
{
 &Log ("\nUncompact visitors stats\n\n") ;
 my $dir_out = shift ;

 my $file_in = "$dir_out/scan.txt" ;
 my $file_out1 = "$dir_out/CountsDailyPerLanguageTitles.csv" ; # totals for full day per language:title
 my $file_out2 = "$dir_out/CountsHourlyPerLanguageTitle.csv" ; # hourly counts per language:title (hours vertical)
 my $file_out3 = "$dir_out/CountsHourlyPerLanguage.csv" ; # hourly counts per language (hours vertical)
 my ($date,$time,$year,$month,$day) ; ;

 open IN, '<', $file_in ;
 binmode IN ;

 while ($line = <IN>)
 {
 # process timestamp: date marker lines apply to all match lines that follow;
 # the date is stored as a spreadsheet formula
 if ($line =~ /^#/)
 {
 $date = substr ($line,2,8) ;
 $year = substr ($date,0,4) ;
 $month = substr ($date,4,2) ;
 $day = substr ($date,6,2) ;
 $date = "=DATE($year,$month,$day)" ;
 next ;
 }

 chomp ($line) ;
 ($lang,$title,$counts) = split (" ", $line) ;
 $title =~ s/,/&comma;/g ; # titles may contain commas -> escape for csv
 $lang =~ s/\.z// ; # '.z' suffix marks wikipedia — TODO confirm
 $lang =~ s/\.y/2/ ;
 $counts =~ s/^\d+// ; # remove (redundant) preceding total

 # store hourly counts: peel off one letter (hour) plus digits (count) per pass
 while ($counts ne "")
 {
 $letter = substr ($counts,0,1) ;
 $counts = substr ($counts,1) ;
 ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 $counts =~ s/^\d+(.*)$/$1/ ;
 $h = sprintf ("%02d", ord ($letter) - ord ('A')) ; # 'A' -> hour 00
 $time = $date . "+TIME($h,0,0)" ;

 $hits1 {"$lang,$title,\"$date\""} += $count ;
 $key = "$lang:$title" ;
 $times {$time}++ ;
 $keys {$key} ++ ;
 $languages {$lang} ++ ;
 $hits2 {"$time,$key"} += $count ;
 $hits3 {"$time,$lang"} += $count ;
 }
 }

 close IN ;

 # file_out1: write totals for full day per language:title
 # quick way to see which titles are visited significantly
 @lines = sort @lines ; # NOTE(review): @lines is never filled — this looks like a leftover no-op
 open OUT, '>', $file_out1 ;
 binmode OUT ;
 foreach $key (sort keys %hits1)
 { print OUT "$key,${hits1{$key}}\n" ; }
 close OUT ;

 # file_out2: write hourly counts per language:title (hours vertical)
 open OUT, '>', $file_out2 ;
 binmode OUT ;

 # header line
 $line = "date / group" ;
 foreach $key (sort keys %keys)
 { $line .= ",$key" ; }
 $line .= "\n" ;
 print OUT $line ;

 # one row per hour, one column per language:title; missing cells become 0
 foreach $time (sort keys %times)
 {
 $line = "\"$time\"" ;
 foreach $key (sort keys %keys)
 {
 $count = $hits2 {"$time,$key"} ;
 if ($count eq "")
 { $count = 0 ; }
 $line .= ",$count" ;
 }
 $line .= "\n" ;
 print OUT $line ;
 }
 close OUT ;

 # file_out3: write hourly counts per language (hours vertical)
 open OUT, '>', $file_out3 ;
 binmode OUT ;

 # header line
 $line = "date / group" ;
 foreach $lang (sort keys %languages)
 { $line .= ",$lang" ; }
 $line .= "\n" ;
 print OUT $line ;

 # one row per hour, one column per language; missing cells become 0
 foreach $time (sort keys %times)
 {
 $line = "\"$time\"" ;
 foreach $lang (sort keys %languages)
 {
 $count = $hits3 {"$time,$lang"} ;
 if ($count eq "")
 { $count = 0 ; }
 $line .= ",$count" ;
 }
 $line .= "\n" ;
 print OUT $line ;
 }
 close OUT ;

}
 303+
# Build a grep pattern from 'wikilinks.html' (a saved page with interwiki
# links): titles linked from more than 10 languages match in any project
# ("title" line), others only in their specific language ("^lang\.z title").
# Returns the pattern text, one regexp per line, suitable for grep -f.
sub GetPattern
{
 print "GetPattern\n" ;
 open HTML, '<', 'wikilinks.html' or die "Could not open 'wikilinks.html'\n" ;
 $pattern = "" ;
 while ($line = <HTML>)
 {
 if ($line =~ /class=\"interwiki/)
 {
 chomp ($line) ;
 $lang = $line ;
 $lang =~ s/^.*?interwiki-(\w+).*$/$1/ ; # e.g. class="interwiki-de" -> 'de'
 $title = $line ;
 $title =~ s/^.*?href=\"([^\"]+)\".*$/$1/ ; # link target url
 $title =~ s/^.*\/([^\/]+)$/$1/ ; # last url segment = article title
 # was '@languages {..}' / '@langcnt {..}': single hash elements need the
 # '$' sigil — a hash slice cannot be target of '.=' or '++'
 $languages {$title} .= "$lang," ;
 $langcnt {$title}++ ;
 }
 }
 close HTML ;
 print "\n\n\n" ;

 # most-linked titles first
 foreach $title (sort {$langcnt {$b} <=> $langcnt {$a}} keys %langcnt)
 {
 $count = $langcnt {$title} ;
 if ($count > 10)
 { $pattern .= "$title\n" ; }
 else
 {
 $langlist = $languages {$title} ;
 @langs = split (',', $langlist) ;
 foreach $lang (@langs)
 {
 print "$lang $title\n" ;
 # double backslash so the emitted grep pattern contains a literal '\.'
 # (in a double-quoted string "\." loses the backslash)
 $pattern .= "^$lang\\.z $title\n" ;
 }
 }
 }
 return ($pattern) ;
}
 344+
# Print a message to both the console and the open LOG file.
sub Log
{
 $msg = shift ; # deliberately a package global, as elsewhere in this script
 for my $handle (\*STDOUT, \*LOG)
 { print {$handle} $msg ; }
}
 351+
# Report a fatal error on console and LOG file, then stop the script.
sub Abort
{
 $msg = shift ; # package global, matching the style of sub Log
 my $text = "Abort script\nError: $msg\n" ;
 print STDOUT $text ;
 print LOG $text ;
 exit ;
}
 359+
# Format a duration in seconds as "M min, S sec".
sub mmss
{
 my $total = shift ;
 return sprintf ("%d min, %d sec", int ($total / 60), $total % 60) ;
}
 365+
Index: trunk/wikistats/dammit.lt/dammit_compact_monthly.sh
@@ -0,0 +1,11 @@
 2+#!/bin/sh
 3+
 4+ulimit -v 8000000
 5+
 6+# dte=$(date +%Y%m)
 7+# dte=$(date --date "$dte -1 days" +%Y%m)
 8+# echo "Compact dammit.lt files for one day: $dte"
 9+
 10+echo "Compact dammit.lt files for one month"
 11+nice perl /a/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl -m -d 201001 -i /a/dammit.lt/pagecounts -o /a/dammit.lt/pagecounts/monthly >> /a/dammit.lt/pagecounts/monthly/compact_log.txt
 12+
Property changes on: trunk/wikistats/dammit.lt/dammit_compact_monthly.sh
___________________________________________________________________
Added: svn:eol-style
113 + native
Index: trunk/wikistats/dammit.lt/DammitReportPageRequestsStaffWikis.pl
@@ -0,0 +1,281 @@
#!/usr/bin/perl

# DammitReportPageRequestsStaffWikis.pl
# Generates monthly 'most viewed pages' reports (html/txt/csv) for the staff
# wikis from filtered daily page count files. Run daily; early in a month the
# previous month is regenerated as well.

# bash file for daily generation and copy
# blank article title ?!

# no warnings 'uninitialized';

 use lib "/home/ezachte/lib" ; # general routines
 use lib "/home/ezachte/wikistats" ; # WikiReports*.pm modules
 use lib "W:/! Perl/Wikistats" ; # test env

 use EzLib ;
 ez_lib_version (8) ;
 $trace_on_exit = $true ;

# use Time::Local ;
# use Net::Domain qw (hostname);

 use WikiReportsDate ;
 use WikiReportsLiterals ;
 use WikiReportsOutputMisc ;
 use WikiReportsScripts ;
 use WikiReportsNoWikimedia ;
 use WikiReportsLocalizations ;
 use WikiReportsHtml ;

 # note: $mon is 0-based, $year is years since 1900 (adjusted in SetMonths)
 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);

 &SetMonths ;
 &SetLiterals ;
 &SetScripts ;

 # always report the current month; in the first days of a month also
 # refresh the previous month's report
 &CountMostViewedPages ($this_month) ;
 if ($mday <= 5)
 { &CountMostViewedPages ($prev_month) ; }

 exit ;
 39+
# Derive $this_month, $prev_month and $prev_prev_month ("yyyy-mm") from the
# 0-based $mon and 1900-based $year that localtime left in the globals.
# Note: deliberately keeps stepping the global $mon/$year backwards,
# exactly like the original code did.
sub SetMonths
{
 $mon ++ ; # localtime months are 0-based
 $year += 1900 ;
 $this_month = sprintf ("%04d-%02d", $year, $mon) ;
 foreach my $slot (\$prev_month, \$prev_prev_month)
 {
 $mon -- ;
 if ($mon == 0) { $mon = 12 ; $year -- ; } # wrap into previous year
 ${$slot} = sprintf ("%04d-%02d", $year, $mon) ;
 }
}
 50+
# Produce the 'most viewed pages' report for one month ("yyyy-mm"):
# reads the filtered daily pagecount files for that month, accumulates views
# per project/article, and writes txt/csv files (sorted by title and by
# views) plus an html report per project.
# Relies on helpers from the WikiReports* modules: &LogT, &btn, &opt,
# &GenerateHtmlStart, month_english_short, date_time_english.
sub CountMostViewedPages
{
 my $month = shift ;
 ($month2 = $month) =~ s/-// ; # "yyyy-mm" -> "yyyymm", as used in file names

 undef %views ; # fresh accumulation per invocation

 &LogT ("Count pages for $month\n\n") ;

 if ($job_runs_on_production_server)
 {
 &LogT ("Job runs on production server\n") ;
 $path_in = "/a/dammit.lt/filtered" ;
 $path_out = "/mnt/htdocs/page_views" ;
 }
 else
 {
 &LogT ("Job runs on local test server\n") ;
 $path_in = "w:/! Perl/Dammit/Page Requests Staff Wikis" ;
 $path_out = "w:/! Perl/Dammit/Page Requests Staff Wikis" ;
 }

 &LogT ("Path in: $path_in\n") ;
 &LogT ("Path out: $path_out\n") ;
 chdir $path_in ;
 my @files = glob "*" ; # glob on qualified dir on Windows gives problems, hence chdir ??

 # track first and last day actually present, to label the reporting period
 $first = "" ;
 $last = "" ;
 foreach $file (sort @files)
 {
 next if $file !~ /pagecounts-$month2\d\d.txt/ ;
 &LogT ("$file\n") ;

 # derive "yyyy-mm-dd" for first and latest processed file
 if ($first eq "")
 { ($first = $file) =~ s/[^\d]//g ; }
 ($last = $file) =~ s/[^\d]//g ;
 $first =~ s/(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/ ;
 $last =~ s/(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/ ;
 $first_day = substr ($first,8,2) ;
 $last_day = substr ($last ,8,2) ;

 # NOTE(review): IN is never closed explicitly; reopening each iteration
 # closes the previous handle implicitly
 open IN, '<', $file ;
 while ($line = <IN>)
 {
 chomp $line ;
 ($project, $article, $counts) = split (' ', $line) ;

 # skip junk entries
 next if $article =~ /^\s*$/ ;
 next if $project eq "quality.m" ; # obsolete
 next if $article =~ /:\/\// ; # e.g. http://
 next if $article =~ /\.php/ ;

 $article =~ s/^[\/\\]*// ; # strip leading slashes/backslashes
 $article = ucfirst $article ;
 $project =~ s/\.m$// ;
 $project = ucfirst $project ;
 $projects {$project} ++ ;

 # counts field starts with the daily total; keep only that
 ($daytotal = $counts) =~ s/^(\d+).*$/$1/ ;
 $views {$project} {$article} += $daytotal ;

 # if ($article =~ /China/)
 # { print "$project $article + $daytotal -> " . $views {$project} {$article} . "\n" ; }
 }
 }

 $month_eng = month_english_short (substr($month,5,2) - 1) . ' ' . substr ($month,0,4) ;

 $period = 'day ' . (substr ($first,8,2)+0) . '-' . (substr ($last,8,2)+0) ;

 foreach $project (sort keys %projects)
 {
 &LogT ("\nWrite totals for project $project for month $month (day $first_day - $last_day)\n\n") ;

 # === Sort by title ===

 @articles = sort keys %{$views {$project}} ;
 next if $#articles == -1 ;

 open TXT, '>', "$path_out/PageViews${project}-$month-ByTitle.txt" ;
 open CSV, '>', "$path_out/PageViews${project}-$month-ByTitle.csv" ;

 print TXT "title,views (period: $first - $last)\n" ;
 print CSV "views,title,period: $first - $last\n" ;

 foreach $article (@articles)
 {
 # txt file gets the %XX-decoded title, csv the raw (quoted if needed)
 ($article2 = $article) =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 print TXT "$article2,${views {$project} {$article}}\n" ;

 $article2 = $article ;
 if ($article2 =~ /,/)
 { $article2 = "\"$article2\"" ; }
 print CSV "${views {$project} {$article}},$article2\n" ;
 }
 close TXT ;
 close CSV ;

 # === Sort by views ===

 # previous/next month navigation buttons for the html report
 if ($month eq $this_month)
 {
 $url_prev = "PageViews${project}-$prev_month-ByViews.html" ;
 $url_next = "" ;
 $out_button_prev = &btn (" < ", $url_prev) ;
 $out_button_next = "" ;
 }
 elsif ($month eq $prev_month)
 {
 $url_prev = "PageViews${project}-$prev_prev_month-ByViews.html" ;
 $url_next = "PageViews${project}-$this_month-ByViews.html" ;
 $out_button_prev = &btn (" < ", $url_prev) ;
 $out_button_next = &btn (" > ", $url_next) ;

 if (! -e $url_prev)
 { $out_button_prev = "" ; }
 }

 my $out_zoom = "" ;
 my $out_options = "" ;
 my $out_explanation = "" ; #Based on Domas' <a href='http://dammit.lt/wikistats/'>page view files</a>" ;
 my $out_page_subtitle = "" ;
 my $out_crossref = "" ;
 my $out_description = "" ;
 my $out_button_switch = "" ;
 my $out_msg = "<b>$month_eng ($period)</b>" ;
 my $lang = "en" ;

 my $out_html_title = "$project wiki page views" ;
 my $out_page_title = "$project wiki page views" ;

 $out_scriptfile = "<script language=\"javascript\" type=\"text/javascript\" src=\"WikipediaStatistics14.js\"></script>\n" ;
 $out_style =~ s/td/td {font-size:12px}\nth {font-size:12px}\ntd/ ; # script definition needs clean up

 # project selector: current project first, then the others
 $out_options = &opt ("PageViews${project}-$month-ByViews.html", $project) ;
 foreach $project2 (keys %projects)
 {
 if ($project2 ne $project)
 { $out_options .= &opt ("PageViews${project2}-$month-ByViews.html", $project2) ; }
 }

 $unicode = $true ;
 &GenerateHtmlStart ($out_html_title, $out_zoom, $out_options,
 $out_page_title, $out_page_subtitle, $out_explanation,
 $out_button_prev, $out_button_next, $out_button_switch,
 $out_crossref, $out_msg) ;

 $out_html =~ s/Sitemap.htm/http:\/\/stats.wikimedia.org/ ; # Q&D patch
 $out_html =~ s/ Home / stats.wikimedia.org / ; # Q&D patch

 @articles = sort {$views {$project}{$b} <=> $views {$project}{$a}} keys %{$views {$project}} ;

 open TXT, '>', "$path_out/PageViews${project}-$month-ByViews.txt" ;
 open CSV, '>', "$path_out/PageViews${project}-$month-ByViews.csv" ;

 print TXT "title,views (period: $first - $last)\n" ;
 print CSV "views,title,period: $first - $last\n" ;

 $out_html .= "<p><b>Other formats</b>: " ;
 $out_html .= "ordered by views: <a href='PageViews${project}-$month-ByViews.txt'>text file</a> / <a href='PageViews${project}-$month-ByViews.csv'>csv file</a>, " ;
 $out_html .= "ordered by title: <a href='PageViews${project}-$month-ByTitle.txt'>text file</a> / <a href='PageViews${project}-$month-ByTitle.csv'>csv file</a><p>" ;
 $out_html .= "<table border=1>\n" ;
 $out_html .= "<tr><th class=cb>Rank</th><th class=cb>Views</th><th class=lb>Title</th></tr>\n" ;

 # txt and csv get all articles; the html table only the top 1000
 $lines = 0 ;
 foreach $article (@articles)
 {
 ($article2 = $article) =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 print TXT "$article2,${views {$project} {$article}}\n" ;

 $article2 =~ s/_/ /g ;
 if (++$lines <= 1000)
 { $out_html .= "<tr><td class=c>$lines</td><td class=r>${views {$project} {$article}}</td><td class=l><a href='http://$project.wikimedia.org/wiki/$article'>$article2</a></td></tr>\n" ; }

 $article2 = $article ;
 if ($article2 =~ /,/)
 { $article2 = "\"$article2\"" ; }
 print CSV "${views {$project} {$article}},$article2\n" ;
 }

 $out_html .= "</table>\n" ;

 close TXT ;
 close CSV ;

 $out_html .= "<p><small>Counts based on <a href='http://dammit.lt/wikistats/'>Domas' hourly pagecount files</a><br>" .
 "File generated on " . date_time_english (time) . "<br>Author: Erik Zachte</small>" ;

 open HTML, '>', "$path_out/PageViews${project}-$month-ByViews.html" ;
 print HTML $out_html ;
 close HTML ;

 if ($month eq $this_month) # static url
 {
 open HTML, '>', "$path_out/PageViews${project}.html" ;
 print HTML $out_html ;
 close HTML ;
 }
 }
}
 252+
# Translates one UTF-8 encoded character into a single-byte (Latin-1)
# character when the code point fits in 0..255; otherwise the input is
# returned unchanged.
sub UnicodeToAscii {
    my $utf8_char = shift ;

    my $lead = ord (substr ($utf8_char, 0, 1)) ;

    # plain ascii lead byte: nothing to decode (will not occur in this script)
    return $utf8_char if $lead < 128 ;

    # strip the length marker bits from the lead byte
    my $code_point ;
    if    ($lead >= 252) { $code_point = $lead - 252 ; }
    elsif ($lead >= 248) { $code_point = $lead - 248 ; }
    elsif ($lead >= 240) { $code_point = $lead - 240 ; }
    elsif ($lead >= 224) { $code_point = $lead - 224 ; }
    else                 { $code_point = $lead - 192 ; }

    # fold in six payload bits from every continuation byte
    foreach my $pos (1 .. length ($utf8_char) - 1)
    { $code_point = $code_point * 64 + ord (substr ($utf8_char, $pos, 1)) - 128 ; }

    # $utf8_char =~ s/([\x80-\xFF])/("%".sprintf("%02X",$1))/gie ;
    return $code_point < 256 ? chr ($code_point) : $utf8_char ;
}
 281+
 282+
Index: trunk/wikistats/dammit.lt/DammitSyncFiles.pl
@@ -0,0 +1,197 @@
#!/usr/bin/perl

# DammitSyncFiles.pl
# Downloads new hourly pagecounts-/projectcounts- files from
# http://dammit.lt/wikistats/ by scraping its directory index page;
# projectcounts files are appended to a yearly tar archive.

# 27 April 2010 renamed from WikiStatsDammitSync.pl

 use Time::Local ;
 use Archive::Tar;

 $tar = Archive::Tar->new;

 $| = 1; # flush screen output

 $maxdaysago = 40; # do not download files more than this ago

 # a local copy of the index page signals a test run
 if (-e "a_dammit.lt_index.html") # test
 { $file_html = "a_dammit.lt_index.html" ; }
 else
 {
 open LOG, '>>', "/a/dammit.lt/WikiStatsDammitSync.log" ;

 $file_html = "/a/dammit.lt/index.html" ;
 unlink $file_html ;
 $cmd = "wget -O $file_html http://dammit.lt/wikistats/" ;
 $result = `$cmd` ;
 # NOTE(review): backticks return wget's stdout (a string), not its exit
 # status; '== 0' compares that string numerically — confirm intent
 if ($result == 0)
 { $result = "OK" ; }
 &Log ("Cmd '$cmd' -> $result \n\n") ;

 if (! -e $file_html) { &Abort ("File $file_html not found") ; }
 if (-s $file_html == 0) { &Abort ("File $file_html empty") ; }
 }

 $timestart = time ;

 chdir "/a/dammit.lt/projectcounts" ;
 $cmd = `pwd` ;
 &Log ("Cmd '$cmd'\n") ;
 # NOTE(review): this executes the OUTPUT of pwd (a directory path) as a
 # shell command — almost certainly unintended, confirm and remove
 $result = `$cmd` ;
 print "$result\n" ;

 # scrape the index page: <title> lines announce the (sub)directory being
 # listed, application/octet-stream lines are downloadable count files
 open HTML,'<',$file_html ;
 while ($line = <HTML>)
 {
 if ($line =~ /<title>/)
 {
 $subdir = "" ;
 if ($line =~ /archive/)
 {
 $line =~ s/^.*?\/wikistats\/// ;
 $line =~ s/<.*$// ;
 chomp $line ;
 $subdir = $line ;
 }
 &Log ("Subdir = '$subdir'\n") ;
 next ;
 }

 if ($line !~ /application\/octet-stream/) { next ; }

 # extract file name and its 'yyyy-Mon-dd hh:mm:ss' timestamp
 ($file = $line) =~ s/^.*?a href=\"([^"]+)\".*$/$1/s ;
 ($date = $line) =~ s/^.*?class=\"m\">([^<]+)<.*$/$1/s ;
 ($date,$time) = split (' ', $date) ;

 if ($file =~ /^pagecounts/)
 {
 # date embedded in the file name, e.g. pagecounts-20100131-...
 $yy = substr ($file,11,4) ;
 $mm = substr ($file,15,2) ;
 $dd = substr ($file,17,2) ;
 $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;

 print "$file: $daysago days ago\n" ;
 if ($daysago > $maxdaysago) { next ; }

 # $path_7z = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_fdt.7z" ;
 # if (-e $path_7z) { print "exists\n" ; next ; }

 # skip days that were already compacted (any compression flavour)
 $path = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_h" ;
 if ((-e "$path.7z") || (-e "$path.zip") || (-e "$path.bz2") || (-e "$path.gz"))
 { print "$path.[7z|zip|bz2|gz] exists\n" ; next ; }
 else
 { print "$path.[7z|zip|bz2|gz] new -> download\n" ; }
 }

 # if ($file =~ /^projectcounts/)
 # {
 # $yy = substr ($file,14,4) ;
 # $mm = substr ($file,18,2) ;
 # $dd = substr ($file,20,2) ;
 # $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;
 # if ($daysago > $maxdaysago) { next ; }
 # }


 # convert index page timestamp to a 'touch -t' argument (yymmddhhmm.ss)
 $yy = substr ($date,0,4) ;
 $mm = substr ($date,5,3) ;
 $dd = substr ($date,9,2) ;
 $hh = substr ($time,0,2) ;
 $nn = substr ($time,3,2) ;
 $ss = substr ($time,6,2) ;

 if ($mm eq 'Jan') { $mm = 1 ; }
 elsif ($mm eq 'Feb') { $mm = 2 ; }
 elsif ($mm eq 'Mar') { $mm = 3 ; }
 elsif ($mm eq 'Apr') { $mm = 4 ; }
 elsif ($mm eq 'May') { $mm = 5 ; }
 elsif ($mm eq 'Jun') { $mm = 6 ; }
 elsif ($mm eq 'Jul') { $mm = 7 ; }
 elsif ($mm eq 'Aug') { $mm = 8 ; }
 elsif ($mm eq 'Sep') { $mm = 9 ; }
 elsif ($mm eq 'Oct') { $mm = 10 ; }
 elsif ($mm eq 'Nov') { $mm = 11 ; }
 elsif ($mm eq 'Dec') { $mm = 12 ; }
 else { &Abort ("Invalid month '$mm' in file date $date $time") ; }

 $date2 = sprintf ("%02d%02d%02d%02d%02d.%02d", ($yy-2000), $mm, $dd, $hh, $nn, $ss) ;

 if ($file =~ /^(?:page|project)counts-2/)
 {

 if ($file =~ /^pagecounts/)
 { $path = "/a/dammit.lt/pagecounts/$file" ; }
 else
 { $path = "/a/dammit.lt/projectcounts/$file" ; }

 # skip files already downloaded, unless the copy is empty
 if (-e $path)
 {
 &Log ("File $path exists\n") ;
 if (-s $path == 0)
 {
 &Log ("File $path empty -> overwrite\n") ;
 unlink $path ;
 }
 else { next ; }
 }

 # projectcounts files may already live inside the yearly tar archive
 if ($file =~ /^projectcounts/)
 {
 $tar_file = "/a/dammit.lt/projectcounts/projectcounts-$yy.tar" ;
 if (-e $tar_file)
 {
 # cache: only (re)read the archive listing when the year changes
 if ($tar_file ne $tar_file_prev)
 {
 &Log ("\nRead tar file $tar_file\n") ;
 $tar->read($tar_file);
 $tar_file_prev = $tar_file ;
 }
 if ($tar->contains_file ($file))
 {
 &Log ("File $file exists in tar file $tar_file\n") ;
 next ;
 }
 }
 else
 { &Log ("Tar file $tar_file not found\n") ; }
 }

 &Log ("Write file $path, set date $date2\n") ;

 # NOTE(review): downloads from mituzas.lt although the index page came
 # from dammit.lt — confirm both hosts serve the same files
 $cmd = "wget -a /a/dammit.lt/wget.log -O $path http://mituzas.lt/wikistats/$subdir$file" ;
 $result = `$cmd` ;
 if ($result == 0)
 { $result = "OK" ; }
 &Log ("Cmd '$cmd' -> $result \n\n") ;

 # give the local copy the remote file's timestamp
 `touch $path -t $date2` ;

 # projectcounts files go into the yearly tar; the loose copy is removed
 if ($file =~ /^projectcounts/)
 {
 $cmd = "tar --append --file=$tar_file $file" ;
 &Log ("Cmd '$cmd'\n") ;
 $result = `$cmd` ;
 print "$result\n" ;
 unlink $path ;
 }
 }
 }

 &Log ("Ready in " . (time - $timestart) . " sec.\n") ;
 close HTML ;
 close LOG ;
 exit ;
 182+
# Prefix a message with the current local time (hh:mm:ss) and write it to
# both the console and the LOG file.
# Note: assigns the package global $msg, as the rest of this script does.
sub Log
{
 $msg = shift ;
 my ($sec, $min, $hour) = localtime (time) ;
 $msg = sprintf ("%02d:%02d:%02d ", $hour, $min, $sec) . $msg ;
 for my $handle (\*STDOUT, \*LOG)
 { print {$handle} $msg ; }
}
 192+
# Log a fatal error (timestamped, via &Log) and stop the script.
sub Abort
{
 &Log ($msg = shift) ; # keeps the package global $msg in sync, as before
 exit ;
}
Index: trunk/wikistats/dammit.lt/DammitCompactHourlyPageCountFiles.pl
@@ -0,0 +1,964 @@
#!/usr/local/bin/perl

# Merge hourly pagecounts-*.gz files from http://dammit.lt/wikistats into
# one compact per-day file (see CompactVisitorStats / MergeFilesFullDay).
#
# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl

# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases

# Ideas:
# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
# 2 frequency distribution hits per file per first letter _-> manifest crawler
# assuming crawler collects articles in alphabetical order
# 3 first letter uppercase -> sort (in sections per first two chars ?)

 use lib "/home/ezachte/lib" ;
 use EzLib ;

 # $true/$trace_on_exit appear to be provided by EzLib before the local
 # assignments below -- TODO confirm against EzLib.pm
 $trace_on_exit = $true ;
 ez_lib_version (13) ;

 # set defaults mainly for tests on local machine
 default_argv "-i C:/bayes_backup/a/dammit.lt/pagecounts|-t C:/bayes_backup/a/dammit.lt|-f C:/bayes_backup/a/dammit.lt|-o C:/bayes_backup/a/dammit.lt|-d 20101215" ;

 use CGI qw(:all);
 use URI::Escape;
 use Getopt::Std ;
 use Cwd ;
 # presence of /a/dammit.lt distinguishes the production host from a
 # local (Windows) test environment
 $bayes = -d "/a/dammit.lt" ;
 $path_7za = "/usr/lib/p7zip/7za" ;
 if (! $bayes)
 {
 print "Test on Windows\n" ;
 # NOTE(review): 'use' executes at compile time, so these modules are
 # loaded regardless of the runtime $bayes test
 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
 use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
 }

 $| = 1; # flush screen output

 $true = 1 ;
 $false = 0 ;
 $threshold = 0 ; # minimum daily total for a title to be written (0 = keep all)
 undef %totals_per_namespace ;

 # languages whose per-title counts are additionally copied to the filter file
 $filter = "^(?:outreach|quality|strategy|usability)\.m\$" ;
 print "Filter: $filter\n" ;
 $reg_exp_filter = qr"$filter" ;

 # sentinel page used to monitor squid log coverage
 $track = "NonExistingPageForSquidLogMonitoring" ;
 print "Track: $track\n" ;
 $reg_exp_track = qr"$track" ;

# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"

 # getopt("iodft") marks each of -i -o -d -f -t as taking a value
 my $options ;
 getopt ("iodft", \%options) ;

 if (! defined ($options {"i"})) { &Abort ("Specify input dir: -i dirname") } ;
 if (! defined ($options {"o"})) { &Abort ("Specify output dir: -o dirname") } ;
 if (! defined ($options {"f"})) { &Abort ("Specify filter dir: -f dirname") } ;
 if (! defined ($options {"t"})) { &Abort ("Specify tracking dir: -t dirname") } ;
 if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") } ;

 $dir_in = $options {"i"} ;
 $dir_out = $options {"o"} ;
 $dir_filtered = $options {"f"} ;
 $dir_track = $options {"t"} ;
 $daterange = $options {"d"} ;

 $work = cwd() ;
 print "Work dir $work\n" ;

 # directories given without any path separator are taken relative to cwd
 if ($dir_in !~ /[\/\\]/)
 { $dir_in = "$work/$dir_in" ; }

 if ($dir_out !~ /[\/\\]/)
 { $dir_out = "$work/$dir_out" ; }

 if ($dir_filtered !~ /[\/\\]/)
 { $dir_filtered = "$work/$dir_filtered" ; }

 if ($dir_track !~ /[\/\\]/)
 { $dir_track = "$work/$dir_track" ; }

 if (! -d $dir_in)
 { &Abort ("Input dir not found: $dir_in") } ;

 if (! -d $dir_out)
 {
 print "Create output dir $dir_out\n" ;
 mkdir $dir_out ;
 if (! -d $dir_out)
 { &Abort ("Output dir could not be created.") } ;
 }

 # accept one day (yyyymmdd), one month (yyyymm*), one year (yyyy*) or all (*)
 if (($daterange !~ /^\d{8}$/) && ($daterange !~ /^\d{6}\*$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
 { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") ; }

 print "\nCompress pagecount files\nin: $dir_in\nout: $dir_out\nflt: $dir_filtered\ntrack: $dir_track\ndate range: $daterange" ;
 # turn the shell-style wildcard into a regex fragment used for file matching
 $daterange =~ s/\*/\\d+/ ;

 open LOG, ">>", "$work/WikiStatsCompactDammitFiles.log" ;

 &CompactVisitorStats ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
# &UncompactVisitorStats ; # test only, to see if process is revertible

 &Log ("\nReady\n") ;
 close LOG ;
 exit ;
 111+
sub CompactVisitorStats
{
 # Collect all hourly pagecounts-*.gz files in $dir_in that match the
 # requested date range, then merge them into one daily file per date.
 # Fills the globals @files and %process_dates used by MergeFilesFullDay.
 my $dir_in = shift ;
 my $dir_out = shift ;
 my $dir_filtered = shift ;
 my $dir_track = shift ;
 my $daterange = shift ;

 chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;

 local (*DIR);
 opendir (DIR, ".") || &Abort ("Cannot open dir $dir_in\n") ;
 @files = () ;

 while ($file_in = readdir (DIR))
 {
 # fix: dot before 'gz' was unescaped; also {6,6} simplified to {6}
 next if $file_in !~ /^pagecounts-$daterange-\d{6}\.gz$/ ;

 push @files, $file_in ;
 }

 closedir (DIR) ; # fix: closedir takes a single dirhandle argument

 @files = sort @files ;

 # for a single explicit day insist on a full set of 24 hourly files
 if (($daterange =~ /^\d{8}$/) and ($#files < 23))
 { &Abort ("Less than 24 files found for date $daterange (found " . scalar (@files) . ")\n") ; }

 foreach $file (@files)
 {
 $date = substr ($file,11,8) ; # yyyymmdd part of pagecounts-yyyymmdd-hhmmss.gz
 $process_dates {$date}++ ;
 }

 &Log ("\n\n") ;

 foreach $date (sort keys %process_dates)
 { &MergeFilesFullDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $date) ; }
}
 151+
sub MergeFilesFullDay
{
 # Merge the 24 hourly pagecounts files for one day (from global @files)
 # into a single "pagecounts-yyyymmdd_h" file: one line per title with the
 # daily total followed by per-hour counts encoded as A..X (hour 0..23).
 # Also writes per-namespace totals, a per-language filter file and a
 # tracking file for the squid-log sentinel page.
 #
 # Fixes in this revision (logic otherwise unchanged):
 # - 'open ... || &Abort' had a precedence bug: || bound to the filename,
 #   so the error check never fired; now uses low-precedence 'or'
 # - global $hours_missing is reset per day (it used to leak a stale
 #   "missing hours" header into every subsequent day)
 # - $time_start_compression is now actually set before compressing
 my $dir_in = shift ;
 my $dir_out = shift ;
 my $dir_filtered = shift ;
 my $dir_track = shift ;
 my $date = shift ;

 my $year = substr ($date,0,4) ;
 my $month = substr ($date,4,2) ;
 my $day = substr ($date,6,2) ;

 my ($file_out1, $file_out2, $file_out3, $out_gz) ;

 # one output subdirectory per month
 $dir_out = "$dir_out/${year}-${month}" ;
 if (! -d $dir_out)
 {
 mkdir $dir_out ;
 if (! -d $dir_out)
 { &Abort ("Output dir could not be created: $dir_out") } ;
 }

 my @files_today = () ;
 foreach $file (@files)
 {
 # fix: dot before 'gz' was unescaped
 next if $file !~ /^pagecounts-$date-\d{6}\.gz$/ ;

 push @files_today, $file ;
 }

 # very few times (nearly) duplicate files are found for same hour
 # keep the largest and presumably most complete one
 # (first 25 chars = 'pagecounts-yyyymmdd-hhmms', i.e. same hour)
 for ($i = 0 ; $i < $#files_today ; $i++)
 {
 for ($j = $i+1 ; $j <= $#files_today ; $j++)
 {
 if (substr ($files_today [$i],0,25) eq substr ($files_today [$j],0,25))
 {
 $size_i = -s $files_today [$i] ;
 $size_j = -s $files_today [$j] ;
 print "${files_today [$i]}: $size_i\n" ;
 print "${files_today [$j]}: $size_j\n" ;
 if ($size_i > $size_j)
 {
 print "Keep ${files_today [$i]}\n\n" ;
 $files_today [$j]= "" ;
 }
 else
 {
 print "Keep ${files_today [$j]}\n\n" ;
 $files_today [$i]= "" ;
 }
 }
 }
 }

 $lines = 0 ;
 $hours_missing = "" ; # fix: reset per day, was a leaking global

 undef @in_gz ;
 undef $file_open ;
 my $time_start = time ;

 if ($bayes)
 {
 # on the production host write plain text, compress with bzip2 afterwards
 $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
 if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
 {
 &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
 return ;
 }
 if ($#files_today < 23)
 {
 &Log ("\nLess than 24 files found for target file '$file_out2.7z'. Skip this date.\n") ;
 return ;
 }

 # fix: was 'open ... "$file_out2" || &Abort' -> || bound to the filename
 open ($out_gz2, ">", $file_out2) or &Abort ("Output file '$file_out2' could not be opened.") ;
 }
 else
 {
 # local test environment: write gzip directly
 $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, count above threshold
 $out_gz2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 }

 binmode $out_gz2 ;

 $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
 &Log ("\nFilter file: $file_filtered\n") ;
 open ($out_filtered, '>', $file_filtered) or &Abort ("Filter file '$file_filtered' could not be opened.") ;
 binmode $out_filtered ;

 $file_track = "$dir_track/_PageCountsForSquidLogTracking.txt" ;
 &Log ("Tracking file: $file_track\n\n") ;

 for ($hour = 0 ; $hour < 24 ; $hour++)
 { $file_in_found [$hour] = $false ; }

 # open all hourly files and read their first record;
 # language codes get a suffix (.y/.z) so that e.g. 'en2' sorts after 'en'
 $files_in_open = 0 ;
 $files_in_found = 0 ;
 $langprev = "" ;
 foreach $file_in (@files_today)
 {
 next if $file_in eq "" ; # removed as near-duplicate above

 ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 $hour = (0+$hour) ;

 if ($bayes)
 # fix: was 'open ... "gzip ..." || &Abort' -> || bound to the command string
 { open ($in_gz [$hour], "-|", "gzip -dc \"$file_in\"") or &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
 else
 { $in_gz [$hour] = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ; }
 binmode $in_gz [$hour] ;

 $files_in_open++ ;
 $file_in_found [$hour] = $true ;
 $file_in_open [$hour] = $true ;
 $files_in_found ++ ;
 $file = $in_gz [$hour] ;
 $line = <$file> ;
 $line =~ s/^(\w+)2 /$1.y /o ;
 $line =~ s/^(\w+) /$1.z /o ;

 ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 $key [$hour] = "$lang $title" ;
 }

 # header comments for the generated file
 $comment = "# Wikimedia page request counts for $date, each line shows 'subproject title counts'\n" ;
 if ($threshold > 0 )
 { $comment .= "# Count for articles with less than $threshold requests per full day are omitted\n" ; }
 $comment .= "# Subproject is language code, followed by project code\n" ;
 $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ;
 $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
 $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
 print $out_gz2 $comment ;

 if ($files_in_found < 24)
 {
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if (! $file_in_found [$hour])
 { $hours_missing .= "$hour," ; }
 }
 $hours_missing =~ s/,$// ;
 &Log ("Merge files: date = $date, only $files_in_found files found!\n") ;
 }
 else
 { &Log ("Merge files: date = $date\n") ; }

 if ($hours_missing ne '')
 {
 print $out_gz2 "#\n" ;
 print $out_gz2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
 }
 $comment = "#\n" ;
 $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
 $comment .= "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
 $comment .= "# Please reconcile with real namespace name strings later\n" ;
 $comment .= "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
 $comment .= "#\n" ;
 $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
 $comment .= "#\n" ;
 print $out_gz2 $comment ;

 # 24-way merge: all hourly files are sorted by 'lang title', so repeatedly
 # take the lowest key over all open files and advance every file at that key
 $key_low_prev = "" ;
 while ($files_in_open > 0)
 {
 $key_low = "\xFF\xFF"; # sentinel: sorts after any real key
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if (($files_in_open == 24) || ($file_in_found [$hour] && $file_in_open [$hour]))
 {
 if ($key [$hour] lt $key_low)
 { $key_low = $key [$hour] ; }
 }
 }

 # debug tracing for the 'nov' out-of-order anomaly documented below
 if (($key_low =~ /^nov/) || ($key_low_prev =~ /^nov/))
 { &Log ("key_low '$key_low' (key_low_prev '$key_low_prev')\n") ; }

 $counts = "" ;
 $total = 0 ;
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if (! $file_in_found [$hour])
 { $counts .= chr ($hour+ord('A')) . '?' ; } # hour letter + '?' = data missing
 elsif (($files_in_open == 24) || $file_in_open [$hour])
 {
 if ($key [$hour] eq $key_low)
 {
 $counts .= chr ($hour+ord('A')) . $count [$hour] ;
 $total += $count [$hour] ;
 $file = $in_gz [$hour] ;

 # read next record for this hour, skipping records whose language
 # code contains a digit (invalid, see Q&D note below)
 while ($true)
 {
 if ($line = <$file>)
 {
 $line =~ s/^([\w\-]+)2 /$1.y /o ;
 $line =~ s/^([\w\-]+) /$1.z /o ;
 ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 $key [$hour] = "$lang $title" ;

 last if $lang !~ /\d/ ;
 }
 else
 {
 # end of this hourly file: close it and retire its key
 if ($bayes)
 { close $in_gz [$hour] ; }
 else
 { $in_gz [$hour] -> close () ; }
 $files_in_open-- ;
 $file_in_open [$hour] = $false ;
 $key [$hour] = "\xFF\xFF";

 last ;
 }
 }
 }
 }
 }
 if ($lines == 0)
 { &Log ("\nlines: project key\n") ; }

 if (++$lines % 100000 == 0)
 { &Log ("$lines: $key_low\n") ; }

 last if $key_low eq "\xFF\xFF" ; # all files exhausted

 # Q&D fix for unexplained out of order error for what seems to be invalid language
 # remember : no suffix on language code gets replaced by .y or .z to fixed sort order
 # ^nov.mw nov1 1 8765
 # ^nov1.mw nov1 1 931 <--------------
 # ^nov 10_dw_oktobre 1 11421
 ($lang,$title) = split (' ', $key_low) ;
 if ($lang =~ /\d/)
 {
 $invalid_languages {$lang}++ ;
 &Log ("\nSkip invalid language '$lang'\n") ;
 next ;
 }

 # merged stream must be strictly ascending; anything else is fatal
 if ($key_low_prev gt $key_low)
 {
 for ($hour = 0 ; $hour < 24 ; $hour++)
 { &Log ("hour $hour: key ${key[$hour]}\n") ; }

 &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
 }

 if (($key_low_prev eq $key_low) && ($files_in_open > 0))
 {
 for ($hour = 0 ; $hour < 24 ; $hour++)
 {
 if ($file_in_open [$hour])
 { print "hour $hour: file open, key ${key [$hour]}\n" ; }
 else
 { print "hour $hour: file closed, key ${key [$hour]}\n" ; }
 }
 &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
 }

 ($lang,$title) = split (' ', $key_low) ;

 # normalize the most common escapes, then guess the namespace:
 # anything before the first colon (not in position 0) counts as namespace
 $title =~ s/\%20/_/g ;
 $title =~ s/\%3A/:/gi ;
 if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
 { $namespace = 'NamespaceArticles' ; }
 else
 { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }

 # language changed: flush namespace totals for the previous language
 # and decide whether the new language goes to the filter file
 # NOTE(review): $filter_matches is first set here, so records of the very
 # first language in the stream are never filtered -- confirm intended
 if (($lang ne $langprev) && ($langprev ne ""))
 {
 $filter_matches = $lang =~ $reg_exp_filter ;
 if ($filter_matches)
 { print "F $lang\n" ; }

 &WriteTotalsPerNamespace ($out_gz2, $langprev) ;
 undef %totals_per_namespace ;
 }
 $langprev = $lang ;

 # extrapolate the daily total when hours are missing
 if (($files_in_found < 24) && ($files_in_found > 0)) # always > 0 actually
 { $total = sprintf ("%.0f",($total / $files_in_found) * 24) ; }

 $totals_per_namespace {"$lang $namespace"} += $total ;

 if ($filter_matches)
 { print $out_filtered "$key_low $total$counts\n" ; }

 if ($key_low =~ $reg_exp_track) # track count for NonExistingPageForSquidLogMonitoring on en.z
 {
 open $out_track, '>>', $file_track ;
 binmode $out_track ;
 print $out_track "$key_low $total$counts\n" ;
 close $out_track ;
 }

 if ($total >= $threshold)
 { print $out_gz2 "$key_low $total$counts\n" ;
 }

 $key_low_prev = $key_low ;
 }

 # flush namespace totals for the last language
 &WriteTotalsPerNamespace ($out_gz2, $langprev) ;

 &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;

 &Log ("[$lines, $files_in_open] $key_low\n") ;

 if ($bayes)
 {
 close $out_gz2 ;
 close $out_filtered ;

 $time_start_compression = time ; # fix: was never set, elapsed time was bogus
 $cmd = "bzip2 -9 -v $file_out2" ;
 &Log ("\n\n$cmd ->\n") ;
 $result = `$cmd` ;
 &Log ("\n\n") ;

 # deletion of the merged hourly input files is currently disabled
 if ($false)
 {
 foreach $file_in (@files_today)
 {
 print "unlink $dir_in/$file_in\n" ;
 unlink "$dir_in/$file_in" ;
 }
 }

 &Log ("Compression took " . (time-$time_start_compression) . " seconds\n\n") ;
 }
 else
 {
 $out_gz2->close() ;
 close $out_filtered ;
 }

 &Log ("\nRecords skipped for invalid languages:\n") ;
 foreach $key (sort keys %invalid_languages)
 { &Log ("$key: ${invalid_languages {$key}}\n") ; }

 &Log ("\nTotals per namespace written: $lines_namespace_counts\n") ;
 &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
}
 568+
sub WriteTotalsPerNamespace
{
 # Emit the accumulated per-namespace totals (global %totals_per_namespace)
 # to the given output handle. Namespaces with fewer than 5 hits are pooled
 # into a single '-other-' bucket for the given language. Each emitted line
 # bumps the global counter $lines_namespace_counts.
 my ($handle, $language) = @_ ;
 my $small_bucket = 0 ;

 for my $ns_key (sort keys %totals_per_namespace)
 {
 my $hits = $totals_per_namespace{$ns_key} ;
 if ($hits >= 5)
 {
 print $handle "@ $ns_key $hits\n" ;
 $lines_namespace_counts++ ;
 }
 else
 { $small_bucket += $hits ; }
 }

 if ($small_bucket > 0)
 {
 print $handle "@ $language -other- $small_bucket\n" ;
 $lines_namespace_counts++ ;
 }
}
 595+
sub Log
{
 # Write a message to both the console and the LOG file.
 # Fix: $msg is now lexical; the original assigned the package global $msg,
 # silently clobbering the caller's value.
 my $msg = shift ;
 print $msg ;
 print LOG $msg ;
}
 602+
sub Abort
{
 # Announce the fatal error on the console and in the log file, then quit.
 my $reason = shift ;
 my $text = "Abort script\nError: $reason\n" ;
 print $text ;
 print LOG $text ;
 exit ;
}
 610+
 611+#=============================================================================================================
 612+
 613+#sub Compact
 614+#{
 615+# my $day = shift ;
 616+# &Log ("Compact files for $day\n") ;
 617+
 618+# $file_in = "pagecounts-$day.out" ;
 619+# $file_out1 = "pagecounts-${day}_all.gz" ;
 620+# $file_out2 = "pagecounts-${day}_10plus.gz" ;
 621+# open IN, "<", $file_in ;
 622+# binmode $file_in ;
 623+
 624+# my $out_gz1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 625+# my $out_gz2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 626+
 627+# open OUT, ">", $file_out ;
 628+# binmode $file_out ;
 629+
 630+# $lang_prev = "" ;
 631+# while ($line = <IN>)
 632+# {
 633+# chomp ($line) ;
 634+# ($lang, $title, $counts) = split (' ', $line) ;
 635+# $title2 = $title ;
 636+# $title =~ s/\%20/_/g ;
 637+# $title =~ s/\%3A/:/g ;
 638+# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 639+# # if ($title =~ /[\x00-\x1F]/)
 640+# # { &Log ("> '$title2'\n") ; }
 641+# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
 642+# print $out_gz1 "$lang $title $counts\n" ;
 643+# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
 644+# if ($counts2 >= $threshold)
 645+# { print $out_gz2 "$lang $title $counts\n" ; }
 646+# $lang_prev = $lang ;
 647+# }
 648+#
 649+# close IN ;
 650+# $out_gz1->close() ;
 651+# $out_gz2->close() ;
 652+#}
 653+
 654+
 655+#sub GetViewDistribution
 656+#{
 657+# open OUT, ">", "Views.csv" ;
 658+# foreach $file_in (@files)
 659+# {
 660+# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 661+# $hour = chr(ord('A')+$hour) ;
 662+# &Log ("Process $hour $file_in\n") ;
 663+
 664+# $in_gz1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
 665+# while ($line = <$in_gz1>)
 666+# {
 667+# ($lang,$title,$count,$dummy) = split (' ', $line) ;
 668+# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
 669+# {
 670+# $tot {$hour} += $count ;
 671+# if ($count < 3)
 672+# { @counts {$hour . substr ($title,0,1)}++ ; }
 673+# }
 674+# }
 675+# $in_gz1->close () ;
 676+# }
 677+#
 678+# print OUT "," ;
 679+# foreach $hour ('A'..'X')
 680+# { print OUT $hour . ", " ; }
 681+# print OUT "\n" ;
 682+#
 683+# print OUT "," ;
 684+# foreach $hour ('A'..'X')
 685+# { print OUT $tot {$hour} . ", " ; }
 686+# print OUT "\n" ;
 687+#
 688+# for ($c=0; $c < 256; $c++)
 689+# {
 690+# # do not print chars " and , as such: confuses csv format
 691+# if ($c < 33)
 692+# { print OUT "chr($c), " ; }
 693+# elsif (chr($c) eq '"')
 694+# { print OUT "dquote, " ; }
 695+# elsif (chr($c) eq ',')
 696+# { print OUT "comma, " ; }
 697+# else
 698+# { print OUT chr($c) . ", " ; }
 699+#
 700+# foreach $hour ('A'..'X')
 701+# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
 702+#
 703+# if ($c < 255)
 704+# { print OUT "\n" ; }
 705+# }
 706+# close OUT ;
 707+#}
 708+
 709+
 710+#sub RecompactVisitorStats
 711+#{
 712+# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
 713+# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 714+# local (*DIR);
 715+# opendir (DIR, ".");
 716+# @files = () ;
 717+# while ($file_in = readdir (DIR))
 718+# {
 719+# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
 720+#
 721+# push @files, $file_in ;
 722+# }
 723+
 724+# $filecnt = $#files+1 ;
 725+# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
 726+
 727+# foreach $file (@files)
 728+# { &RecompactVisitorStats2 ($file) ; }
 729+# closedir (DIR, ".");
 730+#}
 731+
 732+#sub RecompactVisitorStats2
 733+#{
 734+## http://www.7-zip.org/7z.html
 735+# my $file = shift ;
 736+# my $time_start = time ;
 737+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 738+## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 739+# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
 740+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 741+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 742+
 743+# &Log ("Process $file_in\n") ;
 744+
 745+# $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 746+# binmode $in_gz ;
 747+# open OUT, ">", $file_out ;
 748+# binmode OUT ;
 749+
 750+# my ($title, $title2) ;
 751+# while ($line = <$in_gz>)
 752+# {
 753+# chomp ($line) ;
 754+# ($lang,$title,$counts) = split (" ", $line) ;
 755+
 756+# if ($lang ne $lang_prev) { print "$lang " ; }
 757+# $lang_prev = $lang ;
 758+
 759+# # test pagecounts-20080701_fd.gz
 760+# # all records 424 Mib compressed (1984 uncompressed)
 761+# # count > 1 212 Mib compressed ( 733 uncompressed)
 762+# # count > 2 169 Mib compressed ( 551 uncompressed)
 763+# next if $counts <= 1 ;
 764+
 765+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 766+# $title =~ s/\s/_/g;
 767+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 768+# $lang =~ s/\.y/2/ ;
 769+
 770+# print OUT "$lang $title $counts\n" ;
 771+# }
 772+
 773+# print "Close files\n" ;
 774+# $in_gz -> close () ;
 775+# close (OUT) ;
 776+
 777+# &Log ("Compress $file_out\n") ;
 778+
 779+# unlink $file_7z ;
 780+# $result = `$path_7z a $file_7z $file_out` ;
 781+# &Log ("Compressed\n") ;
 782+# &Log ("Result " . ($result+0) . " \n") ;
 783+# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
 784+# { unlink $file_out ; }
 785+
 786+# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 787+## 0 No error
 788+## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
 789+## 2 Fatal error
 790+## 7 Command line error
 791+## 8 Not enough memory for operation
 792+## 255 User stopped the process
 793+#}
 794+
 795+
 796+#sub RecompactVisitorStats3
 797+#{
 798+## http://www.7-zip.org/7z.html
 799+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 800+# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 801+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 802+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 803+## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
 804+
 805+# $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 806+# binmode $in_gz ;
 807+## $out_gz = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 808+## binmode $out_gz ;
 809+# open OUT, ">", $file_out ;
 810+# binmode OUT ;
 811+## open LOG, ">", $file_log ;
 812+## binmode LOG ;
 813+
 814+# my ($title, $title2) ;
 815+# while ($line = <$in_gz>)
 816+# {
 817+# chomp ($line) ;
 818+# ($lang,$title,$counts) = split (" ", $line) ;
 819+
 820+# if ($lang ne $lang_prev) { print "$lang\n" ; }
 821+## last if $lang gt "fs" ;
 822+# $lang_prev = $lang ;
 823+
 824+# # test pagecounts-20080701_fd.gz
 825+# # all records 424 Mib compressed (1984 uncompressed)
 826+# # count > 1 212 Mib compressed ( 733 uncompressed)
 827+# # count > 2 169 Mib compressed ( 551 uncompressed)
 828+# next if $counts <= 1 ;
 829+
 830+## next if $lang !~ /^(?:ar|fr)/ ;
 831+
 832+#if ($false)
 833+#{
 834+# $title1b = $title ;
 835+# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
 836+# $title1b =~ s/\%28/(/g ;
 837+# $title1b =~ s/\%29/)/g ;
 838+# $title1b =~ s/\%3A/:/g ;
 839+# $title1b =~ s/\%2F/\//g ;
 840+# $title1b =~ s/\%5C/\\/g ;
 841+# $title1b =~ s/\%2A/*/g ;
 842+# $title1b =~ s/\%21/!/g ;
 843+# $title1b =~ s/\%5F/_/g ;
 844+# $title1b =~ s/\%2C/,/g ;
 845+# $title1b =~ s/\%2E/./g ;
 846+# $title1b =~ s/\%2D/-/g ;
 847+# $title1b =~ s/\%25/%/g ;
 848+# $title1b =~ s/\%7E/~/g ;
 849+# $title1b =~ s/\%27/'/g ;
 850+# $title1b =~ s/\%3D/=/g ;
 851+# $title1b =~ s/\%26/&/g ;
 852+# $title1b =~ s/\%3B/;/g ;
 853+# $title1b =~ s/\%3F/?/g ;
 854+# $title1b =~ s/\%2B/+/g ;
 855+# $title2 = $title1b ;
 856+# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
 857+
 858+# if ($title1b ne $title2) # if changed anything at all
 859+# {
 860+# $title3 = uri_escape ($title2) ;
 861+# $title3 =~ s/\%28/(/g ;
 862+# $title3 =~ s/\%29/)/g ;
 863+# $title3 =~ s/\%3A/:/g ;
 864+# $title3 =~ s/\%2F/\//g ;
 865+# $title3 =~ s/\%5C/\\/g ;
 866+# $title3 =~ s/\%2A/\*/g ;
 867+# $title3 =~ s/\%21/\!/g ;
 868+# $title3 =~ s/\%5F/\_/g ;
 869+# $title3 =~ s/\%2C/,/g ;
 870+# $title3 =~ s/\%2E/./g ;
 871+# $title3 =~ s/\%2D/-/g ;
 872+# $title3 =~ s/\%25/%/g ;
 873+# $title3 =~ s/\%7E/~/g ;
 874+# $title3 =~ s/\%27/'/g ;
 875+# $title3 =~ s/\%3D/=/g ;
 876+# $title3 =~ s/\%26/&/g ;
 877+# $title3 =~ s/\%3B/;/g ;
 878+# $title3 =~ s/\%3F/?/g ;
 879+# $title3 =~ s/\%2B/+/g ;
 880+
 881+# if ($title1b eq $title3) # process reversible ?
 882+# {
 883+# $y++ ;
 884+# $title2 =~ s/\s/_/g;
 885+# $title = $title2 ;
 886+# }
 887+# else
 888+# {
 889+# $n++ ;
 890+# print "Y $y N $n\n$title\n$title3\n\n" ;
 891+# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
 892+# }
 893+# }
 894+#}
 895+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 896+# $title =~ s/\s/_/g;
 897+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 898+# $lang =~ s/\.y/2/ ;
 899+
 900+# # print $out_gz "$lang $title $counts\n" ;
 901+# print OUT "$lang $title $counts\n" ;
 902+# }
 903+
 904+# print "Close files\n" ;
 905+# $in_gz -> close () ;
 906+## $out_gz -> close () ;
 907+# close (OUT) ;
 908+# $result = `$path_7z a $file_out $file_txt` ;
 909+# print $result ;
 910+#}
 911+
 912+
 913+
 914+# test (partial) reversibility of process
 915+#sub UncompactVisitorStats
 916+#{
 917+# my $file_in = "out/2009-03/pagecounts-20090301_fdt1" ;
 918+# my $dir_out = "out" ;
 919+# # $in_gz = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 920+# open $in_gz, '<', $file_in ;
 921+# binmode $in_gz ;
 922+
 923+# for ($h=0 ; $h<=23 ; $h++)
 924+# {
 925+# $time = sprintf ("%02d",$h) . "0000" ;
 926+## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
 927+# $file_out = "$dir_out/pagecounts-20090301-$time" ;
 928+# open $out_gz [$h], '>', $file_out ;
 929+## $out_gz [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
 930+# binmode $out_gz [$h] ;
 931+# }
 932+
 933+# while ($line = <$in_gz>)
 934+# {
 935+# next if $line =~ /^#/ ;
 936+# next if $line =~ /^@/ ;
 937+# chomp ($line) ;
 938+## print "$line\n" ;
 939+# if ($lines++ > 10000) { exit ; }
 940+# ($lang,$title,$counts) = split (" ", $line) ;
 941+# $lang =~ s/\.z// ;
 942+# $lang =~ s/\.y/2/ ;
 943+# $counts =~ s/^\d+// ; # remove (redundant) preceding total
 944+# while ($counts ne "")
 945+# {
 946+# $letter = substr ($counts,0,1) ;
 947+# $counts = substr ($counts,1) ;
 948+# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 949+# $counts =~ s/^\d+(.*)$/$1/ ;
 950+# $h = ord ($letter) - ord ('A') ;
 951+# $file = $out_gz [$h] ;
 952+# $writes {$h} ++ ;
 953+# print $file "$lang $title $count\n" ;
 954+# }
 955+
 956+# }
 957+
 958+# for ($h=0 ; $h<=23 ; $h++)
 959+# {
 960+## $out_gz [$h] -> close () ;
 961+# close $out_gz [$h] ;
 962+# }
 963+#}
 964+
 965+
Index: trunk/wikistats/dammit.lt/dammit_scan.sh
@@ -0,0 +1,10 @@
 2+i='/a/dammit.lt/pagecounts' # input dir
 3+o='/home/ezachte/wikistats/scans' # output dir
 4+f=20090424 # from date
 5+t=20091110 # till date
 6+#="swine.*flu\nswine.*influenza\nflu.*outbreak\ninfluenza.*outbreak\ngripe.*porcina\npandem\n"
 7+p=".*influensa\n.*H1N1.*\npandemi\n"
 8+#p="#$o/pattern_influenza_en.txt" # file name
 9+#p="#$o/pattern_pandemic_shortlist.txt" # file name
 10+#p=html
 11+perl /a/dammit.lt/DammitScanCompactedFiles.pl -i $i -o $o -f $f -t $t -p $p
Property changes on: trunk/wikistats/dammit.lt/dammit_scan.sh
___________________________________________________________________
Added: svn:eol-style
112 + native
Index: trunk/wikistats/dammit.lt/dammit_filter.sh
@@ -0,0 +1,5 @@
 2+#i='/a/dammit.lt/pagecounts' # input dir
 3+#o='/home/ezachte/wikistats/scans' # output dir
 4+#f=20090424 # from date
 5+#t=20091110 # till date
 6+perl /a/dammit.lt/DammitFilterDailyPageCountsPerLanguage.pl
Property changes on: trunk/wikistats/dammit.lt/dammit_filter.sh
___________________________________________________________________
Added: svn:eol-style
17 + native
Index: trunk/wikistats/dammit.lt/DammitCompactHourlyOrDailyPageCountFiles.pl
@@ -0,0 +1,1568 @@
 2+#!/usr/local/bin/perl
 3+
 4+# 4/27/2010 renamed from WikiStatsCompactDammitFiles.pl
 5+
 6+# http://article.gmane.org/gmane.science.linguistics.wikipedia.technical/38154/match=new+statistics+stuff
 7+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/webstatscollector/
 8+# https://bugzilla.wikimedia.org/show_bug.cgi?id=13541
 9+# http://de.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases
 10+
 11+# Ideas:
 12+# 1 namespace string -> namespace number ? (may not save much space: compress will deal with recurring patterns like these)
 13+# 2 frequency distribution hits per file per first letter -> manifest crawler
 14+# assuming crawler collects articles in alphabetical order
 15+# 3 first letter uppercase -> sort (in sections per first two chars ?)
 16+
 17+ use lib "/home/ezachte/lib" ;
 18+ use EzLib ;
 19+
 20+ $trace_on_exit = $true ;
 21+ ez_lib_version (13) ;
 22+
 23+ # set defaults mainly for tests on local machine
 24+# default_argv "-i C:/bayes_backup/a/dammit.lt/pagecounts|-t C:/bayes_backup/a/dammit.lt|-f C:/bayes_backup/a/dammit.lt|-o C:/bayes_backup/a/dammit.lt|-d 20101215" ;
 25+ default_argv "-m|-i C:/bayes_backup/a/dammit.lt/pagecounts|-o C:/bayes_backup/a/dammit.lt|-d 200812" ;
 26+
 27+ use CGI qw(:all);
 28+ use URI::Escape;
 29+ use Cwd ;
 30+ $bayes = -d "/a/dammit.lt" ;
 31+# $path_7za = "/usr/lib/p7zip/7za" ;
 32+
 33+ use IO::Compress::Bzip2 qw(bzip2 $Bzip2Error) ;
 34+
 35+ if (! $bayes)
 36+ {
 37+ print "Test on Windows\n" ;
 38+ use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; # install IO-Compress-Zlib
 39+ use IO::Compress::Gzip qw(gzip $GzipError) ; # install IO-Compress-Zlib
 40+ }
 41+
 42+ $| = 1; # flush screen output
 43+
 44+ $true = 1 ;
 45+ $false = 0 ;
 46+ $threshold = 0 ;
 47+ undef %totals_per_namespace ;
 48+
 49+ ($sec,$min,$hour,$mday,$month,$year,$wday,$yday,$isdst) = gmtime(time);
 50+ $year = $year + 1900;
 51+ $month++ ;
 52+ $month_run = sprintf ("%4d-%2d", $year, $month) ;
 53+ print "Current month: $month_run\n" ;
 54+
 55+ $filter = "^(?:outreach|quality|strategy|usability)\.m\$" ;
 56+ print "Filter: $filter\n" ;
 57+ $reg_exp_filter = qr"$filter" ;
 58+
 59+ $track = "NonExistingPageForSquidLogMonitoring" ;
 60+ print "Track: $track\n" ;
 61+ $reg_exp_track = qr"$track" ;
 62+
 63+# -i "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/in" -o "D:/\@Wikimedia/!Perl/#Projects/Visitors Compact Log Files/out"
 64+
 65+ my $options ;
 66+ getopt ("iodft", \%options) ;
 67+
 68+ $compactmonth = $options {"m"} ;
 69+ $compactday = ! $compactmonth ;
 70+
 71+ if (! defined ($options {"i"})) { &Abort ("Specify input dir: -i dirname") } ;
 72+ if ($compactday)
 73+ {
 74+ if (! defined ($options {"o"})) { &Abort ("Specify output dir: -o dirname") } ;
 75+ if (! defined ($options {"f"})) { &Abort ("Specify filter dir: -f dirname") } ;
 76+ if (! defined ($options {"t"})) { &Abort ("Specify tracking dir: -t dirname") } ;
 77+ if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") } ;
 78+ }
 79+ if ($compactmonth)
 80+ {
 81+ if (! defined ($options {"d"})) { &Abort ("Specify date range: as yyyymm, yyyy* or *") } ;
 82+ }
 83+
 84+
 85+ $dir_in = $options {"i"} ;
 86+ $dir_out = $options {"o"} ;
 87+ $dir_filtered = $options {"f"} ;
 88+ $dir_track = $options {"t"} ;
 89+ $daterange = $options {"d"} ;
 90+
 91+ $work = cwd() ;
 92+ print "Work dir $work\n" ;
 93+
 94+ if ($dir_in !~ /[\/\\]/)
 95+ { $dir_in = "$work/$dir_in" ; }
 96+
 97+ if ($dir_out eq '')
 98+ { $dir_out = "$work" ; }
 99+ elsif ($dir_out !~ /[\/\\]/)
 100+ { $dir_out = "$work/$dir_out" ; }
 101+
 102+ if ($compactmonth && ($dir_out eq ''))
 103+ { $dir_out = $dir_in ; }
 104+
 105+ if ($dir_filtered !~ /[\/\\]/)
 106+ { $dir_filtered = "$work/$dir_filtered" ; }
 107+
 108+ if ($dir_track !~ /[\/\\]/)
 109+ { $dir_track = "$work/$dir_track" ; }
 110+
 111+ if (! -d $dir_in)
 112+ { &Abort ("Input dir not found: $dir_in") } ;
 113+
 114+ if (! -d $dir_out)
 115+ {
 116+ print "Create output dir $dir_out\n" ;
 117+ mkdir $dir_out ;
 118+ if (! -d $dir_out)
 119+ { &Abort ("Output dir could not be created.") } ;
 120+ }
 121+
 122+ open LOG, ">>", "$work/WikiStatsCompactDammitFiles.log" ;
 123+
 124+ if ($compactday)
 125+ {
 126+ if (($daterange !~ /^\d{8}$/) && ($daterange !~ /^\d{6}\*$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
 127+ { &Abort ("Specify date range: as yyyymmdd, yyyymm*, yyyy* or *") ; }
 128+
 129+ &Log ("\nCompress pagecount files\nin: $dir_in\nout: $dir_out\nflt: $dir_filtered\ntrack: $dir_track\ndate range: $daterange\n") ;
 130+ $daterange =~ s/\*/\\d+/ ;
 131+
 132+ &CompactVisitorStatsOneDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $daterange) ;
 133+ # &UncompactVisitorStats ; # test only, to see if process is revertible
 134+ }
 135+
 136+ if ($compactmonth)
 137+ {
 138+ if (($daterange !~ /^\d{6}$/) && ($daterange !~ /^\d{4}\*$/) && ($daterange !~ /^\*$/))
 139+ { &Abort ("Specify date range: as yyyymm, yyyy* or *") ; }
 140+
 141+ ($daterange2 = $daterange) =~ s/\*/\\d+/ ;
 142+ &Log ("\nCompress pagecount files\nin: $dir_in\nout: $dir_out\ndate range: $daterange->$daterange2\n") ;
 143+
 144+ &CompactVisitorStatsOneMonth ($dir_in, $dir_out, $daterange2) ;
 145+ }
 146+
 147+ &Log ("\nReady\n") ;
 148+ close LOG ;
 149+ exit ;
 150+
 151+sub CompactVisitorStatsOneDay
 152+{
 153+ my $dir_in = shift ;
 154+ my $dir_out = shift ;
 155+ my $dir_filtered = shift ;
 156+ my $dir_track = shift ;
 157+ my $daterange = shift ;
 158+
 159+ chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 160+
 161+ local (*DIR);
 162+ opendir (DIR, ".");
 163+ @files = () ;
 164+
 165+ while ($file_in = readdir (DIR))
 166+ {
 167+ next if $file_in !~ /^pagecounts-$daterange-\d{6,6}.gz$/ ;
 168+
 169+ push @files, $file_in ;
 170+ }
 171+
 172+ closedir (DIR);
 173+
 174+ @files = sort @files ;
 175+
 176+# if (($daterange =~ /^\d{8}$/) and ($#files < 23))
 177+# { &Abort ("Less than 24 files found for date $daterange\n" . @files) ; }
 178+
 179+ foreach $file (@files)
 180+ {
 181+ $date = substr ($file,11,8) ;
 182+ $process_dates {$date}++ ;
 183+ }
 184+
 185+ &Log ("\n\n") ;
 186+
 187+ foreach $date (sort keys %process_dates)
 188+ { &MergeFilesFullDay ($dir_in, $dir_out, $dir_filtered, $dir_track, $date) ; }
 189+}
 190+
 191+sub MergeFilesFullDay
 192+{
 193+ my $dir_in = shift ;
 194+ my $dir_out = shift ;
 195+ my $dir_filtered = shift ;
 196+ my $dir_track = shift ;
 197+ my $date = shift ;
 198+
 199+ my $year = substr ($date,0,4) ;
 200+ my $month = substr ($date,4,2) ;
 201+ my $day = substr ($date,6,2) ;
 202+
 203+ my ($file_out1, $file_out2, $file_out3, $out_day, $hours_missing) ;
 204+
 205+ $dir_out = "$dir_out/${year}-${month}" ;
 206+ if (! -d $dir_out)
 207+ {
 208+ mkdir $dir_out ;
 209+ if (! -d $dir_out)
 210+ { &Abort ("Output dir could not be created: $dir_out") } ;
 211+ }
 212+
 213+ my @files_today = () ;
 214+ foreach $file (@files)
 215+ {
 216+ next if $file !~ /^pagecounts-$date-\d{6,6}.gz$/ ;
 217+
 218+ push @files_today, $file ;
 219+ }
 220+
 221+ # very few times (nearly) duplicate files are found for same hour
 222+ # keep the largest and presumably most complete one
 223+ for ($i = 0 ; $i < $#files_today ; $i++)
 224+ {
 225+ for ($j = $i+1 ; $j <= $#files_today ; $j++)
 226+ {
 227+ if (substr ($files_today [$i],0,25) eq substr ($files_today [$j],0,25))
 228+ {
 229+ $size_i = -s $files_today [$i] ;
 230+ $size_j = -s $files_today [$j] ;
 231+ print "${files_today [$i]}: $size_i\n" ;
 232+ print "${files_today [$j]}: $size_j\n" ;
 233+ if ($size_i > $size_j)
 234+ {
 235+ print "Keep ${files_today [$i]}\n\n" ;
 236+ $files_today [$j]= "" ;
 237+ }
 238+ else
 239+ {
 240+ print "Keep ${files_today [$j]}\n\n" ;
 241+ $files_today [$i]= "" ;
 242+ }
 243+ }
 244+ }
 245+ }
 246+
 247+ $time_start = time ;
 248+ $lines = 0 ;
 249+
 250+ undef @in_hour ;
 251+
 252+ # $file_out = "pagecounts-$year$month$day_full_day" ;
 253+ # open OUT, ">", $file_out ;
 254+ # binmode $file_out ;
 255+
 256+# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 257+ if ($bayes)
 258+ {
 259+ # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd" ; # full day
 260+ $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h" ; # full day, hourly data
 261+ # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d" ; # full day, compact, just daily totals
 262+ if ((-e "$file_out2.7z") || (-e "$file_out2.bz2") || (-e "$file_out2.zip") || (-e "$file_out2.gz"))
 263+ {
 264+ &Log ("\nTarget file '$file_out2.[7z|bz2|zip|gz]' exists already. Skip this date.\n") ;
 265+ return ;
 266+ }
 267+ if ($#files_today < 23)
 268+ {
 269+ &Log ("\nLess than 24 files found for target file '$file_out2.7z'. Skip this date.\n") ;
 270+ return ;
 271+ }
 272+
 273+ open $out_day2, ">", "$file_out2" || &Abort ("Output file '$file_out2' could not be opened.") ;
 274+ # open $out_day3, ">", "$file_out3" || &Abort ("Output file '$file_out3' could not be opened.") ;
 275+ }
 276+ else
 277+ {
 278+ # $file_out1 = "$dir_out/pagecounts-$year$month$day" . "_fd.gz" ; # full day
 279+ $file_out2 = "$dir_out/pagecounts-$year$month$day" . "_h.gz" ; # full day, hourly data, count above threshold
 280+ $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 281+ # $file_out3 = "$dir_out/pagecounts-$year$month$day" . "_d.gz" ; # full day, count above threshold
 282+ # $out_day3 = IO::Compress::Gzip->new ($file_out3) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 283+ }
 284+
 285+# binmode $out_day1 ;
 286+ binmode $out_day2 ;
 287+# binmode $out_day3 ;
 288+
 289+ # print "File_out1 $file_out1\n" ;
 290+ print "File_out2 $file_out2\n" ;
 291+ # print "File_out3 $file_out3\n" ;
 292+
 293+ $file_filtered = "$dir_filtered/pagecounts-$year$month$day.txt" ;
 294+ &Log ("\nFilter file: $file_filtered\n") ;
 295+ open $out_filtered, '>', $file_filtered ;
 296+ binmode $out_filtered ;
 297+
 298+ $file_track = "$dir_track/_PageCountsForSquidLogTracking.txt" ;
 299+ &Log ("Tracking file: $file_track\n\n") ;
 300+
 301+ for ($hour = 0 ; $hour < 24 ; $hour++)
 302+ { $file_in_found [$hour] = $false ; }
 303+
 304+ $files_in_open = 0 ;
 305+ $files_in_found = 0 ;
 306+ $langprev = "" ;
 307+ foreach $file_in (@files_today)
 308+ {
 309+ next if $file_in eq "" ;
 310+
 311+ ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 312+ $hour = (0+$hour) ;
 313+ # print " file found '$file_in'\n" ;
 314+
 315+ if ($bayes)
 316+ { open $in_hour [$hour], "-|", "gzip -dc \"$file_in\"" || &Abort ("Input file '" . $file_in . "' could not be opened.") ; }
 317+ else
 318+ { $in_hour [$hour] = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ; }
 319+ binmode $in_hour [$hour] ;
 320+
 321+ $files_in_open++ ;
 322+ $file_in_found [$hour] = $true ;
 323+ $file_in_open [$hour] = $true ;
 324+ $files_in_found ++ ;
 325+ $file = $in_hour [$hour] ;
 326+ $line = <$file> ;
 327+ $line =~ s/^(\w+)2 /$1.y /o ;# project wikipedia comes without suffix -> out of sort order, make it fit by appending suffix
 328+ $line =~ s/^(\w+) /$1.z /o ;
 329+
 330+ ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 331+ $key [$hour] = "$lang $title" ;
 332+ }
 333+
 334+ $comment = "# Wikimedia page request counts for $date, each line shows 'subproject title counts'\n" ;
 335+ if ($threshold > 0 )
 336+ { $comment .= "# Count for articles with less than $threshold requests per full day are omitted\n" ; }
 337+ $comment .= "# Subproject is language code, followed by project code\n" ;
 338+ $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia (z added by compression script: wikipedia happens to be sorted last in dammit.lt files)\n" ;
 339+ $comment .= "# Counts format is total per day, followed by count per hour if larger than zero, hour 0..23 shown as A..X (saves up to 22 bytes per line compared to comma separated values)\n" ;
 340+ $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n\n" ;
 341+ print $out_day2 $comment ;
 342+# print $out_day3 $comment ;
 343+
 344+ if ($files_in_found < 24)
 345+ {
 346+ for ($hour = 0 ; $hour < 24 ; $hour++)
 347+ {
 348+ if (! $file_in_found [$hour])
 349+ { $hours_missing .= "$hour," ; }
 350+ }
 351+ $hours_missing =~ s/,$// ;
 352+ &Log ("Merge files: date = $date, only $files_in_found files found!\n\n") ;
 353+ }
 354+ else
 355+ { &Log ("Merge files: date = $date\n") ; }
 356+
 357+ if ($hours_missing ne '')
 358+ {
 359+ print $out_day2 "#\n" ;
 360+ print $out_day2 "# In this file data are missing for hour(s) $hours_missing!\n" ;
 361+ # print $out_day3 "#\n" ;
 362+ # print $out_day3 "# In this file data are missing for hour(s) $hours_missing!\n" ;
 363+ }
 364+ $comment = "#\n" ;
 365+ $comment .= "# Lines starting with ampersand (@) show totals per 'namespace' (including omitted counts for low traffic articles)\n" ;
 366+ $comment .= "# Since valid namespace string are not known in the compression script any string followed by colon (:) counts as possible namespace string\n" ;
 367+ $comment .= "# Please reconcile with real namespace name strings later\n" ;
 368+ $comment .= "# 'namespaces' with count < 5 are combined in 'Other' (on larger wikis these are surely false positives)\n" ;
 369+ $comment .= "#\n" ;
 370+ $comment .= "# Page titles are shown unmodified (preserves sort sequence)\n" ;
 371+ $comment .= "#\n" ;
 372+ print $out_day2 $comment ;
 373+# print $out_day3 $comment ;
 374+
 375+ $key_low_prev = "" ;
 376+ while ($files_in_open > 0)
 377+ {
 378+ $key_low = "\xFF\xFF";
 379+ for ($hour = 0 ; $hour < 24 ; $hour++)
 380+ {
 381+ if (($files_in_open == 24) || ($file_in_found [$hour] && $file_in_open [$hour]))
 382+ {
 383+ if ($key [$hour] lt $key_low)
 384+ { $key_low = $key [$hour] ; }
 385+ }
 386+ }
 387+
 388+ if (($key_low =~ /^nov/) || ($key_low_prev =~ /^nov/))
 389+ { &Log ("key_low '$key_low' (key_low_prev '$key_low_prev')\n") ; }
 390+
 391+ $counts = "" ;
 392+ $total = 0 ;
 393+ for ($hour = 0 ; $hour < 24 ; $hour++)
 394+ {
 395+ if (! $file_in_found [$hour])
 396+ { $counts .= chr ($hour+ord('A')) . '?' ; }
 397+ elsif (($files_in_open == 24) || $file_in_open [$hour])
 398+ {
 399+ if ($key [$hour] eq $key_low)
 400+ {
 401+ $counts .= chr ($hour+ord('A')) . $count [$hour] ;
 402+ $total += $count [$hour] ;
 403+ $file = $in_hour [$hour] ;
 404+ # $line = <$file> ;
 405+
 406+ while ($true)
 407+ {
 408+ if ($line = <$file>) # =~ /^a/)
 409+ {
 410+ $line =~ s/^([\w\-]+)2 /$1.y /o ; # project wikipedia comes without suffix -> out of sort order, make it fit by appending suffix
 411+ $line =~ s/^([\w\-]+) /$1.z /o ;
 412+ ($lang,$title,$count [$hour],$dummy) = split (' ', $line) ;
 413+ $key [$hour] = "$lang $title" ;
 414+
 415+ last if $lang !~ /\d/ ;
 416+ }
 417+ else
 418+ {
 419+ if ($bayes)
 420+ { close $in_hour [$hour] ; }
 421+ else
 422+ { $in_hour [$hour] -> close () ; }
 423+ $files_in_open-- ;
 424+ $file_in_open [$hour] = $false ;
 425+ $key [$hour] = "\xFF\xFF";
 426+
 427+ last ;
 428+ }
 429+ }
 430+ }
 431+ }
 432+ }
 433+ if ($lines == 0)
 434+ { &Log ("\nlines: project key\n") ; }
 435+
 436+ if (++$lines % 100000 == 0)
 437+ { &Log ("$lines: $key_low\n") ; }
 438+
 439+ # last if $lines > 10000 ; # test
 440+
 441+ last if $key_low eq "\xFF\xFF" ;
 442+
 443+ # Q&D fix for unexplained out of order error for what seems to be invalid language
 444+ # remember : no suffix on language code gets replaced by .y or .z to fixed sort order
 445+ # ^nov.mw nov1 1 8765
 446+ # ^nov1.mw nov1 1 931 <--------------
 447+ # ^nov 10_dw_oktobre 1 11421
 448+ ($lang,$title) = split (' ', $key_low) ;
 449+ if ($lang =~ /\d/)
 450+ {
 451+ $invalid_languages {$lang}++ ;
 452+ &Log ("\nSkip invalid language '$lang'\n") ;
 453+ next ;
 454+ }
 455+
 456+
 457+ if ($key_low_prev gt $key_low)
 458+ {
 459+ for ($hour = 0 ; $hour < 24 ; $hour++)
 460+ { &Log ("hour $hour: key ${key[$hour]}\n") ; }
 461+
 462+ &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
 463+ }
 464+
 465+ if (($key_low_prev eq $key_low) && ($files_in_open > 0))
 466+ {
 467+ for ($hour = 0 ; $hour < 24 ; $hour++)
 468+ {
 469+ if ($file_in_open [$hour])
 470+ { print "hour $hour: file open, key ${key [$hour]}\n" ; }
 471+ else
 472+ { print "hour $hour: file closed, key ${key [$hour]}\n" ; }
 473+ }
 474+ &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
 475+ }
 476+
 477+ # print OUT "$key_low $total$counts\n" ;
 478+# print $out_day1 "$key_low $total$counts\n" ;
 479+
 480+ ($lang,$title) = split (' ', $key_low) ;
 481+
 482+ $title =~ s/\%20/_/g ;
 483+ $title =~ s/\%3A/:/gi ;
 484+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 485+ if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
 486+ { $namespace = 'NamespaceArticles' ; }
 487+ else
 488+ { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
 489+ # print "KEY $key_low -> $namespace\n" ;
 490+
 491+ if (($lang ne $langprev) && ($langprev ne ""))
 492+ {
 493+ $filter_matches = $lang =~ $reg_exp_filter ;
 494+ if ($filter_matches)
 495+ { print "F $lang\n" ; }
 496+ # else
 497+ # { print "- $lang\n" ; }
 498+
 499+ &WriteTotalsPerNamespace ($out_day2, $langprev) ;
 500+ # &WriteTotalsPerNamespace ($out_day3, $langprev) ;
 501+ undef %totals_per_namespace ;
 502+ }
 503+ $langprev = $lang ;
 504+
 505+ if (($files_in_found < 24) && ($files_in_found > 0)) # always > 0 actually
 506+ { $total = sprintf ("%.0f",($total / $files_in_found) * 24) ; }
 507+
 508+ $totals_per_namespace {"$lang $namespace"} += $total ;
 509+
 510+ if ($filter_matches)
 511+ { print $out_filtered "$key_low $total$counts\n" ; }
 512+
 513+ if ($key_low =~ $reg_exp_track) # track count for NonExistingPageForSquidLogMonitoring on en.z
 514+ {
 515+ open $out_track, '>>', $file_track ;
 516+ binmode $out_track ;
 517+ print $out_track "$key_low $total$counts\n" ;
 518+ close $out_track ;
 519+ }
 520+
 521+ if ($total >= $threshold)
 522+ { print $out_day2 "$key_low $total$counts\n" ;
 523+ # print $out_day3 "$key_low $total\n" ;
 524+ }
 525+
 526+ $key_low_prev = $key_low ;
 527+ # print "OUT $key_low $counts\n" ;
 528+ }
 529+
 530+ &WriteTotalsPerNamespace ($out_day2, $langprev) ;
 531+# &WriteTotalsPerNamespace ($out_day3, $langprev) ;
 532+
 533+ &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
 534+
 535+ &Log ("[$lines, $files_in_open] $key_low\n") ;
 536+# close OUT ;
 537+
 538+ if ($bayes)
 539+ {
 540+ # close $out_day1 ;
 541+ close $out_day2 ;
 542+ # close $out_day3 ;
 543+ close $out_filtered ;
 544+
 545+# $cmd = "$path_7za a $file_out2.7z $file_out2" ;
 546+# $result = `$cmd` ;
 547+# if ($result =~ /Everything is Ok/s)
 548+# {
 549+# $result =~ s/^.*?(Updating.*?)\n.*$/$1 -> OK/s ;
 550+# unlink $file_out2 ;
 551+# foreach $file_in (@files_today)
 552+# {
 553+# print "unlink $dir_in/$file_in\n" ;
 554+# unlink "$dir_in/$file_in" ;
 555+# }
 556+# }
 557+# else
 558+# {
 559+# print "Delete $file_out2.7z\n" ;
 560+# unlink "$file_out2.7z" ;
 561+# }
 562+
 563+
 564+ $time_start_compression = time ;
 565+ $cmd = "bzip2 -9 -v $file_out2" ;
 566+ &Log ("\n\n$cmd ->\n") ;
 567+ $result = `$cmd` ;
 568+ &Log ("\n\nCompression took " . (time-$time_start_compression) . " seconds\n$result\n") ;
 569+
 570+ if ($true)
 571+ {
 572+ foreach $file_in (@files_today)
 573+ {
 574+ print "unlink $dir_in/$file_in\n" ;
 575+ unlink "$dir_in/$file_in" ;
 576+ }
 577+ }
 578+ else
 579+ {
 580+ # print "Delete $file_out2.7z\n" ;
 581+ # unlink "$file_out2.7z" ;
 582+ }
 583+ }
 584+ else
 585+ {
 586+ # $out_day1->close() ;
 587+ $out_day2->close() ;
 588+ # $out_day3->close() ;
 589+ close $out_filtered ;
 590+ }
 591+
 592+ &Log ("\nRecords skipped for invalid languages:\n") ;
 593+ foreach $key (sort keys %invalid_languages)
 594+ { &Log ("$key: ${invalid_languages {$key}}\n") ; }
 595+
 596+ &Log ("\nTotals per namespace written: $lines_namespace_counts\n") ;
 597+ &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 598+}
 599+
 600+sub WriteTotalsPerNamespace
 601+{
 602+ my $out_day = shift ;
 603+ my $lang = shift ;
 604+ my $total ;
 605+ my $totals_per_namespace_other ;
 606+
 607+ foreach my $key (sort keys %totals_per_namespace)
 608+ {
 609+ $total = $totals_per_namespace {$key} ;
 610+ if ($total < 5)
 611+ { $totals_per_namespace_other += $total ; }
 612+ else
 613+ {
 614+ # print "@ $key $total\n" ;
 615+ print $out_day "@ $key $total\n" ;
 616+ $lines_namespace_counts ++ ;
 617+ }
 618+ }
 619+ if ($totals_per_namespace_other > 0 )
 620+ {
 621+ # print "@ $lang -other- $totals_per_namespace_other\n" ;
 622+ print $out_day "@ $lang -other- $totals_per_namespace_other\n" ;
 623+ $lines_namespace_counts ++ ;
 624+ }
 625+}
 626+
 627+sub CompactVisitorStatsOneMonth
 628+{
 629+ my $dir_in = shift ;
 630+ my $dir_out = shift ;
 631+ my $daterange = shift ;
 632+
 633+ &Log ("\nCompactVisitorStatsOneMonth\n\n") ;
 634+
 635+ chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 636+
 637+ local (*DIR);
 638+ opendir (DIR, ".");
 639+ @files = () ;
 640+
 641+ while ($dir = readdir (DIR))
 642+ {
 643+ next if ! -d $dir ;
 644+ next if $dir !~ /^\d\d\d\d-\d\d$/ ;
 645+
 646+ push @dirs, $dir ;
 647+ }
 648+
 649+ closedir (DIR);
 650+
 651+ @dirs = sort @dirs ;
 652+
 653+ foreach $dir (@dirs)
 654+ {
 655+ &Log ("\n\n" . '-' x 80 . "\n\nCompactVisitorStatsOneMonth:\nCheck dir $dir_in/$dir\n") ;
 656+
 657+ if (-e "$dir_in/$dir/a")
 658+ {
 659+ &Log ("Already done -> skip\n\n") ;
 660+ next ;
 661+ }
 662+
 663+ ($dir2 = $dir) =~ s/-//g ;
 664+ if ($dir2 !~ /^$daterange/)
 665+ {
 666+ &Log ("Directory out of date range ($daterange) -> skip\n\n") ;
 667+ next ;
 668+ }
 669+
 670+ local (*DIR2);
 671+ opendir (DIR2, "$dir_in/$dir");
 672+
 673+ undef @files ;
 674+ undef %process_dates ;
 675+
 676+ while ($file_in = readdir (DIR2))
 677+ {
 678+ if ($bayes)
 679+ { next if $file_in !~ /^pagecounts-\d{8}_(?:fdt\.7z|h\.bz2)$/ ; }
 680+ else
 681+ { next if $file_in !~ /^pagecounts-\d{8}_fdt$/ ; }
 682+
 683+ &Log ("File found: $file_in\n") ;
 684+
 685+ push @files, $file_in ;
 686+ }
 687+
 688+ closedir (DIR2);
 689+
 690+ @files = sort @files ;
 691+
 692+ foreach $file (@files)
 693+ {
 694+ $date = substr ($file,11,8) ;
 695+ $process_dates {$date}++ ;
 696+ }
 697+
 698+ &Log ("\n\n") ;
 699+
 700+ &MergeFilesFullMonth ($dir_in, $dir_out, $dir, @files) ;
 701+ }
 702+
 703+ exit ;
 704+}
 705+
 706+sub MergeFilesFullMonth
 707+{
 708+ my $dir_in = shift ;
 709+ my $dir_out = shift ;
 710+ my $dir = shift ;
 711+ my @files_this_month = @_ ;
 712+
 713+ my $year = substr ($dir,0,4) ;
 714+ my $month = substr ($dir,5,2) ;
 715+
 716+ my (@file_in_open, @file_in_found, @counts, $days_missing) ;
 717+ my $days_in_month = days_in_month ($year, $month) ;
 718+
 719+ my ($file_out2) ;
 720+
 721+ $lines = 0 ;
 722+
 723+ undef @in_day ;
 724+ my $time_start = time ;
 725+
 726+ if ($dir eq $month_run)
 727+ { $scope = "part" ; }
 728+ else
 729+ { $scope = "all" ; }
 730+
 731+ $file_out = "$dir_out/pagecounts-$year-$month-$scope" ;
 732+
 733+ &Log ("\nMergeFilesFullMonth\nIn: $dir_in/$dir\nOut: $dir_out/$file_out\nDays expected: $days_in_month\n\nProcess...\n") ;
 734+
 735+ if ($bayes)
 736+ {
 737+ if ((-e "$file_out.7z") || (-e "$file_out.bz2") || (-e "$file_out.zip") || (-e "$file_out.gz"))
 738+ {
 739+ &Log ("\nTarget file '$file_out.[7z|bz2|zip|gz]' exists already. Skip this month.\n") ;
 740+ return ;
 741+ }
 742+ }
 743+
 744+
 745+ my $out_month_all = new IO::Compress::Bzip2 "$file_out.bz2" or die "bzip2 failed for $file_out.bz2: $Bzip2Error\n";
 746+ my $out_month_ge5 = new IO::Compress::Bzip2 "${file_out}_ge5.bz2" or die "bzip2 failed for ${file_out}_ge5.bz2: $Bzip2Error\n";
 747+
 748+ $out_month_all->binmode() ;
 749+ $out_month_ge5->binmode() ;
 750+
 751+ for ($day = 0 ; $day < $days_in_month ; $day++)
 752+ { $file_in_found [$day] = $false ; }
 753+
 754+ $files_in_open = 0 ;
 755+ $files_in_found = 0 ;
 756+ $total_hours_missing = 0 ;
 757+ $langprev = "" ;
 758+ $lines_read_this_month = 0 ;
 759+ @hours_missing_per_day = () ;
 760+ $hours_missing_coded = '' ;
 761+ $lines_omitted_daily = 0 ;
 762+
 763+ foreach $file_in (@files_this_month)
 764+ {
 765+ next if $file_in eq "" ;
 766+
 767+ ($day = $file_in) =~ s/^pagecounts-\d{6}(\d+)_(?:fdt|fdt\.7z|h\.bz2)$/$1/ ;
 768+ $day = sprintf ("%2d", $day-1) ;
 769+
 770+ $file_in = "$dir_in/$year-$month/$file_in" ;
 771+ # print "File $file_in -> day $day\n" ;
 772+
 773+ &CheckHoursMissing ($year,$month,$day,$file_in) ;
 774+
 775+ if ($bayes)
 776+ {
 777+ if ($file_in =~ /\.bz2$/)
 778+ { open $in_day [$day], "-|", "bzip2 -dc \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; }
 779+ elsif ($file_in =~ /\.7z$/)
 780+ { open $in_day [$day], "-|", "7z e -so \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; }
 781+ else
 782+ { abort ("MergeFilesFullMonth: unexpected file name $file_in.") ; }
 783+ }
 784+ else
 785+ { open $in_day [$day], '<', $file_in || &Abort ("Open failed for '$file_in'\n") ; }
 786+
 787+ binmode $in_day [$day] ;
 788+
 789+ $files_in_open++ ;
 790+ $file_in_found [$day] = $true ;
 791+ $file_in_open [$day] = $true ;
 792+ $files_in_found ++ ;
 793+
 794+ $file = $in_day [$day] ;
 795+ $line = <$file> ;
 796+ while (($line =~ /^#/) || ($line =~ /^@/))
 797+ { $line = <$file> ; }
 798+
 799+ chomp $line ;
 800+ if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
 801+ {
 802+ ($lang,$title,$counts) = split (' ', $line) ;
 803+ }
 804+ else
 805+ {
 806+ ($lang,$title,$total,$counts) = split (' ', $line) ;
 807+ $counts = "$total$counts" ;
 808+ }
 809+
 810+ $key [$day] = "$lang $title" ;
 811+ $counts [$day] = $counts ;
 812+ # print "DAY " . ($day+1) . " KEY ${key [$day]} COUNTS $counts\n" ;
 813+ }
 814+ print "\n" ;
 815+
 816+ $comment = "# Wikimedia article requests (aka page views) for year $year, month $month\n" ;
 817+ if ($threshold > 0 )
 818+ { $comment .= "# Count for articles with less than $threshold requests per full month are omitted\n" ; }
 819+ $comment .= "#\n" ;
 820+ $comment .= "# Each line contains four fields separated by spaces\n" ;
 821+ $comment .= "# - wiki code (subproject.project, see below)\n" ;
 822+ $comment .= "# - article title (encoding from original hourly files is preserved to maintain proper sort sequence)\n" ;
 823+ $comment .= "# - monthly total (possibly extrapolated from available data when hours/days in input were missing)\n" ;
 824+ $comment .= "# - hourly counts (only for hours where indeed article requests occurred)\n" ;
 825+ $comment .= "#\n" ;
 826+ $comment .= "# Subproject is language code, followed by project code\n" ;
 827+ $comment .= "# Project is b:wikibooks, k:wiktionary, n:wikinews, q:wikiquote, s:wikisource, v:wikiversity, z:wikipedia\n" ;
 828+ $comment .= "# Note: suffix z added by compression script: project wikipedia happens to be sorted last in dammit.lt files, so add this suffix to fix sort order\n" ;
 829+ $comment .= "#\n" ;
 830+ $comment .= "# To keep hourly counts compact and tidy both day and hour are coded as one character each, as follows:\n" ;
 831+ $comment .= "# Hour 0..23 shown as A..X convert to number: ordinal (char) - ordinal ('A')\n" ;
 832+ $comment .= "# Day 1..31 shown as A.._ 27=[ 28=\\ 29=] 30=^ 31=_ convert to number: ordinal (char) - ordinal ('A') + 1\n" ;
 833+ $comment .= "#\n" ;
 834+ $comment .= "# Original data source: Wikimedia full (=unsampled) squid logs\n" ;
 835+ $comment .= "# These data have been aggregated from hourly pagecount files at http://dammit.lt/wikistats, originally produced by Domas Mituzas\n" ;
 836+ $comment .= "# Daily and monthly aggregator script built by Erik Zachte\n" ;
 837+ $comment .= "# Each day hourly files for previous day are downloaded and merged into one file per day\n" ;
 838+ $comment .= "# Each month daily files are merged into one file per month\n" ;
 839+# $comment .= "# If data are missing for some hour (file missing or corrupt) a question mark (?) is shown (and for each missing hour the daily total is incremented with hourly average)\n" ;
 840+# $comment .= "# If data are missing for some day (file missing or corrupt) a question mark (?) is shown (and for each missing day the monthly total is incremented with daily average)\n" ;
 841+ $comment .= "#\n" ;
 842+
 843+ $out_month_all->print ($comment) ;
 844+ $comment .= "# This file contains only lines with monthly page request total greater/equal 5\n" ;
 845+ $comment .= "#\n" ;
 846+ $out_month_ge5->print ($comment) ;
 847+
 848+ if ($files_in_found < $days_in_month)
 849+ {
 850+ for ($day = 0 ; $day < $days_in_month ; $day++)
 851+ {
 852+ if (! $file_in_found [$day])
 853+ {
 854+ $days_missing .= ($day+1) . "," ;
 855+ $total_hours_missing += 24 ;
 856+ for (my $h = 0 ; $h <= 23 ; $h++)
 857+ { $hours_missing_coded .= chr ($day + ord ('A')) . chr ($h + ord ('A')) .',' ; }
 858+ }
 859+ }
 860+
 861+ $days_missing =~ s/,$// ;
 862+ &Log ("Merge files: year $year, month $month, only $files_in_found files found!\n\n") ;
 863+
 864+ if ($days_missing =~ /,/)
 865+ {
 866+ $out_month_all->print ("# No input files found for days $days_missing!\n#\n") ;
 867+ $out_month_ge5->print ("# No input files found for days $days_missing!\n#\n") ;
 868+ print "No input files found for days $days_missing!\n\n" ;
 869+ }
 870+ else
 871+ {
 872+ $out_month_all->print ("# No input file found for day $days_missing!\n#\n") ;
 873+ $out_month_ge5->print ("# No input file found for day $days_missing!\n#\n") ;
 874+ print "No input file found for day $days_missing!\n\n" ;
 875+ }
 876+ }
 877+ else
 878+ { &Log ("Merge files: year $year, month $month\n\n") ; }
 879+
 880+ if ($#hours_missing_per_day > -1)
 881+ {
 882+ $out_month_all->print (@hours_missing_per_day) ;
 883+ $out_month_ge5->print (@hours_missing_per_day) ;
 884+ }
 885+
 886+ if ($hours_missing_coded ne '')
 887+ {
 888+ $hours_missing_coded =~ s/,$// ;
 889+ $hours_missing_coded = join (',', sort {$a cmp $b} split (',', $hours_missing_coded)) ; # single hours and full days missing added out of sort order
 890+ $out_month_all->print ("#\n# Hours missing: $hours_missing_coded\n") ;
 891+ $out_month_ge5->print ("#\n# Hours missing: $hours_missing_coded\n") ;
 892+ print "Hours missing: $hours_missing_coded\n\n" ;
 893+ }
 894+
 895+ $monthly_correction = 1 ;
 896+ if ($total_hours_missing == 0)
 897+ {
 898+ $out_month_all->print ("# Data for all hours of each day were available in input\n#\n") ;
 899+ $out_month_ge5->print ("# Data for all hours of each day were available in input\n#\n") ;
 900+ print "Data for all hours of each day were available in input\n\n" ;
 901+ }
 902+ else
 903+ {
 904+ $monthly_correction = sprintf ("%.4f", ($days_in_month * 24) / ($days_in_month * 24 - $total_hours_missing)) ;
 905+ $out_month_all->print ("#\n# In this file data for $total_hours_missing hours were not encountered in input\n") ;
 906+ $out_month_ge5->print ("#\n# In this file data for $total_hours_missing hours were not encountered in input\n") ;
 907+ $out_month_all->print ("# Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n#\n") ;
 908+ $out_month_ge5->print ("# Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n#\n") ;
 909+ print "In this file data for $total_hours_missing hours were not encountered in input\n" ;
 910+ print "Monthly totals per page have been extrapolated from available counts: multiplication factor = $monthly_correction\n\n" ;
 911+ }
 912+
 913+ if ($threshold_requests_omitted > 0)
 914+ {
 915+ $out_month_all->print ("# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n") ;
 916+ $out_month_ge5->print ("# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n") ;
 917+ print "# For this month intermediate files (from daily aggregation of hourly files) did no longer contain lines with daily total below $threshold_requests_omitted page requests\n#\n" ;
 918+ }
 919+
 920+ $key_low_prev = "" ;
 921+ while ($files_in_open > 0)
 922+ {
 923+ # last if $cycles ++ > 10000 ; # test code
 924+
 925+ $key_low = "\xFF\xFF";
 926+ for ($day = 0 ; $day < $days_in_month ; $day++)
 927+ {
 928+ if (($files_in_open == $days_in_month) || ($file_in_found [$day] && $file_in_open [$day]))
 929+ {
 930+ if ($key [$day] lt $key_low)
 931+ { $key_low = $key [$day] ; }
 932+ }
 933+ }
 934+
 935+ $counts_per_month = "" ;
 936+ $total_per_month = 0 ;
 937+
 938+ for ($day = 0 ; $day < $days_in_month ; $day++)
 939+ {
 940+ if (! $file_in_found [$day])
 941+ {
 942+ # $counts_per_month .= chr ($day+ord('A')) . '?' ;
 943+ }
 944+ elsif (($files_in_open == $days_in_month) || $file_in_open [$day]) # slight optimization
 945+ {
 946+ if ($key [$day] eq $key_low)
 947+ {
 948+ $ch_day = chr ($day+ord('A')) ;
 949+ $counts_per_day = $counts [$day] ;
 950+
 951+ ($total_per_day = $counts_per_day) =~ s/^(\d+).*$/$1/ ;
 952+ $counts_per_day =~ s/^\d+// ; # remove total
 953+
 954+ $counts_per_day =~ s/([A-Z]\d+)/$ch_day$1,/g ; # prefix each hourly count with char that represent day
 955+ $counts_per_month .= $counts_per_day ;
 956+
 957+ $total_per_month += $total_per_day ;
 958+ $file = $in_day [$day] ;
 959+ # $line = <$file> ;
 960+
 961+ while ($true)
 962+ {
 963+ # if (($line = <$file>) && ($lines_read_this_month++ < 10000)) # test code
 964+ if ($line = <$file>)
 965+ {
 966+ next if $line =~ /^#/ ;
 967+ next if $line =~ /^@/ ;
 968+
 969+ $line =~ s/^([\w\-]+)2 /$1.y /o ;
 970+ $line =~ s/^([\w\-]+) /$1.z /o ;
 971+
 972+ chomp $line ;
 973+
 974+ if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
 975+ {
 976+ ($lang,$title,$counts) = split (' ', $line) ;
 977+ }
 978+ else
 979+ {
 980+ ($lang,$title,$total,$counts) = split (' ', $line) ;
 981+ $counts = "$total$counts" ;
 982+ }
 983+
 984+ $key [$day] = "$lang $title" ;
 985+ $counts [$day] = $counts ;
 986+
 987+ last ;
 988+ }
 989+ else
 990+ {
 991+ close $in_day [$day] ;
 992+
 993+ $files_in_open-- ;
 994+ $file_in_open [$day] = $false ;
 995+ $key [$day] = "\xFF\xFF";
 996+
 997+ last ;
 998+ }
 999+ }
 1000+ }
 1001+ }
 1002+ }
 1003+ if ($lines == 0)
 1004+ { &Log ("\nlines: project key\n") ; }
 1005+
 1006+ if (++$lines % 100000 == 0)
 1007+ { &Log ("$lines: $key_low\n") ; }
 1008+
 1009+ # last if $lines > 10000 ; # test
 1010+
 1011+ last if $key_low eq "\xFF\xFF" ;
 1012+
 1013+ # Q&D fix for unexplained out of order error for what seems to be invalid language
 1014+ # remember : language code without suffix gets appended by .y or .z to fix sort order
 1015+ # ^nov.mw nov1 1 8765
 1016+ # ^nov1.mw nov1 1 931 <--------------
 1017+ # ^nov 10_dw_oktobre 1 11421
 1018+ ($lang,$title) = split (' ', $key_low) ;
 1019+ if ($lang =~ /\d/)
 1020+ {
 1021+ $invalid_languages {$lang}++ ;
 1022+ &Log ("\nSkip invalid language '$lang'\n") ;
 1023+ next ;
 1024+ }
 1025+
 1026+ if ($key_low_prev gt $key_low)
 1027+ {
 1028+ for ($day = 0 ; $day < $days_in_month ; $day++)
 1029+ { &Log ("day " . ($day+1) . ": key ${key[$day]}\n") ; }
 1030+
 1031+ &Abort ("Sequence error: '$key_low_prev' gt '$key_low'\n") ;
 1032+ }
 1033+
 1034+ if (($key_low_prev eq $key_low) && ($files_in_open > 0))
 1035+ {
 1036+ for ($day = 0 ; $day < $days_in_month ; $day++)
 1037+ {
 1038+ if ($file_in_open [$day])
 1039+ { print "day " . ($day+1) . ": file open, key ${key [$day]}\n" ; }
 1040+ else
 1041+ { print "day " . ($day+1) . ": file closed, key ${key [$day]}\n" ; }
 1042+ }
 1043+ &Abort ("Sequence error: '$key_low_prev' eq '$key_low'\n") ;
 1044+ }
 1045+
 1046+ ($lang,$title) = split (' ', $key_low) ;
 1047+
 1048+ if (($title !~ /\:/) || ($title =~ /^:[^:]*$/)) # no colon or only on first position
 1049+ { $namespace = 'NamespaceArticles' ; }
 1050+ else
 1051+ { ($namespace = $title) =~ s/([^:])\:.*$/$1/ ; }
 1052+
 1053+ if (($lang ne $langprev) && ($langprev ne ""))
 1054+ {
 1055+ $filter_matches = $lang =~ $reg_exp_filter ;
 1056+ if ($filter_matches)
 1057+ { print "F $lang\n" ; }
 1058+ }
 1059+ $langprev = $lang ;
 1060+
 1061+ if (($files_in_found < $days_in_month) && ($files_in_found > 0)) # always > 0 actually
 1062+ { $total = sprintf ("%.0f",($total / $files_in_found) * $days_in_month) ; }
 1063+
 1064+ $counts_per_month =~ s/,$// ;
 1065+ $total_per_month = sprintf ("%.0f", $monthly_correction * $total_per_month) ;
 1066+
 1067+ $out_month_all->print ("$key_low $total_per_month $counts_per_month\n") ;
 1068+ if ($total_per_month ge 5)
 1069+ { $out_month_ge5->print ("$key_low $total_per_month $counts_per_month\n") ; }
 1070+
 1071+ $key_low_prev = $key_low ;
 1072+ }
 1073+
 1074+ &Log ("File production took " . (time-$time_start) . " seconds\n\n") ;
 1075+
 1076+ &Log ("[$lines, $files_in_open] $key_low\n") ;
 1077+
 1078+ $out_month_all->close () ;
 1079+ $out_month_ge5->close () ;
 1080+
 1081+ if ($bayes)
 1082+ {
 1083+ foreach $file_in (@files_this_month)
 1084+ {
 1085+ print "unlink $dir_in/$file_in (dummy run, test only)\n" ;
 1086+ # unlink "$dir_in/$file_in" ;
 1087+ }
 1088+ }
 1089+
 1090+ &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 1091+}
 1092+
 1093+sub CheckHoursMissing # scan one day's pagecount file and record which of the 24 hourly counts (A..X) are absent; updates globals $threshold_requests_omitted, $hours_missing_coded, @hours_missing_per_day, $total_hours_missing
 1094+{
 1095+ my ($year,$month,$day,$file_in) = @_ ; # $day is a 0-based day-of-month index; $year/$month only used by (disabled) test code below
 1096+ my ($hour,%hours_seen,%hours_valid,$hours_seen,$hours_missing,%hours_missing) ; # scalar $hours_seen / $hours_missing are counters; same-named hashes track per-hour flags
 1097+
 1098+# &Log ("\nCheckHoursMissing for day " . ($day+1) . "\n") ;
 1099+
 1100+ if ($bayes) # on the 'bayes' host input files are compressed; decompress through an external pipe
 1101+ {
 1102+ if ($file_in =~ /\.bz2$/)
 1103+ { open FILE_CHECK, "-|", "bzip2 -dc \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; } # NOTE(review): '||' binds to the command string (always true), so open failures are never caught; also lowercase 'abort' is not defined here (file defines &Abort) — confirm
 1104+ elsif ($file_in =~ /\.7z$/)
 1105+ { open FILE_CHECK, "-|", "7z e -so \"$file_in\"" || abort ("Input file '" . $file_in . "' could not be opened.") ; } # NOTE(review): same '||' precedence issue as above
 1106+ else
 1107+ { abort ("CheckHoursMissing: unexpected file name $file_in.") ; } # NOTE(review): lowercase 'abort' — presumably &Abort intended
 1108+ }
 1109+ else
 1110+ { open FILE_CHECK, '<', $file_in || &Abort ("Open failed for '$file_in'\n") ; } # NOTE(review): '||' binds to $file_in, not to open(); use 'or' to actually trap the failure
 1111+
 1112+ binmode FILE_CHECK ;
 1113+
 1114+ $lines_checked = 0 ; # sample counter: only the first ~10,000 data lines are examined (assumes every present hour shows up near the top of the file)
 1115+ while ($line = <FILE_CHECK>)
 1116+ {
 1117+ if ($line =~ /^#.*?requests per full day are omitted/) # header line announcing a low-count threshold applied upstream
 1118+ { ($threshold_requests_omitted = $line) =~ s/[^\d]//g ; } # keep only the digits from that header line (global, reported later by caller)
 1119+
 1120+ next if $line =~ /^#/ or $line =~ /^@/ ; # skip comment / metadata lines
 1121+
 1122+ last if $lines_checked ++ > 10000 ; # sampling cut-off
 1123+
 1124+ chomp $line ;
 1125+ if ($line =~ /^[^ ]+ [^ ]+ [^ ]+$/) # prepare for format change: space will be added between daily total and hourly counts
 1126+ {
 1127+ ($lang,$title,$counts) = split (' ', $line) ;
 1128+ }
 1129+ else
 1130+ {
 1131+ ($lang,$title,$total,$counts) = split (' ', $line) ; # 4-field variant: daily total is a separate field
 1132+ $counts = "$total$counts" ; # re-join so $counts always starts with the daily total
 1133+ }
 1134+ # &Log ("Counts 1 $counts\n") ; # test
 1135+
 1136+ undef @counts ;
 1137+ # $counts = "123A1B2C?D4" ; # test
 1138+ $counts =~ s/([A-X])(\d+|\?)/(push @counts,"$1$2"),""/ge ; # split packed counts into per-hour tokens; letters A..X encode hours 0..23, '?' marks an hour with no data
 1139+ foreach $key (@counts)
 1140+ {
 1141+ my $hour = ord (substr ($key,0,1)) - ord ('A') ; # decode hour index from leading letter
 1142+
 1143+ # test code
 1144+ # if ($month % 2 == 1)
 1145+ # {
 1146+ # if ($day % 3 == 0)
 1147+ # {
 1148+ # next if $hour == 2 ;
 1149+ # if ($hour % 3 == 0)
 1150+ # { $key = substr ($key,0,1,) . '?' ; }
 1151+ # }
 1152+ # }
 1153+ # else
 1154+ # { next if $hour == 2 ; }
 1155+
 1156+ next if $hours_seen {$hour} > 0 ; # classify each hour only once per day
 1157+ $hours_seen {$hour} = $true ;
 1158+ $hours_seen ++ ;
 1159+ if ($key =~ /\d/) # a digit means a real count was present for this hour
 1160+ { $hours_valid {$hour} ++ ; }
 1161+ else
 1162+ {
 1163+ $hours_missing {$hour} ++ ;
 1164+ $hours_missing ++ ;
 1165+ $hours_missing_coded .= chr ($day + ord ('A')) . chr ($hour + ord ('A')) .',' ; # append '<dayletter><hourletter>,' code to global summary string
 1166+ }
 1167+ }
 1168+ # &Log ("Counts 2 $counts, seen: $hours_seen, valid:". (join ',', sort {$a <=> $b} keys %hours_valid) . ", missing: " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n") ; # test
 1169+
 1170+ last if $hours_seen == 24 ; # all hours classified — no need to read further
 1171+ }
 1172+
 1173+ close FILE_CHECK ;
 1174+
 1175+ for ($hour = 0 ; $hour <= 23 ; $hour++) # hours never encountered in the sample are counted as missing too
 1176+ {
 1177+ if (! $hours_seen {$hour})
 1178+ {
 1179+ $hours_missing {$hour} ++ ;
 1180+ $hours_missing ++ ;
 1181+ $hours_missing_coded .= chr ($day + ord ('A')) . chr ($hour + ord ('A')) .',' ;
 1182+ }
 1183+ }
 1184+
 1185+ if ($lines_checked > 10000)
 1186+ { &Log ("\nDay " . ($day+1) . ": not all hours encountered after 10,000 lines !!! Seen (can be ?=missing) " . (join ',', sort {$a <=> $b} keys %hours_seen) . "\n") ; }
 1187+
 1188+ if ($hours_missing > 0) # record a human-readable per-day summary for the monthly output headers
 1189+ {
 1190+ $text_hour = $hours_missing > 1 ? 'hours' : 'hour' ;
 1191+ push @hours_missing_per_day, "# Day " . ($day+1) . ": $text_hour missing " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n" ;
 1192+ print "Day " . ($day+1) . ": $text_hour missing " . (join ',', sort {$a <=> $b} keys %hours_missing) . "\n" ;
 1193+ }
 1194+
 1195+ $total_hours_missing += $hours_missing ; # global monthly tally, used by caller to compute the extrapolation factor
 1196+}
 1197+
 1198+sub Log # echo a message to stdout and to the LOG filehandle (opened elsewhere in the file — not visible in this chunk)
 1199+{
 1200+ $msg = shift ; # NOTE(review): $msg is a package global (no 'my'); harmless here but clobbers any caller's $msg
 1201+ print $msg ;
 1202+ print LOG $msg ;
 1203+}
 1204+
 1205+sub Abort # log a fatal error to stdout and LOG, then terminate the script
 1206+{
 1207+ $msg = shift ; # package global, same caveat as in Log
 1208+ print "Abort script\nError: $msg\n" ;
 1209+ print LOG "Abort script\nError: $msg\n" ;
 1210+ exit ; # NOTE(review): bare exit returns status 0 (success) to the shell; wrapper scripts cannot detect the failure — consider exit 1
 1211+}
 1212+
 1213+#=============================================================================================================
 1214+
 1215+# Snippets of obsolete but possibly revivable code / test code (kept commented out for reference)
 1216+
 1217+#sub Compact
 1218+#{
 1219+# my $day = shift ;
 1220+# &Log ("Compact files for $day\n") ;
 1221+
 1222+# $file_in = "pagecounts-$day.out" ;
 1223+# $file_out1 = "pagecounts-${day}_all.gz" ;
 1224+# $file_out2 = "pagecounts-${day}_10plus.gz" ;
 1225+# open IN, "<", $file_in ;
 1226+# binmode $file_in ;
 1227+
 1228+# my $out_day1 = IO::Compress::Gzip->new ($file_out1) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 1229+# my $out_day2 = IO::Compress::Gzip->new ($file_out2) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 1230+
 1231+# open OUT, ">", $file_out ;
 1232+# binmode $file_out ;
 1233+
 1234+# $lang_prev = "" ;
 1235+# while ($line = <IN>)
 1236+# {
 1237+# chomp ($line) ;
 1238+# ($lang, $title, $counts) = split (' ', $line) ;
 1239+# $title2 = $title ;
 1240+# $title =~ s/\%20/_/g ;
 1241+# $title =~ s/\%3A/:/g ;
 1242+# # $title =~ s/\%([0-9A-F]{2})/chr(hex($1))/ge ;
 1243+# # if ($title =~ /[\x00-\x1F]/)
 1244+# # { &Log ("> '$title2'\n") ; }
 1245+# $title =~ s/\x00-\x1F/"%" . sprintf ("%X", ord($1)) ;/ge ;
 1246+# print $out_day1 "$lang $title $counts\n" ;
 1247+# ($counts2 = $counts) =~ s/^(\d+).*/$1/ ;
 1248+# if ($counts2 >= $threshold)
 1249+# { print $out_day2 "$lang $title $counts\n" ; }
 1250+# $lang_prev = $lang ;
 1251+# }
 1252+#
 1253+# close IN ;
 1254+# $out_day1->close() ;
 1255+# $out_day2->close() ;
 1256+#}
 1257+
 1258+
 1259+#sub GetViewDistribution
 1260+#{
 1261+# open OUT, ">", "Views.csv" ;
 1262+# foreach $file_in (@files)
 1263+# {
 1264+# ($hour = $file_in) =~ s/^pagecounts-\d+-(\d\d)\d+\.gz$/$1/ ;
 1265+# $hour = chr(ord('A')+$hour) ;
 1266+# &Log ("Process $hour $file_in\n") ;
 1267+
 1268+# $in_hour1 = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed: $GunzipError\n") ;
 1269+# while ($line = <$in_hour1>)
 1270+# {
 1271+# ($lang,$title,$count,$dummy) = split (' ', $line) ;
 1272+# if (($lang eq "en") && ($title !~ /:/)) # only en: and namespace 0
 1273+# {
 1274+# $tot {$hour} += $count ;
 1275+# if ($count < 3)
 1276+# { @counts {$hour . substr ($title,0,1)}++ ; }
 1277+# }
 1278+# }
 1279+# $in_hour1->close () ;
 1280+# }
 1281+#
 1282+# print OUT "," ;
 1283+# foreach $hour ('A'..'X')
 1284+# { print OUT $hour . ", " ; }
 1285+# print OUT "\n" ;
 1286+#
 1287+# print OUT "," ;
 1288+# foreach $hour ('A'..'X')
 1289+# { print OUT $tot {$hour} . ", " ; }
 1290+# print OUT "\n" ;
 1291+#
 1292+# for ($c=0; $c < 256; $c++)
 1293+# {
 1294+# # do not print chars " and , as such: confuses csv format
 1295+# if ($c < 33)
 1296+# { print OUT "chr($c), " ; }
 1297+# elsif (chr($c) eq '"')
 1298+# { print OUT "dquote, " ; }
 1299+# elsif (chr($c) eq ',')
 1300+# { print OUT "comma, " ; }
 1301+# else
 1302+# { print OUT chr($c) . ", " ; }
 1303+#
 1304+# foreach $hour ('A'..'X')
 1305+# { print OUT (0+@counts {$hour.chr($c)}) , ", " ; }
 1306+#
 1307+# if ($c < 255)
 1308+# { print OUT "\n" ; }
 1309+# }
 1310+# close OUT ;
 1311+#}
 1312+
 1313+
 1314+#sub RecompactVisitorStats
 1315+#{
 1316+# my $dir_in = "D:/Wikipedia_Visitors/full_day/" ;
 1317+# chdir ($dir_in) || &Abort ("Cannot chdir to $dir_in\n") ;
 1318+# local (*DIR);
 1319+# opendir (DIR, ".");
 1320+# @files = () ;
 1321+# while ($file_in = readdir (DIR))
 1322+# {
 1323+# next if $file_in !~ /^pagecounts-\d{8,8}_fd.gz$/ ;
 1324+#
 1325+# push @files, $file_in ;
 1326+# }
 1327+
 1328+# $filecnt = $#files+1 ;
 1329+# @files = sort { substr ($a, 20,2) <=> substr ($b, 20,2)} @files ;
 1330+
 1331+# foreach $file (@files)
 1332+# { &RecompactVisitorStats2 ($file) ; }
 1333+# closedir (DIR, ".");
 1334+#}
 1335+
 1336+#sub RecompactVisitorStats2
 1337+#{
 1338+## http://www.7-zip.org/7z.html
 1339+# my $file = shift ;
 1340+# my $time_start = time ;
 1341+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 1342+## my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 1343+# my $file_in = "D:/Wikipedia_Visitors/full_day/$file" ;
 1344+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 1345+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 1346+
 1347+# &Log ("Process $file_in\n") ;
 1348+
 1349+# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 1350+# binmode $in_hour ;
 1351+# open OUT, ">", $file_out ;
 1352+# binmode OUT ;
 1353+
 1354+# my ($title, $title2) ;
 1355+# while ($line = <$in_hour>)
 1356+# {
 1357+# chomp ($line) ;
 1358+# ($lang,$title,$counts) = split (" ", $line) ;
 1359+
 1360+# if ($lang ne $lang_prev) { print "$lang " ; }
 1361+# $lang_prev = $lang ;
 1362+
 1363+# # test pagecounts-20080701_fd.gz
 1364+# # all records 424 Mib compressed (1984 uncompressed)
 1365+# # count > 1 212 Mib compressed ( 733 uncompressed)
 1366+# # count > 2 169 Mib compressed ( 551 uncompressed)
 1367+# next if $counts <= 1 ;
 1368+
 1369+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 1370+# $title =~ s/\s/_/g;
 1371+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 1372+# $lang =~ s/\.y/2/ ;
 1373+
 1374+# print OUT "$lang $title $counts\n" ;
 1375+# }
 1376+
 1377+# print "Close files\n" ;
 1378+# $in_hour -> close () ;
 1379+# close (OUT) ;
 1380+
 1381+# &Log ("Compress $file_out\n") ;
 1382+
 1383+# unlink $file_7z ;
 1384+# $result = `$path_7z a $file_7z $file_out` ;
 1385+# &Log ("Compressed\n") ;
 1386+# &Log ("Result " . ($result+0) . " \n") ;
 1387+# if ((-e $file_7z) && (-s $file_7z > 0) && (($result == 0) || ($result == 7)))
 1388+# { unlink $file_out ; }
 1389+
 1390+# &Log ("Processed in " . (time-$time_start) . " seconds\n\n") ;
 1391+## 0 No error
 1392+## 1 Warning (Non fatal error(s)). For example, one or more files were locked by some other application, so they were not compressed.
 1393+## 2 Fatal error
 1394+## 7 Command line error
 1395+## 8 Not enough memory for operation
 1396+## 255 User stopped the process
 1397+#}
 1398+
 1399+
 1400+#sub RecompactVisitorStats3
 1401+#{
 1402+## http://www.7-zip.org/7z.html
 1403+# my $path_7z = "D:/Wikipedia_Visitors/7z.exe" ;
 1404+# my $file_in = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts-20080702_fd.gz" ;
 1405+# my $file_out ; ($file_out = $file_in) =~ s/gz$/txt/ ;
 1406+# my $file_7z ; ($file_7z = $file_in) =~ s/gz$/7z/ ;
 1407+## my $file_log = "D:/Wikipedia_Visitors/full_day/2008-07-pagecounts/pagecounts.log" ;
 1408+
 1409+# $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 1410+# binmode $in_hour ;
 1411+## $out_day = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n") ;
 1412+## binmode $out_day ;
 1413+# open OUT, ">", $file_out ;
 1414+# binmode OUT ;
 1415+## open LOG, ">", $file_log ;
 1416+## binmode LOG ;
 1417+
 1418+# my ($title, $title2) ;
 1419+# while ($line = <$in_hour>)
 1420+# {
 1421+# chomp ($line) ;
 1422+# ($lang,$title,$counts) = split (" ", $line) ;
 1423+
 1424+# if ($lang ne $lang_prev) { print "$lang\n" ; }
 1425+## last if $lang gt "fs" ;
 1426+# $lang_prev = $lang ;
 1427+
 1428+# # test pagecounts-20080701_fd.gz
 1429+# # all records 424 Mib compressed (1984 uncompressed)
 1430+# # count > 1 212 Mib compressed ( 733 uncompressed)
 1431+# # count > 2 169 Mib compressed ( 551 uncompressed)
 1432+# next if $counts <= 1 ;
 1433+
 1434+## next if $lang !~ /^(?:ar|fr)/ ;
 1435+
 1436+#if ($false)
 1437+#{
 1438+# $title1b = $title ;
 1439+# $title1b =~ s/(\%[A-Fa-f0-9]{2})/uc($1)/seg;
 1440+# $title1b =~ s/\%28/(/g ;
 1441+# $title1b =~ s/\%29/)/g ;
 1442+# $title1b =~ s/\%3A/:/g ;
 1443+# $title1b =~ s/\%2F/\//g ;
 1444+# $title1b =~ s/\%5C/\\/g ;
 1445+# $title1b =~ s/\%2A/*/g ;
 1446+# $title1b =~ s/\%21/!/g ;
 1447+# $title1b =~ s/\%5F/_/g ;
 1448+# $title1b =~ s/\%2C/,/g ;
 1449+# $title1b =~ s/\%2E/./g ;
 1450+# $title1b =~ s/\%2D/-/g ;
 1451+# $title1b =~ s/\%25/%/g ;
 1452+# $title1b =~ s/\%7E/~/g ;
 1453+# $title1b =~ s/\%27/'/g ;
 1454+# $title1b =~ s/\%3D/=/g ;
 1455+# $title1b =~ s/\%26/&/g ;
 1456+# $title1b =~ s/\%3B/;/g ;
 1457+# $title1b =~ s/\%3F/?/g ;
 1458+# $title1b =~ s/\%2B/+/g ;
 1459+# $title2 = $title1b ;
 1460+# $title2 =~ s/%([A-F0-9]{2})/chr(hex($1))/seg;
 1461+
 1462+# if ($title1b ne $title2) # if changed anything at all
 1463+# {
 1464+# $title3 = uri_escape ($title2) ;
 1465+# $title3 =~ s/\%28/(/g ;
 1466+# $title3 =~ s/\%29/)/g ;
 1467+# $title3 =~ s/\%3A/:/g ;
 1468+# $title3 =~ s/\%2F/\//g ;
 1469+# $title3 =~ s/\%5C/\\/g ;
 1470+# $title3 =~ s/\%2A/\*/g ;
 1471+# $title3 =~ s/\%21/\!/g ;
 1472+# $title3 =~ s/\%5F/\_/g ;
 1473+# $title3 =~ s/\%2C/,/g ;
 1474+# $title3 =~ s/\%2E/./g ;
 1475+# $title3 =~ s/\%2D/-/g ;
 1476+# $title3 =~ s/\%25/%/g ;
 1477+# $title3 =~ s/\%7E/~/g ;
 1478+# $title3 =~ s/\%27/'/g ;
 1479+# $title3 =~ s/\%3D/=/g ;
 1480+# $title3 =~ s/\%26/&/g ;
 1481+# $title3 =~ s/\%3B/;/g ;
 1482+# $title3 =~ s/\%3F/?/g ;
 1483+# $title3 =~ s/\%2B/+/g ;
 1484+
 1485+# if ($title1b eq $title3) # process reversible ?
 1486+# {
 1487+# $y++ ;
 1488+# $title2 =~ s/\s/_/g;
 1489+# $title = $title2 ;
 1490+# }
 1491+# else
 1492+# {
 1493+# $n++ ;
 1494+# print "Y $y N $n\n$title\n$title3\n\n" ;
 1495+# print LOG "Y $y N $n\n$title\n$title3\n\n" ;
 1496+# }
 1497+# }
 1498+#}
 1499+# $title =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/seg;
 1500+# $title =~ s/\s/_/g;
 1501+# $lang =~ s/\.z// ; # remove codes that were added to fix sort sequence
 1502+# $lang =~ s/\.y/2/ ;
 1503+
 1504+# # print $out_day "$lang $title $counts\n" ;
 1505+# print OUT "$lang $title $counts\n" ;
 1506+# }
 1507+
 1508+# print "Close files\n" ;
 1509+# $in_hour -> close () ;
 1510+## $out_day -> close () ;
 1511+# close (OUT) ;
 1512+# $result = `$path_7z a $file_out $file_txt` ;
 1513+# print $result ;
 1514+#}
 1515+
 1516+
 1517+
 1518+# test (partial) reversibility of process
 1519+#sub UncompactVisitorStats
 1520+#{
 1521+# my $file_in = "out/2009-03/pagecounts-20090301_fdt" ;
 1522+# my $dir_out = "out" ;
 1523+# # $in_hour = IO::Uncompress::Gunzip->new ($file_in) || &Abort ("IO::Uncompress::Gunzip failed for '$file_in': $GunzipError\n") ;
 1524+# open $in_hour, '<', $file_in ;
 1525+# binmode $in_hour ;
 1526+
 1527+# for ($h=0 ; $h<=23 ; $h++)
 1528+# {
 1529+# $time = sprintf ("%02d",$h) . "0000" ;
 1530+## $file_out = "$dir_out/pagecounts-20090301-$time.gz" ;
 1531+# $file_out = "$dir_out/pagecounts-20090301-$time" ;
 1532+# open $out_day [$h], '>', $file_out ;
 1533+## $out_day [$h] = IO::Compress::Gzip->new ($file_out) || &Abort ("IO::Compress::Gzip failed: $GzipError\n");
 1534+# binmode $out_day [$h] ;
 1535+# }
 1536+
 1537+# while ($line = <$in_hour>)
 1538+# {
 1539+# next if $line =~ /^#/ ;
 1540+# next if $line =~ /^@/ ;
 1541+# chomp ($line) ;
 1542+## print "$line\n" ;
 1543+# if ($lines++ > 10000) { exit ; }
 1544+# ($lang,$title,$counts) = split (" ", $line) ;
 1545+# $lang =~ s/\.z// ;
 1546+# $lang =~ s/\.y/2/ ;
 1547+# $counts =~ s/^\d+// ; # remove (redundant) preceding total
 1548+# while ($counts ne "")
 1549+# {
 1550+# $letter = substr ($counts,0,1) ;
 1551+# $counts = substr ($counts,1) ;
 1552+# ($count = $counts) =~ s/^(\d+).*$/$1/ ;
 1553+# $counts =~ s/^\d+(.*)$/$1/ ;
 1554+# $h = ord ($letter) - ord ('A') ;
 1555+# $file = $out_day [$h] ;
 1556+# $writes {$h} ++ ;
 1557+# print $file "$lang $title $count\n" ;
 1558+# }
 1559+
 1560+# }
 1561+
 1562+# for ($h=0 ; $h<=23 ; $h++)
 1563+# {
 1564+## $out_day [$h] -> close () ;
 1565+# close $out_day [$h] ;
 1566+# }
 1567+#}
 1568+
 1569+

Status & tagging log