r109182 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r109181‎ | r109182 | r109183 >
Date:18:50, 17 January 2012
Author:ezachte
Status:deferred
Tags:
Comment:
DammitSyncProjectCounts supersedes !DammitSyncFiles, complete overhaul and simplification because of changed data stream (no more wget's needed)
Modified paths:
  • /trunk/wikistats/dammit.lt/!DammitSyncFiles.pl (added) (history)
  • /trunk/wikistats/dammit.lt/DammitSyncFiles.pl (deleted) (history)
  • /trunk/wikistats/dammit.lt/DammitSyncProjectCounts.pl (added) (history)

Diff [purge]

Index: trunk/wikistats/dammit.lt/DammitSyncFiles.pl
@@ -1,206 +0,0 @@
2 -#!/usr/bin/perl
3 -
4 -# 27 April 2010 renamed from WikiStatsDammitSync.pl
5 -
6 - use Time::Local ;
7 - use Archive::Tar;
8 -
9 - $tar = Archive::Tar->new;
10 -
11 - $| = 1; # flush screen output
12 -
13 - $maxdaysago = 40; # do not download files more than this ago
14 -
15 - if (-e "a_dammit.lt_index.html") # test
16 - { $file_html = "a_dammit.lt_index.html" ; }
17 - else
18 - {
19 - open LOG, '>>', "/a/dammit.lt/WikiStatsDammitSync.log" ;
20 -
21 - $file_html = "/a/dammit.lt/index.html" ;
22 - unlink $file_html ;
23 - $cmd = "wget -O $file_html http://dammit.lt/wikistats/" ;
24 - $result = `$cmd` ;
25 - if ($result == 0)
26 - { $result = "OK" ; }
27 - &Log ("Cmd '$cmd' -> $result \n\n") ;
28 -
29 - if (! -e $file_html) { &Abort ("File $file_html not found") ; }
30 - if (-s $file_html == 0) { &Abort ("File $file_html empty") ; }
31 - }
32 -
33 - $timestart = time ;
34 -
35 - chdir "/a/dammit.lt/projectcounts" ;
36 - $cmd = `pwd` ;
37 - &Log ("Cmd '$cmd'\n") ;
38 - $result = `$cmd` ;
39 - print "$result\n" ;
40 -
41 - open HTML,'<',$file_html ;
42 - while ($line = <HTML>)
43 - {
44 - if ($line =~ /<title>/)
45 - {
46 - $subdir = "" ;
47 - if ($line =~ /archive/)
48 - {
49 - $line =~ s/^.*?\/wikistats\/// ;
50 - $line =~ s/<.*$// ;
51 - chomp $line ;
52 - $subdir = $line ;
53 - }
54 - &Log ("Subdir = '$subdir'\n") ;
55 - next ;
56 - }
57 -
58 - if ($line !~ /application\/octet-stream/) { next ; }
59 -
60 - ($file = $line) =~ s/^.*?a href=\"([^"]+)\".*$/$1/s ;
61 - ($date = $line) =~ s/^.*?class=\"m\">([^<]+)<.*$/$1/s ;
62 - ($date,$time) = split (' ', $date) ;
63 -
64 - if ($file =~ /^pagecounts/)
65 - {
66 - $yy = substr ($file,11,4) ;
67 - $mm = substr ($file,15,2) ;
68 - $dd = substr ($file,17,2) ;
69 - $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;
70 -
71 - print "$file: $daysago days ago\n" ;
72 - if ($daysago > $maxdaysago) { next ; }
73 -
74 - # $path_7z = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_fdt.7z" ;
75 - # if (-e $path_7z) { print "exists\n" ; next ; }
76 -
77 - $path = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_h" ;
78 - if ((-e "$path.7z") || (-e "$path.zip") || (-e "$path.bz2") || (-e "$path.gz"))
79 - { print "$path.[7z|zip|bz2|gz] exists\n" ; next ; }
80 - else
81 - { print "$path.[7z|zip|bz2|gz] new -> download\n" ; }
82 - }
83 -
84 - # if ($file =~ /^projectcounts/)
85 - # {
86 - # $yy = substr ($file,14,4) ;
87 - # $mm = substr ($file,18,2) ;
88 - # $dd = substr ($file,20,2) ;
89 - # $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;
90 - # if ($daysago > $maxdaysago) { next ; }
91 - # }
92 -
93 -
94 - $yy = substr ($date,0,4) ;
95 - $mm = substr ($date,5,3) ;
96 - $dd = substr ($date,9,2) ;
97 - $hh = substr ($time,0,2) ;
98 - $nn = substr ($time,3,2) ;
99 - $ss = substr ($time,6,2) ;
100 -
101 - if ($mm eq 'Jan') { $mm = 1 ; }
102 - elsif ($mm eq 'Feb') { $mm = 2 ; }
103 - elsif ($mm eq 'Mar') { $mm = 3 ; }
104 - elsif ($mm eq 'Apr') { $mm = 4 ; }
105 - elsif ($mm eq 'May') { $mm = 5 ; }
106 - elsif ($mm eq 'Jun') { $mm = 6 ; }
107 - elsif ($mm eq 'Jul') { $mm = 7 ; }
108 - elsif ($mm eq 'Aug') { $mm = 8 ; }
109 - elsif ($mm eq 'Sep') { $mm = 9 ; }
110 - elsif ($mm eq 'Oct') { $mm = 10 ; }
111 - elsif ($mm eq 'Nov') { $mm = 11 ; }
112 - elsif ($mm eq 'Dec') { $mm = 12 ; }
113 - else { &Abort ("Invalid month '$mm' in file date $date $time") ; }
114 -
115 - $date2 = sprintf ("%02d%02d%02d%02d%02d.%02d", ($yy-2000), $mm, $dd, $hh, $nn, $ss) ;
116 -
117 - if ($file =~ /^(?:page|project)counts-2/)
118 - {
119 -
120 - if ($file =~ /^pagecounts/)
121 - { $path = "/a/dammit.lt/pagecounts/$file" ; }
122 - else
123 - { $path = "/a/dammit.lt/projectcounts/$file" ; }
124 -
125 - if (-e $path)
126 - {
127 - &Log ("File $path exists\n") ;
128 - if (-s $path == 0)
129 - {
130 - &Log ("File $path empty -> overwrite\n") ;
131 - unlink $path ;
132 - }
133 - else { next ; }
134 - }
135 -
136 - if ($file =~ /^projectcounts/)
137 - {
138 - $tar_file = "/a/dammit.lt/projectcounts/projectcounts-$yy.tar" ;
139 - if (-e $tar_file)
140 - {
141 - if ($tar_file ne $tar_file_prev)
142 - {
143 - &Log ("\nRead tar file $tar_file\n") ;
144 - $tar->read($tar_file);
145 - $tar_file_prev = $tar_file ;
146 - }
147 - if ($tar->contains_file ($file))
148 - {
149 - &Log ("File $file exists in tar file $tar_file\n") ;
150 - next ;
151 - }
152 - }
153 - else
154 - { &Log ("Tar file $tar_file not found\n") ; }
155 - }
156 -
157 - &Log ("Write file $path, set date $date2\n") ;
158 -
159 - $cmd = "wget -a /a/dammit.lt/wget.log -O $path http://dammit.lt/wikistats/$subdir$file" ;
160 - $result = `$cmd` ;
161 - if ((-e $path) && (-s $path == 0))
162 - {
163 - $result = "Empty file -> remove $path" ;
164 - unlink $path ;
165 - }
166 - elsif ($result == 0)
167 - { $result = "OK" ; }
168 -
169 - &Log ("Cmd '$cmd' -> $result \n\n") ;
170 -
171 - if (-e $path)
172 - {
173 - `touch $path -t $date2` ;
174 -
175 - if ($file =~ /^projectcounts/)
176 - {
177 - $cmd = "tar --append --file=$tar_file $file" ;
178 - &Log ("Cmd '$cmd'\n") ;
179 - $result = `$cmd` ;
180 - print "$result\n" ;
181 - unlink $path ;
182 - }
183 - }
184 - }
185 - }
186 -
187 - &Log ("Ready in " . (time - $timestart) . " sec.\n") ;
188 - close HTML ;
189 - close LOG ;
190 - exit ;
191 -
192 -sub Log
193 -{
194 - $msg = shift ;
195 - my ($ss, $nn, $hh) = (localtime(time))[0,1,2] ;
196 - my $time = sprintf ("%02d:%02d:%02d", $hh, $nn, $ss) ;
197 - $msg = "$time $msg" ;
198 - print $msg ;
199 - print LOG $msg ;
200 -}
201 -
202 -sub Abort
203 -{
204 - $msg = shift ;
205 - &Log ($msg) ;
206 - exit ;
207 -}
Index: trunk/wikistats/dammit.lt/DammitSyncProjectCounts.pl
@@ -0,0 +1,125 @@
 2+#!/usr/bin/perl
 3+
 4+# November 2011: this file is a much simplified version of DammitSyncFiles.pl
 5+# DammitSyncFiles.pl used to wget new pagecounts and projectcounts files from dammit.lt/wikistats each day
 6+# These files are now on a WMF server. Only the small projectcounts files are copied and stored in a tar file. Pagecounts files no longer need to be copied.
 7+# This provides a compact archive of all files, and allows versioning (after patching certain ranges of files for server underreporting)
 8+
 9+ use Time::Local ;
 10+ use Archive::Tar;
 11+
 12+ $tar = Archive::Tar->new;
 13+
 14+ $| = 1; # flush screen output
 15+
 16+ $timestart = time ;
 17+
 18+ $dir_tars = "/a/dammit.lt/projectcounts" ;
 19+ $dir_dumps = "/mnt/data/xmldatadumps/public/other/pagecounts-raw" ;
 20+ $dir_archive = "/mnt/data/xmldatadumps/public/other/pagecounts-ez/projectcounts" ;
 21+
 22+ if (! -d $dir_tars)
 23+ { &Abort ("Folder not found: '$dir_tars'\n") ; }
 24+ if (! -d $dir_dumps)
 25+ { &Abort ("Folder not found: '$dir_dumps'\n") ; }
 26+
 27+ ($month,$year) = (gmtime(time))[4,5];
 28+ $year += 1900;
 29+ $month ++ ;
 30+ $this_month = sprintf ("%04d/%04d-%02d", $year, $year, $month) ;
 31+ $month -- ;
 32+ if ($month == 0)
 33+ { $month = 12 ; $year -- ; }
 34+ $prev_month = sprintf ("%04d/%04d-%02d", $year, $year, $month) ;
 35+
 36+ &GetProjectCounts ($prev_month) ;
 37+ &GetProjectCounts ($this_month) ;
 38+ &ArchiveTars ;
 39+
 40+ &Log ("Ready in " . (time - $timestart) . " sec.\n") ;
 41+ exit ;
 42+
 43+sub GetProjectCounts
 44+{
 45+ my ($yyyy_yyyy_mm) = @_ ;
 46+ my $year = substr ($yyyy_yyyy_mm,0,4) ;
 47+ my $month = substr ($yyyy_yyyy_mm,10,2) ;
 48+
 49+ print "GetProjectCounts for $year - $month\n" ;
 50+
 51+ $tar_file = "$dir_tars/projectcounts-$year.tar" ;
 52+
 53+ if (-e $tar_file)
 54+ {
 55+ if ($tar_file ne $tar_file_prev)
 56+ {
 57+ &Log ("\nRead tar file $tar_file\n") ;
 58+ $tar->read($tar_file);
 59+ $tar_file_prev = $tar_file ;
 60+ }
 61+ }
 62+ else
 63+ { &Log ("Tar file $tar_file not found\n") ; }
 64+
 65+ my $dir_files = "$dir_dumps/$year/$year-$month" ;
 66+ if (! -d $dir_files)
 67+ { &Abort ("Folder not found: '$dir_files'") ; }
 68+
 69+ chdir $dir_files || &Abort ("Could not change to dir '$dir_files'") ;
 70+
 71+ @files = <*>;
 72+ foreach $file (sort @files)
 73+ {
 74+ next if ! -e $file ;
 75+ next if $file !~ /^projectcounts/ ;
 76+ &GetFile ($tar_file,$dir_files, $file) ;
 77+ $last_file_added = $file ;
 78+ }
 79+}
 80+
 81+sub GetFile
 82+{
 83+ my ($tar_file, $dir_files, $file) = @_ ;
 84+
 85+ if ($tar->contains_file ($file))
 86+ {
 87+ # &Log ("File $file exists in tar file $tar_file\n") ;
 88+ return ;
 89+ }
 90+
 91+ &Log ("Add new file $file to $tar_file\n") ;
 92+
 93+ $cmd = "tar --append --file=$tar_file $file" ;
 94+ &Log ("Cmd '$cmd'\n") ;
 95+ $result = `$cmd` ;
 96+ print "$result\n" ;
 97+}
 98+
 99+sub ArchiveTars
 100+{
 101+ $cmd = "rsync -av $dir_tars/projectcounts-20??.tar $dir_archive" ;
 102+ &Log ("Cmd '$cmd'\n") ;
 103+ $result = `$cmd` ;
 104+ print "$result\n" ;
 105+
 106+ open LAST, '>', "$dir_archive/most_recent_file.txt" ;
 107+ print LAST $last_file_added ;
 108+ close LAST ;
 109+}
 110+
 111+sub Log
 112+{
 113+ $msg = shift ;
 114+ my ($ss, $nn, $hh) = (localtime(time))[0,1,2] ;
 115+ my $time = sprintf ("%02d:%02d:%02d", $hh, $nn, $ss) ;
 116+ $msg = "$time $msg" ;
 117+ print $msg ;
 118+ print LOG $msg ;
 119+}
 120+
 121+sub Abort
 122+{
 123+ $msg = shift ;
 124+ &Log ("\nError: $msg\n\n") ;
 125+ exit ;
 126+}
Index: trunk/wikistats/dammit.lt/!DammitSyncFiles.pl
@@ -0,0 +1,211 @@
 2+#!/usr/bin/perl
 3+
 4+# 27 April 2010 renamed from WikiStatsDammitSync.pl
 5+
 6+# Up till October 2011: copy new files from dammit.lt/wikistats
 7+# Starting November 2011: copy new files directly from WMF server
 8+# Later maybe don't copy the large pagecounts files at all, but rather process these files directly from the other server.
 9+# For now do copy files to this server, until other jobs are ready to accept the new location (and to not unlink files after the compression step etc).
 10+
 11+ use Time::Local ;
 12+ use Archive::Tar;
 13+
 14+ $tar = Archive::Tar->new;
 15+
 16+ $| = 1; # flush screen output
 17+
 18+ $maxdaysago = 40; # do not download files more than this ago
 19+
 20+ if (-e "a_dammit.lt_index.html") # test
 21+ { $file_html = "a_dammit.lt_index.html" ; }
 22+ else
 23+ {
 24+ open LOG, '>>', "/a/dammit.lt/WikiStatsDammitSync.log" ;
 25+
 26+ $file_html = "/a/dammit.lt/index.html" ;
 27+ unlink $file_html ;
 28+ $cmd = "wget -O $file_html http://dammit.lt/wikistats/" ;
 29+ $result = `$cmd` ;
 30+ if ($result == 0)
 31+ { $result = "OK" ; }
 32+ &Log ("Cmd '$cmd' -> $result \n\n") ;
 33+
 34+ if (! -e $file_html) { &Abort ("File $file_html not found") ; }
 35+ if (-s $file_html == 0) { &Abort ("File $file_html empty") ; }
 36+ }
 37+
 38+ $timestart = time ;
 39+
 40+ chdir "/a/dammit.lt/projectcounts" ;
 41+ $cmd = `pwd` ;
 42+ &Log ("Cmd '$cmd'\n") ;
 43+ $result = `$cmd` ;
 44+ print "$result\n" ;
 45+
 46+ open HTML,'<',$file_html ; # index page: downloaded copy, or local test copy (see start of script)
 47+ while ($line = <HTML>) # one line of the index page per iteration
 48+ {
 49+ if ($line =~ /<title>/) # title line: determines $subdir used in the download url of the file lines that follow
 50+ {
 51+ $subdir = "" ;
 52+ if ($line =~ /archive/)
 53+ {
 54+ $line =~ s/^.*?\/wikistats\/// ; # keep only what follows '/wikistats/' ..
 55+ $line =~ s/<.*$// ; # .. up to the next tag
 56+ chomp $line ;
 57+ $subdir = $line ;
 58+ }
 59+ &Log ("Subdir = '$subdir'\n") ;
 60+ next ;
 61+ }
 62+
 63+ if ($line !~ /application\/octet-stream/) { next ; } # only lines with this text are treated as file entries
 64+
 65+ ($file = $line) =~ s/^.*?a href=\"([^"]+)\".*$/$1/s ; # file name = target of first href on the line
 66+ ($date = $line) =~ s/^.*?class=\"m\">([^<]+)<.*$/$1/s ; # text of the element with class "m"
 67+ ($date,$time) = split (' ', $date) ; # first two whitespace separated parts
 68+
 69+ if ($file =~ /^pagecounts/)
 70+ {
 71+ $yy = substr ($file,11,4) ; # offset 11 = length of 'pagecounts-'; assumes yyyymmdd follows - TODO confirm
 72+ $mm = substr ($file,15,2) ;
 73+ $dd = substr ($file,17,2) ;
 74+ $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ; # whole days since 00:00:00 UTC on that date
 75+
 76+ print "$file: $daysago days ago\n" ;
 77+ if ($daysago > $maxdaysago) { next ; }
 78+
 79+ # $path_7z = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_fdt.7z" ;
 80+ # if (-e $path_7z) { print "exists\n" ; next ; }
 81+
 82+ $path = "/a/dammit.lt/pagecounts/$yy\-$mm/pagecounts\-$yy$mm$dd\_h" ;
 83+ if ((-e "$path.7z") || (-e "$path.zip") || (-e "$path.bz2") || (-e "$path.gz")) # compressed file for that day exists: skip download
 84+ { print "$path.[7z|zip|bz2|gz] exists\n" ; next ; }
 85+ else
 86+ { print "$path.[7z|zip|bz2|gz] new -> download\n" ; }
 87+ }
 88+
 89+ # if ($file =~ /^projectcounts/)
 90+ # {
 91+ # $yy = substr ($file,14,4) ;
 92+ # $mm = substr ($file,18,2) ;
 93+ # $dd = substr ($file,20,2) ;
 94+ # $daysago = int ((time - timegm(0,0,0,$dd,$mm-1,$yy-1900)) / (24 * 60 * 60)) ;
 95+ # if ($daysago > $maxdaysago) { next ; }
 96+ # }
 97+
 98+ # from here on $yy, $mm, $dd are taken from the date in the index page, no longer from the file name
 99+ $yy = substr ($date,0,4) ; # offsets presumably match a date like 'yyyy-Mon-dd' - TODO confirm against index page
 100+ $mm = substr ($date,5,3) ;
 101+ $dd = substr ($date,9,2) ;
 102+ $hh = substr ($time,0,2) ; # presumably 'hh:mm:ss' - TODO confirm
 103+ $nn = substr ($time,3,2) ;
 104+ $ss = substr ($time,6,2) ;
 105+
 106+ if ($mm eq 'Jan') { $mm = 1 ; }
 107+ elsif ($mm eq 'Feb') { $mm = 2 ; }
 108+ elsif ($mm eq 'Mar') { $mm = 3 ; }
 109+ elsif ($mm eq 'Apr') { $mm = 4 ; }
 110+ elsif ($mm eq 'May') { $mm = 5 ; }
 111+ elsif ($mm eq 'Jun') { $mm = 6 ; }
 112+ elsif ($mm eq 'Jul') { $mm = 7 ; }
 113+ elsif ($mm eq 'Aug') { $mm = 8 ; }
 114+ elsif ($mm eq 'Sep') { $mm = 9 ; }
 115+ elsif ($mm eq 'Oct') { $mm = 10 ; }
 116+ elsif ($mm eq 'Nov') { $mm = 11 ; }
 117+ elsif ($mm eq 'Dec') { $mm = 12 ; }
 118+ else { &Abort ("Invalid month '$mm' in file date $date $time") ; }
 119+
 120+ $date2 = sprintf ("%02d%02d%02d%02d%02d.%02d", ($yy-2000), $mm, $dd, $hh, $nn, $ss) ; # yymmddhhmm.ss, passed to 'touch -t' below
 121+
 122+ if ($file =~ /^(?:page|project)counts-2/) # other file names are ignored
 123+ {
 124+
 125+ if ($file =~ /^pagecounts/)
 126+ { $path = "/a/dammit.lt/pagecounts/$file" ; }
 127+ else
 128+ { $path = "/a/dammit.lt/projectcounts/$file" ; }
 129+
 130+ if (-e $path)
 131+ {
 132+ &Log ("File $path exists\n") ;
 133+ if (-s $path == 0) # empty file is removed and downloaded again
 134+ {
 135+ &Log ("File $path empty -> overwrite\n") ;
 136+ unlink $path ;
 137+ }
 138+ else { next ; }
 139+ }
 140+
 141+ if ($file =~ /^projectcounts/)
 142+ {
 143+ $tar_file = "/a/dammit.lt/projectcounts/projectcounts-$yy.tar" ; # $yy is the year of the index page date here
 144+ if (-e $tar_file)
 145+ {
 146+ if ($tar_file ne $tar_file_prev) # read tar file only when it differs from the one read last
 147+ {
 148+ &Log ("\nRead tar file $tar_file\n") ;
 149+ $tar->read($tar_file);
 150+ $tar_file_prev = $tar_file ;
 151+ }
 152+ if ($tar->contains_file ($file))
 153+ {
 154+ &Log ("File $file exists in tar file $tar_file\n") ;
 155+ next ;
 156+ }
 157+ }
 158+ else
 159+ { &Log ("Tar file $tar_file not found\n") ; }
 160+ }
 161+
 162+ &Log ("Write file $path, set date $date2\n") ;
 163+
 164+ $cmd = "wget -a /a/dammit.lt/wget.log -O $path http://dammit.lt/wikistats/$subdir$file" ;
 165+ $result = `$cmd` ; # NOTE(review): backticks yield the command's output, not its exit status
 166+ if ((-e $path) && (-s $path == 0))
 167+ {
 168+ $result = "Empty file -> remove $path" ;
 169+ unlink $path ;
 170+ }
 171+ elsif ($result == 0) # NOTE(review): numeric test on captured output - looks like the exit status ($?) was intended, confirm
 172+ { $result = "OK" ; }
 173+
 174+ &Log ("Cmd '$cmd' -> $result \n\n") ;
 175+
 176+ if (-e $path)
 177+ {
 178+ `touch $path -t $date2` ; # give local file the timestamp shown in the index page
 179+
 180+ if ($file =~ /^projectcounts/)
 181+ {
 182+ $cmd = "tar --append --file=$tar_file $file" ; # bare file name: relies on the chdir to /a/dammit.lt/projectcounts before this loop
 183+ &Log ("Cmd '$cmd'\n") ;
 184+ $result = `$cmd` ;
 185+ print "$result\n" ;
 186+ unlink $path ; # loose file is removed after the tar command, whether or not that succeeded
 187+ }
 188+ }
 189+ }
 190+ }
 191+
 192+ &Log ("Ready in " . (time - $timestart) . " sec.\n") ;
 193+ close HTML ;
 194+ close LOG ;
 195+ exit ;
 196+
 197+sub Log
 198+{
 199+ $msg = shift ;
 200+ my ($ss, $nn, $hh) = (localtime(time))[0,1,2] ;
 201+ my $time = sprintf ("%02d:%02d:%02d", $hh, $nn, $ss) ;
 202+ $msg = "$time $msg" ;
 203+ print $msg ;
 204+ print LOG $msg ;
 205+}
 206+
 207+sub Abort
 208+{
 209+ $msg = shift ;
 210+ &Log ($msg) ;
 211+ exit ;
 212+}

Status & tagging log