Index: trunk/wikistats/analytics/analytics_generate_csv_files.sh |
— | — | @@ -1,21 +1,47 @@ |
2 | 2 | #!/bin/sh |
| 3 | + |
| 4 | +# Prepare several csv files, ready for importing into the analytics database |
| 5 | +# All generated files have _in_ in their name, signalling that they contain data ready for importing into the database |
| 6 | +# One input record corresponds to one database record |
| 7 | + |
3 | 8 | ulimit -v 8000000 |
4 | 9 | |
5 | 10 | clear |
6 | 11 | cd /a/analytics |
7 | 12 | |
8 | | -perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/test/ |
9 | 13 | |
10 | | -# add or replace data from newest comScore csv files (last 14 months) into master files (full history) |
11 | | -# and generate database input csv file from it |
| 14 | +# AnalyticsPrepBinariesData.pl reads counts for binaries which were generated by wikistats |
| 15 | +# and which reside in /a/wikistats/csv_[project code]/StatisticsPerBinariesExtension.csv |
| 16 | +# It filters and reorganizes data and produces analytics_in_binaries.csv |
| 17 | +# Output csv contains: project code, language, month, extension name, count |
12 | 18 | |
| 19 | +perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/ |
| 20 | + |
| 21 | +# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for the newest comScore csv files (with data for the last 14 months), |
| 22 | +# parses those csv files, adds/replaces their data in the master files (containing the full history) |
| 23 | +# and generates the input csv file analytics_in_comscore.csv, ready for importing into the database |
| 24 | +# |
| 25 | +# note: these csv files were manually downloaded from http://mymetrix.comscore.com/app/report.aspx |
| 26 | +# and given more descriptive names; the script finds the newest files based on a partial name search (sketched below, after this diff) |
| 27 | +# |
13 | 28 | # -r replace (default is add only) |
14 | | -# -i input folder, contains manually downloaded csv files from comScore (or xls, converted to csv) |
| 29 | +# -i input folder, contains manually downloaded csv files from comScore (or xls files manually converted to csv) |
15 | 30 | # -m master files with full history |
16 | | -# -o output csv file, with reach and UV's per region and UV's per top web property, ready for import into database |
| 31 | +# -o output csv file, with reach per region, UV's per region and UV's per top web property, ready for import into the database |
| 32 | + |
17 | 33 | perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics |
18 | 34 | |
| 35 | +# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv files produced by the wikistats process |
| 36 | +# It filters and reorganizes data and produces analytics_in_wikistats.csv, ready for import into the analytics database |
| 37 | + |
19 | 38 | perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics |
20 | 39 | |
| 40 | +# analytics_in_page_views.csv is written daily by WikiCountsSummarizeProjectCounts.pl |
| 41 | +# (part of the /home/ezachte/pageviews_monthly.sh job), |
| 42 | +# which processes hourly projectcounts files (per-wiki page view totals for one hour) from http://dammit.lt/wikistats |
| 43 | +# and generates several files on different aggregation levels |
| 44 | +# The only action here is to copy the file to this folder, to have everything in one place |
| 45 | +# note: although the folder name csv_wp suggests otherwise, this file contains stats for all projects |
| 46 | + |
21 | 47 | cp /a/wikistats/csv_wp/analytics_in_page_views.csv . |
22 | 48 | |
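Note on the partial name search mentioned in the comScore comments above: a minimal Perl sketch of how the newest-file lookup could work, assuming the script matches on a name fragment and picks the most recently modified file. The directory and fragment are illustrative; this is one plausible reading of "newest", not the actual AnalyticsPrepComscoreData.pl code, which may instead parse a date embedded in the descriptive file names.

use strict ;
use warnings ;

# Hypothetical helper: return the newest file in $dir whose name contains $fragment
sub NewestMatchingFile
{
  my ($dir, $fragment) = @_ ;
  opendir my $dh, $dir or die "Cannot open $dir: $!" ;
  my @matches = grep { /\Q$fragment\E/ && -f "$dir/$_" } readdir $dh ;
  closedir $dh ;
  # -M gives file age in days, so an ascending sort puts the newest file first
  my ($newest) = sort { -M "$dir/$a" <=> -M "$dir/$b" } @matches ;
  return defined ($newest) ? "$dir/$newest" : undef ;
}

# e.g. my $file = NewestMatchingFile ('/a/analytics/comscore', 'Reach') ;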
Index: trunk/wikistats/analytics/analytics_upd.sh |
— | — | @@ -3,4 +3,4 @@ |
4 | 4 | |
5 | 5 | ./analytics_generate_csv_files.sh |
6 | 6 | |
7 | | -mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt |
\ No newline at end of file |
| 7 | +mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt |
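Per the readme below, analytics_refresh_from_csv.txt empties and reloads every table for which a csv file exists in this folder. A minimal Perl/DBI sketch of that empty-and-reload pattern follows; the database name, table name and csv file are assumptions (the actual statements live in analytics_refresh_from_csv.txt, executed by the mysql call above).

use strict ;
use warnings ;
use DBI ;

# Credentials mirror the mysql call above; the database name is an assumption
# mysql_local_infile=1 is required for LOAD DATA LOCAL INFILE
my $dbh = DBI->connect ("DBI:mysql:database=analytics;host=project2.wikimedia.org;mysql_local_infile=1",
                        "analytics", "report", { RaiseError => 1 }) ;

# Hypothetical table/file pair; the real sql file repeats this for each csv file
$dbh->do ("TRUNCATE TABLE comscore") ;
$dbh->do ("LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' INTO TABLE comscore FIELDS TERMINATED BY ','") ;

$dbh->disconnect ;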
Index: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl |
— | — | @@ -47,6 +47,7 @@ |
48 | 48 | |
49 | 49 | sub ParseArguments |
50 | 50 | { |
| 51 | + print "ParseArguments\n" ; |
51 | 52 | my (@options, $arguments) ; |
52 | 53 | |
53 | 54 | getopt ("io", \%options) ; |
— | — | @@ -80,6 +81,7 @@ |
81 | 82 | |
82 | 83 | sub ReadStatisticsMonthly |
83 | 84 | { |
| 85 | + print "ReadStatisticsMonthly\n" ; |
84 | 86 | &ReadStatisticsMonthlyForProject ("wb") ; |
85 | 87 | &ReadStatisticsMonthlyForProject ("wk") ; |
86 | 88 | &ReadStatisticsMonthlyForProject ("wn") ; |
— | — | @@ -252,6 +254,7 @@ |
253 | 255 | |
254 | 256 | sub FindLargestWikis |
255 | 257 | { |
| 258 | + print "FindLargestWikis\n" ; |
256 | 259 | print "Largest projects (most accumulated very active editors):\n"; |
257 | 260 | @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ; |
258 | 261 | $rank = 0 ; |
— | — | @@ -269,18 +272,18 @@ |
270 | 273 | foreach $project_language (keys %largest_projects) |
271 | 274 | { |
272 | 275 | ($project,$language) = split (',', $project_language) ; |
273 | | - if ($data2 {"$project,$language,$yyyymm"} eq '') |
| 276 | + if ($data2 {"$project,$language,$yyyymm"} eq '') |
274 | 277 | { |
275 | 278 | print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ; |
276 | 279 | $months {$yyyymm} = 0 ; |
277 | 280 | } |
278 | 281 | } |
279 | 282 | } |
280 | | - exit ; |
281 | 283 | } |
282 | 284 | |
283 | 285 | sub WriteMonthlyData |
284 | 286 | { |
| 287 | + print "WriteMonthlyData\n" ; |
285 | 288 | my $file_csv_out = "$path_out/$file_csv_analytics_in" ; |
286 | 289 | open CSV_OUT, '>', $file_csv_out ; |
287 | 290 | foreach $project_wiki_month (sort keys %data1) |
Index: trunk/wikistats/analytics/_readme.txt |
— | — | @@ -6,7 +6,7 @@ |
7 | 7 | |
8 | 8 | == analytics_upd.sh == |
9 | 9 | Prepares new csv files (delegated to analytics_generate_csv_files.sh), |
10 | | -and empties and reloads all tables for which csv files are in this folder. |
| 10 | +and empties/reloads all tables for which csv files are in this folder. |
11 | 11 | It executes SQL from analytics_refresh_from_csv.txt |
12 | 12 | |
13 | 13 | == CSV files == |