Index: trunk/tools/googlestorage/gbackup-xml-lastruns.sh |
— | — | @@ -0,0 +1,133 @@ |
| 2 | +#!/bin/bash |
| 3 | + |
| 4 | +# This script finds the last complete run (with or without errors) for each |
| 5 | +# project and pushes the files out to Google storage. |
| 6 | +# |
| 7 | +# Usage: gbackup-xml-lastruns.sh [YYYYMMDD] |
| 8 | +# |
| 9 | +# Without the date, a bucket name is generated based on the current date. |
| 10 | +# Use an argument instead in order to resume backup from a previous run. |
| 11 | +# Make sure any partially uploaded files from that run are removed from |
| 12 | +# Google storage before resuming. |
| 13 | +# |
| 14 | +# NOTE: This script relies on gsutil, which can be downloaded here: |
| 15 | +# http://commondatastorage.googleapis.com/pub/gsutil.tar.gz |
| 16 | +# Make sure you have already initialized your gsutil credentials: |
| 17 | +# run the command "gsutil ls" from the command line and |
| 18 | +# when it prompts you for your access key and your secret key, |
| 19 | +# enter them. They are stored in your home directory in the config |
| 20 | +# file ".boto" |
| 21 | + |
# Root of the public dump tree; every entry directly below it is treated
# as a candidate project directory.
dumpbasepath="/data/xmldatadumps/public"
# Directory holding the per-run lists of dump directories to back up.
#backuplistsbasepath="/data/xmldatadumps/googlebackups"
backuplistsbasepath="/home/ariel/googlestorage/googlebackups"

# Space-separated expansion of everything directly under the dump root.
list=$(echo $dumpbasepath/*)
| 27 | + |
# Print the help text and terminate the script with exit status 1.
#
# Globals:  backuplistsbasepath (read; shown in the help text)
# Outputs:  the help text, on stderr (this is only reached on error paths)
# Returns:  never returns; exits the shell with status 1
usage() {
    printf '%s\n' \
        "Usage: $0 [rundate]" \
        "The optional argument rundate may be specified in order to" \
        "resume backups to google of a previous set of complete runs." \
        "The argument should be in UTC time, YYYYMMDD format" \
        "" \
        "For example:" \
        "$0 20101215" \
        "" \
        "If you want to resume a previous backup you should also remove any partially" \
        "uploaded files before resuming, as this script will refuse to re-upload any" \
        "already existing files." \
        "" \
        "Without the argument, a new list of most recently completed dumps (the most recent" \
        "for each public wiki project) will be generated, stored in $backuplistsbasepath/" \
        "with a name based on the day this script was run, and subsequently all dumps in" \
        "that list will be uploaded to google in a new bucket with that date." \
        "" \
        "Note that even if a copy of a dump already exists in a previous bucket, if it's" \
        "the most recent complete dump for that project we will copy it again; this makes" \
        "things easier for the user and for us if we ever need to retrieve the data." >&2
    exit 1
}
| 51 | + |
# Append the most recent complete run directory of one project to the
# backup list.
#
# Run directories are visited in reverse name order and the first one
# whose status.html contains "Dump complete" wins.  We cannot rely on
# timestamps: sometimes a phase of an earlier dump is rerun and finishes
# after a later dump, or two runs of the same project are in progress at
# once in different phases.
#
# Globals:   dumpbasepath (read), listtobackup (read; file appended to),
#            project (read when no argument is given)
# Arguments: $1 - project directory (optional, defaults to $project)
# Returns:   0 always; nothing is appended when no complete run exists
listlatestdumpforproject() {
    local projectdir="${1:-$project}"
    local -a candidates
    local rundir i

    # the bad and archive dirs won't hold good runs; decide this once
    # instead of on every iteration
    if [[ "$projectdir" == "$dumpbasepath/bad" || "$projectdir" == "$dumpbasepath/archive" ]]; then
        return 0
    fi

    # glob instead of parsing ls, so names with whitespace stay intact;
    # the expansion is name-sorted and we walk it backwards (newest first)
    candidates=("$projectdir"/*)
    for (( i = ${#candidates[@]} - 1; i >= 0; i-- )); do
        rundir="${candidates[i]}"
        # skip anything with "latest" in its name
        [[ "${rundir##*/}" == *latest* ]] && continue
        # we have a bunch of cruft in there like index files, etc.
        # also a bunch of things that aren't dump dirs, like
        # tools, mw, static...
        [[ -d "$rundir" ]] || continue
        # a missing status.html simply means "not complete"
        if grep -q "Dump complete" "$rundir/status.html" 2>/dev/null; then
            printf '%s\n' "$rundir" >> "$listtobackup"
            return 0
        fi
    done
    return 0
}
| 75 | + |
# (Re)create the backup list file and fill it with the latest complete
# run of every entry directly under the dump root.
#
# Globals:  dumpbasepath (read), listtobackup (read; file is truncated),
#           project (written; consumed by listlatestdumpforproject)
# Outputs:  a confirmation line on stdout, or an error on stderr
# Returns:  0 on success, 1 if the list file cannot be created
createbackuplist() {
    # start from an empty list; stop here instead of claiming success
    # when the file cannot be written
    if ! true > "$listtobackup"; then
        echo "cannot create backup list $listtobackup" >&2
        return 1
    fi
    # iterate over the glob directly so names with whitespace survive
    for project in "$dumpbasepath"/*; do
        listlatestdumpforproject
    done
    echo "created backup list $listtobackup"
}
| 85 | + |
# Decide whether the current file is already present in the bucket.
#
# The file counts as already backed up only when "gsutil ls" succeeds and
# its output does not contain InvalidUriError.  Any other gsutil failure
# (command missing, credentials, network...) is reported and the file is
# treated as not backed up, so a broken gsutil can no longer make the
# script silently skip everything.
#
# Globals:  googlebucketname, googlefilename (read); alreadydone (written:
#           1 = already in the bucket, 0 = needs uploading)
# Outputs:  a warning on stderr when gsutil fails unexpectedly
checkifbackedupalready() {
    local lsoutput lsstatus

    lsoutput=$(gsutil ls "$googlebucketname/$googlefilename" 2>&1)
    lsstatus=$?

    if [[ "$lsoutput" == *InvalidUriError* ]]; then
        # this is the answer the script has always taken to mean "not there"
        alreadydone=0
    elif (( lsstatus != 0 )); then
        echo "warning: gsutil ls failed for $googlebucketname/$googlefilename: $lsoutput" >&2
        alreadydone=0
    else
        alreadydone=1
    fi
}
| 94 | + |
# Work out which run we are handling.  With an argument we resume that
# run; without one we start a new run named after today's UTC date.
if [ -n "$1" ]; then
    rundate="$1"
else
    rundate=$(date -u +%Y%m%d)
fi
googlebucketname="gs://wikimedia-xmldumps-completeruns-$rundate"
listtobackup="$backuplistsbasepath/backtheseup-$rundate.txt"

if [ -n "$1" ]; then
    # resuming: both the bucket and the saved list must already exist.
    # The bucket check relies on gsutil saying "Not Found" for a missing one.
    if gsutil ls "$googlebucketname" 2>&1 | grep -q "Not Found"; then
        echo "No such google bucket $googlebucketname" >&2
        usage
    fi
    if [ ! -e "$listtobackup" ]; then
        echo "no list of things to back up for the run date $rundate" >&2
        echo "Are you sure this is a backup to be resumed?" >&2
        usage
    fi
else
    # new run: make the bucket, then build the list of dumps to upload
    if ! gsutil mb "$googlebucketname"; then
        # not fatal: the bucket may be left over from an earlier run today
        echo "warning: could not create $googlebucketname (it may already exist), continuing" >&2
    fi
    # without a list there is nothing to upload
    createbackuplist || exit 1
fi
| 116 | + |
# Upload every file of every dump directory named in a list file.
#
# Files are pushed one at a time; gsutil cp -R would not give us quite
# the right object names.  The object name is the file's path relative to
# the dump root.  Files that checkifbackedupalready reports as present
# are skipped.
#
# Globals:   dumpbasepath, googlebucketname (read); googlefilename
#            (written for checkifbackedupalready); alreadydone (read)
# Arguments: $1 - file with one dump directory per line
# Outputs:   one error per failed upload plus a summary, on stderr
# Returns:   0 if every needed upload succeeded, 1 otherwise
uploadlisteddumps() {
    local listfile="$1"
    local rundir f failed=0

    if [[ ! -r "$listfile" ]]; then
        echo "cannot read backup list $listfile" >&2
        return 1
    fi

    # the list is read on fd 3 so that gsutil keeps the script's own stdin
    # and cannot swallow the remaining lines of the list
    while IFS= read -r rundir <&3 || [[ -n "$rundir" ]]; do
        # an empty line would otherwise expand to /*
        [[ -n "$rundir" ]] || continue
        for f in "$rundir"/*; do
            # an empty directory leaves the pattern itself behind
            [[ -e "$f" ]] || continue
            # put together the right filename for google
            googlefilename="${f#"$dumpbasepath"/}"
            checkifbackedupalready
            if [ "$alreadydone" -eq "0" ]; then
                # push it over
                if ! gsutil cp -a "public-read" "$f" "$googlebucketname/$googlefilename"; then
                    echo "failed to copy $f to $googlebucketname/$googlefilename" >&2
                    failed=$((failed + 1))
                fi
            fi
        done
    done 3< "$listfile"

    if (( failed > 0 )); then
        echo "warning: $failed file(s) could not be copied to $googlebucketname" >&2
        return 1
    fi
    return 0
}

# ok now we have a list of directories and projects to back up.
uploadlisteddumps "$listtobackup"
echo "finished copy to google of directories in $listtobackup"
Property changes on: trunk/tools/googlestorage/gbackup-xml-lastruns.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 135 | + native |
Added: svn:executable |
2 | 136 | + * |