r78621 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r78620‎ | r78621 | r78622 >
Date:11:45, 20 December 2010
Author:ariel
Status:deferred
Tags:
Comment:
back up most recent xml dumps to google storage
Modified paths:
  • /trunk/tools/googlestorage (added) (history)
  • /trunk/tools/googlestorage/gbackup-xml-lastruns.sh (added) (history)

Diff [purge]

Index: trunk/tools/googlestorage/gbackup-xml-lastruns.sh
@@ -0,0 +1,133 @@
 2+#!/bin/bash
 3+
 4+# This script finds the last complete run (with or without errors) for each
 5+# project and pushes the files out to Google storage.
 6+#
 7+# Usage: gbackup-xml-lastruns.sh [YYYYMMDD]
 8+#
 9+# Without the date, a bucket name is generated based on the current date.
 10+# Use an argument instead in order to resume backup from a previous run.
 11+# Make sure any partially uploaded files from that run are removed from
 12+# Google storage before resuming.
 13+#
 14+# NOTE: This script relies on gsutil, which can be downloaded here:
 15+# http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
 16+# Make sure you have already initialized your gsutil credentials:
 17+# run the command "gsutil ls" from the command line and
 18+# when it prompts you for your access key and your secret key,
 19+# enter them. They are stored in your home directory in the config
 20+# file ".boto"
 21+
# Root of the public dump tree.
dumpbasepath='/data/xmldatadumps/public'

# Directory holding the per-run lists of dump directories to upload.
# Alternative location, currently disabled:
#backuplistsbasepath="/data/xmldatadumps/googlebackups"
backuplistsbasepath='/home/ariel/googlestorage/googlebackups'

# Space-separated expansion of every entry directly under the dump root.
list=$(echo $dumpbasepath/*)
 27+
# Print the help text on stdout, then terminate the script with status 1.
# Reads the global backuplistsbasepath to show where backup lists are stored.
usage() {
  cat <<EOF
Usage: $0 [rundate]
The optional argument rundate may be specified in order to
resume backups to google of a previous set of complete runs.
The argument should be in UTC time, YYYYMMDD format

For example:
$0 20101215

If you want to resume a previous backup you should also remove any partially
uploaded files before resuming, as this script will refuse to re-upload any
already existing files.

Without the argument, a new list of most recently completed dumps (the most recent
for each public wiki project) will be generated, stored in $backuplistsbasepath/
with a name based on the day this script was run, and subsequently all dumps in
that list will be uploaded to google in a new bucket with that date.

Note that even if a copy of a dump already exists in a previous bucket, if it's
the most recent complete dump for that project we will copy it again; this makes
things easier for the user and for us if we ever need to retrieve the data.
EOF
  exit 1
}
 51+
#######################################
# Append the most recent completed dump directory of one project to the
# backup list.
#
# A run counts as completed when its status.html contains the text
# "Dump complete". Run directories are examined from the last name to the
# first in sorted order, and only the first completed one is recorded.
# Timestamps are deliberately not used: a phase of an earlier dump may have
# been rerun and finished after a later dump, or two runs of the same
# project may be going at once in different phases.
#
# Globals:
#   project       (read)  path of the project directory to inspect
#   dumpbasepath  (read)  root of the dump tree, used to spot bad/archive
#   listtobackup  (read)  file that the chosen "<project>/<run>" line is
#                         appended to
# Arguments: none
# Returns:   0 always; a project without a completed run adds nothing
#######################################
listlatestdumpforproject() {
  local -a rundirs=()
  local rundir name i

  # skip the bad and archive dirs, those won't be good runs
  if [[ "$project" == "$dumpbasepath/bad" \
    || "$project" == "$dumpbasepath/archive" ]]; then
    return 0
  fi

  # we have a bunch of cruft in there like index files, etc.
  # also a bunch of things that aren't dump dirs, like
  # tools, mw, static... so only directories are collected. Globbing
  # (instead of parsing ls output) keeps names with whitespace intact.
  for rundir in "$project"/*/; do
    [[ -d "$rundir" ]] && rundirs+=("${rundir%/}")
  done

  # The glob expands in ascending order; walk it backwards so the newest
  # run name is checked first.
  for ((i = ${#rundirs[@]} - 1; i >= 0; i--)); do
    name=${rundirs[i]##*/}
    # any entry with "latest" in its name is never a run of its own
    [[ "$name" == *latest* ]] && continue
    if grep -q "Dump complete" "${rundirs[i]}/status.html" 2>/dev/null; then
      printf '%s\n' "$project/$name" >> "$listtobackup"
      break
    fi
  done
  return 0
}
 75+
#######################################
# Build a fresh backup list: empty the list file, then let
# listlatestdumpforproject add one line for each entry under the dump root.
#
# Globals:
#   dumpbasepath  (read)     root directory whose entries are the projects
#   listtobackup  (read)     path of the list file to (re)create
#   project       (written)  set to each entry in turn; read by
#                            listlatestdumpforproject
# Arguments: none
# Outputs:   confirmation message on stdout, error message on stderr
# Returns:   0 on success, 1 if the list file cannot be created
#######################################
createbackuplist() {
  # Start from an empty list; without a writable list file nothing below
  # could be recorded, so stop instead of reporting success.
  if ! true > "$listtobackup"; then
    echo "cannot create backup list $listtobackup" >&2
    return 1
  fi

  # Glob directly so that project paths containing whitespace stay intact.
  for project in "$dumpbasepath"/*; do
    # an unmatched glob is left as the literal pattern; skip it
    [[ -e "$project" ]] || continue
#   echo doing $project
    listlatestdumpforproject
  done
  echo "created backup list $listtobackup"
}
#######################################
# Decide whether a file is already present in the Google storage bucket.
#
# The file counts as present only when "gsutil ls" exits successfully and
# its output does not mention InvalidUriError. Every other outcome (missing
# object, credential or network trouble, ...) leaves it marked as not done,
# so a failed check can never cause a file to be silently left out of the
# backup.
#
# Globals:
#   googlebucketname  (read)     bucket URI
#   googlefilename    (read)     object name inside the bucket
#   alreadydone       (written)  1 if the object is present, 0 otherwise
# Arguments: none
#######################################
checkifbackedupalready() {
  local listing
  local status=0

  # Assignment is kept separate from "local" so the exit status of gsutil
  # is not masked.
  listing=$(gsutil ls "$googlebucketname/$googlefilename" 2>&1) || status=$?

  if ((status == 0)) && [[ "$listing" != *InvalidUriError* ]]; then
    alreadydone=1
  else
    alreadydone=0
  fi
}
 94+
# Work out which bucket and which backup list this invocation uses.
#
# With an argument we resume an earlier run: both the bucket and the list
# for that date must already exist, otherwise usage is shown (usage exits).
# Without one we start a new run named after today's UTC date: make the
# bucket, then generate the list of latest complete dumps.
if [[ -n "${1:-}" ]]; then
  rundate="$1"
  googlebucketname="gs://wikimedia-xmldumps-completeruns-$rundate"
  result=$(gsutil ls "$googlebucketname" 2>&1 | grep "Not Found")
  if [[ -n "$result" ]]; then
    echo "No such google bucket $googlebucketname"
    usage
  fi
  listtobackup="$backuplistsbasepath/backtheseup-$rundate.txt"
  if [[ ! -e "$listtobackup" ]]; then
    echo "no list of things to back up for the run date $rundate"
    echo "Are you sure this is a backup to be resumed?"
    usage
  fi
else
  rundate=$(date -u +%Y%m%d)
  googlebucketname="gs://wikimedia-xmldumps-completeruns-$rundate"
  # Creating the bucket fails when it already exists (for example on a
  # second run the same day), so report the failure but keep going.
  if ! gsutil mb "$googlebucketname"; then
    echo "warning: could not create bucket $googlebucketname" >&2
  fi
  # The list file cannot be written unless its directory exists.
  if ! mkdir -p "$backuplistsbasepath"; then
    echo "cannot create directory $backuplistsbasepath" >&2
    exit 1
  fi
  listtobackup="$backuplistsbasepath/backtheseup-$rundate.txt"
  createbackuplist || exit 1
fi
 116+
# ok now we have a list of directories and projects to back up.
# let's do them one file at a time; using gsutil cp -R won't give
# us quite the right filenames.
#
# The list is read on file descriptor 3 so that nothing run inside the
# loop can swallow the remaining lines by reading stdin.
failures=0
while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
  # ignore blank lines in the list
  [[ -n "$line" ]] || continue
  # Glob directly so that paths containing whitespace stay intact.
  for f in "$line"/*; do
    # an empty directory leaves the literal pattern behind; skip it
    [[ -e "$f" ]] || continue
    # put together the right filename for google: the path relative to
    # the dump root (literal prefix removal, no regex involved)
    googlefilename=${f#"$dumpbasepath"/}
    checkifbackedupalready
    if [ "$alreadydone" -eq "0" ]; then
      # push it over
      if ! gsutil cp -a "public-read" "$f" "$googlebucketname/$googlefilename"; then
        echo "failed to copy $f to $googlebucketname/$googlefilename" >&2
        failures=$((failures + 1))
      fi
    fi
  done
done 3< "$listtobackup"
if ((failures > 0)); then
  echo "warning: $failures file(s) could not be copied to google" >&2
fi
echo "finished copy to google of directories in $listtobackup"
Property changes on: trunk/tools/googlestorage/gbackup-xml-lastruns.sh
___________________________________________________________________
Added: svn:eol-style
   + native
Added: svn:executable
   + *

Status & tagging log