r17693 MediaWiki - Code Review archive

Revision:r17692‎ | r17693 | r17694 >
Date:05:28, 15 November 2006
more recent version
Modified paths:
  • /trunk/tools/dumpHTML/compress-html (modified) (history)
  • /trunk/tools/dumpHTML/compress-volumes2 (modified) (history)
  • /trunk/tools/dumpHTML/do-edition (added) (history)
  • /trunk/tools/dumpHTML/finish-lang (added) (history)
  • /trunk/tools/dumpHTML/functions (modified) (history)
  • /trunk/tools/dumpHTML/netqueue.py (added) (history)
  • /trunk/tools/dumpHTML/queueController.php (added) (history)
  • /trunk/tools/dumpHTML/queueSlave (added) (history)
  • /trunk/tools/dumpHTML/queueSlave.php (added) (history)
  • /trunk/tools/dumpHTML/start-lang (added) (history)

Diff [purge]

Index: trunk/tools/dumpHTML/queueSlave
@@ -0,0 +1,56 @@
 4+import sys,os,signal,socket,re,time
# Command line: <queue host> <queue port> <base directory>
queueHost = sys.argv[1]
queuePort = int(sys.argv[2])
baseDir = sys.argv[3]
siteDir = baseDir + "/wikipedia"

# One persistent connection to the queue server, wrapped in a file object
# so requests and replies can be exchanged line by line.
queueSock = socket.socket()
queueSock.connect((queueHost, queuePort))
queueFile = queueSock.makefile()

# Set once "Waiting..." has been printed, so an idle period logs it only once
waiting = False

# A job reply has the form "data <wiki> <n>/<m>"; anything else (for example
# "empty") means there is currently nothing to do.
dataRegex = re.compile(r"data ([a-z_-]+) (\d+/\d+)")

# Loop until the parent exits
while os.getppid() > 1:
    queueFile.write("deq\n")
    queueFile.flush()
    reply = queueFile.readline()
    if reply == "":
        # readline() returns an empty string only at EOF, i.e. the queue
        # server closed the connection. Stop instead of idling forever.
        print("Unable to read from queue server")
        sys.exit(1)

    m = dataRegex.match(reply)
    if m is not None:
        waiting = False
        wiki = m.group(1)
        sliceSpec = m.group(2)
        print("-------------------------------------------------------------------")
        print(wiki + ' ' + sliceSpec)
        print("-------------------------------------------------------------------")
        checkpoint = baseDir + "/checkpoints/" + wiki + "_" + sliceSpec.replace('/', '_')
        lang = wiki.replace('wiki', '')
        dest = siteDir + "/" + lang + "-new"

        # Run the dump at low priority without blocking, so the parent can
        # still be polled while the job is in progress.
        pid = os.spawnlp(os.P_NOWAIT, "nice", "nice", "-n15",
            "php", "-n", "dumpHTML.php", wiki,  # "--force-copy",
            "--image-snapshot", "--interlang", "-d", dest, "--slice", sliceSpec,
            "--checkpoint", checkpoint, "--no-overwrite")

        # Wait for the child to exit (or the parent).
        # waitpid() with WNOHANG returns (0, 0) while the child is running.
        status = os.waitpid(pid, os.WNOHANG)
        while status == (0, 0) and os.getppid() > 1:
            time.sleep(5)
            status = os.waitpid(pid, os.WNOHANG)

        # If the parent exited, then kill the child
        if status == (0, 0):
            os.kill(pid, signal.SIGKILL)
            # Collect its exit status so it is not left behind as a zombie
            os.waitpid(pid, 0)
    else:
        # Nothing queued: report it once, then poll again after a pause
        if not waiting:
            print("Waiting...")
            waiting = True
        time.sleep(5)
Property changes on: trunk/tools/dumpHTML/queueSlave
Added: svn:executable
158 + *
Index: trunk/tools/dumpHTML/netqueue.py
@@ -0,0 +1,69 @@
 4+import SocketServer, sys, signal, os, threading, Queue
class QueueServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
    """Threaded TCP server that owns the job queue served to clients.

    Request handlers reach the queue through the methods below, or directly
    through the ``queue`` attribute.
    """

    # Unbounded FIFO (maxsize 0). It is declared on the class, so it is
    # shared by every QueueServer instance in the process.
    queue = Queue.Queue(0)
    allow_reuse_address = True

    def enqueue(self, value):
        """Append *value* to the tail of the queue."""
        self.queue.put(value)

    def dequeue(self):
        """Return the item at the head of the queue without blocking.

        Returns None when the queue is empty.
        """
        try:
            return self.queue.get_nowait()
        except Queue.Empty:
            return None

    def blockingDequeue(self, file):
        """Wait until an item is available, then return it.

        *file* is the caller's output file. It is currently unused; the
        original intent (left disabled) was to put the item back when that
        file had already been closed by the time an item arrived.
        """
        return self.queue.get()

    def clearQueue(self):
        """Discard every item currently in the queue.

        The queue is drained in place rather than replaced by a new object:
        a thread blocked in blockingDequeue() holds a reference to the
        existing queue and would never be woken by items put on a
        replacement.
        """
        try:
            while True:
                self.queue.get_nowait()
        except Queue.Empty:
            pass
class QueueRequestHandler(SocketServer.StreamRequestHandler):
    """Serve the line-based queue protocol on one client connection.

    One command per line; surrounding whitespace is ignored:

        enq <value>   append <value> to the queue; replies "ok"
        deq           replies "data <value>", or "empty" if nothing is queued
        bdeq          like deq, but waits until an item is available
        size          replies "size <n>"
        clear         empties the queue; replies "ok"

    Any other line is answered with "invalid command".
    """

    def handle(self):
        """Process commands until the client closes the connection."""
        for line in self.rfile:
            cmd = line.strip()
            if cmd[:4] == "enq ":
                self.server.enqueue(cmd[4:])
                self.wfile.write("ok\n")
            elif cmd == "deq":
                self._sendJob(self.server.dequeue())
            elif cmd == "bdeq":
                self._sendJob(self.server.blockingDequeue(self.wfile))
            elif cmd == "size":
                self.wfile.write("size " + str(self.server.queue.qsize()) + "\n")
            elif cmd == "clear":
                self.server.clearQueue()
                self.wfile.write("ok\n")
            else:
                self.wfile.write("invalid command\n")

    def _sendJob(self, value):
        """Reply with a dequeued *value*, or "empty" when it is None.

        If the reply cannot be written (for instance the client went away
        while waiting in "bdeq"), the value is put back on the queue so the
        job is not lost, and the error is re-raised. The value re-enters at
        the tail of the queue, not at its original position.
        """
        if value is None:
            self.wfile.write("empty\n")
            return
        try:
            self.wfile.write("data " + value + "\n")
        except Exception:
            self.server.enqueue(value)
            raise
if __name__ == '__main__':
    # Empty host string: listen on all interfaces, port 8200
    server = QueueServer(('', 8200), QueueRequestHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt")
        # Terminate by sending SIGKILL to our own process rather than
        # exiting normally. SIGKILL cannot be caught, so nothing after this
        # call runs.
        # NOTE(review): presumably chosen because handler threads may be
        # blocked in a "bdeq" and would hold up a normal exit -- confirm.
        os.kill(os.getpid(), signal.SIGKILL)
Property changes on: trunk/tools/dumpHTML/netqueue.py
Added: svn:executable
171 + *
Index: trunk/tools/dumpHTML/do-edition
@@ -0,0 +1,81 @@
 3+import sys, os, socket, signal, time, stat
 5+base = "/mnt/static"
 6+scripts = base + "/scripts"
 8+if len(sys.argv) < 2:
 9+ print "Usage: do-edition <edition>"
 10+ sys.exit(1)
 12+edition = sys.argv[1]
 14+threads = {
 15+ "localhost": 1
 16+ #"srv42": 4,
 17+ #"srv122": 2,
 18+ #"srv123": 2,
 19+ #"srv124": 2,
 20+ #"srv125": 2
 23+# Create some directories
 24+try: os.makedirs(base + "/logs")
 25+except: pass
 26+try: os.makedirs(base + "/checkpoints")
 27+except: pass
 29+# Start queue server
 30+print "Starting queue server"
 31+queueServer = os.fork()
 32+if 0 == queueServer:
 33+ # Run it in a new group so that its precious finishlang children don't get hurt
 34+ os.setsid()
 35+ os.execlp("python", "python", scripts+"/netqueue.py")
 36+ sys.exit(1)
 38+# Wait for it to start up
 39+queueSock = socket.socket()
 40+while queueSock.connect_ex(("localhost", 8200)):
 41+ time.sleep(0.1)
 44+# Start slave threads
 45+slaves = []
 46+for host, number in threads.iteritems():
 47+ for i in range(number):
 48+ print "Starting thread %d on host %s" % (i, host)
 49+ pid = os.fork()
 50+ if pid == 0:
 51+ # Redirect stdout
 52+ os.close(1)
 53+ fd = os.open("%s/logs/%s-%d.out" % (base, host, i), os.O_WRONLY|os.O_CREAT|os.O_APPEND, 0666)
 54+ os.dup2(fd, 1)
 56+ # Redirect stderr
 57+ os.close(2)
 58+ fd = os.open("%s/logs/%s-%d.err" % (base, host, i), os.O_WRONLY|os.O_CREAT|os.O_APPEND, 0666)
 59+ os.dup2(fd, 2)
 61+ if host == "localhost":
 62+ #os.execlp("php", "php", "-n", scripts+"/queueSlave.php", socket.gethostname(), "8200", base)
 63+ os.execlp("python", "python", scripts+"/queueSlave", socket.gethostname(), "8200", base)
 64+ sys.exit(1)
 65+ else:
 66+ #os.execlp("ssh", "ssh", host, "php", "-n", scripts+"/queueSlave.php", socket.gethostname(), "8200", base)
 67+ os.execlp("ssh", "ssh", host, "python", scripts+"/queueSlave", socket.gethostname(), "8200", base)
 68+ sys.exit(1)
 69+ slaves.append(pid)
 71+# Start controller, wait for it to exit
 72+print "Starting controller"
 74+ os.spawnlp(os.P_WAIT, "php", "php", "-n", "queueController.php", edition)
 75+ print "Controller has exited, all done\n"
 76+except KeyboardInterrupt:
 77+ pass
 80+# All done, kill queue server
 81+os.kill(queueServer, signal.SIGKILL)
Property changes on: trunk/tools/dumpHTML/do-edition
Added: svn:executable
183 + *
Index: trunk/tools/dumpHTML/compress-html
@@ -5,30 +5,37 @@
66 exit
77 fi
9 -cd /var/static
1110 lang=$1
1211 edition=$2
14 -dest=/var/static/downloads/$edition/$lang
1518 mkdir -p $dest
1720 echo Finding files...
 21+cd $sitebase
1822 find $lang -name \*.html > $dest/html.lst
2024 #echo Filtering...
2125 #php scripts/filterNamespaces.php $lang wikipedia $dest/html.lst > $dest/reduced.lst
2327 find $lang/skins $lang/images -follow -type f \
24 - -not \( -name \*.php -or -name \*.xcf -or -name .\* -or -name \*~ -or -path \*/CVS/\* \) \
 28+ -not \( -name \*.php -or -name \*.xcf -or -name .\* -or -name \*~ -or \
 29+ -path \*/CVS/\* -or -path \*/.svn\* -or -path \*/wikimania\* \) \
2530 > $dest/skins.lst
2631 find $lang/raw -type f >> $dest/skins.lst
2732 echo $lang/upload/b/bc/Wiki.png >> $dest/skins.lst
2934 echo Creating HTML archive...
3035 rm -f $dest/wikipedia-$lang-html.7z
31 -7z -l a $dest/wikipedia-$lang-html.7z @$dest/html.lst @$dest/skins.lst
 37+# Set chunk size to 8MB for faster random access
 38+7z -l -ms8m a $dest/wikipedia-$lang-html.7z @$dest/html.lst @$dest/skins.lst
3340 #if [ ! -e $dest/wikipedia-$lang-reduced.7z ];then
3441 # echo
3542 # echo Creating reduced archive...
Index: trunk/tools/dumpHTML/finish-lang
@@ -0,0 +1,34 @@
 4+if [ "X$2" == "X" ];then
 5+ echo "Usage: finish <lang> <edition>"
 6+ exit
 13+if [ ! -d /mnt/static/$site/$lang-new ]; then
 14+ echo "Already compressed $lang"
 15+ exit
 17+if [ ! -e /mnt/static/$site/$lang-new/index.html ]; then
 18+ echo "$lang-new directory is broken, missing index.html, skipping."
 19+ exit
 22+if [ -d /mnt/static/$site/$lang ];then
 23+ if [ -d /mnt/static/$site/$lang-old ];then
 24+ rm -rf /mnt/static/$site/$lang-old
 25+ fi
 26+ mv /mnt/static/$site/$lang /mnt/static/$site/$lang-old
 28+mv /mnt/static/$site/$lang-new /mnt/static/$site/$lang
 30+echo "$lang: Compressing HTML..."
 31+$bindir/compress-html $lang $edition 2>&1 >/dev/null
 32+#echo "$lang: Making image tarball..."
 33+#ssh albert tar -C /mnt/static -cf /a/upload_snapshot/$edition/downloads/wikipedia-$lang-images.tar -h $lang/upload
 34+#ln -sf /mnt/upload_snapshot/$edition/downloads/wikipedia-$lang-images.tar /mnt/static/downloads/$edition/$lang/wikipedia-$lang-images.tar
 35+echo "$lang: Done."
Property changes on: trunk/tools/dumpHTML/finish-lang
Added: svn:executable
136 + *
Index: trunk/tools/dumpHTML/queueSlave.php
@@ -0,0 +1,47 @@
 4+$queueHost = $argv[1];
 5+$queuePort = $argv[2];
 6+$baseDir = $argv[3];
 8+$queueSock = fsockopen( $queueHost, $queuePort );
 9+if ( !$queueSock ) {
 10+ echo "Unable to connect to queue server\n";
 11+ die( 1 );
 14+chdir( "/home/wikipedia/common/php-1.5/maintenance" );
 15+$waiting = false;
 16+while ( 1 ) {
 17+ if ( !fwrite( $queueSock, "deq\n" ) ) {
 18+ echo "Unable to write to queue server\n";
 19+ die( 1 );
 20+ }
 21+ $s = fgets( $queueSock );
 22+ if ( $s === false ) {
 23+ echo "Unable to read from queue server\n";
 24+ die( 1 );
 25+ }
 26+ if ( preg_match( '!^data ([a-z_-]+) (\d+/\d+)!', $s, $m ) ) {
 27+ $waiting = false;
 28+ $wiki = $m[1];
 29+ $slice = $m[2];
 30+ echo "-------------------------------------------------------------------\n";
 31+ echo "$wiki $slice\n";
 32+ echo "-------------------------------------------------------------------\n";
 33+ $checkpoint = "$baseDir/checkpoints/{$wiki}_" . str_replace( '/', '_', $slice );
 34+ $lang = str_replace( 'wiki', '', $wiki );
 35+ $dest = "$baseDir/$lang-new";
 37+ passthru( "php -n dumpHTML.php $wiki --force-copy --image-snapshot --interlang -d $dest --slice $slice --checkpoint $checkpoint" );
 38+ } else {
 39+ # Wait for jobs
 40+ if ( !$waiting ) {
 41+ print "Waiting...\n";
 42+ $waiting = true;
 43+ }
 44+ sleep( 5 );
 45+ }
Index: trunk/tools/dumpHTML/compress-volumes2
@@ -5,7 +5,7 @@
66 exit
77 fi
9 -basedir=/var/static
1010 htmldir=$basedir/$1
1111 destdir=$basedir/downloads/$1/volumes
1212 shift
Index: trunk/tools/dumpHTML/functions
@@ -9,12 +9,12 @@
1010 }
1212 finishlang() {
13 - if [ -d /var/static/$1 ];then
14 - mv /var/static/$1 /var/static/$1-old
 13+ if [ -d /mnt/static/$1 ];then
 14+ mv /mnt/static/$1 /mnt/static/$1-old
1515 fi
16 - mv /var/static/$1-new /var/static/$1
 16+ mv /mnt/static/$1-new /mnt/static/$1
18 - echo "Compressing..."
 18+ echo "Compressing $1..."
1919 $bindir/compress-html $1 $edition 2>&1 >/dev/null
2020 echo "Done."
2121 }
Index: trunk/tools/dumpHTML/start-lang
@@ -0,0 +1,27 @@
 4+if [ -z $1 ];then
 5+ echo "Usage: start-lang <lang>"
 6+ exit 1
 13+if [ ! -d $dest ];then
 14+ mkdir -p $dest
 15+ #rm -rf /mnt/upload3/wikipedia/$lang/shared
 16+ ln -s /home/wikipedia/htdocs/wikipedia.org/images $dest/images
 18+ # Upload snapshot disabled, not enough space
 19+ #[ -d /mnt/upload_snapshot/new/$lang ] || mkdir -p /mnt/upload_snapshot/new/$lang
 20+ #ln -s /mnt/upload_snapshot/new/$lang $dest/upload
 22+ mkdir $dest/upload
 23+ ln -s /mnt/upload3/wikipedia/$lang/* $dest/upload/
 24+ rm -f $dest/upload/shared
 25+ mkdir $dest/upload/shared
 26+ ln -s /home/wikipedia/common/php-1.5/skins $dest/skins
 27+ cp /mnt/static/COPYING.html $dest/COPYING.html
Property changes on: trunk/tools/dumpHTML/start-lang
Added: svn:executable
129 + *
Index: trunk/tools/dumpHTML/queueController.php
@@ -0,0 +1,182 @@
 4+$basedir = '/mnt/static';
 6+$wgNoDBParam = true;
 7+require_once( '/home/wikipedia/common/php/maintenance/commandLine.inc' );
 9+if ( !isset( $args[0] ) ) {
 10+ echo "Usage: queueController.php <edition>\n";
 13+$wikiList = array_map( 'trim', file( '/home/wikipedia/common/wikipedia.dblist' ) );
 14+$yaseo = array_map( 'trim', file( '/home/wikipedia/common/yaseo.dblist' ) );
 15+$wikiList = array_diff( $wikiList, $yaseo );
 17+$targetQueueSize = 20;
 18+$maxArticlesPerJob = 10000;
 19+$jobTimeout = 86400;
 20+$edition = $args[0];
 22+$queueSock = fsockopen( 'localhost', 8200 );
 23+if ( !$queueSock ) {
 24+ echo "Unable to connect to queue server\n";
 25+ die(1);
 28+# Flush the queue
 29+fwrite( $queueSock, "clear\n" );
 30+fgets( $queueSock );
 32+# Fetch wiki stats
 33+$wikiSizes = @file_get_contents( "$basedir/checkpoints/wikiSizes" );
 34+if ( $wikiSizes ) {
 35+ $wikiSizes = unserialize( $wikiSizes );
 36+} else {
 37+ $wikiSizes = array();
 38+ foreach ( $wikiList as $wiki ) {
 39+ if ( $wgAlternateMaster[$wiki] ) {
 40+ $db = new Database( $wgAlternateMaster[$wiki], $wgDBuser, $wgDBpassword, $wiki );
 41+ } else {
 42+ $db = wfGetDB( DB_SLAVE );
 43+ }
 45+ $wikiSizes[$wiki] = $db->selectField( "`$wiki`.site_stats", 'ss_total_pages' );
 46+ }
 47+ file_put_contents( "$basedir/checkpoints/wikiSizes", serialize( $wikiSizes ) );
 50+# Compute job array
 51+$jobs = array();
 52+$jobsRemainingPerWiki = array();
 53+foreach ( $wikiSizes as $wiki => $size ) {
 54+ if ( in_array( $wiki, $yaseo ) ) {
 55+ continue;
 56+ }
 57+ $numJobs = intval( ceil( $size / $maxArticlesPerJob ) );
 58+ $jobsRemainingPerWiki[$wiki] = $numJobs;
 59+ for ( $i = 1; $i <= $numJobs; $i++ ) {
 60+ $jobs[] = "$wiki $i/$numJobs";
 61+ }
 64+$start = 0;
 65+$doneCount = 0;
 66+$queued = 0;
 67+$jobCount = count( $jobs );
 68+$queueTimes = array();
 69+$initialisedWikis = array();
 71+print "$jobCount jobs to do\n";
 73+while ( $doneCount < $jobCount ) {
 74+ for ( $i = $start; $i < $jobCount && getQueueSize() < $targetQueueSize; $i++ ) {
 75+ if ( !isset( $jobs[$i] ) ) {
 76+ # Already done and removed
 77+ continue;
 78+ }
 79+ $job = $jobs[$i];
 80+ list( $wiki ) = explode( ' ', $job );
 81+ if ( !$wiki ) {
 82+ die( "Invalid job: $job\n" );
 83+ }
 84+ $queueing = false;
 85+ if ( isDone( $job ) ) {
 86+ $doneCount++;
 87+ print "Job $i done: $job ($doneCount of $jobCount)\n";
 88+ $remaining = --$jobsRemainingPerWiki[$wiki];
 89+ if ( !$remaining ) {
 90+ finishWiki( $wiki );
 91+ } else {
 92+ print "$remaining jobs remaining for $wiki\n";
 93+ }
 95+ unset( $jobs[$i] );
 96+ while ( !isset( $jobs[$start] ) && $start < $jobCount ) {
 97+ $start++;
 98+ }
 99+ } elseif ( !isset( $queueTimes[$i] ) ) {
 100+ print "Queueing job $i: $job\n";
 101+ $queueing = true;
 102+ } elseif ( time() > $queueTimes[$i] + $jobTimeout ) {
 103+ print "Timeout, requeueing job $i: $job\n";
 104+ $queueing = true;
 105+ } else {
 106+ $queueing = false;
 107+ }
 108+ if ( $queueing ) {
 109+ if ( !isset( $initialisedWikis[$wiki] ) ) {
 110+ startWiki( $wiki );
 111+ $initialisedWikis[$wiki] = true;
 112+ }
 113+ enqueue( $job );
 114+ $queueTimes[$i] = time();
 115+ }
 116+ }
 117+ sleep(10);
 122+function getQueueSize() {
 123+ global $queueSock;
 124+ if ( fwrite( $queueSock, "size\n" ) === false ) {
 125+ die( "Unable to write to queue server\n" );
 126+ }
 128+ $response = fgets( $queueSock );
 129+ if ( $response === false ) {
 130+ die( "Unable to read from queue server\n" );
 131+ }
 132+ if ( !preg_match( "/^size (\d*)/", $response, $m ) ) {
 133+ die( "Invalid response to size request\n" );
 134+ }
 135+ return $m[1];
 138+function isDone( $job ) {
 139+ global $basedir;
 140+ $jobCpFile = "$basedir/checkpoints/" . strtr( $job, ' /', '__' );
 141+ $lines = @file( $jobCpFile );
 142+ if ( $lines === false ) {
 143+ return false;
 144+ }
 145+ $test = 'everything=done';
 146+ foreach ( $lines as $line ) {
 147+ if ( substr( $line, 0, strlen( $test ) ) == $test ) {
 148+ return true;
 149+ }
 150+ }
 151+ return false;
 154+function enqueue( $job ) {
 155+ global $queueSock;
 156+ if ( false === fwrite( $queueSock, "enq $job\n" ) ) {
 157+ die( "Unable to write to queue server\n" );
 158+ }
 160+ # Read and throw away response
 161+ $response = fgets( $queueSock );
 164+function startWiki( $wiki ) {
 165+ global $basedir;
 166+ $lang = str_replace( 'wiki', '', $wiki );
 167+ print "Starting language $lang\n";
 168+ passthru( "$basedir/scripts/start-lang $lang" );
 171+function finishWiki( $wiki ) {
 172+ global $edition, $basedir;
 173+ $lang = str_replace( 'wiki', '', $wiki );
 174+ if ( !is_dir( "$basedir/wikipedia/$lang-new" ) ) {
 175+ # Already compressed
 176+ print "Already compressed $lang\n";
 177+ return;
 178+ }
 179+ print "Finishing language $lang\n";
 180+ passthru( "$basedir/scripts/finish-lang $lang $edition >> $basedir/logs/finish.log 2>&1 &" );