Index: branches/wmf-deployment/extensions/DumpHTML/dumpHTML.php |
— | — | @@ -0,0 +1,182 @@ |
<?php
/**
 * Command-line entry point of the DumpHTML extension.
 *
 * Parses the command-line options, builds a DumpHTML object from them and
 * runs the requested part of the static HTML dump. Run with --help for the
 * list of options.
 *
 * @addtogroup Maintenance
 */

$usage = <<<ENDS
Usage:
php dumpHTML.php [options...]

 --help show this message

 -d <dest> destination directory
 -s <start> start ID
 -e <end> end ID
 -k <skin> skin to use (defaults to offline)
 --no-overwrite skip existing HTML files
 --checkpoint <file> use a checkpoint file to allow restarting of interrupted dumps
 --slice <n/m> split the job into m segments and do the n'th one
 --images only do image description pages
 --shared-desc only do shared (commons) image description pages
 --no-shared-desc don't do shared image description pages
 --categories only do category pages
 --redirects only do redirects
 --special only do miscellaneous stuff
 --force-copy copy commons instead of symlink, needed for Wikimedia
 --interlang allow interlanguage links
 --image-snapshot copy all images used to the destination directory
 --compress generate compressed version of the html pages
 --udp-profile <N> profile 1/N rendering operations using ProfilerSimpleUDP
 --oom-adj <N> set /proc/<pid>/oom_adj
 --show-titles write each article title to stdout
 --group <group> use the specified user group to read articles

ENDS;

define( 'MW_HTML_FOR_DUMP', 1 );

# Options that take a value; set before including commandLine.inc
$optionsWithArgs = array( 's', 'd', 'e', 'k', 'checkpoint', 'slice', 'udp-profile', 'oom-adj', 'group' );
$options = array( 'help' );
$profiling = false;

if ( $profiling ) {
	define( 'MW_CMDLINE_CALLBACK', 'wfSetupDump' );
	function wfSetupDump() {
		global $wgProfiling, $wgProfileToDatabase, $wgProfileSampleRate;
		$wgProfiling = true;
		$wgProfileToDatabase = false;
		$wgProfileSampleRate = 1;
	}
}

if ( in_array( '--udp-profile', $argv, true ) ) {
	define( 'MW_FORCE_PROFILE', 1 );
}

$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = dirname(__FILE__).'/../..';
}
require_once( $IP."/maintenance/commandLine.inc" );
require_once( dirname(__FILE__)."/dumpHTML.inc" );
require_once( dirname(__FILE__)."/SkinOffline.php" );

if ( version_compare( $wgVersion, '1.11.1', '<' ) ) {
	echo "Error, the DumpHTML extension needs at least MediaWiki version 1.11.1 to work, you have version $wgVersion.\n";
	echo "Try using maintenance/dumpHTML.php instead.\n";
	# Non-zero status so that calling scripts can detect the failure
	exit( 1 );
}

error_reporting( E_ALL & (~E_NOTICE) );

if( isset( $options['help'] ) || isset( $options['h'] ) ) {
	echo $usage;
	exit;
}

# Options that may be missing from the command line. Default them to false
# here so the code below never reads an undefined array key.
$optionalNames = array(
	'slice', 'force-copy', 'interlang', 'image-snapshot', 'checkpoint',
	'no-overwrite', 'compress', 'no-shared-desc', 'udp-profile', 'show-titles',
	'group', 'special', 'images', 'categories', 'redirects', 'shared-desc',
);
foreach ( $optionalNames as $optionalName ) {
	if ( !isset( $options[$optionalName] ) ) {
		$options[$optionalName] = false;
	}
}

if ( !wfIsWindows() && isset( $options['oom-adj'] ) ) {
	$adj = intval( $options['oom-adj'] );
	$pid = getmypid();
	if ( file_put_contents( "/proc/$pid/oom_adj", $adj ) === false ) {
		echo "Warning: unable to write /proc/$pid/oom_adj\n";
	}
}

if ( !empty( $options['s'] ) ) {
	$start = $options['s'];
} else {
	$start = 1;
}

if ( !empty( $options['e'] ) ) {
	$end = $options['e'];
} else {
	$dbr = wfGetDB( DB_SLAVE );
	$end = $dbr->selectField( 'page', 'max(page_id)', false );
}

if ( !empty( $options['d'] ) ) {
	$dest = $options['d'];
} else {
	$dest = "$IP/static";
}

$skin = isset( $options['k'] ) ? $options['k'] : 'offline';

if ( $options['slice'] ) {
	# Both parts must be plain decimal integers with 1 <= n <= m
	$sliceOk = preg_match( '!^(\d+)/(\d+)$!', $options['slice'], $bits );
	if ( $sliceOk ) {
		$sliceNumerator = intval( $bits[1] );
		$sliceDenominator = intval( $bits[2] );
		$sliceOk = $sliceNumerator >= 1 && $sliceNumerator <= $sliceDenominator;
	}
	if ( !$sliceOk ) {
		print "Invalid slice specification\n";
		exit( 1 );
	}
} else {
	$sliceNumerator = $sliceDenominator = 1;
}

$wgHTMLDump = new DumpHTML( array(
	'dest' => $dest,
	'forceCopy' => $options['force-copy'],
	'alternateScriptPath' => $options['interlang'],
	'interwiki' => $options['interlang'],
	'skin' => $skin,
	'makeSnapshot' => $options['image-snapshot'],
	'checkpointFile' => $options['checkpoint'],
	'startID' => $start,
	'endID' => $end,
	'sliceNumerator' => $sliceNumerator,
	'sliceDenominator' => $sliceDenominator,
	'noOverwrite' => $options['no-overwrite'],
	'compress' => $options['compress'],
	'noSharedDesc' => $options['no-shared-desc'],
	'udpProfile' => $options['udp-profile'],
	'showTitles' => $options['show-titles'],
	'group' => $options['group'],
));

$wgHTMLDump->setupDestDir();

# The "only do ..." options are mutually exclusive; the first one set wins
if ( $options['special'] ) {
	$wgHTMLDump->doSpecials();
} elseif ( $options['images'] ) {
	$wgHTMLDump->doImageDescriptions();
} elseif ( $options['categories'] ) {
	$wgHTMLDump->doCategories();
} elseif ( $options['redirects'] ) {
	$wgHTMLDump->doRedirects();
} elseif ( $options['shared-desc'] ) {
	$wgHTMLDump->doSharedImageDescriptions();
} else {
	print "Creating static HTML dump in directory $dest. \n";
	$dbr = wfGetDB( DB_SLAVE );
	$server = $dbr->getProperty( 'mServer' );
	print "Using database {$server}\n";

	# An explicit end ID restricts the run to the article range only
	if ( !isset( $options['e'] ) ) {
		$wgHTMLDump->doEverything();
	} else {
		$wgHTMLDump->doArticles();
	}
}

if ( isset( $options['debug'] ) ) {
	#print_r($GLOBALS);
	# Workaround for bug #36957
	# Print the 20 globals with the largest serialized size
	$globals = array_keys( $GLOBALS );
	#sort( $globals );
	$sizes = array();
	foreach ( $globals as $name ) {
		$sizes[$name] = strlen( serialize( $GLOBALS[$name] ) );
	}
	arsort($sizes);
	$sizes = array_slice( $sizes, 0, 20 );
	foreach ( $sizes as $name => $size ) {
		printf( "%9d %s\n", $size, $name );
	}
}

if ( $profiling ) {
	echo $wgProfiler->getOutput();
}

Property changes on: branches/wmf-deployment/extensions/DumpHTML/dumpHTML.php |
___________________________________________________________________ |
Name: svn:keywords |
1 | 184 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 185 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/dumpHTML.inc |
— | — | @@ -0,0 +1,1412 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + * @addtogroup Maintenance |
| 5 | + */ |
| 6 | + |
| 7 | +define( 'REPORTING_INTERVAL', 10 ); |
| 8 | + |
| 9 | +class DumpHTML { |
| 10 | + # Destination directory |
| 11 | + var $dest; |
| 12 | + |
| 13 | + # Extension base directory |
| 14 | + var $extdir; |
| 15 | + |
| 16 | + # Skip existing files |
| 17 | + var $noOverwrite = false; |
| 18 | + |
| 19 | + # Show interlanguage links? |
| 20 | + var $interwiki = true; |
| 21 | + |
| 22 | + # Depth of HTML directory tree |
| 23 | + var $depth = 3; |
| 24 | + |
| 25 | + # Directory that commons images are copied into |
| 26 | + var $sharedStaticDirectory; |
| 27 | + |
| 28 | + # Directory that the images are in, after copying |
| 29 | + var $destUploadDirectory; |
| 30 | + |
| 31 | + # Base URL for images, after copying |
| 32 | + var $destUploadUrl; |
| 33 | + |
| 34 | + # Base URL for the destination directory |
| 35 | + var $articleBaseUrl; |
| 36 | + |
| 37 | + # Relative path to image directory |
| 38 | + var $imageRel = 'upload'; |
| 39 | + |
| 40 | + # Copy commons images instead of symlinking |
| 41 | + var $forceCopy = false; |
| 42 | + |
| 43 | + # Make a copy of all images encountered |
| 44 | + var $makeSnapshot = false; |
| 45 | + |
| 46 | +	# Don't do shared image description pages in doEverything() |
| 47 | + var $noSharedDesc = false; |
| 48 | + |
| 49 | + # Make links assuming the script path is in the same directory as |
| 50 | + # the destination |
| 51 | + var $alternateScriptPath = false; |
| 52 | + |
| 53 | + # Original values of various globals |
| 54 | + var $oldArticlePath = false, $oldCopyrightIcon = false, $oldLogo, $oldRepoGroup, $oldScriptPath; |
| 55 | + |
| 56 | + # Has setupGlobals been called? |
| 57 | + var $setupDone = false; |
| 58 | + |
| 59 | + # Has to compress html pages |
| 60 | + var $compress = false; |
| 61 | + |
| 62 | + # List of raw pages used in the current article |
| 63 | + var $rawPages; |
| 64 | + |
| 65 | + # Skin to use |
| 66 | + var $skin = 'offline'; |
| 67 | + |
| 68 | + # User group to use |
| 69 | + var $group = false; |
| 70 | + |
| 71 | + # Checkpoint stuff |
| 72 | + var $checkpointFile = false, $checkpoints = false; |
| 73 | + |
| 74 | + var $startID = 1, $endID = false; |
| 75 | + |
| 76 | + var $sliceNumerator = 1, $sliceDenominator = 1; |
| 77 | + |
| 78 | + # Max page ID, lazy initialised |
| 79 | + var $maxPageID = false; |
| 80 | + |
| 81 | + # UDP profiling |
| 82 | + var $udpProfile, $udpProfileCounter = 0, $udpProfileInit = false; |
| 83 | + |
| 84 | + # Debugging options |
| 85 | + var $showTitles = false; |
| 86 | + |
| 87 | + # Extension version |
| 88 | + const VERSION = '2.0'; |
| 89 | + |
/**
 * Constructor. Copies every entry of $settings onto the property of the
 * same name and records the directory this file lives in as $extdir.
 *
 * @param $settings Array: property name => value
 */
function __construct( $settings = array() ) {
	foreach ( $settings as $var => $value ) {
		$this->$var = $value;
	}
	$this->extdir = dirname( __FILE__ );
}

/**
 * PHP 4-style constructor name, kept so that existing explicit calls such
 * as parent::DumpHTML() keep working. Declared after __construct() so
 * that __construct() is the one PHP treats as the constructor.
 *
 * @param $settings Array: property name => value
 */
function DumpHTML( $settings = array() ) {
	$this->__construct( $settings );
}
| 96 | + |
/**
 * Load the checkpoint file into $this->checkpoints, unless that has
 * already been done. Each line of the file has the form "name=value";
 * blank lines and lines without "=" are ignored.
 *
 * @return Boolean: false if no checkpoint file is configured, true otherwise
 */
function loadCheckpoints() {
	if ( $this->checkpoints !== false ) {
		// Already loaded
		return true;
	}
	if ( !$this->checkpointFile ) {
		return false;
	}

	$this->checkpoints = array();
	// Warning deliberately suppressed: an unreadable file means a fresh run
	$lines = @file( $this->checkpointFile );
	if ( $lines === false ) {
		print "Starting new checkpoint file \"{$this->checkpointFile}\"\n";
		return true;
	}

	foreach ( $lines as $line ) {
		$parts = explode( '=', trim( $line ), 2 );
		if ( count( $parts ) != 2 ) {
			// Blank or malformed line: nothing to record
			continue;
		}
		$this->checkpoints[$parts[0]] = $parts[1];
	}
	return true;
}
| 118 | + |
/**
 * Get the stored checkpoint value for a dump phase.
 *
 * @param $type String: checkpoint name, e.g. 'article'
 * @param $defValue Mixed: value returned when checkpoints are disabled or
 *   nothing is stored under $type (defaults to false)
 * @return Mixed: the stored value, or $defValue
 */
function getCheckpoint( $type, $defValue = false ) {
	if ( !$this->loadCheckpoints() ) {
		return $defValue;
	}
	if ( isset( $this->checkpoints[$type] ) ) {
		return $this->checkpoints[$type];
	}
	return $defValue;
}
| 129 | + |
/**
 * Record a checkpoint value and rewrite the whole checkpoint file.
 * Does nothing when no checkpoint file is configured.
 *
 * @param $type String: checkpoint name, e.g. 'article'
 * @param $value Mixed: value to store, written as "name=value"
 */
function setCheckpoint( $type, $value ) {
	if ( !$this->checkpointFile ) {
		return;
	}
	// Make sure entries already in the file are in memory first, otherwise
	// rewriting the file below would drop them.
	$this->loadCheckpoints();
	$this->checkpoints[$type] = $value;

	$blob = '';
	foreach ( $this->checkpoints as $name => $state ) {
		$blob .= "$name=$state\n";
	}
	if ( file_put_contents( $this->checkpointFile, $blob ) === false ) {
		print "Warning: unable to write checkpoint file \"{$this->checkpointFile}\"\n";
	}
}
| 141 | + |
/**
 * Run every phase of the dump in order: articles, categories, redirects,
 * special pages (slice 1 only), local image descriptions and, unless
 * disabled, shared image descriptions. Skipped entirely when the
 * 'everything' checkpoint is already 'done'.
 */
function doEverything() {
	$finished = ( $this->getCheckpoint( 'everything' ) == 'done' );
	if ( $finished ) {
		print "Checkpoint says everything is already done\n";
		return;
	}

	$this->doArticles();
	$this->doCategories();
	$this->doRedirects();

	// Only the first slice writes the special pages
	$isFirstSlice = ( $this->sliceNumerator == 1 );
	if ( $isFirstSlice ) {
		$this->doSpecials();
	}

	$this->doLocalImageDescriptions();

	$wantShared = !$this->noSharedDesc;
	if ( $wantShared ) {
		$this->doSharedImageDescriptions();
	}

	$this->setCheckpoint( 'everything', 'done' );
}
| 161 | + |
| 162 | + /** |
| 163 | + * Write a set of articles specified by start and end page_id |
| 164 | + * Skip categories and images, they will be done separately |
| 165 | + */ |
function doArticles() {
	// Upper bound of the page_id range
	$end = ( $this->endID === false ) ? $this->getMaxPageID() : $this->endID;
	$start = $this->startID;

	# Start from the checkpoint
	$checkpoint = $this->getCheckpoint( 'article' );
	if ( $checkpoint == 'done' ) {
		print "Articles already done\n";
		return;
	}
	if ( $checkpoint !== false ) {
		$start = $checkpoint;
		print "Resuming article dump from checkpoint at page_id $start of $end\n";
	} else {
		print "Starting from page_id $start of $end\n";
	}

	# Move the start point to the correct slice if it isn't there already
	$start = $this->modSliceStart( $start );

	$this->setupGlobals();

	// The main page is excluded here; doMainPage() writes it as index.html
	$mainPageKey = Title::newMainPage()->getPrefixedDBkey();

	$step = 0;
	for ( $id = $start; $id <= $end; $id += $this->sliceDenominator, $step++ ) {
		wfWaitForSlaves( 20 );

		// Progress report and checkpoint every REPORTING_INTERVAL iterations
		if ( $step % REPORTING_INTERVAL == 0 ) {
			print "Processing ID: $id\r";
			$this->setCheckpoint( 'article', $id );
		}
		if ( $step % ( REPORTING_INTERVAL * 10 ) == 0 ) {
			print "\n";
		}

		$title = Title::newFromID( $id );
		if ( !$title ) {
			// No page with this ID
			continue;
		}
		$ns = $title->getNamespace();
		if ( $ns == NS_CATEGORY || $ns == NS_MEDIAWIKI ) {
			continue;
		}
		if ( $title->getPrefixedDBkey() == $mainPageKey ) {
			continue;
		}
		$this->doArticle( $title );
	}

	$this->setCheckpoint( 'article', 'done' );
	print "\n";
}
| 215 | + |
/**
 * Write the miscellaneous pages: the main page as index.html, then
 * Special:Categories.
 */
function doSpecials() {
	$this->doMainPage();

	// doMainPage() set the globals up for depth 0; restore the default depth
	$this->setupGlobals();

	print "Special:Categories...";
	$categoriesTitle = SpecialPage::getTitleFor( 'Categories' );
	$this->doArticle( $categoriesTitle );
	print "\n";
}
| 224 | + |
| 225 | + /** Write the main page as index.html */ |
function doMainPage() {

	print "Making index.html ";

	// Set up globals with no ../../.. in the link URLs
	$this->setupGlobals( 0 );

	$title = Title::newMainPage();
	$text = $this->getArticleHTML( $title );

	// doArticle() treats a false return from getArticleHTML() as "nothing
	// to write"; do the same here rather than writing an empty index.html.
	if ( $text === false ) {
		print "\nUnable to render the main page\n";
		return false;
	}

	# Parse the XHTML to find the images
	#$images = $this->findImages( $text );
	#$this->copyImages( $images );

	$file = fopen( "{$this->dest}/index.html", "w" );
	if ( !$file ) {
		print "\nCan't open index.html for writing\n";
		return false;
	}
	fwrite( $file, $text );
	fclose( $file );
	print "\n";

	// Explicit success value, so the false returns above are distinguishable
	return true;
}
| 249 | + |
/**
 * Dump the local image description pages and, unless noSharedDesc is
 * set, the shared ones as well.
 */
function doImageDescriptions() {
	$this->doLocalImageDescriptions();
	if ( $this->noSharedDesc ) {
		return;
	}
	$this->doSharedImageDescriptions();
}
| 256 | + |
| 257 | + /** |
| 258 | + * Dump image description pages that don't have an associated article, but do |
| 259 | + * have a local image |
| 260 | + */ |
function doLocalImageDescriptions() {
	$batchLimit = 1000;

	$dbr = wfGetDB( DB_SLAVE );

	$cp = $this->getCheckpoint( 'local image' );
	if ( $cp == 'done' ) {
		print "Local image descriptions already done\n";
		return;
	}
	if ( $cp !== false ) {
		print "Writing image description pages starting from $cp\n";
		$where = array( 'img_name >= ' . $dbr->addQuotes( $cp ) );
	} else {
		print "Writing image description pages for local images\n";
		$where = false;
	}

	$this->setupGlobals();
	$processed = 0;

	// Fetch the image table in batches ordered by name; stop after the
	// first batch that comes back empty.
	while ( true ) {
		$result = $dbr->select( 'image', array( 'img_name' ), $where, __METHOD__,
			array( 'ORDER BY' => 'img_name', 'LIMIT' => $batchLimit ) );
		$rowCount = $dbr->numRows( $result );

		while ( $row = $dbr->fetchObject( $result ) ) {
			# The next batch continues after the last name seen
			$where = array( 'img_name > ' . $dbr->addQuotes( $row->img_name ) );

			// Rows belonging to other slices are skipped
			if ( !$this->sliceFilter( $row->img_name ) ) {
				continue;
			}

			wfWaitForSlaves( 10 );
			$processed++;
			if ( $processed % REPORTING_INTERVAL == 0 ) {
				print "{$row->img_name}\n";
				// An image literally named 'done' must not be stored,
				// it would read back as the completion marker
				if ( $row->img_name !== 'done' ) {
					$this->setCheckpoint( 'local image', $row->img_name );
				}
			}

			$title = Title::makeTitle( NS_IMAGE, $row->img_name );
			// A non-zero article ID means the page was already done by dumpHTML
			if ( !$title->getArticleID() ) {
				$this->doArticle( $title );
			}
		}
		$dbr->freeResult( $result );

		if ( !$rowCount ) {
			break;
		}
	}

	$this->setCheckpoint( 'local image', 'done' );
	print "\n";
}
| 315 | + |
| 316 | + /** |
| 317 | + * Dump images which only have a real description page on commons |
| 318 | + */ |
function doSharedImageDescriptions() {
	// Range of hash directories (0..255) belonging to this slice
	list( $start, $end ) = $this->sliceRange( 0, 255 );

	$cp = $this->getCheckpoint( 'shared image' );
	if ( $cp == 'done' ) {
		print "Shared description pages already done\n";
		return;
	} elseif ( $cp !== false ) {
		print "Writing description pages for commons images starting from directory $cp/255\n";
		// Checkpoints read back from the file are strings
		$start = intval( $cp );
	} else {
		print "Writing description pages for commons images\n";
	}

	$this->setupGlobals();
	$i = 0;
	foreach ( $this->oldRepoGroup->foreignInfo as $repo ) {
		$repoName = $repo['name'];
		for ( $hash = $start; $hash <= $end; $hash++ ) {
			$this->setCheckpoint( 'shared image', $hash );
			// Two-level hash directory, e.g. "a/ab"
			$rel = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );
			$dir = "{$this->destUploadDirectory}/$repoName/$rel";
			// A directory that cannot be opened is silently skipped
			$handle = @opendir( $dir );
			if ( $handle ) {
				// Compare against false explicitly: a file named "0" is
				// falsy and would otherwise end the loop early
				while ( false !== ( $file = readdir( $handle ) ) ) {
					if ( $file[0] == '.' ) {
						continue;
					}
					if ( !(++$i % REPORTING_INTERVAL ) ) {
						print "$rel $i\r";
					}

					$title = Title::makeTitleSafe( NS_IMAGE, $file );
					if ( !$title ) {
						wfDebug( __METHOD__.": invalid title: $file\n" );
						continue;
					}
					$this->doArticle( $title );
				}
				closedir( $handle );
			}
			print "\n";
		}
	}
	$this->setCheckpoint( 'shared image', 'done' );
	print "\n";
}
| 366 | + |
/**
 * Dump the page of every category named in the categorylinks table,
 * in batches ordered by name, resuming from the 'category' checkpoint
 * when there is one.
 */
function doCategories() {
	$batchLimit = 1000;

	$this->setupGlobals();
	$dbr = wfGetDB( DB_SLAVE );

	$cp = $this->getCheckpoint( 'category' );
	if ( $cp == 'done' ) {
		print "Category pages already done\n";
		return;
	}
	if ( $cp !== false ) {
		print "Resuming category page dump from $cp\n";
		$where = array( 'cl_to >= ' . $dbr->addQuotes( $cp ) );
	} else {
		print "Starting category pages\n";
		$where = false;
	}

	$processed = 0;
	// Keep fetching batches until one comes back empty
	while ( true ) {
		$result = $dbr->select( 'categorylinks', 'DISTINCT cl_to', $where, __METHOD__,
			array( 'ORDER BY' => 'cl_to', 'LIMIT' => $batchLimit ) );
		$rowCount = $dbr->numRows( $result );

		while ( $row = $dbr->fetchObject( $result ) ) {
			// The next batch continues after the last category seen
			$where = array( 'cl_to > ' . $dbr->addQuotes( $row->cl_to ) );

			// Categories belonging to other slices are skipped
			if ( !$this->sliceFilter( $row->cl_to ) ) {
				continue;
			}

			wfWaitForSlaves( 10 );
			$processed++;
			if ( $processed % REPORTING_INTERVAL == 0 ) {
				print "{$row->cl_to}\n";
				// Never store a category named 'done' as the checkpoint
				if ( $row->cl_to != 'done' ) {
					$this->setCheckpoint( 'category', $row->cl_to );
				}
			}

			$categoryTitle = Title::makeTitle( NS_CATEGORY, $row->cl_to );
			$this->doArticle( $categoryTitle );
		}
		$dbr->freeResult( $result );

		if ( !$rowCount ) {
			break;
		}
	}

	$this->setCheckpoint( 'category', 'done' );
	print "\n";
}
| 416 | + |
/**
 * Dump every redirect page, scanning the page table in page_id chunks and
 * resuming from the 'redirect' checkpoint when there is one.
 */
function doRedirects() {
	print "Doing redirects...\n";

	$chunkSize = 10000;
	$end = $this->getMaxPageID();
	$cp = $this->getCheckpoint( 'redirect' );
	if ( $cp == 'done' ) {
		print "Redirects already done\n";
		return;
	} elseif ( $cp !== false ) {
		print "Resuming redirect generation from page_id $cp\n";
		$start = intval( $cp );
	} else {
		$start = 1;
	}

	$this->setupGlobals();
	$dbr = wfGetDB( DB_SLAVE );
	$i = 0;

	for ( $chunkStart = $start; $chunkStart <= $end; $chunkStart += $chunkSize ) {
		$chunkEnd = min( $end, $chunkStart + $chunkSize - 1 );
		$conds = array(
			'page_is_redirect' => 1,
			"page_id BETWEEN $chunkStart AND $chunkEnd"
		);
		# Modulo slicing in SQL
		$m = intval( $this->sliceDenominator );
		if ( $m > 1 ) {
			// page_id % m only yields 0..m-1, so slice m of m has to be
			// matched as residue 0; comparing against m never matches.
			// NOTE(review): assumes slice n of m owns the page IDs congruent
			// to n modulo m - confirm against modSliceStart().
			$n = intval( $this->sliceNumerator ) % $m;
			$conds[] = "page_id % $m = $n";
		}
		$res = $dbr->select( 'page', array( 'page_id', 'page_namespace', 'page_title' ),
			$conds, __METHOD__ );

		while ( $row = $dbr->fetchObject( $res ) ) {
			$title = Title::makeTitle( $row->page_namespace, $row->page_title );
			if ( !(++$i % (REPORTING_INTERVAL*10) ) ) {
				printf( "Done %d redirects (%2.3f%%)\n", $i, $row->page_id / $end * 100 );
				$this->setCheckpoint( 'redirect', $row->page_id );
			}
			$this->doArticle( $title );
		}
		$dbr->freeResult( $res );
	}
	$this->setCheckpoint( 'redirect', 'done' );
}
| 464 | + |
| 465 | + /** Write an article specified by title */ |
function doArticle( $title ) {
	// With noOverwrite set, a page whose file already exists is left alone
	if ( $this->noOverwrite ) {
		$existing = "{$this->dest}/" . $this->getHashedFilename( $title );
		if ( file_exists( $existing ) ) {
			return;
		}
	}

	if ( $this->showTitles ) {
		print $title->getPrefixedDBkey() . "\n";
	}

	$this->profile();

	// Reset the raw page list before rendering; it is read back below
	$this->rawPages = array();
	$html = $this->getArticleHTML( $title );
	if ( $html === false ) {
		return;
	}

	# Parse the XHTML to find the images
	#$images = $this->findImages( $text );
	#$this->copyImages( $images );

	# Write to file
	$this->writeArticle( $title, $html );

	# Do raw pages
	$this->mkdir( "{$this->dest}/raw", 0755 );
	foreach ( $this->rawPages as $record ) {
		list( $file, $rawTitle, $params ) = $record;

		$path = "{$this->dest}/raw/$file";
		if ( file_exists( $path ) ) {
			// Already written for an earlier article
			continue;
		}

		$rawPage = new RawPage( new Article( $rawTitle ), new FauxRequest( $params ) );
		$rawText = $rawPage->getRawText();

		print "Writing $file\n";
		$handle = fopen( $path, 'w' );
		if ( !$handle ) {
			print("Can't open file $path for writing\n");
			continue;
		}
		fwrite( $handle, $rawText );
		fclose( $handle );
	}

	wfIncrStats( 'dumphtml_article' );
}
| 519 | + |
| 520 | + /** Write the given text to the file identified by the given title object */ |
function writeArticle( $title, $text ) {
	wfProfileIn( __METHOD__ );
	$filename = $this->getHashedFilename( $title );

	# Temporary hack for current dump, this should be moved to
	# getFriendlyName() at the earliest opportunity.
	#
	# Limit filename length to 255 characters, so it works on ext3.
	# Titles are in fact limited to 255 characters, but dumpHTML
	# adds a suffix which may put them over the limit.
	$length = strlen( $filename );
	if ( $length > 255 ) {
		print "Warning: Filename too long ($length bytes). Skipping.\n";
		wfProfileOut( __METHOD__ );
		return;
	}

	$fullName = "{$this->dest}/$filename";
	$fullDir = dirname( $fullName );

	if ( $this->compress ) {
		$fullName .= ".gz";
		$text = gzencode( $text, 9 );
	}

	if ( preg_match( '/[\x80-\xFF]/', $fullName ) && wfIsWindows() ) {
		# Work around PHP unicode bug
		# Write to an ASCII-named temporary file, then let a VBScript
		# helper rename it to the real (non-ASCII) name.
		$rand = mt_rand( 0, 99999999 );
		$fullDir = str_replace( '/', '\\', $fullDir );
		$fullName = str_replace( '/', '\\', $fullName );
		$tempName = "{$this->dest}\\temp\\TEMP-$rand";

		$success = file_put_contents( $tempName, $text );
		if ( $success !== false ) {
			wfShellExec( "cscript /nologo " . wfEscapeShellArg(
				dirname( __FILE__ ) . "\\rename-hack.vbs",
				$this->escapeForVBScript( $tempName ),
				$this->escapeForVBScript( $fullName ) ) );
		}
	} else {
		if ( !$this->mkdir( $fullDir ) ) {
			print "Error: unable to create directory '$fullDir'.\n";
		}
		#wfSuppressWarnings();
		$success = file_put_contents( $fullName, $text );
		#wfRestoreWarnings();
	}

	// file_put_contents() returns the byte count, which is 0 for empty
	// text; only a strict false means the write failed.
	if ( $success === false ) {
		die("Can't open file '$fullName' for writing.\nCheck permissions or use another destination (-d).\n");
	}
	wfProfileOut( __METHOD__ );
}
| 574 | + |
| 575 | + /** Escape a UTF-8 string for VBScript's Unescape() */ |
function escapeForVBScript( $in ) {
	$utf16 = iconv( 'UTF-8', 'UTF-16BE', $in );
	if ( $utf16 === false ) {
		// Conversion failed (e.g. invalid UTF-8): nothing can be escaped
		return '';
	}

	$out = '';
	$length = strlen( $utf16 );
	// Walk the string two bytes (one UTF-16 code unit) at a time
	for ( $i = 0; $i + 1 < $length; $i += 2 ) {
		$unit = ord( $utf16[$i] ) * 256 + ord( $utf16[$i + 1] );
		// Printable ASCII passes through unchanged, except '%' (37): left
		// literal it could be read by Unescape() as the start of an escape
		if ( $unit >= 32 && $unit < 128 && $unit != 37 ) {
			$out .= chr( $unit );
		} else {
			$out .= sprintf( "%%u%04X", $unit );
		}
	}
	return $out;
}
| 589 | + |
| 590 | + /** Copy a directory recursively, not including .svn */ |
function copyDirectory( $source, $dest ) {
	if ( !is_dir( $dest ) && !mkdir( $dest ) ) {
		echo "Warning: unable to create directory \"$dest\"\n";
		return false;
	}
	$dir = opendir( $source );
	if ( !$dir ) {
		echo "Warning: unable to open directory \"$source\"\n";
		return false;
	}

	// Becomes false as soon as any file or subdirectory fails to copy;
	// the remaining entries are still attempted.
	$allCopied = true;
	while ( false !== ( $fileName = readdir( $dir ) ) ) {
		// Skips ".", ".." and every dot-file or dot-directory such as .svn
		if ( substr( $fileName, 0, 1 ) == '.' ) {
			continue;
		}
		$currentSource = "$source/$fileName";
		$currentDest = "$dest/$fileName";
		if ( is_dir( $currentSource ) ) {
			if ( !$this->copyDirectory( $currentSource, $currentDest ) ) {
				$allCopied = false;
			}
		} elseif ( is_file( $currentSource ) ) {
			if ( !copy( $currentSource, $currentDest ) ) {
				echo "Warning: unable to copy \"$currentSource\" to \"$currentDest\"\n";
				$allCopied = false;
			}
		}
	}
	// Release the directory handle
	closedir( $dir );
	return $allCopied;
}
| 617 | + |
| 618 | + /** Set up the destination directory */ |
function setupDestDir() {
	global $IP;

	// An existing destination is assumed to be initialised already
	if ( is_dir( $this->dest ) ) {
		echo "WARNING: destination directory already exists, skipping initialisation\n";
		return;
	}

	echo "Initialising destination directory...\n";
	$skinsCreated = $this->mkdir( "{$this->dest}/skins" );
	if ( !$skinsCreated ) {
		throw new MWException( "Unable to create destination directory." );
	}

	// Stamp the dump with the extension version
	file_put_contents( "{$this->dest}/dumpHTML.version", self::VERSION );

	// Skin directories to copy, as ( source, destination ) pairs
	$skinCopies = array(
		array( "$IP/skins/vector", "{$this->dest}/skins/vector" ),
		array( "$IP/skins/monobook", "{$this->dest}/skins/monobook" ),
		array( "$IP/skins/common", "{$this->dest}/skins/common" ),
		array( "{$this->extdir}/skin", "{$this->dest}/skins/offline" ),
	);
	foreach ( $skinCopies as $pair ) {
		$this->copyDirectory( $pair[0], $pair[1] );
	}
}
| 637 | + |
| 638 | + /** Create a file repo group which is a proxy of an old one */ |
function newRepoGroup( $old ) {
	// The proxy is given both this dumper and the repo group it wraps
	$proxy = new DumpHTML_ProxyRepoGroup( $this, $old );
	return $proxy;
}
| 642 | + |
/**
 * Set up globals required for parsing.
 *
 * On the first call this registers the dumper as a handler for the
 * GetLocalURL, GetFullURL, SiteNoticeBefore and SiteNoticeAfter hooks and
 * remembers the original values of the settings that are overridden below.
 * Every call then recomputes the relative paths for the requested depth,
 * installs the dump user, swaps in a proxy repo group and makes snapshots
 * of the logo, the copyright icon and the favicon.
 *
 * Terminates the script with exit status 1 if the dump user is not allowed
 * to read pages.
 *
 * @param $currentDepth Integer: directory depth to compute relative paths
 *   for, or NULL to use $this->depth
 */
function setupGlobals( $currentDepth = NULL ) {
	global $wgUser, $wgStylePath, $wgArticlePath, $wgMathPath;
	global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgScriptPath;
	global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
	global $wgEnableParserCache, $wgHooks, $wgServer;
	global $wgRightsUrl, $wgRightsText, $wgCopyrightIcon, $wgEnableSidebarCache;
	global $wgGenerateThumbnailOnParse, $wgValidSkinNames, $wgFavicon;
	global $wgDisableCounters;

	if ( !$this->setupDone ) {
		# One-time work. Objects are handles in PHP 5, so a plain assignment
		# registers this very instance; no reference to $this is needed.
		$wgHooks['GetLocalURL'][] = $this;
		$wgHooks['GetFullURL'][] = $this;
		$wgHooks['SiteNoticeBefore'][] = $this;
		$wgHooks['SiteNoticeAfter'][] = $this;

		# Remember the original settings before they are overwritten below
		$this->oldArticlePath = $wgServer . $wgArticlePath;
		$this->oldLogo = $wgLogo;
		$this->oldRepoGroup = RepoGroup::singleton();
		$this->oldCopyrightIcon = $wgCopyrightIcon;
		$this->oldScriptPath = $wgScriptPath;
		$this->oldFavicon = $wgFavicon;
		$wgValidSkinNames['offline'] = 'Offline';
	}

	if ( is_null( $currentDepth ) ) {
		$currentDepth = $this->depth;
	}

	# $wgScriptPath becomes a relative path whose length depends on the depth
	if ( $this->alternateScriptPath ) {
		if ( $currentDepth == 0 ) {
			$wgScriptPath = '.';
		} else {
			$wgScriptPath = '../..' . str_repeat( '/..', $currentDepth - 1 );
		}
	} else {
		if ( $currentDepth == 0 ) {
			$wgScriptPath = '..';
		} else {
			$wgScriptPath = '../..' . str_repeat( '/..', $currentDepth );
		}
	}

	if ( $currentDepth == 0 ) {
		$wgArticlePath = '$1';
		$this->articleBaseUrl = '.';
	} else {
		$this->articleBaseUrl = '..' . str_repeat( '/..', $currentDepth );
		$wgArticlePath = str_repeat( '../', $currentDepth + 1 ) . '$1';
	}

	# Use the last component of the upload path as the image directory name,
	# falling back to "images" for anything unexpected
	$uploadBits = explode( '/', str_replace( '\\', '/', $wgUploadPath ) );
	$this->imageRel = end( $uploadBits );
	if ( !in_array( $this->imageRel, array( 'images', 'upload' ) ) ) {
		$this->imageRel = 'images';
	}

	$wgStylePath = "{$this->articleBaseUrl}/skins";

	if ( $this->makeSnapshot ) {
		$this->destUploadUrl = "{$this->articleBaseUrl}/{$this->imageRel}";
	} else {
		$this->destUploadUrl = "$wgScriptPath/{$this->imageRel}";
	}
	$wgUploadPath = $this->destUploadUrl; // For BC
	$wgMaxCredits = -1;
	$wgHideInterlanguageLinks = !$this->interwiki;
	$wgThumbnailScriptPath = false;
	$wgEnableParserCache = false;
	$wgMathPath = "$wgScriptPath/math";
	$wgEnableSidebarCache = false;
	$wgGenerateThumbnailOnParse = true;
	$wgDisableCounters = true;

	if ( !empty( $wgRightsText ) ) {
		$wgRightsUrl = "$wgScriptPath/COPYING.html";
	}

	# Install the user the pages are rendered as
	$wgUser = User::newFromName( '__dumpHTML', false );
	$wgUser->setOption( 'skin', $this->skin );
	$wgUser->setOption( 'editsection', 0 );
	if ( $this->group ) {
		foreach ( explode( ',', $this->group ) as $group ) {
			$wgUser->addGroup( $group );
		}
		if ( !$wgUser->isAllowed( 'read' ) ) {
			print "The specified user group is not allowed to read\n";
			exit( 1 );
		}
	} elseif ( !$wgUser->isAllowed( 'read' ) ) {
		print "Default users are not allowed to read, please specify a --group option, e.g. --group=sysop\n";
		exit( 1 );
	}

	if ( $this->makeSnapshot ) {
		$this->destUploadDirectory = "{$this->dest}/{$this->imageRel}";
		if ( realpath( $this->destUploadDirectory ) == realpath( $wgUploadDirectory ) ) {
			print "Disabling image snapshot because the destination is the same as the source\n";
			$this->makeSnapshot = false;
			$this->destUploadDirectory = false;
		}
	} else {
		$this->destUploadDirectory = false;
	}

	# The proxy group reads destUploadDirectory/destUploadUrl, so it has to
	# be created after those have been settled above
	$newRepoGroup = $this->newRepoGroup( $this->oldRepoGroup );
	RepoGroup::setSingleton( $newRepoGroup );

	# Make a snapshot of the logo image and copyright icon
	$wgLogo = $this->makeUrlSnapshot( $this->oldLogo );
	if ( preg_match( '/<img [^>]*src="([^"]*)"/', $this->oldCopyrightIcon, $m ) ) {
		$urlText = $m[1];
		$url = Sanitizer::decodeCharReferences( $urlText );
		$url = $this->makeUrlSnapshot( $url );
		$wgCopyrightIcon = str_replace( $urlText, htmlspecialchars( $url ), $this->oldCopyrightIcon );
	}

	# Make a snapshot of the favicon
	$wgFavicon = $this->makeUrlSnapshot( $this->oldFavicon );

	$this->setupDone = true;
}
| 766 | + |
/**
 * Make a copy of a URL in the destination directory, and return the new relative URL.
 *
 * The file is stored as <dest>/misc/<basename of the URL>. An existing file
 * is never fetched again. If the download or the write fails, a warning is
 * printed and nothing is stored, so that a later call can retry instead of
 * finding an empty placeholder file.
 *
 * @param $url String: absolute URL, or a path which is prefixed with $wgServer
 * @return String: URL of the copy, relative to the page being rendered
 */
function makeUrlSnapshot( $url ) {
	global $wgServer;

	$this->mkdir( "{$this->dest}/misc" );
	$destName = urldecode( basename( $url ) );
	$destPath = "{$this->dest}/misc/$destName";

	if ( !file_exists( $destPath ) ) {
		if ( !preg_match( '/^https?:/', $url ) ) {
			$url = $wgServer . $url;
		}
		$contents = Http::get( $url );
		if ( $contents === false ) {
			print "Warning: unable to get contents of file from $url\n";
		} elseif ( file_put_contents( $destPath, $contents ) === false ) {
			print "Warning: unable to write to $destPath\n";
		}
	}

	return "{$this->articleBaseUrl}/misc/" . urlencode( $destName );
}
| 784 | + |
/**
 * Reads the content of a title object, executes the skin and captures the result.
 *
 * @param $title Title object to render, or null
 * @return String of skin output; the output of getRedirect() when the page
 *   is a redirect; false when $title is null
 */
function getArticleHTML( $title ) {
	global $wgOut, $wgTitle, $wgArticle, $wgUser;

	$linkCache =& LinkCache::singleton();
	$linkCache->clear();
	$wgTitle = $title;
	if ( is_null( $wgTitle ) ) {
		return false;
	}

	$ns = $wgTitle->getNamespace();
	$isSpecial = ( $ns == NS_SPECIAL );

	if ( !$isSpecial ) {
		/** @todo merge with Wiki.php code */
		switch ( $ns ) {
			case NS_IMAGE:
				$wgArticle = new ImagePage( $wgTitle );
				break;
			case NS_CATEGORY:
				$wgArticle = new CategoryPage( $wgTitle );
				break;
			default:
				$wgArticle = new Article( $wgTitle );
		}

		# Redirects get a small refresh page instead of a rendered article
		$redirectTarget = Title::newFromRedirect( $wgArticle->fetchContent() );
		if ( $redirectTarget != NULL ) {
			return $this->getRedirect( $redirectTarget );
		}
	}

	# Fresh output page for every title
	$wgOut = new OutputPage;
	$wgOut->setParserOptions( new ParserOptions );
	if ( $isSpecial ) {
		SpecialPage::executePath( $wgTitle );
	} else {
		$wgArticle->view();
	}

	# Let the skin print the page and capture what it prints
	$sk =& $wgUser->getSkin();
	ob_start();
	$sk->outputPage( $wgOut );
	return ob_get_clean();
}
| 830 | + |
/**
 * Build a minimal XHTML page which redirects the browser to another title.
 *
 * The target URL comes from escapeLocalURL(); the visible link text is the
 * prefixed title text, HTML-escaped here because titles may contain
 * characters such as "&" that are not valid in markup as-is.
 *
 * @param $rt Title object of the redirect target
 * @return String: complete XHTML document
 */
function getRedirect( $rt ) {
	$url = $rt->escapeLocalURL();
	$text = htmlspecialchars( $rt->getPrefixedText() );
	return <<<ENDTEXT
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
	<meta http-equiv="Refresh" content="0;url=$url" />
</head>
<body>
	<p>Redirecting to <a href="$url">$text</a></p>
</body>
</html>
ENDTEXT;
}
| 847 | + |
/**
 * Returns image paths used in an XHTML document.
 *
 * The document is run through an XML parser whose start-tag callback,
 * wfDumpStartTagHandler, collects its findings in the global $wgDumpImages.
 *
 * @param $text String: XHTML source
 * @return Array: the contents of $wgDumpImages after parsing
 */
function findImages( $text ) {
	global $wgOutputEncoding, $wgDumpImages;

	# Start from an empty result set for every document
	$wgDumpImages = array();

	$xml = xml_parser_create( $wgOutputEncoding );
	xml_set_element_handler( $xml, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );
	xml_parse( $xml, $text );
	xml_parser_free( $xml );

	return $wgDumpImages;
}
| 860 | + |
/**
 * Returns true if the path exists, false otherwise.
 * Unlike PHP's file_exists(), which returns false for broken symlinks,
 * this uses lstat() and therefore returns true for them.
 *
 * @param $path String: filesystem path
 * @return Boolean
 */
function pathExists( $path ) {
	# lstat() warns when the path is missing; that is the expected case here
	wfSuppressWarnings();
	$info = lstat( $path );
	wfRestoreWarnings();

	return (bool)$info;
}
| 871 | + |
/**
 * Copy a file specified by a URL to a given directory.
 *
 * Nothing is done if the destination already exists (broken symlinks count
 * as existing). Otherwise the file is symlinked, or copied if symlink() is
 * unavailable or $this->forceCopy is set. Failures only print a warning.
 *
 * @param string $srcPath The source URL
 * @param string $srcPathBase The base directory of the source URL
 * @param string $srcDirBase The base filesystem directory of the source URL
 * @param string $destDirBase The base filesystem directory of the destination URL
 */
function relativeCopy( $srcPath, $srcPathBase, $srcDirBase, $destDirBase ) {
	$rel = substr( $srcPath, strlen( $srcPathBase ) + 1 ); // +1 for slash
	$sourceLoc = "$srcDirBase/$rel";
	$destLoc = "$destDirBase/$rel";
	#print "Copying $sourceLoc to $destLoc\n";

	if ( $this->pathExists( $destLoc ) ) {
		return;
	}

	# mkdir() of this class takes the directory only and picks the mode itself
	$this->mkdir( dirname( $destLoc ) );

	if ( function_exists( 'symlink' ) && !$this->forceCopy ) {
		if ( !symlink( $sourceLoc, $destLoc ) ) {
			print "Warning: unable to create symlink at $destLoc\n";
		}
	} else {
		if ( !copy( $sourceLoc, $destLoc ) ) {
			print "Warning: unable to copy $sourceLoc to $destLoc\n";
		}
	}
}
| 898 | + |
/**
 * Copy an image, and if it is a thumbnail, copy its parent image too.
 *
 * A path counts as a thumbnail when its part relative to $srcPathBase starts
 * with "thumb/". The parent image is then taken from the three path segments
 * following "thumb". Thumbnail paths with fewer segments are left alone
 * instead of producing a bogus source path.
 *
 * @param string $srcPath The source URL
 * @param string $srcPathBase The base directory of the source URL
 * @param string $srcDirBase The base filesystem directory of the source URL
 * @param string $destDirBase The base filesystem directory of the destination URL
 */
function copyImage( $srcPath, $srcPathBase, $srcDirBase, $destDirBase ) {
	$this->relativeCopy( $srcPath, $srcPathBase, $srcDirBase, $destDirBase );

	$rel = substr( $srcPath, strlen( $srcPathBase ) + 1 ); // +1 for slash
	if ( substr( $rel, 0, 6 ) != 'thumb/' ) {
		return;
	}

	# The image was a thumbnail
	# Copy the source image as well
	$parts = explode( '/', $rel );
	if ( count( $parts ) < 4 ) {
		# Not enough segments to name a parent image
		return;
	}
	$newSrc = "$srcPathBase/{$parts[1]}/{$parts[2]}/{$parts[3]}";
	$this->relativeCopy( $newSrc, $srcPathBase, $srcDirBase, $destDirBase );
}
| 914 | + |
/**
 * Copy images (or create symlinks) from commons to a static directory.
 * This is necessary even if you intend to distribute all of commons, because
 * the directory contents is used to work out which image description pages
 * are needed.
 *
 * Also copies math images, and full-sized images if the makeSnapshot option
 * is specified.
 *
 * @param $images Array whose keys are URL-encoded image paths; the values
 *   are not used
 */
function copyImages( $images ) {
	global $wgUploadPath, $wgUploadDirectory, $wgMathPath, $wgMathDirectory;

	$mathPrefixLen = strlen( $wgMathPath );
	$uploadPrefixLen = strlen( $wgUploadPath );

	foreach ( $images as $encodedPath => $unused ) {
		$path = urldecode( $encodedPath );

		# Math images are always copied
		if ( substr( $path, 0, $mathPrefixLen ) == $wgMathPath ) {
			$this->relativeCopy( $path, $wgMathPath, $wgMathDirectory, "{$this->dest}/math" );
			continue;
		}

		# Uploaded images only when a snapshot was requested
		if ( !$this->makeSnapshot ) {
			continue;
		}
		if ( substr( $path, 0, $uploadPrefixLen ) == $wgUploadPath ) {
			$this->copyImage( $path, $wgUploadPath, $wgUploadDirectory, $this->destUploadDirectory );
		}
	}
}
| 940 | + |
/**
 * GetFullURL hook handler: rewrite links to other language editions so that
 * they point at the sibling dump directory of that language.
 *
 * Only external titles whose interwiki prefix has a language name are
 * rewritten; everything else is left to the default handling.
 *
 * @param $title Title being linked to
 * @param $url String: receives the rewritten URL
 * @param $query String: not used
 * @return Boolean: false if $url was set here, true otherwise
 */
function onGetFullURL( &$title, &$url, $query ) {
	global $wgContLang, $wgArticlePath;

	$iw = $title->getInterwiki();
	if ( !$title->isExternal() || !$wgContLang->getLanguageName( $iw ) ) {
		return true;
	}

	# An empty DB key means a link to the other wiki as such
	if ( $title->getDBkey() == '' ) {
		$target = "../$iw/index.html";
	} else {
		$target = "../$iw/" . wfUrlencode( $this->getHashedFilename( $title ) );
	}

	$url = str_replace( '$1', $target, $wgArticlePath );
	if ( $this->compress ) {
		$url .= ".gz";
	}
	return false;
}
| 958 | + |
/**
 * GetLocalURL hook handler: map local titles to their file in the dump.
 *
 * Requests with action=raw are mapped to a file below "raw/" and recorded
 * in $this->rawPages; all other local links go to the hashed article file
 * name. Interwiki titles keep their default URL.
 *
 * @param $title Title being linked to
 * @param $url String: receives the rewritten URL
 * @param $query String: query string of the link, may be empty
 * @return Boolean: false if $url was set here, true otherwise
 */
function onGetLocalURL( &$title, &$url, $query ) {
	global $wgArticlePath;

	if ( $title->isExternal() ) {
		# Default is fine for interwiki
		return true;
	}

	$url = false;
	if ( $query != '' ) {
		$params = array();
		parse_str( $query, $params );
		if ( isset( $params['action'] ) && $params['action'] == 'raw' ) {
			# "gen" is optional, so do not assume it is present
			$gen = isset( $params['gen'] ) ? $params['gen'] : '';
			if ( $gen == 'css' || $gen == 'js' ) {
				$file = 'gen.' . $gen;
			} else {
				$file = $this->getFriendlyName( $title->getPrefixedDBkey() );
				// Clean up Monobook.css etc.
				$matches = array();
				if ( preg_match( '/^(.*)\.(css|js)_[0-9a-f]{4}$/', $file, $matches ) ) {
					$file = $matches[1] . '.' . $matches[2];
				}
			}
			$this->rawPages[$file] = array( $file, $title, $params );
			$url = str_replace( '$1', "raw/" . wfUrlencode( $file ), $wgArticlePath );
		}
	}
	if ( $url === false ) {
		$url = str_replace( '$1', wfUrlencode( $this->getHashedFilename( $title ) ), $wgArticlePath );
	}
	$url .= $this->compress ? ".gz" : "";
	return false;
}
| 992 | + |
/**
 * Get the file name of a title, relative to the root of the dump.
 *
 * The main page becomes index.html; every other title goes to
 * articles/<hashed directory>/<friendly name>.html.
 *
 * @param $title Title object
 * @return String: relative file name
 * @throws MWException if $title is empty
 */
function getHashedFilename( &$title ) {
	if ( !$title ) {
		throw new MWException( 'Invalid $title parameter to '.__METHOD__ );
	}

	# Use the accessor, like getHashedDirectory() and onGetFullURL() do
	if ( '' != $title->getInterwiki() ) {
		$dbkey = $title->getDBkey();
	} else {
		$dbkey = $title->getPrefixedDBkey();
	}

	$mainPage = Title::newMainPage();
	if ( $mainPage->getPrefixedDBkey() == $dbkey ) {
		return 'index.html';
	}

	return 'articles/' . $this->getHashedDirectory( $title ) . '/' .
		$this->getFriendlyName( $dbkey ) . '.html';
}
| 1011 | + |
/**
 * Turn a page name into a file name that is safe on Windows and on
 * case-insensitive filesystems.
 *
 * @param $name String: page name (DB key form)
 * @return String: file name without extension
 */
function getFriendlyName( $name ) {
	global $wgLang;

	# Replace illegal characters for Windows paths with underscores
	$safe = strtr( $name, '/\\*?"<>|~', '_________' );

	# Work out lower case form. We assume we're on a system with case-insensitive
	# filenames, so unless the case is of a special form, we have to disambiguate
	$canonical = function_exists( 'mb_strtolower' )
		? $wgLang->ucfirst( mb_strtolower( $name ) )
		: ucfirst( strtolower( $name ) );

	# Make it mostly unique
	if ( $canonical != $safe ) {
		$safe .= '_' . substr( md5( $name ), 0, 4 );
	}

	# Handle colon specially by replacing it with tilde
	# Thus we reduce the number of paths with hashes appended
	return str_replace( ':', '~', $safe );
}
| 1035 | + |
/**
 * Get a relative directory for putting a title into.
 *
 * The directory has $this->depth levels; level i is derived from the i-th
 * character of the title (taken after the first colon, if there is one),
 * or "_" when the title is shorter than that.
 *
 * @param $title Title object
 * @return String: directory levels joined with "/"
 */
function getHashedDirectory( &$title ) {
	$pdbk = ( '' != $title->getInterwiki() )
		? $title->getDBkey()
		: $title->getPrefixedDBkey();

	# Find the first colon if there is one, use characters after it
	$colon = strpos( $pdbk, ':' );
	if ( $colon === false ) {
		$dbk = $pdbk;
	} else {
		# ...and drop any underscores directly following the colon
		$dbk = ltrim( substr( $pdbk, $colon + 1 ), '_' );
	}

	# Split into characters
	$m = array();
	preg_match_all( '/./us', $dbk, $m );
	$chars = $m[0];
	$length = count( $chars );

	$levels = array();
	for ( $i = 0; $i < $this->depth; $i++ ) {
		if ( $i >= $length ) {
			# Title too short: pad with an underscore
			$levels[] = '_';
			continue;
		}

		$c = $chars[$i];
		if ( ord( $c ) >= 128 || preg_match( '/[a-zA-Z0-9!#$%&()+,[\]^_`{}-]/', $c ) ) {
			# Non-ASCII and whitelisted characters are used in lower case
			$levels[] = function_exists( 'mb_strtolower' )
				? mb_strtolower( $c )
				: strtolower( $c );
		} else {
			# Everything else is written as two hex digits
			$levels[] = sprintf( "%02X", ord( $c ) );
		}
	}

	return implode( '/', $levels );
}
| 1084 | + |
/**
 * Calculate the start and end of a job based on the current slice
 * ($this->sliceNumerator out of $this->sliceDenominator).
 *
 * @param integer $start First ID of the whole range
 * @param integer $end Last ID of the whole range
 * @return array of integers: ( slice start, slice end )
 */
function sliceRange( $start, $end ) {
	$perSlice = ( $end - $start + 1 ) / $this->sliceDenominator;

	$sliceStart = $start + intval( $perSlice * ( $this->sliceNumerator - 1 ) );

	# The last slice always runs up to the very end of the range
	$sliceEnd = ( $this->sliceNumerator == $this->sliceDenominator )
		? $end
		: $start + intval( $perSlice * $this->sliceNumerator ) - 1;

	return array( $sliceStart, $sliceEnd );
}
| 1102 | + |
/**
 * Adjust a start point so that it belongs to the current slice, where slices are defined by integer modulo.
 *
 * @param integer $start
 * @param integer $base The true start of the range; the minimum start
 * @return integer The adjusted start point
 */
function modSliceStart( $start, $base = 1 ) {
	$offset = $start - $base;
	# Round the offset down to a multiple of the slice count...
	$aligned = $offset - ( $offset % $this->sliceDenominator );
	# ...then add the zero-based position of this slice
	return $aligned + $this->sliceNumerator - 1 + $base;
}
| 1111 | + |
/**
 * Determine whether a string belongs to the current slice, based on hash.
 *
 * crc32() may return a negative integer on 32-bit platforms, which would
 * make the remainder negative so that the string matches no slice at all.
 * The checksum is therefore normalised to its unsigned value first.
 *
 * @param $s String to classify
 * @return Boolean: true if $s falls into slice number $this->sliceNumerator
 */
function sliceFilter( $s ) {
	$hash = crc32( $s );
	if ( $hash < 0 ) {
		# Unsigned equivalent; this turns into a float on 32-bit platforms
		$hash += 4294967296;
	}
	return fmod( $hash, $this->sliceDenominator ) == $this->sliceNumerator - 1;
}
| 1118 | + |
/**
 * SiteNoticeBefore hook handler: no site notice.
 *
 * @param $text String: site notice text, emptied here
 * @return Boolean: always false
 */
function onSiteNoticeBefore( &$text ) {
	$text = '';

	return false;
}

/**
 * SiteNoticeAfter hook handler: no site notice.
 *
 * @param $text String: site notice text, emptied here
 * @return Boolean: always false
 */
function onSiteNoticeAfter( &$text ) {
	$text = '';

	return false;
}
| 1130 | + |
/**
 * Get the highest page_id in the page table.
 * The value is queried once and then kept in $this->maxPageID.
 *
 * @return The result of selecting max(page_id)
 */
function getMaxPageID() {
	if ( $this->maxPageID !== false ) {
		# Already looked up
		return $this->maxPageID;
	}

	$dbr = wfGetDB( DB_SLAVE );
	$this->maxPageID = $dbr->selectField( 'page', 'max(page_id)', false, __METHOD__ );
	return $this->maxPageID;
}
| 1138 | + |
/**
 * Switch UDP profiling on for one out of every $this->udpProfile calls.
 *
 * Does nothing unless $this->udpProfile is set. When the counter is 0 a
 * ProfilerSimpleUDP is installed as $wgProfiler; on the call after that the
 * "-total" section is closed, the function report is requested and the
 * profiler is replaced by a DumpHTML_ProfilerStub.
 */
function profile() {
	global $wgProfiler, $wgRequestTime, $wgRUstart;

	$period = $this->udpProfile;
	if ( !$period ) {
		return;
	}

	if ( $this->udpProfileInit ) {
		# The call following a profiled one finishes that profile
		if ( $this->udpProfileCounter == ( 1 % $period ) ) {
			wfProfileOut( '-total' );
			$wgProfiler->getFunctionReport();
			$wgProfiler = new DumpHTML_ProfilerStub;
		}
	} else {
		$this->udpProfileInit = true;
		$this->udpProfileCounter = 0;
	}

	if ( $this->udpProfileCounter == 0 ) {
		# Start a new profile
		$wgProfiler = new ProfilerSimpleUDP;
		$wgProfiler->setProfileID( 'dumpHTML' );
		$wgRequestTime = microtime( true );
		$wgRUstart = getrusage();
		wfProfileIn( '-total' );
	}

	$this->udpProfileCounter = ( $this->udpProfileCounter + 1 ) % $period;
}
| 1162 | + |
/**
 * Print a message followed by a newline.
 *
 * @param $text String: the message
 */
function debug( $text ) {
	echo $text . "\n";
}
| 1166 | + |
/**
 * Create a directory including missing parents, with mode 0755.
 *
 * An alternative for non-Windows systems that shelled out to `mkdir -p`
 * used to live here but was disabled; wfMkdirParents() is used everywhere.
 *
 * @param $dir String: directory to create
 * @return The result of wfMkdirParents()
 */
function mkdir( $dir ) {
	$created = wfMkdirParents( $dir, 0755 );
	return $created;
}
| 1176 | +} |
| 1177 | + |
/**
 * Profiler replacement whose methods do nothing.
 * DumpHTML::profile() installs it as $wgProfiler between profiled runs.
 */
class DumpHTML_ProfilerStub {
	/** No-op */
	function profileIn() {
	}

	/** No-op */
	function profileOut() {
	}

	/** No-op */
	function getOutput() {
	}

	/** No-op */
	function close() {
	}

	/** No-op */
	function getFunctionReport() {
	}

	/** @return String: always empty */
	function getCurrentSection() {
		return '';
	}
}
| 1186 | + |
/**
 * Repo group in which the local repo and every foreign repo of a backend
 * repo group are wrapped in a DumpHTML_ProxyRepo.
 */
class DumpHTML_ProxyRepoGroup extends RepoGroup {
	var $dump, $backendRG;

	/**
	 * @param $dump The dumper; provides makeSnapshot, destUploadDirectory
	 *   and destUploadUrl
	 * @param $backendRG RepoGroup to wrap
	 */
	function __construct( $dump, $backendRG ) {
		$this->dump = $dump;
		$this->backendRG = $backendRG;
		$backendRG->initialiseRepos();

		# With foreign repos present the local repo gets its own "local"
		# subdirectory next to the per-repo directories
		if ( count( $backendRG->foreignRepos ) ) {
			$localDest = "{$dump->destUploadDirectory}/local";
			$localUrl = "{$dump->destUploadUrl}/local";
		} else {
			$localDest = $dump->destUploadDirectory;
			$localUrl = $dump->destUploadUrl;
		}
		if ( !$dump->makeSnapshot ) {
			$localDest = false;
		}

		$this->reposInitialised = true;
		$this->localRepo = new DumpHTML_ProxyRepo( $backendRG->getLocalRepo(), $dump, $localDest, $localUrl );

		$this->foreignRepos = array();
		foreach ( $backendRG->foreignRepos as $repo ) {
			# Replace slash, dot and space in the repo name with underscores.
			# This needs the three-argument form of strtr(): the array form
			# expects search => replacement pairs.
			$friendlyName = strtr( $repo->getName(), '/. ', '___' );
			if ( !$dump->makeSnapshot ) {
				$foreignDest = false;
			} else {
				$foreignDest = "{$dump->destUploadDirectory}/$friendlyName";
			}
			$this->foreignRepos[] = new DumpHTML_ProxyRepo( $repo, $dump, $foreignDest,
				$dump->destUploadUrl . '/' . urlencode( $friendlyName ) );
		}
	}
}
| 1220 | + |
/**
 * Proxy for a file repo. Calls it does not implement itself are forwarded
 * to the backend repo; files it hands out are wrapped in DumpHTML_ProxyFile
 * and copied to the dump.
 */
class DumpHTML_ProxyRepo {
	var $backend, $dump, $directory, $url, $name, $backendUrl;

	/**
	 * @param $backend The repo to wrap
	 * @param $dump The dumper
	 * @param $directory String: destination directory for copied files, or false
	 * @param $url String: URL corresponding to $directory
	 */
	function __construct( $backend, $dump, $directory, $url ) {
		$this->backend = $backend;
		$this->dump = $dump;
		$this->directory = $directory;
		$this->url = $url;
		$this->name = $backend->getName();
		$this->backend->thumbScriptUrl = false;
		$this->backend->transformVia404 = false;
		$this->backendUrl = $backend->getZoneUrl( 'public' );
	}

	/** Forward everything that is not defined here to the backend repo */
	function __call( $name, $args ) {
		return call_user_func_array( array( $this->backend, $name ), $args );
	}

	/**
	 * Like newFile() of the backend, but the result is wrapped in a proxy
	 * file and copied to the dump.
	 */
	function newFile( $title, $time = false ) {
		$file = $this->backend->newFile( $title, $time );
		if ( $file ) {
			$file = new DumpHTML_ProxyFile( $file, $this );
			$file->copyToDump();
		}
		return $file;
	}

	/**
	 * Like findFile() of the backend, but the result is wrapped in a proxy
	 * file and copied to the dump.
	 */
	function findFile( $title, $time = false ) {
		$file = $this->backend->findFile( $title, $time );
		if ( $file ) {
			$file = new DumpHTML_ProxyFile( $file, $this );
			$file->copyToDump();
		}
		return $file;
	}

	/**
	 * Copy (or symlink) a file of the public zone into the dump.
	 *
	 * Does nothing unless the dumper makes an image snapshot, or if the
	 * destination exists already. The file is taken from the local path of
	 * the zone where the backend has one, otherwise it is downloaded from
	 * the zone URL. Failures are reported via the dumper's debug().
	 *
	 * @param $rel String: file path relative to the public zone
	 */
	function copyToDump( $rel ) {
		if ( !$this->dump->makeSnapshot ) {
			return;
		}

		# Both start out as "unknown" so neither is ever read undefined
		$sourceBase = false;
		$sourceBaseUrl = false;
		if ( is_callable( array( $this->backend, 'getZonePath' ) ) ) {
			$sourceBase = $this->backend->getZonePath( 'public' );
		}
		if ( $sourceBase === false && is_callable( array( $this->backend, 'getZoneUrl' ) ) ) {
			$sourceBaseUrl = $this->backend->getZoneUrl( 'public' );
		}

		$dest = "{$this->directory}/$rel";

		if ( $this->dump->pathExists( $dest ) ) {
			return;
		}

		if ( $sourceBase !== false ) {
			$source = "$sourceBase/$rel";
			if ( !file_exists( $source ) ) {
				// Hopefully we'll get another go at it later
				return;
			}
			if ( !is_dir( dirname( $dest ) ) ) {
				$this->dump->mkdir( dirname( $dest ) );
			}

			#$this->dump->debug( "Copying $source to $dest" );
			if ( function_exists( 'symlink' ) && !$this->dump->forceCopy ) {
				if ( !symlink( $source, $dest ) ) {
					$this->dump->debug( "Warning: unable to create symlink at $dest" );
				}
			} else {
				if ( !copy( $source, $dest ) ) {
					$this->dump->debug( "Warning: unable to copy $source to $dest" );
				}
			}
		} elseif ( $sourceBaseUrl !== false ) {
			$urlRel = implode( '/', array_map( 'rawurlencode', explode( '/', $rel ) ) );
			$sourceUrl = $sourceBaseUrl . '/' . $urlRel;
			$contents = Http::get( $sourceUrl );
			if ( $contents === false ) {
				$this->dump->debug( "Unable to get contents of file from $sourceUrl" );
			} else {
				# The download needs its directory just like a copy does
				if ( !is_dir( dirname( $dest ) ) ) {
					$this->dump->mkdir( dirname( $dest ) );
				}
				# debug() lives on the dumper; calling it on $this would be
				# forwarded to the backend repo by __call()
				if ( !file_put_contents( $dest, $contents ) ) {
					$this->dump->debug( "Unable to write to $dest" );
				}
			}
		} // else give up
	}
}
| 1310 | + |
/**
 * Proxy for a file object. Calls are forwarded to the backend file; URLs
 * in the results that point into the backend repo are rewritten to point
 * into the dump, and the files they refer to are copied there.
 */
class DumpHTML_ProxyFile {
	var $backend, $repo, $dump;

	/**
	 * @param $backend The file object to wrap
	 * @param $repo DumpHTML_ProxyRepo the file belongs to
	 */
	function __construct( $backend, $repo ) {
		$this->backend = $backend;
		$this->repo = $repo;
		$this->dump = $repo->dump;
	}

	/**
	 * Forward a call to the backend file and fix up the result: strings go
	 * through fixURL(), MediaTransformOutput objects through fixMTO().
	 *
	 * @throws MWException if the backend has no such method
	 */
	function __call( $name, $args ) {
		$callback = array( $this->backend, $name );
		if ( !is_callable( $callback ) ) {
			throw new MWException( "Attempt to call invalid function LocalFile::$name\n" );
		}
		$result = call_user_func_array( $callback, $args );
		if ( is_string( $result ) ) {
			$result = $this->fixURL( $result );
		} elseif ( $result instanceof MediaTransformOutput ) {
			$result = $this->fixMTO( $result );
		}
		return $result;
	}

	/** @return String: URL of the file inside the dump */
	function getUrl() {
		return $this->repo->url . '/' . $this->backend->getUrlRel();
	}

	/** @return String: same as getUrl() */
	public function getFullURL() {
		return $this->getUrl();
	}

	/**
	 * If $url starts with the URL of the backend repo, copy the file it
	 * refers to into the dump and return the corresponding dump URL.
	 * Anything else is returned unchanged.
	 */
	function fixURL( $url ) {
		if ( is_string( $url ) && substr( $url, 0, strlen( $this->repo->backendUrl ) ) == $this->repo->backendUrl ) {
			$rel = substr( $url, strlen( $this->repo->backendUrl ) + 1 );
			$rel = implode( '/', array_map( 'rawurldecode', explode( '/', $rel ) ) );
			$this->repo->copyToDump( $rel );
			$url = $this->repo->url . '/' . $rel;
		}
		return $url;
	}

	/** Apply fixURL() to the URL of a transform output object */
	function fixMTO( $thumb ) {
		// FIXME: accessing private members, needs MTO::setUrl()
		if ( isset( $thumb->url ) ) {
			$thumb->url = $this->fixURL( $thumb->url );
		}
		return $thumb;
	}

	/**
	 * Copy (or symlink) the file into the dump.
	 *
	 * Does nothing unless the dumper makes an image snapshot, or if the
	 * destination exists already. When the backend has no local path for
	 * the file (getPath() returns false) it is downloaded from its URL.
	 * Failures are reported via the dumper's debug().
	 */
	function copyToDump() {
		if ( !$this->dump->makeSnapshot ) {
			return;
		}

		$source = $this->backend->getPath();
		$dest = $this->repo->directory . '/' . $this->backend->getRel();

		if ( $this->dump->pathExists( $dest ) ) {
			return;
		}

		# Both branches below write into this directory
		if ( !is_dir( dirname( $dest ) ) ) {
			$this->dump->mkdir( dirname( $dest ) );
		}

		#$this->dump->debug ( "Copying $source to $dest\n" );
		if ( $source === false ) {
			$sourceUrl = $this->backend->getUrl();
			$contents = Http::get( $sourceUrl );
			if ( $contents === false ) {
				$this->dump->debug( "Unable to get contents of file from $sourceUrl" );
			} else {
				# debug() lives on the dumper; calling it on $this would end
				# up in __call() and throw
				if ( !file_put_contents( $dest, $contents ) ) {
					$this->dump->debug( "Unable to write to $dest" );
				}
			}
		} else {
			if ( function_exists( 'symlink' ) && !$this->dump->forceCopy ) {
				if ( !symlink( $source, $dest ) ) {
					$this->dump->debug( "Warning: unable to create symlink at $dest" );
				}
			} else {
				if ( !copy( $source, $dest ) ) {
					$this->dump->debug( "Warning: unable to copy $source to $dest" );
				}
			}
		}
	}
}
| 1399 | + |
| 1400 | + |
/**
 * XML parser callback for opening tags.
 * Records the SRC attribute of every IMG element as a key of the global
 * $wgDumpImages.
 *
 * @param $parser The XML parser (not used)
 * @param $name String: element name
 * @param $attribs Array: attributes of the element
 */
function wfDumpStartTagHandler( $parser, $name, $attribs ) {
	global $wgDumpImages;

	if ( $name != 'IMG' || !isset( $attribs['SRC'] ) ) {
		return;
	}

	$src = $attribs['SRC'];
	$wgDumpImages[$src] = true;
}
| 1409 | + |
/**
 * XML parser callback for closing tags; nothing to do.
 *
 * @param $parser The XML parser (not used)
 * @param $name String: element name (not used)
 */
function wfDumpEndTagHandler( $parser, $name ) {
}
| 1412 | + |
| 1413 | +# vim: syn=php |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/dumpHTML.inc |
___________________________________________________________________ |
Name: svn:keywords |
1 | 1414 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 1415 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/rename-hack.vbs |
— | — | @@ -0,0 +1,30 @@ |
' Script to move a file to a multibyte filename
' Workaround for PHP 5's inadequacy
'
' Arguments:
'   0 - source file name, escaped (decoded with Unescape)
'   1 - destination file name, escaped (decoded with Unescape)
'
' Missing parent directories of the destination are created, and an
' existing destination file is replaced.
Option Explicit

Dim dirsToCreate()
Dim source, dest, fso, destDir, parent, numDirs, i

' Both arguments are required
If WScript.Arguments.Count < 2 Then
	WScript.Quit 1
End If

source = Unescape( WScript.Arguments.Item( 0 ) )
dest = Unescape( WScript.Arguments.Item( 1 ) )
Set fso = CreateObject( "Scripting.FileSystemObject" )

' Create the destination directory: walk upwards and collect every
' directory that does not exist yet. The list grows as needed, so the
' number of missing levels is not limited.
destDir = fso.GetParentFolderName( fso.GetAbsolutePathName( dest ) )
parent = destDir
numDirs = 0
While parent <> "" And Not fso.FolderExists( parent )
	ReDim Preserve dirsToCreate( numDirs )
	dirsToCreate( numDirs ) = parent
	numDirs = numDirs + 1
	parent = fso.GetParentFolderName( parent )
Wend

' The topmost missing directory was found last, so create in reverse order
For i = numDirs - 1 To 0 Step -1
	fso.CreateFolder dirsToCreate( i )
Next

' Remove the destination file if it exists already
If fso.FileExists( dest ) Then
	fso.DeleteFile dest
End If

' Move the temporary file to its destination
fso.MoveFile source, dest
| 31 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/rename-hack.vbs |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 32 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/skin/md5.js |
— | — | @@ -0,0 +1,256 @@ |
| 2 | +/* |
| 3 | + * A JavaScript implementation of the RSA Data Security, Inc. MD5 Message |
| 4 | + * Digest Algorithm, as defined in RFC 1321. |
| 5 | + * Version 2.1 Copyright (C) Paul Johnston 1999 - 2002. |
| 6 | + * Other contributors: Greg Holt, Andrew Kepert, Ydnar, Lostinet |
| 7 | + * Distributed under the BSD License |
| 8 | + * See http://pajhome.org.uk/crypt/md5 for more info. |
| 9 | + */ |
| 10 | + |
/*
 * Configurable variables. You may need to tweak these to be compatible with
 * the server-side, but the defaults work in most cases.
 *
 *   hexcase - hex output format. 0 - lowercase; 1 - uppercase
 *   b64pad  - base-64 pad character. "=" for strict RFC compliance
 *   chrsz   - bits per input character. 8 - ASCII; 16 - Unicode
 */
var hexcase = 0,
    b64pad  = "",
    chrsz   = 8;
| 18 | + |
/*
 * These are the functions you'll usually want to call
 * They take string arguments and return either hex or base-64 encoded strings
 * (the str_* variants return the result of binl2str instead).
 */
function hex_md5(s)
{
  var digest = core_md5(str2binl(s), s.length * chrsz);
  return binl2hex(digest);
}

function b64_md5(s)
{
  var digest = core_md5(str2binl(s), s.length * chrsz);
  return binl2b64(digest);
}

function str_md5(s)
{
  var digest = core_md5(str2binl(s), s.length * chrsz);
  return binl2str(digest);
}

function hex_hmac_md5(key, data)
{
  var mac = core_hmac_md5(key, data);
  return binl2hex(mac);
}

function b64_hmac_md5(key, data)
{
  var mac = core_hmac_md5(key, data);
  return binl2b64(mac);
}

function str_hmac_md5(key, data)
{
  var mac = core_hmac_md5(key, data);
  return binl2str(mac);
}
| 29 | + |
/*
 * Self-test: hash a fixed input and compare against its known hex digest.
 * Returns true when the interpreter computes the expected value.
 */
function md5_vm_test()
{
  var expected = "900150983cd24fb0d6963f7d28e17f72";
  var actual = hex_md5("abc");
  return actual == expected;
}
| 37 | + |
/*
 * Calculate the MD5 of an array of little-endian words, and a bit length.
 * x is modified in place (padding is appended to it); the result is an
 * array of four 32-bit words.
 */
function core_md5(x, len)
{
  /* append padding: a 0x80 byte after the message, then the bit length */
  x[len >> 5] |= 0x80 << ((len) % 32);
  x[(((len + 64) >>> 9) << 4) + 14] = len;

  /* chaining variables */
  var a =  1732584193, b = -271733879, c = -1732584194, d =  271733878;
  var prevA, prevB, prevC, prevD;

  /* process the message sixteen words at a time */
  for(var k = 0; k < x.length; k += 16)
  {
    prevA = a;
    prevB = b;
    prevC = c;
    prevD = d;

    /* round 1 */
    a = md5_ff(a, b, c, d, x[k+ 0], 7 , -680876936);
    d = md5_ff(d, a, b, c, x[k+ 1], 12, -389564586);
    c = md5_ff(c, d, a, b, x[k+ 2], 17,  606105819);
    b = md5_ff(b, c, d, a, x[k+ 3], 22, -1044525330);
    a = md5_ff(a, b, c, d, x[k+ 4], 7 , -176418897);
    d = md5_ff(d, a, b, c, x[k+ 5], 12,  1200080426);
    c = md5_ff(c, d, a, b, x[k+ 6], 17, -1473231341);
    b = md5_ff(b, c, d, a, x[k+ 7], 22, -45705983);
    a = md5_ff(a, b, c, d, x[k+ 8], 7 ,  1770035416);
    d = md5_ff(d, a, b, c, x[k+ 9], 12, -1958414417);
    c = md5_ff(c, d, a, b, x[k+10], 17, -42063);
    b = md5_ff(b, c, d, a, x[k+11], 22, -1990404162);
    a = md5_ff(a, b, c, d, x[k+12], 7 ,  1804603682);
    d = md5_ff(d, a, b, c, x[k+13], 12, -40341101);
    c = md5_ff(c, d, a, b, x[k+14], 17, -1502002290);
    b = md5_ff(b, c, d, a, x[k+15], 22,  1236535329);

    /* round 2 */
    a = md5_gg(a, b, c, d, x[k+ 1], 5 , -165796510);
    d = md5_gg(d, a, b, c, x[k+ 6], 9 , -1069501632);
    c = md5_gg(c, d, a, b, x[k+11], 14,  643717713);
    b = md5_gg(b, c, d, a, x[k+ 0], 20, -373897302);
    a = md5_gg(a, b, c, d, x[k+ 5], 5 , -701558691);
    d = md5_gg(d, a, b, c, x[k+10], 9 ,  38016083);
    c = md5_gg(c, d, a, b, x[k+15], 14, -660478335);
    b = md5_gg(b, c, d, a, x[k+ 4], 20, -405537848);
    a = md5_gg(a, b, c, d, x[k+ 9], 5 ,  568446438);
    d = md5_gg(d, a, b, c, x[k+14], 9 , -1019803690);
    c = md5_gg(c, d, a, b, x[k+ 3], 14, -187363961);
    b = md5_gg(b, c, d, a, x[k+ 8], 20,  1163531501);
    a = md5_gg(a, b, c, d, x[k+13], 5 , -1444681467);
    d = md5_gg(d, a, b, c, x[k+ 2], 9 , -51403784);
    c = md5_gg(c, d, a, b, x[k+ 7], 14,  1735328473);
    b = md5_gg(b, c, d, a, x[k+12], 20, -1926607734);

    /* round 3 */
    a = md5_hh(a, b, c, d, x[k+ 5], 4 , -378558);
    d = md5_hh(d, a, b, c, x[k+ 8], 11, -2022574463);
    c = md5_hh(c, d, a, b, x[k+11], 16,  1839030562);
    b = md5_hh(b, c, d, a, x[k+14], 23, -35309556);
    a = md5_hh(a, b, c, d, x[k+ 1], 4 , -1530992060);
    d = md5_hh(d, a, b, c, x[k+ 4], 11,  1272893353);
    c = md5_hh(c, d, a, b, x[k+ 7], 16, -155497632);
    b = md5_hh(b, c, d, a, x[k+10], 23, -1094730640);
    a = md5_hh(a, b, c, d, x[k+13], 4 ,  681279174);
    d = md5_hh(d, a, b, c, x[k+ 0], 11, -358537222);
    c = md5_hh(c, d, a, b, x[k+ 3], 16, -722521979);
    b = md5_hh(b, c, d, a, x[k+ 6], 23,  76029189);
    a = md5_hh(a, b, c, d, x[k+ 9], 4 , -640364487);
    d = md5_hh(d, a, b, c, x[k+12], 11, -421815835);
    c = md5_hh(c, d, a, b, x[k+15], 16,  530742520);
    b = md5_hh(b, c, d, a, x[k+ 2], 23, -995338651);

    /* round 4 */
    a = md5_ii(a, b, c, d, x[k+ 0], 6 , -198630844);
    d = md5_ii(d, a, b, c, x[k+ 7], 10,  1126891415);
    c = md5_ii(c, d, a, b, x[k+14], 15, -1416354905);
    b = md5_ii(b, c, d, a, x[k+ 5], 21, -57434055);
    a = md5_ii(a, b, c, d, x[k+12], 6 ,  1700485571);
    d = md5_ii(d, a, b, c, x[k+ 3], 10, -1894986606);
    c = md5_ii(c, d, a, b, x[k+10], 15, -1051523);
    b = md5_ii(b, c, d, a, x[k+ 1], 21, -2054922799);
    a = md5_ii(a, b, c, d, x[k+ 8], 6 ,  1873313359);
    d = md5_ii(d, a, b, c, x[k+15], 10, -30611744);
    c = md5_ii(c, d, a, b, x[k+ 6], 15, -1560198380);
    b = md5_ii(b, c, d, a, x[k+13], 21,  1309151649);
    a = md5_ii(a, b, c, d, x[k+ 4], 6 , -145523070);
    d = md5_ii(d, a, b, c, x[k+11], 10, -1120210379);
    c = md5_ii(c, d, a, b, x[k+ 2], 15,  718787259);
    b = md5_ii(b, c, d, a, x[k+ 9], 21, -343485551);

    /* fold this block's result into the running state */
    a = safe_add(a, prevA);
    b = safe_add(b, prevB);
    c = safe_add(c, prevC);
    d = safe_add(d, prevD);
  }

  return [a, b, c, d];
}
| 135 | + |
/*
 * The four basic operations the algorithm uses. md5_cmn is the shared step;
 * md5_ff/gg/hh/ii differ only in the bitwise function of b, c and d that is
 * passed to it as q.
 */
function md5_cmn(q, a, b, x, s, t)
{
  var sum = safe_add(safe_add(a, q), safe_add(x, t));
  var rotated = bit_rol(sum, s);
  return safe_add(rotated, b);
}
function md5_ff(a, b, c, d, x, s, t)
{
  var mixed = (b & c) | ((~b) & d);
  return md5_cmn(mixed, a, b, x, s, t);
}
function md5_gg(a, b, c, d, x, s, t)
{
  var mixed = (b & d) | (c & (~d));
  return md5_cmn(mixed, a, b, x, s, t);
}
function md5_hh(a, b, c, d, x, s, t)
{
  var mixed = b ^ c ^ d;
  return md5_cmn(mixed, a, b, x, s, t);
}
function md5_ii(a, b, c, d, x, s, t)
{
  var mixed = c ^ (b | (~d));
  return md5_cmn(mixed, a, b, x, s, t);
}
| 159 | + |
/*
 * Calculate the HMAC-MD5 of a key and some data. Both are strings; the
 * result is an array of four little-endian words.
 */
function core_hmac_md5(key, data)
{
  var keyWords = str2binl(key);
  /* keys longer than one 16-word block are replaced by their own MD5 */
  if(keyWords.length > 16)
  {
    keyWords = core_md5(keyWords, key.length * chrsz);
  }

  var innerPad = Array(16);
  var outerPad = Array(16);
  for(var w = 0; w < 16; w++)
  {
    innerPad[w] = keyWords[w] ^ 0x36363636;
    outerPad[w] = keyWords[w] ^ 0x5C5C5C5C;
  }

  var innerMessage = innerPad.concat(str2binl(data));
  var innerHash = core_md5(innerMessage, 512 + data.length * chrsz);
  var outerMessage = outerPad.concat(innerHash);
  return core_md5(outerMessage, 512 + 128);
}
| 178 | + |
/*
 * Add two integers, wrapping at 2^32. The sum is built from 16-bit halves
 * to work around bugs in some JS interpreters.
 */
function safe_add(x, y)
{
  var lowSum = (x & 0xFFFF) + (y & 0xFFFF);
  var carry = lowSum >> 16;
  var highSum = (x >> 16) + (y >> 16) + carry;
  return (highSum << 16) | (lowSum & 0xFFFF);
}
| 189 | + |
/*
 * Rotate a 32-bit number left by cnt bits: the bits shifted out at the top
 * re-enter at the bottom.
 */
function bit_rol(num, cnt)
{
  var wrapped = num >>> (32 - cnt);
  var shifted = num << cnt;
  return shifted | wrapped;
}
| 197 | + |
/*
 * Pack a string into an array of little-endian 32-bit words, chrsz bits per
 * character. With chrsz == 8 the high byte of characters >255 is dropped.
 */
function str2binl(str)
{
  var words = [];
  var charMask = (1 << chrsz) - 1;
  for(var bit = 0; bit < str.length * chrsz; bit += chrsz)
  {
    var code = str.charCodeAt(bit / chrsz) & charMask;
    words[bit >> 5] |= code << (bit % 32);
  }
  return words;
}
| 210 | + |
/*
 * Unpack an array of little-endian words into a string, one character per
 * chrsz bits.
 */
function binl2str(bin)
{
  var out = "";
  var charMask = (1 << chrsz) - 1;
  for(var bit = 0; bit < bin.length * 32; bit += chrsz)
  {
    var code = (bin[bit >> 5] >>> (bit % 32)) & charMask;
    out += String.fromCharCode(code);
  }
  return out;
}
| 222 | + |
/*
 * Convert an array of little-endian words to a hex string, two digits per
 * byte, lowest byte of each word first. hexcase selects upper/lower case.
 */
function binl2hex(binarray)
{
  var digits = hexcase ? "0123456789ABCDEF" : "0123456789abcdef";
  var out = "";
  for(var n = 0; n < binarray.length * 4; n++)
  {
    var word = binarray[n >> 2];
    var shift = (n % 4) * 8;
    out += digits.charAt((word >> (shift + 4)) & 0xF);
    out += digits.charAt((word >> shift) & 0xF);
  }
  return out;
}
| 237 | + |
/*
 * Convert an array of little-endian words to a base-64 string. Bytes are
 * consumed three at a time; positions past the end of the input are filled
 * with b64pad.
 */
function binl2b64(binarray)
{
  var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  var out = "";
  for(var pos = 0; pos < binarray.length * 4; pos += 3)
  {
    var first  = (binarray[pos     >> 2] >> 8 * ( pos   %4)) & 0xFF;
    var second = (binarray[pos+1 >> 2] >> 8 * ((pos+1)%4)) & 0xFF;
    var third  = (binarray[pos+2 >> 2] >> 8 * ((pos+2)%4)) & 0xFF;
    var triplet = (first << 16) | (second << 8) | third;
    for(var sextet = 0; sextet < 4; sextet++)
    {
      if(pos * 8 + sextet * 6 > binarray.length * 32)
      {
        out += b64pad;
      }
      else
      {
        out += alphabet.charAt((triplet >> 6*(3-sextet)) & 0x3F);
      }
    }
  }
  return out;
}
Property changes on: branches/wmf-deployment/extensions/DumpHTML/skin/md5.js |
___________________________________________________________________ |
Name: svn:keywords |
1 | 258 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 259 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/skin/utf8.js |
— | — | @@ -0,0 +1,72 @@ |
| 2 | +/** |
| 3 | + * Obtained from http://homepage3.nifty.com/aokura/jscript/index.html |
| 4 | + * The webpage says, among other things: |
| 5 | + * * ソースコードの全てあるいは一部を使用したことにより生じた損害に関しては一切責任を負いません。 |
| 6 | + * * ソースコードの使用、配布に制限はありません。ご自由にお使いください。 |
| 7 | + * * 動作チェックが不充分な場合もありますので、注意してください。 |
| 8 | + * |
| 9 | + * Which, loosely translated, means: |
| 10 | + * * The author takes no responsibility for damage which occurs due to the use of this code. |
| 11 | + * * There is no restriction on the use and distribution of the source code. Please use freely. |
| 12 | + * * Please be careful, testing may have been insufficient. |
| 13 | + */ |
| 14 | + |
| 15 | + |
| 16 | +/********************************************************************** |
| 17 | + * |
| 18 | + * Unicode ⇔ UTF-8 |
| 19 | + * |
| 20 | + * Copyright (c) 2005 AOK <soft@aokura.com> |
| 21 | + * |
| 22 | + **********************************************************************/ |
| 23 | + |
/**
 * Encode a JavaScript (UTF-16) string as UTF-8, returning a string in which
 * every character holds one byte value (0-255).
 *
 * A valid surrogate pair is combined into a single code point and written as
 * one 4-byte sequence. An unpaired surrogate is still written as a 3-byte
 * sequence, as before.
 */
function _to_utf8(s) {
	var c, low, d = "";
	for (var i = 0; i < s.length; i++) {
		c = s.charCodeAt(i);

		// High surrogate followed by a low surrogate: merge into one code point
		if (c >= 0xd800 && c <= 0xdbff && i + 1 < s.length) {
			low = s.charCodeAt(i + 1);
			if (low >= 0xdc00 && low <= 0xdfff) {
				c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
				i++; // the low surrogate has been consumed
			}
		}

		if (c <= 0x7f) {
			d += String.fromCharCode(c);
		} else if (c <= 0x7ff) {
			d += String.fromCharCode(((c >> 6) & 0x1f) | 0xc0);
			d += String.fromCharCode((c & 0x3f) | 0x80);
		} else if (c <= 0xffff) {
			d += String.fromCharCode((c >> 12) | 0xe0);
			d += String.fromCharCode(((c >> 6) & 0x3f) | 0x80);
			d += String.fromCharCode((c & 0x3f) | 0x80);
		} else {
			d += String.fromCharCode((c >> 18) | 0xf0);
			d += String.fromCharCode(((c >> 12) & 0x3f) | 0x80);
			d += String.fromCharCode(((c >> 6) & 0x3f) | 0x80);
			d += String.fromCharCode((c & 0x3f) | 0x80);
		}
	}
	return d;
}
| 41 | + |
/**
 * Decode a UTF-8 byte string (one byte value per character) into a
 * JavaScript string.
 *
 * 4-byte sequences (lead byte 0xF0-0xF7) are decoded and emitted as a
 * UTF-16 surrogate pair; previously such a lead byte was treated as the
 * start of a 3-byte sequence. Continuation bytes are not validated, and a
 * stray continuation byte outside a sequence is dropped, as before.
 */
function _from_utf8(s) {
	var c, d = "", remaining = 0, tmp = 0;
	for (var i = 0; i < s.length; i++) {
		c = s.charCodeAt(i);
		if (remaining == 0) {
			if ((c & 0xf8) == 0xf0) {
				// 11110xxx: lead byte of a 4-byte sequence
				remaining = 3;
				tmp = (c & 0x07) << 18;
			} else if ((c & 0xe0) == 0xe0) {
				// 1110xxxx: lead byte of a 3-byte sequence
				remaining = 2;
				tmp = (c & 0x0f) << 12;
			} else if ((c & 0xc0) == 0xc0) {
				// 110xxxxx: lead byte of a 2-byte sequence
				remaining = 1;
				tmp = (c & 0x1f) << 6;
			} else if ((c & 0x80) == 0) {
				d += s.charAt(i);
			}
			// anything else is a continuation byte with no lead: skip it
		} else {
			// Each continuation byte contributes six bits, most significant first
			remaining--;
			tmp |= (c & 0x3f) << (6 * remaining);
			if (remaining == 0) {
				if (tmp > 0x10ffff) {
					// Beyond the Unicode range: emit the replacement character
					d += String.fromCharCode(0xfffd);
				} else if (tmp > 0xffff) {
					tmp -= 0x10000;
					d += String.fromCharCode(0xd800 + (tmp >> 10), 0xdc00 + (tmp & 0x3ff));
				} else {
					d += String.fromCharCode(tmp);
				}
			}
		}
	}
	return d;
}
| 73 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/skin/utf8.js |
___________________________________________________________________ |
Name: svn:keywords |
1 | 74 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 75 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/skin/lookup.js |
— | — | @@ -0,0 +1,93 @@ |
/**
 * "Go" function for static HTML dump: navigate to the page for the title
 * typed into the #searchInput box, or alert if no URL can be derived.
 */
function goToStatic(depth) {
	var searchText = document.getElementById("searchInput").value;
	var url = getStaticURL(searchText, depth);
	if (url == "") {
		alert("Invalid title");
		return;
	}
	location = url;
}
| 13 | + |
/**
 * Determine relative path for a given non-canonical title.
 * Returns "" when the title normalises to an empty string.
 */
function getStaticURL(text, depth) {
	var pdbk = getPDBK(text);
	if (pdbk == "") {
		return "";
	}

	var path = getHashedDirectory(pdbk, depth) + "/" + getFriendlyName(pdbk) + ".html";

	// Pages whose URL ends in index.html or "/" link into articles/ directly
	if (/(index\.html|\/)$/.exec(location)) {
		return "articles/" + path;
	}

	// Otherwise climb one directory per hash level first
	for (var level = 0; level < depth; level++) {
		path = "../" + path;
	}
	return path;
}
| 34 | + |
/**
 * Normalise user-entered title text: spaces become underscores, leading and
 * trailing underscores are stripped, and the first letter is capitalised.
 */
function getPDBK(text) {
	var key = text
		.replace(/ /g, "_")    // spaces to underscores
		.replace(/^_+/g, "")   // trim leading underscores
		.replace(/_+$/g, "");  // trim trailing underscores
	return ucfirst(key);
}
| 46 | + |
/**
 * Build the nested directory path ("a/b/c") for a title, one level per
 * leading character of the title.
 *
 * Text up to and including the first colon, plus any underscores directly
 * after it, is skipped. Positions beyond the end of the title yield "_".
 * Characters with code >= 128 or in the allowed punctuation set are
 * lower-cased; any other character is written as two upper-case hex digits.
 *
 * @param pdbk  title, as returned by getPDBK()
 * @param depth number of directory levels to generate
 */
function getHashedDirectory(pdbk, depth) {
	// Find the first colon if there is one, use characters after it
	var dbk = pdbk.replace(/^[^:]*:_*(.*)$/, "$1");
	// cc is declared here; it used to be assigned without var, which created
	// a global and throws a ReferenceError in strict mode
	var i, c, cc, dir = "";

	for (i = 0; i < depth; i++) {
		if (i) {
			dir += "/";
		}
		if (i >= dbk.length) {
			dir += "_";
			continue;
		}

		c = dbk.charAt(i);
		cc = dbk.charCodeAt(i);
		if (cc >= 128 || /[a-zA-Z0-9!#$%&()+,[\]^_`{}-]/.exec(c)) {
			dir += c.toLowerCase();
		} else {
			// binl2hex emits the low byte first, so the first two digits are the character code
			dir += binl2hex([cc]).substr(0,2).toUpperCase();
		}
	}
	return dir;
}
| 71 | + |
/**
 * Return s with its first character upper-cased and the rest untouched.
 */
function ucfirst(s) {
	var head = s.charAt(0);
	var tail = s.substring(1, s.length);
	return head.toUpperCase() + tail;
}
| 75 | + |
/**
 * Derive the file name (without extension) used for a title.
 *
 * Characters that are illegal in Windows paths, and "~", become "_". When
 * the title is not already in "first letter upper, rest lower" form, the
 * first four hex digits of the MD5 of its UTF-8 encoding are appended so
 * that names differing only in case stay distinct. Finally every colon is
 * replaced with a tilde.
 *
 * NOTE(review): presumably this must produce the same names as the
 * server-side dump code; confirm against that implementation.
 */
function getFriendlyName(name) {
	// Replace illegal characters for Windows paths with underscores
	var friendlyName = name.replace(/[\/\\*?"<>|~]/g, "_");

	// Work out lower case form. We assume we're on a system with case-insensitive
	// filenames, so unless the case is of a special form, we have to disambiguate
	var lowerCase = ucfirst(name.toLowerCase());

	// Make it mostly unique
	if (lowerCase != friendlyName) {
		friendlyName += "_" + hex_md5(_to_utf8(name)).substring(0, 4);
	}

	// Handle colon specially by replacing it with tilde
	// Thus we reduce the number of paths with hashes appended.
	// A regex with the g flag is required: a string pattern would only
	// replace the first colon.
	friendlyName = friendlyName.replace(/:/g, "~");

	return friendlyName;
}
| 94 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/skin/lookup.js |
___________________________________________________________________ |
Name: svn:keywords |
1 | 95 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 96 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/skin/main.css |
— | — | @@ -0,0 +1,9 @@ |
| 2 | +@import "../monobook/main.css"; |
| 3 | + |
| 4 | +#footer li { |
| 5 | + display: block; |
| 6 | +} |
| 7 | +head:first-child + body #footer li { white-space: normal; } |
| 8 | +.usermessage { display: none; } |
| 9 | +.editsection { display: none; } |
| 10 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/skin/main.css |
___________________________________________________________________ |
Name: svn:keywords |
1 | 11 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 12 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/finish-lang |
— | — | @@ -0,0 +1,34 @@ |
#!/bin/bash
#
# finish-lang <lang> <edition> <checkpoint>
#
# Compresses the finished HTML dump of one language. A marker file named
# "compressed" in the language directory records that compression was
# started; if it already exists the checkpoint is marked done and nothing
# else happens. $base is expected to be set by config.sh.

if [ -z "$3" ]; then
	echo "Usage: finish <lang> <edition> <checkpoint>" >&2
	# A usage error is a failure, so report it in the exit status
	exit 1
fi

. "$(dirname "$0")/config.sh"

bindir="$base/scripts"
lang="$1"
edition="$2"
checkpoint="$3"
site=wikipedia
langdir="$base/new/$site/$lang"

# All expansions are quoted so that paths containing spaces or glob
# characters are passed through intact.
if [ -e "$langdir/compressed" ]; then
	echo "Already compressed $lang"
	echo "everything=done" > "$checkpoint"
	exit
fi
if [ ! -e "$langdir/index.html" ]; then
	echo "$lang directory is broken, missing index.html, skipping."
	exit
fi

date > "$langdir/compressed"

echo "$lang: Compressing HTML..."
"$bindir/compress-html" "$lang" "$edition" "$checkpoint"
#echo "$lang: Making image tarball..."
#ssh albert tar -C /mnt/static -cf /a/upload_snapshot/$edition/downloads/wikipedia-$lang-images.tar -h $lang/upload
#ln -sf /mnt/upload_snapshot/$edition/downloads/wikipedia-$lang-images.tar /mnt/static/downloads/$edition/$lang/wikipedia-$lang-images.tar
echo "$lang: Done."
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/finish-lang |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 36 | + native |
Name: svn:executable |
2 | 37 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/index.inc |
— | — | @@ -0,0 +1,154 @@ |
<?php
	// The "current" symlink points at the directory of the latest finished
	// dump; its base name with "-01" appended is parsed as a date to build
	// the "Month Year" label shown on the page.
	$currentDir = readlink( '/a/static/downloads/current' );
	$currentBase = basename( $currentDir );
	$currentTime = strtotime( $currentBase . '-01' );
	$currentText = htmlspecialchars( date( 'F Y', $currentTime ) );
	$encCurrentBase = htmlspecialchars( urlencode( $currentBase ) );

	// The "in_progress" symlink is optional; its label is false when absent.
	$inProgressDir = @readlink( '/a/static/downloads/in_progress' );
	if ( $inProgressDir ) {
		$inProgressText = htmlspecialchars(
			date( 'F Y', strtotime( basename( $inProgressDir ) . '-01' ) )
		);
	} else {
		$inProgressText = false;
	}
?>
| 11 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| 12 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr"> |
| 13 | + <head> |
| 14 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| 15 | + <title>Wikipedia Static HTML Dumps</title> |
| 16 | + <style type="text/css" media="screen">/*<![CDATA[*/ |
| 17 | +body { |
| 18 | + font: x-small sans-serif; |
| 19 | + background: #f9f9f9 url(http://en.wikipedia.org/skins-1.5/monobook/headbg.jpg) 0 0 no-repeat; |
| 20 | + color: black; |
| 21 | + margin: 0; |
| 22 | + padding: 0; |
| 23 | +} |
| 24 | +/* scale back up to a sane default */ |
| 25 | +#globalWrapper { |
| 26 | + font-size: 127%; |
| 27 | + width: 100%; |
| 28 | + margin: 0; |
| 29 | + padding: 0; |
| 30 | +} |
| 31 | +#content { |
| 32 | + margin: 2em 2em 0 2em; |
| 33 | + padding: 0 1em 1.5em 1em; |
| 34 | + background: white; |
| 35 | + color: black; |
| 36 | + border: 1px solid #aaa; |
| 37 | + line-height: 1.5em; |
| 38 | + position: relative; |
| 39 | + z-index: 2; |
| 40 | +} |
| 41 | +h1, h2, h3, h4, h5, h6 { |
| 42 | + color: black; |
| 43 | + background: none; |
| 44 | + font-weight: normal; |
| 45 | + margin: 0; |
| 46 | + padding-top: .5em; |
| 47 | + padding-bottom: .17em; |
| 48 | + border-bottom: 1px solid #aaa; |
| 49 | +} |
| 50 | +h1 { |
| 51 | + font-size: 188%; |
| 52 | + margin-bottom: .6em; |
| 53 | +} |
| 54 | +h2 { |
| 55 | + font-size: 150%; |
| 56 | + margin-bottom: .6em; |
| 57 | +} |
| 58 | +h3, h4, h5, h6 { |
| 59 | + border-bottom: none; |
| 60 | + font-weight: bold; |
| 61 | + margin-bottom: .3em; |
| 62 | +} |
| 63 | + |
| 64 | +ul { |
| 65 | + line-height: 1.5em; |
| 66 | + list-style-type: square; |
| 67 | + margin: .3em 0 0 1.5em; |
| 68 | + padding: 0; |
| 69 | + list-style-image: url(bullet.gif); |
| 70 | +} |
| 71 | +ol { |
| 72 | + line-height: 1.5em; |
| 73 | + margin: .3em 0 0 3.2em; |
| 74 | + padding: 0; |
| 75 | + list-style-image: none; |
| 76 | +} |
| 77 | +li { |
| 78 | + margin-bottom: .1em; |
| 79 | +} |
| 80 | +dt { |
| 81 | + font-weight: bold; |
| 82 | + margin-bottom: .05em; |
| 83 | +} |
| 84 | +dl { |
| 85 | + margin-top: .2em; |
| 86 | + margin-bottom: .5em; |
| 87 | +} |
| 88 | +dd { |
| 89 | + line-height: 1.5em; |
| 90 | + margin-left: 2em; |
| 91 | + margin-bottom: .5em; |
| 92 | +} |
| 93 | +a { |
| 94 | + text-decoration: none; |
| 95 | + color: #002bb8; |
| 96 | + background: none; |
| 97 | +} |
| 98 | +a:visited { |
| 99 | + color: #5a3696; |
| 100 | +} |
| 101 | +a:active { |
| 102 | + color: #faa700; |
| 103 | +} |
| 104 | +a:hover { |
| 105 | + text-decoration: underline; |
| 106 | +} |
| 107 | + |
| 108 | +.visualClear { |
| 109 | + clear: both; |
| 110 | +} |
| 111 | + |
| 112 | +/*]]>*/</style> |
| 113 | + |
| 114 | +</head> |
| 115 | +<body> |
| 116 | + <div id="globalWrapper"> |
| 117 | + <div id="content"> |
| 118 | + <h1>Wikipedia Static HTML Dumps</h1> |
| 119 | +<p>This is a set of static HTML dumps of Wikipedia. Note that putting one of these dumps on the web unmodified will constitute a trademark violation. They are intended for private viewing in an intranet or desktop installation.</p> |
| 120 | + |
| 121 | +<p>The current dump is the <?php echo $currentText; ?> edition. This dump has no image snapshot, it's just HTML. Due to performance problems when compressing millions of files with <a href="http://7-zip.org">7-zip</a>, the archives are now packaged as a 7-zipped tar file.</p> |
| 122 | + |
| 123 | +<?php if ( $inProgressText ): ?> |
| 124 | +<p>A <?php echo $inProgressText; ?> dump is in progress.</p> |
| 125 | +<?php endif; ?> |
| 126 | + |
| 127 | +<ul> |
| 128 | + <li><strong><a href="https://www.mediawiki.org/downloads/<?php echo $encCurrentBase; ?>">Downloads</a></strong></li> |
| 129 | +</ul> |
| 130 | +<h2>Browse</h2> |
| 131 | +<p>Try before you download, click on a language code below.</p> |
| 132 | + |
| 133 | +<p> |
<?php
// Print one link per public Wikipedia, separated by " - ".
// The database list minus the private list gives the public wikis. The last
// four characters of each database name are dropped to get the language
// code used in the dump path, and underscores are shown as hyphens.
$wikipediaLines = @file( '/home/wikipedia/common/wikipedia.dblist' );
$privateLines = @file( '/home/wikipedia/common/private.dblist' );
// file() returns false on failure; treat an unreadable list as empty
$wikipedias = array_map( 'trim', $wikipediaLines ? $wikipediaLines : array() );
$private = array_map( 'trim', $privateLines ? $privateLines : array() );
$wikipedias = array_diff( $wikipedias, $private );

$links = array();
foreach ( $wikipedias as $db ) {
	$langWithUnderscores = substr( $db, 0, -4 );
	if ( $langWithUnderscores === false || $langWithUnderscores === '' ) {
		// Blank line or name too short to carry a suffix: no link to emit
		continue;
	}
	$lang = str_replace( '_', '-', $langWithUnderscores );
	// Escape both values: they come from a file and are written into HTML
	$links[] = '<a href="new/wikipedia/' . htmlspecialchars( $langWithUnderscores ) .
		'/index.html">' . htmlspecialchars( $lang ) . "</a>\n";
}
echo implode( ' - ', $links );
?>
| 151 | +</p> |
| 152 | + <div class="visualClear"></div> |
| 153 | + </div> |
| 154 | +</div> |
| 155 | +</body></html> |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/index.inc |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 156 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/queueSlave |
— | — | @@ -0,0 +1,191 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | +import sys, os, os.path, signal, socket, re, time |
| 5 | + |
def redirectOutput(fileName):
    """Point stdout (fd 1) and stderr (fd 2) at fileName, opened for append.

    The file is created if missing, with mode 0666 before the umask is applied.
    """
    stdFds = (1, 2)
    openFlags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
    createMode = int("666", 8)

    for stdFd in stdFds:
        os.close(stdFd)
    logFd = os.open(fileName, openFlags, createMode)
    for stdFd in stdFds:
        os.dup2(logFd, stdFd)
| 12 | + |
def dumpHTML(outputFile, *params):
    """Run dumpHTML.php in a child process and wait for it to finish.

    The child's stdout and stderr are redirected to outputFile. The parent
    polls every 5 seconds; if this process is re-parented (os.getppid()
    drops to 1) before the child exits, the child is killed with SIGKILL.

    params -- extra command line arguments passed to dumpHTML.php
    """
    # NOTE(review): argv[0] and argv[1] are both "nice", so the command that
    # actually runs is "nice nice -n15 php ..." -- confirm this is intended.
    fullParams = ["nice", "nice", "-n15", "php", "-n", "dumpHTML.php"]
    fullParams.extend(params)
    msg(" ".join(fullParams))
    pid = os.fork()
    if pid == 0:
        # Child. execvp only returns by raising; whatever goes wrong, the
        # child must terminate here and never fall back into the caller's
        # code, so leave with os._exit rather than letting an exception
        # (or SystemExit) propagate.
        try:
            redirectOutput(outputFile)
            os.execvp("nice", fullParams)
        except Exception:
            sys.stderr.write("dumpHTML: unable to start child: %s\n" % (sys.exc_info()[1],))
        os._exit(1)

    # Wait for the child to exit (or the parent)
    status = os.waitpid(pid, os.WNOHANG)
    while status == (0,0) and os.getppid() > 1:
        time.sleep(5)
        status = os.waitpid(pid, os.WNOHANG)

    # If the parent exited, then kill the child
    if status == (0,0):
        os.kill(pid, signal.SIGKILL)
    elif os.WIFSIGNALED(status[1]):
        msg("Process exited on signal %d" % os.WTERMSIG(status[1]))
| 34 | + |
def finishWiki(outputFile, lang, checkpoint):
    """Run the finish-lang script for lang, appending its output to outputFile.

    Reads the module-level baseDir and edition.
    """
    global edition, baseDir
    msg("Finishing language "+lang)
    # The file redirection must come before "2>&1": redirections are applied
    # left to right, so the previous order ("2>&1 >> file") pointed stderr at
    # the original stdout and only stdout reached the log file.
    cmd = "%(baseDir)s/scripts/finish-lang %(lang)s %(edition)s %(checkpoint)s >> %(outputFile)s 2>&1" % {
        'baseDir' : baseDir, 'lang' : lang, 'edition' : edition, 'outputFile': outputFile,
        'checkpoint': checkpoint }
    msg(cmd)
    os.system(cmd)
| 43 | + |
def writeStatus(jobID, status):
    """Overwrite the status file of jobID in jobDir.

    Line 1 is "<hostname> <pid>" of this process, line 2 is status.
    """
    global jobDir
    f = open(jobDir+"/"+jobID, "w")
    try:
        f.write("%s %s\n" % (socket.gethostname(), os.getpid()))
        f.write("%s\n" % (status,))
    finally:
        # Close the handle even when a write fails
        f.close()
| 50 | + |
def isStatusMine(jobID):
    """Return True if the status file of jobID was written by this process.

    The first line of the file must hold exactly two fields matching this
    host name and process ID. A missing or malformed file yields False.
    """
    global jobDir
    try:
        f = open(jobDir+"/"+jobID, "r")
    except (IOError, OSError):
        # Only I/O failures mean "missing"; anything else should propagate
        msg("Status file is missing")
        return False

    try:
        fields = f.readline().split()
    finally:
        f.close()

    if len(fields) != 2:
        msg("Warning: invalid status file")
        return False

    return fields[0] == socket.gethostname() and fields[1] == str(os.getpid())
| 69 | + |
def isDone(checkpoint, jobType):
    """Return True if the checkpoint file contains the line "<jobType>=done".

    Trailing whitespace on each line is ignored. A checkpoint file that
    cannot be opened counts as not done.
    """
    test = jobType+'=done'
    try:
        f = open(checkpoint, "r")
    except (IOError, OSError):
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here
        return False
    try:
        for line in f:
            if line.rstrip() == test:
                return True
    finally:
        f.close()
    return False
| 82 | + |
| 83 | + |
def writeStatusIfMine(jobID, status):
    """Write status for jobID, but only if this process owns the status file."""
    if not isStatusMine(jobID):
        msg("Not overwriting status file, it doesn't belong to me.")
        return
    writeStatus(jobID, status)
| 89 | + |
def msg(*params):
    """Write the arguments to stdout, space-separated, and flush immediately.

    Arguments are converted with str(), so non-string values are accepted.
    """
    sys.stdout.write(" ".join([str(p) for p in params]) + "\n")
    sys.stdout.flush()
| 93 | + |
| 94 | +#--------------------------------------------------------------------------------- |
| 95 | + |
# Main program: connect to the queue server, then repeatedly dequeue a job
# and run it until this process loses its parent or the server goes away.
hostname = socket.gethostname()
myPid = os.getpid()

msg("queueSlave on %s %d" % (hostname, myPid))

# Command line: <queue host> <queue port> <base directory> <edition>
queueHost = sys.argv[1]
queuePort = int(sys.argv[2])
baseDir = sys.argv[3]
edition = sys.argv[4]
siteDir = baseDir+"/new/wikipedia"
logDir = baseDir+"/var/logs"
jobDir = baseDir+"/var/jobs"
checkpointDir = baseDir+"/var/checkpoints"
downloadsDir = baseDir + "/downloads/" + edition

# Best effort: failure to create a directory (typically because it already
# exists) is ignored
for workDir in (logDir, jobDir, checkpointDir):
    try:
        os.makedirs(workDir)
    except OSError:
        pass

queueSock = socket.socket()
queueSock.connect((queueHost, queuePort))
queueFile = queueSock.makefile()

os.chdir("/home/wikipedia/common/php-1.5/extensions/DumpHTML")
waiting = False

# Reply format: "data <jobID> <wiki> <type> <n/m>"
dataRegex = re.compile(r"data (\w+) ([a-z_-]+) (\w+) (\d+/\d+)")

def _reportOutcome(jobID, checkpoint, doneKey):
    """Record 'done' or 'terminated' for a job according to its checkpoint file."""
    if isDone(checkpoint, doneKey):
        msg("Done")
        writeStatusIfMine(jobID, 'done')
    else:
        msg("Terminated, unfinished")
        writeStatusIfMine(jobID, 'terminated')

# Loop until the parent exits
while os.getppid() > 1:
    queueFile.write("deq\n")
    queueFile.flush()
    s = queueFile.readline()
    if s == "":
        # readline() returns an empty string only at end of file
        msg("Queue server closed the connection, exiting")
        break

    m = dataRegex.match(s)
    if m is None:
        # Nothing usable was returned: report once, then poll once a second
        if not waiting:
            msg("Waiting...")
            waiting = True
        time.sleep(1)
        continue

    waiting = False
    jobID, wiki, jobType, jobSlice = m.groups()
    lang = wiki.replace( 'wiki', '' )
    dest = siteDir+"/"+lang
    jobString = wiki+"_" + jobType + "_" + jobSlice.replace( '/', '_' )
    outputFile = logDir+"/"+jobString
    checkpoint = checkpointDir+"/"+jobString

    if jobType == "articles":
        writeStatus(jobID, 'running')
        msg(wiki + ' articles ' + jobSlice)
        dumpHTML(outputFile, wiki, "--no-shared-desc", "--image-snapshot",
            "--interlang", "-d", dest, "--slice", jobSlice,
            "--udp-profile", "50",
            "--oom-adj", "6",
            # "--show-titles" is deliberately left off
            "--checkpoint", checkpoint, "--no-overwrite")
        _reportOutcome(jobID, checkpoint, 'everything')

    elif jobType == "shared":
        writeStatus(jobID, 'running')
        msg(wiki + ' shared ' + jobSlice)
        dumpHTML(outputFile, wiki, "--shared-desc", "--image-snapshot",
            "--interlang", "-d", dest, "--slice", jobSlice,
            "--udp-profile", "50",
            "--oom-adj", "4",
            "--checkpoint", checkpoint, "--no-overwrite")
        _reportOutcome(jobID, checkpoint, 'shared image')

    elif jobType == "finish":
        writeStatus(jobID, 'running')
        finishWiki(outputFile, lang, checkpoint)
        _reportOutcome(jobID, checkpoint, 'everything')
| 191 | + |
| 192 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/queueSlave |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 193 | + native |
Name: svn:executable |
2 | 194 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/netqueue.py |
— | — | @@ -0,0 +1,72 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# vim: set ts=4 sw=4 : |
| 4 | +import SocketServer, sys, signal, os, threading, Queue |
| 5 | + |
class QueueServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
    """Threaded TCP server that owns one in-memory FIFO of job strings."""

    # Unbounded queue, defined on the class
    queue = Queue.Queue(0)
    allow_reuse_address = True

    def enqueue(self, value):
        """Append value to the queue."""
        self.queue.put(value)

    def dequeue(self):
        """Return the next value, or None immediately if the queue is empty."""
        try:
            value = self.queue.get_nowait()
        except Queue.Empty:
            value = None
        return value

    def blockingDequeue(self, file):
        """Block until a value is available and return it.

        file is accepted for the caller's convenience but is not used; an
        earlier idea of re-queueing the value when the file is already
        closed was never enabled.
        """
        value = self.queue.get()
        return value

    def clearQueue(self):
        """Discard everything currently queued.

        The queue is drained in place rather than replaced with a new
        object: a thread already blocked in blockingDequeue() holds the
        existing queue and would otherwise never see later items.
        """
        try:
            while True:
                self.queue.get_nowait()
        except Queue.Empty:
            pass
| 30 | + |
| 31 | + |
class QueueRequestHandler(SocketServer.StreamRequestHandler):
    """Line-oriented protocol handler for the queue server.

    One command per line; every command gets exactly one reply line:

        enq <value>  -> "ok"
        deq          -> "data <value>" or "empty"
        bdeq         -> "data <value>" (blocks until an item is available)
        size         -> "size <n>"
        clear        -> "ok"
        anything else -> "invalid command"
    """

    def _sendValue(self, value):
        """Write the reply for a dequeue: "empty" for None, else "data <value>"."""
        if value is None:
            self.wfile.write("empty\n")
        else:
            self.wfile.write("data " + value + "\n")

    def handle(self):
        """Serve commands from the connection until the client closes it."""
        try:
            for line in self.rfile:
                cmd = line.strip()
                if cmd[:4] == "enq ":
                    self.server.enqueue(cmd[4:])
                    self.wfile.write("ok\n")
                elif cmd == "deq":
                    self._sendValue(self.server.dequeue())
                elif cmd == "bdeq":
                    self._sendValue(self.server.blockingDequeue(self.wfile))
                elif cmd == "size":
                    self.wfile.write("size " + str(self.server.queue.qsize()) + "\n")
                elif cmd == "clear":
                    self.server.clearQueue()
                    self.wfile.write("ok\n")
                else:
                    self.wfile.write("invalid command\n")
        except Exception:
            # getpeername() returns an address tuple, so it has to be
            # converted before concatenation; it can also fail itself when
            # the peer is already gone, which must not mask the log line.
            try:
                peer = str(self.request.getpeername())
            except Exception:
                peer = "<unknown peer>"
            sys.stdout.write("netqueue: Error processing socket " + peer + "\n")
| 61 | + |
| 62 | + |
if __name__ == '__main__':
    # Listen on the loopback interface only, port 8200.
    server = QueueServer(('127.0.0.1', 8200), QueueRequestHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # The process kills itself with SIGKILL below, which skips normal
        # interpreter shutdown. Write and flush explicitly, otherwise the
        # message is lost whenever stdout is a file or pipe.
        sys.stdout.write("Caught KeyboardInterrupt\n")
        sys.stdout.flush()
        # NOTE(review): presumably SIGKILL is used because handler threads
        # may be blocked indefinitely in a queue read - confirm.
        os.kill(os.getpid(), signal.SIGKILL)
        sys.exit(0)
| 71 | + |
| 72 | + |
| 73 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/netqueue.py |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 74 | + native |
Name: svn:executable |
2 | 75 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/compress-volumes |
— | — | @@ -0,0 +1,32 @@ |
#!/bin/bash
#
# Split the HTML file list of one language into chunks of one million
# entries and build one 7z archive per chunk. The archive for chunk 0
# additionally receives the files listed in skins.lst.
#
# Usage: compress-volumes <language> <edition>

set -e

if [ -z "$2" ]; then
    echo "Usage: $0 <language> <edition>"
    exit 1
fi

# config.sh defines $base.
. "$(dirname "$0")/config.sh"

lang=$1
edition=$2
dest=$base/downloads/$edition/$lang
sitebase=$base/wikipedia
bindir=$base/scripts

cd "$dest"
# -f: with "set -e" active, a plain rm would abort the whole script when no
# chunk files from an earlier run exist.
rm -f html.lst.*
split -a1 -d -l1000000 html.lst html.lst.

# The list files contain paths relative to the site base.
cd "$sitebase"
for f in "$dest"/html.lst.* ; do
    # Volume number = the suffix that split appended to the chunk name.
    vol=${f#"$dest"/html.lst.}
    if [ "$vol" == 0 ]; then
        response_files=("@$f" "@$dest/skins.lst")
    else
        response_files=("@$f")
    fi
    "$bindir/7za-readdir-hack" -l -ms8m a "$dest/wikipedia-$lang-html.$vol.7z" "${response_files[@]}"
done
| 32 | + |
| 33 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/compress-volumes |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 34 | + native |
Name: svn:executable |
2 | 35 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/start-edition |
— | — | @@ -0,0 +1,16 @@ |
#!/bin/bash
#
# Reset the per-edition working state: the checkpoints, jobs and logs
# directories under $base/var are deleted and recreated empty.

# config.sh defines $base.
. "$(dirname "$0")/config.sh"

# Quoted so the test is well-formed for any value; a non-zero status lets
# callers detect the failure. This guard is what keeps the rm -rf calls
# below from running against "/var/...".
if [ -z "$base" ]; then
    echo "No base directory"
    exit 1
fi

for dir in checkpoints jobs logs; do
    rm -rf "$base/var/$dir"
    mkdir "$base/var/$dir"
done
| 17 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/start-edition |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 18 | + native |
Name: svn:executable |
2 | 19 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/.htaccess |
— | — | @@ -0,0 +1 @@ |
| 2 | +Deny from all |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/.htaccess |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 3 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/do-edition |
— | — | @@ -0,0 +1,96 @@ |
#!/usr/bin/python
"""Drive one complete static HTML dump edition.

Usage: do-edition <edition>

Starts the netqueue server, forks the configured number of queueSlave
workers per host, runs queueController.php and waits for it, then kills
the queue server. If the controller exited with status 0 the "current"
download symlink is pointed at the new edition.
"""
import sys, os, socket, signal, time, stat

base = "/a/static"
scripts = base + "/scripts"
#host = socket.gethostbyname();
host = "localhost"

QUEUE_PORT = 8200
LOG_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_APPEND
# Octal 0666, spelled via int() so the literal parses on any Python version.
LOG_MODE = int("666", 8)


def say(message):
    """Print a progress line and flush it.

    Flushing matters because the script forks: text left in the stdio
    buffer would be inherited by every child process.
    """
    sys.stdout.write(message + "\n")
    sys.stdout.flush()


def ensureDir(path):
    """Create path (and parents); an already existing directory is fine."""
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def removeIfPresent(path):
    """Unlink path, ignoring the error if it cannot be removed."""
    try:
        os.unlink(path)
    except OSError:
        pass


def redirectTo(path, *targets):
    """Open path for appending and install it on each descriptor in targets."""
    fd = os.open(path, LOG_FLAGS, LOG_MODE)
    for target in targets:
        os.dup2(fd, target)
    if fd not in targets:
        os.close(fd)


def execOrDie(program, *argv):
    """Replace the current (child) process with program; never returns.

    If the exec fails the child reports it on stderr and exits immediately
    with status 1, so it cannot fall through into the parent's code path.
    """
    try:
        os.execlp(program, *argv)
    except OSError:
        sys.stderr.write("do-edition: unable to exec %s: %s\n" % (program, sys.exc_info()[1]))
    os._exit(1)


if len(sys.argv) < 2:
    say("Usage: do-edition <edition>")
    sys.exit(1)

edition = sys.argv[1]

# Number of slave processes to start per host.
threads = {
    "localhost": 24,
}

# Create some directories
ensureDir(base + "/var/logs")
ensureDir(base + "/var/checkpoints")

# Set up in_progress symlink
removeIfPresent(base + '/downloads/in_progress')
os.symlink(base + '/downloads/' + edition, base + '/downloads/in_progress')

# Start queue server
say("Starting queue server")
queueServer = os.fork()
if 0 == queueServer:
    # Run it in a new group so that its precious finishlang children don't get hurt
    os.setsid()
    redirectTo(base + "/var/logs/netqueue.out", 1, 2)
    execOrDie("python", "python", scripts + "/netqueue.py")

# Wait for it to start up. A fresh socket is used for every attempt because
# the state of a socket after a failed connect is unspecified, and the probe
# is closed again so no idle connection is held open for the whole run.
while True:
    probe = socket.socket()
    try:
        connected = (probe.connect_ex(("localhost", QUEUE_PORT)) == 0)
    finally:
        probe.close()
    if connected:
        break
    # Stop waiting if the server process has already exited.
    if os.waitpid(queueServer, os.WNOHANG)[0] != 0:
        say("Queue server exited during startup, see " + base + "/var/logs/netqueue.out")
        sys.exit(1)
    time.sleep(0.1)


# Start slave threads
slaves = []
for slaveHost, number in threads.items():
    for i in range(number):
        say("Starting thread %d on host %s" % (i, slaveHost))
        pid = os.fork()
        if pid == 0:
            # Each slave gets its own stdout and stderr log file.
            logPrefix = "%s/var/logs/%s-%d" % (base, slaveHost, i)
            redirectTo(logPrefix + ".out", 1)
            redirectTo(logPrefix + ".err", 2)

            slaveArgs = ("python", scripts + "/queueSlave", slaveHost, str(QUEUE_PORT), base, edition)
            if slaveHost == "localhost":
                execOrDie("python", *slaveArgs)
            else:
                # Remote hosts run the same command line through ssh.
                execOrDie("ssh", "ssh", slaveHost, *slaveArgs)
        slaves.append(pid)

# Start controller, wait for it to exit
# NOTE(review): queueController.php is given as a relative path, so it is
# resolved against the current working directory - confirm that is intended.
say("Starting controller")
try:
    status = os.spawnlp(os.P_WAIT, "php", "php", "-n", "queueController.php")
except KeyboardInterrupt:
    status = "interrupted"

# Kill queue server
try:
    os.kill(queueServer, signal.SIGKILL)
except OSError:
    pass

if status == 0:
    say("Controller has exited, all done\n")

    # Set up current symlink
    removeIfPresent(base + '/downloads/in_progress')
    removeIfPresent(base + '/downloads/current')
    os.symlink(base + '/downloads/' + edition, base + '/downloads/current')
else:
    say("Exited with status: " + str(status))
| 97 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/do-edition |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 98 | + native |
Name: svn:executable |
2 | 99 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/start-lang |
— | — | @@ -0,0 +1,27 @@ |
#!/bin/bash
#
# Prepare the output directory for one language before it is dumped.
# Nothing is done if the directory already exists.
#
# Usage: start-lang <lang>

# Quoted so the test stays well-formed for any argument value.
if [ -z "$1" ]; then
    echo "Usage: start-lang <lang>"
    exit 1
fi

lang=$1
shift
dest=/mnt/static/wikipedia/$lang-new

if [ ! -d "$dest" ]; then
    mkdir -p "$dest"
    #rm -rf /mnt/upload3/wikipedia/$lang/shared
    ln -s /home/wikipedia/htdocs/wikipedia.org/images "$dest/images"

    # Upload snapshot disabled, not enough space
    #[ -d /mnt/upload_snapshot/new/$lang ] || mkdir -p /mnt/upload_snapshot/new/$lang
    #ln -s /mnt/upload_snapshot/new/$lang $dest/upload

    # Link every entry of the wiki's upload directory into $dest/upload,
    # then replace the "shared" link with a real, empty directory.
    mkdir "$dest/upload"
    ln -s "/mnt/upload3/wikipedia/$lang"/* "$dest/upload/"
    rm -f "$dest/upload/shared"
    mkdir "$dest/upload/shared"
    ln -s /home/wikipedia/common/php-1.5/skins "$dest/skins"
    cp /mnt/static/COPYING.html "$dest/COPYING.html"
fi
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/start-lang |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 29 | + native |
Name: svn:executable |
2 | 30 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/README |
— | — | @@ -0,0 +1 @@ |
| 2 | +This directory contains the job control system used to run DumpHTML on Wikimedia. It is Wikimedia-specific and will require some tweaking to make it work in other environments. |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/README |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 3 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/compress-html |
— | — | @@ -0,0 +1,52 @@ |
#!/bin/bash
#
# Build the file lists for one language and pack the HTML dump into a
# single tar.7z archive. On success the checkpoint file is overwritten
# with "everything=done".
#
# Usage: compress-html <language> <edition> <checkpoint>

if [ "X$3" == X ]; then
    echo "Usage: compress-html <language> <edition> <checkpoint>"
    # Non-zero so that callers can tell that nothing was done.
    exit 1
fi

# config.sh defines $base.
. "$(dirname "$0")/config.sh"

lang=$1
edition=$2
checkpoint=$3
site=wikipedia

sitebase=$base/new/$site
dest=$base/downloads/$edition/$lang
p7zip=$base/scripts/7za-readdir-hack

mkdir -p "$dest"

set -e

echo Finding files...
cd "$sitebase"
find "$lang/" -name \*.html > "$dest/html.lst"

find "$lang/skins" "$lang/raw" "$lang/misc" -type f > "$dest/skins.lst"
echo "$lang/dumpHTML.version" >> "$dest/skins.lst"

# The image list is optional; it is only written when the directory exists.
if [ -e "$lang/images" ]; then
    find "$lang/images" -not -type d > "$dest/images.lst"
fi

echo Found $(wc -l < "$dest/html.lst") files

echo Creating HTML archive...
rm -f "$dest/wikipedia-$lang-html.tar.7z"


# Set chunk size to 8MB for faster random access
#$p7zip -l -ms8m a $dest/wikipedia-$lang-html.7z @$dest/html.lst @$dest/skins.lst

#fileCount=`wc -l $base/downloads/$edition/$lang/html.lst | awk '{print $1}'`
#if [ $fileCount -gt 2000000 ]; then
#   echo "Creating split archives"
#   $base/scripts/compress-volumes "$lang" "$edition"
#fi


# NOTE: pipefail is not enabled, so "set -e" only sees the exit status of
# the archiver at the end of the pipeline, not that of tar.
tar -c -T "$dest/html.lst" -T "$dest/skins.lst" | "$p7zip" a "$dest/wikipedia-$lang-html.tar.7z" -si -bd

echo "everything=done" > "$checkpoint"
| 53 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/compress-html |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 54 | + native |
Name: svn:executable |
2 | 55 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/config.sh |
— | — | @@ -0,0 +1 @@ |
| 2 | +base=/a/static |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/config.sh |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 3 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/finish |
— | — | @@ -0,0 +1,11 @@ |
#!/bin/bash
#
# Run finishlang (from $bindir/functions) for a single language.
#
# Usage: finish <edition> <lang>

if [ "X$2" == "X" ]; then
    echo "Usage: finish <edition> <lang>"
    # Non-zero so that callers can tell that nothing was done.
    exit 1
fi

# Both variables are exported because the sourced functions and the
# scripts they launch read them.
export bindir=/var/static/scripts
export edition=$1
. "$bindir/functions"

finishlang "$2"
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/finish |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 13 | + native |
Name: svn:executable |
2 | 14 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/compress-volumes |
— | — | @@ -0,0 +1,25 @@ |
#!/bin/bash
#
# Create one 7z volume per list file found in the language's listfiles
# directory. Volumes whose archive already exists are skipped.
#
# Usage: compress-volumes <language>

if [ "asdf$1" == asdf ]; then
    echo "Usage: compress-volumes <language>"
    exit 1
fi

basedir=/var/zwinger/htdocs/static
listdir=$basedir/downloads/$1/listfiles
destdir=$basedir/downloads/$1/volumes

# 7z is run from the base directory; stop rather than archive from the
# wrong place if it cannot be entered.
cd "$basedir" || exit 1

for listfile in "$listdir"/vol* ; do
    # With no matching list file the glob stays literal; skip it.
    [ -e "$listfile" ] || continue

    vol=$(basename "$listfile")
    destfile=$destdir/$vol.7z

    if [ -e "$destfile" ]; then
        echo "$destfile already done"
    else
        echo 7z a "$destfile" "@$listfile"
        7z a "$destfile" "@$listfile"
    fi
done
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/compress-volumes |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 27 | + native |
Name: svn:executable |
2 | 28 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread1 |
— | — | @@ -0,0 +1,57 @@ |
#!/bin/bash
# Worker 1 of the manual four-way split: dumps and finishes the languages
# listed below, then dumps the first slice of English.
. $bindir/functions

# Languages handled by this worker, in processing order.
langs=(
    aa ab af ak als am ang an arc ar ast as av ay az
    ba be bg bh bi bm bn bo br bs
    ca ceb ce cho chr ch chy co cr csb cs cv cy
    da de dk dv dz
    ee el
)

for lang in "${langs[@]}"; do
    dolang $lang
    finishlang $lang
done

# English is shared between workers; this one stops at ID 694697.
dolang en -e 694697
| 57 | + |
| 58 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread1 |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 59 | + native |
Name: svn:executable |
2 | 60 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread2 |
— | — | @@ -0,0 +1,6 @@ |
#!/bin/bash
# Worker 2 of the manual four-way split: dumps only the middle slice of
# English, between the start and end IDs given below.
. $bindir/functions

slice_start=694697
slice_end=2918581

dolang en -s $slice_start -e $slice_end
| 6 | + |
| 7 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread2 |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 8 | + native |
Name: svn:executable |
2 | 9 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread3 |
— | — | @@ -0,0 +1,54 @@ |
#!/bin/bash
# Worker 3 of the manual four-way split: dumps the last slice of English,
# then dumps and finishes the languages listed below, then dumps the first
# slice of Japanese.
. $bindir/functions

# English from ID 2918581 onwards.
dolang en -s 2918581

# Languages handled by this worker, in processing order.
langs=(
    eo es et eu
    fa ff fiu-vro 'fi' fj fo fr fur fy
    ga gd gl gn got gu gv
    ha haw he hi ho hr ht hu hy hz
    ia id ie ig ii ik ilo io is it iu
)

for lang in "${langs[@]}"; do
    dolang $lang
    finishlang $lang
done

# Japanese is shared between workers; this one stops at ID 323460.
dolang ja -e 323460
| 55 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread3 |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 56 | + native |
Name: svn:executable |
2 | 57 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/ar-fixup |
— | — | @@ -0,0 +1,18 @@ |
#!/bin/bash
#
# One-off fix for the Arabic dump: add Arabisc1.png to the skins list,
# rewrite its absolute upload URL in the dumped Monobook.css to a relative
# path, and rebuild the HTML archive.
#
# Usage: ar-fixup <edition>

if [ "X$1" == X ]; then
    echo "Usage: ar-fixup <edition>"
    exit 1
fi

# The edition comes from the first argument. Previously $edition was used
# without ever being assigned, so $dest pointed at downloads//ar unless the
# caller happened to have it in the environment.
edition=$1

# The sed target below is a path relative to this directory.
cd /var/static || exit 1
dest=/var/static/downloads/$edition/ar
if ! grep -q ar/upload/2/26/Arabisc1.png "$dest/skins.lst"; then
    echo ar/upload/2/26/Arabisc1.png >> "$dest/skins.lst"
fi
sed -i~ 's!http://upload\.wikimedia\.org/wikipedia/ar/2/26/Arabisc1\.png!../upload/2/26/Arabisc1.png!' ar/raw/ميدياويكي~Monobook.css

rm -f "$dest/wikipedia-ar-html.7z"
echo Compressing...
7z -l a "$dest/wikipedia-ar-html.7z" "@$dest/html.lst" "@$dest/skins.lst" > /dev/null
echo Done
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/ar-fixup |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 20 | + native |
Name: svn:executable |
2 | 21 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread4 |
— | — | @@ -0,0 +1,134 @@ |
#!/bin/bash
# Worker 4 of the manual four-way split: dumps the last slice of Japanese,
# then dumps and finishes the languages listed below.
. $bindir/functions

# Japanese from ID 323460 onwards.
dolang ja -s 323460

# Languages handled by this worker, in processing order.
langs=(
    jbo jv
    ka kg ki kj kk kl km kn ko kr ks ku kv kw ky
    lad la lb lg li ln lo lt lv
    mg mh mi mk ml mn mo mr ms mt mus my
    nah nap na nds ne ng nl nn no nv ny
    oc om or os
    pam pa pi pl ps pt
    qu
    rm rn roa-rup ro ru rw
    sa scn sco sc sd se sg sh simple si sk sl sm sn so sq sr ss st su sv sw
    ta te tg th ti tk tlh tl tn to tpi tr ts tt tum tw ty
    ug uk ur uz
    ve vi vo
    war wa wo
    xh
    yi yo
    za zh-min-nan zh zu
)

for lang in "${langs[@]}"; do
    dolang $lang
    finishlang $lang
done
| 134 | + |
| 135 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/thread4 |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 136 | + native |
Name: svn:executable |
2 | 137 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/index++ |
— | — | @@ -0,0 +1,9 @@ |
#!/usr/bin/perl

# Shift the volume files in the current directory up by one index:
# vol24 -> vol25, vol23 -> vol24, ... vol00 -> vol01.
# The highest index is moved first so no rename lands on a file that has
# not been moved yet.
for ($i=24; $i>=0; $i--) {
    $oldname = sprintf("vol%02d", $i);
    $newname = sprintf("vol%02d", $i+1);
    # $! carries the operating system's reason for the failure.
    rename($oldname, $newname) or die "Error moving file $oldname to $newname: $!";

}
| 10 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/index++ |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 11 | + native |
Name: svn:executable |
2 | 12 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/dumpHTML.sh |
— | — | @@ -0,0 +1,19 @@ |
#!/bin/bash
#
# Prepare the output directory for one language (first run only) and run
# dumpHTML.php for it. Any arguments after the language are passed through
# to dumpHTML.php unchanged.
#
# Usage: dumpHTML.sh <lang> [dumpHTML.php options...]

# Without a language the rm -rf below would run against
# /mnt/upload3/wikipedia//shared, so refuse to continue.
if [ -z "$1" ]; then
    echo "Usage: dumpHTML.sh <lang> [dumpHTML.php options...]"
    exit 1
fi

lang=$1
shift
dest=/var/static/$lang-new

if [ ! -d "$dest" ]; then
    rm -rf "/mnt/upload3/wikipedia/$lang/shared"
    mkdir "$dest"
    ln -s /home/wikipedia/htdocs/wikipedia.org/images "$dest/images"
    ln -s "/mnt/wikipedia/htdocs/wikipedia.org/upload/$lang" "$dest/upload"
    ln -s /home/wikipedia/common/php-1.5/skins "$dest/skins"
    cp /var/static/COPYING.html "$dest/COPYING.html"
fi

# dumpHTML.php is addressed relative to the maintenance directory; do not
# run it from wherever we happen to be if the cd fails.
cd /home/wikipedia/common/php-1.5/maintenance || exit 1
#php dumpHTML.php $lang'wiki' --interlang --force-copy -d $dest "$@"
php dumpHTML.php "${lang}wiki" --interlang -d "$dest" "$@"
| 20 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/dumpHTML.sh |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 21 | + native |
Name: svn:executable |
2 | 22 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/compress-volumes2 |
— | — | @@ -0,0 +1,25 @@ |
#!/bin/bash
#
# Create one 7z volume for each list file named on the command line.
# Volumes whose archive already exists are skipped.
#
# Usage: compress-volumes <language> <listfile>...

if [ "asdf$1" == asdf ]; then
    echo "Usage: compress-volumes <language>"
    exit 1
fi

basedir=/mnt/static
destdir=$basedir/downloads/$1/volumes
# Drop the language so that "$@" holds only the list files.
shift

# 7z is run from the base directory; stop rather than archive from the
# wrong place if it cannot be entered.
cd "$basedir" || exit 1

for listfile in "$@" ; do
    vol=$(basename "$listfile")
    destfile=$destdir/$vol.7z

    if [ -e "$destfile" ]; then
        echo "$destfile already done"
    else
        echo 7z a "$destfile" "@$listfile"
        7z a "$destfile" "@$listfile"
    fi
done
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/compress-volumes2 |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 27 | + native |
Name: svn:executable |
2 | 28 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/queueSlave.php |
— | — | @@ -0,0 +1,47 @@ |
<?php
/**
 * Queue worker: polls the queue server for "<wiki> <n/m>" jobs and runs
 * dumpHTML.php for each one. Loops forever; when the queue has no matching
 * job it sleeps five seconds and polls again.
 *
 * Usage: php queueSlave.php <queue-host> <queue-port> <base-dir>
 */

if ( !isset( $argv[3] ) ) {
    echo "Usage: php queueSlave.php <queue-host> <queue-port> <base-dir>\n";
    die( 1 );
}

$queueHost = $argv[1];
$queuePort = $argv[2];
$baseDir = $argv[3];

$queueSock = fsockopen( $queueHost, $queuePort );
if ( !$queueSock ) {
    echo "Unable to connect to queue server\n";
    die( 1 );
}

chdir( "/home/wikipedia/common/php-1.5/maintenance" );
$waiting = false;
while ( 1 ) {
    if ( !fwrite( $queueSock, "deq\n" ) ) {
        echo "Unable to write to queue server\n";
        die( 1 );
    }
    $s = fgets( $queueSock );
    if ( $s === false ) {
        echo "Unable to read from queue server\n";
        die( 1 );
    }
    if ( preg_match( '!^data ([a-z_-]+) (\d+/\d+)!', $s, $m ) ) {
        $waiting = false;
        $wiki = $m[1];
        $slice = $m[2];
        echo "-------------------------------------------------------------------\n";
        echo "$wiki $slice\n";
        echo "-------------------------------------------------------------------\n";
        $checkpoint = "$baseDir/checkpoints/{$wiki}_" . str_replace( '/', '_', $slice );
        $lang = str_replace( 'wiki', '', $wiki );
        $dest = "$baseDir/$lang-new";

        # $dest and $checkpoint are built from the command-line base
        # directory, so every argument is escaped before it reaches the shell.
        $cmd = 'php -n dumpHTML.php ' . escapeshellarg( $wiki ) .
            ' --force-copy --image-snapshot --interlang' .
            ' -d ' . escapeshellarg( $dest ) .
            ' --slice ' . escapeshellarg( $slice ) .
            ' --checkpoint ' . escapeshellarg( $checkpoint );
        passthru( $cmd );
    } else {
        # Wait for jobs
        if ( !$waiting ) {
            print "Waiting...\n";
            $waiting = true;
        }
        sleep( 5 );
    }
}
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/queueSlave.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 49 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/throttle |
— | — | @@ -0,0 +1,47 @@ |
#!/usr/bin/perl -w

# Suspend and resume a process depending on the NFS call rate reported by
# ganglia for host "albert": SIGSTOP while the rate is above the threshold,
# SIGCONT once it has dropped below it. Polls every ten seconds.

if (@ARGV != 1) {
    print "Usage: throttle <pid>\n";
    exit 1;
}

$pid = $ARGV[0];
$threshold = 1800;

# Get status: the third space-separated field of /proc/<pid>/stat
open STAT, "/proc/$pid/stat" or die "No such process $pid\n";
$statLine = <STAT>;
close STAT;
$state = (split(/ /, $statLine))[2];

print "Found process, state=$state";

# State 'T' means the process is currently stopped.
$running = ($state eq 'T') ? 0 : 1;
print $running ? " (running)\n" : " (not running)\n";

# Monitor albert's NFS traffic stats and continue when the calls per second drops below $threshold
# The loop ends when kill reports that no process was signalled.
$alive = 1;
while ($alive) {
    $traffic = `ganglia-fetch -h albert -p 8662 albert.wikimedia.org nfs_server_calls`;
    chomp($traffic);

    if ($running && $traffic > $threshold) {
        print "nfs_server_calls = $traffic, stopping\n";
        $alive = kill 'SIGSTOP', $pid;
        $running = 0;
    } elsif (!$running && $traffic < $threshold) {
        print "nfs_server_calls = $traffic, starting\n";
        $alive = kill 'SIGCONT', $pid;
        $running = 1;
    }
    sleep 10;
}
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/throttle |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 49 | + native |
Name: svn:executable |
2 | 50 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/functions |
— | — | @@ -0,0 +1,21 @@ |
| 2 | +#!/bin/bash |
| 3 | + |
# Print a banner for language $1, then run dumpHTML.sh with all of the
# arguments that were passed to this function.
dolang() {
    echo -------------------------------------------------
    # Quoted so the name is printed verbatim (no globbing or word splitting).
    echo "$1"
    echo -------------------------------------------------
    # Do not start the dump from an arbitrary directory if the cd fails.
    cd /home/wikipedia/common/php-1.5/maintenance || return 1
    "$bindir/dumpHTML.sh" "$@"
}
| 11 | + |
# Promote the freshly dumped tree for language $1 and compress it:
# an existing /mnt/static/<lang> is renamed to <lang>-old, <lang>-new is
# renamed to <lang>, then compress-html is run for $edition.
finishlang() {
    if [ -d "/mnt/static/$1" ]; then
        mv "/mnt/static/$1" "/mnt/static/$1-old"
    fi
    mv "/mnt/static/$1-new" "/mnt/static/$1"

    echo "Compressing $1..."
    # Redirection order matters: stderr is first pointed at the current
    # stdout, then stdout is discarded, so only error output stays visible.
    "$bindir/compress-html" "$1" "$edition" 2>&1 >/dev/null
    echo "Done."
}
| 22 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/functions |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 23 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/copy-en |
— | — | @@ -0,0 +1,25 @@ |
#!/usr/bin/perl -w
use File::Copy;
use File::Path;
use File::Basename;

# Copy files from /mnt/wikipedia/htdocs/static to /var/static.
# Input (stdin or files named as arguments) is one relative path per line.
# Files that already exist at the destination are skipped; missing parent
# directories are created. The result for each path is printed.

while (<>) {
    chomp($_);
    $source = "/mnt/wikipedia/htdocs/static/$_";
    $dest = "/var/static/$_";
    if (!-e $dest) {
        if (!-d dirname($dest)) {
            mkpath(dirname($dest));
        }
        print "$_ ";
        $result = copy($source, $dest);
        if ( $result ) {
            print "OK\n";
        } else {
            print "failed\n";
        }
        # Pause for half a second between copies. The built-in sleep only
        # takes whole seconds, so "sleep 0.5" did not pause at all;
        # four-argument select gives a sub-second delay.
        select(undef, undef, undef, 0.5);
    } else {
        print "$_ already copied\n";
    }
}
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/copy-en |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 27 | + native |
Name: svn:executable |
2 | 28 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/filterNamespaces.php |
— | — | @@ -0,0 +1,35 @@ |
<?php
/**
 * Filter a list of file paths by namespace.
 *
 * Reads the file named by the first argument line by line. If the basename
 * of a line contains '~', the text before it is looked up as a namespace
 * name in $wgLang. Lines whose prefix resolves to a namespace other than
 * NS_IMAGE, NS_PROJECT, NS_HELP or NS_CATEGORY are dropped; all other lines
 * (no '~', or a prefix that is not a namespace) are printed unchanged.
 */

# The working directory is saved and restored around the include.
# NOTE(review): presumably commandLine.inc changes directory, and restoring
# it lets a relative filename resolve against the caller's directory - confirm.
$oldDir = getcwd();
require_once( '/home/wikipedia/common/php/maintenance/commandLine.inc' );
chdir( $oldDir );

# isset() avoids an undefined-index notice when no argument was given.
$filename = isset( $args[0] ) ? $args[0] : '';
if ( !$filename ) {
    die("No filename specified\n");
}

# file() returns false on failure but an empty array for an empty file;
# only the former is an error.
$lines = file( $filename );
if ( $lines === false ) {
    die( "Unable to open file $filename\n" );
}

$keptNamespaces = array( NS_IMAGE, NS_PROJECT, NS_HELP, NS_CATEGORY );

foreach ( $lines as $line ) {
    $base = basename( trim( $line ) );
    $tildePos = strpos( $base, '~' );
    $printIt = true;
    if ( $tildePos !== false ) {
        $ns = substr( $base, 0, $tildePos );
        $nsi = $wgLang->getNsIndex( $ns );
        if ( $nsi !== false && !in_array( $nsi, $keptNamespaces ) ) {
            $printIt = false;
        }
    }
    if ( $printIt ) {
        print $line;
    }
}
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/filterNamespaces.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 37 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/do4thread |
— | — | @@ -0,0 +1,16 @@ |
#!/bin/bash
#
# Start the four worker scripts (thread1..thread4) in the background, each
# appending its output to its own log file under /var/static.
#
# Usage: do4thread <edition>

if [ "X$1" == "X" ]; then
    echo "Usage: do4thread <edition>"
    # Non-zero so that callers can tell that nothing was started.
    exit 1
fi

# Exported because the worker scripts source $bindir/functions, which in
# turn reads $edition.
export bindir=/var/static/scripts
export edition=$1


# Do not launch the workers from an arbitrary directory if the cd fails.
cd /home/wikipedia/common/php-1.5/maintenance || exit 1
for n in 1 2 3 4; do
    "$bindir/thread$n" "$1" >> "/var/static/thread$n.log" 2>&1 &
done
| 17 | + |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/old/do4thread |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 18 | + native |
Name: svn:executable |
2 | 19 | + * |
Index: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/queueController.php |
— | — | @@ -0,0 +1,236 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +$baseDir = '/a/static'; |
| 5 | + |
| 6 | +$wgNoDBParam = true; |
| 7 | +require_once( '/home/wikipedia/common/php/maintenance/commandLine.inc' ); |
| 8 | + |
| 9 | +$wikiList = array_map( 'trim', file( '/home/wikipedia/common/wikipedia.dblist' ) ); |
| 10 | +$private = array_map( 'trim', file( '/home/wikipedia/common/private.dblist' ) ); |
| 11 | +$closed = array_map( 'trim', file( '/home/wikipedia/common/closed.dblist' ) ); |
| 12 | +$wikiList = array_diff( $wikiList, $private, $closed ); |
| 13 | + |
| 14 | +$targetQueueSize = 20; |
| 15 | +$maxArticlesPerJob = 10000; |
| 16 | +$jobTimeout = 86400; |
| 17 | + |
| 18 | +$queueSock = fsockopen( 'localhost', 8200 ); |
| 19 | +if ( !$queueSock ) { |
| 20 | + echo "Unable to connect to queue server\n"; |
| 21 | + die(1); |
| 22 | +} |
| 23 | + |
| 24 | +# Flush the queue: send "clear" and read (and discard) one line of reply |
| 25 | +fwrite( $queueSock, "clear\n" ); |
| 26 | +fgets( $queueSock ); |
| 27 | + |
| 28 | +# Fetch wiki stats (page count per wiki): from the serialized cache file when it is readable and non-empty, otherwise from each wiki's site_stats |
| 29 | +$wikiSizes = @file_get_contents( "$baseDir/var/checkpoints/wikiSizes" ); |
| 30 | +if ( $wikiSizes ) { |
| 31 | + $wikiSizes = unserialize( $wikiSizes ); # NOTE(review): unserialize() returns false on corrupt data; that case is not checked |
| 32 | +} else { |
| 33 | + $wikiSizes = array(); |
| 34 | + foreach ( $wikiList as $wiki ) { |
| 35 | + $lb = wfGetLB( $wiki ); |
| 36 | + $db = $lb->getConnection( DB_SLAVE, array(), $wiki ); |
| 37 | + $wikiSizes[$wiki] = $db->selectField( "`$wiki`.site_stats", 'ss_total_pages' ); # total page count of this wiki |
| 38 | + $lb->reuseConnection( $db ); |
| 39 | + } |
| 40 | + file_put_contents( "$baseDir/var/checkpoints/wikiSizes", serialize( $wikiSizes ) ); # write the cache read at the top of this section |
| 41 | +} |
| 42 | + |
| 43 | +# Drop cached entries for wikis that are not in $wikiList; wikis missing from the cache are not added here |
| 44 | +foreach ( $wikiSizes as $wiki => $size ) { |
| 45 | + if ( !in_array( $wiki, $wikiList ) ) { |
| 46 | + unset( $wikiSizes[$wiki] ); |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +# Build the job list plus the gate counters that sequence it |
| 51 | +$jobs = array(); |
| 52 | +$gates = array(); |
| 53 | +# The 'everything' gate starts at one per wiki; each wiki's last job triggers it |
| 54 | +$gates['everything'] = count( $wikiSizes ); |
| 55 | + |
| 56 | +foreach ( $wikiSizes as $wiki => $size ) { |
| 57 | + # Article jobs: one slice per $maxArticlesPerJob pages, rounded up |
| 58 | + $articleJobs = (int)ceil( $size / $maxArticlesPerJob ); |
| 59 | + $jobsRemainingPerWiki[$wiki] = $articleJobs; |
| 60 | + $articleGate = "$wiki articles"; |
| 61 | + $gates[$articleGate] = $articleJobs; |
| 62 | + |
| 63 | + for ( $slice = 1; $slice <= $articleJobs; ++$slice ) { |
| 64 | + $nextJobId = count( $jobs ); |
| 65 | + $jobs[$nextJobId] = array( |
| 66 | + 'id' => $nextJobId, |
| 67 | + 'cmd' => "$nextJobId $wiki articles $slice/$articleJobs", |
| 68 | + 'wiki' => $wiki, |
| 69 | + 'trigger' => $articleGate |
| 70 | + ); |
| 71 | + } |
| 72 | + |
| 73 | + # Shared description page jobs: at most 256 slices, gated on the article jobs |
| 74 | + $sharedJobs = min( $articleJobs, 256 ); |
| 75 | + $sharedGate = "$wiki shared"; |
| 76 | + $gates[$sharedGate] = $sharedJobs; |
| 77 | + |
| 78 | + for ( $slice = 1; $slice <= $sharedJobs; ++$slice ) { |
| 79 | + $nextJobId = count( $jobs ); |
| 80 | + $jobs[$nextJobId] = array( |
| 81 | + 'id' => $nextJobId, |
| 82 | + 'gate' => $articleGate, |
| 83 | + 'cmd' => "$nextJobId $wiki shared $slice/$sharedJobs", |
| 84 | + 'wiki' => $wiki, |
| 85 | + 'trigger' => $sharedGate |
| 86 | + ); |
| 87 | + } |
| 88 | + |
| 89 | + # Compression job: a single "finish" job per wiki, gated on the shared jobs |
| 90 | + $nextJobId = count( $jobs ); |
| 91 | + $jobs[$nextJobId] = array( |
| 92 | + 'id' => $nextJobId, |
| 93 | + 'gate' => $sharedGate, |
| 94 | + 'cmd' => "$nextJobId $wiki finish 1/1", |
| 95 | + 'wiki' => $wiki, |
| 96 | + 'trigger' => 'everything', |
| 97 | + ); |
| 98 | +} |
| 99 | + |
| 100 | +# Write job list: one job command per line in $baseDir/var/jobs/list |
| 101 | +if ( !is_dir( "$baseDir/var/jobs" ) ) { |
| 102 | + mkdir( "$baseDir/var/jobs", 0777, true ); # 2nd argument is the mode, 3rd is "recursive"; passing true as the mode created a mode-0001 directory |
| 103 | +} |
| 104 | +$file = fopen( "$baseDir/var/jobs/list", 'w' ); |
| 105 | +if ( !$file ) { |
| 106 | + print "Unable to open $baseDir/var/jobs/list for writing\n"; |
| 107 | + exit( 1 ); |
| 108 | +} |
| 109 | +foreach ( $jobs as $job ) { |
| 110 | + fwrite( $file, $job['cmd']."\n" ); |
| 111 | +} |
| 112 | +fclose( $file ); |
| 113 | + |
| 114 | +$doneCount = 0; # Jobs seen finished so far (used for progress output) |
| 115 | +$start = 0; # Lowest job index that may still be present in $jobs |
| 116 | +$queued = 0; # NOTE(review): never read or updated again in this script |
| 117 | +$jobCount = count( $jobs ); |
| 118 | +$queueTimes = array(); # Job index => time() of its most recent enqueue |
| 119 | +$initialisedWikis = array(); # Wiki name => true once startWiki() has run for it |
| 120 | + |
| 121 | +print "$jobCount jobs to do\n"; |
| 122 | +# Poll until the 'everything' gate counter has dropped to zero |
| 123 | +while ( $gates['everything'] ) { |
| 124 | + for ( $i = $start; $i < $jobCount && getQueueSize() < $targetQueueSize; $i++ ) { # scan only while the server queue is below the target size |
| 125 | + if ( !isset( $jobs[$i] ) ) { |
| 126 | + # Already done and removed |
| 127 | + continue; |
| 128 | + } |
| 129 | + $job = $jobs[$i]; |
| 130 | + |
| 131 | + if ( isset( $job['gate'] ) && $gates[$job['gate']] ) { |
| 132 | + # Job is waiting for a gate: that gate's counter is still non-zero |
| 133 | + continue; |
| 134 | + } |
| 135 | + |
| 136 | + $queueing = false; |
| 137 | + if ( isDone( $job ) ) { |
| 138 | + $doneCount++; |
| 139 | + print "Job $i done: {$job['cmd']} ($doneCount of $jobCount)\n"; |
| 140 | + |
| 141 | + # Count this job off its trigger gate (the counter never goes below zero) |
| 142 | + if ( isset( $job['trigger'] ) && $gates[$job['trigger']] ) { |
| 143 | + --$gates[$job['trigger']]; |
| 144 | + } |
| 145 | + # Remove the job from the job list |
| 146 | + unset( $jobs[$i] ); |
| 147 | + # Advance the start pointer past every job that has been removed |
| 148 | + while ( !isset( $jobs[$start] ) && $start < $jobCount ) { |
| 149 | + $start++; |
| 150 | + } |
| 151 | + } elseif ( !isset( $queueTimes[$i] ) ) { # not enqueued yet during this run |
| 152 | + print "Queueing job $i: {$job['cmd']}\n"; |
| 153 | + $queueing = true; |
| 154 | + } elseif ( time() > $queueTimes[$i] + $jobTimeout ) { # enqueued more than $jobTimeout seconds ago and still not done |
| 155 | + print "Timeout, requeueing job $i: {$job['cmd']}\n"; |
| 156 | + $queueing = true; |
| 157 | + } elseif ( isTerminated( $job ) ) { # status file reports 'terminated' |
| 158 | + print "Job $i died, requeueing: {$job['cmd']}\n"; |
| 159 | + removeJobStatus( $job ); # delete the status file before the job is queued again |
| 160 | + $queueing = true; |
| 161 | + } else { |
| 162 | + $queueing = false; |
| 163 | + } |
| 164 | + if ( $queueing ) { |
| 165 | + $wiki = $job['wiki']; |
| 166 | + if ( !isset( $initialisedWikis[$wiki] ) ) { # first job queued for this wiki in this run |
| 167 | + startWiki( $wiki ); |
| 168 | + $initialisedWikis[$wiki] = true; |
| 169 | + } |
| 170 | + enqueue( $job ); |
| 171 | + $queueTimes[$i] = time(); |
| 172 | + } |
| 173 | + } |
| 174 | + sleep(10); # wait 10 seconds between scans |
| 175 | +} |
| 176 | + |
| 177 | +//------------------------------------------------------------ |
| 178 | + |
| 179 | +function getQueueSize() { # Ask the queue server for its size; returns an int, dies on any I/O or protocol error |
| 180 | + global $queueSock; |
| 181 | + if ( fwrite( $queueSock, "size\n" ) === false ) { |
| 182 | + die( "Unable to write to queue server\n" ); |
| 183 | + } |
| 184 | + |
| 185 | + $response = fgets( $queueSock ); |
| 186 | + if ( $response === false ) { |
| 187 | + die( "Unable to read from queue server\n" ); |
| 188 | + } |
| 189 | + if ( !preg_match( '/^size (\d+)/', $response, $m ) ) { # at least one digit is required, so a reply without a number is rejected |
| 190 | + die( "Invalid response to size request\n" ); |
| 191 | + } |
| 192 | + return intval( $m[1] ); # int, so callers compare numbers rather than strings |
| 193 | +} |
| 194 | + |
| 195 | +function getJobStatus( $job ) { |
| 196 | + # Returns the trimmed second line of $baseDir/var/jobs/<id>, |
| 197 | + # or false when the file cannot be read or has fewer than two lines |
| 198 | + global $baseDir; |
| 199 | + $statusLines = @file( "$baseDir/var/jobs/{$job['id']}" ); |
| 200 | + |
| 201 | + if ( isset( $statusLines[1] ) ) { |
| 202 | + return trim( $statusLines[1] ); |
| 203 | + } |
| 204 | + return false; |
| 205 | +} |
| 206 | + |
| 207 | +function removeJobStatus( $job ) { |
| 208 | + # Delete the job's status file under $baseDir/var/jobs; errors are suppressed |
| 209 | + global $baseDir; |
| 210 | + @unlink( "$baseDir/var/jobs/{$job['id']}" ); |
| 211 | +} |
| 212 | + |
| 213 | +function isDone( $job ) { # True when the job's status is 'done' |
| 214 | + return 'done' === getJobStatus( $job ); |
| 215 | +} |
| 216 | + |
| 217 | +function isTerminated( $job ) { # True when the job's status is 'terminated' |
| 218 | + return 'terminated' === getJobStatus( $job ); |
| 219 | +} |
| 220 | + |
| 221 | +function enqueue( $job ) { # Send "enq <cmd>" for $job to the queue server; dies if the server cannot be written to or read from |
| 222 | + global $queueSock; |
| 223 | + if ( fwrite( $queueSock, "enq {$job['cmd']}\n" ) === false ) { |
| 224 | + die( "Unable to write to queue server\n" ); |
| 225 | + } |
| 226 | + if ( fgets( $queueSock ) === false ) { # the reply text is discarded, but a failed read means the connection is unusable |
| 227 | + die( "Unable to read from queue server\n" ); |
| 228 | + } |
| 229 | +} |
| 230 | + |
| 231 | +function startWiki( $wiki ) { |
| 232 | + # Announce a wiki, named by $wiki with every occurrence of 'wiki' removed (enwiki -> en) |
| 233 | + $lang = str_replace( 'wiki', '', $wiki ); |
| 234 | + print "Starting language $lang\n"; |
| 235 | +} |
| 236 | + |
| 237 | +?> |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/wm-scripts/queueController.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 238 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/README |
— | — | @@ -0,0 +1 @@ |
| 2 | +Work in progress on a replacement for maintenance/dumpHTML.php, to work with MW 1.11+. |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/README |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 3 | + native |
Index: branches/wmf-deployment/extensions/DumpHTML/SkinOffline.php |
— | — | @@ -0,0 +1,244 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +/** |
| 5 | + * Default skin for HTML dumps, based on MonoBook.php |
| 6 | + */ |
| 7 | + |
| 8 | +if( !defined( 'MEDIAWIKI' ) ) |
| 9 | + die( 1 ); |
| 10 | + |
| 11 | +/** |
| 12 | + * Skin for offline HTML dumps: SkinTemplate using the 'offline' style directory |
| 13 | + * and the SkinOfflineTemplate template, with dump-specific sidebar, tabs and broken links. |
| 14 | + * @addtogroup Skins |
| 15 | + */ |
| 16 | +class SkinOffline extends SkinTemplate { |
| 17 | + /** Select the offline template class and the 'offline' style directory. */ |
| 18 | + function initPage( &$out ) { |
| 19 | + global $wgStylePath; |
| 20 | + parent::initPage( $out ); |
| 21 | + $this->skinpath = "$wgStylePath/offline"; |
| 22 | + $this->template = 'SkinOfflineTemplate'; |
| 23 | + } |
| 24 | + /** Add the skin path and the favicon URL to the template object built by the parent. */ |
| 25 | + function setupTemplate( $className, $repository = false, $cache_dir = false ) { |
| 26 | + global $wgFavicon; |
| 27 | + $template = parent::setupTemplate( $className, $repository, $cache_dir ); |
| 28 | + $template->set( 'skinpath', $this->skinpath ); |
| 29 | + $template->set( 'favicon', $wgFavicon ); |
| 30 | + return $template; |
| 31 | + } |
| 32 | + /** Parent sidebar minus links whose href is the recentchanges-url or randompage-url target. */ |
| 33 | + function buildSidebar() { |
| 34 | + $sidebar = parent::buildSidebar(); |
| 35 | + $hiddenUrls = array(); |
| 36 | + foreach ( array( 'recentchanges-url', 'randompage-url' ) as $msgKey ) { |
| 37 | + $hiddenUrls[] = self::makeInternalOrExternalUrl( wfMsgForContent( $msgKey ) ); |
| 38 | + } |
| 39 | + |
| 40 | + # Remove matching links in place; the remaining keys are left as they were |
| 41 | + foreach ( $sidebar as $heading => $links ) { |
| 42 | + foreach ( $links as $pos => $link ) { |
| 43 | + if ( in_array( $link['href'], $hiddenUrls ) ) { |
| 44 | + unset( $sidebar[$heading][$pos] ); |
| 45 | + } |
| 46 | + } |
| 47 | + } |
| 48 | + return $sidebar; |
| 49 | + } |
| 50 | + /** Tabs: subject page, talk page and, when $wgHTMLDump is set, a 'current' revision link. */ |
| 51 | + function buildContentActionUrls() { |
| 52 | + global $wgHTMLDump; |
| 53 | + $title = $this->mTitle; |
| 54 | + $tabs = array(); |
| 55 | + $nsKey = $this->getNameSpaceKey(); |
| 56 | + $tabs[$nsKey] = $this->tabAction( |
| 57 | + $title->getSubjectPage(), |
| 58 | + $nsKey, |
| 59 | + !$title->isTalkPage() ); |
| 60 | + |
| 61 | + $tabs['talk'] = $this->tabAction( |
| 62 | + $title->getTalkPage(), |
| 63 | + 'talk', |
| 64 | + $title->isTalkPage(), |
| 65 | + '', |
| 66 | + true ); |
| 67 | + |
| 68 | + if ( isset( $wgHTMLDump ) ) { |
| 69 | + $tabs['current'] = array( |
| 70 | + 'text' => wfMsg( 'currentrev' ), |
| 71 | + 'href' => str_replace( '$1', wfUrlencode( $title->getPrefixedDBkey() ), |
| 72 | + $wgHTMLDump->oldArticlePath ), |
| 73 | + 'class' => false |
| 74 | + ); |
| 75 | + } |
| 76 | + return $tabs; |
| 77 | + } |
| 78 | + /** Render a link to a missing page as plain text; categories with members get a normal link. */ |
| 79 | + function makeBrokenLinkObj( &$nt, $text = '', $query = '', $trail = '', $prefix = '' ) { |
| 80 | + if ( !isset( $nt ) ) { |
| 81 | + return "<!-- ERROR -->{$prefix}{$text}{$trail}"; |
| 82 | + } |
| 83 | + |
| 84 | + if ( $nt->getNamespace() == NS_CATEGORY ) { |
| 85 | + # A category with at least one categorylinks row is linked like an existing page |
| 86 | + $db = wfGetDB( DB_SLAVE ); |
| 87 | + $populated = $db->selectField( 'categorylinks', '1', |
| 88 | + array( 'cl_to' => $nt->getDBkey() ), __METHOD__ ); |
| 89 | + if ( $populated ) { |
| 90 | + return $this->makeKnownLinkObj( $nt, $text, $query, $trail, $prefix ); |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + # Fall back to the prefixed title when no link text was given |
| 95 | + $label = ( $text == '' ) ? $nt->getPrefixedText() : $text; |
| 96 | + |
| 97 | + return $prefix . $label . $trail; |
| 98 | + } |
| 99 | + /** Overrides the parent to return an empty string. */ |
| 100 | + function printSource() { |
| 101 | + return ''; |
| 102 | + } |
| 103 | +} |
| 104 | + |
| 105 | +/** |
| 106 | + * Page template for the offline skin: execute() prints one complete XHTML page. |
| 107 | + * @addtogroup Skins |
| 108 | + */ |
| 109 | +class SkinOfflineTemplate extends QuickTemplate { |
| 110 | + /** |
| 111 | + * Print the page: head (styles and scripts), the content column, then the |
| 112 | + * side column (tabs, logo, sidebar, search form, language links) and footer. |
| 113 | + * Values are read from $this->data, directly or through text(), html() and msg(). |
| 114 | + * Warnings are suppressed on entry and restored after the closing html tag. |
| 115 | + * NOTE(review): the "Views" tab heading is hard-coded English, not a message. |
| 116 | + * @private |
| 117 | + */ |
| 118 | + function execute() { |
| 119 | + wfSuppressWarnings(); |
| 120 | +?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| 121 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="<?php $this->text('lang') ?>" lang="<?php $this->text('lang') ?>" dir="<?php $this->text('dir') ?>"> |
| 122 | + <head> |
| 123 | + <meta http-equiv="Content-Type" content="<?php $this->text('mimetype') ?>; charset=<?php $this->text('charset') ?>" /> |
| 124 | + <!-- headlinks removed --> |
| 125 | + <link rel="shortcut icon" href="<?php $this->text('favicon'); ?>"/> |
| 126 | + <title><?php $this->text('pagetitle') ?></title> |
| 127 | + <style type="text/css">/*<![CDATA[*/ @import "<?php $this->text('skinpath') ?>/main.css"; /*]]>*/</style> |
| 128 | + <link rel="stylesheet" type="text/css" media="print" href="<?php $this->text('stylepath') ?>/common/commonPrint.css" /> |
| 129 | + <!--[if lt IE 5.5000]><style type="text/css">@import "<?php $this->text('stylepath') ?>/<?php $this->text('stylename') ?>/IE50Fixes.css";</style><![endif]--> |
| 130 | + <!--[if IE 5.5000]><style type="text/css">@import "<?php $this->text('stylepath') ?>/<?php $this->text('stylename') ?>/IE55Fixes.css";</style><![endif]--> |
| 131 | + <!--[if IE 6]><style type="text/css">@import "<?php $this->text('stylepath') ?>/<?php $this->text('stylename') ?>/IE60Fixes.css";</style><![endif]--> |
| 132 | + <!--[if IE]><script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('stylepath') ?>/common/IEFixes.js"></script> |
| 133 | + <meta http-equiv="imagetoolbar" content="no" /><![endif]--> |
| 134 | + <script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('stylepath' ) ?>/common/wikibits.js"></script> |
| 135 | + <script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('skinpath' ) ?>/md5.js"></script> |
| 136 | + <script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('skinpath' ) ?>/utf8.js"></script> |
| 137 | + <script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('skinpath' ) ?>/lookup.js"></script> |
| 138 | + <?php if($this->data['jsvarurl' ]) { ?><script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('jsvarurl' ) ?>"></script><?php } ?> |
| 139 | + <?php if($this->data['pagecss' ]) { ?><style type="text/css"><?php $this->html('pagecss' ) ?></style><?php } ?> |
| 140 | + <?php if($this->data['usercss' ]) { ?><style type="text/css"><?php $this->html('usercss' ) ?></style><?php } ?> |
| 141 | + <?php if($this->data['userjs' ]) { ?><script type="<?php $this->text('jsmimetype') ?>" src="<?php $this->text('userjs' ) ?>"></script><?php } ?> |
| 142 | + <?php if($this->data['userjsprev']) { ?><script type="<?php $this->text('jsmimetype') ?>"><?php $this->html('userjsprev') ?></script><?php } ?> |
| 143 | + </head> |
| 144 | + <body |
| 145 | + <?php if($this->data['pageclass']) { ?>class="<?php $this->text('pageclass') ?>"<?php } ?>> |
| 146 | + <div id="globalWrapper"> |
| 147 | + <div id="column-content"> |
| 148 | + <div id="content"> |
| 149 | + <a name="top" id="contentTop"></a> |
| 150 | + <?php if($this->data['sitenotice']) { ?><div id="siteNotice"><?php $this->html('sitenotice') ?></div><?php } ?> |
| 151 | + <h1 class="firstHeading"><?php $this->data['displaytitle']!=""?$this->html('title'):$this->text('title') ?></h1> |
| 152 | + <div id="bodyContent"> |
| 153 | + <h3 id="siteSub"><?php $this->msg('tagline') ?></h3> |
| 154 | + <div id="contentSub"><?php $this->html('subtitle') ?></div> |
| 155 | + <?php if($this->data['undelete']) { ?><div id="contentSub"><?php $this->html('undelete') ?></div><?php } ?> |
| 156 | + <?php if($this->data['newtalk'] ) { ?><div class="usermessage"><?php $this->html('newtalk') ?></div><?php } ?> |
| 157 | + <!-- start content --> |
| 158 | + <?php $this->html('bodytext') ?> |
| 159 | + <?php if($this->data['catlinks']) { ?><div id="catlinks"><?php $this->html('catlinks') ?></div><?php } ?> |
| 160 | + <!-- end content --> |
| 161 | + <div class="visualClear"></div> |
| 162 | + </div> |
| 163 | + </div> |
| 164 | + </div> |
| 165 | + <div id="column-one"> |
| 166 | + <div id="p-cactions" class="portlet"> |
| 167 | + <h5>Views</h5> |
| 168 | + <ul> |
| 169 | + <?php foreach($this->data['content_actions'] as $key => $action) { |
| 170 | + ?><li id="ca-<?php echo htmlspecialchars($key) ?>" |
| 171 | + <?php if($action['class']) { ?>class="<?php echo htmlspecialchars($action['class']) ?>"<?php } ?> |
| 172 | + ><a href="<?php echo htmlspecialchars($action['href']) ?>"><?php |
| 173 | + echo htmlspecialchars($action['text']) ?></a></li><?php |
| 174 | + } ?> |
| 175 | + </ul> |
| 176 | + </div> |
| 177 | + <div class="portlet" id="p-logo"> |
| 178 | + <a style="background-image: url(<?php $this->text('logopath') ?>);" |
| 179 | + href="<?php echo htmlspecialchars($this->data['nav_urls']['mainpage']['href'])?>" |
| 180 | + title="<?php $this->msg('mainpage') ?>"></a> |
| 181 | + </div> |
| 182 | + <script type="<?php $this->text('jsmimetype') ?>"> if (window.isMSIE55) fixalpha(); </script> |
| 183 | + <?php foreach ($this->data['sidebar'] as $bar => $cont) { ?> |
| 184 | + <div class='portlet' id='p-<?php echo htmlspecialchars($bar) ?>'> |
| 185 | + <h5><?php $out = wfMsg( $bar ); if (wfEmptyMsg($bar, $out)) echo $bar; else echo $out; ?></h5> |
| 186 | + <div class='pBody'> |
| 187 | + <ul> |
| 188 | + <?php foreach($cont as $key => $val) { ?> |
| 189 | + <li id="<?php echo htmlspecialchars($val['id']) ?>"><a href="<?php echo htmlspecialchars($val['href']) ?>"><?php echo htmlspecialchars($val['text'])?></a></li> |
| 190 | + <?php } ?> |
| 191 | + </ul> |
| 192 | + </div> |
| 193 | + </div> |
| 194 | + <?php } ?> |
| 195 | + <div id="p-search" class="portlet"> |
| 196 | + <h5><label for="searchInput"><?php $this->msg('search') ?></label></h5> |
| 197 | + <div id="searchBody" class="pBody"> |
| 198 | + <form action="javascript:goToStatic(3)" id="searchform"><div> |
| 199 | + <input id="searchInput" name="search" type="text" |
| 200 | + <?php if($this->haveMsg('accesskey-search')) { |
| 201 | + ?>accesskey="<?php $this->msg('accesskey-search') ?>"<?php } |
| 202 | + if( isset( $this->data['search'] ) ) { |
| 203 | + ?> value="<?php $this->text('search') ?>"<?php } ?> /> |
| 204 | + <input type='submit' name="go" class="searchButton" id="searchGoButton" |
| 205 | + value="<?php $this->msg('go') ?>" /> |
| 206 | + </div></form> |
| 207 | + </div> |
| 208 | + </div> |
| 209 | + <?php if( $this->data['language_urls'] ) { ?><div id="p-lang" class="portlet"> |
| 210 | + <h5><?php $this->msg('otherlanguages') ?></h5> |
| 211 | + <div class="pBody"> |
| 212 | + <ul> |
| 213 | + <?php foreach($this->data['language_urls'] as $langlink) { ?> |
| 214 | + <li> |
| 215 | + <a href="<?php echo htmlspecialchars($langlink['href']) |
| 216 | + ?>"><?php echo $langlink['text'] ?></a> |
| 217 | + </li> |
| 218 | + <?php } ?> |
| 219 | + </ul> |
| 220 | + </div> |
| 221 | + </div> |
| 222 | + <?php } ?> |
| 223 | + </div><!-- end of the left (by default at least) column --> |
| 224 | + <div class="visualClear"></div> |
| 225 | + <div id="footer"> |
| 226 | + <?php if($this->data['poweredbyico']) { ?><div id="f-poweredbyico"><?php $this->html('poweredbyico') ?></div><?php } ?> |
| 227 | + <?php if($this->data['copyrightico']) { ?><div id="f-copyrightico"><?php $this->html('copyrightico') ?></div><?php } ?> |
| 228 | + <ul id="f-list"> |
| 229 | + <?php if($this->data['lastmod' ]) { ?><li id="f-lastmod"><?php $this->html('lastmod') ?></li><?php } ?> |
| 230 | + <?php if($this->data['numberofwatchingusers' ]) { ?><li id="f-numberofwatchingusers"><?php $this->html('numberofwatchingusers') ?></li><?php } ?> |
| 231 | + <?php if($this->data['credits' ]) { ?><li id="f-credits"><?php $this->html('credits') ?></li><?php } ?> |
| 232 | + <?php if($this->data['copyright' ]) { ?><li id="f-copyright"><?php $this->html('copyright') ?></li><?php } ?> |
| 233 | + <?php if($this->data['about' ]) { ?><li id="f-about"><?php $this->html('about') ?></li><?php } ?> |
| 234 | + <?php if($this->data['disclaimer']) { ?><li id="f-disclaimer"><?php $this->html('disclaimer') ?></li><?php } ?> |
| 235 | + <?php if($this->data['tagline']) { ?><li id="f-tagline"><?php echo $this->data['tagline'] ?></li><?php } ?> |
| 236 | + </ul> |
| 237 | + </div> |
| 238 | + </div> |
| 239 | + </body> |
| 240 | +</html> |
| 241 | +<?php |
| 242 | + wfRestoreWarnings(); |
| 243 | + } |
| 244 | +} |
| 245 | +?> |
Property changes on: branches/wmf-deployment/extensions/DumpHTML/SkinOffline.php |
___________________________________________________________________ |
Name: svn:keywords |
1 | 246 | + Author Date Id Revision |
Name: svn:eol-style |
2 | 247 | + native |