r42166 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r42165‎ | r42166 | r42167 >
Date:09:11, 17 October 2008
Author:tstarling
Status:old (Comments)
Tags:
Comment:
Revert revert r41578 of r41531 and fix compressOld.php.
Modified paths:
  • /trunk/phase3/includes/AutoLoader.php (modified) (history)
  • /trunk/phase3/includes/HistoryBlob.php (modified) (history)
  • /trunk/phase3/maintenance/storage/compressOld.inc (modified) (history)
  • /trunk/phase3/maintenance/storage/testCompression.php (added) (history)

Diff [purge]

Index: trunk/phase3/maintenance/storage/compressOld.inc
@@ -4,10 +4,6 @@
55 * @ingroup Maintenance ExternalStorage
66 */
77
8 -/** */
9 -require_once( 'Revision.php' );
10 -require_once( 'ExternalStoreDB.php' );
11 -
128 /** @todo document */
139 function compressOldPages( $start = 0, $extdb = '' ) {
1410 $fname = 'compressOldPages';
@@ -229,7 +225,7 @@
230226 $stub = false;
231227 print 'x';
232228 } else {
233 - $stub = $chunk->addItem( $text );
 229+ $stub = new HistoryBlobStub( $chunk->addItem( $text ) );
234230 $stub->setLocation( $primaryOldid );
235231 $stub->setReferrer( $oldid );
236232 print '.';
Index: trunk/phase3/maintenance/storage/testCompression.php
@@ -0,0 +1,70 @@
<?php
/**
 * Maintenance script: adds the revisions of one page to a HistoryBlob
 * implementation, reports the compression ratio and timing, then checks that
 * every revision text survives a serialize/unserialize round trip.
 *
 * Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>
 */

# NOTE(review): presumably read by commandLine.inc to decide which options take a value -- confirm
$optionsWithArgs = array( 'start', 'limit', 'type' );
require( dirname(__FILE__).'/../commandLine.inc' );

if ( !isset( $args[0] ) ) {
	echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
	exit( 1 );
}

$title = Title::newFromText( $args[0] );
if ( !$title ) {
	# Bail out here rather than hitting a fatal error on $title->getNamespace() below
	echo "Invalid page title: {$args[0]}\n";
	exit( 1 );
}

if ( isset( $options['start'] ) ) {
	$startUnix = strtotime( $options['start'] );
	if ( $startUnix === false ) {
		# Do not hand an unparseable date to wfTimestamp()
		echo "Unable to parse start date: {$options['start']}\n";
		exit( 1 );
	}
	$start = wfTimestamp( TS_MW, $startUnix );
	echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
} else {
	$start = '19700101000000';
}
$limit = isset( $options['limit'] ) ? $options['limit'] : 10;
$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';

# Fetch up to $limit revisions of the page newer than $start, joined to their text rows
$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
	array( 'page', 'revision', 'text' ),
	'*',
	array(
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
		'page_id=rev_page',
		'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
		'rev_text_id=old_id'
	), __FILE__, array( 'LIMIT' => $limit )
);

# Compression pass: add every revision text to the blob, remembering the key
# and an MD5 of the original text for the verification pass below.
$blob = new $type;
$hashes = array();
$keys = array();
$uncompressedSize = 0;
# Start at minus "now" and add "now" again afterwards: $t ends up as elapsed seconds
$t = -microtime( true );
foreach ( $res as $row ) {
	$revision = new Revision( $row );
	$text = $revision->getText();
	$uncompressedSize += strlen( $text );
	$hashes[$row->rev_id] = md5( $text );
	$keys[$row->rev_id] = $blob->addItem( $text );
}

# Serialization is included in the timed section
$serialized = serialize( $blob );
$t += microtime( true );

printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n",
	$res->numRows(),
	$uncompressedSize / strlen( $serialized ),
	$wgLang->formatSize( $uncompressedSize ),
	$wgLang->formatSize( strlen( $serialized ) )
);
printf( "Compression time: %5.2f ms\n", $t * 1000 );

# Decompression pass: restore the blob and compare every item with its recorded hash
$t = -microtime( true );
$blob = unserialize( $serialized );
foreach ( $keys as $id => $key ) {
	$text = $blob->getItem( $key );
	if ( md5( $text ) != $hashes[$id] ) {
		echo "Content hash mismatch for rev_id $id\n";
		#var_dump( $text );
	}
}
$t += microtime( true );
printf( "Decompression time: %5.2f ms\n", $t * 1000 );

Property changes on: trunk/phase3/maintenance/storage/testCompression.php
___________________________________________________________________
Added: svn:eol-style
172 + native
Index: trunk/phase3/includes/AutoLoader.php
@@ -35,6 +35,7 @@
3636 'Credits' => 'includes/Credits.php',
3737 'DBABagOStuff' => 'includes/BagOStuff.php',
3838 'DependencyWrapper' => 'includes/CacheDependency.php',
 39+ 'DiffHistoryBlob' => 'includes/HistoryBlob.php',
3940 'DjVuImage' => 'includes/DjVuImage.php',
4041 'DoubleReplacer' => 'includes/StringUtils.php',
4142 'DoubleRedirectJob' => 'includes/DoubleRedirectJob.php',
Index: trunk/phase3/includes/HistoryBlob.php
@@ -1,41 +1,33 @@
22 <?php
33
44 /**
5 - * Pure virtual parent
6 - * @todo document (needs a one-sentence top-level class description, that answers the question: "what is a HistoryBlob?")
 5+ * Base class for general text storage via the "object" flag in old_flags, or
 6+ * two-part external storage URLs. Used to represent efficient concatenated
 7+ * storage, and migration-related pointer objects.
78 */
89 interface HistoryBlob
910 {
1011 /**
11 - * setMeta and getMeta currently aren't used for anything, I just thought
12 - * they might be useful in the future.
13 - * @param $meta String: a single string.
14 - */
15 - public function setMeta( $meta );
16 -
17 - /**
18 - * setMeta and getMeta currently aren't used for anything, I just thought
19 - * they might be useful in the future.
20 - * Gets the meta-value
21 - */
22 - public function getMeta();
23 -
24 - /**
2512 * Adds an item of text. The caller builds the stub object which points to
2613 * the item; you must call setLocation() on the stub object before storing it
2714 * to the database
 15+ * Returns the key for getItem()
2816 */
2917 public function addItem( $text );
3018
3119 /**
32 - * Get item by hash
 20+ * Get item by key, or false if the key is not present
3321 */
34 - public function getItem( $hash );
 22+ public function getItem( $key );
3523
36 - # Set the "default text"
37 - # This concept is an odd property of the current DB schema, whereby each text item has a revision
38 - # associated with it. The default text is the text of the associated revision. There may, however,
39 - # be other revisions in the same object
 24+ /**
 25+ * Set the "default text"
 26+ * This concept is an odd property of the current DB schema, whereby each text item has a revision
 27+ * associated with it. The default text is the text of the associated revision. There may, however,
 28+ * be other revisions in the same object.
 29+ *
 30+ * Default text is not required for two-part external storage URLs.
 31+ */
4032 public function setText( $text );
4133
4234 /**
@@ -45,8 +37,8 @@
4638 }
4739
4840 /**
49 - * The real object
50 - * @todo document (needs one-sentence top-level class description + function descriptions).
 41+ * Concatenated gzip (CGZ) storage
 42+ * Improves compression ratio by concatenating like objects before gzipping
5143 */
5244 class ConcatenatedGzipHistoryBlob implements HistoryBlob
5345 {
@@ -60,34 +52,15 @@
6153 }
6254 }
6355
64 - #
65 - # HistoryBlob implementation:
66 - #
67 -
68 - /** @todo document */
69 - public function setMeta( $metaData ) {
70 - $this->uncompress();
71 - $this->mItems['meta'] = $metaData;
72 - }
73 -
74 - /** @todo document */
75 - public function getMeta() {
76 - $this->uncompress();
77 - return $this->mItems['meta'];
78 - }
79 -
80 - /** @todo document */
8156 public function addItem( $text ) {
8257 $this->uncompress();
8358 $hash = md5( $text );
8459 $this->mItems[$hash] = $text;
8560 $this->mSize += strlen( $text );
8661
87 - $stub = new HistoryBlobStub( $hash );
88 - return $stub;
 62+ return $hash;
8963 }
9064
91 - /** @todo document */
9265 public function getItem( $hash ) {
9366 $this->uncompress();
9467 if ( array_key_exists( $hash, $this->mItems ) ) {
@@ -97,29 +70,27 @@
9871 }
9972 }
10073
101 - /** @todo document */
10274 public function setText( $text ) {
10375 $this->uncompress();
104 - $stub = $this->addItem( $text );
105 - $this->mDefaultHash = $stub->mHash;
 76+ $this->mDefaultHash = $this->addItem( $text );
10677 }
10778
108 - /** @todo document */
10979 public function getText() {
11080 $this->uncompress();
11181 return $this->getItem( $this->mDefaultHash );
11282 }
11383
114 - # HistoryBlob implemented.
115 -
116 -
117 - /** @todo document */
 84+ /**
 85+ * Remove an item
 86+ */
11887 public function removeItem( $hash ) {
11988 $this->mSize -= strlen( $this->mItems[$hash] );
12089 unset( $this->mItems[$hash] );
12190 }
12291
123 - /** @todo document */
 92+ /**
 93+ * Compress the bulk data in the object
 94+ */
12495 public function compress() {
12596 if ( !$this->mCompressed ) {
12697 $this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -127,7 +98,9 @@
12899 }
129100 }
130101
131 - /** @todo document */
 102+ /**
 103+ * Uncompress bulk data
 104+ */
132105 public function uncompress() {
133106 if ( $this->mCompressed ) {
134107 $this->mItems = unserialize( gzinflate( $this->mItems ) );
@@ -136,19 +109,18 @@
137110 }
138111
139112
140 - /** @todo document */
141113 function __sleep() {
142114 $this->compress();
143115 return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
144116 }
145117
146 - /** @todo document */
147118 function __wakeup() {
148119 $this->uncompress();
149120 }
150121
151122 /**
152 - * Determines if this object is happy
 123+ * Helper function for compression jobs
 124+ * Returns true until the object is "full" and ready to be committed
153125 */
154126 public function isHappy( $maxFactor, $factorThreshold ) {
155127 if ( count( $this->mItems ) == 0 ) {
@@ -184,12 +156,15 @@
185157
186158
187159 /**
188 - * @todo document (needs one-sentence top-level class description + some function descriptions).
 160+ * Pointer object for an item within a CGZ blob stored in the text table.
189161 */
190162 class HistoryBlobStub {
191163 var $mOldId, $mHash, $mRef;
192164
193 - /** @todo document */
 165+ /**
 166+ * @param string $hash The content hash of the text
 167+ * @param integer $oldid The old_id for the CGZ object
 168+ */
194169 function HistoryBlobStub( $hash = '', $oldid = 0 ) {
195170 $this->mHash = $hash;
196171 }
@@ -216,7 +191,6 @@
217192 return $this->mRef;
218193 }
219194
220 - /** @todo document */
221195 function getText() {
222196 $fname = 'HistoryBlobStub::getText';
223197 global $wgBlobCache;
@@ -264,7 +238,9 @@
265239 return $obj->getItem( $this->mHash );
266240 }
267241
268 - /** @todo document */
 242+ /**
 243+ * Get the content hash
 244+ */
269245 function getHash() {
270246 return $this->mHash;
271247 }
@@ -282,7 +258,9 @@
283259 class HistoryBlobCurStub {
284260 var $mCurId;
285261
286 - /** @todo document */
 262+ /**
 263+ * @param integer $curid The cur_id pointed to
 264+ */
287265 function HistoryBlobCurStub( $curid = 0 ) {
288266 $this->mCurId = $curid;
289267 }
@@ -295,7 +273,6 @@
296274 $this->mCurId = $id;
297275 }
298276
299 - /** @todo document */
300277 function getText() {
301278 $dbr = wfGetDB( DB_SLAVE );
302279 $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) );
@@ -305,3 +282,123 @@
306283 return $row->cur_text;
307284 }
308285 }
 286+
/**
 * Diff-based history compression
 * Requires xdiff 1.5+ and zlib
 *
 * The first item added is kept whole; every later item is stored as a binary
 * diff against the item added just before it. The keys returned by addItem()
 * are zero-based positions in that chain.
 */
class DiffHistoryBlob implements HistoryBlob {
	/** Uncompressed item cache, indexed by key */
	var $mItems = array();

	/**
	 * Array of diffs, where $this->mDiffs[0] is the diff between
	 * $this->mItems[0] and $this->mItems[1]
	 */
	var $mDiffs = array();

	/**
	 * The key for getText()
	 */
	var $mDefaultKey;

	/**
	 * Compressed storage
	 */
	var $mCompressed;

	/**
	 * True if the object is locked against further writes
	 */
	var $mFrozen = false;


	/**
	 * Throws MWException if the xdiff or zlib functions are not available.
	 */
	function __construct() {
		if ( !function_exists( 'xdiff_string_bdiff' ) ){
			throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" );
		}
		if ( !function_exists( 'gzdeflate' ) ) {
			throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
		}
	}

	/**
	 * Append an item of text, recording its diff against the previous item.
	 *
	 * @param string $text
	 * @return integer The key to pass to getItem()
	 * Throws MWException if the object has been through __wakeup().
	 */
	function addItem( $text ) {
		if ( $this->mFrozen ) {
			throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" );
		}

		$this->mItems[] = $text;
		$i = count( $this->mItems ) - 1;
		if ( $i > 0 ) {
			# Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
			# "String is not zero-terminated"
			wfSuppressWarnings();
			$this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . '';
			wfRestoreWarnings();
		}
		return $i;
	}

	/**
	 * Get an item by key, rebuilding it from the diff chain if it is not
	 * in the item cache yet.
	 *
	 * @param integer $key A key returned by addItem()
	 * @return mixed The item text, or false if the key is not present
	 */
	function getItem( $key ) {
		$key = intval( $key );
		# Valid keys run from 0 (the base text) up to count( mDiffs ) inclusive,
		# since mDiffs[$i - 1] is what produces item $i. An object with no base
		# text holds no items at all.
		if ( $key < 0
			|| $key > count( $this->mDiffs )
			|| !array_key_exists( 0, $this->mItems ) )
		{
			return false;
		}

		# mItems is always filled contiguously from 0, so its size is the
		# first key that still has to be rebuilt.
		for ( $i = count( $this->mItems ); $i <= $key; $i++ ) {
			# Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
			# "String is not zero-terminated"
			wfSuppressWarnings();
			$this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . '';
			wfRestoreWarnings();
		}
		return $this->mItems[$key];
	}

	/**
	 * Add the default text and remember its key for getText().
	 */
	function setText( $text ) {
		$this->mDefaultKey = $this->addItem( $text );
	}

	/**
	 * Get the default text, i.e. the item stored by setText().
	 */
	function getText() {
		return $this->getItem( $this->mDefaultKey );
	}

	/**
	 * Pack the base text, the diffs and the default key into mCompressed,
	 * which is the only member that gets serialized.
	 */
	function __sleep() {
		if ( !isset( $this->mItems[0] ) ) {
			// Empty object
			$info = false;
		} else {
			$info = array(
				'base' => $this->mItems[0],
				'diffs' => $this->mDiffs
			);
		}
		if ( isset( $this->mDefaultKey ) ) {
			$info['default'] = $this->mDefaultKey;
		}
		$this->mCompressed = gzdeflate( serialize( $info ) );
		return array( 'mCompressed' );
	}

	/**
	 * Restore the base text, the diffs and the default key from mCompressed.
	 * Later items are rebuilt on demand by getItem().
	 */
	function __wakeup() {
		// addItem() doesn't work if mItems is partially filled from mDiffs
		$this->mFrozen = true;
		$info = unserialize( gzinflate( $this->mCompressed ) );
		unset( $this->mCompressed );

		if ( !$info ) {
			// Empty object
			return;
		}

		if ( isset( $info['default'] ) ) {
			$this->mDefaultKey = $info['default'];
		}
		$this->mItems[0] = $info['base'];
		$this->mDiffs = $info['diffs'];
	}
}

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r41531* Concept for diff-based compression using the new xdiff beta. Acheives massi...tstarling06:32, 2 October 2008
r41578Backing r41531 out for now ("Concept for diff-based compression using the new...brion00:00, 3 October 2008

Comments

#Comment by Brion VIBBER (talk | contribs)   23:45, 19 October 2008

Hope this is ok... looks sane :D

Status & tagging log