r41531 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r41530‎ | r41531 | r41532 >
Date:06:32, 2 October 2008
Author:tstarling
Status:old (Comments)
Tags:
Comment:
* Concept for diff-based compression using the new xdiff beta. Achieves a massively better compression ratio than CGZ for articles which are larger than the deflate 32 KB sliding window. Works within the HistoryBlob architecture.
* Fixed documentation in HistoryBlob.php, removed "todo document" for methods that are adequately documented in the interface.
* Added testCompression.php for testing concatenated object compression ratio
Modified paths:
  • /trunk/phase3/includes/AutoLoader.php (modified) (history)
  • /trunk/phase3/includes/HistoryBlob.php (modified) (history)
  • /trunk/phase3/maintenance/storage/testCompression.php (added) (history)

Diff [purge]

Index: trunk/phase3/maintenance/storage/testCompression.php
@@ -0,0 +1,70 @@
 2+<?php
 3+
 4+$optionsWithArgs = array( 'start', 'limit', 'type' );
 5+require( dirname(__FILE__).'/../commandLine.inc' );
 6+
 7+if ( !isset( $args[0] ) ) {
 8+ echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
 9+ exit( 1 );
 10+}
 11+
 12+$title = Title::newFromText( $args[0] );
 13+if ( isset( $options['start'] ) ) {
 14+ $start = wfTimestamp( TS_MW, strtotime( $options['start'] ) );
 15+ echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
 16+} else {
 17+ $start = '19700101000000';
 18+}
 19+$limit = isset( $options['limit'] ) ? $options['limit'] : 10;
 20+$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';
 21+
 22+
 23+$dbr = wfGetDB( DB_SLAVE );
 24+$res = $dbr->select(
 25+ array( 'page', 'revision', 'text' ),
 26+ '*',
 27+ array(
 28+ 'page_namespace' => $title->getNamespace(),
 29+ 'page_title' => $title->getDBkey(),
 30+ 'page_id=rev_page',
 31+ 'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
 32+ 'rev_text_id=old_id'
 33+ ), __FILE__, array( 'LIMIT' => $limit )
 34+);
 35+
 36+$blob = new $type;
 37+$hashes = array();
 38+$keys = array();
 39+$uncompressedSize = 0;
 40+$t = -microtime( true );
 41+foreach ( $res as $row ) {
 42+ $revision = new Revision( $row );
 43+ $text = $revision->getText();
 44+ $uncompressedSize += strlen( $text );
 45+ $hashes[$row->rev_id] = md5( $text );
 46+ $keys[$row->rev_id] = $blob->addItem( $text );
 47+}
 48+
 49+$serialized = serialize( $blob );
 50+$t += microtime( true );
 51+
 52+printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n",
 53+ $res->numRows(),
 54+ $uncompressedSize / strlen( $serialized ),
 55+ $wgLang->formatSize( $uncompressedSize ),
 56+ $wgLang->formatSize( strlen( $serialized ) )
 57+);
 58+printf( "Compression time: %5.2f ms\n", $t * 1000 );
 59+
 60+$t = -microtime( true );
 61+$blob = unserialize( $serialized );
 62+foreach ( $keys as $id => $key ) {
 63+ $text = $blob->getItem( $key );
 64+ if ( md5( $text ) != $hashes[$id] ) {
 65+ echo "Content hash mismatch for rev_id $id\n";
 66+ #var_dump( $text );
 67+ }
 68+}
 69+$t += microtime( true );
 70+printf( "Decompression time: %5.2f ms\n", $t * 1000 );
 71+
Property changes on: trunk/phase3/maintenance/storage/testCompression.php
___________________________________________________________________
Added: svn:eol-style
172 + native
Index: trunk/phase3/includes/AutoLoader.php
@@ -34,6 +34,7 @@
3535 'Credits' => 'includes/Credits.php',
3636 'DBABagOStuff' => 'includes/BagOStuff.php',
3737 'DependencyWrapper' => 'includes/CacheDependency.php',
 38+ 'DiffHistoryBlob' => 'includes/HistoryBlob.php',
3839 'DjVuImage' => 'includes/DjVuImage.php',
3940 'DoubleReplacer' => 'includes/StringUtils.php',
4041 'DoubleRedirectJob' => 'includes/DoubleRedirectJob.php',
Index: trunk/phase3/includes/HistoryBlob.php
@@ -1,41 +1,33 @@
22 <?php
33
44 /**
5 - * Pure virtual parent
6 - * @todo document (needs a one-sentence top-level class description, that answers the question: "what is a HistoryBlob?")
 5+ * Base class for general text storage via the "object" flag in old_flags, or
 6+ * two-part external storage URLs. Used to represent efficient concatenated
 7+ * storage, and migration-related pointer objects.
78 */
89 interface HistoryBlob
910 {
1011 /**
11 - * setMeta and getMeta currently aren't used for anything, I just thought
12 - * they might be useful in the future.
13 - * @param $meta String: a single string.
14 - */
15 - public function setMeta( $meta );
16 -
17 - /**
18 - * setMeta and getMeta currently aren't used for anything, I just thought
19 - * they might be useful in the future.
20 - * Gets the meta-value
21 - */
22 - public function getMeta();
23 -
24 - /**
2512 * Adds an item of text, returns a stub object which points to the item.
2613 * You must call setLocation() on the stub object before storing it to the
2714 * database
 15+ * Returns the key for getItem()
2816 */
2917 public function addItem( $text );
3018
3119 /**
32 - * Get item by hash
 20+ * Get item by key, or false if the key is not present
3321 */
34 - public function getItem( $hash );
 22+ public function getItem( $key );
3523
36 - # Set the "default text"
37 - # This concept is an odd property of the current DB schema, whereby each text item has a revision
38 - # associated with it. The default text is the text of the associated revision. There may, however,
39 - # be other revisions in the same object
 24+ /**
 25+ * Set the "default text"
 26+ * This concept is an odd property of the current DB schema, whereby each text item has a revision
 27+ * associated with it. The default text is the text of the associated revision. There may, however,
 28+ * be other revisions in the same object.
 29+ *
 30+ * Default text is not required for two-part external storage URLs.
 31+ */
4032 public function setText( $text );
4133
4234 /**
@@ -45,8 +37,8 @@
4638 }
4739
4840 /**
49 - * The real object
50 - * @todo document (needs one-sentence top-level class description + function descriptions).
 41+ * Concatenated gzip (CGZ) storage
 42+ * Improves compression ratio by concatenating like objects before gzipping
5143 */
5244 class ConcatenatedGzipHistoryBlob implements HistoryBlob
5345 {
@@ -60,34 +52,15 @@
6153 }
6254 }
6355
64 - #
65 - # HistoryBlob implementation:
66 - #
67 -
68 - /** @todo document */
69 - public function setMeta( $metaData ) {
70 - $this->uncompress();
71 - $this->mItems['meta'] = $metaData;
72 - }
73 -
74 - /** @todo document */
75 - public function getMeta() {
76 - $this->uncompress();
77 - return $this->mItems['meta'];
78 - }
79 -
80 - /** @todo document */
8156 public function addItem( $text ) {
8257 $this->uncompress();
8358 $hash = md5( $text );
8459 $this->mItems[$hash] = $text;
8560 $this->mSize += strlen( $text );
8661
87 - $stub = new HistoryBlobStub( $hash );
88 - return $stub;
 62+ return $hash;
8963 }
9064
91 - /** @todo document */
9265 public function getItem( $hash ) {
9366 $this->uncompress();
9467 if ( array_key_exists( $hash, $this->mItems ) ) {
@@ -97,29 +70,28 @@
9871 }
9972 }
10073
101 - /** @todo document */
10274 public function setText( $text ) {
10375 $this->uncompress();
 76+ # addItem() returns the item's hash key itself, not a stub object
 77+ $this->mDefaultHash = $this->addItem( $text );
10678 }
10779
108 - /** @todo document */
10980 public function getText() {
11081 $this->uncompress();
11182 return $this->getItem( $this->mDefaultHash );
11283 }
11384
114 - # HistoryBlob implemented.
115 -
116 -
117 - /** @todo document */
 85+ /**
 86+ * Remove an item
 87+ */
11888 public function removeItem( $hash ) {
11989 $this->mSize -= strlen( $this->mItems[$hash] );
12090 unset( $this->mItems[$hash] );
12191 }
12292
123 - /** @todo document */
 93+ /**
 94+ * Compress the bulk data in the object
 95+ */
12496 public function compress() {
12597 if ( !$this->mCompressed ) {
12698 $this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -127,7 +99,9 @@
128100 }
129101 }
130102
131 - /** @todo document */
 103+ /**
 104+ * Uncompress bulk data
 105+ */
132106 public function uncompress() {
133107 if ( $this->mCompressed ) {
134108 $this->mItems = unserialize( gzinflate( $this->mItems ) );
@@ -136,19 +110,18 @@
137111 }
138112
139113
140 - /** @todo document */
141114 function __sleep() {
142115 $this->compress();
143116 return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
144117 }
145118
146 - /** @todo document */
147119 function __wakeup() {
148120 $this->uncompress();
149121 }
150122
151123 /**
152 - * Determines if this object is happy
 124+ * Helper function for compression jobs
 125+ * Returns true until the object is "full" and ready to be committed
153126 */
154127 public function isHappy( $maxFactor, $factorThreshold ) {
155128 if ( count( $this->mItems ) == 0 ) {
@@ -184,12 +157,15 @@
185158
186159
187160 /**
188 - * @todo document (needs one-sentence top-level class description + some function descriptions).
 161+ * Pointer object for an item within a CGZ blob stored in the text table.
189162 */
190163 class HistoryBlobStub {
191164 var $mOldId, $mHash, $mRef;
192165
193 - /** @todo document */
 166+ /**
 167+ * @param string $hash The content hash of the text
 168+ * @param integer $oldid The old_id for the CGZ object
 169+ */
194170 function HistoryBlobStub( $hash = '', $oldid = 0 ) {
195171 $this->mHash = $hash;
196172 }
@@ -216,7 +192,6 @@
217193 return $this->mRef;
218194 }
219195
220 - /** @todo document */
221196 function getText() {
222197 $fname = 'HistoryBlobStub::getText';
223198 global $wgBlobCache;
@@ -264,7 +239,9 @@
265240 return $obj->getItem( $this->mHash );
266241 }
267242
268 - /** @todo document */
 243+ /**
 244+ * Get the content hash
 245+ */
269246 function getHash() {
270247 return $this->mHash;
271248 }
@@ -282,7 +259,9 @@
283260 class HistoryBlobCurStub {
284261 var $mCurId;
285262
286 - /** @todo document */
 263+ /**
 264+ * @param integer $curid The cur_id pointed to
 265+ */
287266 function HistoryBlobCurStub( $curid = 0 ) {
288267 $this->mCurId = $curid;
289268 }
@@ -295,7 +274,6 @@
296275 $this->mCurId = $id;
297276 }
298277
299 - /** @todo document */
300278 function getText() {
301279 $dbr = wfGetDB( DB_SLAVE );
302280 $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) );
@@ -305,3 +283,123 @@
306284 return $row->cur_text;
307285 }
308286 }
 287+
 288+/**
 289+ * Diff-based history compression
 290+ * Requires xdiff 1.5+ and zlib
 291+ */
 292+class DiffHistoryBlob implements HistoryBlob {
 293+ /** Uncompressed item cache */
 294+ var $mItems = array();
 295+
 296+ /**
 297+ * Array of diffs, where $this->mDiffs[0] is the diff between
 298+ * $this->mItems[0] and $this->mItems[1]
 299+ */
 300+ var $mDiffs = array();
 301+
 302+ /**
 303+ * The key for getText()
 304+ */
 305+ var $mDefaultKey;
 306+
 307+ /**
 308+ * Compressed storage
 309+ */
 310+ var $mCompressed;
 311+
 312+ /**
 313+ * True if the object is locked against further writes
 314+ */
 315+ var $mFrozen = false;
 316+
 317+
 318+ function __construct() {
 319+ if ( !function_exists( 'xdiff_string_bdiff' ) ){
 320+ throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" );
 321+ }
 322+ if ( !function_exists( 'gzdeflate' ) ) {
 323+ throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
 324+ }
 325+ }
 326+
 327+ function addItem( $text ) {
 328+ if ( $this->mFrozen ) {
 329+ throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" );
 330+ }
 331+
 332+ $this->mItems[] = $text;
 333+ $i = count( $this->mItems ) - 1;
 334+ if ( $i > 0 ) {
 335+ # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
 336+ # "String is not zero-terminated"
 337+ wfSuppressWarnings();
 338+ $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . '';
 339+ wfRestoreWarnings();
 340+ }
 341+ return $i;
 342+ }
 343+
 344+ function getItem( $key ) {
 345+ if ( $key > count( $this->mDiffs ) + 1 ) {
 346+ return false;
 347+ }
 348+ $key = intval( $key );
 349+ if ( $key == 0 ) {
 350+ return $this->mItems[0];
 351+ }
 352+
 353+ $last = count( $this->mItems ) - 1;
 354+ for ( $i = $last + 1; $i <= $key; $i++ ) {
 355+ # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
 356+ # "String is not zero-terminated"
 357+ wfSuppressWarnings();
 358+ $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . '';
 359+ wfRestoreWarnings();
 360+ }
 361+ return $this->mItems[$key];
 362+ }
 363+
 364+ function setText( $text ) {
 365+ $this->mDefaultKey = $this->addItem( $text );
 366+ }
 367+
 368+ function getText() {
 369+ return $this->getItem( $this->mDefaultKey );
 370+ }
 371+
 372+ function __sleep() {
 373+ if ( !isset( $this->mItems[0] ) ) {
 374+ // Empty object
 375+ $info = false;
 376+ } else {
 377+ $info = array(
 378+ 'base' => $this->mItems[0],
 379+ 'diffs' => $this->mDiffs
 380+ );
 381+ }
 382+ if ( isset( $this->mDefaultKey ) ) {
 383+ $info['default'] = $this->mDefaultKey;
 384+ }
 385+ $this->mCompressed = gzdeflate( serialize( $info ) );
 386+ return array( 'mCompressed' );
 387+ }
 388+
 389+ function __wakeup() {
 390+ // addItem() doesn't work if mItems is partially filled from mDiffs
 391+ $this->mFrozen = true;
 392+ $info = unserialize( gzinflate( $this->mCompressed ) );
 393+ unset( $this->mCompressed );
 394+
 395+ if ( !$info ) {
 396+ // Empty object
 397+ return;
 398+ }
 399+
 400+ if ( isset( $info['default'] ) ) {
 401+ $this->mDefaultKey = $info['default'];
 402+ }
 403+ $this->mItems[0] = $info['base'];
 404+ $this->mDiffs = $info['diffs'];
 405+ }
 406+}

Follow-up revisions

Revision | Commit summary | Author | Date
r41578 | Backing r41531 out for now ("Concept for diff-based compression using the new... | brion | 00:00, 3 October 2008
r42166 | Revert revert r41578 of r41531 and fix compressOld.php. | tstarling | 09:11, 17 October 2008

Comments

#Comment by Brion VIBBER (talk | contribs)   23:55, 2 October 2008

Interesting... :)

#Comment by Brion VIBBER (talk | contribs)   23:58, 2 October 2008

This changes the return type of ConcatenatedGzipHistoryBlob::addItem() from a stub object to a hash value, which will break its usages in ConcatenatedGzipHistoryBlob::setText() and maintenance/storage/compressOld.inc.

Backing out for now...

#Comment by Brion VIBBER (talk | contribs)   00:00, 3 October 2008

Reverted in r41578

Status & tagging log