r41578 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r41577 | r41578 | r41579 >
Date:00:00, 3 October 2008
Author:brion
Status:old
Tags:
Comment:
Backing r41531 out for now ("Concept for diff-based compression using the new xdiff beta")
Looks cool, but this changes the return type of ConcatenatedGzipHistoryBlob::addItem() from a stub object to a hash value, which will break its usages in ConcatenatedGzipHistoryBlob::setText() and maintenance/storage/compressOld.inc.
Modified paths:
  • /trunk/phase3/includes/AutoLoader.php (modified) (history)
  • /trunk/phase3/includes/HistoryBlob.php (modified) (history)
  • /trunk/phase3/maintenance/storage/testCompression.php (deleted) (history)

Diff [purge]

Index: trunk/phase3/maintenance/storage/testCompression.php
@@ -1,70 +0,0 @@
2 -<?php
3 -
4 -$optionsWithArgs = array( 'start', 'limit', 'type' );
5 -require( dirname(__FILE__).'/../commandLine.inc' );
6 -
7 -if ( !isset( $args[0] ) ) {
8 - echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
9 - exit( 1 );
10 -}
11 -
12 -$title = Title::newFromText( $args[0] );
13 -if ( isset( $options['start'] ) ) {
14 - $start = wfTimestamp( TS_MW, strtotime( $options['start'] ) );
15 - echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
16 -} else {
17 - $start = '19700101000000';
18 -}
19 -$limit = isset( $options['limit'] ) ? $options['limit'] : 10;
20 -$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';
21 -
22 -
23 -$dbr = wfGetDB( DB_SLAVE );
24 -$res = $dbr->select(
25 - array( 'page', 'revision', 'text' ),
26 - '*',
27 - array(
28 - 'page_namespace' => $title->getNamespace(),
29 - 'page_title' => $title->getDBkey(),
30 - 'page_id=rev_page',
31 - 'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
32 - 'rev_text_id=old_id'
33 - ), __FILE__, array( 'LIMIT' => $limit )
34 -);
35 -
36 -$blob = new $type;
37 -$hashes = array();
38 -$keys = array();
39 -$uncompressedSize = 0;
40 -$t = -microtime( true );
41 -foreach ( $res as $row ) {
42 - $revision = new Revision( $row );
43 - $text = $revision->getText();
44 - $uncompressedSize += strlen( $text );
45 - $hashes[$row->rev_id] = md5( $text );
46 - $keys[$row->rev_id] = $blob->addItem( $text );
47 -}
48 -
49 -$serialized = serialize( $blob );
50 -$t += microtime( true );
51 -
52 -printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n",
53 - $res->numRows(),
54 - $uncompressedSize / strlen( $serialized ),
55 - $wgLang->formatSize( $uncompressedSize ),
56 - $wgLang->formatSize( strlen( $serialized ) )
57 -);
58 -printf( "Compression time: %5.2f ms\n", $t * 1000 );
59 -
60 -$t = -microtime( true );
61 -$blob = unserialize( $serialized );
62 -foreach ( $keys as $id => $key ) {
63 - $text = $blob->getItem( $key );
64 - if ( md5( $text ) != $hashes[$id] ) {
65 - echo "Content hash mismatch for rev_id $id\n";
66 - #var_dump( $text );
67 - }
68 -}
69 -$t += microtime( true );
70 -printf( "Decompression time: %5.2f ms\n", $t * 1000 );
71 -
Index: trunk/phase3/includes/AutoLoader.php
@@ -34,7 +34,6 @@
3535 'Credits' => 'includes/Credits.php',
3636 'DBABagOStuff' => 'includes/BagOStuff.php',
3737 'DependencyWrapper' => 'includes/CacheDependency.php',
38 - 'DiffHistoryBlob' => 'includes/HistoryBlob.php',
3938 'DjVuImage' => 'includes/DjVuImage.php',
4039 'DoubleReplacer' => 'includes/StringUtils.php',
4140 'DoubleRedirectJob' => 'includes/DoubleRedirectJob.php',
Index: trunk/phase3/includes/HistoryBlob.php
@@ -1,33 +1,41 @@
22 <?php
33
44 /**
5 - * Base class for general text storage via the "object" flag in old_flags, or
6 - * two-part external storage URLs. Used for represent efficient concatenated
7 - * storage, and migration-related pointer objects.
 5+ * Pure virtual parent
 6+ * @todo document (needs a one-sentence top-level class description, that answers the question: "what is a HistoryBlob?")
87 */
98 interface HistoryBlob
109 {
1110 /**
 11+ * setMeta and getMeta currently aren't used for anything, I just thought
 12+ * they might be useful in the future.
 13+ * @param $meta String: a single string.
 14+ */
 15+ public function setMeta( $meta );
 16+
 17+ /**
 18+ * setMeta and getMeta currently aren't used for anything, I just thought
 19+ * they might be useful in the future.
 20+ * Gets the meta-value
 21+ */
 22+ public function getMeta();
 23+
 24+ /**
1225 * Adds an item of text, returns a stub object which points to the item.
1326 * You must call setLocation() on the stub object before storing it to the
1427 * database
15 - * Returns the key for getItem()
1628 */
1729 public function addItem( $text );
1830
1931 /**
20 - * Get item by key, or false if the key is not present
 32+ * Get item by hash
2133 */
22 - public function getItem( $key );
 34+ public function getItem( $hash );
2335
24 - /**
25 - * Set the "default text"
26 - * This concept is an odd property of the current DB schema, whereby each text item has a revision
27 - * associated with it. The default text is the text of the associated revision. There may, however,
28 - * be other revisions in the same object.
29 - *
30 - * Default text is not required for two-part external storage URLs.
31 - */
 36+ # Set the "default text"
 37+ # This concept is an odd property of the current DB schema, whereby each text item has a revision
 38+ # associated with it. The default text is the text of the associated revision. There may, however,
 39+ # be other revisions in the same object
3240 public function setText( $text );
3341
3442 /**
@@ -37,8 +45,8 @@
3846 }
3947
4048 /**
41 - * Concatenated gzip (CGZ) storage
42 - * Improves compression ratio by concatenating like objects before gzipping
 49+ * The real object
 50+ * @todo document (needs one-sentence top-level class description + function descriptions).
4351 */
4452 class ConcatenatedGzipHistoryBlob implements HistoryBlob
4553 {
@@ -52,15 +60,34 @@
5361 }
5462 }
5563
 64+ #
 65+ # HistoryBlob implementation:
 66+ #
 67+
 68+ /** @todo document */
 69+ public function setMeta( $metaData ) {
 70+ $this->uncompress();
 71+ $this->mItems['meta'] = $metaData;
 72+ }
 73+
 74+ /** @todo document */
 75+ public function getMeta() {
 76+ $this->uncompress();
 77+ return $this->mItems['meta'];
 78+ }
 79+
 80+ /** @todo document */
5681 public function addItem( $text ) {
5782 $this->uncompress();
5883 $hash = md5( $text );
5984 $this->mItems[$hash] = $text;
6085 $this->mSize += strlen( $text );
6186
62 - return $hash;
 87+ $stub = new HistoryBlobStub( $hash );
 88+ return $stub;
6389 }
6490
 91+ /** @todo document */
6592 public function getItem( $hash ) {
6693 $this->uncompress();
6794 if ( array_key_exists( $hash, $this->mItems ) ) {
@@ -70,28 +97,29 @@
7198 }
7299 }
73100
 101+ /** @todo document */
74102 public function setText( $text ) {
75103 $this->uncompress();
76104 $stub = $this->addItem( $text );
77105 $this->mDefaultHash = $stub->mHash;
78106 }
79107
 108+ /** @todo document */
80109 public function getText() {
81110 $this->uncompress();
82111 return $this->getItem( $this->mDefaultHash );
83112 }
84113
85 - /**
86 - * Remove an item
87 - */
 114+ # HistoryBlob implemented.
 115+
 116+
 117+ /** @todo document */
88118 public function removeItem( $hash ) {
89119 $this->mSize -= strlen( $this->mItems[$hash] );
90120 unset( $this->mItems[$hash] );
91121 }
92122
93 - /**
94 - * Compress the bulk data in the object
95 - */
 123+ /** @todo document */
96124 public function compress() {
97125 if ( !$this->mCompressed ) {
98126 $this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -99,9 +127,7 @@
100128 }
101129 }
102130
103 - /**
104 - * Uncompress bulk data
105 - */
 131+ /** @todo document */
106132 public function uncompress() {
107133 if ( $this->mCompressed ) {
108134 $this->mItems = unserialize( gzinflate( $this->mItems ) );
@@ -110,18 +136,19 @@
111137 }
112138
113139
 140+ /** @todo document */
114141 function __sleep() {
115142 $this->compress();
116143 return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
117144 }
118145
 146+ /** @todo document */
119147 function __wakeup() {
120148 $this->uncompress();
121149 }
122150
123151 /**
124 - * Helper function for compression jobs
125 - * Returns true until the object is "full" and ready to be committed
 152+ * Determines if this object is happy
126153 */
127154 public function isHappy( $maxFactor, $factorThreshold ) {
128155 if ( count( $this->mItems ) == 0 ) {
@@ -157,15 +184,12 @@
158185
159186
160187 /**
161 - * Pointer object for an item within a CGZ blob stored in the text table.
 188+ * @todo document (needs one-sentence top-level class description + some function descriptions).
162189 */
163190 class HistoryBlobStub {
164191 var $mOldId, $mHash, $mRef;
165192
166 - /**
167 - * @param string $hash The content hash of the text
168 - * @param integer $oldid The old_id for the CGZ object
169 - */
 193+ /** @todo document */
170194 function HistoryBlobStub( $hash = '', $oldid = 0 ) {
171195 $this->mHash = $hash;
172196 }
@@ -192,6 +216,7 @@
193217 return $this->mRef;
194218 }
195219
 220+ /** @todo document */
196221 function getText() {
197222 $fname = 'HistoryBlobStub::getText';
198223 global $wgBlobCache;
@@ -239,9 +264,7 @@
240265 return $obj->getItem( $this->mHash );
241266 }
242267
243 - /**
244 - * Get the content hash
245 - */
 268+ /** @todo document */
246269 function getHash() {
247270 return $this->mHash;
248271 }
@@ -259,9 +282,7 @@
260283 class HistoryBlobCurStub {
261284 var $mCurId;
262285
263 - /**
264 - * @param integer $curid The cur_id pointed to
265 - */
 286+ /** @todo document */
266287 function HistoryBlobCurStub( $curid = 0 ) {
267288 $this->mCurId = $curid;
268289 }
@@ -274,6 +295,7 @@
275296 $this->mCurId = $id;
276297 }
277298
 299+ /** @todo document */
278300 function getText() {
279301 $dbr = wfGetDB( DB_SLAVE );
280302 $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) );
@@ -283,123 +305,3 @@
284306 return $row->cur_text;
285307 }
286308 }
287 -
288 -/**
289 - * Diff-based history compression
290 - * Requires xdiff 1.5+ and zlib
291 - */
292 -class DiffHistoryBlob implements HistoryBlob {
293 - /** Uncompressed item cache */
294 - var $mItems = array();
295 -
296 - /**
297 - * Array of diffs, where $this->mDiffs[0] is the diff between
298 - * $this->mDiffs[0] and $this->mDiffs[1]
299 - */
300 - var $mDiffs = array();
301 -
302 - /**
303 - * The key for getText()
304 - */
305 - var $mDefaultKey;
306 -
307 - /**
308 - * Compressed storage
309 - */
310 - var $mCompressed;
311 -
312 - /**
313 - * True if the object is locked against further writes
314 - */
315 - var $mFrozen = false;
316 -
317 -
318 - function __construct() {
319 - if ( !function_exists( 'xdiff_string_bdiff' ) ){
320 - throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" );
321 - }
322 - if ( !function_exists( 'gzdeflate' ) ) {
323 - throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
324 - }
325 - }
326 -
327 - function addItem( $text ) {
328 - if ( $this->mFrozen ) {
329 - throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" );
330 - }
331 -
332 - $this->mItems[] = $text;
333 - $i = count( $this->mItems ) - 1;
334 - if ( $i > 0 ) {
335 - # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
336 - # "String is not zero-terminated"
337 - wfSuppressWarnings();
338 - $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . '';
339 - wfRestoreWarnings();
340 - }
341 - return $i;
342 - }
343 -
344 - function getItem( $key ) {
345 - if ( $key > count( $this->mDiffs ) + 1 ) {
346 - return false;
347 - }
348 - $key = intval( $key );
349 - if ( $key == 0 ) {
350 - return $this->mItems[0];
351 - }
352 -
353 - $last = count( $this->mItems ) - 1;
354 - for ( $i = $last + 1; $i <= $key; $i++ ) {
355 - # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
356 - # "String is not zero-terminated"
357 - wfSuppressWarnings();
358 - $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . '';
359 - wfRestoreWarnings();
360 - }
361 - return $this->mItems[$key];
362 - }
363 -
364 - function setText( $text ) {
365 - $this->mDefaultKey = $this->addItem( $text );
366 - }
367 -
368 - function getText() {
369 - return $this->getItem( $this->mDefaultKey );
370 - }
371 -
372 - function __sleep() {
373 - if ( !isset( $this->mItems[0] ) ) {
374 - // Empty object
375 - $info = false;
376 - } else {
377 - $info = array(
378 - 'base' => $this->mItems[0],
379 - 'diffs' => $this->mDiffs
380 - );
381 - }
382 - if ( isset( $this->mDefaultKey ) ) {
383 - $info['default'] = $this->mDefaultKey;
384 - }
385 - $this->mCompressed = gzdeflate( serialize( $info ) );
386 - return array( 'mCompressed' );
387 - }
388 -
389 - function __wakeup() {
390 - // addItem() doesn't work if mItems is partially filled from mDiffs
391 - $this->mFrozen = true;
392 - $info = unserialize( gzinflate( $this->mCompressed ) );
393 - unset( $this->mCompressed );
394 -
395 - if ( !$info ) {
396 - // Empty object
397 - return;
398 - }
399 -
400 - if ( isset( $info['default'] ) ) {
401 - $this->mDefaultKey = $info['default'];
402 - }
403 - $this->mItems[0] = $info['base'];
404 - $this->mDiffs = $info['diffs'];
405 - }
406 -}

Follow-up revisions

Revision | Commit summary | Author | Date
r42166 | Revert revert r41578 of r41531 and fix compressOld.php. | tstarling | 09:11, 17 October 2008

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r41531 | * Concept for diff-based compression using the new xdiff beta. Achieves massi... | tstarling | 06:32, 2 October 2008

Status & tagging log