Index: trunk/phase3/maintenance/storage/compressOld.inc |
— | — | @@ -4,10 +4,6 @@ |
5 | 5 | * @ingroup Maintenance ExternalStorage |
6 | 6 | */ |
7 | 7 | |
8 | | -/** */ |
9 | | -require_once( 'Revision.php' ); |
10 | | -require_once( 'ExternalStoreDB.php' ); |
11 | | - |
12 | 8 | /** @todo document */ |
13 | 9 | function compressOldPages( $start = 0, $extdb = '' ) { |
14 | 10 | $fname = 'compressOldPages'; |
— | — | @@ -229,7 +225,7 @@ |
230 | 226 | $stub = false; |
231 | 227 | print 'x'; |
232 | 228 | } else { |
233 | | - $stub = $chunk->addItem( $text ); |
| 229 | + $stub = new HistoryBlobStub( $chunk->addItem( $text ) ); |
234 | 230 | $stub->setLocation( $primaryOldid ); |
235 | 231 | $stub->setReferrer( $oldid ); |
236 | 232 | print '.'; |
Index: trunk/phase3/maintenance/storage/testCompression.php |
— | — | @@ -0,0 +1,70 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +$optionsWithArgs = array( 'start', 'limit', 'type' ); |
| 5 | +require( dirname(__FILE__).'/../commandLine.inc' ); |
| 6 | + |
| 7 | +if ( !isset( $args[0] ) ) { |
| 8 | + echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n"; |
| 9 | + exit( 1 ); |
| 10 | +} |
| 11 | + |
| 12 | +$title = Title::newFromText( $args[0] ); |
| 13 | +if ( isset( $options['start'] ) ) { |
| 14 | + $start = wfTimestamp( TS_MW, strtotime( $options['start'] ) ); |
| 15 | + echo "Starting from " . $wgLang->timeanddate( $start ) . "\n"; |
| 16 | +} else { |
| 17 | + $start = '19700101000000'; |
| 18 | +} |
| 19 | +$limit = isset( $options['limit'] ) ? $options['limit'] : 10; |
| 20 | +$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob'; |
| 21 | + |
| 22 | + |
| 23 | +$dbr = wfGetDB( DB_SLAVE ); |
| 24 | +$res = $dbr->select( |
| 25 | + array( 'page', 'revision', 'text' ), |
| 26 | + '*', |
| 27 | + array( |
| 28 | + 'page_namespace' => $title->getNamespace(), |
| 29 | + 'page_title' => $title->getDBkey(), |
| 30 | + 'page_id=rev_page', |
| 31 | + 'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ), |
| 32 | + 'rev_text_id=old_id' |
| 33 | + ), __FILE__, array( 'LIMIT' => $limit ) |
| 34 | +); |
| 35 | + |
| 36 | +$blob = new $type; |
| 37 | +$hashes = array(); |
| 38 | +$keys = array(); |
| 39 | +$uncompressedSize = 0; |
| 40 | +$t = -microtime( true ); |
| 41 | +foreach ( $res as $row ) { |
| 42 | + $revision = new Revision( $row ); |
| 43 | + $text = $revision->getText(); |
| 44 | + $uncompressedSize += strlen( $text ); |
| 45 | + $hashes[$row->rev_id] = md5( $text ); |
| 46 | + $keys[$row->rev_id] = $blob->addItem( $text ); |
| 47 | +} |
| 48 | + |
| 49 | +$serialized = serialize( $blob ); |
| 50 | +$t += microtime( true ); |
| 51 | + |
| 52 | +printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n", |
| 53 | + $res->numRows(), |
| 54 | + $uncompressedSize / strlen( $serialized ), |
| 55 | + $wgLang->formatSize( $uncompressedSize ), |
| 56 | + $wgLang->formatSize( strlen( $serialized ) ) |
| 57 | +); |
| 58 | +printf( "Compression time: %5.2f ms\n", $t * 1000 ); |
| 59 | + |
| 60 | +$t = -microtime( true ); |
| 61 | +$blob = unserialize( $serialized ); |
| 62 | +foreach ( $keys as $id => $key ) { |
| 63 | + $text = $blob->getItem( $key ); |
| 64 | + if ( md5( $text ) != $hashes[$id] ) { |
| 65 | + echo "Content hash mismatch for rev_id $id\n"; |
| 66 | + #var_dump( $text ); |
| 67 | + } |
| 68 | +} |
| 69 | +$t += microtime( true ); |
| 70 | +printf( "Decompression time: %5.2f ms\n", $t * 1000 ); |
| 71 | + |
Property changes on: trunk/phase3/maintenance/storage/testCompression.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 72 | + native |
Index: trunk/phase3/includes/AutoLoader.php |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | 'Credits' => 'includes/Credits.php', |
37 | 37 | 'DBABagOStuff' => 'includes/BagOStuff.php', |
38 | 38 | 'DependencyWrapper' => 'includes/CacheDependency.php', |
| 39 | + 'DiffHistoryBlob' => 'includes/HistoryBlob.php', |
39 | 40 | 'DjVuImage' => 'includes/DjVuImage.php', |
40 | 41 | 'DoubleReplacer' => 'includes/StringUtils.php', |
41 | 42 | 'DoubleRedirectJob' => 'includes/DoubleRedirectJob.php', |
Index: trunk/phase3/includes/HistoryBlob.php |
— | — | @@ -1,41 +1,33 @@ |
2 | 2 | <?php |
3 | 3 | |
4 | 4 | /** |
5 | | - * Pure virtual parent |
6 | | - * @todo document (needs a one-sentence top-level class description, that answers the question: "what is a HistoryBlob?") |
| 5 | + * Base class for general text storage via the "object" flag in old_flags, or |
| 6 | + * two-part external storage URLs. Used to represent efficient concatenated |
| 7 | + * storage, and migration-related pointer objects. |
7 | 8 | */ |
8 | 9 | interface HistoryBlob |
9 | 10 | { |
10 | 11 | /** |
11 | | - * setMeta and getMeta currently aren't used for anything, I just thought |
12 | | - * they might be useful in the future. |
13 | | - * @param $meta String: a single string. |
14 | | - */ |
15 | | - public function setMeta( $meta ); |
16 | | - |
17 | | - /** |
18 | | - * setMeta and getMeta currently aren't used for anything, I just thought |
19 | | - * they might be useful in the future. |
20 | | - * Gets the meta-value |
21 | | - */ |
22 | | - public function getMeta(); |
23 | | - |
24 | | - /** |
25 | 12 | * Adds an item of text, returns a stub object which points to the item. |
26 | 13 | * You must call setLocation() on the stub object before storing it to the |
27 | 14 | * database |
| 15 | + * Returns the key for getItem() |
28 | 16 | */ |
29 | 17 | public function addItem( $text ); |
30 | 18 | |
31 | 19 | /** |
32 | | - * Get item by hash |
| 20 | + * Get item by key, or false if the key is not present |
33 | 21 | */ |
34 | | - public function getItem( $hash ); |
| 22 | + public function getItem( $key ); |
35 | 23 | |
36 | | - # Set the "default text" |
37 | | - # This concept is an odd property of the current DB schema, whereby each text item has a revision |
38 | | - # associated with it. The default text is the text of the associated revision. There may, however, |
39 | | - # be other revisions in the same object |
| 24 | + /** |
| 25 | + * Set the "default text" |
| 26 | + * This concept is an odd property of the current DB schema, whereby each text item has a revision |
| 27 | + * associated with it. The default text is the text of the associated revision. There may, however, |
| 28 | + * be other revisions in the same object. |
| 29 | + * |
| 30 | + * Default text is not required for two-part external storage URLs. |
| 31 | + */ |
40 | 32 | public function setText( $text ); |
41 | 33 | |
42 | 34 | /** |
— | — | @@ -45,8 +37,8 @@ |
46 | 38 | } |
47 | 39 | |
48 | 40 | /** |
49 | | - * The real object |
50 | | - * @todo document (needs one-sentence top-level class description + function descriptions). |
| 41 | + * Concatenated gzip (CGZ) storage |
| 42 | + * Improves compression ratio by concatenating like objects before gzipping |
51 | 43 | */ |
52 | 44 | class ConcatenatedGzipHistoryBlob implements HistoryBlob |
53 | 45 | { |
— | — | @@ -60,34 +52,15 @@ |
61 | 53 | } |
62 | 54 | } |
63 | 55 | |
64 | | - # |
65 | | - # HistoryBlob implementation: |
66 | | - # |
67 | | - |
68 | | - /** @todo document */ |
69 | | - public function setMeta( $metaData ) { |
70 | | - $this->uncompress(); |
71 | | - $this->mItems['meta'] = $metaData; |
72 | | - } |
73 | | - |
74 | | - /** @todo document */ |
75 | | - public function getMeta() { |
76 | | - $this->uncompress(); |
77 | | - return $this->mItems['meta']; |
78 | | - } |
79 | | - |
80 | | - /** @todo document */ |
81 | 56 | public function addItem( $text ) { |
82 | 57 | $this->uncompress(); |
83 | 58 | $hash = md5( $text ); |
84 | 59 | $this->mItems[$hash] = $text; |
85 | 60 | $this->mSize += strlen( $text ); |
86 | 61 | |
87 | | - $stub = new HistoryBlobStub( $hash ); |
88 | | - return $stub; |
| 62 | + return $hash; |
89 | 63 | } |
90 | 64 | |
91 | | - /** @todo document */ |
92 | 65 | public function getItem( $hash ) { |
93 | 66 | $this->uncompress(); |
94 | 67 | if ( array_key_exists( $hash, $this->mItems ) ) { |
— | — | @@ -97,29 +70,27 @@ |
98 | 71 | } |
99 | 72 | } |
100 | 73 | |
101 | | - /** @todo document */ |
102 | 74 | public function setText( $text ) { |
103 | 75 | $this->uncompress(); |
104 | | - $stub = $this->addItem( $text ); |
105 | | - $this->mDefaultHash = $stub->mHash; |
| 76 | + $this->mDefaultHash = $this->addItem( $text ); |
106 | 77 | } |
107 | 78 | |
108 | | - /** @todo document */ |
109 | 79 | public function getText() { |
110 | 80 | $this->uncompress(); |
111 | 81 | return $this->getItem( $this->mDefaultHash ); |
112 | 82 | } |
113 | 83 | |
114 | | - # HistoryBlob implemented. |
115 | | - |
116 | | - |
117 | | - /** @todo document */ |
| 84 | + /** |
| 85 | + * Remove an item |
| 86 | + */ |
118 | 87 | public function removeItem( $hash ) { |
119 | 88 | $this->mSize -= strlen( $this->mItems[$hash] ); |
120 | 89 | unset( $this->mItems[$hash] ); |
121 | 90 | } |
122 | 91 | |
123 | | - /** @todo document */ |
| 92 | + /** |
| 93 | + * Compress the bulk data in the object |
| 94 | + */ |
124 | 95 | public function compress() { |
125 | 96 | if ( !$this->mCompressed ) { |
126 | 97 | $this->mItems = gzdeflate( serialize( $this->mItems ) ); |
— | — | @@ -127,7 +98,9 @@ |
128 | 99 | } |
129 | 100 | } |
130 | 101 | |
131 | | - /** @todo document */ |
| 102 | + /** |
| 103 | + * Uncompress bulk data |
| 104 | + */ |
132 | 105 | public function uncompress() { |
133 | 106 | if ( $this->mCompressed ) { |
134 | 107 | $this->mItems = unserialize( gzinflate( $this->mItems ) ); |
— | — | @@ -136,19 +109,18 @@ |
137 | 110 | } |
138 | 111 | |
139 | 112 | |
140 | | - /** @todo document */ |
141 | 113 | function __sleep() { |
142 | 114 | $this->compress(); |
143 | 115 | return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' ); |
144 | 116 | } |
145 | 117 | |
146 | | - /** @todo document */ |
147 | 118 | function __wakeup() { |
148 | 119 | $this->uncompress(); |
149 | 120 | } |
150 | 121 | |
151 | 122 | /** |
152 | | - * Determines if this object is happy |
| 123 | + * Helper function for compression jobs |
| 124 | + * Returns true until the object is "full" and ready to be committed |
153 | 125 | */ |
154 | 126 | public function isHappy( $maxFactor, $factorThreshold ) { |
155 | 127 | if ( count( $this->mItems ) == 0 ) { |
— | — | @@ -184,12 +156,15 @@ |
185 | 157 | |
186 | 158 | |
187 | 159 | /** |
188 | | - * @todo document (needs one-sentence top-level class description + some function descriptions). |
| 160 | + * Pointer object for an item within a CGZ blob stored in the text table. |
189 | 161 | */ |
190 | 162 | class HistoryBlobStub { |
191 | 163 | var $mOldId, $mHash, $mRef; |
192 | 164 | |
193 | | - /** @todo document */ |
| 165 | + /** |
| 166 | + * @param string $hash The content hash of the text |
| 167 | + * @param integer $oldid The old_id for the CGZ object (accepted but not stored by this constructor) |
| 168 | + */ |
194 | 169 | function HistoryBlobStub( $hash = '', $oldid = 0 ) { |
195 | 170 | $this->mHash = $hash; |
196 | 171 | } |
— | — | @@ -216,7 +191,6 @@ |
217 | 192 | return $this->mRef; |
218 | 193 | } |
219 | 194 | |
220 | | - /** @todo document */ |
221 | 195 | function getText() { |
222 | 196 | $fname = 'HistoryBlobStub::getText'; |
223 | 197 | global $wgBlobCache; |
— | — | @@ -264,7 +238,9 @@ |
265 | 239 | return $obj->getItem( $this->mHash ); |
266 | 240 | } |
267 | 241 | |
268 | | - /** @todo document */ |
| 242 | + /** |
| 243 | + * Get the content hash |
| 244 | + */ |
269 | 245 | function getHash() { |
270 | 246 | return $this->mHash; |
271 | 247 | } |
— | — | @@ -282,7 +258,9 @@ |
283 | 259 | class HistoryBlobCurStub { |
284 | 260 | var $mCurId; |
285 | 261 | |
286 | | - /** @todo document */ |
| 262 | + /** |
| 263 | + * @param integer $curid The cur_id pointed to |
| 264 | + */ |
287 | 265 | function HistoryBlobCurStub( $curid = 0 ) { |
288 | 266 | $this->mCurId = $curid; |
289 | 267 | } |
— | — | @@ -295,7 +273,6 @@ |
296 | 274 | $this->mCurId = $id; |
297 | 275 | } |
298 | 276 | |
299 | | - /** @todo document */ |
300 | 277 | function getText() { |
301 | 278 | $dbr = wfGetDB( DB_SLAVE ); |
302 | 279 | $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) ); |
— | — | @@ -305,3 +282,123 @@ |
306 | 283 | return $row->cur_text; |
307 | 284 | } |
308 | 285 | } |
| 286 | + |
| 287 | +/** |
| 288 | + * Diff-based history compression |
| 289 | + * Requires xdiff 1.5+ and zlib |
| 290 | + */ |
| 291 | +class DiffHistoryBlob implements HistoryBlob { |
| 292 | + /** Uncompressed item cache */ |
| 293 | + var $mItems = array(); |
| 294 | + |
| 295 | + /** |
| 296 | + * Array of diffs, where $this->mDiffs[0] is the diff between |
| 297 | + * $this->mItems[0] and $this->mItems[1] |
| 298 | + */ |
| 299 | + var $mDiffs = array(); |
| 300 | + |
| 301 | + /** |
| 302 | + * The key for getText() |
| 303 | + */ |
| 304 | + var $mDefaultKey; |
| 305 | + |
| 306 | + /** |
| 307 | + * Compressed storage |
| 308 | + */ |
| 309 | + var $mCompressed; |
| 310 | + |
| 311 | + /** |
| 312 | + * True if the object is locked against further writes |
| 313 | + */ |
| 314 | + var $mFrozen = false; |
| 315 | + |
| 316 | + |
| 317 | + function __construct() { |
| 318 | + if ( !function_exists( 'xdiff_string_bdiff' ) ){ |
| 319 | + throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" ); |
| 320 | + } |
| 321 | + if ( !function_exists( 'gzdeflate' ) ) { |
| 322 | + throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" ); |
| 323 | + } |
| 324 | + } |
| 325 | + |
| 326 | + function addItem( $text ) { |
| 327 | + if ( $this->mFrozen ) { |
| 328 | + throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" ); |
| 329 | + } |
| 330 | + |
| 331 | + $this->mItems[] = $text; |
| 332 | + $i = count( $this->mItems ) - 1; |
| 333 | + if ( $i > 0 ) { |
| 334 | + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff |
| 335 | + # "String is not zero-terminated" |
| 336 | + wfSuppressWarnings(); |
| 337 | + $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . ''; |
| 338 | + wfRestoreWarnings(); |
| 339 | + } |
| 340 | + return $i; |
| 341 | + } |
| 342 | + |
| 343 | + function getItem( $key ) { |
| 344 | + # Keys are the integers returned by addItem(): 0 .. count( $this->mDiffs ). |
| 345 | + # Return false for anything else, including any key on an empty object. |
| 346 | + $key = intval( $key ); |
| 347 | + if ( $key < 0 || $key > count( $this->mDiffs ) || !isset( $this->mItems[0] ) ) { |
| 348 | + return false; |
| 349 | + } |
| 350 | + |
| 351 | + # Items not yet in the cache are rebuilt by patching each predecessor in turn |
| 352 | + $last = count( $this->mItems ) - 1; |
| 353 | + for ( $i = $last + 1; $i <= $key; $i++ ) { |
| 354 | + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff |
| 355 | + # "String is not zero-terminated" |
| 356 | + wfSuppressWarnings(); |
| 357 | + $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . ''; |
| 358 | + wfRestoreWarnings(); |
| 359 | + } |
| 360 | + return $this->mItems[$key]; |
| 361 | + } |
| 362 | + |
| 363 | + function setText( $text ) { |
| 364 | + $this->mDefaultKey = $this->addItem( $text ); |
| 365 | + } |
| 366 | + |
| 367 | + function getText() { |
| 368 | + return $this->getItem( $this->mDefaultKey ); |
| 369 | + } |
| 370 | + |
| 371 | + function __sleep() { |
| 372 | + if ( !isset( $this->mItems[0] ) ) { |
| 373 | + // Empty object |
| 374 | + $info = false; |
| 375 | + } else { |
| 376 | + $info = array( |
| 377 | + 'base' => $this->mItems[0], |
| 378 | + 'diffs' => $this->mDiffs |
| 379 | + ); |
| 380 | + } |
| 381 | + if ( isset( $this->mDefaultKey ) ) { |
| 382 | + $info['default'] = $this->mDefaultKey; |
| 383 | + } |
| 384 | + $this->mCompressed = gzdeflate( serialize( $info ) ); |
| 385 | + return array( 'mCompressed' ); |
| 386 | + } |
| 387 | + |
| 388 | + function __wakeup() { |
| 389 | + // addItem() doesn't work if mItems is partially filled from mDiffs |
| 390 | + $this->mFrozen = true; |
| 391 | + $info = unserialize( gzinflate( $this->mCompressed ) ); |
| 392 | + unset( $this->mCompressed ); |
| 393 | + |
| 394 | + if ( !$info ) { |
| 395 | + // Empty object |
| 396 | + return; |
| 397 | + } |
| 398 | + |
| 399 | + if ( isset( $info['default'] ) ) { |
| 400 | + $this->mDefaultKey = $info['default']; |
| 401 | + } |
| 402 | + $this->mItems[0] = $info['base']; |
| 403 | + $this->mDiffs = $info['diffs']; |
| 404 | + } |
| 405 | +} |