r91637 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r91636‎ | r91637 | r91638 >
Date:12:07, 7 July 2011
Author:ariel
Status:deferred
Tags:
Comment:
move bz2 related utils into subdirectory
Modified paths:
  • /branches/ariel/xmldumps-backup/checkforbz2footer.c (deleted) (history)
  • /branches/ariel/xmldumps-backup/dumpbz2filefromoffset.c (deleted) (history)
  • /branches/ariel/xmldumps-backup/dumplastbz2block.c (deleted) (history)
  • /branches/ariel/xmldumps-backup/findpageidinbz2xml.c (deleted) (history)
  • /branches/ariel/xmldumps-backup/findpageidinbz2xml.h (deleted) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c (added) (history)
  • /branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/dumpbz2filefromoffset.c
@@ -1,766 +0,0 @@
2 -#include <unistd.h>
3 -#include <stdio.h>
4 -#include <string.h>
5 -#include <sys/types.h>
6 -#include <sys/stat.h>
7 -#include <fcntl.h>
8 -#include <stdlib.h>
9 -#include <errno.h>
10 -#include <sys/types.h>
11 -#include <regex.h>
12 -#include "bzlib.h"
13 -#include "findpageidinbz2xml.h"
14 -
15 -
16 -/* return n ones either at left or right end */
17 -int bit_mask(int numbits, int end) {
18 - if (end == MASKRIGHT) {
19 - return((1<<numbits)-1);
20 - }
21 - else {
22 - return(((1<<numbits)-1) << (8-numbits));
23 - }
24 -}
25 -
26 -void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
27 - int i;
28 -
29 - if (numbits == 0) {
30 - return;
31 - }
32 -
33 - for (i=0; i<buflen; i++) {
34 - /* left 1 */
35 - buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
36 -
37 - /* grab leftmost from next byte */
38 - if (i < buflen-1) {
39 - buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );
40 - }
41 - }
42 -}
43 -
44 -
45 -void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
46 - int i;
47 -
48 - for (i=buflen-1; i>=0; i--) {
49 - /* right 1 */
50 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
51 -
52 - /* grab rightmost from prev byte */
53 - if (i > 0) {
54 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));
55 - }
56 - }
57 -}
58 -
59 -unsigned char ** init_marker() {
60 - unsigned char **marker = malloc(8*sizeof(unsigned char *));
61 - int i;
62 -
63 - /* set up block marker plus its various right-shifted incarnations */
64 - for (i = 0; i< 8; i++) {
65 - marker[i] = malloc(sizeof(unsigned char)*7);
66 - }
67 - marker[0][0]= (unsigned char) 0x31;
68 - marker[0][1]= (unsigned char) 0x41;
69 - marker[0][2]= (unsigned char) 0x59;
70 - marker[0][3]= (unsigned char) 0x26;
71 - marker[0][4]= (unsigned char) 0x53;
72 - marker[0][5]= (unsigned char) 0x59;
73 - marker[0][6]= (unsigned char) 0x00;
74 - for (i = 1; i< 8; i++) {
75 - memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
76 - shift_bytes_right(marker[i],7,1);
77 - }
78 - return(marker);
79 -}
80 -
81 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
82 - both buffers are bit-shifted to the right by "bitsrightshifted" bits. this function compares the two and returns 0 if buff2
83 - matches and 1 otherwise. */
84 -int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
85 - int i;
86 -
87 - if (bitsrightshifted == 0) {
88 - for (i = 0; i< numbytes; i++) {
89 - if (buff1[i] != buff2[i]) {
90 - return(1);
91 - }
92 - }
93 - return(0);
94 - }
95 - else {
96 - for (i = 1; i< numbytes-2; i++) {
97 - if (buff1[i] != buff2[i]) {
98 - return(1);
99 - }
100 - }
101 - /* do leftmost byte */
102 - if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {
103 - return(1);
104 - }
105 - /* do rightmost byte */
106 - if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {
107 - return(1);
108 - }
109 - return(0);
110 - }
111 -}
112 -
113 -/* return -1 if no match
114 - return number of bits rightshifted otherwise */
115 -int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
116 - int result, i;
117 -
118 - result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
119 - if (!result) {
120 - return(0);
121 - }
122 - for (i=1; i<8; i++) {
123 - result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
124 - if (!result) {
125 - return(i);
126 - }
127 - }
128 - return(-1);
129 -}
130 -
131 -/* return: 1 if found, 0 if not; exits the program on a read or seek error */
132 -int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
133 - int result;
134 -
135 - bfile->bits_shifted = -1;
136 - result = read(fin, bfile->marker_buffer, 7);
137 - if (result == -1) {
138 - fprintf(stderr,"read of file failed\n");
139 - exit(-1);
140 - }
141 - /* must be after 4 byte file header, and we add a leftmost byte to the buffer
142 - of data read in case some bits have been shifted into it */
143 - while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
144 - bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
145 - if (bfile->bits_shifted < 0) {
146 - bfile->position++;
147 - result = lseek(fin, (bfile->position), SEEK_SET);
148 - if (result == -1) {
149 - fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
150 - exit(-1);
151 - }
152 - result = read(fin, bfile->marker_buffer, 7);
153 - if (result < 7) {
154 - /* fprintf(stderr,"read of file failed\n"); */
155 - exit(-1);
156 - }
157 - }
158 - else {
159 - bfile->block_start = bfile->position;
160 - return(1);
161 - }
162 - }
163 - return(0);
164 -}
165 -
166 -/*
167 - initializes the bz2 strm structure,
168 - calls the BZ2 decompression library initializer
169 -
170 - returns:
171 - BZ_OK on success
172 - does not return on failure: prints the BZ_ error code (see bzlib.h) and exits
173 -*/
174 -int init_decompress(bz_info_t *bfile) {
175 - int bz_verbosity = 0;
176 - int bz_small = 0;
177 - int ret;
178 -
179 - bfile->strm.bzalloc = NULL;
180 - bfile->strm.bzfree = NULL;
181 - bfile->strm.opaque = NULL;
182 -
183 - ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
184 - if (ret != BZ_OK) {
185 - fprintf(stderr,"uncompress failed, err %d\n", ret);
186 - exit(-1);
187 - }
188 - return(ret);
189 -}
190 -
191 -/*
192 - reads the first 4 bytes from a bz2 file (should be
193 - "BZh" followed by the block size indicator, typically "9")
194 - and passes them into the BZ2 decompression library.
195 - This must be done before decompression of any block of the
196 - file is attempted.
197 -
198 - returns:
199 - BZ_OK if successful,
200 - exits the program if the header cannot be read or is corrupt
201 -*/
202 -int decompress_header(int fin, bz_info_t *bfile) {
203 - int ret, res;
204 -
205 - res = lseek(fin,0,SEEK_SET);
206 - if (res == -1) {
207 - fprintf(stderr,"lseek of file to 0 failed (3)\n");
208 - }
209 - bfile->bytes_read = read(fin, bfile->header_buffer, 4);
210 - if (bfile->bytes_read < 4) {
211 - fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
212 - exit(-1);
213 - }
214 - bfile->strm.next_in = (char *)bfile->header_buffer;
215 - bfile->strm.avail_in = 4;
216 -
217 - ret = BZ2_bzDecompress ( &(bfile->strm) );
218 - if (BZ_OK != ret && BZ_STREAM_END != ret) {
219 - fprintf(stderr,"Corrupt bzip2 header, exiting\n");
220 - exit(-1);
221 - }
222 - return(ret);
223 -}
224 -
225 -/*
226 - seek to appropriate offset as specified in bfile,
227 - read compressed data into buffer indicated by bfile,
228 - update the bfile structure accordingly,
229 - save the overflow byte (bit-shifted data = suck)
230 - this is for the *first* buffer of data in a stream,
231 - for subsequent buffers use fill_buffer_to_decompress()
232 -
233 - this will set bfile->eof on eof. no other indicator
234 - will be provided.
235 -
236 - returns:
237 - 0 on success
238 - -1 on error
239 -*/
240 -int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
241 - int res;
242 -
243 - if (bfile->bits_shifted == 0) {
244 - res = lseek(fin,bfile->position+1,SEEK_SET);
245 - if (res == -1) {
246 - fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
247 - return(-1);
248 - }
249 - }
250 - else {
251 - res = lseek(fin,bfile->position,SEEK_SET);
252 - if (res == -1) {
253 - fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
254 - return(-1);
255 - }
256 - }
257 - bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
258 - if (bfile->bytes_read > 0) {
259 - bfile->overflow = bfile->bufin[bfile->bytes_read-1];
260 - shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
261 -
262 - bfile->strm.next_in = (char *)(bfile->bufin);
263 - bfile->strm.avail_in = bfile->bytes_read-1;
264 - }
265 - if (bfile->bytes_read <=0) {
266 - bfile->eof++;
267 - }
268 - return(0);
269 -}
270 -
271 -/*
272 - read compressed data into buffer indicated by bfile,
273 - from current position of file,
274 - stuffing the overflow byte in first.
275 - update the bfile structure accordingly
276 - save the new overflow byte (bit-shifted data = suck)
277 - this function is for decompression of buffers *after
278 - the first one*. for the first one use
279 - setup_first_buffer_to_decompress()
280 -
281 - this will set bfile->eof on eof. no other indicator
282 - will be provided.
283 -
284 - returns:
285 - 0 on success
286 - hmm, it really does not do anything about errors :-D
287 -*/
288 -int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
289 - if (bfile->strm.avail_in == 0) {
290 - bfile->strm.next_in = (char *)(bfile->bufin);
291 - bfile->bufin[0] = bfile->overflow;
292 - bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
293 - if (bfile->bytes_read > 0) {
294 - bfile->position+=bfile->bytes_read;
295 - bfile->overflow = bfile->bufin[bfile->bytes_read];
296 - shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
297 - bfile->strm.avail_in = bfile->bytes_read;
298 - }
299 - else {
300 - bfile->strm.avail_in = 1; /* the overflow byte */
301 - bfile->eof++;
302 - }
303 - }
304 - return(0);
305 -}
306 -
307 -/* size of buffer is bytes usable. there will be a null byte at the end
308 -
309 - what we do with the buffer:
310 - - read from front of buffer to end,
311 - - fill from point where prev read did not fill buffer, or from where
312 - move of data at end of buffer to beginning left room,
313 - - mark a string of bytes (starting from what's available to read) as "read"
314 -
315 -*/
316 -buf_info_t *init_buffer(int size) {
317 - buf_info_t *b;
318 -
319 - b = (buf_info_t *)malloc(sizeof(buf_info_t));
320 - b->buffer = malloc(sizeof(unsigned char)*(size+1));
321 - b->buffer[size]='\0';
322 - b->end = b->buffer + size;
323 - b->next_to_read = b->end; /* nothing available */
324 - b->bytes_avail = 0; /* bytes to read, nothing available */
325 - b->next_to_fill = b->buffer; /* empty */
326 - b->next_to_fill[0] = '\0';
327 - return(b);
328 -}
329 -
330 -/* check if buffer (used for decompressed data output) is empty,
331 - returns 1 if so and 0 if not */
332 -int buffer_is_empty(buf_info_t *b) {
333 - if (b->bytes_avail == 0) {
334 - return(1);
335 - }
336 - else {
337 - return(0);
338 - }
339 -}
340 -
341 -/* check if buffer (used for decompressed data output) is full,
342 -
343 - returns 1 if so and 0 if not
344 - I'm not liking this function so well, fixme */
345 -int buffer_is_full(buf_info_t *b) {
346 - if (b->next_to_fill == b->end) {
347 - return(1);
348 - }
349 - else {
350 - return(0);
351 - }
352 -}
353 -
354 -/* FIXME do this right. whatever. */
355 -int get_file_size(int fin) {
356 - int res;
357 -
358 - res = lseek(fin, 0, SEEK_END);
359 - if (res == -1) {
360 - fprintf(stderr,"lseek of file to 0 failed (6)\n");
361 - exit(-1);
362 - }
363 - return(res);
364 -}
365 -
366 -
367 -/*
368 - set up the marker, seek to right place, get first
369 - buffer of compressed data for processing
370 - bfile->position must be set to desired offset first by caller.
371 - returns:
372 - -1 if no block marker was found, 0 otherwise
373 -*/
374 -int init_bz2_file(bz_info_t *bfile, int fin) {
375 - int res;
376 -
377 - bfile->bufin_size = BUFINSIZE;
378 - bfile->marker = init_marker();
379 - bfile->bytes_read = 0;
380 - bfile->bytes_written = 0;
381 - bfile->eof = 0;
382 -
383 - bfile->initialized++;
384 -
385 - bfile->file_size = get_file_size(fin);
386 - if (bfile->position > bfile->file_size) {
387 - fprintf(stderr,"asked for position past end of file\n");
388 - exit(-1);
389 - }
390 - res = lseek(fin, bfile->position, SEEK_SET);
391 - if (res == -1) {
392 - fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
393 - exit(-1);
394 - }
395 -
396 - find_next_bz2_block_marker(fin, bfile);
397 - if (bfile->bits_shifted >= 0) {
398 - /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */
399 - init_decompress(bfile);
400 - decompress_header(fin, bfile);
401 - setup_first_buffer_to_decompress(fin, bfile);
402 - return(0);
403 - }
404 - return(-1);
405 -}
406 -
407 -/* get the next buffer of uncompressed stuff */
408 -int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
409 - int ret;
410 -
411 - bfile->bufout = bufferout;
412 - bfile->bufout_size = bufout_size;
413 - bfile->bytes_written = 0;
414 -
415 - if (! bfile->initialized) {
416 - if (init_bz2_file(bfile, fin) == -1) {
417 - fprintf(stderr,"failed to initialize bz2file\n");
418 - return(-1);
419 - };
420 - bfile->strm.next_out = (char *)bfile->bufout;
421 - bfile->strm.avail_out = bfile->bufout_size;
422 - }
423 -
424 - ret = BZ_OK;
425 - while (BZ_OK == ret && bfile->bytes_written == 0) {
426 - ret = BZ2_bzDecompress ( &(bfile->strm) );
427 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
428 - bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
429 - }
430 - else {
431 - fprintf(stderr,"error from BZ decompress %d\n",ret);
432 - return(-1);
433 - }
434 - fill_buffer_to_decompress(fin, bfile, ret);
435 - /*
436 - if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
437 - fprintf(stderr,"eof reached\n");
438 - }
439 - */
440 - }
441 - return(0);
442 -}
443 -
444 -/*
445 - fill output buffer in b with uncompressed data from bfile
446 - if this is the first call to the function for this file,
447 - the file header will be read, and the first buffer of
448 - uncompressed data will be prepared. bfile->position
449 - should be set to the offset (from the beginning of file) from
450 - which to find the first bz2 block.
451 -
452 - returns:
453 - 0 on success (the number of bytes decompressed, possibly 0, is left in bfile->bytes_written)
454 - -1 on error
455 -*/
456 -int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
457 - int res;
458 -
459 - if (buffer_is_full(b)) {
460 - fprintf(stdout,"DEBUG buffer full\n");
461 - return(0);
462 - }
463 -
464 - if (buffer_is_empty(b)) {
465 - b->next_to_fill = b->buffer;
466 - }
467 -
468 - res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
469 - if (res <0 ) {
470 - return(res);
471 - }
472 - if (bfile->bytes_written < 0) {
473 - fprintf(stderr,"read of file failed\n");
474 - return(-1);
475 - }
476 - else {
477 - /* really?? FIXME check this */
478 - if (buffer_is_empty(b)) {
479 - b->next_to_read = b->next_to_fill; /* where we just read */
480 - }
481 - b->bytes_avail += bfile->bytes_written;
482 - b->next_to_fill += bfile->bytes_written;
483 - b->next_to_fill[0] = '\0';
484 - return(0);
485 - }
486 -}
487 -
488 -void dumpbuf_info_t(buf_info_t *b) {
489 - fprintf(stdout, "\n");
490 - fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
491 - fprintf(stdout, "b->end: %ld\n", (long int) b->end);
492 - fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
493 - fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
494 - fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
495 -}
496 -
497 -/*
498 - copy text from end of buffer to the beginning, that we want to keep
499 - around for further processing (i.e. further regex matches)
500 - returns number of bytes copied
501 -*/
502 -int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) {
503 - int i, tocopy;
504 -
505 - if (fromwhere >= b->end) {
506 - return(0);
507 - }
508 - else {
509 - tocopy = b->end - fromwhere;
510 - if (maxbytes && (tocopy > maxbytes)) {
511 - tocopy = maxbytes;
512 - }
513 - for (i = 0; i < tocopy; i++) {
514 - b->buffer[i] = fromwhere[i];
515 - }
516 - b->next_to_fill = b->buffer + tocopy;
517 - b->next_to_fill[0] = '\0';
518 - b->next_to_read = b->buffer;
519 - b->bytes_avail = tocopy;
520 - return(tocopy);
521 - }
522 -}
523 -
524 -/*
525 - dump the <mediawiki> header (up through
526 - </siteinfo> close tag) found at the
527 - beginning of xml dump files.
528 - returns:
529 - 0 on success,
530 - -1 on error
531 -*/
532 -int dump_mw_header(int fin) {
533 - int res;
534 - regmatch_t *match_siteinfo;
535 - regex_t compiled_siteinfo;
536 - int length=5000; /* output buffer size */
537 - char *siteinfo = " </siteinfo>\n";
538 -
539 - buf_info_t *b;
540 - bz_info_t bfile;
541 -
542 - int firstpage = 1;
543 - int done = 0;
544 - bfile.initialized = 0;
545 -
546 - res = regcomp(&compiled_siteinfo, siteinfo, REG_EXTENDED);
547 -
548 - match_siteinfo = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
549 -
550 - b = init_buffer(length);
551 - bfile.bytes_read = 0;
552 - bfile.position = 0;
553 -
554 - while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof) && (!done)) {
555 - /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
556 - if (bfile.bytes_read) {
557 - if (firstpage) {
558 - if (bfile.bytes_read >= 11 && !memcmp((char *)b->next_to_read,"<mediawiki ",11)) {
559 - /* good, write it and loop and not firstpage any more */
560 - if (b->bytes_avail) {
561 - if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {
562 - fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);
563 - b->next_to_read = b->end;
564 - b->bytes_avail = 0;
565 - b->next_to_fill = b->buffer; /* empty */
566 - bfile.strm.next_out = (char *)b->next_to_fill;
567 - bfile.strm.avail_out = b->end - b->next_to_fill;
568 - done++;
569 - }
570 - else {
571 - fwrite(b->next_to_read,b->bytes_avail,1,stdout);
572 - b->next_to_read = b->end;
573 - b->bytes_avail = 0;
574 - b->next_to_fill = b->buffer; /* empty */
575 - bfile.strm.next_out = (char *)b->next_to_fill;
576 - bfile.strm.avail_out = b->end - b->next_to_fill;
577 - }
578 - }
579 - }
580 - else {
581 - fprintf(stderr,"missing mediawiki header from bz2 xml file\n");
582 - return(-1);
583 - }
584 - firstpage = 0;
585 - }
586 - else { /* not firstpage */
587 - if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {
588 - fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);
589 - b->next_to_read = b->end;
590 - b->bytes_avail = 0;
591 - b->next_to_fill = b->buffer; /* empty */
592 - bfile.strm.next_out = (char *)b->next_to_fill;
593 - bfile.strm.avail_out = b->end - b->next_to_fill;
594 - done++;
595 - }
596 - else {
597 - /* could have the first part of the siteinfo tag... so copy up enough bytes to cover that case */
598 - if (b->bytes_avail> 12) {
599 - /* write everything that didn't match, but leave 12 bytes, to stdout */
600 - fwrite(b->next_to_read,b->bytes_avail - 12,1,stdout);
601 - move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 12, 12);
602 - bfile.strm.next_out = (char *)b->next_to_fill;
603 - bfile.strm.avail_out = b->end - b->next_to_fill;
604 - }
605 - else {
606 - if (buffer_is_empty(b)) {
607 - bfile.strm.next_out = (char *)b->buffer;
608 - bfile.strm.avail_out = bfile.bufout_size;
609 - b->next_to_fill = b->buffer; /* empty */
610 - }
611 - else {
612 - /* there were only 12 or less bytes so just save em don't write em to stdout */
613 - move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
614 - bfile.strm.next_out = (char *)b->next_to_fill;
615 - bfile.strm.avail_out = b->end - b->next_to_fill;
616 - }
617 - }
618 - }
619 - } /* end notfirstpage */
620 - }
621 - }
622 - if (!done) {
623 - fprintf(stderr,"incomplete or no mediawiki header found\n");
624 - return(-1);
625 - }
626 - else {
627 - return(0);
628 - }
629 -}
630 -
631 -/*
632 - find the first page id after position in file
633 - decompress and dump to stdout from that point on
634 - returns:
635 - 0 on success,
636 - -1 on error
637 -*/
638 -int dump_from_first_page_id_after_offset(int fin, int position) {
639 - int res;
640 - regmatch_t *match_page;
641 - regex_t compiled_page;
642 - int length=5000; /* output buffer size */
643 - char *page = " <page>";
644 -
645 - buf_info_t *b;
646 - bz_info_t bfile;
647 -
648 - int firstpage = 1;
649 -
650 - bfile.initialized = 0;
651 -
652 - res = regcomp(&compiled_page, page, REG_EXTENDED);
653 -
654 - match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
655 -
656 - b = init_buffer(length);
657 - bfile.bytes_read = 0;
658 - bfile.position = position;
659 -
660 - while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof)) {
661 - /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
662 - if (bfile.bytes_read) {
663 - if (firstpage) {
664 - if (regexec(&compiled_page, (char *)b->next_to_read, 2, match_page, 0 ) == 0) {
665 - fwrite(b->next_to_read+match_page[0].rm_so,b->next_to_fill - (b->next_to_read+match_page[0].rm_so), 1, stdout);
666 - b->next_to_read = b->end;
667 - b->bytes_avail = 0;
668 - b->next_to_fill = b->buffer; /* empty */
669 - bfile.strm.next_out = (char *)b->next_to_fill;
670 - bfile.strm.avail_out = b->end - b->next_to_fill;
671 - firstpage = 0;
672 - }
673 - else {
674 - /* could have the first part of the page tag... so copy up enough bytes to cover that case */
675 - if (b->bytes_avail> 7) {
676 - /* write everything that didn't match, but leave 7 bytes, to stdout */
677 - fwrite(b->next_to_read,b->bytes_avail - 7,1,stdout);
678 - move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 7, 7);
679 - bfile.strm.next_out = (char *)b->next_to_fill;
680 - bfile.strm.avail_out = b->end - b->next_to_fill;
681 - }
682 - else {
683 - if (buffer_is_empty(b)) {
684 - bfile.strm.next_out = (char *)b->buffer;
685 - bfile.strm.avail_out = bfile.bufout_size;
686 - b->next_to_fill = b->buffer; /* empty */
687 - }
688 - else {
689 - /* there were only 7 or less bytes so just save em don't write em to stdout */
690 - move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
691 - bfile.strm.next_out = (char *)b->next_to_fill;
692 - bfile.strm.avail_out = b->end - b->next_to_fill;
693 - }
694 - }
695 - }
696 - }
697 - else {
698 - if (b->bytes_avail) {
699 - fwrite(b->next_to_read,b->bytes_avail,1,stdout);
700 - b->next_to_read = b->end;
701 - b->bytes_avail = 0;
702 - b->next_to_fill = b->buffer; /* empty */
703 - bfile.strm.next_out = (char *)b->next_to_fill;
704 - bfile.strm.avail_out = b->end - b->next_to_fill;
705 - }
706 - }
707 - }
708 - }
709 - if (b->bytes_avail) {
710 - fwrite(b->next_to_read,b->bytes_avail,1,stdout);
711 - b->next_to_read = b->end;
712 - b->bytes_avail = 0;
713 - b->next_to_fill = b->buffer; /* empty */
714 - bfile.strm.next_out = (char *)b->next_to_fill;
715 - bfile.strm.avail_out = b->end - b->next_to_fill;
716 - }
717 - return(0);
718 -}
719 -
720 -/*
721 - find the first bz2 block after the specified offset,
722 - uncompress from that point on, write out the
723 - contents starting with the first <page> tag,
724 - prefacing first with the <mediawiki> header from
725 - the beginning of the file, up through </siteinfo>.
726 -
727 - note that we may lose some bytes from the very last
728 - block if the blocks are bit shifted, because the
729 - bzip crc at end of file will be wrong. (needs testing to
730 - find a workaround, simply not feeding in the crc doesn't
731 - suffice)
732 -
733 - for purposes of the XML dumps this is fine, since we use
734 - this tool to generate prefetch data starting from
735 - a given pageid, rather than needing to uncompress
736 - gigabytes of data to get to the point in the file
737 - we want.
738 -
739 - returns:
740 - exits with 0 on completion; fatal errors along the way exit with -1.
741 -*/
742 -int main(int argc, char **argv) {
743 - int fin, position, res;
744 -
745 - if (argc != 3) {
746 - fprintf(stderr,"usage: %s infile position\n", argv[0]);
747 - exit(-1);
748 - }
749 -
750 - fin = open (argv[1], O_RDONLY);
751 - if (fin < 0) {
752 - fprintf(stderr,"failed to open file %s for read\n", argv[1]);
753 - exit(-1);
754 - }
755 -
756 - position = atoi(argv[2]);
757 - if (position <0) {
758 - fprintf(stderr,"please specify a position >= 0.\n");
759 - fprintf(stderr,"usage: %s infile position\n", argv[0]);
760 - exit(-1);
761 - }
762 - /* input file, starting position in file, length of buffer for reading */
763 - res = dump_mw_header(fin);
764 -
765 - res = dump_from_first_page_id_after_offset(fin, position);
766 - exit(res);
767 -}
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.c
@@ -1,842 +0,0 @@
2 -#include <unistd.h>
3 -#include <stdio.h>
4 -#include <string.h>
5 -#include <sys/types.h>
6 -#include <sys/stat.h>
7 -#include <fcntl.h>
8 -#include <stdlib.h>
9 -#include <errno.h>
10 -#include <sys/types.h>
11 -#include <regex.h>
12 -#include "bzlib.h"
13 -#include "findpageidinbz2xml.h"
14 -
15 -/* return n ones either at left or right end */
16 -int bitmask(int numbits, int end) {
17 - if (end == MASKRIGHT) {
18 - return((1<<numbits)-1);
19 - }
20 - else {
21 - return(((1<<numbits)-1) << (8-numbits));
22 - }
23 -}
24 -
25 -void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
26 - int i;
27 -
28 - if (numbits == 0) {
29 - return;
30 - }
31 -
32 - for (i=0; i<buflen; i++) {
33 - /* left 1 */
34 - buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
35 -
36 - /* grab leftmost from next byte */
37 - if (i < buflen-1) {
38 - buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );
39 - }
40 - }
41 -}
42 -
43 -void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
44 - int i;
45 -
46 - for (i=buflen-1; i>=0; i--) {
47 - /* right 1 */
48 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
49 -
50 - /* grab rightmost from prev byte */
51 - if (i > 0) {
52 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));
53 - }
54 - }
55 -}
56 -
57 -unsigned char ** init_marker() {
58 - unsigned char **marker = malloc(8*sizeof(unsigned char *));
59 - int i;
60 -
61 - /* set up block marker plus its various right-shifted incarnations */
62 - for (i = 0; i< 8; i++) {
63 - marker[i] = malloc(sizeof(unsigned char)*7);
64 - }
65 - marker[0][0]= (unsigned char) 0x31;
66 - marker[0][1]= (unsigned char) 0x41;
67 - marker[0][2]= (unsigned char) 0x59;
68 - marker[0][3]= (unsigned char) 0x26;
69 - marker[0][4]= (unsigned char) 0x53;
70 - marker[0][5]= (unsigned char) 0x59;
71 - marker[0][6]= (unsigned char) 0x00;
72 - for (i = 1; i< 8; i++) {
73 - memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
74 - shift_bytes_right(marker[i],7,1);
75 - }
76 - return(marker);
77 -}
78 -
79 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
80 - both buffers are bit-shifted to the right by "bitsrightshifted" bits. this function compares the two and returns 0 if buff2
81 - matches and 1 otherwise. */
82 -int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
83 - int i;
84 -
85 - if (bitsrightshifted == 0) {
86 - for (i = 0; i< numbytes; i++) {
87 - if (buff1[i] != buff2[i]) {
88 - return(1);
89 - }
90 - }
91 - return(0);
92 - }
93 - else {
94 - for (i = 1; i< numbytes-2; i++) {
95 - if (buff1[i] != buff2[i]) {
96 - return(1);
97 - }
98 - }
99 - /* do leftmost byte */
100 - if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {
101 - return(1);
102 - }
103 - /* do rightmost byte */
104 - if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {
105 - return(1);
106 - }
107 - return(0);
108 - }
109 -}
110 -
111 -
112 -/* return -1 if no match
113 - return number of bits rightshifted otherwise */
114 -int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
115 - int result, i;
116 -
117 - result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
118 - if (!result) {
119 - return(0);
120 - }
121 - for (i=1; i<8; i++) {
122 - result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
123 - if (!result) {
124 - return(i);
125 - }
126 - }
127 - return(-1);
128 -}
129 -
130 -
131 -/* return: 1 if found, 0 if not, -1 on error */
132 -int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
133 - int result;
134 -
135 - bfile->bits_shifted = -1;
136 - result = read(fin, bfile->marker_buffer, 7);
137 - if (result == -1) {
138 - /* fprintf(stderr,"read of file failed\n"); */
139 - return(-1);
140 - }
141 - /* must be after 4 byte file header, and we add a leftmost byte to the buffer
142 - of data read in case some bits have been shifted into it */
143 - while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
144 - bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
145 - if (bfile->bits_shifted < 0) {
146 - bfile->position++;
147 - result = lseek(fin, (bfile->position), SEEK_SET);
148 - if (result == -1) {
149 - fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
150 - return(-1);
151 - }
152 - result = read(fin, bfile->marker_buffer, 7);
153 - if (result < 7) {
154 - /* fprintf(stderr,"read of file failed\n"); */
155 - return(-1);
156 - }
157 - }
158 - else {
159 - bfile->block_start = bfile->position;
160 - return(1);
161 - }
162 - }
163 - return(0);
164 -}
165 -
166 -/*
167 - initializes the bz2 strm structure,
168 - calls the BZ2 decompression library initializer
169 -
170 - returns:
171 - BZ_OK on success
172 - does not return on failure: prints the BZ_ error code (see bzlib.h) and exits
173 -*/
174 -int init_decompress(bz_info_t *bfile) {
175 - int bz_verbosity = 0;
176 - int bz_small = 0;
177 - int ret;
178 -
179 - bfile->strm.bzalloc = NULL;
180 - bfile->strm.bzfree = NULL;
181 - bfile->strm.opaque = NULL;
182 -
183 - ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
184 - if (ret != BZ_OK) {
185 - fprintf(stderr,"uncompress failed, err %d\n", ret);
186 - exit(-1);
187 - }
188 - return(ret);
189 -}
190 -
191 -/*
192 - reads the first 4 bytes from a bz2 file (should be
193 - "BZh" followed by the block size indicator, typically "9")
194 - and passes them into the BZ2 decompression library.
195 - This must be done before decompression of any block of the
196 - file is attempted.
197 -
198 - returns:
199 - BZ_OK if successful,
200 - exits the program if the header cannot be read or is corrupt
201 -*/
202 -int decompress_header(int fin, bz_info_t *bfile) {
203 - int ret, res;
204 -
205 - res = lseek(fin,0,SEEK_SET);
206 - if (res == -1) {
207 - fprintf(stderr,"lseek of file to 0 failed (3)\n");
208 - exit(-1);
209 - }
210 - bfile->bytes_read = read(fin, bfile->header_buffer, 4);
211 - if (bfile->bytes_read < 4) {
212 - fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
213 - exit(-1);
214 - }
215 - bfile->strm.next_in = (char *)bfile->header_buffer;
216 - bfile->strm.avail_in = 4;
217 -
218 - ret = BZ2_bzDecompress ( &(bfile->strm) );
219 - if (BZ_OK != ret && BZ_STREAM_END != ret) {
220 - fprintf(stderr,"Corrupt bzip2 header, exiting\n");
221 - exit(-1);
222 - }
223 - return(ret);
224 -}
225 -
226 -/*
227 - seek to appropriate offset as specified in bfile,
228 - read compressed data into buffer indicated by bfile,
229 - update the bfile structure accordingly,
230 - save the overflow byte (bit-shifted data = suck)
231 - this is for the *first* buffer of data in a stream,
232 - for subsequent buffers use fill_buffer_to_decompress()
233 -
234 - this will set bfile->eof on eof. no other indicator
235 - will be provided.
236 -
237 - returns:
238 - 0 on success
239 - -1 on error
240 -*/
241 -int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
242 - int res;
243 -
244 - if (bfile->bits_shifted == 0) {
245 - res = lseek(fin,bfile->position+1,SEEK_SET);
246 - if (res == -1) {
247 - fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
248 - return(-1);
249 - }
250 - }
251 - else {
252 - res = lseek(fin,bfile->position,SEEK_SET);
253 - if (res == -1) {
254 - fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
255 - return(-1);
256 - }
257 - }
258 - bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
259 - if (bfile->bytes_read > 0) {
260 - bfile->overflow = bfile->bufin[bfile->bytes_read-1];
261 - shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
262 -
263 - bfile->strm.next_in = (char *)(bfile->bufin);
264 - bfile->strm.avail_in = bfile->bytes_read-1;
265 - }
266 - if (bfile->bytes_read <=0) {
267 - bfile->eof++;
268 - }
269 - return(0);
270 -}
271 -
272 -/*
273 - read compressed data into buffer indicated by bfile,
274 - from current position of file,
275 - stuffing the overflow byte in first.
276 - update the bfile structure accordingly
277 - save the new overflow byte (bit-shifted data = suck)
278 - this function is for decompression of buffers *after
279 - the first one*. for the first one use
280 - setup_first_buffer_to_decompress()
281 -
282 - this will set bfile->eof on eof. no other indicator
283 - will be provided.
284 -
285 - returns:
286 - 0 on success
287 - hmm, it really does not do anything about errors :-D
288 -*/
289 -int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
290 - if (bfile->strm.avail_in == 0) {
291 - bfile->strm.next_in = (char *)(bfile->bufin);
292 - bfile->bufin[0] = bfile->overflow;
293 - bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
294 - if (bfile->bytes_read > 0) {
295 - bfile->overflow = bfile->bufin[bfile->bytes_read];
296 - shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
297 - bfile->strm.avail_in = bfile->bytes_read;
298 - bfile->position+=bfile->bytes_read;
299 - }
300 - else {
301 - bfile->strm.avail_in = 1; /* the overflow byte */
302 - bfile->eof++;
303 - }
304 - }
305 - return(0);
306 -}
307 -
308 -/* size of buffer is bytes usable. there will be a null byte at the end
309 -
310 - what we do with the buffer:
311 - - read from front of buffer to end,
312 - - fill from point where prev read did not fill buffer, or from where
313 - move of data at end of buffer to beginning left room,
314 - - mark a string of bytes (starting from what's available to read) as "read"
315 -
316 -*/
317 -buf_info_t *init_buffer(int size) {
318 - buf_info_t *b;
319 -
320 - b = (buf_info_t *)malloc(sizeof(buf_info_t));
321 - b->buffer = malloc(sizeof(unsigned char)*(size+1));
322 - b->buffer[size]='\0';
323 - b->end = b->buffer + size;
324 - b->next_to_read = b->end; /* nothing available */
325 - b->bytes_avail = 0; /* bytes to read, nothing available */
326 - b->next_to_fill = b->buffer; /* empty */
327 - b->next_to_fill[0] = '\0';
328 - return(b);
329 -}
330 -
331 -/* check if buffer (used for decompressed data output) is empty,
332 - returns 1 if so and 0 if not */
333 -int buffer_is_empty(buf_info_t *b) {
334 - if (b->bytes_avail == 0) {
335 - return(1);
336 - }
337 - else {
338 - return(0);
339 - }
340 -}
341 -
342 -/* check if buffer (used for decompressed data output) is full,
343 -
344 - returns 1 if so and 0 if not
345 - I'm not liking this function so well, fixme */
346 -int buffer_is_full(buf_info_t *b) {
347 - if (b->next_to_fill == b->end) {
348 - return(1);
349 - }
350 - else {
351 - return(0);
352 - }
353 -}
354 -
355 -/* FIXME do this right. whatever. */
356 -int get_file_size(int fin) {
357 - int res;
358 -
359 - res = lseek(fin, 0, SEEK_END);
360 - if (res == -1) {
361 - fprintf(stderr,"lseek of file to 0 failed (6)\n");
362 - exit(-1);
363 - }
364 - return(res);
365 -}
366 -
367 -
368 -/*
369 - look for the first bz2 block in the file after specified offset
370 - it tests that the block is valid by doing partial decompression.
371 - this function will update the bfile structure:
372 - bfile->position will contain the current position of the file (? will it?)
373 - bfile->bits_shifted will contain the number of bits that the block is rightshifted
374 - bfile->block_start will contain the offset from start of file to the block
375 - returns:
376 - 0 on success (the file is left positioned at bfile->block_start)
377 - -1 if no marker or other error
378 -*/
379 -int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {
380 - int res;
381 -
382 - bfile->bufin_size = BUFINSIZE;
383 - bfile->marker = init_marker();
384 - bfile->position = position;
385 - bfile->block_start = -1;
386 - bfile->bytes_read = 0;
387 - bfile->bytes_written = 0;
388 - bfile->eof = 0;
389 - bfile->bits_shifted = -1;
390 -
391 - bfile->file_size = get_file_size(fin);
392 -
393 - while (bfile->bits_shifted < 0) {
394 - if (bfile->position > bfile->file_size) {
395 - return(-1);
396 - }
397 - res = lseek(fin, bfile->position, SEEK_SET);
398 - if (res == -1) {
399 - fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
400 - exit(-1);
401 - }
402 - res = find_next_bz2_block_marker(fin, bfile);
403 - if (res == 1) {
404 - init_decompress(bfile);
405 - decompress_header(fin, bfile);
406 - res = setup_first_buffer_to_decompress(fin, bfile);
407 - if (res == -1) {
408 - fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
409 - exit(-1);
410 - }
411 - bfile->strm.next_out = (char *)bfile->bufout;
412 - bfile->strm.avail_out = bfile->bufout_size;
413 - res = BZ2_bzDecompress ( &(bfile->strm) );
414 - /* this means we (probably) have a genuine marker */
415 - if (BZ_OK == res || BZ_STREAM_END == res) {
416 - res = BZ2_bzDecompressEnd ( &(bfile->strm) );
417 - bfile->bytes_read = 0;
418 - bfile->bytes_written = 0;
419 - bfile->eof = 0;
420 - /* leave the file at the right position */
421 - res = lseek(fin, bfile->block_start, SEEK_SET);
422 - if (res == -1) {
423 - fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
424 - exit(-1);
425 - }
426 - return(0);
427 - }
428 - /* right bytes, but there by chance, skip and try again */
429 - else {
430 - bfile->position+=6;
431 - bfile->bits_shifted = -1;
432 - bfile->block_start = -1;
433 - }
434 - }
435 - else {
436 - return(-1);
437 - }
438 - }
439 - return(-1);
440 -}
441 -
442 -/*
443 - find the first bz2 block marker in the file,
444 - from its current position,
445 - then set up for decompression from that point
446 - returns:
447 - 0 on success
448 - -1 if no marker or other error
449 -*/
450 -int init_bz2_file(bz_info_t *bfile, int fin) {
451 - int res;
452 -
453 - bfile->initialized++;
454 -
455 - res = find_next_bz2_block_marker(fin, bfile);
456 - if (res ==1) {
457 - init_decompress(bfile);
458 - decompress_header(fin, bfile);
459 - setup_first_buffer_to_decompress(fin, bfile);
460 - return(0);
461 - }
462 - return(-1);
463 -}
464 -
465 -/* return -1 if error */
466 -int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
467 - int ret;
468 -
469 - bfile->bufout = bufferout;
470 - bfile->bufout_size = bufout_size;
471 - bfile->bytes_written = 0;
472 -
473 - if (! bfile->initialized) {
474 - if (init_bz2_file(bfile, fin) == -1) {
475 - /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */
476 - return(-1);
477 - };
478 - bfile->strm.next_out = (char *)bfile->bufout;
479 - bfile->strm.avail_out = bfile->bufout_size;
480 - }
481 -
482 - ret = BZ_OK;
483 - while (BZ_OK == ret && bfile->bytes_written == 0) {
484 - ret = BZ2_bzDecompress ( &(bfile->strm) );
485 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
486 - bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
487 - }
488 - else {
489 - /* fprintf(stderr,"error from BZ decompress %d\n",ret); */
490 - return(-1);
491 - }
492 - fill_buffer_to_decompress(fin, bfile, ret);
493 - /*
494 - if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
495 - fprintf(stderr,"eof reached\n");
496 - }
497 - */
498 - }
499 - return(0);
500 -}
501 -
502 -
503 -/*
504 - fill output buffer in b with uncompressed data from bfile
505 - if this is the first call to the function for this file,
506 - the file header will be read, and the first buffer of
507 - uncompressed data will be prepared. bfile->position
508 - should be set to the offset (from the beginning of file) from
509 - which to find the first bz2 block.
510 -
511 - returns:
512 - 0 on success (the count of bytes decompressed, which may be 0, is added to b->bytes_avail)
513 - -1 on error
514 -*/
515 -int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
516 - int res;
517 -
518 - if (buffer_is_full(b)) {
519 - return(0);
520 - }
521 -
522 - if (buffer_is_empty(b)) {
523 - b->next_to_fill = b->buffer;
524 - }
525 -
526 - res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
527 - if (res == -1) {
528 - return(res);
529 - }
530 - if (bfile->bytes_written < 0) {
531 - /* fprintf(stderr,"read of file failed\n"); */
532 - return(-1);
533 - }
534 - else {
535 - /* really?? FIXME check this */
536 - if (buffer_is_empty(b)) {
537 - b->next_to_read = b->next_to_fill; /* where we just read */
538 - }
539 - b->bytes_avail += bfile->bytes_written;
540 - b->next_to_fill += bfile->bytes_written;
541 - b->next_to_fill[0] = '\0';
542 - return(0);
543 - }
544 -}
545 -
546 -void dumpbuf_info_t(buf_info_t *b) {
547 - fprintf(stdout, "\n");
548 - fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
549 - fprintf(stdout, "b->end: %ld\n", (long int) b->end);
550 - fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
551 - fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
552 - fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
553 -}
554 -
555 -
556 -/*
557 - copy text from end of buffer to the beginning, that we want to keep
558 - around for further processing (i.e. further regex matches)
559 - returns number of bytes copied
560 -*/
561 -int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *from_where, int maxbytes) {
562 - int i, tocopy;
563 -
564 - if (from_where >= b->end) {
565 - return(0);
566 - }
567 - else {
568 - tocopy = b->end - from_where;
569 - if (maxbytes && (tocopy > maxbytes)) {
570 - tocopy = maxbytes;
571 - }
572 - for (i = 0; i < tocopy; i++) {
573 - b->buffer[i] = from_where[i];
574 - }
575 - b->next_to_fill = b->buffer + tocopy;
576 - b->next_to_fill[0] = '\0';
577 - b->next_to_read = b->buffer;
578 - b->bytes_avail = tocopy;
579 - return(tocopy);
580 - }
581 -}
582 -
583 -/*
584 - get the first page id after position in file
585 - if a pageid is found, the structure pinfo will be updated accordingly
586 - returns:
587 - 1 if a pageid found,
588 - 0 if no pageid found,
589 - -1 on error
590 -*/
591 -int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) {
592 - int res;
593 - regmatch_t *match_page, *match_page_id;
594 - regex_t compiled_page, compiled_page_id;
595 - int length=5000; /* output buffer size */
596 - char *page = "<page>";
597 - char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
598 -
599 - buf_info_t *b;
600 - bz_info_t bfile;
601 -
602 - bfile.initialized = 0;
603 -
604 - res = regcomp(&compiled_page, page, REG_EXTENDED);
605 - res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);
606 -
607 - match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
608 - match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
609 -
610 - b = init_buffer(length);
611 -
612 - pinfo->bits_shifted = -1;
613 - pinfo->position = -1;
614 - pinfo->page_id = -1;
615 -
616 - bfile.bytes_read = 0;
617 -
618 - if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {
619 - /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
620 - return(-1);
621 - }
622 -
623 - while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {
624 - if (bfile.bytes_read) {
625 - while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
626 - if (match_page_id[1].rm_so >=0) {
627 - /* write page_id to stderr */
628 - /*
629 - fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);
630 - fwrite("\n",1,1,stderr);
631 - */
632 - pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so));
633 - pinfo->position = bfile.block_start;
634 - pinfo->bits_shifted = bfile.bits_shifted;
635 - return(1);
636 - /* write up to and including page id tag to stdout */
637 - /*
638 - fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout);
639 - b->next_to_read = b->next_to_read+match_page_id[0].rm_eo;
640 - b->bytes_avail -= match_page_id[0].rm_eo;
641 - */
642 - }
643 - else {
644 - /* should never happen */
645 - fprintf(stderr,"regex gone bad...\n");
646 - exit(-1);
647 - }
648 - }
649 - if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) {
650 - /* write everything up to but not including the page tag to stdout */
651 - /*
652 - fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);
653 - */
654 - move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so);
655 - bfile.strm.next_out = (char *)b->next_to_fill;
656 - bfile.strm.avail_out = b->end - b->next_to_fill;
657 - }
658 - else {
659 - /* could have the first part of the page tag... so copy up enough bytes to cover that case */
660 - if (b->bytes_avail> 5) {
661 - /* write everything that didn't match, but leave 5 bytes, to stdout */
662 - /*
663 - fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);
664 - */
665 - move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);
666 - bfile.strm.next_out = (char *)b->next_to_fill;
667 - bfile.strm.avail_out = b->end - b->next_to_fill;
668 - }
669 - else {
670 - if (buffer_is_empty(b)) {
671 - bfile.strm.next_out = (char *)b->buffer;
672 - bfile.strm.avail_out = bfile.bufout_size;
673 - b->next_to_fill = b->buffer; /* empty */
674 - }
675 - else {
676 - /* there were only 5 or less bytes so just save em don't write em to stdout */
677 - move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
678 - bfile.strm.next_out = (char *)b->next_to_fill;
679 - bfile.strm.avail_out = b->end - b->next_to_fill;
680 - }
681 - }
682 - }
683 - }
684 - }
685 - /*
686 - if (b->bytes_avail) {
687 - fwrite(b->next_to_read,b->bytes_avail,1,stdout);
688 - }
689 - */
690 - return(0);
691 -}
692 -
693 -/* search for pageid in a bz2 file, given start and end offsets
694 - to search for
695 - we guess by the most boring method possible (shrink the
696 - interval according to the value found on the last guess,
697 - try midpoint of the new interval)
698 - multiple calls of this will get the job done.
699 - interval has left end = right end if search is complete.
700 - this function may return the previous guess and simply
701 - shrink the interval.
702 - note that a "match" means either that the pageid we find
703 - is smaller than the one the caller wants, or is equal.
704 - why? because then we can use the output for prefetch
705 - for xml dumps and be sure a specific page range is covered :-P
706 -
707 - return value from guess, or -1 on error.
708 - */
709 -int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) {
710 - int res;
711 - int new_position;
712 - int interval;
713 -
714 - /*
715 - last_position is somewhere in the interval, perhaps at an end
716 - last_value is the value we had at that position
717 - */
718 -
719 - interval = (iinfo->right_end - iinfo->left_end)/2;
720 - if (interval == 0) {
721 - interval = 1;
722 - }
723 - /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */
724 - /* if we're this close, we'll check this value and be done with it */
725 - if (iinfo->right_end -iinfo->left_end < 2) {
726 - new_position = iinfo->left_end;
727 - iinfo->right_end = iinfo->left_end;
728 - }
729 - else {
730 - if (iinfo->last_value < iinfo->value_wanted) {
731 - /* fprintf(stderr,"resetting left end\n"); */
732 - iinfo->left_end = iinfo->last_position;
733 - new_position = iinfo->last_position + interval;
734 - }
735 - /* iinfo->last_value > iinfo->value_wanted */
736 - else {
737 - /* fprintf(stderr,"resetting right end\n"); */
738 - iinfo->right_end = iinfo->last_position;
739 - new_position = iinfo->last_position - interval;
740 - }
741 - }
742 - res = get_first_page_id_after_offset(fin, new_position, pinfo);
743 - if (res >0) {
744 - /* caller wants the new value */
745 - iinfo->last_value = pinfo->page_id;
746 - iinfo->last_position = new_position;
747 - return(pinfo->page_id);
748 - }
749 - else {
750 - /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or
751 - there's just no block here.
752 - set the right end, keep the last value and position and let the caller retry with the new interval */
753 - if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */
754 - iinfo->right_end = new_position;
755 - return(iinfo->last_value);
756 - }
757 - /* in theory we were moving towards beginning of file, should not have issues, so bail here */
758 - else {
759 - /* fprintf(stderr,"something very broken, giving up\n"); */
760 - return(-1);
761 - }
762 - }
763 -}
764 -
765 -/*
766 - given a bzipped and possibly truncated file, and a page id,
767 - hunt for the page id in the file; this assume that the
768 - bz2 header is intact and that page ids are steadily increasing
769 - throughout the file.
770 -
771 - writes the offset of the relevant block (from beginning of file)
772 - and the first pageid found in that block, to stdout
773 -
774 - format of output:
775 - position:xxxxx pageid:nnn
776 -
777 - returns: 0 on success, nonzero (1 or -1) on error
778 -*/
779 -int main(int argc, char **argv) {
780 - int fin, position, res, interval, page_id, oldmarker, file_size;
781 - page_info_t pinfo;
782 - iter_info_t iinfo;
783 -
784 - if (argc != 3) {
785 - fprintf(stderr,"usage: %s infile id\n", argv[0]);
786 - exit(-1);
787 - }
788 -
789 - fin = open (argv[1], O_RDONLY);
790 - if (fin < 0) {
791 - fprintf(stderr,"failed to open file %s for read\n", argv[1]);
792 - exit(-1);
793 - }
794 -
795 - page_id = atoi(argv[2]);
796 - if (page_id <1) {
797 - fprintf(stderr,"please specify a page_id >= 1.\n");
798 - fprintf(stderr,"usage: %s infile page_id\n", argv[0]);
799 - exit(-1);
800 - }
801 -
802 - file_size = get_file_size(fin);
803 -
804 - interval = file_size;
805 - position = 0;
806 - oldmarker = -1;
807 - pinfo.bits_shifted = -1;
808 - pinfo.position = -1;
809 - pinfo.page_id = -1;
810 -
811 - iinfo.left_end = 0;
812 - file_size = get_file_size(fin);
813 - iinfo.right_end = file_size;
814 - iinfo.value_wanted = page_id;
815 -
816 - res = get_first_page_id_after_offset(fin, 0, &pinfo);
817 - if (res > 0) {
818 - iinfo.last_value = pinfo.page_id;
819 - iinfo.last_position = 0;
820 - }
821 - else {
822 - fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n");
823 - exit(1);
824 - }
825 - if (pinfo.page_id == page_id) {
826 - fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
827 - exit(0);
828 - }
829 -
830 - while (1) {
831 - res = do_iteration(&iinfo, fin, &pinfo);
832 - /* things to check: bad return? interval is 0 bytes long? */
833 - if (iinfo.left_end == iinfo.right_end) {
834 - fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
835 - exit(0);
836 - }
837 - else if (res < 0) {
838 - fprintf(stderr,"broken and quitting\n");
839 - exit(-1);
840 - }
841 - }
842 - exit(0);
843 -}
Index: branches/ariel/xmldumps-backup/checkforbz2footer.c
@@ -1,156 +0,0 @@
2 -#include <unistd.h>
3 -#include <stdio.h>
4 -#include <string.h>
5 -#include <sys/types.h>
6 -#include <sys/stat.h>
7 -#include <fcntl.h>
8 -#include <stdlib.h>
9 -#include <errno.h>
10 -
11 -/*
12 - Check to see whether a file ends with a bz2 footer or not
13 - (i.e. if it is truncated or corrupted).
14 - This is a crude but fast test for integrity; we don't
15 - check the CRC at the end of the stream, nor do we check the
16 - bit padding in the last byte of the file.
17 -
18 - Arguments: the name of the file to check, presumably
19 - a bzipped file.
20 - Outputs: none.
21 - Exits with 0 if the file contains the footer at the end,
22 - 1 if the file does not contain the footer, and -1 on error.
23 -*/
24 -
25 -
26 -int read_footer(unsigned char *buffer, int fin) {
27 - int res;
28 -
29 - res = lseek(fin, -11, SEEK_END);
30 - if (res == -1) {
31 - fprintf(stderr,"lseek of file failed\n");
32 - exit(-1);
33 - }
34 - res = read(fin, buffer, 11);
35 - if (res == -1) {
36 - fprintf(stderr,"read of file failed\n");
37 - exit(-1);
38 - }
39 - return(0);
40 -}
41 -
42 -#define LEFT 0
43 -#define RIGHT 1
44 -
45 -/* return n ones either at left or right end */
46 -int bitmask(int numbits, int end) {
47 - if (end == RIGHT) {
48 - return((1<<numbits)-1);
49 - }
50 - else {
51 - return(((1<<numbits)-1) << (8-numbits));
52 - }
53 -}
54 -
55 -void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
56 - int i;
57 -
58 - for (i=buflen-1; i>=0; i--) {
59 - /* right 1 */
60 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
61 -
62 - /* grab rightmost from prev byte */
63 - if (i > 0) {
64 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(1,LEFT)));
65 - }
66 - }
67 -}
68 -
69 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
70 - both buffers are bit-shifted to the right by "bitsrightshifted" bits. this function compares the two and returns 0 if buff2
71 - matches and 1 otherwise. */
72 -int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
73 - int i;
74 -
75 - if (bitsrightshifted == 0) {
76 - for (i = 0; i< numbytes; i++) {
77 - if (buff1[i] != buff2[i]) {
78 - return(1);
79 - }
80 - }
81 - return(0);
82 - }
83 - else {
84 - for (i = 1; i< numbytes-2; i++) {
85 - if (buff1[i] != buff2[i]) {
86 - return(1);
87 - }
88 - }
89 - /* do leftmost byte */
90 - if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
91 - return(1);
92 - }
93 - /* do rightmost byte */
94 - if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
95 - return(1);
96 - }
97 - return(0);
98 - }
99 -}
100 -
101 -int checkfileforfooter(int fin) {
102 - unsigned char buffer[11];
103 - int result, i;
104 - unsigned char **footer = malloc(8*sizeof(unsigned char *));
105 -
106 - /* set up footer plus its various right-shifted incarnations */
107 - /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
108 - for (i = 0; i< 8; i++) {
109 - footer[i] = malloc(sizeof(unsigned char)*7);
110 - }
111 - footer[0][0]= (unsigned char) 0x17;
112 - footer[0][1]= (unsigned char) 0x72;
113 - footer[0][2]= (unsigned char) 0x45;
114 - footer[0][3]= (unsigned char) 0x38;
115 - footer[0][4]= (unsigned char) 0x50;
116 - footer[0][5]= (unsigned char) 0x90;
117 - footer[0][6]= (unsigned char) 0x00;
118 - for (i = 1; i< 8; i++) {
119 - memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
120 - shiftbytesright(footer[i],7,1);
121 - }
122 -
123 - read_footer(buffer,fin);
124 -
125 - result = bytescompare(footer[0],buffer+1,6,0);
126 - if (!result) {
127 - return(0);
128 - }
129 -
130 - for (i=1; i<8; i++) {
131 - result = bytescompare(footer[i],buffer,7,i);
132 - if (!result) {
133 - return(0);
134 - }
135 - }
136 - return(1);
137 -}
138 -
139 -int main(int argc, char **argv) {
140 -
141 - int fin;
142 - int result;
143 -
144 - if (argc != 2) {
145 - fprintf(stderr,"usage: %s infile\n", argv[0]);
146 - exit(-1);
147 - }
148 - fin = open (argv[1], O_RDONLY);
149 - if (fin < 0) {
150 - fprintf(stderr,"failed to open file %s for read\n", argv[1]);
151 - exit(-1);
152 - }
153 - result = checkfileforfooter(fin);
154 - close(fin);
155 - exit(result);
156 -}
157 -
Index: branches/ariel/xmldumps-backup/dumplastbz2block.c
@@ -1,463 +0,0 @@
2 -#include <unistd.h>
3 -#include <stdio.h>
4 -#include <string.h>
5 -#include <sys/types.h>
6 -#include <sys/stat.h>
7 -#include <fcntl.h>
8 -#include <stdlib.h>
9 -#include <errno.h>
10 -#include "bzlib.h"
11 -
12 -/*
13 - Find the last bz2 block marker in a file
14 - and dump whatever can be decompressed after
15 - that point. The header of the file must
16 - be intact in order for any output to be produced.
17 - This will produce output for truncated files as well,
18 - as long as there is "enough" data after the block
19 - marker.
20 -
21 - Arguments: the name of the file to check, presumably
22 - a bzipped file.
23 - Outputs: the decompressed data at the end of the file.
24 - Exits with 0 if decompression of some data can be done,
25 - 1 if decompression fails, and -1 on error.
26 -*/
27 -
28 -#define BUFSIZE 121072
29 -typedef struct {
30 - unsigned char bufin[BUFSIZE];
31 - unsigned char bufout[BUFSIZE];
32 - int bufsize;
33 - bz_stream strm;
34 - unsigned char overflow;
35 - int bitsshifted;
36 - int position;
37 -} bzinfo;
38 -
39 -int read_footer(unsigned char *buffer, int fin) {
40 - int res;
41 -
42 - res = lseek(fin, -11, SEEK_END);
43 - if (res == -1) {
44 - fprintf(stderr,"lseek of file failed\n");
45 - exit(-1);
46 - }
47 - res = read(fin, buffer, 11);
48 - if (res == -1) {
49 - fprintf(stderr,"read of file failed\n");
50 - exit(-1);
51 - }
52 - return(0);
53 -}
54 -
55 -#define LEFT 0
56 -#define RIGHT 1
57 -
58 -/* return n ones either at left or right end */
59 -int bitmask(int numbits, int end) {
60 - if (end == RIGHT) {
61 - return((1<<numbits)-1);
62 - }
63 - else {
64 - return(((1<<numbits)-1) << (8-numbits));
65 - }
66 -}
67 -
68 -void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) {
69 - int i;
70 -
71 - if (numbits == 0) {
72 - return;
73 - }
74 -
75 - for (i=0; i<buflen; i++) {
76 - /* left 1 */
77 - buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
78 -
79 - /* grab leftmost from next byte */
80 - if (i < buflen-1) {
81 - buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );
82 - }
83 - }
84 -}
85 -
86 -
87 -void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
88 - int i;
89 -
90 - for (i=buflen-1; i>=0; i--) {
91 - /* right 1 */
92 - buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
93 -
94 - /* grab rightmost from prev byte */
95 - if (i > 0) {
96 - buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));
97 - }
98 - }
99 -}
100 -
101 -unsigned char ** init_marker() {
102 - unsigned char **marker = malloc(8*sizeof(unsigned char *));
103 - int i;
104 -
105 - /* set up block marker plus its various right-shifted incarnations */
106 - for (i = 0; i< 8; i++) {
107 - marker[i] = malloc(sizeof(unsigned char)*7);
108 - }
109 - marker[0][0]= (unsigned char) 0x31;
110 - marker[0][1]= (unsigned char) 0x41;
111 - marker[0][2]= (unsigned char) 0x59;
112 - marker[0][3]= (unsigned char) 0x26;
113 - marker[0][4]= (unsigned char) 0x53;
114 - marker[0][5]= (unsigned char) 0x59;
115 - marker[0][6]= (unsigned char) 0x00;
116 - for (i = 1; i< 8; i++) {
117 - memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
118 - shiftbytesright(marker[i],7,1);
119 - }
120 - return(marker);
121 -}
122 -
123 -unsigned char ** init_footer() {
124 - unsigned char **footer = malloc(8*sizeof(unsigned char *));
125 - int i;
126 -
127 - /* set up footer plus its various right-shifted incarnations */
128 - /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
129 - for (i = 0; i< 8; i++) {
130 - footer[i] = malloc(sizeof(unsigned char)*7);
131 - }
132 - footer[0][0]= (unsigned char) 0x17;
133 - footer[0][1]= (unsigned char) 0x72;
134 - footer[0][2]= (unsigned char) 0x45;
135 - footer[0][3]= (unsigned char) 0x38;
136 - footer[0][4]= (unsigned char) 0x50;
137 - footer[0][5]= (unsigned char) 0x90;
138 - footer[0][6]= (unsigned char) 0x00;
139 - for (i = 1; i< 8; i++) {
140 - memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
141 - shiftbytesright(footer[i],7,1);
142 - }
143 - return(footer);
144 -}
145 -
146 -
147 -/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
148 - both buffers are bit-shifted to the right by "bitsrightshifted" bits. this function compares the two and returns 0 if buff2
149 - matches and 1 otherwise. */
150 -int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
151 - int i;
152 -
153 - if (bitsrightshifted == 0) {
154 - for (i = 0; i< numbytes; i++) {
155 - if (buff1[i] != buff2[i]) {
156 - return(1);
157 - }
158 - }
159 - return(0);
160 - }
161 - else {
162 - for (i = 1; i< numbytes-2; i++) {
163 - if (buff1[i] != buff2[i]) {
164 - return(1);
165 - }
166 - }
167 - /* do leftmost byte */
168 - if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
169 - return(1);
170 - }
171 - /* do rightmost byte */
172 - if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
173 - return(1);
174 - }
175 - return(0);
176 - }
177 -}
178 -
179 -/* return -1 if no match
180 - return number of bits rightshifted otherwise */
181 -int checkfileforfooter(int fin, unsigned char **footer) {
182 - unsigned char buffer[11];
183 - int result, i;
184 -
185 - read_footer(buffer,fin);
186 -
187 - result = bytescompare(footer[0],buffer+1,6,0);
188 - if (!result) {
189 - return(0);
190 - }
191 -
192 - for (i=1; i<8; i++) {
193 - result = bytescompare(footer[i],buffer,7,i);
194 - if (!result) {
195 - return(i);
196 - }
197 - }
198 - return(-1);
199 -}
200 -
201 -/* return -1 if no match
202 - return number of bits rightshifted otherwise */
203 -int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) {
204 - int result, i;
205 -
206 - result = bytescompare(marker[0],buffer+1,6,0);
207 - if (!result) {
208 - return(0);
209 - }
210 - for (i=1; i<8; i++) {
211 - result = bytescompare(marker[i],buffer,7,i);
212 - if (!result) {
213 - return(i);
214 - }
215 - }
216 - return(-1);
217 -}
218 -
219 -void clearbuffer(unsigned char *buf, int length) {
220 - int i;
221 -
222 - for (i=0; i<length; i++) {
223 - buf[i]=0;
224 - }
225 - return;
226 -}
227 -
228 -int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) {
229 - int bitsshifted = -1;
230 - int result;
231 -
232 - /* must be after 4 byte file header, and we add a leftmost byte to the buffer
233 - of data read in case some bits have been shifted into it */
234 - while (*position >= 3 && bitsshifted < 0) {
235 - bitsshifted = checkbufferforblockmarker(buffer, marker);
236 - if (bitsshifted < 0) {
237 - (*start_at)++;
238 - /*
239 - if (*start_at % 10000 == 0) {
240 - fprintf(stderr, "starting at %d, position %d\n", *start_at, *position);
241 - }
242 - */
243 - *position = lseek(fin, -1*(*start_at), SEEK_END);
244 - if (*position == -1) {
245 - fprintf(stderr,"lseek of file failed\n");
246 - exit(-1);
247 - }
248 - result = read(fin, buffer, 7);
249 - if (result == -1) {
250 - fprintf(stderr,"read of file failed\n");
251 - exit(-1);
252 - }
253 - }
254 - else {
255 - return(bitsshifted);
256 - }
257 - }
258 - return(bitsshifted);
259 -}
260 -
261 -int init_decompress(bzinfo *bfile) {
262 - int bz_verbosity = 0;
263 - int bz_small = 0;
264 - int ret;
265 -
266 - bfile->strm.bzalloc = NULL;
267 - bfile->strm.bzfree = NULL;
268 - bfile->strm.opaque = NULL;
269 -
270 - ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
271 - if (ret != BZ_OK) {
272 - fprintf(stderr,"uncompress failed, err %d\n", ret);
273 - exit(-1);
274 - }
275 - return(ret);
276 -}
277 -
278 -int decompress_header(int fin, bzinfo *bfile) {
279 - int bytesread, ret;
280 - unsigned char header[4];
281 -
282 - lseek(fin,0,SEEK_SET);
283 - bytesread = read(fin, header, 4);
284 - if (bytesread < 4) {
285 - fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
286 - exit(-1);
287 - }
288 - bfile->strm.next_in = (char *)header;
289 - bfile->strm.avail_in = 4;
290 -
291 - bfile->strm.next_out = (char *)(bfile->bufout);
292 - bfile->strm.avail_out = bfile->bufsize;
293 - ret = BZ2_bzDecompress ( &(bfile->strm) );
294 - if (BZ_OK != ret && BZ_STREAM_END != ret) {
295 - fprintf(stderr,"Corrupt bzip2 header, exiting\n");
296 - exit(-1);
297 - }
298 - return(ret);
299 -}
300 -
301 -int setup_first_buffer(int fin, bzinfo *bfile) {
302 - int bytesread, eof=0;
303 -
304 - if (bfile->bitsshifted == 0) {
305 - lseek(fin,bfile->position+1,SEEK_SET);
306 - }
307 - else {
308 - lseek(fin,bfile->position,SEEK_SET);
309 - }
310 - bytesread = read(fin, bfile->bufin, bfile->bufsize);
311 - if (bytesread > 0) {
312 - bfile->overflow = bfile->bufin[bytesread-1];
313 - shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);
314 -
315 - bfile->strm.next_in = (char *)(bfile->bufin);
316 - bfile->strm.avail_in = bytesread-1;
317 -
318 - bfile->strm.next_out = (char *)(bfile->bufout);
319 - bfile->strm.avail_out = bfile->bufsize;
320 - }
321 - if (bytesread <=0) {
322 - eof++;
323 - }
324 - return(eof);
325 -}
326 -
327 -int do_last_byte(bzinfo *bfile) {
328 - int ret=BZ_OK;
329 - int written;
330 -
331 - if (bfile->strm.avail_in == 0) {
332 - bfile->strm.next_in = (char *)(bfile->bufin);
333 - bfile->bufin[0] = bfile->overflow;
334 - shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);
335 - bfile->strm.avail_in = 1;
336 - bfile->strm.next_out = (char *)(bfile->bufout);
337 - bfile->strm.avail_out = bfile->bufsize;
338 - ret = BZ2_bzDecompress ( &(bfile->strm) );
339 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
340 - written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout);
341 - }
342 - }
343 - return(ret);
344 -}
345 -
346 -int read_next_buffer(int fin, bzinfo *bfile, int ret) {
347 - int bytesread, eof=0;
348 -
349 - /* fprintf(stderr," got return from decompress of %d\n", ret); */
350 -
351 - if (bfile->strm.avail_in == 0) {
352 - bfile->strm.next_in = (char *)(bfile->bufin);
353 - bfile->bufin[0] = bfile->overflow;
354 - bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);
355 - if (bytesread > 0) {
356 - bfile->overflow = bfile->bufin[bytesread];
357 - shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);
358 - bfile->strm.avail_in = bytesread;
359 - }
360 - else {
361 - eof++;
362 - bfile->strm.avail_in = 0;
363 - }
364 - }
365 - bfile->strm.next_out = (char *)(bfile->bufout);
366 - bfile->strm.avail_out = bfile->bufsize;
367 -
368 - return(eof);
369 -}
370 -
371 -
372 -int main(int argc, char **argv) {
373 -
374 - bzinfo bfile;
375 -
376 - int fin;
377 - int result, ret;
378 - unsigned char buffer[8];
379 -
380 - unsigned char **footer;
381 - unsigned char **marker;
382 -
383 - int written=0;
384 - int start_at;
385 -
386 - int eof = 0;
387 -
388 - if (argc != 2) {
389 - fprintf(stderr,"usage: %s infile\n", argv[0]);
390 - exit(-1);
391 - }
392 -
393 - marker = init_marker();
394 - footer = init_footer();
395 -
396 - fin = open (argv[1], O_RDONLY);
397 - if (fin < 0) {
398 - fprintf(stderr,"failed to open file %s for read\n", argv[1]);
399 - exit(-1);
400 - }
401 -
402 - bfile.bufsize = BUFSIZE;
403 -
404 - result = checkfileforfooter(fin, footer);
405 - if (result == -1) {
406 - start_at = 0;
407 - }
408 - else {
409 - start_at = 11; /* size of footer, perhaps with 1 byte extra */
410 - }
411 - start_at +=6; /* size of marker */
412 - bfile.position = lseek(fin, -1*start_at, SEEK_END);
413 - if (bfile.position == -1) {
414 - fprintf(stderr,"lseek of file failed\n");
415 - exit(-1);
416 - }
417 - result = read(fin, buffer, 7);
418 - if (result == -1) {
419 - fprintf(stderr,"read of file failed\n");
420 - exit(-1);
421 - }
422 -
423 - while (1) {
424 -
425 - bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);
426 - if (bfile.bitsshifted >= 0) {
427 - /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */
428 - ret = init_decompress(&bfile);
429 -
430 - /* pass in the header */
431 - ret = decompress_header(fin,&bfile);
432 -
433 - eof = setup_first_buffer(fin, &bfile);
434 -
435 - while (BZ_OK == ret && !eof) {
436 - ret = BZ2_bzDecompress ( &(bfile.strm) );
437 - if (BZ_OK == ret || BZ_STREAM_END == ret) {
438 - written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);
439 - }
440 - eof = read_next_buffer(fin, &bfile, ret);
441 - }
442 - if (BZ_OK == ret || BZ_STREAM_END == ret ) {
443 - /* so we read no bytes, process the last byte we held */
444 - do_last_byte(&bfile);
445 - }
446 - if (written == 0) {
447 - /* truncated block or other corruption, try going back one */
448 - start_at +=5;
449 - clearbuffer(buffer,sizeof(buffer));
450 - continue;
451 - }
452 - else {
453 - break;
454 - }
455 - }
456 - else {
457 - fprintf(stderr,"no block marker in this file.\n");
458 - exit(-1);
459 - }
460 - }
461 - close(fin);
462 - exit(0);
463 -}
464 -
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.h
@@ -1,81 +0,0 @@
2 -#ifndef _FINDPAGEID_H
3 -#define _FINDPAGEID_H
4 -
5 -typedef struct {
6 - int page_id; /* first id in the block */
7 - int bits_shifted; /* block is right shifted this many bits */
8 - int position; /* position in file of block */
9 -} page_info_t;
10 -
11 -#define BUFINSIZE 5000
12 -
13 -/*
14 - keeps all information about a bzipped file
15 - plus input/output buffers for decompression
16 -*/
17 -typedef struct {
18 - unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
19 - unsigned char *bufout; /* uncompressed data, must be allocated by caller */
20 - unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
21 - unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
22 -
23 - int bufin_size; /* size of input buffer for compressed data */
24 - int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
25 -
26 - int initialized; /* whether bz2file has been initialized (header processed, seek to
27 - some bz2 block in the file and input buffer filled) */
28 - int block_start; /* position of bz2 block in file from which we started to read (we
29 - read a sequence of bz2 blocks from a given position, this is
30 - the offset to the first one) */
31 -
32 - bz_stream strm; /* stream structure for libbz2 */
33 - unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
34 - read around so we can grab the lower end bits off the end for
35 - sticking in front of the next pile of compressed bytes we read */
36 -
37 - int bits_shifted; /* number of bits that the compressed data has been right shifted
38 - in the file (if the number is 0, the block marker and subsequent
39 - data is byte-aligned) */
40 - unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for
41 - locating the marker in a stream of compressed data */
42 -
43 - int position; /* current offset into file from start of file */
44 -
45 - int bytes_read; /* number of bytes of compressed data read from file (per read) */
46 - int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
47 - int eof; /* nonzero if eof reached */
48 - int file_size; /* length of file, so we don't search past it for blocks */
49 -} bz_info_t;
50 -
51 -#define MASKLEFT 0
52 -#define MASKRIGHT 1
53 -
54 -/*
55 - this output buffer is used to collect decompressed output.
56 - this is not a circular buffer; when it is full the user is
57 - responsible for emptying it completely or partially and moving
58 - to the beginning any unused bytes.
59 -
60 -*/
61 -typedef struct {
62 - unsigned char *buffer; /* output storage, allocated by the caller */
63 - unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */
64 - unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */
65 - int bytes_avail; /* number of bytes available for reading */
66 - unsigned char *end; /* points to byte after end of buffer */
67 -} buf_info_t;
68 -
69 -/*
70 - used for each iteration of narrowing down the location in a bzipped2 file of
71 - a desired pageid, by finding first compressed block after a guessed
72 - position and checking the first pageid (if any) contained in it.
73 -*/
74 -typedef struct {
75 - int left_end; /* left end of interval to search (bytes from start of file) */
76 - int right_end; /* right end of interval to search */
77 - int value_wanted; /* pageid desired */
78 - int last_value; /* pageid we found in last iteration */
79 - int last_position; /* position in file for last iteration */
80 -} iter_info_t;
81 -
82 -#endif
Index: branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
@@ -0,0 +1,766 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include <sys/types.h>
 11+#include <regex.h>
 12+#include "bzlib.h"
 13+#include "findpageidinbz2xml.h"
 14+
 15+
 16+/* return a mask with numbits one-bits at the right (low) end of a byte when end == MASKRIGHT, otherwise at the left (high) end of a byte */
 17+int bit_mask(int numbits, int end) {
 18+ if (end == MASKRIGHT) {
 19+ return((1<<numbits)-1); /* e.g. numbits 3 gives 0x07 */
 20+ }
 21+ else {
 22+ return(((1<<numbits)-1) << (8-numbits)); /* e.g. numbits 3 gives 0xe0 */
 23+ }
 24+}
 25+
 26+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) { /* shift the buflen bytes of buffer left by numbits bits, in place */
 27+ int i;
 28+
 29+ if (numbits == 0) {
 30+ return;
 31+ }
 32+
 33+ for (i=0; i<buflen; i++) {
 34+ /* shift this byte left by numbits; the cast back to unsigned char drops the bits shifted out */
 35+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
 36+
 37+ /* grab the leftmost numbits bits from the next byte; the last byte has no successor and keeps zeros in its low bits */
 38+ if (i < buflen-1) {
 39+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bit_mask(numbits,MASKLEFT) ) >> (8-numbits) ) );
 40+ }
 41+ }
 42+}
 43+
 44+
 45+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) { /* shift the buflen bytes of buffer right by numbits bits, in place */
 46+ int i;
 47+
 48+ for (i=buflen-1; i>=0; i--) { /* walk from the last byte down so buffer[i-1] is still unshifted when it is read */
 49+ /* shift this byte right by numbits */
 50+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
 51+
 52+ /* grab the rightmost numbits bits of the previous byte and put them at the left end of this one */
 53+ if (i > 0) {
 54+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bit_mask(numbits,MASKLEFT)));
 55+ }
 56+ }
 57+}
 58+
 59+unsigned char ** init_marker() { /* build a table of 8 seven-byte patterns: the block marker and its copies right-shifted by 1 to 7 bits */
 60+ unsigned char **marker = malloc(8*sizeof(unsigned char *)); /* NOTE(review): this and the mallocs below are used without a NULL check */
 61+ int i;
 62+
 63+ /* set up block marker plus its various right-shifted incarnations */
 64+ for (i = 0; i< 8; i++) {
 65+ marker[i] = malloc(sizeof(unsigned char)*7);
 66+ }
 67+ marker[0][0]= (unsigned char) 0x31; /* entry 0 holds the unshifted six marker bytes 31 41 59 26 53 59 */
 68+ marker[0][1]= (unsigned char) 0x41;
 69+ marker[0][2]= (unsigned char) 0x59;
 70+ marker[0][3]= (unsigned char) 0x26;
 71+ marker[0][4]= (unsigned char) 0x53;
 72+ marker[0][5]= (unsigned char) 0x59;
 73+ marker[0][6]= (unsigned char) 0x00; /* seventh byte starts as zero and receives the bits shifted out of byte 5 */
 74+ for (i = 1; i< 8; i++) {
 75+ memcpy((char *)(marker[i]), (char *)(marker[i-1]),7); /* entry i is entry i-1 shifted right one more bit, i.e. the marker shifted right by i bits */
 76+ shift_bytes_right(marker[i],7,1);
 77+ }
 78+ return(marker); /* the table is handed to the caller; nothing in this function frees it */
 79+}
 80+
 81+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
 82+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 0 if buff2
 83+ matches and 1 otherwise (callers test !result for a match). */
 84+int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
 85+ int i;
 86+
 87+ if (bitsrightshifted == 0) { /* byte-aligned: plain comparison of all numbytes bytes */
 88+ for (i = 0; i< numbytes; i++) {
 89+ if (buff1[i] != buff2[i]) {
 90+ return(1);
 91+ }
 92+ }
 93+ return(0);
 94+ }
 95+ else {
 96+ for (i = 1; i< numbytes-2; i++) { /* NOTE(review): stops before index numbytes-2, so that byte is never compared (byte 5 when numbytes is 7) - confirm whether i < numbytes-1 was intended */
 97+ if (buff1[i] != buff2[i]) {
 98+ return(1);
 99+ }
 100+ }
 101+ /* do leftmost byte: only its low 8-bitsrightshifted bits belong to the shifted pattern */
 102+ if ((buff1[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bit_mask(8-bitsrightshifted,MASKRIGHT)) ) {
 103+ return(1);
 104+ }
 105+ /* do rightmost byte: only its high bitsrightshifted bits belong to the shifted pattern */
 106+ if ((buff1[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bit_mask(bitsrightshifted,MASKLEFT)) ) {
 107+ return(1);
 108+ }
 109+ return(0);
 110+ }
 111+}
 112+
 113+/* test the 7 bytes in bfile->marker_buffer against the block marker at every bit offset; return -1 if no match
 114+ return number of bits rightshifted (0 to 7) otherwise */
 115+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
 116+ int result, i;
 117+
 118+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0); /* byte-aligned case: the 6 marker bytes are looked for starting at the second byte of the buffer */
 119+ if (!result) {
 120+ return(0);
 121+ }
 122+ for (i=1; i<8; i++) { /* shifted cases: compare all 7 buffer bytes against the marker shifted right by i bits */
 123+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
 124+ if (!result) {
 125+ return(i);
 126+ }
 127+ }
 128+ return(-1);
 129+}
 130+
 131+/* return: 1 if found, 0 if not; read and lseek failures call exit(-1) instead of returning an error */
 132+int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
 133+ int result;
 134+
 135+ bfile->bits_shifted = -1;
 136+ result = read(fin, bfile->marker_buffer, 7); /* reads at the current file offset; only -1 is checked here, a short read is not */
 137+ if (result == -1) {
 138+ fprintf(stderr,"read of file failed\n");
 139+ exit(-1);
 140+ }
 141+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
 142+ of data read in case some bits have been shifted into it */
 143+ while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
 144+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
 145+ if (bfile->bits_shifted < 0) {
 146+ bfile->position++; /* no marker here: slide the 7-byte window forward one byte and re-read */
 147+ result = lseek(fin, (bfile->position), SEEK_SET);
 148+ if (result == -1) {
 149+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
 150+ exit(-1);
 151+ }
 152+ result = read(fin, bfile->marker_buffer, 7);
 153+ if (result < 7) { /* NOTE(review): a short read (e.g. near end of file) or a read error exits with status -1 and prints nothing */
 154+ /* fprintf(stderr,"read of file failed\n"); */
 155+ exit(-1);
 156+ }
 157+ }
 158+ else {
 159+ bfile->block_start = bfile->position; /* record the offset at which the marker was found */
 160+ return(1);
 161+ }
 162+ }
 163+ return(0);
 164+}
 165+
 166+/*
 167+ initializes the bz2 strm structure,
 168+ calls the BZ2 decompression library initializer
 169+
 170+ returns:
 171+ BZ_OK on success
 172+ does not return on failure: prints the BZ_ error code (see bzlib.h) and calls exit(-1)
 173+*/
 174+int init_decompress(bz_info_t *bfile) {
 175+ int bz_verbosity = 0;
 176+ int bz_small = 0;
 177+ int ret;
 178+
 179+ bfile->strm.bzalloc = NULL; /* NULL allocator hooks: presumably selects the library defaults - see the libbzip2 manual */
 180+ bfile->strm.bzfree = NULL;
 181+ bfile->strm.opaque = NULL;
 182+
 183+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
 184+ if (ret != BZ_OK) {
 185+ fprintf(stderr,"uncompress failed, err %d\n", ret);
 186+ exit(-1);
 187+ }
 188+ return(ret);
 189+}
 190+
 191+/*
 192+ reads the first 4 bytes from a bz2 file (should be
 193+ "BZh" followed by the block size indicator, typically "9")
 194+ and passes them into the BZ2 decompression library.
 195+ This must be done before decompression of any block of the
 196+ file is attempted.
 197+
 198+ returns:
 199+ BZ_OK or BZ_STREAM_END if successful,
 200+ does not return on a short read or any other BZ_ result: prints a message and calls exit(-1)
 201+*/
 202+int decompress_header(int fin, bz_info_t *bfile) {
 203+ int ret, res;
 204+
 205+ res = lseek(fin,0,SEEK_SET);
 206+ if (res == -1) { /* NOTE(review): a failed seek is only reported; execution continues with the read below */
 207+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
 208+ }
 209+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
 210+ if (bfile->bytes_read < 4) {
 211+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
 212+ exit(-1);
 213+ }
 214+ bfile->strm.next_in = (char *)bfile->header_buffer;
 215+ bfile->strm.avail_in = 4;
 216+
 217+ ret = BZ2_bzDecompress ( &(bfile->strm) ); /* NOTE(review): strm.next_out and strm.avail_out are not set in this function - confirm the caller has set them or that no output can be produced from the header alone */
 218+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
 219+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
 220+ exit(-1);
 221+ }
 222+ return(ret);
 223+}
 224+
 225+/*
 226+ seek to appropriate offset as specified in bfile,
 227+ read compressed data into buffer indicated by bfile,
 228+ update the bfile structure accordingly,
 229+ save the overflow byte (bit-shifted data = suck)
 230+ this is for the *first* buffer of data in a stream,
 231+ for subsequent buffers use fill_buffer_to_decompress()
 232+
 233+ this will set bfile->eof on eof. no other indicator
 234+ will be provided. a failed read (-1) is handled the same way as eof.
 235+
 236+ returns:
 237+ 0 on success
 238+ -1 if the lseek fails
 239+*/
 240+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
 241+ int res;
 242+
 243+ if (bfile->bits_shifted == 0) { /* byte-aligned marker was matched one byte into the 7-byte window, so the block starts at position+1 */
 244+ res = lseek(fin,bfile->position+1,SEEK_SET);
 245+ if (res == -1) {
 246+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
 247+ return(-1);
 248+ }
 249+ }
 250+ else {
 251+ res = lseek(fin,bfile->position,SEEK_SET);
 252+ if (res == -1) {
 253+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
 254+ return(-1);
 255+ }
 256+ }
 257+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
 258+ if (bfile->bytes_read > 0) {
 259+ bfile->overflow = bfile->bufin[bfile->bytes_read-1]; /* keep the unshifted last byte: its low bits belong in front of the next chunk */
 260+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
 261+
 262+ bfile->strm.next_in = (char *)(bfile->bufin);
 263+ bfile->strm.avail_in = bfile->bytes_read-1; /* the last byte is held back as the overflow byte */
 264+ }
 265+ if (bfile->bytes_read <=0) {
 266+ bfile->eof++;
 267+ }
 268+ return(0);
 269+}
 270+
 271+/*
 272+ read compressed data into buffer indicated by bfile,
 273+ from current position of file,
 274+ stuffing the overflow byte in first.
 275+ update the bfile structure accordingly
 276+ save the new overflow byte (bit-shifted data = suck)
 277+ this function is for decompression of buffers *after
 278+ the first one*. for the first one use
 279+ setup_first_buffer_to_decompress()
 280+
 281+ this will set bfile->eof on eof. no other indicator
 282+ will be provided. does nothing unless strm.avail_in is 0.
 283+
 284+ returns:
 285+ 0 always; the ret parameter is not used
 286+ a failed read (-1) is handled the same way as eof
 287+*/
 288+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
 289+ if (bfile->strm.avail_in == 0) {
 290+ bfile->strm.next_in = (char *)(bfile->bufin);
 291+ bfile->bufin[0] = bfile->overflow; /* previous chunk's held-back byte goes first */
 292+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
 293+ if (bfile->bytes_read > 0) {
 294+ bfile->position+=bfile->bytes_read;
 295+ bfile->overflow = bfile->bufin[bfile->bytes_read]; /* last byte now in bufin (index bytes_read because of the prepended byte), saved before shifting */
 296+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
 297+ bfile->strm.avail_in = bfile->bytes_read; /* bytes_read+1 bytes are in bufin; the last one is held back again */
 298+ }
 299+ else {
 300+ bfile->strm.avail_in = 1; /* the overflow byte; note it is not passed through shift_bytes_left on this path */
 301+ bfile->eof++;
 302+ }
 303+ }
 304+ return(0);
 305+}
 306+
 307+/* allocate and return an empty output buffer. size of buffer is bytes usable. there will be a null byte at the end
 308+
 309+ what we do with the buffer:
 310+ - read from front of buffer to end,
 311+ - fill from point where prev read did not fill buffer, or from where
 312+ move of data at end of buffer to beginning left room,
 313+ - mark a string of bytes (starting from what's available to read) as "read"
 314+
 315+*/
 316+buf_info_t *init_buffer(int size) {
 317+ buf_info_t *b;
 318+
 319+ b = (buf_info_t *)malloc(sizeof(buf_info_t)); /* NOTE(review): neither malloc result is checked for NULL before use */
 320+ b->buffer = malloc(sizeof(unsigned char)*(size+1)); /* one extra byte for the terminating null */
 321+ b->buffer[size]='\0';
 322+ b->end = b->buffer + size; /* one past the last usable byte, i.e. the address of the terminating null */
 323+ b->next_to_read = b->end; /* nothing available */
 324+ b->bytes_avail = 0; /* bytes to read, nothing available */
 325+ b->next_to_fill = b->buffer; /* empty */
 326+ b->next_to_fill[0] = '\0';
 327+ return(b);
 328+}
 329+
 330+/* check if buffer (used for decompressed data output) is empty, meaning b->bytes_avail is 0;
 331+ returns 1 if so and 0 if not */
 332+int buffer_is_empty(buf_info_t *b) {
 333+ if (b->bytes_avail == 0) {
 334+ return(1);
 335+ }
 336+ else {
 337+ return(0);
 338+ }
 339+}
 340+
 341+/* check if buffer (used for decompressed data output) is full,
 342+ meaning the fill pointer has reached b->end
 343+ returns 1 if so and 0 if not
 344+ I'm not liking this function so well, fixme */
 345+int buffer_is_full(buf_info_t *b) {
 346+ if (b->next_to_fill == b->end) {
 347+ return(1);
 348+ }
 349+ else {
 350+ return(0);
 351+ }
 352+}
 353+
 354+/* FIXME do this right. whatever. returns the file length found by seeking to the end; exits on lseek failure. leaves the file offset at end of file. */
 355+int get_file_size(int fin) {
 356+ int res;
 357+
 358+ res = lseek(fin, 0, SEEK_END); /* NOTE(review): lseek returns off_t; storing it in an int may truncate for files larger than INT_MAX bytes - confirm expected file sizes */
 359+ if (res == -1) {
 360+ fprintf(stderr,"lseek of file to 0 failed (6)\n"); /* message says "to 0" but the seek above is to end of file */
 361+ exit(-1);
 362+ }
 363+ return(res);
 364+}
 365+
 366+
 367+/*
 368+ set up the marker, seek to right place, get first
 369+ buffer of compressed data for processing
 370+ bfile->position must be set to desired offset first by caller.
 371+ returns:
 372+ -1 if no marker was found, 0 if ok; seek errors and a position past end of file call exit(-1)
 373+*/
 374+int init_bz2_file(bz_info_t *bfile, int fin) {
 375+ int res;
 376+
 377+ bfile->bufin_size = BUFINSIZE;
 378+ bfile->marker = init_marker();
 379+ bfile->bytes_read = 0;
 380+ bfile->bytes_written = 0;
 381+ bfile->eof = 0;
 382+
 383+ bfile->initialized++; /* set before the marker search, so it stays set even when -1 is returned below */
 384+
 385+ bfile->file_size = get_file_size(fin);
 386+ if (bfile->position > bfile->file_size) {
 387+ fprintf(stderr,"asked for position past end of file\n");
 388+ exit(-1);
 389+ }
 390+ res = lseek(fin, bfile->position, SEEK_SET);
 391+ if (res == -1) {
 392+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 393+ exit(-1);
 394+ }
 395+
 396+ find_next_bz2_block_marker(fin, bfile); /* return value unused; success is detected through bfile->bits_shifted below */
 397+ if (bfile->bits_shifted >= 0) {
 398+ /* fprintf(stderr,"marker bits shifted by is %d\n",bfile->bits_shifted); */
 399+ init_decompress(bfile);
 400+ decompress_header(fin, bfile);
 401+ setup_first_buffer_to_decompress(fin, bfile); /* NOTE(review): a -1 return (seek failure) from this call is ignored */
 402+ return(0);
 403+ }
 404+ return(-1);
 405+}
 406+
 407+/* get the next buffer of uncompressed stuff: decompress into bufferout, initializing bfile on first use; returns 0 on success with the count in bfile->bytes_written, -1 on error */
 408+int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
 409+ int ret;
 410+
 411+ bfile->bufout = bufferout;
 412+ bfile->bufout_size = bufout_size;
 413+ bfile->bytes_written = 0;
 414+
 415+ if (! bfile->initialized) {
 416+ if (init_bz2_file(bfile, fin) == -1) {
 417+ fprintf(stderr,"failed to initialize bz2file\n");
 418+ return(-1);
 419+ };
 420+ bfile->strm.next_out = (char *)bfile->bufout; /* output pointers are set here on the first call only; on later calls the values already in strm are used */
 421+ bfile->strm.avail_out = bfile->bufout_size;
 422+ }
 423+
 424+ ret = BZ_OK;
 425+ while (BZ_OK == ret && bfile->bytes_written == 0) { /* keep feeding input until some output appears or the result is no longer BZ_OK */
 426+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 427+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 428+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout; /* distance the library advanced next_out from the start of bufout */
 429+ }
 430+ else {
 431+ fprintf(stderr,"error from BZ decompress %d\n",ret);
 432+ return(-1);
 433+ }
 434+ fill_buffer_to_decompress(fin, bfile, ret); /* refills the input buffer only if the library consumed it all */
 435+ /*
 436+ if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
 437+ fprintf(stderr,"eof reached\n");
 438+ }
 439+ */
 440+ }
 441+ return(0);
 442+}
 443+
 444+/*
 445+ fill output buffer in b with uncompressed data from bfile
 446+ if this is the first call to the function for this file,
 447+ the file header will be read, and the first buffer of
 448+ uncompressed data will be prepared. bfile->position
 449+ should be set to the offset (from the beginning of file) from
 450+ which to find the first bz2 block.
 451+
 452+ returns:
 453+ 0 on success (the number of bytes added is in bfile->bytes_written, may be 0), also 0 if the buffer was already full
 454+ -1 on error
 455+*/
 456+int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
 457+ int res;
 458+
 459+ if (buffer_is_full(b)) {
 460+ fprintf(stdout,"DEBUG buffer full\n"); /* NOTE(review): this goes to stdout, the same stream the callers in this file write decompressed data to */
 461+ return(0);
 462+ }
 463+
 464+ if (buffer_is_empty(b)) {
 465+ b->next_to_fill = b->buffer; /* nothing left to read, so start filling from the front again */
 466+ }
 467+
 468+ res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
 469+ if (res <0 ) {
 470+ return(res);
 471+ }
 472+ if (bfile->bytes_written < 0) { /* NOTE(review): decompress_data sets bytes_written to 0 or to a pointer difference, so this branch looks unreachable - confirm */
 473+ fprintf(stderr,"read of file failed\n");
 474+ return(-1);
 475+ }
 476+ else {
 477+ /* really?? FIXME check this */
 478+ if (buffer_is_empty(b)) {
 479+ b->next_to_read = b->next_to_fill; /* where we just read */
 480+ }
 481+ b->bytes_avail += bfile->bytes_written;
 482+ b->next_to_fill += bfile->bytes_written;
 483+ b->next_to_fill[0] = '\0'; /* keep the filled region null-terminated so it can be handed to regexec as a string */
 484+ return(0);
 485+ }
 486+}
 487+
 488+void dumpbuf_info_t(buf_info_t *b) { /* debugging aid: print the fields of b to stdout, pointers shown as integers */
 489+ fprintf(stdout, "\n");
 490+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer); /* NOTE(review): casting a pointer to long int assumes long is wide enough to hold a pointer */
 491+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
 492+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
 493+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
 494+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
 495+}
 496+
 497+/*
 498+ copy text from end of buffer to the beginning, that we want to keep
 499+ around for further processing (i.e. further regex matches)
 500+ returns number of bytes copied (0 if fromwhere is at or past b->end); a maxbytes of 0 means no limit
 501+*/
 502+int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *fromwhere, int maxbytes) {
 503+ int i, tocopy;
 504+
 505+ if (fromwhere >= b->end) {
 506+ return(0);
 507+ }
 508+ else {
 509+ tocopy = b->end - fromwhere; /* measured up to b->end, not b->next_to_fill, so callers limit it with maxbytes */
 510+ if (maxbytes && (tocopy > maxbytes)) {
 511+ tocopy = maxbytes;
 512+ }
 513+ for (i = 0; i < tocopy; i++) { /* forward byte-by-byte copy towards the start of the same buffer */
 514+ b->buffer[i] = fromwhere[i];
 515+ }
 516+ b->next_to_fill = b->buffer + tocopy; /* buffer now holds exactly the kept bytes, readable from the front */
 517+ b->next_to_fill[0] = '\0';
 518+ b->next_to_read = b->buffer;
 519+ b->bytes_avail = tocopy;
 520+ return(tocopy);
 521+ }
 522+}
 523+
 524+/*
 525+ dump the <mediawiki> header (up through
 526+ </siteinfo> close tag) found at the
 527+ beginning of xml dump files, to stdout.
 528+ returns:
 529+ 0 on success,
 530+ -1 if the header is missing or incomplete
 531+*/
 532+int dump_mw_header(int fin) {
 533+ int res;
 534+ regmatch_t *match_siteinfo;
 535+ regex_t compiled_siteinfo;
 536+ int length=5000; /* output buffer size */
 537+ char *siteinfo = " </siteinfo>\n";
 538+
 539+ buf_info_t *b;
 540+ bz_info_t bfile;
 541+
 542+ int firstpage = 1;
 543+ int done = 0;
 544+ bfile.initialized = 0;
 545+
 546+ res = regcomp(&compiled_siteinfo, siteinfo, REG_EXTENDED); /* NOTE(review): res is never checked, and no regfree is called in this function */
 547+
 548+ match_siteinfo = (regmatch_t *)malloc(sizeof(regmatch_t)*1); /* NOTE(review): room for one regmatch_t, but the regexec calls below pass nmatch 2 - confirm against the POSIX regexec contract; also never freed here */
 549+
 550+ b = init_buffer(length);
 551+ bfile.bytes_read = 0;
 552+ bfile.position = 0; /* search for the first block from the start of the file */
 553+
 554+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof) && (!done)) {
 555+ /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
 556+ if (bfile.bytes_read) {
 557+ if (firstpage) {
 558+ if (bfile.bytes_read >= 11 && !memcmp((char *)b->next_to_read,"<mediawiki ",11)) { /* note: bytes_read counts compressed input bytes, while the memcmp looks at decompressed output */
 559+ /* good, write it and loop and not firstpage any more */
 560+ if (b->bytes_avail) {
 561+ if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {
 562+ fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout); /* write everything up to the end of the </siteinfo> match as a single item */
 563+ b->next_to_read = b->end;
 564+ b->bytes_avail = 0;
 565+ b->next_to_fill = b->buffer; /* empty */
 566+ bfile.strm.next_out = (char *)b->next_to_fill;
 567+ bfile.strm.avail_out = b->end - b->next_to_fill;
 568+ done++;
 569+ }
 570+ else {
 571+ fwrite(b->next_to_read,b->bytes_avail,1,stdout); /* no close tag yet: flush the whole buffer and reset it */
 572+ b->next_to_read = b->end;
 573+ b->bytes_avail = 0;
 574+ b->next_to_fill = b->buffer; /* empty */
 575+ bfile.strm.next_out = (char *)b->next_to_fill;
 576+ bfile.strm.avail_out = b->end - b->next_to_fill;
 577+ }
 578+ }
 579+ }
 580+ else {
 581+ fprintf(stderr,"missing mediawiki header from bz2 xml file\n");
 582+ return(-1);
 583+ }
 584+ firstpage = 0;
 585+ }
 586+ else { /* not firstpage */
 587+ if (regexec(&compiled_siteinfo, (char *)b->next_to_read, 2, match_siteinfo, 0 ) == 0) {
 588+ fwrite(b->next_to_read,match_siteinfo[0].rm_eo, 1, stdout);
 589+ b->next_to_read = b->end;
 590+ b->bytes_avail = 0;
 591+ b->next_to_fill = b->buffer; /* empty */
 592+ bfile.strm.next_out = (char *)b->next_to_fill;
 593+ bfile.strm.avail_out = b->end - b->next_to_fill;
 594+ done++;
 595+ }
 596+ else {
 597+ /* could have the first part of the siteinfo tag... so copy up enough bytes to cover that case */
 598+ if (b->bytes_avail> 12) {
 599+ /* write everything that didn't match, but leave 12 bytes, to stdout */
 600+ fwrite(b->next_to_read,b->bytes_avail - 12,1,stdout);
 601+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 12, 12);
 602+ bfile.strm.next_out = (char *)b->next_to_fill; /* next decompress appends right after the 12 kept bytes */
 603+ bfile.strm.avail_out = b->end - b->next_to_fill;
 604+ }
 605+ else {
 606+ if (buffer_is_empty(b)) {
 607+ bfile.strm.next_out = (char *)b->buffer;
 608+ bfile.strm.avail_out = bfile.bufout_size;
 609+ b->next_to_fill = b->buffer; /* empty */
 610+ }
 611+ else {
 612+ /* there were only 12 or less bytes so just save em don't write em to stdout */
 613+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
 614+ bfile.strm.next_out = (char *)b->next_to_fill;
 615+ bfile.strm.avail_out = b->end - b->next_to_fill;
 616+ }
 617+ }
 618+ }
 619+ } /* end notfirstpage */
 620+ }
 621+ }
 622+ if (!done) {
 623+ fprintf(stderr,"incomplete or no mediawiki header found\n");
 624+ return(-1);
 625+ }
 626+ else {
 627+ return(0);
 628+ }
 629+}
 630+
 631+/*
 632+ find the first <page> tag in the data decompressed from the first bz2 block at or after position in file
 633+ decompress and dump to stdout from that point on
 634+ returns:
 635+ 0 in every case visible here;
 636+ there is no -1 return in this function (a decompression error just ends the loop)
 637+*/
 638+int dump_from_first_page_id_after_offset(int fin, int position) {
 639+ int res;
 640+ regmatch_t *match_page;
 641+ regex_t compiled_page;
 642+ int length=5000; /* output buffer size */
 643+ char *page = " <page>";
 644+
 645+ buf_info_t *b;
 646+ bz_info_t bfile;
 647+
 648+ int firstpage = 1;
 649+
 650+ bfile.initialized = 0;
 651+
 652+ res = regcomp(&compiled_page, page, REG_EXTENDED); /* NOTE(review): res is never checked, and no regfree is called in this function */
 653+
 654+ match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1); /* NOTE(review): room for one regmatch_t, but the regexec call below passes nmatch 2 - confirm against the POSIX regexec contract; also never freed here */
 655+
 656+ b = init_buffer(length);
 657+ bfile.bytes_read = 0;
 658+ bfile.position = position; /* the block search starts at the caller's offset */
 659+
 660+ while ((get_buffer_of_uncompressed_data(b, fin, &bfile)>=0) && (! bfile.eof)) {
 661+ /* fixme either we don't check the return code right or we don't notice no bytes read or we don't clear the bytes read */
 662+ if (bfile.bytes_read) {
 663+ if (firstpage) { /* still looking for the first <page> tag; nothing before it should end up in the output */
 664+ if (regexec(&compiled_page, (char *)b->next_to_read, 2, match_page, 0 ) == 0) {
 665+ fwrite(b->next_to_read+match_page[0].rm_so,b->next_to_fill - (b->next_to_read+match_page[0].rm_so), 1, stdout); /* write from the start of the match to the end of the filled data */
 666+ b->next_to_read = b->end;
 667+ b->bytes_avail = 0;
 668+ b->next_to_fill = b->buffer; /* empty */
 669+ bfile.strm.next_out = (char *)b->next_to_fill;
 670+ bfile.strm.avail_out = b->end - b->next_to_fill;
 671+ firstpage = 0;
 672+ }
 673+ else {
 674+ /* could have the first part of the page tag... so copy up enough bytes to cover that case */
 675+ if (b->bytes_avail> 7) {
 676+ /* write everything that didn't match, but leave 7 bytes, to stdout */
 677+ fwrite(b->next_to_read,b->bytes_avail - 7,1,stdout); /* NOTE(review): this emits data that precedes the first <page> tag - confirm that is intended */
 678+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 7, 7);
 679+ bfile.strm.next_out = (char *)b->next_to_fill;
 680+ bfile.strm.avail_out = b->end - b->next_to_fill;
 681+ }
 682+ else {
 683+ if (buffer_is_empty(b)) {
 684+ bfile.strm.next_out = (char *)b->buffer;
 685+ bfile.strm.avail_out = bfile.bufout_size;
 686+ b->next_to_fill = b->buffer; /* empty */
 687+ }
 688+ else {
 689+ /* there were only 7 or less bytes so just save em don't write em to stdout */
 690+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
 691+ bfile.strm.next_out = (char *)b->next_to_fill;
 692+ bfile.strm.avail_out = b->end - b->next_to_fill;
 693+ }
 694+ }
 695+ }
 696+ }
 697+ else { /* first <page> already written: pass everything through and reset the buffer */
 698+ if (b->bytes_avail) {
 699+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
 700+ b->next_to_read = b->end;
 701+ b->bytes_avail = 0;
 702+ b->next_to_fill = b->buffer; /* empty */
 703+ bfile.strm.next_out = (char *)b->next_to_fill;
 704+ bfile.strm.avail_out = b->end - b->next_to_fill;
 705+ }
 706+ }
 707+ }
 708+ }
 709+ if (b->bytes_avail) { /* flush whatever is still buffered once the loop ends */
 710+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
 711+ b->next_to_read = b->end;
 712+ b->bytes_avail = 0;
 713+ b->next_to_fill = b->buffer; /* empty */
 714+ bfile.strm.next_out = (char *)b->next_to_fill;
 715+ bfile.strm.avail_out = b->end - b->next_to_fill;
 716+ }
 717+ return(0);
 718+}
 719+
 720+/*
 721+ find the first bz2 block after the specified offset,
 722+ uncompress from that point on, write out the
 723+ contents starting with the first <page> tag,
 724+ prefacing first with the <mediawiki> header from
 725+ the beginning of the file, up through </siteinfo>.
 726+
 727+ note that we may lose some bytes from the very last
 728+ block if the blocks are bit shifted, because the
 729+ bzip crc at end of file will be wrong. (needs testing to
 730+ find a workaround, simply not feeding in the crc doesn't
 731+ suffice)
 732+
 733+ for purposes of the XML dumps this is fine, since we use
 734+ this tool to generate prefetch data starting from
 735+ a given pageid, rather than needing to uncompress
 736+ gigabytes of data to get to the point in the file
 737+ we want.
 738+
 739+ exit status:
 740+ the result of dump_from_first_page_id_after_offset, or -1 on a usage or open error.
 741+*/
 742+int main(int argc, char **argv) {
 743+ int fin, position, res;
 744+
 745+ if (argc != 3) {
 746+ fprintf(stderr,"usage: %s infile position\n", argv[0]);
 747+ exit(-1);
 748+ }
 749+
 750+ fin = open (argv[1], O_RDONLY);
 751+ if (fin < 0) {
 752+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 753+ exit(-1);
 754+ }
 755+
 756+ position = atoi(argv[2]); /* NOTE(review): atoi gives no error indication, so non-numeric input is not detected here, and an int limits the offset range - confirm expected file sizes */
 757+ if (position <0) {
 758+ fprintf(stderr,"please specify a position >= 0.\n");
 759+ fprintf(stderr,"usage: %s infile position\n", argv[0]);
 760+ exit(-1);
 761+ }
 762+ /* first the <mediawiki> header from the start of the file, then everything from the first <page> found after position */
 763+ res = dump_mw_header(fin); /* NOTE(review): this result is overwritten by the next call, so a header failure does not affect the exit status */
 764+
 765+ res = dump_from_first_page_id_after_offset(fin, position);
 766+ exit(res);
 767+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c
___________________________________________________________________
Added: svn:eol-style
1768 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
@@ -0,0 +1,842 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include <sys/types.h>
 11+#include <regex.h>
 12+#include "bzlib.h"
 13+#include "findpageidinbz2xml.h"
 14+
 15+/* return n ones either at left or right end */
 16+int bitmask(int numbits, int end) {
 17+ if (end == MASKRIGHT) {
 18+ return((1<<numbits)-1);
 19+ }
 20+ else {
 21+ return(((1<<numbits)-1) << (8-numbits));
 22+ }
 23+}
 24+
 25+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
 26+ int i;
 27+
 28+ if (numbits == 0) {
 29+ return;
 30+ }
 31+
 32+ for (i=0; i<buflen; i++) {
 33+ /* left 1 */
 34+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
 35+
 36+ /* grab leftmost from next byte */
 37+ if (i < buflen-1) {
 38+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );
 39+ }
 40+ }
 41+}
 42+
 43+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
 44+ int i;
 45+
 46+ for (i=buflen-1; i>=0; i--) {
 47+ /* right 1 */
 48+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
 49+
 50+ /* grab rightmost from prev byte */
 51+ if (i > 0) {
 52+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));
 53+ }
 54+ }
 55+}
 56+
 57+unsigned char ** init_marker() {
 58+ unsigned char **marker = malloc(8*sizeof(unsigned char *));
 59+ int i;
 60+
 61+ /* set up block marker plus its various right-shifted incarnations */
 62+ for (i = 0; i< 8; i++) {
 63+ marker[i] = malloc(sizeof(unsigned char)*7);
 64+ }
 65+ marker[0][0]= (unsigned char) 0x31;
 66+ marker[0][1]= (unsigned char) 0x41;
 67+ marker[0][2]= (unsigned char) 0x59;
 68+ marker[0][3]= (unsigned char) 0x26;
 69+ marker[0][4]= (unsigned char) 0x53;
 70+ marker[0][5]= (unsigned char) 0x59;
 71+ marker[0][6]= (unsigned char) 0x00;
 72+ for (i = 1; i< 8; i++) {
 73+ memcpy((char *)(marker[i]), (char *)(marker[i-1]),7);
 74+ shift_bytes_right(marker[i],7,1);
 75+ }
 76+ return(marker);
 77+}
 78+
 79+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
 80+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
 81+ matches and 0 otherwise. */
 82+int bytes_compare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
 83+ int i;
 84+
 85+ if (bitsrightshifted == 0) {
 86+ for (i = 0; i< numbytes; i++) {
 87+ if (buff1[i] != buff2[i]) {
 88+ return(1);
 89+ }
 90+ }
 91+ return(0);
 92+ }
 93+ else {
 94+ for (i = 1; i< numbytes-2; i++) {
 95+ if (buff1[i] != buff2[i]) {
 96+ return(1);
 97+ }
 98+ }
 99+ /* do leftmost byte */
 100+ if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {
 101+ return(1);
 102+ }
 103+ /* do rightmost byte */
 104+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {
 105+ return(1);
 106+ }
 107+ return(0);
 108+ }
 109+}
 110+
 111+
 112+/* return -1 if no match
 113+ return number of bits rightshifted otherwise */
 114+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
 115+ int result, i;
 116+
 117+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
 118+ if (!result) {
 119+ return(0);
 120+ }
 121+ for (i=1; i<8; i++) {
 122+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
 123+ if (!result) {
 124+ return(i);
 125+ }
 126+ }
 127+ return(-1);
 128+}
 129+
 130+
 131+/* return: 1 if found, 0 if not, -1 on error */
 132+int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
 133+ int result;
 134+
 135+ bfile->bits_shifted = -1;
 136+ result = read(fin, bfile->marker_buffer, 7);
 137+ if (result == -1) {
 138+ /* fprintf(stderr,"read of file failed\n"); */
 139+ return(-1);
 140+ }
 141+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
 142+ of data read in case some bits have been shifted into it */
 143+ while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
 144+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
 145+ if (bfile->bits_shifted < 0) {
 146+ bfile->position++;
 147+ result = lseek(fin, (bfile->position), SEEK_SET);
 148+ if (result == -1) {
 149+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
 150+ return(-1);
 151+ }
 152+ result = read(fin, bfile->marker_buffer, 7);
 153+ if (result < 7) {
 154+ /* fprintf(stderr,"read of file failed\n"); */
 155+ return(-1);
 156+ }
 157+ }
 158+ else {
 159+ bfile->block_start = bfile->position;
 160+ return(1);
 161+ }
 162+ }
 163+ return(0);
 164+}
 165+
 166+/*
 167+ initializes the bz2 strm structure,
 168+ calls the BZ2 decompression library initializer
 169+
 170+ returns:
 171+ BZ_OK on success
 172+ various BZ_ errors on failure (see bzlib.h)
 173+*/
 174+int init_decompress(bz_info_t *bfile) {
 175+ int bz_verbosity = 0;
 176+ int bz_small = 0;
 177+ int ret;
 178+
 179+ bfile->strm.bzalloc = NULL;
 180+ bfile->strm.bzfree = NULL;
 181+ bfile->strm.opaque = NULL;
 182+
 183+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
 184+ if (ret != BZ_OK) {
 185+ fprintf(stderr,"uncompress failed, err %d\n", ret);
 186+ exit(-1);
 187+ }
 188+ return(ret);
 189+}
 190+
 191+/*
 192+ reads the first 4 bytes from a bz2 file (should be
 193+ "BZh" followed by the block size indicator, typically "9")
 194+ and passes them into the BZ2 decompression library.
 195+ This must be done before decompression of any block of the
 196+ file is attempted.
 197+
 198+ returns:
 199+ BZ_OK if successful,
 200+ various BZ_ errors on failure (see bzlib.h)
 201+*/
 202+int decompress_header(int fin, bz_info_t *bfile) {
 203+ int ret, res;
 204+
 205+ res = lseek(fin,0,SEEK_SET);
 206+ if (res == -1) {
 207+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
 208+ exit(-1);
 209+ }
 210+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
 211+ if (bfile->bytes_read < 4) {
 212+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
 213+ exit(-1);
 214+ }
 215+ bfile->strm.next_in = (char *)bfile->header_buffer;
 216+ bfile->strm.avail_in = 4;
 217+
 218+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 219+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
 220+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
 221+ exit(-1);
 222+ }
 223+ return(ret);
 224+}
 225+
 226+/*
 227+ seek to appropriate offset as specified in bfile,
 228+ read compressed data into buffer indicated by bfile,
 229+ update the bfile structure accordingly,
 230+ save the overflow byte (bit-shifted data = suck)
 231+ this is for the *first* buffer of data in a stream,
 232+ for subsequent buffers use fill_buffer_to_decompress()
 233+
 234+ this will set bfile->eof on eof. no other indicator
 235+ will be provided.
 236+
 237+ returns:
 238+ 0 on success
 239+ -1 on error
 240+*/
 241+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
 242+ int res;
 243+
 244+ if (bfile->bits_shifted == 0) {
 245+ res = lseek(fin,bfile->position+1,SEEK_SET);
 246+ if (res == -1) {
 247+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
 248+ return(-1);
 249+ }
 250+ }
 251+ else {
 252+ res = lseek(fin,bfile->position,SEEK_SET);
 253+ if (res == -1) {
 254+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
 255+ return(-1);
 256+ }
 257+ }
 258+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
 259+ if (bfile->bytes_read > 0) {
 260+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
 261+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
 262+
 263+ bfile->strm.next_in = (char *)(bfile->bufin);
 264+ bfile->strm.avail_in = bfile->bytes_read-1;
 265+ }
 266+ if (bfile->bytes_read <=0) {
 267+ bfile->eof++;
 268+ }
 269+ return(0);
 270+}
 271+
 272+/*
 273+ read compressed data into buffer indicated by bfile,
 274+ from current position of file,
 275+ stuffing the overflow byte in first.
 276+ update the bfile structure accordingly
 277+ save the new overflow byte (bit-shifted data = suck)
 278+ this function is for decompression of buffers *after
 279+ the first one*. for the first one use
 280+ setup_first_buffer_to_decompress()
 281+
 282+ this will set bfile->eof on eof. no other indicator
 283+ will be provided.
 284+
 285+ returns:
 286+ 0 on success
 287+ hmm, it really does not do anything about errors :-D
 288+*/
 289+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
 290+ if (bfile->strm.avail_in == 0) {
 291+ bfile->strm.next_in = (char *)(bfile->bufin);
 292+ bfile->bufin[0] = bfile->overflow;
 293+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
 294+ if (bfile->bytes_read > 0) {
 295+ bfile->overflow = bfile->bufin[bfile->bytes_read];
 296+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
 297+ bfile->strm.avail_in = bfile->bytes_read;
 298+ bfile->position+=bfile->bytes_read;
 299+ }
 300+ else {
 301+ bfile->strm.avail_in = 1; /* the overflow byte */
 302+ bfile->eof++;
 303+ }
 304+ }
 305+ return(0);
 306+}
 307+
 308+/* size of buffer is bytes usable. there will be a null byte at the end
 309+
 310+ what we do with the buffer:
 311+ - read from front of buffer to end,
 312+ - fill from point where prev read did not fill buffer, or from where
 313+ move of data at end of buffer to beginning left room,
 314+ - mark a string of bytes (starting from what's available to read) as "read"
 315+
 316+*/
 317+buf_info_t *init_buffer(int size) {
 318+ buf_info_t *b;
 319+
 320+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
 321+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
 322+ b->buffer[size]='\0';
 323+ b->end = b->buffer + size;
 324+ b->next_to_read = b->end; /* nothing available */
 325+ b->bytes_avail = 0; /* bytes to read, nothing available */
 326+ b->next_to_fill = b->buffer; /* empty */
 327+ b->next_to_fill[0] = '\0';
 328+ return(b);
 329+}
 330+
 331+/* check if buffer (used for decompressed data output) is empty,
 332+ returns 1 if so and 0 if not */
 333+int buffer_is_empty(buf_info_t *b) {
 334+ if (b->bytes_avail == 0) {
 335+ return(1);
 336+ }
 337+ else {
 338+ return(0);
 339+ }
 340+}
 341+
 342+/* check if buffer (used for decompressed data output) is full,
 343+
 344+ returns 1 if so and 0 if not
 345+ I'm not liking this function so well, fixme */
 346+int buffer_is_full(buf_info_t *b) {
 347+ if (b->next_to_fill == b->end) {
 348+ return(1);
 349+ }
 350+ else {
 351+ return(0);
 352+ }
 353+}
 354+
 355+/* FIXME do this right. whatever. */
 356+int get_file_size(int fin) {
 357+ int res;
 358+
 359+ res = lseek(fin, 0, SEEK_END);
 360+ if (res == -1) {
 361+ fprintf(stderr,"lseek of file to 0 failed (6)\n");
 362+ exit(-1);
 363+ }
 364+ return(res);
 365+}
 366+
 367+
 368+/*
 369+ look for the first bz2 block in the file after specified offset
 370+ it tests that the block is valid by doing partial decompression.
 371+ this function will update the bfile structure:
 372+ bfile->position will contain the current position of the file (? will it?)
 373+ bfile->bits_shifted will contain the number of bits that the block is rightshifted
 374+ bfile->block_start will contain the offset from start of file to the block
 375+ returns:
 376+ position of next byte in file to be read, on success
 377+ -1 if no marker or other error
 378+*/
 379+int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {
 380+ int res;
 381+
 382+ bfile->bufin_size = BUFINSIZE;
 383+ bfile->marker = init_marker();
 384+ bfile->position = position;
 385+ bfile->block_start = -1;
 386+ bfile->bytes_read = 0;
 387+ bfile->bytes_written = 0;
 388+ bfile->eof = 0;
 389+ bfile->bits_shifted = -1;
 390+
 391+ bfile->file_size = get_file_size(fin);
 392+
 393+ while (bfile->bits_shifted < 0) {
 394+ if (bfile->position > bfile->file_size) {
 395+ return(-1);
 396+ }
 397+ res = lseek(fin, bfile->position, SEEK_SET);
 398+ if (res == -1) {
 399+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 400+ exit(-1);
 401+ }
 402+ res = find_next_bz2_block_marker(fin, bfile);
 403+ if (res == 1) {
 404+ init_decompress(bfile);
 405+ decompress_header(fin, bfile);
 406+ res = setup_first_buffer_to_decompress(fin, bfile);
 407+ if (res == -1) {
 408+ fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
 409+ exit(-1);
 410+ }
 411+ bfile->strm.next_out = (char *)bfile->bufout;
 412+ bfile->strm.avail_out = bfile->bufout_size;
 413+ res = BZ2_bzDecompress ( &(bfile->strm) );
 414+ /* this means we (probably) have a genuine marker */
 415+ if (BZ_OK == res || BZ_STREAM_END == res) {
 416+ res = BZ2_bzDecompressEnd ( &(bfile->strm) );
 417+ bfile->bytes_read = 0;
 418+ bfile->bytes_written = 0;
 419+ bfile->eof = 0;
 420+ /* leave the file at the right position */
 421+ res = lseek(fin, bfile->block_start, SEEK_SET);
 422+ if (res == -1) {
 423+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
 424+ exit(-1);
 425+ }
 426+ return(0);
 427+ }
 428+ /* right bytes, but there by chance, skip and try again */
 429+ else {
 430+ bfile->position+=6;
 431+ bfile->bits_shifted = -1;
 432+ bfile->block_start = -1;
 433+ }
 434+ }
 435+ else {
 436+ return(-1);
 437+ }
 438+ }
 439+ return(-1);
 440+}
 441+
 442+/*
 443+ find the first bz2 block marker in the file,
 444+ from its current position,
 445+ then set up for decompression from that point
 446+ returns:
 447+ 0 on success
 448+ -1 if no marker or other error
 449+*/
 450+int init_bz2_file(bz_info_t *bfile, int fin) {
 451+ int res;
 452+
 453+ bfile->initialized++;
 454+
 455+ res = find_next_bz2_block_marker(fin, bfile);
 456+ if (res ==1) {
 457+ init_decompress(bfile);
 458+ decompress_header(fin, bfile);
 459+ setup_first_buffer_to_decompress(fin, bfile);
 460+ return(0);
 461+ }
 462+ return(-1);
 463+}
 464+
 465+/* return -1 if error */
 466+int decompress_data(bz_info_t *bfile, int fin, unsigned char *bufferout, int bufout_size) {
 467+ int ret;
 468+
 469+ bfile->bufout = bufferout;
 470+ bfile->bufout_size = bufout_size;
 471+ bfile->bytes_written = 0;
 472+
 473+ if (! bfile->initialized) {
 474+ if (init_bz2_file(bfile, fin) == -1) {
 475+ /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */
 476+ return(-1);
 477+ };
 478+ bfile->strm.next_out = (char *)bfile->bufout;
 479+ bfile->strm.avail_out = bfile->bufout_size;
 480+ }
 481+
 482+ ret = BZ_OK;
 483+ while (BZ_OK == ret && bfile->bytes_written == 0) {
 484+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 485+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 486+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
 487+ }
 488+ else {
 489+ /* fprintf(stderr,"error from BZ decompress %d\n",ret); */
 490+ return(-1);
 491+ }
 492+ fill_buffer_to_decompress(fin, bfile, ret);
 493+ /*
 494+ if (bfile->eof && (BZ_OK == ret || BZ_STREAM_END == ret) ) {
 495+ fprintf(stderr,"eof reached\n");
 496+ }
 497+ */
 498+ }
 499+ return(0);
 500+}
 501+
 502+
 503+/*
 504+ fill output buffer in b with uncompressed data from bfile
 505+ if this is the first call to the function for this file,
 506+ the file header will be read, and the first buffer of
 507+ uncompressed data will be prepared. bfile->position
 508+ should be set to the offset (from the beginning of file) from
 509+ which to find the first bz2 block.
 510+
 511+ returns:
 512+ on success, number of bytes read (may be 0)
 513+ -1 on error
 514+*/
 515+int get_buffer_of_uncompressed_data(buf_info_t *b, int fin, bz_info_t *bfile) {
 516+ int res;
 517+
 518+ if (buffer_is_full(b)) {
 519+ return(0);
 520+ }
 521+
 522+ if (buffer_is_empty(b)) {
 523+ b->next_to_fill = b->buffer;
 524+ }
 525+
 526+ res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
 527+ if (res == -1) {
 528+ return(res);
 529+ }
 530+ if (bfile->bytes_written < 0) {
 531+ /* fprintf(stderr,"read of file failed\n"); */
 532+ return(-1);
 533+ }
 534+ else {
 535+ /* really?? FIXME check this */
 536+ if (buffer_is_empty(b)) {
 537+ b->next_to_read = b->next_to_fill; /* where we just read */
 538+ }
 539+ b->bytes_avail += bfile->bytes_written;
 540+ b->next_to_fill += bfile->bytes_written;
 541+ b->next_to_fill[0] = '\0';
 542+ return(0);
 543+ }
 544+}
 545+
 546+void dumpbuf_info_t(buf_info_t *b) {
 547+ fprintf(stdout, "\n");
 548+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
 549+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
 550+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
 551+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
 552+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
 553+}
 554+
 555+
 556+/*
 557+ copy text from end of buffer to the beginning, that we want to keep
 558+ around for further processing (i.e. further regex matches)
 559+ returns number of bytes copied
 560+*/
 561+int move_bytes_to_buffer_start(buf_info_t *b, unsigned char *from_where, int maxbytes) {
 562+ int i, tocopy;
 563+
 564+ if (from_where >= b->end) {
 565+ return(0);
 566+ }
 567+ else {
 568+ tocopy = b->end - from_where;
 569+ if (maxbytes && (tocopy > maxbytes)) {
 570+ tocopy = maxbytes;
 571+ }
 572+ for (i = 0; i < tocopy; i++) {
 573+ b->buffer[i] = from_where[i];
 574+ }
 575+ b->next_to_fill = b->buffer + tocopy;
 576+ b->next_to_fill[0] = '\0';
 577+ b->next_to_read = b->buffer;
 578+ b->bytes_avail = tocopy;
 579+ return(tocopy);
 580+ }
 581+}
 582+
 583+/*
 584+ get the first page id after position in file
 585+ if a pageid is found, the structure pinfo will be updated accordingly
 586+ returns:
 587+ 1 if a pageid found,
 588+ 0 if no pageid found,
 589+ -1 on error
 590+*/
 591+int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) {
 592+ int res;
 593+ regmatch_t *match_page, *match_page_id;
 594+ regex_t compiled_page, compiled_page_id;
 595+ int length=5000; /* output buffer size */
 596+ char *page = "<page>";
 597+ char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
 598+
 599+ buf_info_t *b;
 600+ bz_info_t bfile;
 601+
 602+ bfile.initialized = 0;
 603+
 604+ res = regcomp(&compiled_page, page, REG_EXTENDED);
 605+ res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);
 606+
 607+ match_page = (regmatch_t *)malloc(sizeof(regmatch_t)*1);
 608+ match_page_id = (regmatch_t *)malloc(sizeof(regmatch_t)*2);
 609+
 610+ b = init_buffer(length);
 611+
 612+ pinfo->bits_shifted = -1;
 613+ pinfo->position = -1;
 614+ pinfo->page_id = -1;
 615+
 616+ bfile.bytes_read = 0;
 617+
 618+ if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {
 619+ /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
 620+ return(-1);
 621+ }
 622+
 623+ while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {
 624+ if (bfile.bytes_read) {
 625+ while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
 626+ if (match_page_id[1].rm_so >=0) {
 627+ /* write page_id to stderr */
 628+ /*
 629+ fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);
 630+ fwrite("\n",1,1,stderr);
 631+ */
 632+ pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so));
 633+ pinfo->position = bfile.block_start;
 634+ pinfo->bits_shifted = bfile.bits_shifted;
 635+ return(1);
 636+ /* write up to and including page id tag to stdout */
 637+ /*
 638+ fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout);
 639+ b->next_to_read = b->next_to_read+match_page_id[0].rm_eo;
 640+ b->bytes_avail -= match_page_id[0].rm_eo;
 641+ */
 642+ }
 643+ else {
 644+ /* should never happen */
 645+ fprintf(stderr,"regex gone bad...\n");
 646+ exit(-1);
 647+ }
 648+ }
 649+ if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) {
 650+ /* write everything up to but not including the page tag to stdout */
 651+ /*
 652+ fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);
 653+ */
 654+ move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so);
 655+ bfile.strm.next_out = (char *)b->next_to_fill;
 656+ bfile.strm.avail_out = b->end - b->next_to_fill;
 657+ }
 658+ else {
 659+ /* could have the first part of the page tag... so copy up enough bytes to cover that case */
 660+ if (b->bytes_avail> 5) {
 661+ /* write everything that didn't match, but leave 5 bytes, to stdout */
 662+ /*
 663+ fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);
 664+ */
 665+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);
 666+ bfile.strm.next_out = (char *)b->next_to_fill;
 667+ bfile.strm.avail_out = b->end - b->next_to_fill;
 668+ }
 669+ else {
 670+ if (buffer_is_empty(b)) {
 671+ bfile.strm.next_out = (char *)b->buffer;
 672+ bfile.strm.avail_out = bfile.bufout_size;
 673+ b->next_to_fill = b->buffer; /* empty */
 674+ }
 675+ else {
 676+ /* there were only 5 or less bytes so just save em don't write em to stdout */
 677+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
 678+ bfile.strm.next_out = (char *)b->next_to_fill;
 679+ bfile.strm.avail_out = b->end - b->next_to_fill;
 680+ }
 681+ }
 682+ }
 683+ }
 684+ }
 685+ /*
 686+ if (b->bytes_avail) {
 687+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
 688+ }
 689+ */
 690+ return(0);
 691+}
 692+
 693+/* search for pageid in a bz2 file, given start and end offsets
 694+ to search for
 695+ we guess by the most boring method possible (shrink the
 696+ interval according to the value found on the last guess,
 697+ try midpoint of the new interval)
 698+ multiple calls of this will get the job done.
 699+ interval has left end = right end if search is complete.
 700+ this function may return the previous guess and simply
 701+ shrink the interval.
 702+ note that a "match" means either that the pageid we find
 703+ is smaller than the one the caller wants, or is equal.
 704+ why? because then we can use the output for prefetch
 705+ for xml dumps and be sure a specific page range is covered :-P
 706+
 707+ return value from guess, or -1 on error.
 708+ */
 709+int do_iteration(iter_info_t *iinfo, int fin, page_info_t *pinfo) {
 710+ int res;
 711+ int new_position;
 712+ int interval;
 713+
 714+ /*
 715+ last_position is somewhere in the interval, perhaps at an end
 716+ last_value is the value we had at that position
 717+ */
 718+
 719+ interval = (iinfo->right_end - iinfo->left_end)/2;
 720+ if (interval == 0) {
 721+ interval = 1;
 722+ }
 723+ /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */
 724+ /* if we're this close, we'll check this value and be done with it */
 725+ if (iinfo->right_end -iinfo->left_end < 2) {
 726+ new_position = iinfo->left_end;
 727+ iinfo->right_end = iinfo->left_end;
 728+ }
 729+ else {
 730+ if (iinfo->last_value < iinfo->value_wanted) {
 731+ /* fprintf(stderr,"resetting left end\n"); */
 732+ iinfo->left_end = iinfo->last_position;
 733+ new_position = iinfo->last_position + interval;
 734+ }
 735+ /* iinfo->last_value > iinfo->value_wanted */
 736+ else {
 737+ /* fprintf(stderr,"resetting right end\n"); */
 738+ iinfo->right_end = iinfo->last_position;
 739+ new_position = iinfo->last_position - interval;
 740+ }
 741+ }
 742+ res = get_first_page_id_after_offset(fin, new_position, pinfo);
 743+ if (res >0) {
 744+ /* caller wants the new value */
 745+ iinfo->last_value = pinfo->page_id;
 746+ iinfo->last_position = new_position;
 747+ return(pinfo->page_id);
 748+ }
 749+ else {
 750+ /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or
 751+ there's just no block here.
 752+ set the right end, keep the last value and position and let the caller retry with the new interval */
 753+ if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */
 754+ iinfo->right_end = new_position;
 755+ return(iinfo->last_value);
 756+ }
 757+ /* in theory we were moving towards beginning of file, should not have issues, so bail here */
 758+ else {
 759+ /* fprintf(stderr,"something very broken, giving up\n"); */
 760+ return(-1);
 761+ }
 762+ }
 763+}
 764+
 765+/*
 766+ given a bzipped and possibly truncated file, and a page id,
 767+ hunt for the page id in the file; this assume that the
 768+ bz2 header is intact and that page ids are steadily increasing
 769+ throughout the file.
 770+
 771+ writes the offset of the relevant block (from beginning of file)
 772+ and the first pageid found in that block, to stdout
 773+
 774+ format of output:
 775+ position:xxxxx pageid:nnn
 776+
 777+ returns: 0 on success, -1 on error
 778+*/
 779+int main(int argc, char **argv) {
 780+ int fin, position, res, interval, page_id, oldmarker, file_size;
 781+ page_info_t pinfo;
 782+ iter_info_t iinfo;
 783+
 784+ if (argc != 3) {
 785+ fprintf(stderr,"usage: %s infile id\n", argv[0]);
 786+ exit(-1);
 787+ }
 788+
 789+ fin = open (argv[1], O_RDONLY);
 790+ if (fin < 0) {
 791+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 792+ exit(-1);
 793+ }
 794+
 795+ page_id = atoi(argv[2]);
 796+ if (page_id <1) {
 797+ fprintf(stderr,"please specify a page_id >= 1.\n");
 798+ fprintf(stderr,"usage: %s infile page_id\n", argv[0]);
 799+ exit(-1);
 800+ }
 801+
 802+ file_size = get_file_size(fin);
 803+
 804+ interval = file_size;
 805+ position = 0;
 806+ oldmarker = -1;
 807+ pinfo.bits_shifted = -1;
 808+ pinfo.position = -1;
 809+ pinfo.page_id = -1;
 810+
 811+ iinfo.left_end = 0;
 812+ file_size = get_file_size(fin);
 813+ iinfo.right_end = file_size;
 814+ iinfo.value_wanted = page_id;
 815+
 816+ res = get_first_page_id_after_offset(fin, 0, &pinfo);
 817+ if (res > 0) {
 818+ iinfo.last_value = pinfo.page_id;
 819+ iinfo.last_position = 0;
 820+ }
 821+ else {
 822+ fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n");
 823+ exit(1);
 824+ }
 825+ if (pinfo.page_id == page_id) {
 826+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
 827+ exit(0);
 828+ }
 829+
 830+ while (1) {
 831+ res = do_iteration(&iinfo, fin, &pinfo);
 832+ /* things to check: bad return? interval is 0 bytes long? */
 833+ if (iinfo.left_end == iinfo.right_end) {
 834+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
 835+ exit(0);
 836+ }
 837+ else if (res < 0) {
 838+ fprintf(stderr,"broken and quitting\n");
 839+ exit(-1);
 840+ }
 841+ }
 842+ exit(0);
 843+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.c
___________________________________________________________________
Added: svn:eol-style
1844 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c
@@ -0,0 +1,156 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+
 11+/*
 12+ Check to see whether a file ends with a bz2 footer or not
 13+ (i.e. if it is truncated or corrupted).
 14+ This is a crude but fast test for integrity; we don't
 15+ check the CRC at the end of the stream, nor do we check the
 16+ bit padding in the last byte of the file.
 17+
 18+ Arguments: the name of the file to check, presumably
 19+ a bzipped file.
 20+ Outputs: none.
 21+ Exits with 0 if the file contains the footer at the end,
 22+ 1 if the file does not contain the footer, and -1 on error.
 23+*/
 24+
 25+
 26+int read_footer(unsigned char *buffer, int fin) {
 27+ int res;
 28+
 29+ res = lseek(fin, -11, SEEK_END);
 30+ if (res == -1) {
 31+ fprintf(stderr,"lseek of file failed\n");
 32+ exit(-1);
 33+ }
 34+ res = read(fin, buffer, 11);
 35+ if (res == -1) {
 36+ fprintf(stderr,"read of file failed\n");
 37+ exit(-1);
 38+ }
 39+ return(0);
 40+}
 41+
 42+#define LEFT 0
 43+#define RIGHT 1
 44+
 45+/* return n ones either at left or right end */
 46+int bitmask(int numbits, int end) {
 47+ if (end == RIGHT) {
 48+ return((1<<numbits)-1);
 49+ }
 50+ else {
 51+ return(((1<<numbits)-1) << (8-numbits));
 52+ }
 53+}
 54+
 55+void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
 56+ int i;
 57+
 58+ for (i=buflen-1; i>=0; i--) {
 59+ /* right 1 */
 60+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
 61+
 62+ /* grab rightmost from prev byte */
 63+ if (i > 0) {
 64+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(1,LEFT)));
 65+ }
 66+ }
 67+}
 68+
 69+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
 70+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
 71+ matches and 0 otherwise. */
 72+int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
 73+ int i;
 74+
 75+ if (bitsrightshifted == 0) {
 76+ for (i = 0; i< numbytes; i++) {
 77+ if (buff1[i] != buff2[i]) {
 78+ return(1);
 79+ }
 80+ }
 81+ return(0);
 82+ }
 83+ else {
 84+ for (i = 1; i< numbytes-2; i++) {
 85+ if (buff1[i] != buff2[i]) {
 86+ return(1);
 87+ }
 88+ }
 89+ /* do leftmost byte */
 90+ if ((buff1[0] & bitmask(8-bitsrightshifted,RIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,RIGHT)) ) {
 91+ return(1);
 92+ }
 93+ /* do rightmost byte */
 94+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,LEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,LEFT)) ) {
 95+ return(1);
 96+ }
 97+ return(0);
 98+ }
 99+}
 100+
 101+int checkfileforfooter(int fin) {
 102+ unsigned char buffer[11];
 103+ int result, i;
 104+ unsigned char **footer = malloc(8*sizeof(unsigned char *));
 105+
 106+ /* set up footer plus its various right-shifted incarnations */
 107+ /* dude why couldn't you have 0 padded each bzip2 block? seriously ... */
 108+ for (i = 0; i< 8; i++) {
 109+ footer[i] = malloc(sizeof(unsigned char)*7);
 110+ }
 111+ footer[0][0]= (unsigned char) 0x17;
 112+ footer[0][1]= (unsigned char) 0x72;
 113+ footer[0][2]= (unsigned char) 0x45;
 114+ footer[0][3]= (unsigned char) 0x38;
 115+ footer[0][4]= (unsigned char) 0x50;
 116+ footer[0][5]= (unsigned char) 0x90;
 117+ footer[0][6]= (unsigned char) 0x00;
 118+ for (i = 1; i< 8; i++) {
 119+ memcpy((char *)(footer[i]), (char *)(footer[i-1]),7);
 120+ shiftbytesright(footer[i],7,1);
 121+ }
 122+
 123+ read_footer(buffer,fin);
 124+
 125+ result = bytescompare(footer[0],buffer+1,6,0);
 126+ if (!result) {
 127+ return(0);
 128+ }
 129+
 130+ for (i=1; i<8; i++) {
 131+ result = bytescompare(footer[i],buffer,7,i);
 132+ if (!result) {
 133+ return(0);
 134+ }
 135+ }
 136+ return(1);
 137+}
 138+
 139+int main(int argc, char **argv) {
 140+
 141+ int fin;
 142+ int result;
 143+
 144+ if (argc != 2) {
 145+ fprintf(stderr,"usage: %s infile\n", argv[0]);
 146+ exit(-1);
 147+ }
 148+ fin = open (argv[1], O_RDONLY);
 149+ if (fin < 0) {
 150+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 151+ exit(-1);
 152+ }
 153+ result = checkfileforfooter(fin);
 154+ close(fin);
 155+ exit(result);
 156+}
 157+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/checkforbz2footer.c
___________________________________________________________________
Added: svn:eol-style
1158 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c
@@ -0,0 +1,463 @@
 2+#include <unistd.h>
 3+#include <stdio.h>
 4+#include <string.h>
 5+#include <sys/types.h>
 6+#include <sys/stat.h>
 7+#include <fcntl.h>
 8+#include <stdlib.h>
 9+#include <errno.h>
 10+#include "bzlib.h"
 11+
 12+/*
 13+ Find the last bz2 block marker in a file
 14+ and dump whatever can be decompressed after
 15+ that point. The header of the file must
 16+ be intact in order for any output to be produced.
 17+ This will produce output for truncated files as well,
 18+ as long as there is "enough" data after the block
 19+ marker.
 20+
 21+ Arguments: the name of the file to check, presumably
 22+ a bzipped file.
 23+ Outputs: the decompressed data at the end of the file.
 24+ Exits with 0 if decompression of some data can be done,
 25+ 1 if decompression fails, and -1 on error.
 26+*/
 27+
 28+#define BUFSIZE 121072
 29+typedef struct {
 30+ unsigned char bufin[BUFSIZE];
 31+ unsigned char bufout[BUFSIZE];
 32+ int bufsize;
 33+ bz_stream strm;
 34+ unsigned char overflow;
 35+ int bitsshifted;
 36+ int position;
 37+} bzinfo;
 38+
/*
 * Read the last 11 bytes of the file open on descriptor fin into buffer
 * (which must have room for at least 11 bytes).
 *
 * Returns 0 on success.  Prints a message and exits with -1 if the seek
 * fails (which includes files shorter than 11 bytes), if a read fails,
 * or if end of file is hit before all 11 bytes have been read.
 */
int read_footer(unsigned char *buffer, int fin) {
    enum { TAIL_LEN = 11 };
    size_t have = 0;

    /* compare as off_t: squeezing the offset into an int makes a valid
       position such as 0xFFFFFFFF in a large file look like -1 */
    if (lseek(fin, -TAIL_LEN, SEEK_END) == (off_t) -1) {
        fprintf(stderr,"lseek of file failed\n");
        exit(-1);
    }

    /* read() may legally return fewer bytes than asked for; keep going
       until the whole tail is in the buffer */
    while (have < TAIL_LEN) {
        ssize_t got = read(fin, buffer + have, TAIL_LEN - have);

        if (got <= 0) {
            fprintf(stderr,"read of file failed\n");
            exit(-1);
        }
        have += (size_t) got;
    }
    return(0);
}
 54+
#define LEFT 0
#define RIGHT 1

/*
 * Build a mask of numbits consecutive one bits within a byte.
 * With end == RIGHT the ones occupy the low-order bits; with any other
 * value of end they occupy the high-order bits of the byte.
 * numbits is expected to be in the range 0..8.
 */
int bitmask(int numbits, int end) {
    int ones = (1 << numbits) - 1;

    return (end == RIGHT) ? ones : (ones << (8 - numbits));
}
 67+
 68+void shiftbytesleft(unsigned char *buffer, int buflen, int numbits) {
 69+ int i;
 70+
 71+ if (numbits == 0) {
 72+ return;
 73+ }
 74+
 75+ for (i=0; i<buflen; i++) {
 76+ /* left 1 */
 77+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
 78+
 79+ /* grab leftmost from next byte */
 80+ if (i < buflen-1) {
 81+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] | ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,LEFT) ) >> (8-numbits) ) );
 82+ }
 83+ }
 84+}
 85+
 86+
 87+void shiftbytesright(unsigned char *buffer, int buflen, int numbits) {
 88+ int i;
 89+
 90+ for (i=buflen-1; i>=0; i--) {
 91+ /* right 1 */
 92+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
 93+
 94+ /* grab rightmost from prev byte */
 95+ if (i > 0) {
 96+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] | ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,LEFT)));
 97+ }
 98+ }
 99+}
 100+
/*
 * Build the table of bzip2 block-start markers used for searching.
 *
 * Returns an array of 8 heap-allocated 7-byte buffers: entry 0 is the
 * 6-byte marker followed by a zero byte, and entry i is entry 0
 * right-shifted by i bits (the seventh byte catches the bits shifted
 * out of the sixth).  The caller owns the allocations.
 * Prints a message and exits with -1 if memory cannot be allocated.
 */
unsigned char ** init_marker(void) {
    static const unsigned char magic[7] = { 0x31, 0x41, 0x59, 0x26, 0x53, 0x59, 0x00 };
    unsigned char **marker = malloc(8 * sizeof *marker);
    int i;

    if (marker == NULL) {
        fprintf(stderr,"failed to allocate memory for block marker\n");
        exit(-1);
    }

    /* set up block marker plus its various right-shifted incarnations */
    for (i = 0; i < 8; i++) {
        marker[i] = malloc(sizeof magic);
        if (marker[i] == NULL) {
            fprintf(stderr,"failed to allocate memory for block marker\n");
            exit(-1);
        }
        if (i == 0) {
            memcpy(marker[0], magic, sizeof magic);
        }
        else {
            /* each entry is the previous one shifted one more bit */
            memcpy(marker[i], marker[i-1], sizeof magic);
            shiftbytesright(marker[i], 7, 1);
        }
    }
    return(marker);
}
 122+
/*
 * Build the table of bzip2 end-of-stream footers used for searching.
 *
 * bzip2 blocks are not padded to a byte boundary, so the footer can
 * start at any bit offset.  Returns an array of 8 heap-allocated 7-byte
 * buffers: entry 0 is the 6-byte footer followed by a zero byte, and
 * entry i is entry 0 right-shifted by i bits.  The caller owns the
 * allocations.
 * Prints a message and exits with -1 if memory cannot be allocated.
 */
unsigned char ** init_footer(void) {
    static const unsigned char magic[7] = { 0x17, 0x72, 0x45, 0x38, 0x50, 0x90, 0x00 };
    unsigned char **footer = malloc(8 * sizeof *footer);
    int i;

    if (footer == NULL) {
        fprintf(stderr,"failed to allocate memory for footer\n");
        exit(-1);
    }

    /* set up footer plus its various right-shifted incarnations */
    for (i = 0; i < 8; i++) {
        footer[i] = malloc(sizeof magic);
        if (footer[i] == NULL) {
            fprintf(stderr,"failed to allocate memory for footer\n");
            exit(-1);
        }
        if (i == 0) {
            memcpy(footer[0], magic, sizeof magic);
        }
        else {
            /* each entry is the previous one shifted one more bit */
            memcpy(footer[i], footer[i-1], sizeof magic);
            shiftbytesright(footer[i], 7, 1);
        }
    }
    return(footer);
}
 145+
 146+
/*
 * Compare a bit-shifted pattern against raw bytes.
 *
 * buff1 is the pattern, already right-shifted by bitsrightshifted bits
 * (0..7); buff2 is data expected to contain the same pattern at the
 * same bit offset.  numbytes is the number of bytes to examine in each.
 *
 * With a shift of 0 all numbytes bytes are compared in full.  Otherwise
 * the pattern only partly covers the first and last bytes: in the first
 * byte just the low (8 - shift) bits belong to it, in the last byte
 * just the high (shift) bits, and every byte in between is compared in
 * full.
 *
 * Returns 0 if the buffers match and 1 if they differ (note: 0 means
 * match, as with memcmp).
 */
int bytescompare(unsigned char *buff1, unsigned char *buff2, int numbytes, int bitsrightshifted) {
    unsigned char firstmask, lastmask;
    int i;

    if (numbytes <= 0) {
        /* nothing to compare; also keeps us from indexing byte -1 below */
        return(0);
    }

    if (bitsrightshifted == 0) {
        for (i = 0; i < numbytes; i++) {
            if (buff1[i] != buff2[i]) {
                return(1);
            }
        }
        return(0);
    }

    /* interior bytes are covered entirely by the pattern; the bound is
       numbytes-1 so that the byte just before the last one is checked */
    for (i = 1; i < numbytes-1; i++) {
        if (buff1[i] != buff2[i]) {
            return(1);
        }
    }

    /* leftmost byte: only the low (8 - shift) bits hold pattern bits */
    firstmask = (unsigned char) (0xFF >> bitsrightshifted);
    if ((buff1[0] & firstmask) != (buff2[0] & firstmask)) {
        return(1);
    }

    /* rightmost byte: only the high (shift) bits hold pattern bits */
    lastmask = (unsigned char) ((0xFF << (8 - bitsrightshifted)) & 0xFF);
    if ((buff1[numbytes-1] & lastmask) != (buff2[numbytes-1] & lastmask)) {
        return(1);
    }
    return(0);
}
 178+
/*
 * Look for the bzip2 footer in the last 11 bytes of the file open on
 * descriptor fin.  footer is the table of the footer and its seven
 * right-shifted variants.
 *
 * Returns the number of bits (0..7) by which the footer found is
 * right-shifted, or -1 if no variant matches.
 */
int checkfileforfooter(int fin, unsigned char **footer) {
    unsigned char tail[11];
    int shift;

    read_footer(tail,fin);

    /* unshifted footer: six bytes starting at the second byte read */
    if (bytescompare(footer[0],tail+1,6,0) == 0) {
        return(0);
    }

    /* shifted footer: seven bytes starting at the first byte read */
    for (shift = 1; shift < 8; shift++) {
        if (bytescompare(footer[shift],tail,7,shift) == 0) {
            return(shift);
        }
    }
    return(-1);
}
 200+
/*
 * Test whether buffer (at least 7 bytes) holds a bzip2 block marker.
 * marker is the table of the marker and its seven right-shifted
 * variants.
 *
 * Returns the number of bits (0..7) by which the marker found is
 * right-shifted, or -1 if no variant matches.
 */
int checkbufferforblockmarker(unsigned char *buffer, unsigned char **marker) {
    int shift;

    /* unshifted marker: six bytes starting at the second buffer byte */
    if (bytescompare(marker[0],buffer+1,6,0) == 0) {
        return(0);
    }

    /* shifted marker: seven bytes starting at the first buffer byte */
    for (shift = 1; shift < 8; shift++) {
        if (bytescompare(marker[shift],buffer,7,shift) == 0) {
            return(shift);
        }
    }
    return(-1);
}
 218+
/*
 * Set the first length bytes of buf to zero.
 * A length of zero or less leaves buf untouched.
 */
void clearbuffer(unsigned char *buf, int length) {
    if (length > 0) {
        memset(buf, 0, (size_t) length);
    }
}
 227+
/*
 * Search backwards through the file for a bzip2 block marker.
 *
 * buffer holds the 7 bytes most recently read; *start_at is the
 * distance from the end of the file at which they were read and
 * *position the matching offset from the start of the file.  While the
 * buffer holds no marker, the search moves one byte closer to the start
 * of the file and re-reads; both *start_at and *position are updated.
 *
 * The search stops once *position drops below 3: a marker must come
 * after the 4-byte file header, and the buffer carries one extra
 * leading byte in case some marker bits are shifted into it.
 *
 * Returns the number of bits the marker is right-shifted, or -1 if none
 * was found.  Exits with -1 if a seek or read fails.
 */
int findnextmarker(int fin, int *start_at, int *position, unsigned char **marker, unsigned char *buffer ) {
    int shift;
    int nread;

    while (*position >= 3) {
        shift = checkbufferforblockmarker(buffer, marker);
        if (shift >= 0) {
            return(shift);
        }

        /* no marker here: step one byte further from the end and retry */
        (*start_at)++;
        *position = lseek(fin, -1*(*start_at), SEEK_END);
        if (*position == -1) {
            fprintf(stderr,"lseek of file failed\n");
            exit(-1);
        }
        nread = read(fin, buffer, 7);
        if (nread == -1) {
            fprintf(stderr,"read of file failed\n");
            exit(-1);
        }
    }
    return(-1);
}
 260+
 261+int init_decompress(bzinfo *bfile) {
 262+ int bz_verbosity = 0;
 263+ int bz_small = 0;
 264+ int ret;
 265+
 266+ bfile->strm.bzalloc = NULL;
 267+ bfile->strm.bzfree = NULL;
 268+ bfile->strm.opaque = NULL;
 269+
 270+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
 271+ if (ret != BZ_OK) {
 272+ fprintf(stderr,"uncompress failed, err %d\n", ret);
 273+ exit(-1);
 274+ }
 275+ return(ret);
 276+}
 277+
 278+int decompress_header(int fin, bzinfo *bfile) {
 279+ int bytesread, ret;
 280+ unsigned char header[4];
 281+
 282+ lseek(fin,0,SEEK_SET);
 283+ bytesread = read(fin, header, 4);
 284+ if (bytesread < 4) {
 285+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
 286+ exit(-1);
 287+ }
 288+ bfile->strm.next_in = (char *)header;
 289+ bfile->strm.avail_in = 4;
 290+
 291+ bfile->strm.next_out = (char *)(bfile->bufout);
 292+ bfile->strm.avail_out = bfile->bufsize;
 293+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 294+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
 295+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
 296+ exit(-1);
 297+ }
 298+ return(ret);
 299+}
 300+
 301+int setup_first_buffer(int fin, bzinfo *bfile) {
 302+ int bytesread, eof=0;
 303+
 304+ if (bfile->bitsshifted == 0) {
 305+ lseek(fin,bfile->position+1,SEEK_SET);
 306+ }
 307+ else {
 308+ lseek(fin,bfile->position,SEEK_SET);
 309+ }
 310+ bytesread = read(fin, bfile->bufin, bfile->bufsize);
 311+ if (bytesread > 0) {
 312+ bfile->overflow = bfile->bufin[bytesread-1];
 313+ shiftbytesleft(bfile->bufin,bytesread,bfile->bitsshifted);
 314+
 315+ bfile->strm.next_in = (char *)(bfile->bufin);
 316+ bfile->strm.avail_in = bytesread-1;
 317+
 318+ bfile->strm.next_out = (char *)(bfile->bufout);
 319+ bfile->strm.avail_out = bfile->bufsize;
 320+ }
 321+ if (bytesread <=0) {
 322+ eof++;
 323+ }
 324+ return(eof);
 325+}
 326+
 327+int do_last_byte(bzinfo *bfile) {
 328+ int ret=BZ_OK;
 329+ int written;
 330+
 331+ if (bfile->strm.avail_in == 0) {
 332+ bfile->strm.next_in = (char *)(bfile->bufin);
 333+ bfile->bufin[0] = bfile->overflow;
 334+ shiftbytesleft(bfile->bufin,1,bfile->bitsshifted);
 335+ bfile->strm.avail_in = 1;
 336+ bfile->strm.next_out = (char *)(bfile->bufout);
 337+ bfile->strm.avail_out = bfile->bufsize;
 338+ ret = BZ2_bzDecompress ( &(bfile->strm) );
 339+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 340+ written = fwrite(bfile->bufout, sizeof(unsigned char), (unsigned char *)bfile->strm.next_out - bfile->bufout, stdout);
 341+ }
 342+ }
 343+ return(ret);
 344+}
 345+
 346+int read_next_buffer(int fin, bzinfo *bfile, int ret) {
 347+ int bytesread, eof=0;
 348+
 349+ /* fprintf(stderr," got return from decompress of %d\n", ret); */
 350+
 351+ if (bfile->strm.avail_in == 0) {
 352+ bfile->strm.next_in = (char *)(bfile->bufin);
 353+ bfile->bufin[0] = bfile->overflow;
 354+ bytesread = read(fin, bfile->bufin+1, bfile->bufsize-1);
 355+ if (bytesread > 0) {
 356+ bfile->overflow = bfile->bufin[bytesread];
 357+ shiftbytesleft(bfile->bufin,bytesread+1,bfile->bitsshifted);
 358+ bfile->strm.avail_in = bytesread;
 359+ }
 360+ else {
 361+ eof++;
 362+ bfile->strm.avail_in = 0;
 363+ }
 364+ }
 365+ bfile->strm.next_out = (char *)(bfile->bufout);
 366+ bfile->strm.avail_out = bfile->bufsize;
 367+
 368+ return(eof);
 369+}
 370+
 371+
 372+int main(int argc, char **argv) {
 373+
 374+ bzinfo bfile;
 375+
 376+ int fin;
 377+ int result, ret;
 378+ unsigned char buffer[8];
 379+
 380+ unsigned char **footer;
 381+ unsigned char **marker;
 382+
 383+ int written=0;
 384+ int start_at;
 385+
 386+ int eof = 0;
 387+
 388+ if (argc != 2) {
 389+ fprintf(stderr,"usage: %s infile\n", argv[0]);
 390+ exit(-1);
 391+ }
 392+
 393+ marker = init_marker();
 394+ footer = init_footer();
 395+
 396+ fin = open (argv[1], O_RDONLY);
 397+ if (fin < 0) {
 398+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
 399+ exit(-1);
 400+ }
 401+
 402+ bfile.bufsize = BUFSIZE;
 403+
 404+ result = checkfileforfooter(fin, footer);
 405+ if (result == -1) {
 406+ start_at = 0;
 407+ }
 408+ else {
 409+ start_at = 11; /* size of footer, perhaps with 1 byte extra */
 410+ }
 411+ start_at +=6; /* size of marker */
 412+ bfile.position = lseek(fin, -1*start_at, SEEK_END);
 413+ if (bfile.position == -1) {
 414+ fprintf(stderr,"lseek of file failed\n");
 415+ exit(-1);
 416+ }
 417+ result = read(fin, buffer, 7);
 418+ if (result == -1) {
 419+ fprintf(stderr,"read of file failed\n");
 420+ exit(-1);
 421+ }
 422+
 423+ while (1) {
 424+
 425+ bfile.bitsshifted = findnextmarker(fin, &start_at, &bfile.position, marker, buffer);
 426+ if (bfile.bitsshifted >= 0) {
 427+ /* fprintf(stderr, "found marker at pos %d and shifted %d, start_at is %d\n", bfile.position, bfile.bitsshifted, start_at); */
 428+ ret = init_decompress(&bfile);
 429+
 430+ /* pass in the header */
 431+ ret = decompress_header(fin,&bfile);
 432+
 433+ eof = setup_first_buffer(fin, &bfile);
 434+
 435+ while (BZ_OK == ret && !eof) {
 436+ ret = BZ2_bzDecompress ( &(bfile.strm) );
 437+ if (BZ_OK == ret || BZ_STREAM_END == ret) {
 438+ written += fwrite(bfile.bufout, sizeof(unsigned char), (unsigned char *)(bfile.strm.next_out) - bfile.bufout, stdout);
 439+ }
 440+ eof = read_next_buffer(fin, &bfile, ret);
 441+ }
 442+ if (BZ_OK == ret || BZ_STREAM_END == ret ) {
 443+ /* so we read no bytes, process the last byte we held */
 444+ do_last_byte(&bfile);
 445+ }
 446+ if (written == 0) {
 447+ /* truncated block or other corruption, try going back one */
 448+ start_at +=5;
 449+ clearbuffer(buffer,sizeof(buffer));
 450+ continue;
 451+ }
 452+ else {
 453+ break;
 454+ }
 455+ }
 456+ else {
 457+ fprintf(stderr,"no block marker in this file.\n");
 458+ exit(-1);
 459+ }
 460+ }
 461+ close(fin);
 462+ exit(0);
 463+}
 464+
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/dumplastbz2block.c
___________________________________________________________________
Added: svn:eol-style
1465 + native
Index: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h
@@ -0,0 +1,81 @@
 2+#ifndef _FINDPAGEID_H
 3+#define _FINDPAGEID_H
 4+
 5+typedef struct {
 6+ int page_id; /* first id in the block */
 7+ int bits_shifted; /* block is right shifted this many bits */
 8+ int position; /* position in file of block */
 9+} page_info_t;
 10+
 11+#define BUFINSIZE 5000
 12+
 13+/*
 14+ keeps all information about a bzipped file
 15+ plus input/output buffers for decompression
 16+*/
 17+typedef struct {
 18+ unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
 19+ unsigned char *bufout; /* uncompressed data, must be allocated by caller */
 20+ unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
 21+ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
 22+
 23+ int bufin_size; /* size of input buffer for compressed data */
 24+ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
 25+
 26+ int initialized; /* whether bz2file has been initialized (header processed, seek to
 27+ some bz2 block in the file and input buffer filled) */
 28+ int block_start; /* position of bz2 block in file from which we started to read (we
 29+ read a sequence of bz2 blocks from a given position, this is
 30+ the offset to the first one) */
 31+
 32+ bz_stream strm; /* stream structure for libbz2 */
 33+ unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
 34+ read around so we can grab the lower end bits off the end for
 35+ sticking in front of the next pile of compressed bytes we read */
 36+
 37+ int bits_shifted; /* number of bits that the compressed data has been right shifted
 38+ in the file (if the number is 0, the block marker and subsequent
 39+ data is byte-aligned) */
 40+ unsigned char **marker; /* bzip2 start of block marker, plus bit-shifted versions of it for
 41+ locating the marker in a stream of compressed data */
 42+
 43+ int position; /* current offset into file from start of file */
 44+
 45+ int bytes_read; /* number of bytes of compressed data read from file (per read) */
 46+ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
 47+ int eof; /* nonzero if eof reached */
 48+ int file_size; /* length of file, so we don't search past it for blocks */
 49+} bz_info_t;
 50+
 51+#define MASKLEFT 0
 52+#define MASKRIGHT 1
 53+
 54+/*
 55+ this output buffer is used to collect decompressed output.
 56+ this is not a circular buffer; when it is full the user is
 57+ responsible for emptying it completely or partially and moving
 58+ to the beginning any unused bytes.
 59+
 60+*/
 61+typedef struct {
 62+ unsigned char *buffer; /* output storage, allocated by the caller */
 63+ unsigned char *next_to_read; /* pointer to the next byte in the buffer with data to be read */
 64+ unsigned char *next_to_fill; /* pointer to the next byte in the buffer which is empty and can receive data */
 65+ int bytes_avail; /* number of bytes available for reading */
 66+ unsigned char *end; /* points to byte after end of buffer */
 67+} buf_info_t;
 68+
 69+/*
 70+ used for each iteration of narrowing down the location in a bzipped2 file of
 71+ a desired pageid, by finding first compressed block after a guessed
 72+ position and checking the first pageid (if any) contained in it.
 73+*/
 74+typedef struct {
 75+ int left_end; /* left end of interval to search (bytes from start of file) */
 76+ int right_end; /* right end of interval to search */
 77+ int value_wanted; /* pageid desired */
 78+ int last_value; /* pageid we found in last iteration */
 79+ int last_position; /* position in file for last iteration */
 80+} iter_info_t;
 81+
 82+#endif
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/findpageidinbz2xml.h
___________________________________________________________________
Added: svn:eol-style
183 + native

Status & tagging log