r91454 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r91453‎ \| r91454 \| r91455 >
Date:	13:38, 5 July 2011
Author:	ariel
Status:	deferred
Tags:
Comment:	locate offset of bz2block in bz2 file with specified pageid (or slightly before it)
Modified paths:	/branches/ariel/xmldumps-backup/findpageidinbz2xml.c (added) (history) /branches/ariel/xmldumps-backup/findpageidinbz2xml.h (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.c
—	—	@@ -0,0 +1,842 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <string.h>
	5	+#include <sys/types.h>
	6	+#include <sys/stat.h>
	7	+#include <fcntl.h>
	8	+#include <stdlib.h>
	9	+#include <errno.h>
	10	+#include <sys/types.h>
	11	+#include <regex.h>
	12	+#include "bzlib.h"
	13	+#include "findpageidinbz2xml.h"
	14	+
	15	+/* return n ones either at left or right end */
	16	+int bitmask(int numbits, int end) {
	17	+ if (end == MASKRIGHT) {
	18	+ return((1<<numbits)-1);
	19	+ }
	20	+ else {
	21	+ return(((1<<numbits)-1) << (8-numbits));
	22	+ }
	23	+}
	24	+
	25	+void shift_bytes_left(unsigned char *buffer, int buflen, int numbits) {
	26	+ int i;
	27	+
	28	+ if (numbits == 0) {
	29	+ return;
	30	+ }
	31	+
	32	+ for (i=0; i<buflen; i++) {
	33	+ /* left 1 */
	34	+ buffer[i] = (unsigned char) ((int) (buffer[i]) << numbits);
	35	+
	36	+ /* grab leftmost from next byte */
	37	+ if (i < buflen-1) {
	38	+ buffer[i] = ( unsigned char ) ( (unsigned int) buffer[i] \| ( ( ((unsigned int) buffer[i+1]) & bitmask(numbits,MASKLEFT) ) >> (8-numbits) ) );
	39	+ }
	40	+ }
	41	+}
	42	+
	43	+void shift_bytes_right(unsigned char *buffer, int buflen, int numbits) {
	44	+ int i;
	45	+
	46	+ for (i=buflen-1; i>=0; i--) {
	47	+ /* right 1 */
	48	+ buffer[i] = (unsigned char) ((int) (buffer[i]) >> numbits);
	49	+
	50	+ /* grab rightmost from prev byte */
	51	+ if (i > 0) {
	52	+ buffer[i] = ( unsigned char ) ((unsigned int) buffer[i] \| ( ((unsigned int) (buffer[i-1])<<(8-numbits)) & bitmask(numbits,MASKLEFT)));
	53	+ }
	54	+ }
	55	+}
	56	+
	57	+unsigned char ** init_marker() {
	58	+ unsigned char *marker = malloc(8sizeof(unsigned char *));
	59	+ int i;
	60	+
	61	+ /* set up block marker plus its various right-shifted incarnations */
	62	+ for (i = 0; i< 8; i++) {
	63	+ marker[i] = malloc(sizeof(unsigned char)*7);
	64	+ }
	65	+ marker[0][0]= (unsigned char) 0x31;
	66	+ marker[0][1]= (unsigned char) 0x41;
	67	+ marker[0][2]= (unsigned char) 0x59;
	68	+ marker[0][3]= (unsigned char) 0x26;
	69	+ marker[0][4]= (unsigned char) 0x53;
	70	+ marker[0][5]= (unsigned char) 0x59;
	71	+ marker[0][6]= (unsigned char) 0x00;
	72	+ for (i = 1; i< 8; i++) {
	73	+ memcpy((char )(marker[i]), (char )(marker[i-1]),7);
	74	+ shift_bytes_right(marker[i],7,1);
	75	+ }
	76	+ return(marker);
	77	+}
	78	+
	79	+/* buff1 is some random bytes, buff2 is some random bytes which we expect to start with the contents of buff1,
	80	+ both buffers are bit-shifted to the right "bitsrightshifted". this function compares the two and returns 1 if buff2
	81	+ matches and 0 otherwise. */
	82	+int bytes_compare(unsigned char buff1, unsigned char buff2, int numbytes, int bitsrightshifted) {
	83	+ int i;
	84	+
	85	+ if (bitsrightshifted == 0) {
	86	+ for (i = 0; i< numbytes; i++) {
	87	+ if (buff1[i] != buff2[i]) {
	88	+ return(1);
	89	+ }
	90	+ }
	91	+ return(0);
	92	+ }
	93	+ else {
	94	+ for (i = 1; i< numbytes-2; i++) {
	95	+ if (buff1[i] != buff2[i]) {
	96	+ return(1);
	97	+ }
	98	+ }
	99	+ /* do leftmost byte */
	100	+ if ((buff1[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) != (buff2[0] & bitmask(8-bitsrightshifted,MASKRIGHT)) ) {
	101	+ return(1);
	102	+ }
	103	+ /* do rightmost byte */
	104	+ if ((buff1[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) != (buff2[numbytes-1] & bitmask(bitsrightshifted,MASKLEFT)) ) {
	105	+ return(1);
	106	+ }
	107	+ return(0);
	108	+ }
	109	+}
	110	+
	111	+
	112	+/* return -1 if no match
	113	+ return number of bits rightshifted otherwise */
	114	+int check_buffer_for_bz2_block_marker(bz_info_t *bfile) {
	115	+ int result, i;
	116	+
	117	+ result = bytes_compare(bfile->marker[0],bfile->marker_buffer+1,6,0);
	118	+ if (!result) {
	119	+ return(0);
	120	+ }
	121	+ for (i=1; i<8; i++) {
	122	+ result = bytes_compare(bfile->marker[i],bfile->marker_buffer,7,i);
	123	+ if (!result) {
	124	+ return(i);
	125	+ }
	126	+ }
	127	+ return(-1);
	128	+}
	129	+
	130	+
	131	+/* return: 1 if found, 0 if not, -1 on error */
	132	+int find_next_bz2_block_marker(int fin, bz_info_t *bfile) {
	133	+ int result;
	134	+
	135	+ bfile->bits_shifted = -1;
	136	+ result = read(fin, bfile->marker_buffer, 7);
	137	+ if (result < 0) {
	138	+ /* fprintf(stderr,"read of file failed\n"); */
	139	+ return(-1);
	140	+ }
	141	+ /* must be after 4 byte file header, and we add a leftmost byte to the buffer
	142	+ of data read in case some bits have been shifted into it */
	143	+ while (bfile->position <= bfile->file_size - 6 && bfile->bits_shifted < 0) {
	144	+ bfile->bits_shifted = check_buffer_for_bz2_block_marker(bfile);
	145	+ if (bfile->bits_shifted < 0) {
	146	+ bfile->position++;
	147	+ result = lseek(fin, (bfile->position), SEEK_SET);
	148	+ if (result == -1) {
	149	+ fprintf(stderr,"lseek of file to %ld failed (2)\n",(long int) bfile->position);
	150	+ return(-1);
	151	+ }
	152	+ result = read(fin, bfile->marker_buffer, 7);
	153	+ if (result < 7) {
	154	+ /* fprintf(stderr,"read of file failed\n"); */
	155	+ return(-1);
	156	+ }
	157	+ }
	158	+ else {
	159	+ bfile->block_start = bfile->position;
	160	+ return(1);
	161	+ }
	162	+ }
	163	+ return(0);
	164	+}
	165	+
	166	+/*
	167	+ initializes the bz2 strm structure,
	168	+ calls the BZ2 decompression library initializer
	169	+
	170	+ returns:
	171	+ BZ_OK on success
	172	+ various BZ_ errors on failure (see bzlib.h)
	173	+*/
	174	+int init_decompress(bz_info_t *bfile) {
	175	+ int bz_verbosity = 0;
	176	+ int bz_small = 0;
	177	+ int ret;
	178	+
	179	+ bfile->strm.bzalloc = NULL;
	180	+ bfile->strm.bzfree = NULL;
	181	+ bfile->strm.opaque = NULL;
	182	+
	183	+ ret = BZ2_bzDecompressInit ( &(bfile->strm), bz_verbosity, bz_small );
	184	+ if (ret != BZ_OK) {
	185	+ fprintf(stderr,"uncompress failed, err %d\n", ret);
	186	+ exit(-1);
	187	+ }
	188	+ return(ret);
	189	+}
	190	+
	191	+/*
	192	+ reads the first 4 bytes from a bz2 file (should be
	193	+ "BZh" followed by the block size indicator, typically "9")
	194	+ and passes them into the BZ2 decompression library.
	195	+ This must be done before decompression of any block of the
	196	+ file is attempted.
	197	+
	198	+ returns:
	199	+ BZ_OK if successful,
	200	+ various BZ_ errors on failure (see bzlib.h)
	201	+*/
	202	+int decompress_header(int fin, bz_info_t *bfile) {
	203	+ int ret, res;
	204	+
	205	+ res = lseek(fin,0,SEEK_SET);
	206	+ if (res < 0) {
	207	+ fprintf(stderr,"lseek of file to 0 failed (3)\n");
	208	+ exit(-1);
	209	+ }
	210	+ bfile->bytes_read = read(fin, bfile->header_buffer, 4);
	211	+ if (bfile->bytes_read < 4) {
	212	+ fprintf(stderr,"failed to read 4 bytes of header, exiting\n");
	213	+ exit(-1);
	214	+ }
	215	+ bfile->strm.next_in = (char *)bfile->header_buffer;
	216	+ bfile->strm.avail_in = 4;
	217	+
	218	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	219	+ if (BZ_OK != ret && BZ_STREAM_END != ret) {
	220	+ fprintf(stderr,"Corrupt bzip2 header, exiting\n");
	221	+ exit(-1);
	222	+ }
	223	+ return(ret);
	224	+}
	225	+
	226	+/*
	227	+ seek to appropriate offset as specified in bfile,
	228	+ read compressed data into buffer indicated by bfile,
	229	+ update the bfile structure accordingly,
	230	+ save the overflow byte (bit-shifted data = suck)
	231	+ this is for the first buffer of data in a stream,
	232	+ for subsequent buffers use fill_buffer_to_decompress()
	233	+
	234	+ this will set bfile->eof on eof. no other indicator
	235	+ will be provided.
	236	+
	237	+ returns:
	238	+ 0 on success
	239	+ -1 on error
	240	+*/
	241	+int setup_first_buffer_to_decompress(int fin, bz_info_t *bfile) {
	242	+ int res;
	243	+
	244	+ if (bfile->bits_shifted == 0) {
	245	+ res = lseek(fin,bfile->position+1,SEEK_SET);
	246	+ if (res < 0) {
	247	+ fprintf(stderr,"lseek of file to %ld failed (4)\n",(long int) bfile->position+1);
	248	+ return(-1);
	249	+ }
	250	+ }
	251	+ else {
	252	+ res = lseek(fin,bfile->position,SEEK_SET);
	253	+ if (res < 0) {
	254	+ fprintf(stderr,"lseek of file to %ld failed (5)\n",(long int) bfile->position);
	255	+ return(-1);
	256	+ }
	257	+ }
	258	+ bfile->bytes_read = read(fin, bfile->bufin, bfile->bufin_size);
	259	+ if (bfile->bytes_read > 0) {
	260	+ bfile->overflow = bfile->bufin[bfile->bytes_read-1];
	261	+ shift_bytes_left(bfile->bufin, bfile->bytes_read, bfile->bits_shifted);
	262	+
	263	+ bfile->strm.next_in = (char *)(bfile->bufin);
	264	+ bfile->strm.avail_in = bfile->bytes_read-1;
	265	+ }
	266	+ if (bfile->bytes_read <=0) {
	267	+ bfile->eof++;
	268	+ }
	269	+ return(0);
	270	+}
	271	+
	272	+/*
	273	+ read compressed data into buffer indicated by bfile,
	274	+ from current position of file,
	275	+ stuffing the overflow byte in first.
	276	+ update the bfile structure accordingly
	277	+ save the new overflow byte (bit-shifted data = suck)
	278	+ this function is for decompression of buffers *after
	279	+ the first one*. for the first one use
	280	+ setup_first_buffer_to_decompress()
	281	+
	282	+ this will set bfile->eof on eof. no other indicator
	283	+ will be provided.
	284	+
	285	+ returns:
	286	+ 0 on success
	287	+ hmm, it really does not do anything about errors :-D
	288	+*/
	289	+int fill_buffer_to_decompress(int fin, bz_info_t *bfile, int ret) {
	290	+ if (bfile->strm.avail_in == 0) {
	291	+ bfile->strm.next_in = (char *)(bfile->bufin);
	292	+ bfile->bufin[0] = bfile->overflow;
	293	+ bfile->bytes_read = read(fin, bfile->bufin+1, bfile->bufin_size-1);
	294	+ if (bfile->bytes_read > 0) {
	295	+ bfile->overflow = bfile->bufin[bfile->bytes_read];
	296	+ shift_bytes_left(bfile->bufin,bfile->bytes_read+1,bfile->bits_shifted);
	297	+ bfile->strm.avail_in = bfile->bytes_read;
	298	+ bfile->position+=bfile->bytes_read;
	299	+ }
	300	+ else {
	301	+ bfile->strm.avail_in = 1; /* the overflow byte */
	302	+ bfile->eof++;
	303	+ }
	304	+ }
	305	+ return(0);
	306	+}
	307	+
	308	+/* size of buffer is bytes usable. there will be a null byte at the end
	309	+
	310	+ what we do with the buffer:
	311	+ - read from front of buffer to end,
	312	+ - fill from point where prev read did not fill buffer, or from where
	313	+ move of data at end of buffer to beginning left room,
	314	+ - mark a string of bytes (starting from what's available to read) as "read"
	315	+
	316	+*/
	317	+buf_info_t *init_buffer(int size) {
	318	+ buf_info_t *b;
	319	+
	320	+ b = (buf_info_t *)malloc(sizeof(buf_info_t));
	321	+ b->buffer = malloc(sizeof(unsigned char)*(size+1));
	322	+ b->buffer[size]='\0';
	323	+ b->end = b->buffer + size;
	324	+ b->next_to_read = b->end; /* nothing available */
	325	+ b->bytes_avail = 0; /* bytes to read, nothing available */
	326	+ b->next_to_fill = b->buffer; /* empty */
	327	+ b->next_to_fill[0] = '\0';
	328	+ return(b);
	329	+}
	330	+
	331	+/* check if buffer (used for decompressed data output) is empty,
	332	+ returns 1 if so and 0 if not */
	333	+int buffer_is_empty(buf_info_t *b) {
	334	+ if (b->bytes_avail == 0) {
	335	+ return(1);
	336	+ }
	337	+ else {
	338	+ return(0);
	339	+ }
	340	+}
	341	+
	342	+/* check if buffer (used for decompressed data output) is full,
	343	+
	344	+ returns 1 if so and 0 if not
	345	+ I'm not liking this function so well, fixme */
	346	+int buffer_is_full(buf_info_t *b) {
	347	+ if (b->next_to_fill == b->end) {
	348	+ return(1);
	349	+ }
	350	+ else {
	351	+ return(0);
	352	+ }
	353	+}
	354	+
	355	+/* FIXME do this right. whatever. */
	356	+int get_file_size(int fin) {
	357	+ int res;
	358	+
	359	+ res = lseek(fin, 0, SEEK_END);
	360	+ if (res < 0) {
	361	+ fprintf(stderr,"lseek of file to 0 failed (6)\n");
	362	+ exit(-1);
	363	+ }
	364	+ return(res);
	365	+}
	366	+
	367	+
	368	+/*
	369	+ look for the first bz2 block in the file after specified offset
	370	+ it tests that the block is valid by doing partial decompression.
	371	+ this function will update the bfile structure:
	372	+ bfile->position will contain the current position of the file (? will it?)
	373	+ bfile->bits_shifted will contain the number of bits that the block is rightshifted
	374	+ bfile->block_start will contain the offset from start of file to the block
	375	+ returns:
	376	+ position of next byte in file to be read, on success
	377	+ -1 if no marker or other error
	378	+*/
	379	+int find_first_bz2_block_after_offset(bz_info_t *bfile, int fin, int position) {
	380	+ int res;
	381	+
	382	+ bfile->bufin_size = BUFINSIZE;
	383	+ bfile->marker = init_marker();
	384	+ bfile->position = position;
	385	+ bfile->block_start = -1;
	386	+ bfile->bytes_read = 0;
	387	+ bfile->bytes_written = 0;
	388	+ bfile->eof = 0;
	389	+ bfile->bits_shifted = -1;
	390	+
	391	+ bfile->file_size = get_file_size(fin);
	392	+
	393	+ while (bfile->bits_shifted < 0) {
	394	+ if (bfile->position > bfile->file_size) {
	395	+ return(-1);
	396	+ }
	397	+ res = lseek(fin, bfile->position, SEEK_SET);
	398	+ if (res < 0) {
	399	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	400	+ exit(-1);
	401	+ }
	402	+ res = find_next_bz2_block_marker(fin, bfile);
	403	+ if (res == 1) {
	404	+ init_decompress(bfile);
	405	+ decompress_header(fin, bfile);
	406	+ res = setup_first_buffer_to_decompress(fin, bfile);
	407	+ if (res == -1) {
	408	+ fprintf(stderr,"couldn't get first buffer of data to uncompress\n");
	409	+ exit(-1);
	410	+ }
	411	+ bfile->strm.next_out = (char *)bfile->bufout;
	412	+ bfile->strm.avail_out = bfile->bufout_size;
	413	+ res = BZ2_bzDecompress ( &(bfile->strm) );
	414	+ /* this means we (probably) have a genuine marker */
	415	+ if (BZ_OK == res \|\| BZ_STREAM_END == res) {
	416	+ res = BZ2_bzDecompressEnd ( &(bfile->strm) );
	417	+ bfile->bytes_read = 0;
	418	+ bfile->bytes_written = 0;
	419	+ bfile->eof = 0;
	420	+ /* leave the file at the right position */
	421	+ res = lseek(fin, bfile->block_start, SEEK_SET);
	422	+ if (res < 0) {
	423	+ fprintf(stderr,"lseek of file to %ld failed (7)\n",(long int) bfile->position);
	424	+ exit(-1);
	425	+ }
	426	+ return(0);
	427	+ }
	428	+ /* right bytes, but there by chance, skip and try again */
	429	+ else {
	430	+ bfile->position+=6;
	431	+ bfile->bits_shifted = -1;
	432	+ bfile->block_start = -1;
	433	+ }
	434	+ }
	435	+ else {
	436	+ return(-1);
	437	+ }
	438	+ }
	439	+ return(-1);
	440	+}
	441	+
	442	+/*
	443	+ find the first bz2 block marker in the file,
	444	+ from its current position,
	445	+ then set up for decompression from that point
	446	+ returns:
	447	+ 0 on success
	448	+ -1 if no marker or other error
	449	+*/
	450	+int init_bz2_file(bz_info_t *bfile, int fin) {
	451	+ int res;
	452	+
	453	+ bfile->initialized++;
	454	+
	455	+ res = find_next_bz2_block_marker(fin, bfile);
	456	+ if (res ==1) {
	457	+ init_decompress(bfile);
	458	+ decompress_header(fin, bfile);
	459	+ setup_first_buffer_to_decompress(fin, bfile);
	460	+ return(0);
	461	+ }
	462	+ return(-1);
	463	+}
	464	+
	465	+/* return -1 if error */
	466	+int decompress_data(bz_info_t bfile, int fin, unsigned char bufferout, int bufout_size) {
	467	+ int ret;
	468	+
	469	+ bfile->bufout = bufferout;
	470	+ bfile->bufout_size = bufout_size;
	471	+ bfile->bytes_written = 0;
	472	+
	473	+ if (! bfile->initialized) {
	474	+ if (init_bz2_file(bfile, fin) == -1) {
	475	+ /* fprintf(stderr,"failed to find block in bz2file (2)\n"); */
	476	+ return(-1);
	477	+ };
	478	+ bfile->strm.next_out = (char *)bfile->bufout;
	479	+ bfile->strm.avail_out = bfile->bufout_size;
	480	+ }
	481	+
	482	+ ret = BZ_OK;
	483	+ while (BZ_OK == ret && bfile->bytes_written == 0) {
	484	+ ret = BZ2_bzDecompress ( &(bfile->strm) );
	485	+ if (BZ_OK == ret \|\| BZ_STREAM_END == ret) {
	486	+ bfile->bytes_written = (unsigned char *)(bfile->strm.next_out) - bfile->bufout;
	487	+ }
	488	+ else {
	489	+ /* fprintf(stderr,"error from BZ decompress %d\n",ret); */
	490	+ return(-1);
	491	+ }
	492	+ fill_buffer_to_decompress(fin, bfile, ret);
	493	+ /*
	494	+ if (bfile->eof && (BZ_OK == ret \|\| BZ_STREAM_END == ret) ) {
	495	+ fprintf(stderr,"eof reached\n");
	496	+ }
	497	+ */
	498	+ }
	499	+ return(0);
	500	+}
	501	+
	502	+
	503	+/*
	504	+ fill output buffer in b with uncompressed data from bfile
	505	+ if this is the first call to the function for this file,
	506	+ the file header will be read, and the first buffer of
	507	+ uncompressed data will be prepared. bfile->position
	508	+ should be set to the offset (from the beginning of file) from
	509	+ which to find the first bz2 block.
	510	+
	511	+ returns:
	512	+ on success, number of bytes read (may be 0)
	513	+ -1 on error
	514	+*/
	515	+int get_buffer_of_uncompressed_data(buf_info_t b, int fin, bz_info_t bfile) {
	516	+ int res;
	517	+
	518	+ if (buffer_is_full(b)) {
	519	+ return(0);
	520	+ }
	521	+
	522	+ if (buffer_is_empty(b)) {
	523	+ b->next_to_fill = b->buffer;
	524	+ }
	525	+
	526	+ res = decompress_data(bfile, fin, b->next_to_fill, b->end - b->next_to_fill);
	527	+ if (res == -1) {
	528	+ return(res);
	529	+ }
	530	+ if (bfile->bytes_written < 0) {
	531	+ /* fprintf(stderr,"read of file failed\n"); */
	532	+ return(-1);
	533	+ }
	534	+ else {
	535	+ /* really?? FIXME check this */
	536	+ if (buffer_is_empty(b)) {
	537	+ b->next_to_read = b->next_to_fill; /* where we just read */
	538	+ }
	539	+ b->bytes_avail += bfile->bytes_written;
	540	+ b->next_to_fill += bfile->bytes_written;
	541	+ b->next_to_fill[0] = '\0';
	542	+ return(0);
	543	+ }
	544	+}
	545	+
	546	+void dumpbuf_info_t(buf_info_t *b) {
	547	+ fprintf(stdout, "\n");
	548	+ fprintf(stdout, "b->buffer: %ld\n", (long int) b->buffer);
	549	+ fprintf(stdout, "b->end: %ld\n", (long int) b->end);
	550	+ fprintf(stdout, "b->next_to_read: %ld\n", (long int) b->next_to_read);
	551	+ fprintf(stdout, "b->next_to_fill: %ld\n", (long int) b->next_to_fill);
	552	+ fprintf(stdout, "b->bytes_avail: %ld\n", (long int) b->bytes_avail);
	553	+}
	554	+
	555	+
	556	+/*
	557	+ copy text from end of buffer to the beginning, that we want to keep
	558	+ around for further processing (i.e. further regex matches)
	559	+ returns number of bytes copied
	560	+*/
	561	+int move_bytes_to_buffer_start(buf_info_t b, unsigned char from_where, int maxbytes) {
	562	+ int i, tocopy;
	563	+
	564	+ if (from_where >= b->end) {
	565	+ return(0);
	566	+ }
	567	+ else {
	568	+ tocopy = b->end - from_where;
	569	+ if (maxbytes && (tocopy > maxbytes)) {
	570	+ tocopy = maxbytes;
	571	+ }
	572	+ for (i = 0; i < tocopy; i++) {
	573	+ b->buffer[i] = from_where[i];
	574	+ }
	575	+ b->next_to_fill = b->buffer + tocopy;
	576	+ b->next_to_fill[0] = '\0';
	577	+ b->next_to_read = b->buffer;
	578	+ b->bytes_avail = tocopy;
	579	+ return(tocopy);
	580	+ }
	581	+}
	582	+
	583	+/*
	584	+ get the first page id after position in file
	585	+ if a pageid is found, the structure pinfo will be updated accordingly
	586	+ returns:
	587	+ 1 if a pageid found,
	588	+ 0 if no pageid found,
	589	+ -1 on error
	590	+*/
	591	+int get_first_page_id_after_offset(int fin, int position, page_info_t *pinfo) {
	592	+ int res;
	593	+ regmatch_t match_page, match_page_id;
	594	+ regex_t compiled_page, compiled_page_id;
	595	+ int length=5000; /* output buffer size */
	596	+ char *page = "<page>";
	597	+ char *page_id = "<page>\n[ ]+<title>[^<]+</title>\n[ ]+<id>([0-9]+)</id>\n";
	598	+
	599	+ buf_info_t *b;
	600	+ bz_info_t bfile;
	601	+
	602	+ bfile.initialized = 0;
	603	+
	604	+ res = regcomp(&compiled_page, page, REG_EXTENDED);
	605	+ res = regcomp(&compiled_page_id, page_id, REG_EXTENDED);
	606	+
	607	+ match_page = (regmatch_t )malloc(sizeof(regmatch_t)1);
	608	+ match_page_id = (regmatch_t )malloc(sizeof(regmatch_t)2);
	609	+
	610	+ b = init_buffer(length);
	611	+
	612	+ pinfo->bits_shifted = -1;
	613	+ pinfo->position = -1;
	614	+ pinfo->page_id = -1;
	615	+
	616	+ bfile.bytes_read = 0;
	617	+
	618	+ if (find_first_bz2_block_after_offset(&bfile, fin, position) == -1) {
	619	+ /* fprintf(stderr,"failed to find block in bz2file (1)\n"); */
	620	+ return(-1);
	621	+ }
	622	+
	623	+ while (!get_buffer_of_uncompressed_data(b, fin, &bfile) && (! bfile.eof)) {
	624	+ if (bfile.bytes_read) {
	625	+ while (regexec(&compiled_page_id, (char *)b->next_to_read, 2, match_page_id, 0 ) == 0) {
	626	+ if (match_page_id[1].rm_so >=0) {
	627	+ /* write page_id to stderr */
	628	+ /*
	629	+ fwrite(b->next_to_read+match_page_id[1].rm_so, sizeof(unsigned char), match_page_id[1].rm_eo - match_page_id[1].rm_so, stderr);
	630	+ fwrite("\n",1,1,stderr);
	631	+ */
	632	+ pinfo->page_id = atoi((char *)(b->next_to_read+match_page_id[1].rm_so));
	633	+ pinfo->position = bfile.block_start;
	634	+ pinfo->bits_shifted = bfile.bits_shifted;
	635	+ return(1);
	636	+ /* write up to and including page id tag to stdout */
	637	+ /*
	638	+ fwrite(b->next_to_read,match_page_id[0].rm_eo,1,stdout);
	639	+ b->next_to_read = b->next_to_read+match_page_id[0].rm_eo;
	640	+ b->bytes_avail -= match_page_id[0].rm_eo;
	641	+ */
	642	+ }
	643	+ else {
	644	+ /* should never happen */
	645	+ fprintf(stderr,"regex gone bad...\n");
	646	+ exit(-1);
	647	+ }
	648	+ }
	649	+ if (regexec(&compiled_page, (char *)b->next_to_read, 1, match_page, 0 ) == 0) {
	650	+ /* write everything up to but not including the page tag to stdout */
	651	+ /*
	652	+ fwrite(b->next_to_read,match_page[0].rm_eo - 6,1,stdout);
	653	+ */
	654	+ move_bytes_to_buffer_start(b, b->next_to_read + match_page[0].rm_so, b->bytes_avail - match_page[0].rm_so);
	655	+ bfile.strm.next_out = (char *)b->next_to_fill;
	656	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	657	+ }
	658	+ else {
	659	+ /* could have the first part of the page tag... so copy up enough bytes to cover that case */
	660	+ if (b->bytes_avail> 5) {
	661	+ /* write everything that didn't match, but leave 5 bytes, to stdout */
	662	+ /*
	663	+ fwrite(b->next_to_read,b->bytes_avail - 5,1,stdout);
	664	+ */
	665	+ move_bytes_to_buffer_start(b, b->next_to_read + b->bytes_avail - 5, 5);
	666	+ bfile.strm.next_out = (char *)b->next_to_fill;
	667	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	668	+ }
	669	+ else {
	670	+ if (buffer_is_empty(b)) {
	671	+ bfile.strm.next_out = (char *)b->buffer;
	672	+ bfile.strm.avail_out = bfile.bufout_size;
	673	+ b->next_to_fill = b->buffer; /* empty */
	674	+ }
	675	+ else {
	676	+ /* there were only 5 or less bytes so just save em don't write em to stdout */
	677	+ move_bytes_to_buffer_start(b, b->next_to_read, b->bytes_avail);
	678	+ bfile.strm.next_out = (char *)b->next_to_fill;
	679	+ bfile.strm.avail_out = b->end - b->next_to_fill;
	680	+ }
	681	+ }
	682	+ }
	683	+ }
	684	+ }
	685	+ /*
	686	+ if (b->bytes_avail) {
	687	+ fwrite(b->next_to_read,b->bytes_avail,1,stdout);
	688	+ }
	689	+ */
	690	+ return(0);
	691	+}
	692	+
	693	+/* search for pageid in a bz2 file, given start and end offsets
	694	+ to search for
	695	+ we guess by the most boring method possible (shrink the
	696	+ interval according to the value found on the last guess,
	697	+ try midpoint of the new interval)
	698	+ multiple calls of this will get the job done.
	699	+ interval has left end = right end if search is complete.
	700	+ this function may return the previous guess and simply
	701	+ shrink the interval.
	702	+ note that a "match" means either that the pageid we find
	703	+ is smaller than the one the caller wants, or is equal.
	704	+ why? because then we can use the output for prefetch
	705	+ for xml dumps and be sure a specific page range is covered :-P
	706	+
	707	+ return value from guess, or -1 on error.
	708	+ */
	709	+int do_iteration(iter_info_t iinfo, int fin, page_info_t pinfo) {
	710	+ int res;
	711	+ int new_position;
	712	+ int interval;
	713	+
	714	+ /*
	715	+ last_position is somewhere in the interval, perhaps at an end
	716	+ last_value is the value we had at that position
	717	+ */
	718	+
	719	+ interval = (iinfo->right_end - iinfo->left_end)/2;
	720	+ if (interval == 0) {
	721	+ interval = 1;
	722	+ }
	723	+ /* fprintf(stderr,"interval size is %ld, left end %ld, right end %ld, last val %d\n",interval, iinfo->left_end, iinfo->right_end, iinfo->last_value); */
	724	+ /* if we're this close, we'll check this value and be done with it */
	725	+ if (iinfo->right_end -iinfo->left_end < 2) {
	726	+ new_position = iinfo->left_end;
	727	+ iinfo->right_end = iinfo->left_end;
	728	+ }
	729	+ else {
	730	+ if (iinfo->last_value < iinfo->value_wanted) {
	731	+ /* fprintf(stderr,"resetting left end\n"); */
	732	+ iinfo->left_end = iinfo->last_position;
	733	+ new_position = iinfo->last_position + interval;
	734	+ }
	735	+ /* iinfo->last_value > iinfo->value_wanted */
	736	+ else {
	737	+ /* fprintf(stderr,"resetting right end\n"); */
	738	+ iinfo->right_end = iinfo->last_position;
	739	+ new_position = iinfo->last_position - interval;
	740	+ }
	741	+ }
	742	+ res = get_first_page_id_after_offset(fin, new_position, pinfo);
	743	+ if (res >0) {
	744	+ /* caller wants the new value */
	745	+ iinfo->last_value = pinfo->page_id;
	746	+ iinfo->last_position = new_position;
	747	+ return(pinfo->page_id);
	748	+ }
	749	+ else {
	750	+ /* here is the tough case, if we didn't find anything then we are prolly too close to the end, truncation or
	751	+ there's just no block here.
	752	+ set the right end, keep the last value and position and let the caller retry with the new interval */
	753	+ if (iinfo->last_value < iinfo->value_wanted) { /* we were moving towards eof */
	754	+ iinfo->right_end = new_position;
	755	+ return(iinfo->last_value);
	756	+ }
	757	+ /* in theory we were moving towards beginning of file, should not have issues, so bail here */
	758	+ else {
	759	+ /* fprintf(stderr,"something very broken, giving up\n"); */
	760	+ return(-1);
	761	+ }
	762	+ }
	763	+}
	764	+
	765	+/*
	766	+ given a bzipped and possibly truncated file, and a page id,
	767	+ hunt for the page id in the file; this assume that the
	768	+ bz2 header is intact and that page ids are steadily increasing
	769	+ throughout the file.
	770	+
	771	+ writes the offset of the relevant block (from beginning of file)
	772	+ and the first pageid found in that block, to stdout
	773	+
	774	+ format of output:
	775	+ position:xxxxx pageid:nnn
	776	+
	777	+ returns: 0 on success, -1 on error
	778	+*/
	779	+int main(int argc, char **argv) {
	780	+ int fin, position, res, interval, page_id, oldmarker, file_size;
	781	+ page_info_t pinfo;
	782	+ iter_info_t iinfo;
	783	+
	784	+ if (argc != 3) {
	785	+ fprintf(stderr,"usage: %s infile id\n", argv[0]);
	786	+ exit(-1);
	787	+ }
	788	+
	789	+ fin = open (argv[1], O_RDONLY);
	790	+ if (fin < 0) {
	791	+ fprintf(stderr,"failed to open file %s for read\n", argv[1]);
	792	+ exit(-1);
	793	+ }
	794	+
	795	+ page_id = atoi(argv[2]);
	796	+ if (page_id <1) {
	797	+ fprintf(stderr,"please specify a page_id >= 1.\n");
	798	+ fprintf(stderr,"usage: %s infile page_id\n", argv[0]);
	799	+ exit(-1);
	800	+ }
	801	+
	802	+ file_size = get_file_size(fin);
	803	+
	804	+ interval = file_size;
	805	+ position = 0;
	806	+ oldmarker = -1;
	807	+ pinfo.bits_shifted = -1;
	808	+ pinfo.position = -1;
	809	+ pinfo.page_id = -1;
	810	+
	811	+ iinfo.left_end = 0;
	812	+ file_size = get_file_size(fin);
	813	+ iinfo.right_end = file_size;
	814	+ iinfo.value_wanted = page_id;
	815	+
	816	+ res = get_first_page_id_after_offset(fin, 0, &pinfo);
	817	+ if (res > 0) {
	818	+ iinfo.last_value = pinfo.page_id;
	819	+ iinfo.last_position = 0;
	820	+ }
	821	+ else {
	822	+ fprintf(stderr,"failed to get anything useful from the beginning of the file even, bailing.\n");
	823	+ exit(1);
	824	+ }
	825	+ if (pinfo.page_id == page_id) {
	826	+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
	827	+ exit(0);
	828	+ }
	829	+
	830	+ while (1) {
	831	+ res = do_iteration(&iinfo, fin, &pinfo);
	832	+ /* things to check: bad return? interval is 0 bytes long? */
	833	+ if (iinfo.left_end == iinfo.right_end) {
	834	+ fprintf(stdout,"position:%d page_id:%d\n",pinfo.position, pinfo.page_id);
	835	+ exit(0);
	836	+ }
	837	+ else if (res < 0) {
	838	+ fprintf(stderr,"broken and quitting\n");
	839	+ exit(-1);
	840	+ }
	841	+ }
	842	+ exit(0);
	843	+}
Property changes on: branches/ariel/xmldumps-backup/findpageidinbz2xml.c
___________________________________________________________________
Added: svn:eol-style
1	844	+ native
Index: branches/ariel/xmldumps-backup/findpageidinbz2xml.h
—	—	@@ -0,0 +1,81 @@
	2	+#ifndef _FINDPAGEID_H
	3	+#define _FINDPAGEID_H
	4	+
	5	+typedef struct {
	6	+ int page_id; /* first id in the block */
	7	+ int bits_shifted; /* block is right shifted this many bits */
	8	+ int position; /* position in file of block */
	9	+} page_info_t;
	10	+
	11	+#define BUFINSIZE 5000
	12	+
	13	+/*
	14	+ keeps all information about a bzipped file
	15	+ plus input/output buffers for decompression
	16	+*/
	17	+typedef struct {
	18	+ unsigned char bufin[BUFINSIZE]; /* compressed data read from file */
	19	+ unsigned char bufout; / uncompressed data, must be allocated by caller */
	20	+ unsigned char marker_buffer[7]; /* data to test for bz2 block marker */
	21	+ unsigned char header_buffer[4]; /* first 4 bytes of file (bzip2 header) */
	22	+
	23	+ int bufin_size; /* size of input buffer for compressed data */
	24	+ int bufout_size; /* size of output buffer for decompressed data, may vary at each call */
	25	+
	26	+ int initialized; /* whether bz2file has been initialized (header processed, seek to
	27	+ some bz2 block in the file and input buffer filled) */
	28	+ int block_start; /* position of bz2 block in file from which we started to read (we
	29	+ read a sequence of bz2 blocks from a given position, this is
	30	+ the offset to the first one) */
	31	+
	32	+ bz_stream strm; /* stream structure for libbz2 */
	33	+ unsigned char overflow; /* since decompressed bytes may not be bit aligned, we keep the last byte
	34	+ read around so we can grab the lower end bits off the end for
	35	+ sticking in front of the next pile of compressed bytes we read */
	36	+
	37	+ int bits_shifted; /* number of bits that the compressed data has been right shifted
	38	+ in the file (if the number is 0, the block marker and subsequent
	39	+ data is byte-aligned) */
	40	+ unsigned char *marker; / bzip2 start of block marker, plus bit-shifted versions of it for
	41	+ locating the marker in a stream of compressed data */
	42	+
	43	+ int position; /* current offset into file from start of file */
	44	+
	45	+ int bytes_read; /* number of bytes of compressed data read from file (per read) */
	46	+ int bytes_written; /* number of bytes of decompressed data written into output buffer (per decompress) */
	47	+ int eof; /* nonzero if eof reached */
	48	+ int file_size; /* length of file, so we don't search past it for blocks */
	49	+} bz_info_t;
	50	+
	51	+#define MASKLEFT 0
	52	+#define MASKRIGHT 1
	53	+
	54	+/*
	55	+ this output buffer is used to collect decompressed output.
	56	+ this is not a circular buffer; when it is full the user is
	57	+ responsible for emptying it completely or partially and moving
	58	+ to the beginning any unused bytes.
	59	+
	60	+*/
	61	+typedef struct {
	62	+ unsigned char buffer; / output storage, allocated by the caller */
	63	+ unsigned char next_to_read; / pointer to the next byte in the buffer with data to be read */
	64	+ unsigned char next_to_fill; / pointer to the next byte in the buffer which is empty and can receive data */
	65	+ int bytes_avail; /* number of bytes available for reading */
	66	+ unsigned char end; / points to byte after end of buffer */
	67	+} buf_info_t;
	68	+
	69	+/*
	70	+ used for each iteration of narrowing down the location in a bzipped2 file of
	71	+ a desired pageid, by finding first compressed block after a guessed
	72	+ position and checking the first pageid (if any) contained in it.
	73	+*/
	74	+typedef struct {
	75	+ int left_end; /* left end of interval to search (bytes from start of file) */
	76	+ int right_end; /* right end of interval to search */
	77	+ int value_wanted; /* pageid desired */
	78	+ int last_value; /* pageid we found in last iteration */
	79	+ int last_position; /* position in file for last iteration */
	80	+} iter_info_t;
	81	+
	82	+#endif
Property changes on: branches/ariel/xmldumps-backup/findpageidinbz2xml.h
___________________________________________________________________
Added: svn:eol-style
1	83	+ native

Status & tagging log

00:39, 7 July 2011 Reedy (talk | contribs) changed the status of r91454 [removed: new added: deferred]