r107839 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r107838‎ \| r107839 \| r107840 >
Date:	17:20, 2 January 2012
Author:	ariel
Status:	deferred
Tags:
Comment:	utility to compress an input stream into multiple bz2 streams on output, with index of pages and offsets
Modified paths:	/branches/ariel/xmldumps-backup/mwbzutils/Makefile (modified) (history) /branches/ariel/xmldumps-backup/mwbzutils/README (modified) (history) /branches/ariel/xmldumps-backup/mwbzutils/recompressxml.c (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mwbzutils/recompressxml.c
—	—	@@ -0,0 +1,360 @@
	2	+#include <unistd.h>
	3	+#include <stdio.h>
	4	+#include <getopt.h>
	5	+#include <string.h>
	6	+#include <sys/types.h>
	7	+#include <sys/stat.h>
	8	+#include <fcntl.h>
	9	+#include <stdlib.h>
	10	+#include <errno.h>
	11	+#include <sys/types.h>
	12	+#include <regex.h>
	13	+#include <ctype.h>
	14	+#include "bzlib.h"
	15	+
	16	+char inBuf[4096];
	17	+char outBuf[8192];
	18	+
	19	+char inBuf_indx[4096];
	20	+char outBuf_indx[8192];
	21	+
	22	+char *pageOpenTag = "<page>\n";
	23	+
	24	+char *pageTitleExpr = "<title>(.+)</title>\n";
	25	+regmatch_t *matchPageTitleExpr;
	26	+regex_t compiledMatchPageTitleExpr;
	27	+
	28	+char *idExpr = "<id>([0-9]+)</id>\n";
	29	+regmatch_t *matchIdExpr;
	30	+regex_t compiledMatchIdExpr;
	31	+
	32	+bz_stream strm_indx;
	33	+
	34	+void setupIndexBz2Stream() {
	35	+ int bz_verbosity = 0;
	36	+ int bz_workFactor = 0;
	37	+ int bz_blockSize100k = 9;
	38	+
	39	+ strm_indx.bzalloc = NULL;
	40	+ strm_indx.bzfree = NULL;
	41	+ strm_indx.opaque = NULL;
	42	+
	43	+ /* init bzip compression stuff */
	44	+ BZ2_bzCompressInit(&(strm_indx), bz_blockSize100k, bz_verbosity, bz_workFactor);
	45	+}
	46	+
	47	+void setupRegexps() {
	48	+ matchPageTitleExpr = (regmatch_t )malloc(sizeof(regmatch_t)2);
	49	+ regcomp(&compiledMatchPageTitleExpr, pageTitleExpr, REG_EXTENDED);
	50	+ matchIdExpr = (regmatch_t )malloc(sizeof(regmatch_t)2);
	51	+ regcomp(&compiledMatchIdExpr, idExpr, REG_EXTENDED);
	52	+ return;
	53	+}
	54	+
	55	+int startsPage(char *buf) {
	56	+ while (*buf == ' ') buf++;
	57	+
	58	+ if (!strcmp(buf,pageOpenTag)) return 1;
	59	+ else return 0;
	60	+}
	61	+
	62	+char hasPageTitle(char buf) {
	63	+ static char pageTitle[513];
	64	+ int length = 0;
	65	+
	66	+ pageTitle[0]='\0';
	67	+
	68	+ while (*buf == ' ') buf++;
	69	+
	70	+ if (regexec(&compiledMatchPageTitleExpr, buf, 2, matchPageTitleExpr, 0 ) == 0) {
	71	+ if (matchPageTitleExpr[1].rm_so >=0) {
	72	+ length = matchPageTitleExpr[1].rm_eo - matchPageTitleExpr[1].rm_so;
	73	+ if (length > 512) {
	74	+ fprintf(stderr,"Page title length > 512 bytes... really? Bailing.\n");
	75	+ exit(1);
	76	+ }
	77	+ strncpy(pageTitle,buf+matchPageTitleExpr[1].rm_so, length);
	78	+ pageTitle[length] = '\0';
	79	+ }
	80	+ }
	81	+ return(pageTitle);
	82	+}
	83	+
	84	+int hasId(char *buf) {
	85	+ int id = 0;
	86	+
	87	+ while (*buf == ' ') buf++;
	88	+
	89	+ if (regexec(&compiledMatchIdExpr, buf, 2, matchIdExpr, 0 ) == 0) {
	90	+ if (matchIdExpr[1].rm_so >=0) {
	91	+ id = atoi(buf+matchIdExpr[1].rm_so);
	92	+ }
	93	+ }
	94	+ return(id);
	95	+}
	96	+
	97	+int endsXmlBlock(char *buf, int header) {
	98	+ char *pageCloseTag = "</page>\n";
	99	+ char *mediawikiCloseTag = "</mediawiki>\n";
	100	+ char *siteinfoCloseTag = "</siteinfo>\n";
	101	+
	102	+ while (*buf == ' ') buf++;
	103	+
	104	+ /* if we are trying to process the header, check for that only */
	105	+ if (header) {
	106	+ if (!strcmp(buf,siteinfoCloseTag)) return 1;
	107	+ else return 0;
	108	+ }
	109	+
	110	+ /* normal check for end of page, end of content */
	111	+ if (!strcmp(buf,pageCloseTag) \|\| !strcmp(buf,mediawikiCloseTag)) return 1;
	112	+ else return 0;
	113	+}
	114	+
	115	+int endBz2Stream(bz_stream strm, char outBuf, int bufSize, FILE *fd) {
	116	+ int result;
	117	+ int offset;
	118	+
	119	+ do {
	120	+ strm->avail_in = 0;
	121	+ result = BZ2_bzCompress ( strm, BZ_FINISH );
	122	+ fwrite(outBuf,bufSize-strm->avail_out,1,fd);
	123	+ strm->next_out = outBuf;
	124	+ strm->avail_out = 8192;
	125	+ } while (result != BZ_STREAM_END);
	126	+ offset = strm->total_out_lo32;
	127	+ BZ2_bzCompressEnd(strm);
	128	+ return(offset);
	129	+}
	130	+
	131	+int writeCompressedXmlBlock(int header, int count, int fileOffset, FILE *indexfd, int indexcompressed, int verbose) {
	132	+
	133	+ bz_stream strm;
	134	+ int bz_verbosity = 0;
	135	+ int bz_workFactor = 0;
	136	+ int bz_blockSize100k = 9;
	137	+ int wroteSomething = 0;
	138	+ int blocksDone = 0;
	139	+
	140	+ strm.bzalloc = NULL;
	141	+ strm.bzfree = NULL;
	142	+ strm.opaque = NULL;
	143	+
	144	+ char *pageTitle = NULL;
	145	+ int pageId = 0;
	146	+ enum States{WantPage,WantPageTitle,WantPageId};
	147	+ int state = WantPage;
	148	+
	149	+ /* init bzip compression stuff */
	150	+ BZ2_bzCompressInit(&strm, bz_blockSize100k, bz_verbosity, bz_workFactor);
	151	+
	152	+ while (fgets(inBuf, sizeof(inBuf), stdin) != NULL) {
	153	+ if (verbose > 1) {
	154	+ fprintf(stderr,"input buffer is: ");
	155	+ fprintf(stderr,"%s",inBuf);
	156	+ }
	157	+
	158	+ wroteSomething = 1;
	159	+ /* add the buffer content to stuff to be compressed */
	160	+ strm.next_in = inBuf;
	161	+ strm.avail_in = strlen(inBuf);
	162	+ strm.next_out = outBuf;
	163	+ strm.avail_out = 8192;
	164	+
	165	+ /* we are to build an index. */
	166	+ if (indexfd) {
	167	+ if (verbose > 2) {
	168	+ fprintf(stderr,"doing index check\n");
	169	+ }
	170	+ if (state == WantPage) {
	171	+ if (verbose > 2) {
	172	+ fprintf(stderr,"checking for page tag\n");
	173	+ }
	174	+ if (startsPage(inBuf)) {
	175	+ state = WantPageTitle;
	176	+ }
	177	+ }
	178	+ else if (state == WantPageTitle) {
	179	+ if (verbose > 1) {
	180	+ fprintf(stderr,"checking for page title tag\n");
	181	+ }
	182	+ pageTitle = hasPageTitle(inBuf);
	183	+ if (pageTitle[0]) {
	184	+ state = WantPageId;
	185	+ }
	186	+ }
	187	+ else if (state == WantPageId) {
	188	+ if (verbose > 1) {
	189	+ fprintf(stderr,"checking for page id tag\n");
	190	+ }
	191	+ pageId = hasId(inBuf);
	192	+ if (pageId) {
	193	+ state = WantPage;
	194	+ }
	195	+ if (indexcompressed) {
	196	+ if (verbose) {
	197	+ fprintf(stderr,"writing line to compressed index file\n");
	198	+ }
	199	+ sprintf(inBuf_indx,"%d:%d:%s\n",fileOffset,pageId,pageTitle);
	200	+ strm_indx.next_in = inBuf_indx;
	201	+ strm_indx.avail_in = strlen(inBuf_indx);
	202	+ do {
	203	+ if (verbose > 2) {
	204	+ fprintf(stderr,"bytes left to read for index compression: %d\n",strm_indx.avail_in);
	205	+ }
	206	+ strm_indx.next_out = outBuf_indx;
	207	+ strm_indx.avail_out = 8192;
	208	+ BZ2_bzCompress ( &strm_indx, BZ_RUN );
	209	+ fwrite(outBuf_indx,sizeof(outBuf_indx)-strm_indx.avail_out,1,indexfd);
	210	+ } while (strm_indx.avail_in >0);
	211	+ }
	212	+ else {
	213	+ if (verbose) {
	214	+ fprintf(stderr,"writing line to index file\n");
	215	+ }
	216	+ fprintf(indexfd,"%d:%d:%s\n",fileOffset,pageId,pageTitle);
	217	+ }
	218	+ pageId = 0;
	219	+ pageTitle = NULL;
	220	+ }
	221	+ }
	222	+ do {
	223	+ if (verbose > 2) {
	224	+ fprintf(stderr,"bytes left to read for text compression: %d\n",strm.avail_in);
	225	+ }
	226	+ strm.next_out = outBuf;
	227	+ strm.avail_out = 8192;
	228	+ BZ2_bzCompress ( &strm, BZ_RUN );
	229	+ fwrite(outBuf,sizeof(outBuf)-strm.avail_out,1,stdout);
	230	+ } while (strm.avail_in > 0);
	231	+ if (verbose > 1) fprintf(stderr,"avail_out is now: %d\n", strm.avail_out);
	232	+
	233	+ if (endsXmlBlock(inBuf, header)) {
	234	+ /* special case: doing the siteinfo stuff at the beginning */
	235	+ if (verbose) {
	236	+ fprintf(stderr,"end of header found\n");
	237	+ }
	238	+ if (header) {
	239	+ fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout);
	240	+ return(fileOffset);
	241	+ }
	242	+
	243	+ blocksDone++;
	244	+ if (blocksDone % count == 0) {
	245	+ if (verbose) fprintf(stderr, "end of xml block found\n");
	246	+ /* close down bzip stream, we are done with this block */
	247	+ fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout);
	248	+ return(fileOffset);
	249	+ }
	250	+ }
	251	+ }
	252	+ if (verbose) fprintf(stderr,"eof reached\n");
	253	+ if (wroteSomething) {
	254	+ /* close down bzip stream, we are done with this block */
	255	+ fileOffset += endBz2Stream(&strm, outBuf, sizeof(outBuf), stdout);
	256	+ }
	257	+ return(fileOffset);
	258	+}
	259	+
	260	+void usage(char whoami, char message) {
	261	+ if (message) {
	262	+ fprintf(stderr,"%s",message);
	263	+ }
	264	+ fprintf(stderr,"Usage: %s --pagesperstream n [--buildindex indexfilename] [--verbose]\n\n", whoami);
	265	+ fprintf(stderr,"Reads a stream of XML pages from stdin,\n");
	266	+ fprintf(stderr,"and writes to stdout the bz2 compressed\n");
	267	+ fprintf(stderr,"data, one bz2 stream per count pages.\n\n");
	268	+ fprintf(stderr,"Options:\n");
	269	+ fprintf(stderr,"pagesperstream: compress this many pages in each complete bz2stream before\n");
	270	+ fprintf(stderr," opening a new stream. The siteinfo header is written to a\n");
	271	+ fprintf(stderr," separate stream at the beginning of all output, and the closing\n");
	272	+ fprintf(stderr," mediawiki tag is written into a separate stream at the end.\n");
	273	+ fprintf(stderr,"buildindex: generate a file containing an index of pages ids and titles\n");
	274	+ fprintf(stderr," per stream. Each line contains: offset-to-stream:pageid:pagetitle\n");
	275	+ fprintf(stderr," If filename ends in '.bz2' the file will be written in bz2 format.\n");
	276	+ fprintf(stderr,"verbose: produce lots of debugging output to stderr. This option can be used\n");
	277	+ fprintf(stderr," multiple times to increase verbosity.\n");
	278	+ exit(-1);
	279	+}
	280	+
	281	+int main(int argc, char **argv) {
	282	+ int optindex=0;
	283	+ int optc;
	284	+ int offset = 0;
	285	+
	286	+ struct option optvalues[] = {
	287	+ {"buildindex", 1, 0, 'b'},
	288	+ {"pagesperstream", 1, 0, 'p'},
	289	+ {"verbose", 0, 0, 'v'},
	290	+ {NULL, 0, NULL, 0}
	291	+ };
	292	+
	293	+ int count = 0;
	294	+ int doIndex = 0;
	295	+ char *indexFilename = NULL;
	296	+ int verbose = 0;
	297	+ FILE *indexfd = NULL;
	298	+ int indexcompressed = 0;
	299	+
	300	+ while (1) {
	301	+ optc=getopt_long_only(argc,argv,"pagesperstream:buildindex:verbose", optvalues, &optindex);
	302	+ if (optc=='b') {
	303	+ doIndex=1;
	304	+ indexFilename = optarg;
	305	+ }
	306	+ else if (optc=='p') {
	307	+ if (!(isdigit(optarg[0]))) usage(argv[0],NULL);
	308	+ count=atoi(optarg);
	309	+ }
	310	+ else if (optc=='v')
	311	+ verbose++;
	312	+ else if (optc==-1) break;
	313	+ else usage(argv[0],"unknown option or other error\n");
	314	+ }
	315	+
	316	+ if (count <= 0) {
	317	+ usage(argv[0],"bad or no argument given for count.\n");
	318	+ }
	319	+
	320	+ if (indexFilename) {
	321	+ if (verbose) {
	322	+ fprintf(stderr,"setting up index file creation.\n");
	323	+ }
	324	+ indexfd = fopen(indexFilename, "w");
	325	+ if (! indexfd) {
	326	+ usage(argv[0],"failed to open index file for write.\n");
	327	+ }
	328	+ if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) {
	329	+ if (verbose) {
	330	+ fprintf(stderr,"index file will be bz2 compressed.\n");
	331	+ }
	332	+ indexcompressed++;
	333	+ setupIndexBz2Stream();
	334	+ }
	335	+ }
	336	+
	337	+ setupRegexps();
	338	+
	339	+ /* deal with the XML header */
	340	+ offset = writeCompressedXmlBlock(1,count,0,indexfd,indexcompressed,verbose);
	341	+
	342	+ while (!feof(stdin)) {
	343	+ offset = writeCompressedXmlBlock(0,count,offset,indexfd,indexcompressed,verbose);
	344	+ }
	345	+
	346	+ if (indexFilename) {
	347	+ if (indexcompressed) {
	348	+ if (verbose) {
	349	+ fprintf(stderr,"closing bz2 index file stream.\n");
	350	+ }
	351	+ endBz2Stream(&strm_indx, outBuf_indx, sizeof(outBuf_indx), indexfd);
	352	+ }
	353	+ if (verbose) {
	354	+ fprintf(stderr,"closing index file.\n");
	355	+ }
	356	+ fclose(indexfd);
	357	+ }
	358	+
	359	+ exit(0);
	360	+
	361	+}
Property changes on: branches/ariel/xmldumps-backup/mwbzutils/recompressxml.c
___________________________________________________________________
Added: svn:eol-style
1	362	+ native
Index: branches/ariel/xmldumps-backup/mwbzutils/README
—	—	@@ -42,6 +42,18 @@
43	43	position:xxxxx pageid:nnn
44	44	It exits with 0 on success, -1 on error.
45	45
	46	+recompressxml - Reads an xml stream of pages and writes multiple bz2 compressed
	47	+ streams, concatenated, to stdout, with the specified number of
	48	+ pages per stream. The mediawiki site info header is in its
	49	+ own bz2 stream. Each stream can be extracted as a separate file
	50	+ by an appropriate tool, checking for the byte-aligned string "BZh91AY&SY"
	51	+ and a following <page> tag (after uncompressing the first chunk
	52	+ of data after that string). Alternatively, a tool can seek to
	53	+ the location of one of the streams in order to find a particular
	54	+ page. An index of file-offset:page-id:page-title lines
	55	+ is written to a specified file if desired; the index file will be
	56	+ bz2 compressed if the filename given ends with .bz2.
	57	+
46	58	Library routines:
47	59
48	60	mwbz2lib.c - various utility functions (bitmasks, shifting and comparing bytes,
Index: branches/ariel/xmldumps-backup/mwbzutils/Makefile
—	—	@@ -29,7 +29,8 @@
30	30	all: checkforbz2footer \
31	31	dumpbz2filefromoffset \
32	32	dumplastbz2block \
33		~~- findpageidinbz2xml~~
	33	+ findpageidinbz2xml \
	34	+ recompressxml
34	35
35	36	dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o
36	37	$(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2
—	—	@@ -43,20 +44,26 @@
44	45	dumpbz2filefromoffset: $(OBJSBZ) mwbzlib.o dumpbz2filefromoffset.o
45	46	$(CC) $(CFLAGS) $(LDFLAGS) -o dumpbz2filefromoffset dumpbz2filefromoffset.o mwbzlib.o $(OBJSBZ) -lbz2
46	47
47		~~-install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset~~
	48	+recompressxml: $(OBJSBZ) recompressxml.o
	49	+ $(CC) $(CFLAGS) $(LDFLAGS) -o recompressxml recompressxml.o -lbz2
	50	+
	51	+install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset recompressxml
48	52	if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi
49	53	cp -f dumplastbz2block $(PREFIX)/bin/dumplastbz2block
50	54	cp -f findpageidinbz2xml $(PREFIX)/bin/findpageidinbz2xml
51	55	cp -f checkforbz2footer $(PREFIX)/bin/checkforbz2footer
52	56	cp -f dumpbz2filefromoffset $(PREFIX)/bin/dumpbz2filefromoffset
	57	+ cp -f recompressxml $(PREFIX)/bin/recompressxml
53	58	chmod a+x $(PREFIX)/bin/dumplastbz2block
54	59	chmod a+x $(PREFIX)/bin/findpageidinbz2xml
55	60	chmod a+x $(PREFIX)/bin/checkforbz2footer
56	61	chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset
	62	+ chmod a+x $(PREFIX)/bin/recompressxml
57	63
58	64	clean:
59	65	rm -f *.o *.a dumplastbz2block findpageidinbz2xml \
60		~~- checkforbz2footer dumpbz2filefromoffset~~
	66	+ checkforbz2footer dumpbz2filefromoffset \
	67	+ recompressxml
61	68
62	69	bzlibfuncs.o: bzlibfuncs.c bzlib.h bzlib_private.h
63	70	$(CC) $(CFLAGS) -c bzlibfuncs.c
—	—	@@ -72,6 +79,8 @@
73	80	$(CC) $(CFLAGS) -c checkforbz2footer.c
74	81	dumpbz2filefromoffset.o: dumpbz2filefromoffset.c
75	82	$(CC) $(CFLAGS) -c dumpbz2filefromoffset.c
	83	+recompressxml.o: recompressxml.c
	84	+ $(CC) $(CFLAGS) -c recompressxml.c
76	85
77	86	distclean: clean
78	87
—	—	@@ -80,6 +89,7 @@
81	90	rm -f $(DISTNAME)
82	91	ln -s -f . $(DISTNAME)
83	92	tar cvf $(DISTNAME).tar \
	93	+ $(DISTNAME)/recompressxml.c \
84	94	$(DISTNAME)/dumplastbz2block.c \
85	95	$(DISTNAME)/findpageidinbz2xml.c \
86	96	$(DISTNAME)/checkforbz2footer.c \

Status & tagging log

09:26, 3 January 2012 Siebrand (talk | contribs) changed the status of r107839 [removed: new added: deferred]