Index: branches/ariel/xmldumps-backup/writeuptopageid.c |
— | — | @@ -0,0 +1,141 @@ |
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
| 7 | + |
/* States of the line-by-line scanner; see setState() for the transitions. */
typedef enum {
  None,          /* initial state in main(), before any recognized tag */
  StartHeader,   /* a line starting with "<mediawiki" was seen; lines are copied out */
  StartPage,     /* a line starting with "<page>" was seen; lines are buffered in memory */
  AtPageID,      /* not referenced by the code in this file */
  WriteMem,      /* page id read and it is not the stop id; buffered lines get flushed */
  Write,         /* state following WriteMem; lines are copied out */
  EndPage,       /* a closing page tag was seen; lines are copied out */
  AtLastPageID   /* page id equals the requested stop id; main() stops reading */
} States;
| 9 | + |
/* Size in bytes (terminating NUL included) of the buffer that holds the
   lines of a page seen before its <id>: 512 KiB plus one byte.  The
   original author's assumption is that no header will ever need more
   than this, however many namespaces a project defines. */
#define MAXHEADERLEN ((512 * 1024) + 1)
| 13 | + |
/* Print a short help text to stderr.
   me: program name shown in the "Usage:" line (main() passes argv[0]). */
void usage(char *me) {
  fprintf(stderr, "Usage: %s pageID\n", me);
  /* the remaining lines are constant, so emit them in one call */
  fputs("Copies the contents of an XML file up to but not including\n"
        "the specified pageID. This program is used in processing XML\n"
        "dump files that were only partially written.\n",
        stderr);
}
| 20 | + |
| 21 | +/* note that even if we have only read a partial line |
| 22 | + of text from the body of the page, (cause the text |
| 23 | + is longer than our buffer), it's fine, since the |
| 24 | + <> delimiters only mark xml, they can't appear |
| 25 | + in the page text. |
| 26 | + |
| 27 | + returns new state */ |
| 28 | +States setState (char *line, States currentState, int endPageID) { |
| 29 | + int pageID = 0; |
| 30 | + |
| 31 | + if (!strncmp(line,"<mediawiki",10)) { |
| 32 | + return(StartHeader); |
| 33 | + } |
| 34 | + else if (!strncmp(line,"<page>",6)) { |
| 35 | + return(StartPage); |
| 36 | + } |
| 37 | + /* there are also user ids, revision ids, etc... pageid will be the first one */ |
| 38 | + else if (currentState == StartPage && (!strncmp(line, "<id>", 4))) { |
| 39 | + /* dig the id out, format is <id>num</id> */ |
| 40 | + pageID = atoi(line+4); |
| 41 | + if (pageID == endPageID) { |
| 42 | + return(AtLastPageID); |
| 43 | + } |
| 44 | + else { |
| 45 | + return(WriteMem); |
| 46 | + } |
| 47 | + } |
| 48 | + else if (currentState == WriteMem) { |
| 49 | + return(Write); |
| 50 | + } |
| 51 | + else if (!strncmp(line, "</page>", 6)) { |
| 52 | + return(EndPage); |
| 53 | + } |
| 54 | + return(currentState); |
| 55 | +} |
| 56 | + |
| 57 | +/* returns 1 on success, 0 on error */ |
| 58 | +int writeMemoryIfNeeded(char *mem, States state) { |
| 59 | + int res = 0; |
| 60 | + |
| 61 | + if (state == WriteMem) { |
| 62 | + res = fwrite(mem,strlen(mem),1,stdout); |
| 63 | + mem[0]='\0'; |
| 64 | + return(res); |
| 65 | + } |
| 66 | +} |
| 67 | + |
| 68 | +/* returns 1 on success, 0 on error */ |
| 69 | +int writeIfNeeded(char *line, States state) { |
| 70 | + if (state == StartHeader || state == WriteMem || state == Write || state == EndPage) { |
| 71 | + return(fwrite(line,strlen(line),1,stdout)); |
| 72 | + } |
| 73 | +} |
| 74 | + |
| 75 | +/* returns 1 on success, 0 on error */ |
| 76 | +int saveInMemIfNeeded(char *mem, char *line, States state) { |
| 77 | + if (state == StartPage) { |
| 78 | + if (strlen(mem) + strlen(line) < MAXHEADERLEN) { |
| 79 | + strcpy(mem + strlen(mem),line); |
| 80 | + } |
| 81 | + else { |
| 82 | + /* we actually ran out of room, who knew */ |
| 83 | + return(0); |
| 84 | + } |
| 85 | + } |
| 86 | + return(1); |
| 87 | +} |
| 88 | + |
| 89 | +int main(int argc,char **argv) { |
| 90 | + long int pageID = 0; |
| 91 | + char *nonNumeric = 0; |
| 92 | + States state = None; |
| 93 | + char *text; |
| 94 | + char line[4097]; |
| 95 | + /* order of magnitude of 2K lines of 80 chrs each, |
| 96 | + no header of either a page nor the mw header should |
| 97 | + ever be longer than that. At least not for some good |
| 98 | + length of time. */ |
| 99 | + char mem[MAXHEADERLEN]; |
| 100 | + |
| 101 | + if (argc != 2) { |
| 102 | + usage(argv[0]); |
| 103 | + exit(-1); |
| 104 | + } |
| 105 | + |
| 106 | + errno = 0; |
| 107 | + pageID = strtol(argv[1], &nonNumeric, 10); |
| 108 | + if (pageID == 0 || |
| 109 | + *nonNumeric != 0 || |
| 110 | + nonNumeric == (char *) &pageID || |
| 111 | + errno != 0) { |
| 112 | + fprintf (stderr,"The value you entered for pageID must be a positive integer.\n"); |
| 113 | + usage(argv[0]); |
| 114 | + exit(-1); |
| 115 | + } |
| 116 | + |
| 117 | + while (fgets(line, sizeof(line)-1, stdin) != NULL) { |
| 118 | + text=line; |
| 119 | + while (*text && isspace(*text)) |
| 120 | + text++; |
| 121 | + state = setState(text, state, pageID); |
| 122 | + if (!saveInMemIfNeeded(mem,line,state)) { |
| 123 | + fprintf(stderr,"failed to save text in temp memory, bailing\n"); |
| 124 | + exit(-1); |
| 125 | + }; |
| 126 | + if (!writeMemoryIfNeeded(mem,state)) { |
| 127 | + fprintf(stderr,"failed to write text from memory, bailing\n"); |
| 128 | + exit(-1); |
| 129 | + } |
| 130 | + if (!writeIfNeeded(line,state)) { |
| 131 | + fprintf(stderr,"failed to write text, bailing\n"); |
| 132 | + exit(-1); |
| 133 | + } |
| 134 | + if (state == AtLastPageID) { |
| 135 | + /* we are done. */ |
| 136 | + break; |
| 137 | + } |
| 138 | + } |
| 139 | + fwrite("</mediawiki>\n",13,1,stdout); |
| 140 | + exit(0); |
| 141 | +} |
| 142 | + |
Property changes on: branches/ariel/xmldumps-backup/writeuptopageid.c |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 143 | + native |