Index: branches/ariel/xmldumps-backup/WikiDump.py |
— | — | @@ -69,6 +69,8 @@ |
70 | 70 | |
71 | 71 | def dbList(filename): |
72 | 72 | """Read database list from a file""" |
| 73 | + if (not filename): |
| 74 | + return [] |
73 | 75 | infile = open(filename) |
74 | 76 | dbs = [] |
75 | 77 | for line in infile: |
— | — | @@ -93,7 +95,8 @@ |
94 | 96 | "dblist": "", |
95 | 97 | "privatelist": "", |
96 | 98 | "biglist": "", |
97 | | - "dir": "", |
| 99 | + "flaggedrevslist": "", |
| 100 | +# "dir": "", |
98 | 101 | "forcenormal": "0", |
99 | 102 | "halt": "0", |
100 | 103 | "skipdblist" : "", |
— | — | @@ -115,7 +118,7 @@ |
116 | 119 | "password": "", |
117 | 120 | #"tools": { |
118 | 121 | "php": "/bin/php", |
119 | | - "gzip2": "/usr/bin/gzip", |
| 122 | + "gzip": "/usr/bin/gzip", |
120 | 123 | "bzip2": "/usr/bin/bzip2", |
121 | 124 | "sevenzip": "/bin/7za", |
122 | 125 | "mysql": "/usr/bin/mysql", |
— | — | @@ -127,8 +130,8 @@ |
128 | 131 | #"cleanup": { |
129 | 132 | "keep": "3", |
130 | 133 | #"chunks": { |
131 | | - # set this to True to enable runing the various xml dump stages as chunks in parallel |
132 | | - "chunksEnabled" : False, |
| 134 | + # set this to 1 to enable running the various xml dump stages as chunks in parallel |
| 135 | + "chunksEnabled" : "0", |
133 | 136 | # for page history runs, number of pages for each chunk, specified separately |
134 | 137 | # e.g. "1000,10000,100000,2000000,2000000,2000000,2000000,2000000,2000000,2000000" |
135 | 138 | # would define 10 chunks with the specified number of pages in each and any extra in |
— | — | @@ -141,39 +144,33 @@ |
142 | 145 | "revsPerChunkHistory" : False, |
143 | 146 | # pages per chunk for abstract runs |
144 | 147 | "pagesPerChunkAbstract" : False, |
| 148 | + # whether or not to recombine the history pieces |
| 149 | + "recombineHistory" : "1", |
145 | 150 | } |
146 | 151 | conf = ConfigParser.SafeConfigParser(defaults) |
147 | 152 | conf.read(files) |
148 | 153 | |
149 | | - try: |
150 | | - self.dbList = dbList(conf.get("wiki", "dblist")) |
151 | | - except ConfigParser.NoSectionError: |
| 154 | + if not conf.has_section("wiki"): |
152 | 155 | print "The mandatory configuration section 'wiki' was not defined." |
153 | | - print "Either the section was ommitted or none of the files in the list" |
154 | | - print "%s exists. Giving up." % files |
155 | | - raise |
156 | | - try: |
157 | | - self.skipDbList = dbList(conf.get("wiki", "skipdblist")) |
158 | | - except ConfigParser.NoSectionError: |
159 | | - self.skipDbList = [] |
160 | | - self.dbList = list(set(self.dbList) - set(self.skipDbList)) |
| 156 | + raise ConfigParser.NoSectionError('wiki') |
161 | 157 | |
| 158 | + if not conf.has_option("wiki","dir"): |
| 159 | + print "The mandatory setting 'dir' in the section 'wiki' was not defined." |
| 160 | + raise ConfigParser.NoOptionError('wiki','dir') |
| 161 | + |
| 162 | + self.dbList = dbList(conf.get("wiki", "dblist")) |
| 163 | + self.skipDbList = dbList(conf.get("wiki", "skipdblist")) |
162 | 164 | self.privateList = dbList(conf.get("wiki", "privatelist")) |
163 | | - biglistFile = conf.get("wiki", "biglist") |
164 | | - if biglistFile: |
165 | | - self.bigList = dbList(biglistFile) |
166 | | - else: |
167 | | - self.bigList = [] |
168 | | - flaggedRevsFile = conf.get("wiki", "flaggedrevslist") |
169 | | - if flaggedRevsFile: |
170 | | - self.flaggedRevsList = dbList(flaggedRevsFile) |
171 | | - else: |
172 | | - self.flaggedRevsList = [] |
173 | | - |
| 165 | + self.bigList = dbList(conf.get("wiki", "biglist")) |
| 166 | + self.flaggedRevsList = dbList(conf.get("wiki", "flaggedrevslist")) |
174 | 167 | self.wikiDir = conf.get("wiki", "dir") |
175 | 168 | self.forceNormal = conf.getint("wiki", "forceNormal") |
176 | 169 | self.halt = conf.getint("wiki", "halt") |
177 | | - |
| 170 | + |
| 171 | + self.dbList = list(set(self.dbList) - set(self.skipDbList)) |
| 172 | + |
| 173 | + if not conf.has_section('output'): |
| 174 | + conf.add_section('output') |
178 | 175 | self.publicDir = conf.get("output", "public") |
179 | 176 | self.privateDir = conf.get("output", "private") |
180 | 177 | self.webRoot = conf.get("output", "webroot") |
— | — | @@ -182,14 +179,20 @@ |
183 | 180 | self.perDumpIndex = conf.get("output", "perdumpindex") |
184 | 181 | self.logFile = conf.get("output", "logfile") |
185 | 182 | |
| 183 | + if not conf.has_section('reporting'): |
| 184 | + conf.add_section('reporting') |
186 | 185 | self.adminMail = conf.get("reporting", "adminmail") |
187 | 186 | self.mailFrom = conf.get("reporting", "mailfrom") |
188 | 187 | self.smtpServer = conf.get("reporting", "smtpserver") |
189 | 188 | self.staleAge = conf.getint("reporting", "staleAge") |
190 | 189 | |
| 190 | + if not conf.has_section('database'): |
| 191 | + conf.add_section('database') |
191 | 192 | self.dbUser = conf.get("database", "user") |
192 | 193 | self.dbPassword = conf.get("database", "password") |
193 | 194 | |
| 195 | + if not conf.has_section('tools'): |
| 196 | + conf.add_section('tools') |
194 | 197 | self.php = conf.get("tools", "php") |
195 | 198 | self.gzip = conf.get("tools", "gzip") |
196 | 199 | self.bzip2 = conf.get("tools", "bzip2") |
— | — | @@ -201,13 +204,18 @@ |
202 | 205 | self.cat = conf.get("tools", "cat") |
203 | 206 | self.grep = conf.get("tools", "grep") |
204 | 207 | |
205 | | - self.chunksEnabled = conf.get("chunks","chunksEnabled") |
| 208 | + if not conf.has_section('chunks'): |
| 209 | + conf.add_section('chunks') |
| 210 | + self.chunksEnabled = conf.getint("chunks","chunksEnabled") |
206 | 211 | self.pagesPerChunkHistory = conf.get("chunks","pagesPerChunkHistory") |
207 | 212 | self.revsPerChunkHistory = conf.get("chunks","revsPerChunkHistory") |
208 | 213 | self.pagesPerChunkAbstract = conf.get("chunks","pagesPerChunkAbstract") |
| 214 | + self.recombineHistory = conf.getint("chunks","recombineHistory") |
209 | 215 | |
| 216 | + if not conf.has_section('cleanup'): |
| 217 | + conf.add_section('cleanup') |
210 | 218 | self.keep = conf.getint("cleanup", "keep") |
211 | | - |
| 219 | + |
212 | 220 | def dbListByAge(self): |
213 | 221 | """ |
214 | 222 | Sort wikis in reverse order of last successful dump : |
Index: branches/ariel/xmldumps-backup/worker.py |
— | — | @@ -99,6 +99,7 @@ |
100 | 100 | self._pagesPerChunkHistory = self.convertCommaSepLineToNumbers(wiki.config.pagesPerChunkHistory) |
101 | 101 | self._revsPerChunkHistory = self.convertCommaSepLineToNumbers(wiki.config.revsPerChunkHistory) |
102 | 102 | self._pagesPerChunkAbstract = self.convertCommaSepLineToNumbers(wiki.config.pagesPerChunkAbstract) |
| 103 | + self._recombineHistory = wiki.config.recombineHistory |
103 | 104 | |
104 | 105 | if (self._chunksEnabled): |
105 | 106 | self.Stats = PageAndEditStats(wiki,dbName) |
— | — | @@ -156,6 +157,9 @@ |
157 | 158 | def chunksEnabled(self): |
158 | 159 | return self._chunksEnabled |
159 | 160 | |
| 161 | + def recombineHistory(self): |
| 162 | + return self._recombineHistory |
| 163 | + |
160 | 164 | # args: total (pages or revs), and the number of (pages or revs) per chunk. |
161 | 165 | def getNumberOfChunksForXMLDumps(self, total, perChunk): |
162 | 166 | if (not total): |
— | — | @@ -430,7 +434,7 @@ |
431 | 435 | "All pages with complete page edit history (.bz2)", |
432 | 436 | "These dumps can be *very* large, uncompressing up to 20 times the archive download size. " + |
433 | 437 | "Suitable for archival and statistical use, most mirror sites won't want or need this.", self._prefetch, self._spawn, self.chunkInfo.getPagesPerChunkHistory())) |
434 | | - if (self.chunkInfo.chunksEnabled()): |
| 438 | + if (self.chunkInfo.chunksEnabled() and self.chunkInfo.recombineHistory()): |
435 | 439 | self.dumpItems.append( |
436 | 440 | RecombineXmlDump("meta-history", |
437 | 441 | "metahistorybz2dumprecombine", |
— | — | @@ -443,7 +447,7 @@ |
444 | 448 | "All pages with complete edit history (.7z)", |
445 | 449 | "These dumps can be *very* large, uncompressing up to 100 times the archive download size. " + |
446 | 450 | "Suitable for archival and statistical use, most mirror sites won't want or need this.", self.chunkInfo.getPagesPerChunkHistory())) |
447 | | - if (self.chunkInfo.chunksEnabled()): |
| 451 | + if (self.chunkInfo.chunksEnabled() and self.chunkInfo.recombineHistory()): |
448 | 452 | self.dumpItems.append( |
449 | 453 | RecombineXmlRecompressDump("meta-history", |
450 | 454 | "metahistory7zdumprecombine", |