r101971 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r101970‎ | r101971 | r101972 >
Date:09:46, 4 November 2011
Author:ariel
Status:deferred
Tags:
Comment:
possibily useful, likely buggy script for grabbing specific columns from desired rows of a mysql table dump
Modified paths:
  • /branches/ariel/xmldumps-backup/mysql2txt.py (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/mysql2txt.py
@@ -0,0 +1,405 @@
 2+# this script reads from stdin a sql file created by mysqldump, grabs the requested columns from
 3+# the requested table from each tuple, and writes them out one tuple per line
 4+# with a comma between columns, keeping the original escaping of values as done by mysql.
 5+
 6+import getopt
 7+import os
 8+import re
 9+import sys
 10+
 11+class ConverterError(Exception):
 12+ pass
 13+
 14+class MysqlFile:
 15+ def __init__(self, f, tableRequested, columnsRequested, valuesRequestedCols, valuesRequestedVals, fieldSeparator):
 16+ self.file = f
 17+ self.tableRequested = tableRequested
 18+ self.columnsRequested = columnsRequested
 19+ self.valuesRequestedCols = valuesRequestedCols
 20+ self.valuesRequestedVals = valuesRequestedVals
 21+ self.fieldSeparator = fieldSeparator
 22+
 23+ self.buffer = ""
 24+ self.bufferInd = 0
 25+ self.eof = False
 26+ self.rowsDone = False
 27+ self.GET = 1
 28+ self.CHECK = 2
 29+ self.SKIP = 0
 30+
 31+ def findCreateStatement(self):
 32+ tableFound = False
 33+ toFind = "CREATE TABLE `%s` (\n" % self.tableRequested
 34+ line = self.getLine(len(toFind))
 35+ if (not line.endswith("\n")):
 36+ self.skipLineRemainder()
 37+ while line != "":
 38+ if line == toFind:
 39+ tableFound = True
 40+ break
 41+ line = self.getLine(len(toFind))
 42+ if (not line.endswith("\n")):
 43+ self.skipLineRemainder()
 44+ if not tableFound:
 45+ raise ConverterError("create statement for requested table not found in file")
 46+
 47+ def getLine(self, maxbytes = 0):
 48+ """returns line including the \n, up to maxbytes"""
 49+ line = ""
 50+ length = 0
 51+ if self.eof:
 52+ return False
 53+ while self.buffer[self.bufferInd] != '\n':
 54+ line = line + self.buffer[self.bufferInd]
 55+ if not self.incrementBufferPtr():
 56+ return False
 57+ length = length + 1
 58+ if maxbytes and length == maxbytes:
 59+ return line
 60+
 61+ if not self.skipChar('\n'):
 62+ return False
 63+ return line + "\n"
 64+
 65+ def skipLineRemainder(self):
 66+ # skip up to the newline...
 67+ while self.buffer[self.bufferInd] != '\n':
 68+ if not self.incrementBufferPtr():
 69+ return False
 70+ # and now the newline.
 71+ return self.incrementBufferPtr()
 72+
 73+ def findInsertStatement(self):
 74+ """leave the file contents at the line immediately following
 75+ an INSERT statement"""
 76+ if m.eof:
 77+ return False
 78+ insertFound = False
 79+ toFind = "INSERT INTO `%s` VALUES " % self.tableRequested
 80+ line = self.getLine(len(toFind))
 81+ while line and not self.eof:
 82+ if line.startswith(toFind):
 83+ insertFound = True
 84+ break
 85+ if (not line.endswith("\n")):
 86+ self.skipLineRemainder()
 87+ line = self.getLine(len(toFind))
 88+ return insertFound
 89+
 90+ def setupColumnRetrieval(self):
 91+ self.columnsInTable = []
 92+ columnNameExpr = re.compile('\s+`([^`]+)`')
 93+ line = self.getLine()
 94+ while (line and not self.eof and line[0] != ')' ):
 95+ columnNameMatch = columnNameExpr.match(line)
 96+ if (columnNameMatch):
 97+ self.columnsInTable.append(columnNameMatch.group(1))
 98+ line = self.getLine()
 99+
 100+ for c in self.columnsRequested:
 101+ if not c in self.columnsInTable:
 102+ raise ConverterError("requested column %s not found in table" % c)
 103+
 104+# print "columns in table: ", self.columnsInTable
 105+# print "columnsRequested: ", self.columnsRequested
 106+
 107+ self.columnsToGet = []
 108+ for c in self.columnsInTable:
 109+ v = self.SKIP
 110+ if c in self.columnsRequested:
 111+ v = v | self.GET
 112+ if c in self.valuesRequestedCols:
 113+ v = v | self.CHECK
 114+ self.columnsToGet.append( v )
 115+
 116+# print "columns to get: ", self.columnsToGet
 117+
 118+ self.columnOrder = []
 119+ # we want here a list which tells us to
 120+ # write the ith column we read from tuple first,
 121+ # the jth one second, the kth one third etc.
 122+ columnsToGetTrue = []
 123+ for i in range(0,len(self.columnsToGet)):
 124+ if self.columnsToGet[i] & self.GET:
 125+ columnsToGetTrue.append(self.columnsInTable[i])
 126+ for c in self.columnsRequested:
 127+ self.columnOrder.append(columnsToGetTrue.index(c))
 128+
 129+# print "column order: ", self.columnOrder
 130+
 131+ def whine(self, message = None):
 132+ if (message):
 133+ raise ConverterError("whine whine whine: " + message )
 134+ else:
 135+ raise ConverterError("whine whine whine. failed to parse a row.")
 136+
 137+ def getColumnsFromRow(self):
 138+ """returns a list of column values extracted from a row.
 139+ f is an open input file positioned at the beginning of a
 140+ tuple representing a row in mysql output format,
 141+ colsToGet is a list of True/False correspnding to which
 142+ elements in the tuple we want to retrieve and return"""
 143+
 144+# print "buffer is ", self.buffer[self.bufferInd:self.bufferInd+80], "..."
 145+ if not self.skipStartOfRow():
 146+ self.whine("couldn't find start of row")
 147+ cols = []
 148+ ind = 0
 149+ skip = False
 150+ for c in self.columnsToGet:
 151+ if skip:
 152+ self.skipColValue()
 153+ elif c & self.GET:
 154+ cols.append(self.getColValue())
 155+ if c & self.CHECK:
 156+ colName = self.columnsInTable[ind]
 157+ j = self.valuesRequestedCols.index(colName)
 158+ if self.getColValue() != self.valuesRequestedVals[j]:
 159+ skip = True
 160+ cols = None
 161+ elif c & self.CHECK:
 162+ colName = self.columnsInTable[ind]
 163+ j = self.valuesRequestedCols.index(colName)
 164+ if self.getColValue() != self.valuesRequestedVals[j]:
 165+ skip = True
 166+ cols = None
 167+ else:
 168+ self.skipColValue()
 169+ ind = ind + 1
 170+
 171+ self.skipEndOfRow()
 172+ return(cols)
 173+
 174+ def skipStartOfRow(self):
 175+ # expect (
 176+ if not self.skipChar('('):
 177+ return False
 178+ return True
 179+
 180+ def skipEndOfRow(self):
 181+ # expect... what do we expect? ); or ),
 182+ # the first means end of row with no more rows after, the second means end of
 183+ # specific row only
 184+ if not self.skipChar(')'):
 185+ self.whine()
 186+ if not self.skipChar(','):
 187+ if self.skipChar(';'):
 188+ self.rowsDone = True
 189+ else:
 190+ self.whine()
 191+ self.skipChar('\n')
 192+
 193+ def getColValue(self):
 194+ #expect: a string of digits
 195+ # or: ' some stuff, '
 196+ value=""
 197+ if (self.buffer[self.bufferInd].isdigit()):
 198+ while self.buffer[self.bufferInd].isdigit():
 199+ value=value + self.buffer[self.bufferInd]
 200+ if not self.incrementBufferPtr():
 201+ return False
 202+ # there will be a comma before the next
 203+ # column if we aren't at the end of the row.
 204+ self.skipChar(',')
 205+ return value
 206+ elif (self.skipChar("'")):
 207+ value = "'"
 208+ done = False
 209+ escaped = False
 210+ while not done:
 211+ if self.buffer[self.bufferInd] != "'" and self.buffer[self.bufferInd] != '\\':
 212+ value=value + self.buffer[self.bufferInd]
 213+ if not self.incrementBufferPtr():
 214+ return False
 215+ escaped = False
 216+ elif self.buffer[self.bufferInd] == "'":
 217+ value=value + self.buffer[self.bufferInd]
 218+ if not self.incrementBufferPtr():
 219+ return False
 220+ if not escaped:
 221+ done = True
 222+ else:
 223+ escaped = False
 224+ else: # escape char \ found
 225+ value=value + self.buffer[self.bufferInd]
 226+ if not self.incrementBufferPtr():
 227+ return False
 228+ if escaped:
 229+ escaped = False
 230+ else:
 231+ escaped = True
 232+ if done:
 233+ # there will be a comma before the next
 234+ # column if we aren't at the end of the row.
 235+ self.skipChar(',')
 236+ return value
 237+ else:
 238+ self.whine()
 239+
 240+ def skipColValue(self):
 241+ #expect: a string of digits with possibly a . in there
 242+ # or: ' some stuff, '
 243+ if (self.buffer[self.bufferInd].isdigit()):
 244+ # might have a float so... crudely...
 245+ while self.buffer[self.bufferInd].isdigit() or self.buffer[self.bufferInd] == '.' or self.buffer[self.bufferInd] == 'e' or self.buffer[self.bufferInd] == '-':
 246+ if not self.incrementBufferPtr():
 247+ return False
 248+ # there will be a comma before the next
 249+ # column if we aren't at the end of the row.
 250+ self.skipChar(',')
 251+ elif (self.skipChar("'")):
 252+ done = False
 253+ escaped = False
 254+ while not done:
 255+ if self.buffer[self.bufferInd] != "'" and self.buffer[self.bufferInd] != '\\':
 256+ if not self.incrementBufferPtr():
 257+ return False
 258+ escaped = False
 259+ elif self.buffer[self.bufferInd] == "'":
 260+ if not self.incrementBufferPtr():
 261+ return False
 262+ if not escaped:
 263+ done = True
 264+ else:
 265+ escaped = False
 266+ else: # escape char \ found
 267+ if not self.incrementBufferPtr():
 268+ return False
 269+ if escaped:
 270+ escaped = False
 271+ else:
 272+ escaped = True
 273+ if done:
 274+ # there will be a comma before the next
 275+ # column if we aren't at the end of the row.
 276+ self.skipChar(',')
 277+ else:
 278+# print "buffer is ", self.buffer[self.bufferInd:self.bufferInd+80], "..."
 279+ self.whine("failed to parse a value, found start character " + self.buffer[self.bufferInd])
 280+
 281+ def skipChar(self, c):
 282+ if self.buffer[self.bufferInd] == c:
 283+ if not self.incrementBufferPtr():
 284+ return False
 285+ return True
 286+ else:
 287+ return False
 288+
 289+ def incrementBufferPtr(self):
 290+ self.bufferInd = self.bufferInd + 1
 291+ if self.bufferInd == len(self.buffer):
 292+ return self.fillBuffer() # this will move the index accordingly
 293+ return True
 294+
 295+ def fillBuffer(self):
 296+ if self.bufferInd == len(self.buffer) and not self.rowsDone:
 297+ # we are out of data in the buffer, and there's more
 298+ # rows to be gotten
 299+
 300+ # fixme this should be a constant someplace configurable
 301+ self.buffer = self.file.read(8192)
 302+ if (self.buffer == ""):
 303+ self.rowsDone = True
 304+ self.eof = True
 305+ return False
 306+ else:
 307+ self.bufferInd = 0
 308+ return True
 309+
 310+ def formatColumn(self, column):
 311+ """for now we do nothing. maybe we want this in the future."""
 312+ return column
 313+
 314+ def writeColumns(self, columns, outFile):
 315+ """takes a list of column values without names.
 316+ must find the names these correspond to, figure out the right
 317+ order (or alternatively maybe we have a map that tells us the order)
 318+ and write the values out in the new order."""
 319+ if columns:
 320+ ind = 0
 321+ for i in self.columnOrder:
 322+ outFile.write(self.formatColumn(columns[i]))
 323+ if ind < len(self.columnOrder)-1:
 324+ outFile.write(self.fieldSeparator)
 325+ ind = ind + 1
 326+ outFile.write('\n')
 327+
 328+def usage(message = None):
 329+ if message:
 330+ print message
 331+ print "Usage: python mysql2txt.py --table=tablename --columns=col1,col2... "
 332+ print " [--values=col1=val1,col2=val2...] [--separator=<string>]"
 333+ print ""
 334+ print "This script reads a table dump in mysql format from stdin and writes"
 335+ print "specified columns from desired rows to stdout, one line per row."
 336+ print ""
 337+ print "--table: the name of the table from which we want to extract values"
 338+ print "--columns: the names of the columns from the table, separated by commas,"
 339+ print " the values of which we want to retrieve, in the order we want"
 340+ print " them to be written on each line of the output"
 341+ print "--values: pairs of column names and values we want the column to have, for"
 342+ print " each row to be printed; in each pair the column name and"
 343+ print " the value are separated by an equals sign, and these pairs are"
 344+ print " separated from each other by commas"
 345+ print "--separator: the string which will be written after each value in a row"
 346+ print " to separate it from the next value, by default a space"
 347+ print ""
 348+ print "Example: zcat elwikidb-20111102-page.sql.gz | python mysql2txt.py --table=page \\"
 349+ print " --columns=page_title,page_id --values=page_namespace=15 --separator=' | '"
 350+ sys.exit(1)
 351+
 352+if __name__ == "__main__":
 353+ tableRequested = None
 354+ columnsRequested = None
 355+ valuesRequestedCols = []
 356+ valuesRequestedVals = []
 357+ fieldSeparator = ' '
 358+
 359+ try:
 360+ (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", ['table=', 'columns=', 'values=', 'separator=' ])
 361+ except:
 362+ usage("Unknown option specified")
 363+
 364+ for (opt, val) in options:
 365+ if opt == "--table":
 366+ tableRequested = val
 367+ elif opt == "--columns":
 368+ if ',' in val:
 369+ columnsRequested = val.split(',')
 370+ else:
 371+ columnsRequested = [ val ]
 372+ elif opt == "--values":
 373+ if ',' in val:
 374+ vlist = val.split(',')
 375+ else:
 376+ vlist = [ val ]
 377+ valuesRequestedCols = [ v.split('=')[0] for v in vlist ]
 378+ valuesRequestedVals = [ v.split('=')[1] for v in vlist ]
 379+ elif opt == "--separator":
 380+ fieldSeparator = val
 381+
 382+ if (len(remainder) > 0):
 383+ usage("Unknown option specified")
 384+
 385+ if (not tableRequested or not columnsRequested):
 386+ usage("Missing required option")
 387+
 388+ m = MysqlFile(sys.stdin, tableRequested, columnsRequested, valuesRequestedCols, valuesRequestedVals, fieldSeparator)
 389+ m.fillBuffer()
 390+
 391+ m.findCreateStatement()
 392+ m.setupColumnRetrieval()
 393+
 394+ if not m.findInsertStatement():
 395+ raise ConverterError("insert statement for requested table not found in file")
 396+ while (not m.eof):
 397+ cols = m.getColumnsFromRow()
 398+ # write them out in the correct order...
 399+ m.writeColumns(cols, sys.stdout)
 400+ if m.rowsDone and not m.eof:
 401+ # could have multiple inserts for the same table
 402+ m.rowsDone = False
 403+ m.findInsertStatement()
 404+
 405+ exit(0);
 406+
Property changes on: branches/ariel/xmldumps-backup/mysql2txt.py
___________________________________________________________________
Added: svn:eol-style
1407 + native

Status & tagging log