r106020 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r106019‎ | r106020 | r106021 >
Date:13:25, 13 December 2011
Author:ariel
Status:deferred
Tags:
Comment:
some scripts we used for looking at thumbs counts, sizes, etc... in case we need 'em again
Modified paths:
  • /branches/ariel/tools (added) (history)
  • /branches/ariel/tools/thumbs (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/README (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/datascripts (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbDateAnalysis.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbFilesSizesCounts.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbPxSize.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/otherscripts (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/otherscripts/checkExistingThumbDirs.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listFileNames.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listThumbFilesByDir.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/otherscripts/removeThumbDirs.py (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/samples (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates-created.sh (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates.sh (added) (history)
  • /branches/ariel/tools/thumbs/crunchinglogs/samples/do-pixel-sizes.sh (added) (history)

Diff [purge]

Index: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbFilesSizesCounts.py
@@ -0,0 +1,73 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+import getopt
 9+
def usage(message=None):
    """Show an optional error message and the help text, then exit.

    message -- short description of what was wrong with the command line.
               When given it is written to stderr ahead of the help text
               (previously the argument was accepted but never shown).

    Never returns: always terminates the process with exit status 1.
    """
    if message:
        sys.stderr.write("%s\n" % message)
    helpLines = [
        "Usage: %s [--sdate=date] [--edate=date] [filename]" % sys.argv[0],
        "sdate: start date for which to print stats, default: earliest date in file ",
        "edate: end date for which to print stats, default: latest date in file",
        "Date format: yyyy-mm-dd",
        "If filename is not specified, reads from stdin",
        "",
        "Format of input file: (sample line)",
        "2011-10-29 01:57:51 100311 Festiwal_Słowian_i_Wikingów_2009_121.jpg/640px-Festiwal_Słowian_i_Wikingów_2009_121.jpg",
        "date in yyyy-mm-dd format, time in hh:mm:ss format, size in bytes, thumb directory/thumb filename",
    ]
    # The help text itself stays on stdout, as before.
    sys.stdout.write("\n".join(helpLines) + "\n")
    sys.exit(1)
 21+
def tallyByDate(lines, sdate=None, edate=None):
    """Add up thumb sizes and thumb counts per date.

    lines -- iterable of input lines, each expected to hold four
             whitespace-separated fields: date, time, size in bytes, path
    sdate -- inclusive lower bound (yyyy-mm-dd string) or None for no bound
    edate -- inclusive upper bound (yyyy-mm-dd string) or None for no bound

    Returns a (sizes, counts) pair of dicts keyed by date string.
    Lines with the wrong number of fields or a non-numeric size are
    reported on stderr and skipped.
    """
    sizes = {}
    counts = {}
    for line in lines:
        try:
            (fDate, fTime, fSize, fName) = line.split()
            numBytes = int(fSize)
        except ValueError:
            sys.stderr.write("skipping badly formatted line:  %s\n" % line.rstrip())
            continue
        # yyyy-mm-dd strings sort chronologically, so plain string
        # comparison is enough for the range check.
        if (sdate and fDate < sdate) or (edate and fDate > edate):
            continue
        sizes[fDate] = sizes.get(fDate, 0) + numBytes
        counts[fDate] = counts.get(fDate, 0) + 1
    return (sizes, counts)


if __name__ == "__main__":
    sdate = None
    edate = None

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['sdate=', 'edate='])
    except getopt.GetoptError:
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--sdate":
            sdate = val
        elif opt == "--edate":
            edate = val

    dateexp = re.compile(r"^\d{4}-\d{2}-\d{2}$")
    for d in filter(None, [sdate, edate]):
        if not dateexp.match(d):
            usage("Bad date format.")

    if len(remainder) > 1:
        usage("Too many arguments.")
    if remainder:
        fHandle = open(remainder[0], "r")
    else:
        fHandle = sys.stdin

    try:
        (sizes, counts) = tallyByDate(fHandle, sdate, edate)
    finally:
        # only close what we opened ourselves
        if fHandle is not sys.stdin:
            fHandle.close()

    for d in sorted(sizes):
        sys.stdout.write("Date: %s Bytes: %s Files: %s\n" % (d, sizes[d], counts[d]))
    sys.exit(0)
 74+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbFilesSizesCounts.py
___________________________________________________________________
Added: svn:eol-style
175 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbDateAnalysis.py
@@ -0,0 +1,111 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+import getopt
 9+
def usage(message=None):
    """Show an optional error message and the help text, then exit.

    message -- short description of what was wrong with the command line.
               When given it is written to stderr ahead of the help text
               (previously the argument was accepted but never shown).

    Never returns: always terminates the process with exit status 1.
    """
    if message:
        sys.stderr.write("%s\n" % message)
    helpLines = [
        "Usage: %s [--sdate=date] [--edate=date] [--created] [filename]" % sys.argv[0],
        "sdate: start date for which to print stats, default: earliest date in file ",
        "edate: end date for which to print stats, default: latest date in file",
        "created: show only the number of files and sizes on the date the first thumb",
        "was created (presumably the date the image itself was first uploaded)",
        "",
        "Date format for sdate and edate: yyyy-mm-dd",
        "",
        "If no filename is specified, input is read from stdin",
        "",
        "Format of input file: (sample line)",
        "2011-10-29 01:57:51 100311 Festiwal_Słowian_i_Wikingów_2009_121.jpg/640px-Festiwal_Słowian_i_Wikingów_2009_121.jpg",
        "date in yyyy-mm-dd format, time in hh:mm:ss format, size in bytes, thumb directory/thumb filename",
    ]
    # The help text itself stays on stdout, as before.
    sys.stdout.write("\n".join(helpLines) + "\n")
    sys.exit(1)
 25+
def dirStatLines(dirName, fileCounts, byteCounts, created=False):
    """Format the per-date stats collected for one thumb directory.

    dirName    -- name of the thumb directory the stats belong to
    fileCounts -- dict: date string -> number of thumbs with that date
    byteCounts -- dict: date string -> total bytes of those thumbs
    created    -- if True, report only the earliest date

    Returns a list of output lines (no trailing newline), sorted by date;
    empty when nothing was collected for the directory.
    """
    dateStrings = sorted(fileCounts)
    if created:
        # slicing (rather than [0]) keeps an empty directory from blowing up
        dateStrings = dateStrings[:1]
    return ["Date: %s FilesThisDate: %s ByteCountThisDate: %s Dir:  %s"
            % (d, fileCounts[d], byteCounts[d], dirName) for d in dateStrings]


def analyzeThumbDates(lines, sdate=None, edate=None, created=False):
    """Yield per-directory, per-date thumb counts and byte totals.

    lines   -- iterable of input lines (date, time, size, dir/filename);
               lines for one directory are expected to be adjacent
    sdate   -- inclusive lower date bound (yyyy-mm-dd) or None
    edate   -- inclusive upper date bound (yyyy-mm-dd) or None
    created -- if True, report only the earliest in-range date per directory

    Lines that do not have four fields, a numeric size and a path of the
    form dir/file are reported on stderr and skipped.
    """
    lastDirName = None
    fileCounts = {}
    byteCounts = {}
    for line in lines:
        try:
            (fDate, fTime, fSize, path) = line.split()
            (dirName, fName) = path.split('/')
            numBytes = int(fSize)
        except ValueError:
            sys.stderr.write("skipping badly formatted line:  %s\n" % line.rstrip())
            continue
        if lastDirName is not None and dirName != lastDirName:
            # Moving on to a new directory: report the finished one.  This
            # must not depend on the date of the line that triggered the
            # switch, which belongs to a different directory.
            for out in dirStatLines(lastDirName, fileCounts, byteCounts, created):
                yield out
            fileCounts = {}
            byteCounts = {}
        lastDirName = dirName
        if (sdate and fDate < sdate) or (edate and fDate > edate):
            continue
        fileCounts[fDate] = fileCounts.get(fDate, 0) + 1
        byteCounts[fDate] = byteCounts.get(fDate, 0) + numBytes
    # stats for the final directory, if there was any input at all
    if lastDirName is not None:
        for out in dirStatLines(lastDirName, fileCounts, byteCounts, created):
            yield out


if __name__ == "__main__":
    sdate = None
    edate = None
    created = False
    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['sdate=', 'edate=', 'created'])
    except getopt.GetoptError:
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--sdate":
            sdate = val
        elif opt == "--edate":
            edate = val
        elif opt == "--created":
            created = True

    dateexp = re.compile(r"^\d{4}-\d{2}-\d{2}$")
    for d in filter(None, [sdate, edate]):
        if not dateexp.match(d):
            usage("Bad date format.")

    if len(remainder) > 1:
        usage("Too many arguments.")
    if remainder:
        fHandle = open(remainder[0], "r")
    else:
        fHandle = sys.stdin

    try:
        for out in analyzeThumbDates(fHandle, sdate, edate, created):
            sys.stdout.write(out + "\n")
    finally:
        # only close what we opened ourselves
        if fHandle is not sys.stdin:
            fHandle.close()
    sys.exit(0)
 112+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbDateAnalysis.py
___________________________________________________________________
Added: svn:eol-style
1113 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbPxSize.py
@@ -0,0 +1,107 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+import getopt
 9+
 10+
def usage(message=None):
    """Show an optional error message and the help text, then exit.

    message -- short description of what was wrong with the command line.
               When given it is written to stderr ahead of the help text
               (previously the argument was accepted but never shown).

    Never returns: always terminates the process with exit status 1.
    """
    if message:
        sys.stderr.write("%s\n" % message)
    helpLines = [
        "Usage: %s [--sdate=date] [--edate=date] [filename]" % sys.argv[0],
        "sdate: start date for which to print stats, default: earliest date in file ",
        "edate: end date for which to print stats, default: latest date in file",
        "Date format: yyyy-mm-dd",
        "If filename is not specified, reads from stdin",
        "",
        "Format of input file: (sample line)",
        "2011-10-29 01:57:51 100311 Festiwal_Słowian_i_Wikingów_2009_121.jpg/640px-Festiwal_Słowian_i_Wikingów_2009_121.jpg",
        "date in yyyy-mm-dd format, time in hh:mm:ss format, size in bytes, thumb directory/thumb filename",
    ]
    # The help text itself stays on stdout, as before.
    sys.stdout.write("\n".join(helpLines) + "\n")
    sys.exit(1)
 22+
def pixelSizeLines(dirName, fileCounts, sizes):
    """Format the per-date pixel-size stats collected for one thumb directory.

    dirName    -- name of the thumb directory the stats belong to
    fileCounts -- dict: date string -> number of thumbs with that date
    sizes      -- dict: date string -> {pixel size string -> thumb count}

    Returns a list of output lines (no trailing newline), sorted by date;
    empty when nothing was collected for the directory.
    """
    out = []
    for d in sorted(fileCounts):
        parts = ["Date:", d, "ThumbsForFileThisDate:", str(fileCounts[d]), "PixelSizes:"]
        parts.extend("%s:%s" % (k, sizes[d][k]) for k in sizes[d])
        parts.extend(["Dir:", dirName])
        out.append(" ".join(parts))
    return out


def countPixelSizes(lines, sdate=None, edate=None):
    """Yield per-directory, per-date counts of thumbs by pixel size.

    lines -- iterable of input lines (date, time, size, dir/filename);
             lines for one directory are expected to be adjacent
    sdate -- inclusive lower date bound (yyyy-mm-dd) or None
    edate -- inclusive upper date bound (yyyy-mm-dd) or None

    The pixel size is whatever precedes the first 'px-' in the thumb
    filename.  Lines without four fields are reported on stderr; paths that
    are not exactly dir/file and filenames without 'px-' are skipped
    silently.
    """
    lastDirName = None
    fileCounts = {}
    sizes = {}
    for line in lines:
        try:
            (fDate, fTime, fSize, path) = line.split()
        except ValueError:
            sys.stderr.write("skipping badly formatted line:  %s\n" % line.rstrip())
            continue
        try:
            (dirName, fName) = path.split('/')
        except ValueError:
            continue
        if lastDirName is not None and dirName != lastDirName:
            # Moving on to a new directory: report the finished one.  This
            # must not depend on the date of the line that triggered the
            # switch, which belongs to a different directory.
            for out in pixelSizeLines(lastDirName, fileCounts, sizes):
                yield out
            fileCounts = {}
            sizes = {}
        lastDirName = dirName
        if (sdate and fDate < sdate) or (edate and fDate > edate):
            continue
        if 'px-' not in fName:
            continue
        pixelSize = fName.split('px-', 1)[0]
        perDate = sizes.setdefault(fDate, {})
        perDate[pixelSize] = perDate.get(pixelSize, 0) + 1
        fileCounts[fDate] = fileCounts.get(fDate, 0) + 1
    # stats for the final directory, if there was any input at all
    if lastDirName is not None:
        for out in pixelSizeLines(lastDirName, fileCounts, sizes):
            yield out


if __name__ == "__main__":
    sdate = None
    edate = None

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['sdate=', 'edate='])
    except getopt.GetoptError:
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--sdate":
            sdate = val
        elif opt == "--edate":
            edate = val

    dateexp = re.compile(r"^\d{4}-\d{2}-\d{2}$")
    for d in filter(None, [sdate, edate]):
        if not dateexp.match(d):
            usage("Bad date format.")

    if len(remainder) > 1:
        usage("Too many arguments.")
    if remainder:
        fHandle = open(remainder[0], "r")
    else:
        fHandle = sys.stdin

    try:
        for out in countPixelSizes(fHandle, sdate, edate):
            sys.stdout.write(out + "\n")
    finally:
        # only close what we opened ourselves
        if fHandle is not sys.stdin:
            fHandle.close()
    sys.exit(0)
 108+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbPxSize.py
___________________________________________________________________
Added: svn:eol-style
1109 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates.sh
@@ -0,0 +1,37 @@
 2+#!/bin/bash
 3+
# Creates one output file per month (aug-dateanalysis.txt, sept-..., etc.).
# Each line of an output file is a date followed by a comma-separated list
# of "<number of thumb dirs> <thumbs created that date>" pairs, i.e. how
# many thumb dirs had 1 thumb file created on that date, how many had 2,
# how many had 3, and so on.

python thumbDateAnalysis.py 0-00-files.txt.nobad > dateanalysis.txt

# domonth: write ${outfileprefix}-dateanalysis.txt for one month.
# Reads the globals: dates (day numbers), outfileprefix, ymstring (yyyy-mm).
domonth() {
    local outfile="${outfileprefix}-dateanalysis.txt"
    local d ymdstring summary
    # start from an empty file; unlike rm this does not complain on first run
    : > "$outfile"
    for d in $dates; do
        ymdstring="${ymstring}-$d"
        # field 4 of each dateanalysis.txt line is the FilesThisDate value
        summary=$(grep " $ymdstring " dateanalysis.txt | awk '{ print $4 }' | sort | uniq -c | sort -n -k2,2 | sed -e ':a;N;$!ba;s/ \+/ /g; s/\n/,/g')
        # always end the line, even when nothing matched this date; otherwise
        # the next date would be appended to the same line
        echo "$ymdstring $summary" >> "$outfile"
    done
}

dates30="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30"
dates31="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
# the sample data stops partway through December
datesdec="01 02 03 04 05 06 07"

dates="$dates31" ; outfileprefix=aug ; ymstring="2011-08"
domonth

dates="$dates30" ; outfileprefix=sept ; ymstring="2011-09"
domonth

dates="$dates31" ; outfileprefix=oct ; ymstring="2011-10"
domonth

dates="$dates30" ; outfileprefix=nov ; ymstring="2011-11"
domonth

dates="$datesdec" ; outfileprefix=dec ; ymstring="2011-12"
domonth
 38+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates.sh
___________________________________________________________________
Added: svn:eol-style
139 + native
Added: svn:executable
240 + *
Index: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates-created.sh
@@ -0,0 +1,41 @@
 2+#!/bin/bash
 3+
# Like do-dateanal-dates.sh: creates one output file per month, each line
# being a date followed by a comma-separated list of
# "<number of thumb dirs> <thumbs created that date>" pairs.
# The difference is that only the thumbs made at the time the dir was
# created are counted (i.e. the earliest dated files in the dir). These
# would presumably be created as the result of an image upload.

python thumbDateAnalysis.py --created 0-00-files.txt.nobad > datecreated.txt

# domonth: write ${outfileprefix}-datecreated.txt for one month.
# Reads the globals: dates (day numbers), outfileprefix, ymstring (yyyy-mm).
domonth() {
    local outfile="${outfileprefix}-datecreated.txt"
    local d ymdstring summary
    # start from an empty file; unlike rm this does not complain on first run
    : > "$outfile"
    for d in $dates; do
        ymdstring="${ymstring}-$d"
        # field 4 of each datecreated.txt line is the FilesThisDate value
        summary=$(grep " $ymdstring " datecreated.txt | awk '{ print $4 }' | sort | uniq -c | sort -n -k2,2 | sed -e ':a;N;$!ba;s/ \+/ /g; s/\n/,/g')
        # always end the line, even when nothing matched this date; otherwise
        # the next date would be appended to the same line
        echo "$ymdstring $summary" >> "$outfile"
    done
}

dates30="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30"
dates31="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
# the sample data stops partway through December
datesdec="01 02 03 04 05 06 07"

dates="$dates31" ; outfileprefix=aug ; ymstring="2011-08"
domonth

dates="$dates30" ; outfileprefix=sept ; ymstring="2011-09"
domonth

dates="$dates31" ; outfileprefix=oct ; ymstring="2011-10"
domonth

dates="$dates30" ; outfileprefix=nov ; ymstring="2011-11"
domonth

dates="$datesdec" ; outfileprefix=dec ; ymstring="2011-12"
domonth
 42+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates-created.sh
___________________________________________________________________
Added: svn:eol-style
143 + native
Added: svn:executable
244 + *
Index: branches/ariel/tools/thumbs/crunchinglogs/samples/do-pixel-sizes.sh
@@ -0,0 +1,43 @@
 2+#!/bin/bash
 3+
# Creates one output file per month (aug-pixelsizes.txt, sept-..., etc.).
# Each line is a date followed, for each of a fixed list of sizes
# (320px, 640px, 1024px, 1280px), by the size and the number of
# pixelsizes.txt records for that date that mention the size.

python thumbPxSize.py 0-00-files.txt.nobad > pixelsizes.txt

# domonth: write ${outfileprefix}-pixelsizes.txt for one month.
# Reads the globals: dates (day numbers), sizes, outfileprefix,
# ymstring (yyyy-mm).
domonth() {
    local outfile="${outfileprefix}-pixelsizes.txt"
    local d size ymdstring count
    # start from an empty file; unlike rm this does not complain on first run
    : > "$outfile"
    for d in $dates; do
        ymdstring="${ymstring}-$d"
        printf '%s ' "$ymdstring" >> "$outfile"
        for size in $sizes; do
            # " <size>:" matches the "size:count" entries written by
            # thumbPxSize.py; grep -c counts the matching lines directly
            count=$(grep " $ymdstring " pixelsizes.txt | grep -c " ${size}:")
            printf '%d %3d ' "$size" "$count" >> "$outfile"
        done
        echo >> "$outfile"
    done
}

dates30="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30"
dates31="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
# the sample data stops partway through December
datesdec="01 02 03 04 05 06 07"
sizes="320 640 1024 1280"

dates="$dates31" ; outfileprefix=aug ; ymstring="2011-08"
domonth

dates="$dates30" ; outfileprefix=sept ; ymstring="2011-09"
domonth

dates="$dates31" ; outfileprefix=oct ; ymstring="2011-10"
domonth

dates="$dates30" ; outfileprefix=nov ; ymstring="2011-11"
domonth

dates="$datesdec" ; outfileprefix=dec ; ymstring="2011-12"
domonth
 44+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/samples/do-pixel-sizes.sh
___________________________________________________________________
Added: svn:eol-style
145 + native
Added: svn:executable
246 + *
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listThumbFilesByDir.py
@@ -0,0 +1,28 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+
 9+# given a list of thumb dirs, list the files in
 10+# each dir, not sorted in any fashion, for input
 11+# to other scripts
 12+
def listFiles(dirName):
    """Print one line per regular file found directly in dirName.

    dirName -- path of the directory to list (not descended into)

    Each line has the file's modification time in UTC
    (yyyy-mm-dd hh:mm:ss), its size in bytes and its path
    (dirName/filename), separated by three spaces.  Entries that are not
    regular files, or that disappear while being examined, are skipped.
    """
    import stat as statModule

    for f in os.listdir(dirName):
        fName = dirName + "/" + f
        # One stat call per entry instead of isfile() followed by stat():
        # half the filesystem traffic, and no crash if the file is removed
        # in between.
        try:
            info = os.stat(fName)
        except OSError:
            continue
        if not statModule.S_ISREG(info.st_mode):
            continue
        fileDate = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(info.st_mtime))
        sys.stdout.write("%s   %s   %s\n" % (fileDate, info.st_size, fName))
 21+
if __name__ == "__main__":
    # stdin supplies one thumb directory per line; anything that is not an
    # existing directory is silently ignored
    for rawLine in sys.stdin:
        candidate = rawLine.rstrip()
        if not os.path.isdir(candidate):
            continue
        listFiles(candidate)
    sys.exit(0)
 29+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listThumbFilesByDir.py
___________________________________________________________________
Added: svn:eol-style
130 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/checkExistingThumbDirs.py
@@ -0,0 +1,22 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+
 9+# after a cleanup of some dirs, run this on the list of dirs to check
 10+# how many of them have been recreated in the meantime
 11+
def countExistingDirs(lines, progressEvery=1000):
    """Count how many of the listed paths are existing directories.

    lines         -- iterable of lines, one directory path per line
    progressEvery -- print a progress line after every this many paths

    Returns a (count, numDirs) pair: paths examined, and how many of them
    exist as directories.
    """
    count = 0
    numDirs = 0
    for line in lines:
        dname = line.rstrip()
        if os.path.isdir(dname):
            numDirs += 1
        count += 1
        if count % progressEvery == 0:
            sys.stdout.write("count: %s reached dir: %s existing dirs: %s\n"
                             % (count, dname, numDirs))
    return (count, numDirs)


if __name__ == "__main__":
    (count, numDirs) = countExistingDirs(sys.stdin)
    # Report the final totals as well; otherwise the result is lost
    # whenever the list length is not a multiple of the progress interval.
    sys.stdout.write("total checked: %s existing dirs: %s\n" % (count, numDirs))
    sys.exit(0)
 23+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/checkExistingThumbDirs.py
___________________________________________________________________
Added: svn:eol-style
124 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listFileNames.py
@@ -0,0 +1,30 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+import hashlib
 9+
 10+# convert image filenames (with _) to
 11+# full path with hash, for use by other scripts
 12+
def getHashPathForLevel( name, levels ):
    """Return the hashed directory prefix for an image filename.

    name   -- image filename (with underscores, not spaces); a text
              string is hashed as its UTF-8 encoding
    levels -- number of hash directory levels wanted

    The prefix is built from the hex md5 digest of the name: level i
    contributes the first i hex digits followed by '/', so two levels
    give 'a/ab/'.  Returns '' when levels is 0.
    """
    if not isinstance( name, bytes ):
        # md5 works on bytes; hashing a text string directly fails (or, on
        # Python 2, breaks on non-ASCII names)
        name = name.encode( 'utf-8' )
    md5Hash = hashlib.md5( name ).hexdigest()
    return ''.join( md5Hash[0:i] + '/' for i in range( 1, levels + 1 ) )
 24+
if __name__ == "__main__":
    # each input line is an image filename; emit the full path of its
    # thumb directory under the two-level hashed layout
    thumbRoot = "/export/thumbs/wikipedia/commons/thumb/"
    for rawLine in sys.stdin:
        imageName = rawLine.rstrip()
        sys.stdout.write(thumbRoot + getHashPathForLevel(imageName, 2) + imageName + "\n")
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listFileNames.py
___________________________________________________________________
Added: svn:eol-style
132 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/removeThumbDirs.py
@@ -0,0 +1,29 @@
 2+# -*- coding: utf-8 -*-
 3+
 4+import os
 5+import re
 6+import sys
 7+import time
 8+
 9+# given a list of thumb dirs for cleanup,
 10+# toss all files in them and also the dirs themselves
 11+
def rmDirAndFiles(dirName):
    """Delete the regular files directly inside dirName, then dirName itself.

    dirName -- path of the directory to remove

    Only entries that os.path.isfile() accepts are deleted; the final
    os.rmdir() is attempted regardless and raises OSError if anything is
    left in the directory.
    """
    entryPaths = [dirName + "/" + entry for entry in os.listdir(dirName)]
    for entryPath in entryPaths:
        if not os.path.isfile(entryPath):
            continue
        os.remove(entryPath)
    os.rmdir(dirName)
 17+
if __name__ == "__main__":
    # stdin supplies one thumb directory per line; entries that are not
    # existing directories are ignored
    removed = 0
    for rawLine in sys.stdin:
        target = rawLine.rstrip()
        if not os.path.isdir(target):
            continue
        rmDirAndFiles(target)
        removed += 1
        # progress report every 1000 removals
        if removed % 1000 == 0:
            sys.stdout.write("count  %s removed dir %s\n" % (removed, target))
        # pause after every 100 removals
        if removed % 100 == 0:
            time.sleep(5)
    sys.exit(0)
 30+
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/removeThumbDirs.py
___________________________________________________________________
Added: svn:eol-style
131 + native
Index: branches/ariel/tools/thumbs/crunchinglogs/README
@@ -0,0 +1,16 @@
 2+These scripts were written so we could get some notion of what was going on with thumbs, given that
 3+we don't keep logs and the host is i/o-bound so we can't just do a pile of finds.
 4+
 5+I can't imagine they will be useful to someone else but they might be useful to us sometime, who knows
 6+
 7+stats on the thumbs files on the filesystem:
 8+* go to (for example) commons/thumb/0/00, run an ls --sort=none, capture results into some file
 9+* pipe that file into python listThumbFilesByDir.py (it reads the directory names from stdin) and save its output to another file
 10+* filter it as needed for crap names, results into 0-00-files.txt.nobad
 11+* now you can run the following: do-dateanal-dates-created.sh do-dateanal-dates.sh do-pixel-sizes.sh
 12+ they will create a little pile of files aug*txt sept*txt etc.
 13+
 14+stats on googlebot requests:
 15+* go to locke, zcat sample*log*gz | grep "Googlebot-Image" and gzip the output into googlebot-image-requests.gz
 16+* now you can run check-all-dates.sh and it produces a small pile of output files
 17+

Status & tagging log