Index: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbFilesSizesCounts.py |
— | — | @@ -0,0 +1,73 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | +import getopt |
| 9 | + |
| 10 | +def usage(message=None):  # print the optional error message and the usage text, then exit with status 1
| 11 | +    if message: print >> sys.stderr, message  # say why usage is being shown (previously dropped)
| 12 | +    print "Usage: %s [--sdate=date] [--edate=date] [filename]" % sys.argv[0]
| 13 | +    print "sdate: start date for which to print stats, default: earliest date in file "
| 14 | +    print "edate: end date for which to print stats, default: latest date in file"
| 15 | +    print "Date format: yyyy-mm-dd"
| 16 | +    print "If filename is not specified, reads from stdin\n"  # the \n yields the blank separator line
| 17 | +    print "Format of input file: (sample line)"
| 18 | +    print "2011-10-29 01:57:51 100311 Festiwal_Słowian_i_Wikingów_2009_121.jpg/640px-Festiwal_Słowian_i_Wikingów_2009_121.jpg"
| 19 | +    print "date in yyyy-mm-dd format, time in hh:mm:ss format, size in bytes, thumb directory/thumb filename"
| 20 | +    sys.exit(1)
| 21 | + |
| 22 | +if __name__ == "__main__":  # sum thumb file sizes and counts per date, optionally limited to sdate..edate
| 23 | +    sdate = None
| 24 | +    edate = None
| 25 | +
| 26 | +    try:
| 27 | +        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
| 28 | +                                                 [ 'sdate=', 'edate=', ])
| 29 | +    except getopt.GetoptError:
| 30 | +        usage("Unknown option specified")
| 31 | +
| 32 | +    for (opt, val) in options:
| 33 | +        if opt == "--sdate":
| 34 | +            sdate = val
| 35 | +        elif opt == "--edate":
| 36 | +            edate = val
| 37 | +
| 38 | +    dateexp = re.compile(r"^\d{4}-\d{2}-\d{2}$")
| 39 | +    for d in filter(None, [ sdate, edate ]):
| 40 | +        if not dateexp.match(d):
| 41 | +            usage("Bad date format.")
| 42 | +
| 43 | +    if len(remainder) == 1:
| 44 | +        inputFile = remainder[0]
| 45 | +        fHandle = open(inputFile,"r")
| 46 | +    elif len(remainder) == 0:
| 47 | +        fHandle = sys.stdin
| 48 | +    else:
| 49 | +        usage("Too many arguments.")
| 50 | +
| 51 | +    # per-date accumulators, keyed by the yyyy-mm-dd string taken from the input
| 52 | +    sizes = {}
| 53 | +    counts = {}
| 54 | +    for line in fHandle:
| 55 | +        try:
| 56 | +            ( fDate, fTime, fSize, fName ) = line.rstrip().split()
| 57 | +            fSize = int(fSize)  # inside the try so a non-numeric size skips the line instead of aborting
| 58 | +        except ValueError:
| 59 | +            print >> sys.stderr, "skipping badly formatted line: ", line.rstrip()
| 60 | +            continue
| 61 | +        # yyyy-mm-dd strings compare in date order, so no date parsing is needed
| 62 | +        if (sdate and fDate < sdate) or (edate and fDate > edate):
| 63 | +            continue
| 64 | +        sizes[fDate] = sizes.get(fDate, 0) + fSize
| 65 | +        counts[fDate] = counts.get(fDate, 0) + 1
| 66 | +
| 67 | +    if fHandle is not sys.stdin:
| 68 | +        fHandle.close()
| 69 | +    dates = sizes.keys()
| 70 | +    dates.sort()
| 71 | +    for d in dates:
| 72 | +        print "Date:", d, "Bytes:", sizes[d], "Files:", counts[d]
| 73 | +    sys.exit(0)
| 74 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbFilesSizesCounts.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 75 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbDateAnalysis.py |
— | — | @@ -0,0 +1,111 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | +import getopt |
| 9 | + |
| 10 | +def usage(message=None):  # print the optional error message and the usage text, then exit with status 1
| 11 | +    if message: print >> sys.stderr, message  # say why usage is being shown (previously dropped)
| 12 | +    print "Usage: %s [--sdate=date] [--edate=date] [--created] [filename]" % sys.argv[0]
| 13 | +    print "sdate: start date for which to print stats, default: earliest date in file "
| 14 | +    print "edate: end date for which to print stats, default: latest date in file"
| 15 | +    print "created: show only the number of files and sizes on the date the first thumb"
| 16 | +    print "was created (presumably the date the image itself was first uploaded)"
| 17 | +    print ""
| 18 | +    print "Date format for sdate and edate: yyyy-mm-dd\n"  # the \n yields the blank separator line
| 19 | +    print "If no filename is specified, input is read from stdin"
| 20 | +    print
| 21 | +    print "Format of input file: (sample line)"
| 22 | +    print "2011-10-29 01:57:51 100311 Festiwal_Słowian_i_Wikingów_2009_121.jpg/640px-Festiwal_Słowian_i_Wikingów_2009_121.jpg"
| 23 | +    print "date in yyyy-mm-dd format, time in hh:mm:ss format, size in bytes, thumb directory/thumb filename"
| 24 | +    sys.exit(1)
| 25 | + |
| 26 | +if __name__ == "__main__":
| 27 | +    # Input: lines of "date time size dir/file", grouped by thumb directory
| 28 | +    # (all lines for one directory are adjacent).
| 29 | +    # Output: for each directory, one line per date with the number of
| 30 | +    # files and bytes seen for that date; with --created, only the
| 31 | +    # earliest date per directory.  --sdate/--edate (inclusive) bound the
| 32 | +    # dates that are counted.
| 33 | +
| 34 | +    sdate = None
| 35 | +    edate = None
| 36 | +    created = False
| 37 | +    try:
| 38 | +        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
| 39 | +                                                 [ 'sdate=', 'edate=', 'created' ])
| 40 | +    except getopt.GetoptError:
| 41 | +        usage("Unknown option specified")
| 42 | +
| 43 | +    for (opt, val) in options:
| 44 | +        if opt == "--sdate":
| 45 | +            sdate = val
| 46 | +        elif opt == "--edate":
| 47 | +            edate = val
| 48 | +        elif opt == "--created":
| 49 | +            created = True
| 50 | +
| 51 | +    dateexp = re.compile(r"^\d{4}-\d{2}-\d{2}$")
| 52 | +    for d in filter(None, [ sdate, edate ]):
| 53 | +        if not dateexp.match(d):
| 54 | +            usage("Bad date format.")
| 55 | +
| 56 | +    if len(remainder) == 1:
| 57 | +        inputFile = remainder[0]
| 58 | +        fHandle = open(inputFile,"r")
| 59 | +    elif len(remainder) == 0:
| 60 | +        fHandle = sys.stdin
| 61 | +    else:
| 62 | +        usage("Too many arguments.")
| 63 | +
| 64 | +    def printDirStats(dirName, fileCounts, byteCounts):
| 65 | +        # Print the per-date lines for one finished directory.  The counts
| 66 | +        # only ever contain dates inside the requested range, so no date
| 67 | +        # test is needed here, and a directory with nothing in range
| 68 | +        # simply prints nothing (also with --created).
| 69 | +        dateStrings = sorted(fileCounts)
| 70 | +        if created:
| 71 | +            dateStrings = dateStrings[:1]
| 72 | +        for d in dateStrings:
| 73 | +            print "Date:", d, "FilesThisDate:", fileCounts[d], "ByteCountThisDate:", byteCounts[d], "Dir: ", dirName
| 74 | +
| 75 | +    lastDirName = None
| 76 | +    fileCounts = {}
| 77 | +    byteCounts = {}
| 78 | +    for line in fHandle:
| 79 | +        try:
| 80 | +            ( fDate, fTime, fSize, path ) = line.rstrip().split()
| 81 | +            ( dirName, fName ) = path.split('/',1)
| 82 | +            fSize = int(fSize)
| 83 | +        except ValueError:
| 84 | +            print >> sys.stderr, "skipping badly formatted line: ", line.rstrip()
| 85 | +            continue
| 86 | +        # a new directory name means the previous directory is complete:
| 87 | +        # report it and start counting afresh
| 88 | +        if lastDirName is not None and dirName != lastDirName:
| 89 | +            printDirStats(lastDirName, fileCounts, byteCounts)
| 90 | +            fileCounts = {}
| 91 | +            byteCounts = {}
| 92 | +        lastDirName = dirName
| 93 | +        # add to the stats; yyyy-mm-dd strings compare in date order
| 94 | +        if sdate and fDate < sdate:
| 95 | +            continue
| 96 | +        if edate and fDate > edate:
| 97 | +            continue
| 98 | +        if fDate not in fileCounts:
| 99 | +            fileCounts[fDate] = 0
| 100 | +            byteCounts[fDate] = 0
| 101 | +        fileCounts[fDate] = fileCounts[fDate] + 1
| 102 | +        byteCounts[fDate] = byteCounts[fDate] + fSize
| 103 | +
| 104 | +    # print stats for final dir (skipped when there was no usable input)
| 105 | +    if lastDirName is not None:
| 106 | +        printDirStats(lastDirName, fileCounts, byteCounts)
| 107 | +
| 108 | +    # a named input file was opened above, so close it; stdin is left alone
| 109 | +    if fHandle is not sys.stdin:
| 110 | +        fHandle.close()
| 111 | +    sys.exit(0)
| 112 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbDateAnalysis.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 113 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbPxSize.py |
— | — | @@ -0,0 +1,107 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | +import getopt |
| 9 | + |
| 10 | + |
| 11 | +def usage(message=None):  # print the optional error message and the usage text, then exit with status 1
| 12 | +    if message: print >> sys.stderr, message  # say why usage is being shown (previously dropped)
| 13 | +    print "Usage: %s [--sdate=date] [--edate=date] [filename]" % sys.argv[0]
| 14 | +    print "sdate: start date for which to print stats, default: earliest date in file "
| 15 | +    print "edate: end date for which to print stats, default: latest date in file"
| 16 | +    print "Date format: yyyy-mm-dd"
| 17 | +    print "If filename is not specified, reads from stdin\n"  # the \n yields the blank separator line
| 18 | +    print "Format of input file: (sample line)"
| 19 | +    print "2011-10-29 01:57:51 100311 Festiwal_Słowian_i_Wikingów_2009_121.jpg/640px-Festiwal_Słowian_i_Wikingów_2009_121.jpg"
| 20 | +    print "date in yyyy-mm-dd format, time in hh:mm:ss format, size in bytes, thumb directory/thumb filename"
| 21 | +    sys.exit(1)
| 22 | + |
| 23 | +if __name__ == "__main__":
| 24 | +    # Input: lines of "date time size dir/file", grouped by thumb directory.
| 25 | +    # Output: for each directory, one line per date giving the number of
| 26 | +    # thumbs counted and how many there were of each pixel size (the
| 27 | +    # "<N>px-" prefix of the thumb file name).  --sdate/--edate (inclusive)
| 28 | +    # bound the dates that are counted.
| 29 | +
| 30 | +    sdate = None
| 31 | +    edate = None
| 32 | +    try:
| 33 | +        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
| 34 | +                                                 [ 'sdate=', 'edate=', ])
| 35 | +    except getopt.GetoptError:
| 36 | +        usage("Unknown option specified")
| 37 | +
| 38 | +    for (opt, val) in options:
| 39 | +        if opt == "--sdate":
| 40 | +            sdate = val
| 41 | +        elif opt == "--edate":
| 42 | +            edate = val
| 43 | +
| 44 | +    dateexp = re.compile(r"^\d{4}-\d{2}-\d{2}$")
| 45 | +    for d in filter(None, [ sdate, edate ]):
| 46 | +        if not dateexp.match(d):
| 47 | +            usage("Bad date format.")
| 48 | +
| 49 | +    if len(remainder) == 1:
| 50 | +        inputFile = remainder[0]
| 51 | +        fHandle = open(inputFile,"r")
| 52 | +    elif len(remainder) == 0:
| 53 | +        fHandle = sys.stdin
| 54 | +    else:
| 55 | +        usage("Too many arguments.")
| 56 | +
| 57 | +    def printDirStats(dirName, fileCounts, sizes):
| 58 | +        # Print the per-date lines for one finished directory.  The counts
| 59 | +        # hold only in-range dates, so no date test is needed here.  Pixel
| 60 | +        # sizes are sorted (as strings) only to make the output stable.
| 61 | +        for d in sorted(fileCounts):
| 62 | +            print "Date:", d, "ThumbsForFileThisDate:", fileCounts[d], "PixelSizes:",
| 63 | +            for k in sorted(sizes[d]):
| 64 | +                print "%s:%s" % (k, sizes[d][k]),
| 65 | +            print "Dir:", dirName
| 66 | +
| 67 | +    lastDirName = None
| 68 | +    fileCounts = {}
| 69 | +    sizes = {}
| 70 | +    for line in fHandle:
| 71 | +        try:
| 72 | +            ( fDate, fTime, fSize, path ) = line.rstrip().split()
| 73 | +        except ValueError:
| 74 | +            print >> sys.stderr, "skipping badly formatted line: ", line.rstrip()
| 75 | +            continue
| 76 | +        try:
| 77 | +            ( dirName, fName ) = path.split('/',2)
| 78 | +        except ValueError:
| 79 | +            continue  # path is not exactly "dir/file": skipped without a message, as before
| 80 | +        # a new directory name means the previous directory is complete:
| 81 | +        # report it and start counting afresh
| 82 | +        if lastDirName is not None and dirName != lastDirName:
| 83 | +            printDirStats(lastDirName, fileCounts, sizes)
| 84 | +            fileCounts = {}
| 85 | +            sizes = {}
| 86 | +        lastDirName = dirName
| 87 | +        # add to the stats; yyyy-mm-dd strings compare in date order
| 88 | +        if sdate and fDate < sdate:
| 89 | +            continue
| 90 | +        if edate and fDate > edate:
| 91 | +            continue
| 92 | +        try:
| 93 | +            ( pixelSize, junk ) = fName.split('px-',1)
| 94 | +        except ValueError:
| 95 | +            continue  # file name has no "<N>px-" prefix, so it is not counted
| 96 | +        perSize = sizes.setdefault(fDate, {})
| 97 | +        perSize[pixelSize] = perSize.get(pixelSize, 0) + 1
| 98 | +        fileCounts[fDate] = fileCounts.get(fDate, 0) + 1
| 99 | +
| 100 | +    # print stats for final dir (skipped when there was no usable input)
| 101 | +    if lastDirName is not None:
| 102 | +        printDirStats(lastDirName, fileCounts, sizes)
| 103 | +
| 104 | +    # a named input file was opened above, so close it; stdin is left alone
| 105 | +    if fHandle is not sys.stdin:
| 106 | +        fHandle.close()
| 107 | +    sys.exit(0)
| 108 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/datascripts/thumbPxSize.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 109 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates.sh |
— | — | @@ -0,0 +1,37 @@ |
| 2 | +#!/bin/bash
| 3 | +
| 4 | +# creates output files each of which has a list by date of
| 5 | +# the number of thumb dirs with 1 thumb file created on that date,
| 6 | +# with 2 thumb files created on that date, with 3, etc.
| 7 | +
| 8 | +python thumbDateAnalysis.py 0-00-files.txt.nobad > dateanalysis.txt
| 9 | +
| 10 | +domonth() {  # writes one line per day of a month; reads globals dates, outfileprefix, ymstring
| 11 | +    outfile=${outfileprefix}-dateanalysis.txt
| 12 | +    rm -f "$outfile"  # -f: no error on the first run, when the file does not exist yet
| 13 | +    for d in $dates; do
| 14 | +        ymdstring=${ymstring}-$d
| 15 | +        counts=$(grep " $ymdstring " dateanalysis.txt | awk '{ print $4 }' | sort | uniq -c | sort -n -k2,2 | sed -e ':a;N;$!ba;s/ \+/ /g; s/\n/,/g')
| 16 | +        printf '%s %s\n' "$ymdstring" "$counts" >> "$outfile"  # always ends the line, even when the date has no entries
| 17 | +    done
| 18 | +}
| 19 | +
| 20 | +dates30="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30"
| 21 | +dates31="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
| 22 | +datesdec="01 02 03 04 05 06 07"
| 23 | +
| 24 | +dates="$dates31" ; outfileprefix=aug ; ymstring="2011-08"
| 25 | +domonth
| 26 | +
| 27 | +dates="$dates30" ; outfileprefix=sept ; ymstring="2011-09"
| 28 | +domonth
| 29 | +
| 30 | +dates="$dates31" ; outfileprefix=oct ; ymstring="2011-10"
| 31 | +domonth
| 32 | +
| 33 | +dates="$dates30" ; outfileprefix=nov ; ymstring="2011-11"
| 34 | +domonth
| 35 | +
| 36 | +dates="$datesdec" ; outfileprefix=dec ; ymstring="2011-12"
| 37 | +domonth
| 38 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 39 | + native |
Added: svn:executable |
2 | 40 | + * |
Index: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates-created.sh |
— | — | @@ -0,0 +1,41 @@ |
| 2 | +#!/bin/bash
| 3 | +
| 4 | +# like do-dateanal-dates.sh, creates output files each of which has
| 5 | +# a list by date of the number of thumb dirs with 1 thumb file created
| 6 | +# on that date, with 2 thumb files created on that date, with 3, etc.
| 7 | +# but with the difference that it counts only the thumbs made at the
| 8 | +# time of creation of the dir (i.e. the earliest dated files in the
| 9 | +# dir). These would presumably be created as the result of an
| 10 | +# image upload.
| 11 | +
| 12 | +python thumbDateAnalysis.py --created 0-00-files.txt.nobad > datecreated.txt
| 13 | +
| 14 | +domonth() {  # writes one line per day of a month; reads globals dates, outfileprefix, ymstring
| 15 | +    outfile=${outfileprefix}-datecreated.txt
| 16 | +    rm -f "$outfile"  # -f: no error on the first run, when the file does not exist yet
| 17 | +    for d in $dates; do
| 18 | +        ymdstring=${ymstring}-$d
| 19 | +        counts=$(grep " $ymdstring " datecreated.txt | awk '{ print $4 }' | sort | uniq -c | sort -n -k2,2 | sed -e ':a;N;$!ba;s/ \+/ /g; s/\n/,/g')
| 20 | +        printf '%s %s\n' "$ymdstring" "$counts" >> "$outfile"  # always ends the line, even when the date has no entries
| 21 | +    done
| 22 | +}
| 23 | +
| 24 | +dates30="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30"
| 25 | +dates31="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
| 26 | +datesdec="01 02 03 04 05 06 07"
| 27 | +
| 28 | +dates="$dates31" ; outfileprefix=aug ; ymstring="2011-08"
| 29 | +domonth
| 30 | +
| 31 | +dates="$dates30" ; outfileprefix=sept ; ymstring="2011-09"
| 32 | +domonth
| 33 | +
| 34 | +dates="$dates31" ; outfileprefix=oct ; ymstring="2011-10"
| 35 | +domonth
| 36 | +
| 37 | +dates="$dates30" ; outfileprefix=nov ; ymstring="2011-11"
| 38 | +domonth
| 39 | +
| 40 | +dates="$datesdec" ; outfileprefix=dec ; ymstring="2011-12"
| 41 | +domonth
| 42 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/samples/do-dateanal-dates-created.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 43 | + native |
Added: svn:executable |
2 | 44 | + * |
Index: branches/ariel/tools/thumbs/crunchinglogs/samples/do-pixel-sizes.sh |
— | — | @@ -0,0 +1,43 @@ |
| 2 | +#!/bin/bash
| 3 | +
| 4 | +# creates one file per month; each line has a date followed by, for each of a
| 5 | +# fixed list of sizes (320px, 640px, 1024px, 1280px), the size and the number of
| 6 | +# pixelsizes.txt lines (one per thumb dir and date) that have a thumb of that size
| 7 | +
| 8 | +python thumbPxSize.py 0-00-files.txt.nobad > pixelsizes.txt
| 9 | +
| 10 | +domonth() {  # writes one line per day of a month; reads globals dates, sizes, outfileprefix, ymstring
| 11 | +    outfile=${outfileprefix}-pixelsizes.txt
| 12 | +    rm -f "$outfile"  # -f: no error on the first run, when the file does not exist yet
| 13 | +    for d in $dates; do
| 14 | +        ymdstring=${ymstring}-$d
| 15 | +        echo -n "$ymdstring " >> "$outfile"
| 16 | +        for size in $sizes; do
| 17 | +            printf "%d " "$size" >> "$outfile"
| 18 | +            count=$(grep " $ymdstring " pixelsizes.txt | grep -c " ${size}:")  # -c counts matching lines, 0 if none
| 19 | +            printf "%3d " "$count" >> "$outfile"
| 20 | +        done
| 21 | +        echo >> "$outfile"
| 22 | +    done
| 23 | +}
| 24 | +
| 25 | +dates30="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30"
| 26 | +dates31="01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
| 27 | +datesdec="01 02 03 04 05 06 07"
| 28 | +sizes="320 640 1024 1280"
| 29 | +
| 30 | +dates="$dates31" ; outfileprefix=aug ; ymstring="2011-08"
| 31 | +domonth
| 32 | +
| 33 | +dates="$dates30" ; outfileprefix=sept ; ymstring="2011-09"
| 34 | +domonth
| 35 | +
| 36 | +dates="$dates31" ; outfileprefix=oct ; ymstring="2011-10"
| 37 | +domonth
| 38 | +
| 39 | +dates="$dates30" ; outfileprefix=nov ; ymstring="2011-11"
| 40 | +domonth
| 41 | +
| 42 | +dates="$datesdec" ; outfileprefix=dec ; ymstring="2011-12"
| 43 | +domonth
| 44 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/samples/do-pixel-sizes.sh |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 45 | + native |
Added: svn:executable |
2 | 46 | + * |
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listThumbFilesByDir.py |
— | — | @@ -0,0 +1,28 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | + |
| 9 | +# given a list of thumb dirs, list the files in |
| 10 | +# each dir, not sorted in any fashion, for input |
| 11 | +# to other scripts |
| 12 | + |
| 13 | +def listFiles(dirName):  # print "mtime(UTC) size path" for each regular file directly inside dirName
| 14 | +    for entry in os.listdir(dirName):
| 15 | +        fName = dirName + "/" + entry
| 16 | +        if not os.path.isfile(fName):
| 17 | +            continue  # subdirectories and other non-files are not listed
| 18 | +        info = os.stat(fName)
| 19 | +        stamp = time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(info.st_mtime))
| 20 | +        print stamp, " ", info.st_size, " ", fName
| 21 | + |
| 22 | +if __name__ == "__main__":
| 23 | +    # stdin: one directory name per line; names that are not existing dirs are ignored
| 24 | +    for line in sys.stdin:
| 25 | +        dName = line.rstrip()
| 26 | +        if os.path.isdir(dName):
| 27 | +            listFiles(dName)
| 28 | +    sys.exit(0)
| 29 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listThumbFilesByDir.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 30 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/checkExistingThumbDirs.py |
— | — | @@ -0,0 +1,22 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | + |
| 9 | +# after a cleanup of some dirs, run this on the list of dirs to check |
| 10 | +# how many of them have been recreated in the meantime |
| 11 | + |
| 12 | +if __name__ == "__main__":  # stdin: one dir name per line; reports how many of them exist
| 13 | +    count = numDirs = 0  # lines read so far / how many of those are existing directories
| 14 | +    for line in sys.stdin:
| 15 | +        dname = line.rstrip()
| 16 | +        if os.path.isdir(dname):
| 17 | +            numDirs = numDirs + 1
| 18 | +        count = count + 1
| 19 | +        if count % 1000 == 0:  # progress report every 1000 input lines
| 20 | +            print "count:", count, "reached dir:", dname, "existing dirs:", numDirs
| 21 | +    print "total count:", count, "existing dirs:", numDirs  # final tally; otherwise lost unless count is a multiple of 1000
| 22 | +    sys.exit(0)
| 23 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/checkExistingThumbDirs.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 24 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listFileNames.py |
— | — | @@ -0,0 +1,30 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | +import hashlib |
| 9 | + |
| 10 | +# convert image filenames (with _) to |
| 11 | +# full path with hash, for use by other scripts |
| 12 | + |
| 13 | +def getHashPathForLevel( name, levels ):  # levels=2 gives "<h1>/<h1h2>/" from the md5 hex digest of name
| 14 | +    if levels == 0:
| 15 | +        return ''
| 16 | +    # each level contributes a directory named by a one-character-longer
| 17 | +    # prefix of the hex digest: digest[0:1]/digest[0:2]/...
| 18 | +    digest = hashlib.md5( name ).hexdigest()
| 19 | +    parts = []
| 20 | +    for depth in range( 1,levels+1 ):
| 21 | +        parts.append( digest[0:depth] )
| 22 | +    parts.append( '' )  # makes the join end with a trailing '/'
| 23 | +    return '/'.join( parts )
| 24 | + |
| 25 | +if __name__ == "__main__":  # stdin: one image file name per line; stdout: its full thumb dir path
| 26 | +    basedir="/export/thumbs/wikipedia/commons/thumb/"
| 27 | +    for line in sys.stdin:
| 28 | +        fname = line.rstrip()
| 29 | +        # two hash levels are inserted between the base dir and the name
| 30 | +        fullPath = basedir + getHashPathForLevel(fname,2) + fname
| 31 | +        print fullPath
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/listFileNames.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 32 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/removeThumbDirs.py |
— | — | @@ -0,0 +1,29 @@ |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | +import time |
| 8 | + |
| 9 | +# given a list of thumb dirs for cleanup, |
| 10 | +# toss all files in them and also the dirs themselves |
| 11 | + |
| 12 | +def rmDirAndFiles(dirName):  # delete the regular files directly inside dirName, then dirName itself
| 13 | +    for entry in os.listdir(dirName):
| 14 | +        target = dirName + "/" + entry
| 15 | +        if os.path.isfile(target): os.remove(target)  # anything that is not a regular file is left in place
| 16 | +    os.rmdir(dirName)  # raises OSError if something (e.g. a subdirectory) is still inside
| 17 | + |
| 18 | +if __name__ == "__main__":  # stdin: one thumb dir per line; each existing dir is emptied and removed
| 19 | +    count = 0  # number of dirs actually removed
| 20 | +    for line in sys.stdin:
| 21 | +        dname = line.rstrip()
| 22 | +        if not os.path.isdir(dname):
| 23 | +            continue  # nothing removed, so no progress report and no throttling for this line
| 24 | +        rmDirAndFiles(dname)
| 25 | +        count = count + 1
| 26 | +        if count % 1000 == 0:
| 27 | +            print "count ", count, "removed dir", dname
| 28 | +        if count % 100 == 0: time.sleep(5)  # pause after every 100 removals; presumably to spare the i/o-bound host
| 29 | +    sys.exit(0)
| 30 | + |
Property changes on: branches/ariel/tools/thumbs/crunchinglogs/otherscripts/removeThumbDirs.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 31 | + native |
Index: branches/ariel/tools/thumbs/crunchinglogs/README |
— | — | @@ -0,0 +1,16 @@ |
| 2 | +These scripts were written so we could get some notion of what was going on with thumbs, given that |
| 3 | +we don't keep logs and the host is i/o-bound so we can't just do a pile of finds. |
| 4 | + |
| 5 | +I can't imagine they will be useful to someone else but they might be useful to us sometime, who knows |
| 6 | + |
| 7 | +stats on the thumbs files on the filesystem: |
| 8 | +* go to (for example) commons/thumb/0/00, run an ls --sort=none, capture results into some file |
| 9 | +* feed that file as input to python listThumbFilesByDir.py and save the output of that to another file
| 10 | +* filter that output as needed to drop bad file names, saving the result as 0-00-files.txt.nobad
| 11 | +* now you can run the following: do-dateanal-dates-created.sh do-dateanal-dates.sh do-pixel-sizes.sh |
| 12 | + they will create a little pile of files aug*txt sept*txt etc. |
| 13 | + |
| 14 | +stats on googlebot requests: |
| 15 | +* go to locke, zcat sample*log*gz | grep "Googlebot-Image" and gzip the output into googlebot-image-requests.gz |
| 16 | +* now you can run check-all-dates.sh and it produces a small pile of output files |
| 17 | + |