diff options
author | Jehiah Czebotar <jehiah@gmail.com> | 2010-10-14 18:36:34 -0400 |
---|---|---|
committer | Jehiah Czebotar <jehiah@gmail.com> | 2010-10-14 18:36:34 -0400 |
commit | 1db934e826ce502d2fa739f677b513a7fa081063 (patch) | |
tree | ff1cb5e85676b55c50f038fd6cbcd2409eb1782f | |
parent | 32118e88770a21b4e7d193feb9e5a176034114d4 (diff) | |
download | data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.zip data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.gz data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.bz2 |
adding command line args
-rw-r--r-- | data_hacks/histogram.py | 64 | ||||
-rw-r--r-- | data_hacks/nintey_five_percent.py | 4 | ||||
-rw-r--r-- | data_hacks/run_for.py | 18 | ||||
-rw-r--r-- | data_hacks/sample.py | 28 | ||||
-rw-r--r-- | setup.py | 1 |
5 files changed, 90 insertions, 25 deletions
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py index 46c4796..383e0de 100644 --- a/data_hacks/histogram.py +++ b/data_hacks/histogram.py @@ -15,6 +15,7 @@ http://github.com/bitly/data_hacks import sys from decimal import Decimal import math +from optparse import OptionParser class MVSD(object): """ A class that calculates a running Mean / Variance / Standard Deviation""" @@ -76,14 +77,29 @@ def load_stream(input_stream): except: print >>sys.stderr, "invalid line %r" % line -def histogram(stream): +def histogram(stream, options): # we can't iterate on stream because we need to get min/max first and then put it into buckets - data = list(stream) - buckets = 10 + if not options.min or not options.max: + # glob the data here so we can do min/max on it + data = list(stream) + else: + data = stream bucket_scale = 1 - min_v = min(data) - max_v = max(data) + if options.min: + min_v = Decimal(options.min) + else: + min_v = min(data) + if options.max: + max_v = Decimal(options.max) + else: + max_v = max(data) + buckets = options.buckets and int(options.buckets) or 10 + if buckets <= 0: + raise ValueError('# of buckets must be > 0') + if not max_v > min_v: + raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v)) + diff = max_v - min_v step = diff / buckets bucket_counts = [0 for x in range(buckets)] @@ -91,21 +107,31 @@ def histogram(stream): for x in range(buckets): boundaries.append(min_v + (step * (x + 1))) + skipped = 0 + samples = 0 mvsd = MVSD() for value in data: - mvsd.add(value) + samples +=1 + if options.mvsd: + mvsd.add(value) # find the bucket this goes in + if value < min_v or value > max_v: + skipped +=1 + continue for bucket_postion, boundary in enumerate(boundaries): if value <= boundary: bucket_counts[bucket_postion] +=1 break - # auto-pick the bucket size + # auto-pick the hash scale if max(bucket_counts) > 75: bucket_scale = int(max(bucket_counts) / 75) - print "# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v) - print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd()) + print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v) + if skipped: + print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '') + if options.mvsd: + print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd()) print "# each * represents a count of %d" % bucket_scale bucket_min = min_v bucket_max = min_v @@ -120,4 +146,22 @@ def histogram(stream): if __name__ == "__main__": - histogram(load_stream(sys.stdin)) + parser = OptionParser() + parser.usage = "cat data | %prog [options]" + parser.add_option("-m", "--min", dest="min", + help="minimum value for graph") + parser.add_option("-x", "--max", dest="max", + help="maximum value for graph") + parser.add_option("-b", "--buckets", dest="buckets", + help="Number of buckets to use for the histogram") + parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True, + help="Dissable the calculation of Mean, Vairance and SD. (improves performance)") + + (options, args) = parser.parse_args() + if sys.stdin.isatty(): + # if isatty() that means it's run without anything piped into it + parser.print_usage() + print "for more help use --help" + sys.exit(1) + histogram(load_stream(sys.stdin), options) + diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py index b897fe8..2717486 100644 --- a/data_hacks/nintey_five_percent.py +++ b/data_hacks/nintey_five_percent.py @@ -8,6 +8,7 @@ Copyright (c) 2010 bit.ly. All rights reserved. http://github.com/bitly/data_hacks """ import sys +import os from decimal import Decimal def run(): @@ -42,4 +43,7 @@ def calc_95(data, count): return t if __name__ == "__main__": + if sys.stdin.isatty(): + print "Usage: cat data | %(prog)s" % os.path.basename(sys.argv[0]) + sys.exit(1) run() diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py index df7630e..08883b4 100644 --- a/data_hacks/run_for.py +++ b/data_hacks/run_for.py @@ -9,6 +9,7 @@ http://github.com/bitly/data_hacks """ import time import sys +import os def getruntime(arg): if not arg: @@ -17,12 +18,14 @@ def getruntime(arg): base = int(arg[:-1]) if suffix == "s": return base - elif suffix == "h": + elif suffix == "m": return base * 60 + elif suffix == "h": + return base * 60 * 60 elif suffix == "d": - return base * 60 * 24 + return base * 60 * 60 * 24 else: - print >>sys.stderr, "invalid time suffix %r" % arg + print >>sys.stderr, "invalid time suffix %r. must be one of s,m,h,d" % arg def run(runtime): end = time.time() + runtime @@ -35,8 +38,15 @@ def run(runtime): return if __name__ == "__main__": + usage = "Usage: tail -f access.log | %(prog)s [time] | ..." % os.path.basename(sys.argv[0]) + help = "time can be in the format 10s 10m 10h etc" + if sys.stdin.isatty(): + print usage + print help + sys.exit(1) + runtime = getruntime(sys.argv[-1]) if not runtime: - print >>sys.stderr, "usage: tail -f access.log | run_for.py 10s | wc -l" + print usage sys.exit(1) run(runtime)
\ No newline at end of file diff --git a/data_hacks/sample.py b/data_hacks/sample.py index dbed4f3..a57bd43 100644 --- a/data_hacks/sample.py +++ b/data_hacks/sample.py @@ -10,6 +10,7 @@ http://github.com/bitly/data_hacks import sys import random +from optparse import OptionParser from decimal import Decimal def usage(): @@ -37,22 +38,27 @@ def get_sample_rate(rate_string): rate = Decimal(x) / (Decimal(y) * Decimal('1.0')) rate = int(rate * 100) else: - raise Exception("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string) + raise ValueError("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string) if rate < 1 or rate > 100: - raise Exception('rate %r must be 1%% <= rate <= 100%% ' % rate_string) + raise ValueError('rate %r must be 1%% <= rate <= 100%% ' % rate_string) return rate if __name__ == "__main__": - debug = '--debug' in sys.argv - try: - sys.argv.remove('--debug') - except ValueError: - pass - if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) != 2: - usage() + parser = OptionParser() + parser.usage = "cat data | %prog [options] [sample_rate]" + parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true") + (options, args) = parser.parse_args() + + if not args or sys.stdin.isatty(): + parser.print_usage() sys.exit(1) - sample_rate = get_sample_rate(sys.argv[-1]) - if debug: + try: + sample_rate = get_sample_rate(sys.argv[-1]) + except ValueError, e: + print >>sys.stderr, e + parser.print_usage() + sys.exit(1) + if options.verbose: print >>sys.stderr, "Sample rate is %d%%" % sample_rate run(sample_rate) @@ -9,5 +9,6 @@ setup(name='data_hacks', # packages=['data_hacks'], scripts = ['data_hacks/histogram.py', 'data_hacks/nintey_five_percent.py', + 'data_hacks/run_for.py', 'data_hacks/sample.py'] )
\ No newline at end of file |