adding command line args

author: Jehiah Czebotar <jehiah@gmail.com> 2010-10-14 18:36:34 -0400
committer: Jehiah Czebotar <jehiah@gmail.com> 2010-10-14 18:36:34 -0400
commit: 1db934e826ce502d2fa739f677b513a7fa081063 (patch)
tree: ff1cb5e85676b55c50f038fd6cbcd2409eb1782f
parent: 32118e88770a21b4e7d193feb9e5a176034114d4 (diff)
download: data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.zip
data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.gz
data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.bz2
5 files changed, 90 insertions, 25 deletions
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 46c4796..383e0de 100644
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -15,6 +15,7 @@ http://github.com/bitly/data_hacks
 import sys
 from decimal import Decimal
 import math
+from optparse import OptionParser
 
 class MVSD(object):
     """ A class that calculates a running Mean / Variance / Standard Deviation"""
@@ -76,14 +77,29 @@ def load_stream(input_stream):
             except:
                 print >>sys.stderr, "invalid line %r" % line
 
-def histogram(stream):
+def histogram(stream, options):
     # we can't iterate on stream because we need to get min/max first and then put it into buckets
-    data = list(stream)
-    buckets = 10
+    if not options.min or not options.max:
+        # glob the data here so we can do min/max on it
+        data = list(stream)
+    else:
+        data = stream
     bucket_scale = 1
     
-    min_v = min(data)
-    max_v = max(data)
+    if options.min:
+        min_v = Decimal(options.min)
+    else:
+        min_v = min(data)
+    if options.max:
+        max_v = Decimal(options.max)
+    else:
+        max_v = max(data)
+    buckets = options.buckets and int(options.buckets) or 10
+    if buckets <= 0:
+        raise ValueError('# of buckets must be > 0')
+    if not max_v > min_v:
+        raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
+        
     diff = max_v - min_v
     step = diff / buckets
     bucket_counts = [0 for x in range(buckets)]
@@ -91,21 +107,31 @@ def histogram(stream):
     for x in range(buckets):
         boundaries.append(min_v + (step * (x + 1)))
     
+    skipped = 0
+    samples = 0
     mvsd = MVSD()
     for value in data:
-        mvsd.add(value)
+        samples +=1
+        if options.mvsd:
+            mvsd.add(value)
         # find the bucket this goes in
+        if value < min_v or value > max_v:
+            skipped +=1
+            continue
         for bucket_postion, boundary in enumerate(boundaries):
             if value <= boundary:
                 bucket_counts[bucket_postion] +=1
                 break
     
-    # auto-pick the bucket size
+    # auto-pick the hash scale
     if max(bucket_counts) > 75:
         bucket_scale = int(max(bucket_counts) / 75)
     
-    print "# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v)
-    print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
+    print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v)
+    if skipped:
+        print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
+    if options.mvsd:
+        print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
     print "# each * represents a count of %d" % bucket_scale
     bucket_min = min_v
     bucket_max = min_v
@@ -120,4 +146,22 @@ def histogram(stream):
         
 
 if __name__ == "__main__":
-    histogram(load_stream(sys.stdin))
+    parser = OptionParser()
+    parser.usage = "cat data | %prog [options]"
+    parser.add_option("-m", "--min", dest="min",
+                        help="minimum value for graph")
+    parser.add_option("-x", "--max", dest="max",
+                        help="maximum value for graph")
+    parser.add_option("-b", "--buckets", dest="buckets",
+                        help="Number of buckets to use for the histogram")
+    parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
+                        help="Dissable the calculation of Mean, Vairance and SD. (improves performance)")
+
+    (options, args) = parser.parse_args()
+    if sys.stdin.isatty():
+        # if isatty() that means it's run without anything piped into it
+        parser.print_usage()
+        print "for more help use --help"
+        sys.exit(1)
+    histogram(load_stream(sys.stdin), options)
+
diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py
index b897fe8..2717486 100644
--- a/data_hacks/nintey_five_percent.py
+++ b/data_hacks/nintey_five_percent.py
@@ -8,6 +8,7 @@ Copyright (c) 2010 bit.ly. All rights reserved.
 http://github.com/bitly/data_hacks
 """
 import sys
+import os
 from decimal import Decimal
 
 def run():
@@ -42,4 +43,7 @@ def calc_95(data, count):
             return t
 
 if __name__ == "__main__":
+    if sys.stdin.isatty():
+        print "Usage: cat data | %(prog)s" % os.path.basename(sys.argv[0])
+        sys.exit(1)
     run()
diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py
index df7630e..08883b4 100644
--- a/data_hacks/run_for.py
+++ b/data_hacks/run_for.py
@@ -9,6 +9,7 @@ http://github.com/bitly/data_hacks
 """
 import time
 import sys
+import os
 
 def getruntime(arg):
     if not arg:
@@ -17,12 +18,14 @@ def getruntime(arg):
     base = int(arg[:-1])
     if suffix == "s":
         return base
-    elif suffix == "h":
+    elif suffix == "m":
         return base * 60
+    elif suffix == "h":
+        return base * 60 * 60
     elif suffix == "d":
-        return base * 60 * 24
+        return base * 60 * 60 * 24
     else:
-        print >>sys.stderr, "invalid time suffix %r" % arg
+        print >>sys.stderr, "invalid time suffix %r. must be one of s,m,h,d" % arg
 
 def run(runtime):
     end = time.time() + runtime
@@ -35,8 +38,15 @@ def run(runtime):
             return
 
 if __name__ == "__main__":
+    usage = "Usage: tail -f access.log | %(prog)s [time] | ..." % os.path.basename(sys.argv[0])
+    help = "time can be in the format 10s 10m 10h etc"
+    if sys.stdin.isatty():
+        print usage
+        print help
+        sys.exit(1)
+
     runtime = getruntime(sys.argv[-1])
     if not runtime:
-        print >>sys.stderr, "usage: tail -f access.log | run_for.py 10s | wc -l"
+        print usage
         sys.exit(1)
     run(runtime)
 \ No newline at end of file
diff --git a/data_hacks/sample.py b/data_hacks/sample.py
index dbed4f3..a57bd43 100644
--- a/data_hacks/sample.py
+++ b/data_hacks/sample.py
@@ -10,6 +10,7 @@ http://github.com/bitly/data_hacks
 
 import sys
 import random
+from optparse import OptionParser
 from decimal import Decimal
 
 def usage():
@@ -37,22 +38,27 @@ def get_sample_rate(rate_string):
         rate = Decimal(x) / (Decimal(y) * Decimal('1.0'))
         rate = int(rate * 100)
     else:
-        raise Exception("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string)
+        raise ValueError("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string)
     if rate < 1 or rate > 100:
-        raise Exception('rate %r must be 1%% <= rate <= 100%% ' % rate_string)
+        raise ValueError('rate %r must be 1%% <= rate <= 100%% ' % rate_string)
     return rate
 
 if __name__ == "__main__":
-    debug = '--debug' in sys.argv
-    try:
-        sys.argv.remove('--debug')
-    except ValueError:
-        pass
-    if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) != 2:
-        usage()
+    parser = OptionParser()
+    parser.usage = "cat data | %prog [options] [sample_rate]"
+    parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true")
+    (options, args) = parser.parse_args()
+    
+    if not args or sys.stdin.isatty():
+        parser.print_usage()
         sys.exit(1)
     
-    sample_rate = get_sample_rate(sys.argv[-1])
-    if debug:
+    try:
+        sample_rate = get_sample_rate(sys.argv[-1])
+    except ValueError, e:
+        print >>sys.stderr, e
+        parser.print_usage()
+        sys.exit(1)
+    if options.verbose:
         print >>sys.stderr, "Sample rate is %d%%" % sample_rate 
     run(sample_rate)
diff --git a/setup.py b/setup.py
index b4f46af..111854f 100644
--- a/setup.py
+++ b/setup.py
@@ -9,5 +9,6 @@ setup(name='data_hacks',
       # packages=['data_hacks'],
       scripts = ['data_hacks/histogram.py', 
                 'data_hacks/nintey_five_percent.py',
+                'data_hacks/run_for.py',
                 'data_hacks/sample.py']
      )
 \ No newline at end of file
author	Jehiah Czebotar <jehiah@gmail.com>	2010-10-14 18:36:34 -0400
committer	Jehiah Czebotar <jehiah@gmail.com>	2010-10-14 18:36:34 -0400
commit	1db934e826ce502d2fa739f677b513a7fa081063 (patch)
tree	ff1cb5e85676b55c50f038fd6cbcd2409eb1782f
parent	32118e88770a21b4e7d193feb9e5a176034114d4 (diff)
download	data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.zip data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.gz data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.bz2