summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jehiah Czebotar <jehiah@gmail.com> 2010-10-14 18:36:34 -0400
committer: Jehiah Czebotar <jehiah@gmail.com> 2010-10-14 18:36:34 -0400
commit: 1db934e826ce502d2fa739f677b513a7fa081063 (patch)
tree: ff1cb5e85676b55c50f038fd6cbcd2409eb1782f
parent: 32118e88770a21b4e7d193feb9e5a176034114d4 (diff)
download: data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.zip
data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.gz
data_hacks-1db934e826ce502d2fa739f677b513a7fa081063.tar.bz2
adding command line args
-rw-r--r-- data_hacks/histogram.py 64
-rw-r--r-- data_hacks/nintey_five_percent.py 4
-rw-r--r-- data_hacks/run_for.py 18
-rw-r--r-- data_hacks/sample.py 28
-rw-r--r-- setup.py 1
5 files changed, 90 insertions, 25 deletions
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 46c4796..383e0de 100644
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -15,6 +15,7 @@ http://github.com/bitly/data_hacks
import sys
from decimal import Decimal
import math
+from optparse import OptionParser
class MVSD(object):
""" A class that calculates a running Mean / Variance / Standard Deviation"""
@@ -76,14 +77,29 @@ def load_stream(input_stream):
except:
print >>sys.stderr, "invalid line %r" % line
-def histogram(stream):
+def histogram(stream, options):
# we can't iterate on stream because we need to get min/max first and then put it into buckets
- data = list(stream)
- buckets = 10
+ if not options.min or not options.max:
+ # glob the data here so we can do min/max on it
+ data = list(stream)
+ else:
+ data = stream
bucket_scale = 1
- min_v = min(data)
- max_v = max(data)
+ if options.min:
+ min_v = Decimal(options.min)
+ else:
+ min_v = min(data)
+ if options.max:
+ max_v = Decimal(options.max)
+ else:
+ max_v = max(data)
+ buckets = options.buckets and int(options.buckets) or 10
+ if buckets <= 0:
+ raise ValueError('# of buckets must be > 0')
+ if not max_v > min_v:
+ raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
+
diff = max_v - min_v
step = diff / buckets
bucket_counts = [0 for x in range(buckets)]
@@ -91,21 +107,31 @@ def histogram(stream):
for x in range(buckets):
boundaries.append(min_v + (step * (x + 1)))
+ skipped = 0
+ samples = 0
mvsd = MVSD()
for value in data:
- mvsd.add(value)
+ samples +=1
+ if options.mvsd:
+ mvsd.add(value)
# find the bucket this goes in
+ if value < min_v or value > max_v:
+ skipped +=1
+ continue
for bucket_postion, boundary in enumerate(boundaries):
if value <= boundary:
bucket_counts[bucket_postion] +=1
break
- # auto-pick the bucket size
+ # auto-pick the hash scale
if max(bucket_counts) > 75:
bucket_scale = int(max(bucket_counts) / 75)
- print "# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v)
- print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
+ print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v)
+ if skipped:
+ print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
+ if options.mvsd:
+ print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
print "# each * represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
@@ -120,4 +146,22 @@ def histogram(stream):
if __name__ == "__main__":
- histogram(load_stream(sys.stdin))
+ parser = OptionParser()
+ parser.usage = "cat data | %prog [options]"
+ parser.add_option("-m", "--min", dest="min",
+ help="minimum value for graph")
+ parser.add_option("-x", "--max", dest="max",
+ help="maximum value for graph")
+ parser.add_option("-b", "--buckets", dest="buckets",
+ help="Number of buckets to use for the histogram")
+ parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
+ help="Dissable the calculation of Mean, Vairance and SD. (improves performance)")
+
+ (options, args) = parser.parse_args()
+ if sys.stdin.isatty():
+ # if isatty() that means it's run without anything piped into it
+ parser.print_usage()
+ print "for more help use --help"
+ sys.exit(1)
+ histogram(load_stream(sys.stdin), options)
+
diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py
index b897fe8..2717486 100644
--- a/data_hacks/nintey_five_percent.py
+++ b/data_hacks/nintey_five_percent.py
@@ -8,6 +8,7 @@ Copyright (c) 2010 bit.ly. All rights reserved.
http://github.com/bitly/data_hacks
"""
import sys
+import os
from decimal import Decimal
def run():
@@ -42,4 +43,7 @@ def calc_95(data, count):
return t
if __name__ == "__main__":
+ if sys.stdin.isatty():
+ print "Usage: cat data | %(prog)s" % os.path.basename(sys.argv[0])
+ sys.exit(1)
run()
diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py
index df7630e..08883b4 100644
--- a/data_hacks/run_for.py
+++ b/data_hacks/run_for.py
@@ -9,6 +9,7 @@ http://github.com/bitly/data_hacks
"""
import time
import sys
+import os
def getruntime(arg):
if not arg:
@@ -17,12 +18,14 @@ def getruntime(arg):
base = int(arg[:-1])
if suffix == "s":
return base
- elif suffix == "h":
+ elif suffix == "m":
return base * 60
+ elif suffix == "h":
+ return base * 60 * 60
elif suffix == "d":
- return base * 60 * 24
+ return base * 60 * 60 * 24
else:
- print >>sys.stderr, "invalid time suffix %r" % arg
+ print >>sys.stderr, "invalid time suffix %r. must be one of s,m,h,d" % arg
def run(runtime):
end = time.time() + runtime
@@ -35,8 +38,15 @@ def run(runtime):
return
if __name__ == "__main__":
+ usage = "Usage: tail -f access.log | %(prog)s [time] | ..." % os.path.basename(sys.argv[0])
+ help = "time can be in the format 10s 10m 10h etc"
+ if sys.stdin.isatty():
+ print usage
+ print help
+ sys.exit(1)
+
runtime = getruntime(sys.argv[-1])
if not runtime:
- print >>sys.stderr, "usage: tail -f access.log | run_for.py 10s | wc -l"
+ print usage
sys.exit(1)
run(runtime)
\ No newline at end of file
diff --git a/data_hacks/sample.py b/data_hacks/sample.py
index dbed4f3..a57bd43 100644
--- a/data_hacks/sample.py
+++ b/data_hacks/sample.py
@@ -10,6 +10,7 @@ http://github.com/bitly/data_hacks
import sys
import random
+from optparse import OptionParser
from decimal import Decimal
def usage():
@@ -37,22 +38,27 @@ def get_sample_rate(rate_string):
rate = Decimal(x) / (Decimal(y) * Decimal('1.0'))
rate = int(rate * 100)
else:
- raise Exception("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string)
+ raise ValueError("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string)
if rate < 1 or rate > 100:
- raise Exception('rate %r must be 1%% <= rate <= 100%% ' % rate_string)
+ raise ValueError('rate %r must be 1%% <= rate <= 100%% ' % rate_string)
return rate
if __name__ == "__main__":
- debug = '--debug' in sys.argv
- try:
- sys.argv.remove('--debug')
- except ValueError:
- pass
- if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) != 2:
- usage()
+ parser = OptionParser()
+ parser.usage = "cat data | %prog [options] [sample_rate]"
+ parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true")
+ (options, args) = parser.parse_args()
+
+ if not args or sys.stdin.isatty():
+ parser.print_usage()
sys.exit(1)
- sample_rate = get_sample_rate(sys.argv[-1])
- if debug:
+ try:
+ sample_rate = get_sample_rate(sys.argv[-1])
+ except ValueError, e:
+ print >>sys.stderr, e
+ parser.print_usage()
+ sys.exit(1)
+ if options.verbose:
print >>sys.stderr, "Sample rate is %d%%" % sample_rate
run(sample_rate)
diff --git a/setup.py b/setup.py
index b4f46af..111854f 100644
--- a/setup.py
+++ b/setup.py
@@ -9,5 +9,6 @@ setup(name='data_hacks',
# packages=['data_hacks'],
scripts = ['data_hacks/histogram.py',
'data_hacks/nintey_five_percent.py',
+ 'data_hacks/run_for.py',
'data_hacks/sample.py']
)
\ No newline at end of file