diff options
author | Jehiah Czebotar <jehiah@gmail.com> | 2010-09-28 18:10:26 -0400 |
---|---|---|
committer | Jehiah Czebotar <jehiah@gmail.com> | 2010-09-28 18:10:26 -0400 |
commit | 41ea77f62c3adf9164c46cdddc733aa594517be9 (patch) | |
tree | 6381e4f0ddf3dfc9cf983aedf55ba6beb5bd2693 | |
download | data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.zip data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.gz data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.bz2 |
initial histogram and 95% utilities
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | README.markdown | 44 | ||||
-rw-r--r-- | data_hacks/histogram.py | 123 | ||||
-rw-r--r-- | data_hacks/nintey_five_percent.py | 45 | ||||
-rw-r--r-- | setup.py | 12 |
5 files changed, 225 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c795b05 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build
\ No newline at end of file diff --git a/README.markdown b/README.markdown new file mode 100644 index 0000000..94553d9 --- /dev/null +++ b/README.markdown @@ -0,0 +1,44 @@ +data_hacks +======== + +Command line utilities for data analysis + +histogram +========= + +A utility that parses input data points and outputs a text histogram + +Example: + + $ cat /tmp/data | histogram.py + # NumSamples = 29; Max = 10.00; Min = 1.00 + # Mean = 4.379310; Variance = 5.131986; SD = 2.265389 + # each * represents a count of 1 + 1.0000 - 1.9000 [ 1]: * + 1.9000 - 2.8000 [ 5]: ***** + 2.8000 - 3.7000 [ 8]: ******** + 3.7000 - 4.6000 [ 3]: *** + 4.6000 - 5.5000 [ 4]: **** + 5.5000 - 6.4000 [ 2]: ** + 6.4000 - 7.3000 [ 3]: *** + 7.3000 - 8.2000 [ 1]: * + 8.2000 - 9.1000 [ 1]: * + 9.1000 - 10.0000 [ 1]: * + +nintey_five_percent +=================== + +A utility script that takes a stream of decimal values and outputs the 95% time. + +This is useful for finding the 95% response time from access logs. + +Example: + + $ cat access.log | awk '{print $NF}' | nintey_five_percent.py + +Installation +============ + +Installing from source: + +pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py new file mode 100644 index 0000000..46c4796 --- /dev/null +++ b/data_hacks/histogram.py @@ -0,0 +1,123 @@ +#!/bin/env python +""" +Generate a text format histogram + +This is a loose port to python of the Perl version at +http://www.pandamatak.com/people/anand/xfer/histo + +created by Jehiah Czebotar 2010-09-27 +Copyright (c) 2010 bit.ly. All rights reserved. 
+ +http://github.com/bitly/data_hacks + +""" + +import sys +from decimal import Decimal +import math + +class MVSD(object): + """ A class that calculates a running Mean / Variance / Standard Deviation""" + def __init__(self): + self.is_started = False + self.ss = Decimal(0) # (running) sum of square deviations from mean + self.m = Decimal(0) # (running) mean + self.total_w = Decimal(0) # weight of items seen + + def add(self, x, w=1): + x = x * Decimal('1.0') + if not self.is_started: + self.m = x + self.ss = Decimal(0) + self.total_w = w + self.is_started = True + else: + temp_w = self.total_w + w + self.ss += (self.total_w * w * (x - self.m) * (x - self.m )) / temp_w + self.m += (x - self.m) / temp_w + self.total_w = temp_w + + # print "added %-2d mean=%0.2f var=%0.2f std=%0.2f" % (x, self.mean(), self.var(), self.sd()) + + def var(self): + return self.ss / self.total_w + + def sd(self): + return math.sqrt(self.var()) + + def mean(self): + return self.m + +def test_mvsd(): + mvsd = MVSD() + for x in range(10): + mvsd.add(x) + + assert '%.2f' % mvsd.mean() == "4.50" + assert '%.2f' % mvsd.var() == "8.25" + assert '%.14f' % mvsd.sd() == "2.87228132326901" + + +def load_stream(input_stream): + while True: + line = input_stream.readline() + if not line: + break + line = line.strip() + if not line: + # skip empty lines (ie: newlines) + continue + try: + yield Decimal(line) + except: + try: + line = line.strip('"').strip("'") + yield Decimal(line) + except: + print >>sys.stderr, "invalid line %r" % line + +def histogram(stream): + # we can't iterate on stream because we need to get min/max first and then put it into buckets + data = list(stream) + buckets = 10 + bucket_scale = 1 + + min_v = min(data) + max_v = max(data) + diff = max_v - min_v + step = diff / buckets + bucket_counts = [0 for x in range(buckets)] + boundaries = [] + for x in range(buckets): + boundaries.append(min_v + (step * (x + 1))) + + mvsd = MVSD() + for value in data: + mvsd.add(value) + # find the 
bucket this goes in + for bucket_postion, boundary in enumerate(boundaries): + if value <= boundary: + bucket_counts[bucket_postion] +=1 + break + + # auto-pick the bucket size + if max(bucket_counts) > 75: + bucket_scale = int(max(bucket_counts) / 75) + + print "# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v) + print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd()) + print "# each * represents a count of %d" % bucket_scale + bucket_min = min_v + bucket_max = min_v + for bucket in range(buckets): + bucket_min = bucket_max + bucket_max = boundaries[bucket] + bucket_count = bucket_counts[bucket] + star_count = 0 + if bucket_count: + star_count = bucket_count / bucket_scale + print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '*' * star_count) + + +if __name__ == "__main__": + histogram(load_stream(sys.stdin)) diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py new file mode 100644 index 0000000..b897fe8 --- /dev/null +++ b/data_hacks/nintey_five_percent.py @@ -0,0 +1,45 @@ +#!/bin/env python +""" +Calculate the 95% time from a list of times given on stdin + +created by Jehiah Czebotar 2010-09-27 +Copyright (c) 2010 bit.ly. All rights reserved. 
+ +http://github.com/bitly/data_hacks +""" +import sys +from decimal import Decimal + +def run(): + count = 0 + data = {} + while True: + line = sys.stdin.readline() + if not line: + break + line = line.strip() + if not line: + # skip empty lines (ie: newlines) + continue + try: + t = Decimal(line) + except: + print >>sys.stderr, "invalid line %r" % line + count +=1 + data[t] = data.get(t, 0) + 1 + print calc_95(data, count) + +def calc_95(data, count): + # find the time it took for x entry, where x is the threshold + threshold = Decimal(count) * Decimal('.95') + start = Decimal(0) + times = data.keys() + times.sort() + for t in times: + # increment our count by the # of items in this time bucket + start += data[t] + if start > threshold: + return t + +if __name__ == "__main__": + run() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f371a11 --- /dev/null +++ b/setup.py @@ -0,0 +1,12 @@ +from distutils.core import setup + +setup(name='data_hacks', + version='0.1', + description='Command line utilities for data analysis', + author='bitly', + author_email='support@bit.ly', + url='http://github.com/bitly/data_analysis', + # packages=['data_hacks'], + scripts = ['data_hacks/histogram.py', + 'data_hacks/nintey_five_percent.py'] + )
\ No newline at end of file