initial histogram and 95% utilities

author: Jehiah Czebotar <jehiah@gmail.com> 2010-09-28 18:10:26 -0400
committer: Jehiah Czebotar <jehiah@gmail.com> 2010-09-28 18:10:26 -0400
commit: 41ea77f62c3adf9164c46cdddc733aa594517be9 (patch)
tree: 6381e4f0ddf3dfc9cf983aedf55ba6beb5bd2693
download: data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.zip
data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.gz
data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.bz2
5 files changed, 225 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c795b05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build
+\ No newline at end of file
diff --git a/README.markdown b/README.markdown
new file mode 100644
index 0000000..94553d9
--- /dev/null
+++ b/README.markdown
@@ -0,0 +1,44 @@
+data_hacks
+========
+
+Command line utilities for data analysis
+
+histogram
+=========
+
+A utility that reads numeric data points from standard input (one per line) and prints a text histogram
+
+Example:
+
+    $ cat /tmp/data | histogram.py
+    # NumSamples = 29; Max = 10.00; Min = 1.00
+    # Mean = 4.379310; Variance = 5.131986; SD = 2.265389
+    # each * represents a count of 1
+        1.0000 -     1.9000 [     1]: *
+        1.9000 -     2.8000 [     5]: *****
+        2.8000 -     3.7000 [     8]: ********
+        3.7000 -     4.6000 [     3]: ***
+        4.6000 -     5.5000 [     4]: ****
+        5.5000 -     6.4000 [     2]: **
+        6.4000 -     7.3000 [     3]: ***
+        7.3000 -     8.2000 [     1]: *
+        8.2000 -     9.1000 [     1]: *
+        9.1000 -    10.0000 [     1]: *
+
+nintey_five_percent
+===================
+
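+A utility script that reads a stream of decimal values (one per line) and prints the 95th-percentile value: the smallest input value that more than 95% of the samples are less than or equal to.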
+
+This is useful for finding the 95th-percentile response time from access logs.
+
+Example:
+
+    $ cat access.log | awk '{print $NF}' | nintey_five_percent.py
+    
+Installation
+============
+
+Installing from source:
+
+    pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
new file mode 100644
index 0000000..46c4796
--- /dev/null
+++ b/data_hacks/histogram.py
@@ -0,0 +1,123 @@
+#!/bin/env python
+"""
+Generate a text format histogram 
+
+This is a loose port to python of the Perl version at
+http://www.pandamatak.com/people/anand/xfer/histo
+
+created by Jehiah Czebotar 2010-09-27
+Copyright (c) 2010 bit.ly. All rights reserved.
+
+http://github.com/bitly/data_hacks
+
+"""
+
+import sys
+from decimal import Decimal
+import math
+
+class MVSD(object):
+    """ A class that calculates a running Mean / Variance / Standard Deviation"""
+    def __init__(self):
+        self.is_started = False
+        self.ss = Decimal(0) # (running) sum of square deviations from mean
+        self.m = Decimal(0) # (running) mean
+        self.total_w = Decimal(0) # weight of items seen
+        
+    def add(self, x, w=1):
+        x = x * Decimal('1.0')
+        if not self.is_started:
+            self.m = x
+            self.ss = Decimal(0)
+            self.total_w = w
+            self.is_started = True
+        else:
+            temp_w = self.total_w + w
+            self.ss += (self.total_w * w * (x - self.m) * (x - self.m )) / temp_w
+            self.m += (x - self.m) / temp_w 
+            self.total_w = temp_w
+        
+        # print "added %-2d mean=%0.2f var=%0.2f std=%0.2f" % (x, self.mean(), self.var(), self.sd())
+        
+    def var(self):
+        return self.ss / self.total_w
+    
+    def sd(self):
+        return math.sqrt(self.var())
+    
+    def mean(self):
+        return self.m
+
+def test_mvsd():
+    mvsd = MVSD()
+    for x in range(10):
+        mvsd.add(x)
+    
+    assert '%.2f' % mvsd.mean() == "4.50"
+    assert '%.2f' % mvsd.var() == "8.25"
+    assert '%.14f' % mvsd.sd() == "2.87228132326901"
+
+
+def load_stream(input_stream):
+    while True:
+        line = input_stream.readline()
+        if not line:
+            break
+        line = line.strip()
+        if not line:
+            # skip empty lines (ie: newlines)
+            continue
+        try:
+            yield Decimal(line)
+        except:
+            try:
+                line = line.strip('"').strip("'")
+                yield Decimal(line)
+            except:
+                print >>sys.stderr, "invalid line %r" % line
+
+def histogram(stream):
+    # we can't iterate on stream because we need to get min/max first and then put it into buckets
+    data = list(stream)
+    buckets = 10
+    bucket_scale = 1
+    
+    min_v = min(data)
+    max_v = max(data)
+    diff = max_v - min_v
+    step = diff / buckets
+    bucket_counts = [0 for x in range(buckets)]
+    boundaries = []
+    for x in range(buckets):
+        boundaries.append(min_v + (step * (x + 1)))
+    
+    mvsd = MVSD()
+    for value in data:
+        mvsd.add(value)
+        # find the bucket this goes in
+        for bucket_postion, boundary in enumerate(boundaries):
+            if value <= boundary:
+                bucket_counts[bucket_postion] +=1
+                break
+    
+    # auto-pick the bucket size
+    if max(bucket_counts) > 75:
+        bucket_scale = int(max(bucket_counts) / 75)
+    
+    print "# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v)
+    print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
+    print "# each * represents a count of %d" % bucket_scale
+    bucket_min = min_v
+    bucket_max = min_v
+    for bucket in range(buckets):
+        bucket_min = bucket_max
+        bucket_max = boundaries[bucket]
+        bucket_count = bucket_counts[bucket]
+        star_count = 0
+        if bucket_count:
+            star_count = bucket_count / bucket_scale
+        print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '*' * star_count)
+        
+
+if __name__ == "__main__":
+    histogram(load_stream(sys.stdin))
diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py
new file mode 100644
index 0000000..b897fe8
--- /dev/null
+++ b/data_hacks/nintey_five_percent.py
@@ -0,0 +1,45 @@
+#!/bin/env python
+"""
+Calculate the 95% time from a list of times given on stdin
+
+created by Jehiah Czebotar 2010-09-27
+Copyright (c) 2010 bit.ly. All rights reserved.
+
+http://github.com/bitly/data_hacks
+"""
+import sys
+from decimal import Decimal
+
+def run():
+    count = 0
+    data = {}
+    while True:
+        line = sys.stdin.readline()
+        if not line:
+            break
+        line = line.strip()
+        if not line:
+            # skip empty lines (ie: newlines)
+            continue
+        try:
+            t = Decimal(line)
+        except:
+            print >>sys.stderr, "invalid line %r" % line
+        count +=1
+        data[t] = data.get(t, 0) + 1
+    print calc_95(data, count)
+        
+def calc_95(data, count):
+    # find the time it took for x entry, where x is the threshold
+    threshold = Decimal(count) * Decimal('.95')
+    start = Decimal(0)
+    times = data.keys()
+    times.sort()
+    for t in times:
+        # increment our count by the # of items in this time bucket
+        start += data[t]
+        if start > threshold:
+            return t
+
+if __name__ == "__main__":
+    run()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f371a11
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,12 @@
+from distutils.core import setup
+
+setup(name='data_hacks',
+      version='0.1',
+      description='Command line utilities for data analysis',
+      author='bitly',
+      author_email='support@bit.ly',
+      url='http://github.com/bitly/data_analysis',
+      # packages=['data_hacks'],
+      scripts = ['data_hacks/histogram.py', 
+                'data_hacks/nintey_five_percent.py']
+     )
+\ No newline at end of file
author	Jehiah Czebotar <jehiah@gmail.com>	2010-09-28 18:10:26 -0400
committer	Jehiah Czebotar <jehiah@gmail.com>	2010-09-28 18:10:26 -0400
commit	41ea77f62c3adf9164c46cdddc733aa594517be9 (patch)
tree	6381e4f0ddf3dfc9cf983aedf55ba6beb5bd2693
download	data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.zip data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.gz data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.bz2