summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJehiah Czebotar <jehiah@gmail.com>2010-09-28 18:10:26 -0400
committerJehiah Czebotar <jehiah@gmail.com>2010-09-28 18:10:26 -0400
commit41ea77f62c3adf9164c46cdddc733aa594517be9 (patch)
tree6381e4f0ddf3dfc9cf983aedf55ba6beb5bd2693
downloaddata_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.zip
data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.gz
data_hacks-41ea77f62c3adf9164c46cdddc733aa594517be9.tar.bz2
initial histogram and 95% utilities
-rw-r--r--.gitignore1
-rw-r--r--README.markdown44
-rw-r--r--data_hacks/histogram.py123
-rw-r--r--data_hacks/nintey_five_percent.py45
-rw-r--r--setup.py12
5 files changed, 225 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c795b05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build \ No newline at end of file
diff --git a/README.markdown b/README.markdown
new file mode 100644
index 0000000..94553d9
--- /dev/null
+++ b/README.markdown
@@ -0,0 +1,44 @@
+data_hacks
+========
+
+Command line utilities for data analysis
+
+histogram
+=========
+
+A utility that reads numeric data points from stdin, one per line, and prints a text histogram.
+
+Example:
+
+ $ cat /tmp/data | histogram.py
+ # NumSamples = 29; Max = 10.00; Min = 1.00
+ # Mean = 4.379310; Variance = 5.131986; SD = 2.265389
+ # each * represents a count of 1
+ 1.0000 - 1.9000 [ 1]: *
+ 1.9000 - 2.8000 [ 5]: *****
+ 2.8000 - 3.7000 [ 8]: ********
+ 3.7000 - 4.6000 [ 3]: ***
+ 4.6000 - 5.5000 [ 4]: ****
+ 5.5000 - 6.4000 [ 2]: **
+ 6.4000 - 7.3000 [ 3]: ***
+ 7.3000 - 8.2000 [ 1]: *
+ 8.2000 - 9.1000 [ 1]: *
+ 9.1000 - 10.0000 [ 1]: *
+
+nintey_five_percent
+===================
+
+A utility script that reads a stream of decimal values from stdin, one per line, and outputs the 95th-percentile value.
+
+This is useful for finding the 95th-percentile response time from access logs.
+
+Example:
+
+ $ cat access.log | awk '{print $NF}' | nintey_five_percent.py
+
+Installation
+============
+
+Installing from source:
+
+pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
new file mode 100644
index 0000000..46c4796
--- /dev/null
+++ b/data_hacks/histogram.py
@@ -0,0 +1,123 @@
+#!/bin/env python
+"""
+Generate a text format histogram
+
+This is a loose port to python of the Perl version at
+http://www.pandamatak.com/people/anand/xfer/histo
+
+created by Jehiah Czebotar 2010-09-27
+Copyright (c) 2010 bit.ly. All rights reserved.
+
+http://github.com/bitly/data_hacks
+
+"""
+
+import sys
+from decimal import Decimal
+import math
+
+class MVSD(object):
+    """Running weighted Mean / Variance / Standard Deviation accumulator.
+
+    Values are folded in one at a time with ``add``; ``mean``, ``var`` and
+    ``sd`` report the statistics of everything added so far.
+    """
+
+    def __init__(self):
+        self.is_started = False
+        self.ss = Decimal(0)  # (running) weighted sum of square deviations from the mean
+        self.m = Decimal(0)  # (running) mean
+        self.total_w = Decimal(0)  # total weight of the items seen
+
+    def add(self, x, w=1):
+        """Fold the value ``x``, carrying weight ``w``, into the statistics."""
+        x = x * Decimal('1.0')  # promotes an int to Decimal
+        if not self.is_started:
+            self.m = x
+            self.ss = Decimal(0)
+            self.total_w = w
+            self.is_started = True
+        else:
+            temp_w = self.total_w + w
+            delta = x - self.m
+            self.ss += (self.total_w * w * delta * delta) / temp_w
+            # The mean moves by the new item's share (w / temp_w) of the
+            # total weight; leaving out ``w`` is only right when w == 1.
+            self.m += (delta * w) / temp_w
+            self.total_w = temp_w
+
+    def var(self):
+        """Return the variance: sum of square deviations / total weight.
+
+        Raises a decimal arithmetic error if nothing has been added yet,
+        because the total weight is still zero.
+        """
+        return self.ss / self.total_w
+
+    def sd(self):
+        """Return the standard deviation as a float (square root of ``var``)."""
+        return math.sqrt(self.var())
+
+    def mean(self):
+        """Return the running mean."""
+        return self.m
+
+def test_mvsd():
+    """Check MVSD against the known statistics of the integers 0 through 9."""
+    accumulator = MVSD()
+    sample = 0
+    while sample < 10:
+        accumulator.add(sample)
+        sample += 1
+
+    checks = (
+        ('%.2f' % accumulator.mean(), "4.50"),
+        ('%.2f' % accumulator.var(), "8.25"),
+        ('%.14f' % accumulator.sd(), "2.87228132326901"),
+    )
+    for actual, wanted in checks:
+        assert actual == wanted
+
+
+def load_stream(input_stream):
+    """Yield a Decimal for every parseable line read from ``input_stream``.
+
+    Lines are read with ``readline()`` until it returns an empty string.
+    Blank lines are skipped.  A line that does not parse is retried with
+    surrounding double and single quotes removed; if it still does not
+    parse, a message is written to stderr and the line is skipped.
+    """
+    # Local import: the module only imports ``Decimal`` from ``decimal``.
+    from decimal import InvalidOperation
+
+    while True:
+        line = input_stream.readline()
+        if not line:
+            break
+        line = line.strip()
+        if not line:
+            # skip empty lines (ie: newlines)
+            continue
+        # Try the line as-is, then with any wrapping quotes removed.
+        for candidate in (line, line.strip('"').strip("'")):
+            try:
+                value = Decimal(candidate)
+            except (InvalidOperation, ValueError):
+                continue
+            break
+        else:
+            sys.stderr.write("invalid line %r\n" % candidate)
+            continue
+        # The yield sits outside any try block, so closing the generator
+        # (GeneratorExit) or Ctrl-C is never mistaken for a parse failure.
+        yield value
+
+def histogram(stream, buckets=10):
+    """Print a text histogram of the values in ``stream`` to stdout.
+
+    ``stream`` is an iterable of numeric values (``load_stream`` supplies
+    Decimals).  It is read fully into memory, because the minimum and
+    maximum must be known before any value can be placed in a bucket.
+
+    ``buckets`` is the number of equal-width buckets between the minimum
+    and the maximum.  Raises ValueError if it is less than 1.
+
+    The output is three ``#`` header lines (sample count, max, min, mean,
+    variance, standard deviation and the count each ``*`` stands for)
+    followed by one line per bucket.  With no input only a
+    ``# NumSamples = 0`` line is printed.
+    """
+    if buckets < 1:
+        raise ValueError("buckets must be at least 1")
+
+    data = list(stream)
+    if not data:
+        # min() / max() below would raise on an empty list
+        print("# NumSamples = 0")
+        return
+
+    min_v = min(data)
+    max_v = max(data)
+    step = (max_v - min_v) / buckets
+    # boundaries[i] is the inclusive upper edge of bucket i
+    boundaries = [min_v + (step * (i + 1)) for i in range(buckets)]
+    bucket_counts = [0] * buckets
+
+    mvsd = MVSD()
+    for value in data:
+        mvsd.add(value)
+        # find the bucket this goes in
+        for bucket_position, boundary in enumerate(boundaries):
+            if value <= boundary:
+                bucket_counts[bucket_position] += 1
+                break
+        else:
+            # rounding in ``step`` can leave the maximum just past the
+            # last boundary; it still belongs to the last bucket
+            bucket_counts[-1] += 1
+
+    # auto-pick how many samples one '*' stands for, so that the tallest
+    # bar stays in the region of 75 characters
+    bucket_scale = 1
+    tallest = max(bucket_counts)
+    if tallest > 75:
+        bucket_scale = tallest // 75
+
+    print("# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v))
+    print("# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd()))
+    print("# each * represents a count of %d" % bucket_scale)
+    bucket_min = min_v
+    for bucket_max, bucket_count in zip(boundaries, bucket_counts):
+        # floor division keeps the star count an int on Python 3 as well
+        star_count = bucket_count // bucket_scale
+        print('%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '*' * star_count))
+        bucket_min = bucket_max
+
+
+# Script entry point: histogram the decimal values read from stdin.
+if __name__ == "__main__":
+ histogram(load_stream(sys.stdin))
diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py
new file mode 100644
index 0000000..b897fe8
--- /dev/null
+++ b/data_hacks/nintey_five_percent.py
@@ -0,0 +1,45 @@
+#!/bin/env python
+"""
+Calculate the 95% time from a list of times given on stdin
+
+created by Jehiah Czebotar 2010-09-27
+Copyright (c) 2010 bit.ly. All rights reserved.
+
+http://github.com/bitly/data_hacks
+"""
+import sys
+from decimal import Decimal
+
+def run():
+    """Read one decimal value per line from stdin and print the 95% value.
+
+    Blank lines are skipped.  A line that does not parse as a Decimal is
+    reported on stderr and left out of the calculation.  The result of
+    ``calc_95`` is printed to stdout.
+    """
+    # Local import: the module only imports ``Decimal`` from ``decimal``.
+    from decimal import InvalidOperation
+
+    count = 0
+    data = {}  # distinct value -> number of times it was seen
+    while True:
+        line = sys.stdin.readline()
+        if not line:
+            break
+        line = line.strip()
+        if not line:
+            # skip empty lines (ie: newlines)
+            continue
+        try:
+            t = Decimal(line)
+        except (InvalidOperation, ValueError):
+            sys.stderr.write("invalid line %r\n" % line)
+            # Without this the bad line would be counted under the
+            # previous value of ``t`` (or hit a NameError on the first line).
+            continue
+        count += 1
+        data[t] = data.get(t, 0) + 1
+    print(calc_95(data, count))
+
+def calc_95(data, count):
+    """Return the smallest value whose cumulative count exceeds 95% of ``count``.
+
+    ``data`` maps each distinct value to the number of times it occurred
+    and ``count`` is the total number of samples.  Values are walked in
+    ascending order, accumulating their occurrence counts; the first value
+    at which the running total is strictly greater than ``count * .95`` is
+    returned.  Returns None if the threshold is never passed (for example
+    when ``data`` is empty).
+    """
+    threshold = Decimal(count) * Decimal('.95')
+    seen = Decimal(0)
+    # sorted() works on a dict under both Python 2 and 3, unlike
+    # data.keys() followed by .sort()
+    for t in sorted(data):
+        # increment our count by the # of items in this time bucket
+        seen += data[t]
+        if seen > threshold:
+            return t
+    return None
+
+# Script entry point: print the 95% value of the numbers read from stdin.
+if __name__ == "__main__":
+ run()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f371a11
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,12 @@
+from distutils.core import setup
+
+# Package metadata; the two command line utilities are installed as scripts.
+setup(name='data_hacks',
+      version='0.1',
+      description='Command line utilities for data analysis',
+      author='bitly',
+      author_email='support@bit.ly',
+      # same repository URL as the README and the module docstrings
+      url='http://github.com/bitly/data_hacks',
+      # packages=['data_hacks'],
+      scripts=['data_hacks/histogram.py',
+               'data_hacks/nintey_five_percent.py'],
+      )
\ No newline at end of file