summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSean O'Connor <sean@seanoc.com>2014-08-26 18:39:25 -0400
committerSean O'Connor <sean@seanoc.com>2014-08-26 18:39:25 -0400
commit7a761b4eb96a4efdf84e54517e9617ac4aaf090f (patch)
tree8d58121490750c5e7c7a916492d03fe920e57091
parent3658aec39b837f62c1c54e2a5ee262c272b0ade4 (diff)
parent8d9143e937b33d73791799da2c4758b1e717b421 (diff)
downloaddata_hacks-7a761b4eb96a4efdf84e54517e9617ac4aaf090f.zip
data_hacks-7a761b4eb96a4efdf84e54517e9617ac4aaf090f.tar.gz
data_hacks-7a761b4eb96a4efdf84e54517e9617ac4aaf090f.tar.bz2
Merge pull request #17 from jehiah/histogram_agg_17
Support ingesting Aggregate data from histogram.py
-rwxr-xr-xdata_hacks/bar_chart.py2
-rwxr-xr-x[-rw-r--r--]data_hacks/histogram.py51
2 files changed, 34 insertions, 19 deletions
diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py
index 14f5541..3eaf6f2 100755
--- a/data_hacks/bar_chart.py
+++ b/data_hacks/bar_chart.py
@@ -41,7 +41,7 @@ def run(input_stream, options):
data = defaultdict(lambda:0)
for row in input_stream:
if options.agg_values:
- kv = row.split(' ',2);
+ kv = row.replace('\t', ' ').split(' ',2);
data[kv[0]]+= int(kv[1])
else:
data[row]+=1
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 3de3e21..1a5f200 100644..100755
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -26,8 +26,10 @@ http://github.com/bitly/data_hacks
import sys
from decimal import Decimal
+import logging
import math
from optparse import OptionParser
+from collections import namedtuple
class MVSD(object):
""" A class that calculates a running Mean / Variance / Standard Deviation"""
@@ -63,6 +65,8 @@ class MVSD(object):
def mean(self):
return self.m
+DataPoint = namedtuple('DataPoint', ['value', 'count'])
+
def test_mvsd():
mvsd = MVSD()
for x in range(10):
@@ -72,28 +76,35 @@ def test_mvsd():
assert '%.2f' % mvsd.var() == "8.25"
assert '%.14f' % mvsd.sd() == "2.87228132326901"
-def load_stream(input_stream):
+def load_stream(input_stream, agg):
for line in input_stream:
clean_line = line.strip()
if not clean_line:
# skip empty lines (ie: newlines)
continue
if clean_line[0] in ['"', "'"]:
- clean_line = clean_line.strip('"').strip("'")
+ clean_line = clean_line.strip("\"'")
try:
- yield Decimal(clean_line)
+ if agg:
+ value, count = line.replace("\t", ' ').split(' ', 2)
+ yield DataPoint(Decimal(value), int(count))
+ continue
+ yield DataPoint(Decimal(clean_line), 1)
except:
+ logging.exception('failed %r', line)
print >>sys.stderr, "invalid line %r" % line
-def median(values):
+def median(values, key=None):
+ if not key:
+ key= lambda x: x
length = len(values)
if length%2:
median_indeces = [length/2]
else:
median_indeces = [length/2-1, length/2]
- values = sorted(values)
- return sum([values[i] for i in median_indeces]) / len(median_indeces)
+ values = sorted(values, key=key)
+ return sum(map(key, [values[i] for i in median_indeces])) / len(median_indeces)
def test_median():
assert 6 == median([8,7,9,1,2,6,3]) # odd-sized list
@@ -117,11 +128,13 @@ def histogram(stream, options):
if options.min:
min_v = Decimal(options.min)
else:
- min_v = min(data)
+ min_v = min(data, key=lambda x: x.value)
+ min_v = min_v.value
if options.max:
max_v = Decimal(options.max)
else:
- max_v = max(data)
+ max_v = max(data, key=lambda x: x.value)
+ max_v = max_v.value
if not max_v > min_v:
raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
@@ -163,18 +176,18 @@ def histogram(stream, options):
samples = 0
mvsd = MVSD()
accepted_data = []
- for value in data:
- samples +=1
+ for record in data:
+ samples += record.count
if options.mvsd:
- mvsd.add(value)
- accepted_data.append(value)
+ mvsd.add(record.value, record.count)
+ accepted_data.append(record)
# find the bucket this goes in
- if value < min_v or value > max_v:
- skipped +=1
+ if record.value < min_v or record.value > max_v:
+ skipped += record.count
continue
for bucket_postion, boundary in enumerate(boundaries):
- if value <= boundary:
- bucket_counts[bucket_postion] +=1
+ if record.value <= boundary:
+ bucket_counts[bucket_postion] += record.count
break
# auto-pick the hash scale
@@ -185,7 +198,7 @@ def histogram(stream, options):
if skipped:
print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
if options.mvsd:
- print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data))
+ print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value))
print "# each ∎ represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
@@ -202,6 +215,8 @@ def histogram(stream, options):
if __name__ == "__main__":
parser = OptionParser()
parser.usage = "cat data | %prog [options]"
+ parser.add_option("-a", "--agg", dest="agg", default=False, action="store_true",
+ help="Two column input format, space seperated with key<space>value")
parser.add_option("-m", "--min", dest="min",
help="minimum value for graph")
parser.add_option("-x", "--max", dest="max",
@@ -219,5 +234,5 @@ if __name__ == "__main__":
parser.print_usage()
print "for more help use --help"
sys.exit(1)
- histogram(load_stream(sys.stdin), options)
+ histogram(load_stream(sys.stdin, options.agg), options)