summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Elazar Leibovich <elazarl@gmail.com> 2015-07-03 15:17:33 +0300
committer: Elazar Leibovich <elazarl@gmail.com> 2015-07-03 15:17:33 +0300
commit: 9078660a5fc27f9c15dc9e5e3956b203a0967db1 (patch)
tree: 21ada45f1fedf38202c00cd3d21732e1ecf4a3b3
parent: 3978590585c6b7bb98439419a5d9806be072c938 (diff)
download: data_hacks-9078660a5fc27f9c15dc9e5e3956b203a0967db1.zip
data_hacks-9078660a5fc27f9c15dc9e5e3956b203a0967db1.tar.gz
data_hacks-9078660a5fc27f9c15dc9e5e3956b203a0967db1.tar.bz2
pep8 compliance
-rwxr-xr-x data_hacks/histogram.py 130
1 files changed, 76 insertions, 54 deletions
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 738b8ac..605f54e 100755
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-#
+#
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -31,16 +31,17 @@ import math
from optparse import OptionParser
from collections import namedtuple
+
class MVSD(object):
- """ A class that calculates a running Mean / Variance / Standard Deviation"""
+ "A class that calculates a running Mean / Variance / Standard Deviation"
def __init__(self):
self.is_started = False
- self.ss = Decimal(0) # (running) sum of square deviations from mean
- self.m = Decimal(0) # (running) mean
- self.total_w = Decimal(0) # weight of items seen
-
+ self.ss = Decimal(0) # (running) sum of square deviations from mean
+ self.m = Decimal(0) # (running) mean
+ self.total_w = Decimal(0) # weight of items seen
+
def add(self, x, w=1):
- """ add another datapoint to the Mean / Variance / Standard Deviation"""
+ "add another datapoint to the Mean / Variance / Standard Deviation"
if not isinstance(x, Decimal):
x = Decimal(x)
if not self.is_started:
@@ -50,32 +51,33 @@ class MVSD(object):
self.is_started = True
else:
temp_w = self.total_w + w
- self.ss += (self.total_w * w * (x - self.m) * (x - self.m )) / temp_w
- self.m += (x - self.m) / temp_w
+ self.ss += (self.total_w * w * (x - self.m) *
+ (x - self.m)) / temp_w
+ self.m += (x - self.m) / temp_w
self.total_w = temp_w
-
- # print "added %-2d mean=%0.2f var=%0.2f std=%0.2f" % (x, self.mean(), self.var(), self.sd())
-
+
def var(self):
return self.ss / self.total_w
-
+
def sd(self):
return math.sqrt(self.var())
-
+
def mean(self):
return self.m
DataPoint = namedtuple('DataPoint', ['value', 'count'])
+
def test_mvsd():
mvsd = MVSD()
for x in range(10):
mvsd.add(x)
-
+
assert '%.2f' % mvsd.mean() == "4.50"
assert '%.2f' % mvsd.var() == "8.25"
assert '%.14f' % mvsd.sd() == "2.87228132326901"
+
def load_stream(input_stream, agg_value_key, agg_key_value):
for line in input_stream:
clean_line = line.strip()
@@ -97,29 +99,34 @@ def load_stream(input_stream, agg_value_key, agg_key_value):
logging.exception('failed %r', line)
print >>sys.stderr, "invalid line %r" % line
+
def median(values, key=None):
if not key:
- key= lambda x: x
+ key = None # map and sort accept None as identity
length = len(values)
- if length%2:
+ if length % 2:
median_indeces = [length/2]
else:
median_indeces = [length/2-1, length/2]
values = sorted(values, key=key)
- return sum(map(key, [values[i] for i in median_indeces])) / len(median_indeces)
+ return sum(map(key,
+ [values[i] for i in median_indeces])) / len(median_indeces)
+
def test_median():
- assert 6 == median([8,7,9,1,2,6,3]) # odd-sized list
- assert 4 == median([4,5,2,1,9,10]) # even-sized int list. (4+5)/2 = 4
- assert "4.50" == "%.2f" % median([4.0,5,2,1,9,10]) #even-sized float list. (4.0+5)/2 = 4.5
+ assert 6 == median([8, 7, 9, 1, 2, 6, 3]) # odd-sized list
+ assert 4 == median([4, 5, 2, 1, 9, 10]) # even-sized int list. (4+5)/2 = 4
+ # even-sized float list. (4.0+5)/2 = 4.5
+ assert "4.50" == "%.2f" % median([4.0, 5, 2, 1, 9, 10])
def histogram(stream, options):
"""
- Loop over the stream and add each entry to the dataset, printing out at the end
-
- stream yields Decimal()
+ Loop over the stream and add each entry to the dataset, printing out at the
+ end.
+
+ stream yields Decimal()
"""
if not options.min or not options.max:
# glob the iterator here so we can do min/max on it
@@ -127,7 +134,7 @@ def histogram(stream, options):
else:
data = stream
bucket_scale = 1
-
+
if options.min:
min_v = Decimal(options.min)
else:
@@ -154,7 +161,7 @@ def histogram(stream, options):
# if the last value is smaller than the maximum, replace it
if bound_sort[-1] < max_v:
bound_sort[-1] = max_v
-
+
# iterate through the sorted list and append to boundaries
for x in bound_sort:
if x >= min_v and x <= max_v:
@@ -163,14 +170,18 @@ def histogram(stream, options):
boundaries.append(max_v)
break
- # beware: the min_v is not included in the boundaries, so no need to do a -1!
+ # beware: the min_v is not included in the boundaries,
+ # so no need to do a -1!
bucket_counts = [0 for x in range(len(boundaries))]
buckets = len(boundaries)
elif options.logscale:
buckets = options.buckets and int(options.buckets) or 10
if buckets <= 0:
raise ValueError('# of buckets must be > 0')
- fx = lambda k, n: n/(2**(k+1)-1)
+
+ def fx(k, n):
+ return n/(2**(k+1)-1)
+
def log_steps(k, n):
"k logarithmic steps whose sum is n"
x = fx(k-1, n)
@@ -195,7 +206,7 @@ def histogram(stream, options):
mvsd = MVSD()
accepted_data = []
for record in data:
- samples += record.count
+ samples += record.count
if options.mvsd:
mvsd.add(record.value, record.count)
accepted_data.append(record)
@@ -207,16 +218,20 @@ def histogram(stream, options):
if record.value <= boundary:
bucket_counts[bucket_postion] += record.count
break
-
+
# auto-pick the hash scale
if max(bucket_counts) > 75:
bucket_scale = int(max(bucket_counts) / 75)
-
- print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v)
+
+ print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" %
+ (samples, min_v, max_v))
if skipped:
- print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
+ print("# %d value%s outside of min/max" %
+ (skipped, skipped > 1 and 's' or ''))
if options.mvsd:
- print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value))
+ print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
+ (mvsd.mean(), mvsd.var(), mvsd.sd(),
+ median(accepted_data, key=lambda x: x.value)))
print "# each ∎ represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
@@ -230,33 +245,40 @@ def histogram(stream, options):
if bucket_count:
star_count = bucket_count / bucket_scale
if options.percentage:
- percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
- print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage)
-
+ percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
+ Decimal(samples))
+ print format_string % (bucket_min, bucket_max, bucket_count, '∎' *
+ star_count, percentage)
+
if __name__ == "__main__":
parser = OptionParser()
parser.usage = "cat data | %prog [options]"
- parser.add_option("-a", "--agg", dest="agg_value_key", default=False, action="store_true",
- help="Two column input format, space seperated with value<space>key")
- parser.add_option("-A", "--agg-key-value", dest="agg_key_value", default=False, action="store_true",
- help="Two column input format, space seperated with key<space>value")
+ parser.add_option("-a", "--agg", dest="agg_value_key", default=False,
+ action="store_true", help="Two column input format, " +
+ "space seperated with value<space>key")
+ parser.add_option("-A", "--agg-key-value", dest="agg_key_value",
+ default=False, action="store_true", help="Two column " +
+ "input format, space seperated with key<space>value")
parser.add_option("-m", "--min", dest="min",
- help="minimum value for graph")
+ help="minimum value for graph")
parser.add_option("-x", "--max", dest="max",
- help="maximum value for graph")
+ help="maximum value for graph")
parser.add_option("-b", "--buckets", dest="buckets",
- help="Number of buckets to use for the histogram")
- parser.add_option("-l", "--logscale", dest="logscale", default=False, action="store_true",
- help="Buckets grow in logarithmic scale")
+ help="Number of buckets to use for the histogram")
+ parser.add_option("-l", "--logscale", dest="logscale", default=False,
+ action="store_true",
+ help="Buckets grow in logarithmic scale")
parser.add_option("-B", "--custom-buckets", dest="custbuckets",
- help="Comma seperated list of bucket edges for the histogram")
- parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
- help="Disable the calculation of Mean, Variance and SD (improves performance)")
+ help="Comma seperated list of bucket " +
+ "edges for the histogram")
+ parser.add_option("--no-mvsd", dest="mvsd", action="store_false",
+ default=True, help="Disable the calculation of Mean, " +
+ "Variance and SD (improves performance)")
parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
- help="format for bucket numbers")
- parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
- help="List percentage for each bar")
+ help="format for bucket numbers")
+ parser.add_option("-p", "--percentage", dest="percentage", default=False,
+ action="store_true", help="List percentage for each bar")
(options, args) = parser.parse_args()
if sys.stdin.isatty():
@@ -264,5 +286,5 @@ if __name__ == "__main__":
parser.print_usage()
print "for more help use --help"
sys.exit(1)
- histogram(load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options)
-
+ histogram(load_stream(sys.stdin, options.agg_value_key,
+ options.agg_key_value), options)