diff options
author | Elazar Leibovich <elazarl@gmail.com> | 2015-07-03 15:17:33 +0300 |
---|---|---|
committer | Elazar Leibovich <elazarl@gmail.com> | 2015-07-03 15:17:33 +0300 |
commit | 9078660a5fc27f9c15dc9e5e3956b203a0967db1 (patch) | |
tree | 21ada45f1fedf38202c00cd3d21732e1ecf4a3b3 | |
parent | 3978590585c6b7bb98439419a5d9806be072c938 (diff) | |
download | data_hacks-9078660a5fc27f9c15dc9e5e3956b203a0967db1.zip data_hacks-9078660a5fc27f9c15dc9e5e3956b203a0967db1.tar.gz data_hacks-9078660a5fc27f9c15dc9e5e3956b203a0967db1.tar.bz2 |
pep8 compliance
-rwxr-xr-x | data_hacks/histogram.py | 130 |
1 files changed, 76 insertions, 54 deletions
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py index 738b8ac..605f54e 100755 --- a/data_hacks/histogram.py +++ b/data_hacks/histogram.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# +# # Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -31,16 +31,17 @@ import math from optparse import OptionParser from collections import namedtuple + class MVSD(object): - """ A class that calculates a running Mean / Variance / Standard Deviation""" + "A class that calculates a running Mean / Variance / Standard Deviation" def __init__(self): self.is_started = False - self.ss = Decimal(0) # (running) sum of square deviations from mean - self.m = Decimal(0) # (running) mean - self.total_w = Decimal(0) # weight of items seen - + self.ss = Decimal(0) # (running) sum of square deviations from mean + self.m = Decimal(0) # (running) mean + self.total_w = Decimal(0) # weight of items seen + def add(self, x, w=1): - """ add another datapoint to the Mean / Variance / Standard Deviation""" + "add another datapoint to the Mean / Variance / Standard Deviation" if not isinstance(x, Decimal): x = Decimal(x) if not self.is_started: @@ -50,32 +51,33 @@ class MVSD(object): self.is_started = True else: temp_w = self.total_w + w - self.ss += (self.total_w * w * (x - self.m) * (x - self.m )) / temp_w - self.m += (x - self.m) / temp_w + self.ss += (self.total_w * w * (x - self.m) * + (x - self.m)) / temp_w + self.m += (x - self.m) / temp_w self.total_w = temp_w - - # print "added %-2d mean=%0.2f var=%0.2f std=%0.2f" % (x, self.mean(), self.var(), self.sd()) - + def var(self): return self.ss / self.total_w - + def sd(self): return math.sqrt(self.var()) - + def mean(self): return self.m DataPoint = namedtuple('DataPoint', ['value', 'count']) + def test_mvsd(): mvsd = MVSD() for x in range(10): mvsd.add(x) - + assert '%.2f' % mvsd.mean() == "4.50" assert '%.2f' % mvsd.var() == "8.25" assert '%.14f' % mvsd.sd() == "2.87228132326901" + def load_stream(input_stream, agg_value_key, agg_key_value): for line in input_stream: clean_line = line.strip() @@ -97,29 +99,34 @@ def load_stream(input_stream, agg_value_key, agg_key_value): logging.exception('failed %r', line) print >>sys.stderr, "invalid line %r" % line + def median(values, key=None): if not key: - key= lambda x: x + key = None # map and sort accept None as identity length = len(values) - if length%2: + if length % 2: median_indeces = [length/2] else: median_indeces = [length/2-1, length/2] values = sorted(values, key=key) - return sum(map(key, [values[i] for i in median_indeces])) / len(median_indeces) + return sum(map(key, + [values[i] for i in median_indeces])) / len(median_indeces) + def test_median(): - assert 6 == median([8,7,9,1,2,6,3]) # odd-sized list - assert 4 == median([4,5,2,1,9,10]) # even-sized int list. (4+5)/2 = 4 - assert "4.50" == "%.2f" % median([4.0,5,2,1,9,10]) #even-sized float list. (4.0+5)/2 = 4.5 + assert 6 == median([8, 7, 9, 1, 2, 6, 3]) # odd-sized list + assert 4 == median([4, 5, 2, 1, 9, 10]) # even-sized int list. (4+5)/2 = 4 + # even-sized float list. (4.0+5)/2 = 4.5 + assert "4.50" == "%.2f" % median([4.0, 5, 2, 1, 9, 10]) def histogram(stream, options): """ - Loop over the stream and add each entry to the dataset, printing out at the end - - stream yields Decimal() + Loop over the stream and add each entry to the dataset, printing out at the + end. + + stream yields Decimal() """ if not options.min or not options.max: # glob the iterator here so we can do min/max on it @@ -127,7 +134,7 @@ def histogram(stream, options): else: data = stream bucket_scale = 1 - + if options.min: min_v = Decimal(options.min) else: @@ -154,7 +161,7 @@ def histogram(stream, options): # if the last value is smaller than the maximum, replace it if bound_sort[-1] < max_v: bound_sort[-1] = max_v - + # iterate through the sorted list and append to boundaries for x in bound_sort: if x >= min_v and x <= max_v: @@ -163,14 +170,18 @@ def histogram(stream, options): boundaries.append(max_v) break - # beware: the min_v is not included in the boundaries, so no need to do a -1! + # beware: the min_v is not included in the boundaries, + # so no need to do a -1! bucket_counts = [0 for x in range(len(boundaries))] buckets = len(boundaries) elif options.logscale: buckets = options.buckets and int(options.buckets) or 10 if buckets <= 0: raise ValueError('# of buckets must be > 0') - fx = lambda k, n: n/(2**(k+1)-1) + + def fx(k, n): + return n/(2**(k+1)-1) + def log_steps(k, n): "k logarithmic steps whose sum is n" x = fx(k-1, n) @@ -195,7 +206,7 @@ def histogram(stream, options): mvsd = MVSD() accepted_data = [] for record in data: - samples += record.count + samples += record.count if options.mvsd: mvsd.add(record.value, record.count) accepted_data.append(record) @@ -207,16 +218,20 @@ def histogram(stream, options): if record.value <= boundary: bucket_counts[bucket_postion] += record.count break - + # auto-pick the hash scale if max(bucket_counts) > 75: bucket_scale = int(max(bucket_counts) / 75) - - print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v) + + print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" % + (samples, min_v, max_v)) if skipped: - print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '') + print("# %d value%s outside of min/max" % + (skipped, skipped > 1 and 's' or '')) if options.mvsd: - print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value)) + print("# Mean = %f; Variance = %f; SD = %f; Median %f" % + (mvsd.mean(), mvsd.var(), mvsd.sd(), + median(accepted_data, key=lambda x: x.value))) print "# each ∎ represents a count of %d" % bucket_scale bucket_min = min_v bucket_max = min_v @@ -230,33 +245,40 @@ def histogram(stream, options): if bucket_count: star_count = bucket_count / bucket_scale if options.percentage: - percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples)) - print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage) - + percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / + Decimal(samples)) + print format_string % (bucket_min, bucket_max, bucket_count, '∎' * + star_count, percentage) + if __name__ == "__main__": parser = OptionParser() parser.usage = "cat data | %prog [options]" - parser.add_option("-a", "--agg", dest="agg_value_key", default=False, action="store_true", - help="Two column input format, space seperated with value<space>key") - parser.add_option("-A", "--agg-key-value", dest="agg_key_value", default=False, action="store_true", - help="Two column input format, space seperated with key<space>value") + parser.add_option("-a", "--agg", dest="agg_value_key", default=False, + action="store_true", help="Two column input format, " + + "space seperated with value<space>key") + parser.add_option("-A", "--agg-key-value", dest="agg_key_value", + default=False, action="store_true", help="Two column " + + "input format, space seperated with key<space>value") parser.add_option("-m", "--min", dest="min", - help="minimum value for graph") + help="minimum value for graph") parser.add_option("-x", "--max", dest="max", - help="maximum value for graph") + help="maximum value for graph") parser.add_option("-b", "--buckets", dest="buckets", - help="Number of buckets to use for the histogram") - parser.add_option("-l", "--logscale", dest="logscale", default=False, action="store_true", - help="Buckets grow in logarithmic scale") + help="Number of buckets to use for the histogram") + parser.add_option("-l", "--logscale", dest="logscale", default=False, + action="store_true", + help="Buckets grow in logarithmic scale") parser.add_option("-B", "--custom-buckets", dest="custbuckets", - help="Comma seperated list of bucket edges for the histogram") - parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True, - help="Disable the calculation of Mean, Variance and SD (improves performance)") + help="Comma seperated list of bucket " + + "edges for the histogram") + parser.add_option("--no-mvsd", dest="mvsd", action="store_false", + default=True, help="Disable the calculation of Mean, " + + "Variance and SD (improves performance)") parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f", - help="format for bucket numbers") - parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true", - help="List percentage for each bar") + help="format for bucket numbers") + parser.add_option("-p", "--percentage", dest="percentage", default=False, + action="store_true", help="List percentage for each bar") (options, args) = parser.parse_args() if sys.stdin.isatty(): @@ -264,5 +286,5 @@ if __name__ == "__main__": parser.print_usage() print "for more help use --help" sys.exit(1) - histogram(load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options) - + histogram(load_stream(sys.stdin, options.agg_value_key, + options.agg_key_value), options) |