summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Randy Au <randy@numbergrinder.com> 2014-12-19 11:59:39 -0500
committer Randy Au <randy@numbergrinder.com> 2014-12-19 11:59:39 -0500
commit 5b020924cc796341fdf4b89e06f867e566bee4c1 (patch)
tree 20b2ef92b4d3503114987b1977ef3738db59c3c5
parent 7a761b4eb96a4efdf84e54517e9617ac4aaf090f (diff)
parent ed3b010ec72117b04955b4f96ad0d4d9a5a39c4e (diff)
download data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.zip
data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.tar.gz
data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.tar.bz2
Merge pull request #19 from jehiah/bar_chart_percentage_19
bar chart/histogram percentage
-rw-r--r-- .gitignore 3
-rw-r--r-- README.markdown 49
-rwxr-xr-x data_hacks/bar_chart.py 27
-rwxr-xr-x data_hacks/histogram.py 14
-rwxr-xr-x [-rw-r--r--] data_hacks/ninety_five_percent.py 4
-rwxr-xr-x [-rw-r--r--] data_hacks/run_for.py 4
-rwxr-xr-x [-rw-r--r--] data_hacks/sample.py 4
-rwxr-xr-x [-rw-r--r--] setup.py 2
8 files changed, 57 insertions, 50 deletions
diff --git a/.gitignore b/.gitignore
index c795b05..9d0b71a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-build
\ No newline at end of file
+build
+dist
diff --git a/README.markdown b/README.markdown
index 3bfbf8c..d68202f 100644
--- a/README.markdown
+++ b/README.markdown
@@ -18,20 +18,21 @@ A utility that parses input data points and outputs a text histogram
Example:
- $ cat /tmp/data | histogram.py
- # NumSamples = 29; Max = 10.00; Min = 1.00
- # Mean = 4.379310; Variance = 5.131986; SD = 2.265389
- # each * represents a count of 1
- 1.0000 - 1.9000 [ 1]: *
- 1.9000 - 2.8000 [ 5]: *****
- 2.8000 - 3.7000 [ 8]: ********
- 3.7000 - 4.6000 [ 3]: ***
- 4.6000 - 5.5000 [ 4]: ****
- 5.5000 - 6.4000 [ 2]: **
- 6.4000 - 7.3000 [ 3]: ***
- 7.3000 - 8.2000 [ 1]: *
- 8.2000 - 9.1000 [ 1]: *
- 9.1000 - 10.0000 [ 1]: *
+ $ cat /tmp/data | histogram.py --percentage --max=1000 --min=0
+ # NumSamples = 60; Min = 0.00; Max = 1000.00
+ # 1 value outside of min/max
+ # Mean = 332.666667; Variance = 471056.055556; SD = 686.335236; Median 191.000000
+ # each ∎ represents a count of 1
+ 0.0000 - 100.0000 [ 28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎ (46.67%)
+ 100.0000 - 200.0000 [ 2]: ∎∎ (3.33%)
+ 200.0000 - 300.0000 [ 2]: ∎∎ (3.33%)
+ 300.0000 - 400.0000 [ 8]: ∎∎∎∎∎∎∎∎ (13.33%)
+ 400.0000 - 500.0000 [ 8]: ∎∎∎∎∎∎∎∎ (13.33%)
+ 500.0000 - 600.0000 [ 7]: ∎∎∎∎∎∎∎ (11.67%)
+ 600.0000 - 700.0000 [ 3]: ∎∎∎ (5.00%)
+ 700.0000 - 800.0000 [ 0]: (0.00%)
+ 800.0000 - 900.0000 [ 1]: ∎ (1.67%)
+ 900.0000 - 1000.0000 [ 0]: (0.00%)
ninety_five_percent.py
----------------------
@@ -67,22 +68,10 @@ bar_chart.py
Generate an ascii bar chart for input data (this is like a visualization of `uniq -c`)
- $ cat data | bar_chart.py --sort-keys
- # each * represents a count of 2
- 19:0 [ 1]
- 19:1 [ 24] ************
- 19:2 [ 3] *
- 19:3 [ 9] ****
- 19:4 [ 5] **
- 19:5 [ 41] ********************
- 20:0 [ 115] *********************************************************
- 20:1 [ 181] ******************************************************************************************
- 20:2 [ 136] ********************************************************************
- 20:3 [ 155] *****************************************************************************
- 20:4 [ 150] ***************************************************************************
- 20:5 [ 79] ***************************************
- 21:0 [ 64] ********************************
- 21:1 [ 8] ****
+ $ cat data | bar_chart.py
+ # each ∎ represents a count of 1. total 63
+ 14:40 [ 49] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+ 14:41 [ 14] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎
bar_chart.py also supports ingesting aggregated values. Simply provide a two column input of key<space>value:
diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py
index 3eaf6f2..3551860 100755
--- a/data_hacks/bar_chart.py
+++ b/data_hacks/bar_chart.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@@ -18,7 +18,7 @@
"""
Generate an ascii bar chart for input data
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
"""
import sys
import math
@@ -38,13 +38,17 @@ def load_stream(input_stream):
yield clean_line
def run(input_stream, options):
- data = defaultdict(lambda:0)
+ data = defaultdict(int)
+ total = 0
for row in input_stream:
if options.agg_values:
kv = row.replace('\t', ' ').split(' ',2);
- data[kv[0]]+= int(kv[1])
+ value = int(kv[1])
+ data[kv[0]] += value
+ total += value
else:
- data[row]+=1
+ data[row] += 1
+ total += 1
if not data:
print "Error: no data"
@@ -57,7 +61,7 @@ def run(input_stream, options):
scale = int(math.ceil(float(max_value) / value_characters))
scale = max(1, scale)
- print "# each ∎ represents a count of %d" % scale
+ print "# each ∎ represents a count of %d. total %d" % (scale, total)
if options.sort_values:
data = [[value, key] for key, value in data.items()]
@@ -71,9 +75,12 @@ def run(input_stream, options):
else:
data.sort(key=lambda x: x[1], reverse=options.reverse_sort)
- format = "%" + str(max_length) + "s [%6d] %s"
- for value,key in data:
- print format % (key[:max_length], value, (value / scale) * "∎")
+ str_format = "%" + str(max_length) + "s [%6d] %s%s"
+ percentage = ""
+ for value, key in data:
+ if options.percentage:
+ percentage = " (%0.2f%%)" % (100 * Decimal(value) / Decimal(total))
+ print str_format % (key[:max_length], value, (value / scale) * "∎", percentage)
if __name__ == "__main__":
parser = OptionParser()
@@ -88,6 +95,8 @@ if __name__ == "__main__":
help="reverse the sort")
parser.add_option("-n", "--numeric-sort", dest="numeric_sort", default=False, action="store_true",
help="sort keys by numeric sequencing")
+ parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
+ help="List percentage for each bar")
(options, args) = parser.parse_args()
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 1a5f200..72b3806 100755
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@@ -21,7 +21,7 @@ Generate a text format histogram
This is a loose port to python of the Perl version at
http://www.pandamatak.com/people/anand/xfer/histo
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
"""
import sys
@@ -202,6 +202,8 @@ def histogram(stream, options):
print "# each ∎ represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
+ percentage = ""
+ format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
for bucket in range(buckets):
bucket_min = bucket_max
bucket_max = boundaries[bucket]
@@ -209,7 +211,9 @@ def histogram(stream, options):
star_count = 0
if bucket_count:
star_count = bucket_count / bucket_scale
- print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '∎' * star_count)
+ if options.percentage:
+ percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
+ print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage)
if __name__ == "__main__":
@@ -227,6 +231,10 @@ if __name__ == "__main__":
help="Comma seperated list of bucket edges for the histogram")
parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
help="Disable the calculation of Mean, Variance and SD (improves performance)")
+ parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
+ help="format for bucket numbers")
+ parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
+ help="List percentage for each bar")
(options, args) = parser.parse_args()
if sys.stdin.isatty():
diff --git a/data_hacks/ninety_five_percent.py b/data_hacks/ninety_five_percent.py
index 8459fbc..9a51432 100644..100755
--- a/data_hacks/ninety_five_percent.py
+++ b/data_hacks/ninety_five_percent.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@@ -17,7 +17,7 @@
"""
Calculate the 95% time from a list of times given on stdin
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
"""
import sys
diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py
index 4485a23..a8ea21f 100644..100755
--- a/data_hacks/run_for.py
+++ b/data_hacks/run_for.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@@ -17,7 +17,7 @@
"""
Pass through data for a specified amount of time
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
"""
import time
diff --git a/data_hacks/sample.py b/data_hacks/sample.py
index 744b562..c3296ab 100644..100755
--- a/data_hacks/sample.py
+++ b/data_hacks/sample.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@@ -17,7 +17,7 @@
"""
Pass through a sampled percentage of data
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
"""
import sys
diff --git a/setup.py b/setup.py
index ea37d5a..d0fc881 100644..100755
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ setup(name='data_hacks',
description='Command line utilities for data analysis',
author='Jehiah Czebotar',
author_email='jehiah@gmail.com',
- url='http://github.com/bitly/data_analysis',
+ url='https://github.com/bitly/data_hacks',
classifiers=[
'Development Status :: 4 - Beta',
'Programming Language :: Python',