Merge pull request #19 from jehiah/bar_chart_percentage_19

bar chart/histogram percentage
author: Randy Au <randy@numbergrinder.com> 2014-12-19 11:59:39 -0500
committer: Randy Au <randy@numbergrinder.com> 2014-12-19 11:59:39 -0500
commit: 5b020924cc796341fdf4b89e06f867e566bee4c1 (patch)
tree: 20b2ef92b4d3503114987b1977ef3738db59c3c5
parent: 7a761b4eb96a4efdf84e54517e9617ac4aaf090f (diff)
parent: ed3b010ec72117b04955b4f96ad0d4d9a5a39c4e (diff)
download: data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.zip data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.tar.gz data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.tar.bz2
8 files changed, 57 insertions, 50 deletions
diff --git a/.gitignore b/.gitignore
index c795b05..9d0b71a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-build
-\ No newline at end of file
+build
+dist
diff --git a/README.markdown b/README.markdown
index 3bfbf8c..d68202f 100644
--- a/README.markdown
+++ b/README.markdown
@@ -18,20 +18,21 @@ A utility that parses input data points and outputs a text histogram
 
 Example:
 
-    $ cat /tmp/data | histogram.py
-    # NumSamples = 29; Max = 10.00; Min = 1.00
-    # Mean = 4.379310; Variance = 5.131986; SD = 2.265389
-    # each * represents a count of 1
-        1.0000 -     1.9000 [     1]: *
-        1.9000 -     2.8000 [     5]: *****
-        2.8000 -     3.7000 [     8]: ********
-        3.7000 -     4.6000 [     3]: ***
-        4.6000 -     5.5000 [     4]: ****
-        5.5000 -     6.4000 [     2]: **
-        6.4000 -     7.3000 [     3]: ***
-        7.3000 -     8.2000 [     1]: *
-        8.2000 -     9.1000 [     1]: *
-        9.1000 -    10.0000 [     1]: *
+    $ cat /tmp/data | histogram.py --percentage --max=1000 --min=0
+    # NumSamples = 60; Min = 0.00; Max = 1000.00
+    # 1 value outside of min/max
+    # Mean = 332.666667; Variance = 471056.055556; SD = 686.335236; Median 191.000000
+    # each ∎ represents a count of 1
+        0.0000 -   100.0000 [    28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎ (46.67%)
+      100.0000 -   200.0000 [     2]: ∎∎ (3.33%)
+      200.0000 -   300.0000 [     2]: ∎∎ (3.33%)
+      300.0000 -   400.0000 [     8]: ∎∎∎∎∎∎∎∎ (13.33%)
+      400.0000 -   500.0000 [     8]: ∎∎∎∎∎∎∎∎ (13.33%)
+      500.0000 -   600.0000 [     7]: ∎∎∎∎∎∎∎ (11.67%)
+      600.0000 -   700.0000 [     3]: ∎∎∎ (5.00%)
+      700.0000 -   800.0000 [     0]:  (0.00%)
+      800.0000 -   900.0000 [     1]: ∎ (1.67%)
+      900.0000 -  1000.0000 [     0]:  (0.00%)
 
 ninety_five_percent.py
 ----------------------
@@ -67,22 +68,10 @@ bar_chart.py
 
 Generate an ascii bar chart for input data (this is like a visualization of `uniq -c`)
 
-    $ cat data | bar_chart.py --sort-keys
-    # each * represents a count of 2
-    19:0 [     1] 
-    19:1 [    24] ************
-    19:2 [     3] *
-    19:3 [     9] ****
-    19:4 [     5] **
-    19:5 [    41] ********************
-    20:0 [   115] *********************************************************
-    20:1 [   181] ******************************************************************************************
-    20:2 [   136] ********************************************************************
-    20:3 [   155] *****************************************************************************
-    20:4 [   150] ***************************************************************************
-    20:5 [    79] ***************************************
-    21:0 [    64] ********************************
-    21:1 [     8] ****
+    $ cat data | bar_chart.py
+    # each ∎ represents a count of 1. total 63
+    14:40 [    49] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+    14:41 [    14] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 
 bar_chart.py also supports ingesting aggregated values. Simply provide a two column input of key<space>value:
 
diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py
index 3eaf6f2..3551860 100755
--- a/data_hacks/bar_chart.py
+++ b/data_hacks/bar_chart.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@@ -18,7 +18,7 @@
 """
 Generate an ascii bar chart for input data
 
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
 """
 import sys
 import math
@@ -38,13 +38,17 @@ def load_stream(input_stream):
             yield clean_line
 
 def run(input_stream, options):
-    data = defaultdict(lambda:0)
+    data = defaultdict(int)
+    total = 0
     for row in input_stream:
         if options.agg_values:
             kv = row.replace('\t', ' ').split(' ',2);
-            data[kv[0]]+= int(kv[1])
+            value = int(kv[1])
+            data[kv[0]] += value
+            total += value
         else:
-            data[row]+=1
+            data[row] += 1
+            total += 1
     
     if not data:
         print "Error: no data"
@@ -57,7 +61,7 @@ def run(input_stream, options):
     scale = int(math.ceil(float(max_value) / value_characters))
     scale = max(1, scale)
     
-    print "# each ∎ represents a count of %d" % scale
+    print "# each ∎ represents a count of %d. total %d" % (scale, total)
     
     if options.sort_values:
         data = [[value, key] for key, value in data.items()]
@@ -71,9 +75,12 @@ def run(input_stream, options):
         else:
             data.sort(key=lambda x: x[1], reverse=options.reverse_sort)
     
-    format = "%" + str(max_length) + "s [%6d] %s"
-    for value,key in data:
-        print format % (key[:max_length], value, (value / scale) * "∎")
+    str_format = "%" + str(max_length) + "s [%6d] %s%s"
+    percentage = ""
+    for value, key in data:
+        if options.percentage:
+            percentage = " (%0.2f%%)" % (100 * Decimal(value) / Decimal(total))
+        print str_format % (key[:max_length], value, (value / scale) * "∎", percentage)
 
 if __name__ == "__main__":
     parser = OptionParser()
@@ -88,6 +95,8 @@ if __name__ == "__main__":
                         help="reverse the sort")
     parser.add_option("-n", "--numeric-sort", dest="numeric_sort", default=False, action="store_true",
                         help="sort keys by numeric sequencing")
+    parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
+                        help="List percentage for each bar")
     
     (options, args) = parser.parse_args()
     
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 1a5f200..72b3806 100755
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # 
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@@ -21,7 +21,7 @@ Generate a text format histogram
 This is a loose port to python of the Perl version at
 http://www.pandamatak.com/people/anand/xfer/histo
 
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
 """
 
 import sys
@@ -202,6 +202,8 @@ def histogram(stream, options):
     print "# each ∎ represents a count of %d" % bucket_scale
     bucket_min = min_v
     bucket_max = min_v
+    percentage = ""
+    format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
     for bucket in range(buckets):
         bucket_min = bucket_max
         bucket_max = boundaries[bucket]
@@ -209,7 +211,9 @@ def histogram(stream, options):
         star_count = 0
         if bucket_count:
             star_count = bucket_count / bucket_scale
-        print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '∎' * star_count)
+        if options.percentage:
+            percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
+        print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage)
         
 
 if __name__ == "__main__":
@@ -227,6 +231,10 @@ if __name__ == "__main__":
                         help="Comma seperated list of bucket edges for the histogram")
     parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
                         help="Disable the calculation of Mean, Variance and SD (improves performance)")
+    parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
+                        help="format for bucket numbers")
+    parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
+                        help="List percentage for each bar")
 
     (options, args) = parser.parse_args()
     if sys.stdin.isatty():
diff --git a/data_hacks/ninety_five_percent.py b/data_hacks/ninety_five_percent.py
index 8459fbc..9a51432 100644..100755
--- a/data_hacks/ninety_five_percent.py
+++ b/data_hacks/ninety_five_percent.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # 
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@@ -17,7 +17,7 @@
 """
 Calculate the 95% time from a list of times given on stdin
 
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
 """
 
 import sys
diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py
index 4485a23..a8ea21f 100644..100755
--- a/data_hacks/run_for.py
+++ b/data_hacks/run_for.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # 
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@@ -17,7 +17,7 @@
 """
 Pass through data for a specified amount of time
 
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
 """
 
 import time
diff --git a/data_hacks/sample.py b/data_hacks/sample.py
index 744b562..c3296ab 100644..100755
--- a/data_hacks/sample.py
+++ b/data_hacks/sample.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # 
-# Copyright 2010 bit.ly
+# Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@@ -17,7 +17,7 @@
 """
 Pass through a sampled percentage of data
 
-http://github.com/bitly/data_hacks
+https://github.com/bitly/data_hacks
 """
 
 import sys
diff --git a/setup.py b/setup.py
index ea37d5a..d0fc881 100644..100755
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ setup(name='data_hacks',
       description='Command line utilities for data analysis',
       author='Jehiah Czebotar',
       author_email='jehiah@gmail.com',
-      url='http://github.com/bitly/data_analysis',
+      url='https://github.com/bitly/data_hacks',
       classifiers=[
             'Development Status :: 4 - Beta',
             'Programming Language :: Python',
author	Randy Au <randy@numbergrinder.com>	2014-12-19 11:59:39 -0500
committer	Randy Au <randy@numbergrinder.com>	2014-12-19 11:59:39 -0500
commit	5b020924cc796341fdf4b89e06f867e566bee4c1 (patch)
tree	20b2ef92b4d3503114987b1977ef3738db59c3c5
parent	7a761b4eb96a4efdf84e54517e9617ac4aaf090f (diff)
parent	ed3b010ec72117b04955b4f96ad0d4d9a5a39c4e (diff)
download	data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.zip data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.tar.gz data_hacks-5b020924cc796341fdf4b89e06f867e566bee4c1.tar.bz2