summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJehiah Czebotar <jehiah@gmail.com>2010-10-20 21:14:56 -0400
committerJehiah Czebotar <jehiah@gmail.com>2010-10-20 21:14:56 -0400
commit5856510d05d7059982300e980f26079ebea0ec59 (patch)
treebba6d45563e86799b165386acaacedd28122a5f9
parent31c8fa11a07706c3922f5b71b3e7ef9d64fc7200 (diff)
downloaddata_hacks-5856510d05d7059982300e980f26079ebea0ec59.zip
data_hacks-5856510d05d7059982300e980f26079ebea0ec59.tar.gz
data_hacks-5856510d05d7059982300e980f26079ebea0ec59.tar.bz2
add bar_chart.py
-rw-r--r--README.markdown22
-rw-r--r--data_hacks/bar_chart.py85
-rw-r--r--setup.py3
3 files changed, 109 insertions, 1 deletions
diff --git a/README.markdown b/README.markdown
index 7f97875..cebea22 100644
--- a/README.markdown
+++ b/README.markdown
@@ -61,3 +61,25 @@ Pass through data for a specified amount of time
Example:
$ tail -f access.log | run_for.py 10s | post_process.py
+
+bar_chart.py
+------------
+
+Generate an ascii bar chart for input data (this is like a visualization of `uniq -c`)
+
+ $ cat data | bar_chart.py --sort-keys
+ # each * represents a count of 2
+ 19:0 [ 1]
+ 19:1 [ 24] ************
+ 19:2 [ 3] *
+ 19:3 [ 9] ****
+ 19:4 [ 5] **
+ 19:5 [ 41] ********************
+ 20:0 [ 115] *********************************************************
+ 20:1 [ 181] ******************************************************************************************
+ 20:2 [ 136] ********************************************************************
+ 20:3 [ 155] *****************************************************************************
+ 20:4 [ 150] ***************************************************************************
+ 20:5 [ 79] ***************************************
+ 21:0 [ 64] ********************************
+ 21:1 [ 8] ****
diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py
new file mode 100644
index 0000000..5777224
--- /dev/null
+++ b/data_hacks/bar_chart.py
@@ -0,0 +1,85 @@
+#!/bin/env python
+#
+# Copyright 2010 bit.ly
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Generate an ascii bar chart for input data
+
+http://github.com/bitly/data_hacks
+"""
+import sys
+from collections import defaultdict
+from optparse import OptionParser
+
def load_stream(input_stream):
    """Yield the cleaned, non-empty lines read from input_stream.

    Lines are pulled one at a time with readline() until it returns an
    empty value.  Each line has surrounding whitespace removed; a line that
    then starts with a single or double quote also has leading/trailing
    quote characters removed.  Lines that end up empty are not yielded.
    """
    raw = input_stream.readline()
    while raw:
        token = raw.strip()
        if token and token[0] in ['"', "'"]:
            # quoted value: peel double quotes first, then single quotes
            token = token.strip('"').strip("'")
        if token:
            yield token
        raw = input_stream.readline()
+
def run(input_stream, options):
    """Print an ascii bar chart of how often each row occurs in input_stream.

    input_stream -- iterable of rows; rows are used as dict keys and must
                    support len() and slicing (e.g. strings)
    options      -- object with a ``sort_keys`` attribute.  Rows are ordered
                    by key when it is true; they are ordered by descending
                    count when it is false or when an optional
                    ``sort_values`` attribute is true.

    Prints "Error: no data" and exits with status 1 if there are no rows.
    """
    counts = defaultdict(int)
    for row in input_stream:
        counts[row] += 1

    if not counts:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Error: no data")
        sys.exit(1)

    # keys are displayed in a column at most 50 characters wide
    max_length = min(max(len(key) for key in counts), 50)
    value_characters = 80 - max_length
    max_value = max(counts.values())
    # the quotient is truncated and clamped to at least 1, so the longest
    # bar can be somewhat wider than value_characters
    scale = max(1, int(float(max_value) / value_characters))

    print("# each * represents a count of %d" % scale)

    # sort_values must win over sort_keys: sort_keys may default to True,
    # which would otherwise make a request to sort by value a no-op
    by_value = getattr(options, "sort_values", False) or not options.sort_keys
    if by_value:
        ordered = sorted(((value, key) for key, value in counts.items()),
                         reverse=True)
        rows = [(key, value) for value, key in ordered]
    else:
        rows = sorted(counts.items())

    row_format = "%" + str(max_length) + "s [%6d] %s"
    for key, value in rows:
        # floor division keeps the repeat count an int under true division
        print(row_format % (key[:max_length], value, (value // scale) * "*"))
+
# Command-line entry point: parse the options, then chart the rows on stdin.
if __name__ == "__main__":
    parser = OptionParser()
    parser.usage = "cat data | %prog [options]"
    parser.add_option("-k", "--sort-keys", dest="sort_keys", default=True, action="store_true",
                      help="sort by the key [default]")
    # -v clears sort_keys instead of setting a separate flag: sort_keys
    # defaults to True, so a separate sort_values flag could never switch
    # the ordering away from sorting by key
    parser.add_option("-v", "--sort-values", dest="sort_keys", action="store_false",
                      help="sort by the frequence")

    (options, args) = parser.parse_args()

    # stdin attached to a terminal means nothing was piped in
    if sys.stdin.isatty():
        parser.print_usage()
        # single-argument print(...) behaves the same on Python 2 and 3
        print("for more help use --help")
        sys.exit(1)
    run(load_stream(sys.stdin), options)
+
diff --git a/setup.py b/setup.py
index 349b1ec..d680041 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
from distutils.core import setup
-version = "0.1"
+version = "0.2"
setup(name='data_hacks',
version=version,
description='Command line utilities for data analysis',
@@ -17,5 +17,6 @@ setup(name='data_hacks',
scripts = ['data_hacks/histogram.py',
'data_hacks/nintey_five_percent.py',
'data_hacks/run_for.py',
+ 'data_hacks/bar_chart.py',
'data_hacks/sample.py']
) \ No newline at end of file