diff options
author | Jehiah Czebotar <jehiah@gmail.com> | 2010-10-20 21:14:56 -0400 |
---|---|---|
committer | Jehiah Czebotar <jehiah@gmail.com> | 2010-10-20 21:14:56 -0400 |
commit | 5856510d05d7059982300e980f26079ebea0ec59 (patch) | |
tree | bba6d45563e86799b165386acaacedd28122a5f9 | |
parent | 31c8fa11a07706c3922f5b71b3e7ef9d64fc7200 (diff) | |
download | data_hacks-5856510d05d7059982300e980f26079ebea0ec59.zip data_hacks-5856510d05d7059982300e980f26079ebea0ec59.tar.gz data_hacks-5856510d05d7059982300e980f26079ebea0ec59.tar.bz2 |
add bar_chart.py
-rw-r--r-- | README.markdown | 22 | ||||
-rw-r--r-- | data_hacks/bar_chart.py | 85 | ||||
-rw-r--r-- | setup.py | 3 |
3 files changed, 109 insertions, 1 deletions
diff --git a/README.markdown b/README.markdown index 7f97875..cebea22 100644 --- a/README.markdown +++ b/README.markdown @@ -61,3 +61,25 @@ Pass through data for a specified amount of time Example: $ tail -f access.log | run_for.py 10s | post_process.py + +bar_chart.py +------------ + +Generate an ascii bar chart for input data (this is like a visualization of `uniq -c`) + + $ cat data | bar_chart.py --sort-keys + # each * represents a count of 2 + 19:0 [ 1] + 19:1 [ 24] ************ + 19:2 [ 3] * + 19:3 [ 9] **** + 19:4 [ 5] ** + 19:5 [ 41] ******************** + 20:0 [ 115] ********************************************************* + 20:1 [ 181] ****************************************************************************************** + 20:2 [ 136] ******************************************************************** + 20:3 [ 155] ***************************************************************************** + 20:4 [ 150] *************************************************************************** + 20:5 [ 79] *************************************** + 21:0 [ 64] ******************************** + 21:1 [ 8] **** diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py new file mode 100644 index 0000000..5777224 --- /dev/null +++ b/data_hacks/bar_chart.py @@ -0,0 +1,85 @@ +#!/bin/env python +# +# Copyright 2010 bit.ly +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +""" +Generate an ascii bar chart for input data + +http://github.com/bitly/data_hacks +""" +import sys +from collections import defaultdict +from optparse import OptionParser + +def load_stream(input_stream): + while True: + line = input_stream.readline() + if not line: + break + clean_line = line.strip() + if not clean_line: + # skip empty lines (ie: newlines) + continue + if clean_line[0] in ['"', "'"]: + clean_line = clean_line.strip('"').strip("'") + if clean_line: + yield clean_line + +def run(input_stream, options): + data = defaultdict(lambda:0) + for row in input_stream: + data[row]+=1 + + if not data: + print "Error: no data" + sys.exit(1) + + max_length = max([len(key) for key in data.keys()]) + max_length = min(max_length, 50) + value_characters = 80 - max_length + max_value = max(data.values()) + scale = int(float(max_value) / value_characters) + scale = max(1, scale) + + print "# each * represents a count of %d" % scale + + if options.sort_keys: + data = [[key,value] for key,value in data.items()] + data.sort() + data = [[value, key] for key,value in data] + else: + # sort by values + data = [[value,key] for key,value in data.items()] + data.sort(reverse=True) + format = "%" + str(max_length) + "s [%6d] %s" + for value,key in data: + print format % (key[:max_length], value, (value / scale) * "*") + +if __name__ == "__main__": + parser = OptionParser() + parser.usage = "cat data | %prog [options]" + parser.add_option("-k", "--sort-keys", dest="sort_keys", default=True, action="store_true", + help="sort by the key [default]") + parser.add_option("-v", "--sort-values", dest="sort_values", default=False, action="store_true", + help="sort by the frequence") + + (options, args) = parser.parse_args() + + if sys.stdin.isatty(): + parser.print_usage() + print "for more help use --help" + sys.exit(1) + run(load_stream(sys.stdin), options) + @@ -1,6 +1,6 @@ from distutils.core import setup -version = "0.1" +version = "0.2" setup(name='data_hacks', 
version=version, description='Command line utilities for data analysis', @@ -17,5 +17,6 @@ setup(name='data_hacks', scripts = ['data_hacks/histogram.py', 'data_hacks/nintey_five_percent.py', 'data_hacks/run_for.py', + 'data_hacks/bar_chart.py', 'data_hacks/sample.py'] )
\ No newline at end of file