diff options
-rw-r--r-- | README.markdown | 5 | ||||
-rw-r--r-- | data_hacks/histogram.py | 47 | ||||
-rw-r--r-- | data_hacks/nintey_five_percent.py | 19 | ||||
-rw-r--r-- | data_hacks/run_for.py | 19 | ||||
-rw-r--r-- | data_hacks/sample.py | 30 |
5 files changed, 83 insertions, 37 deletions
diff --git a/README.markdown b/README.markdown index e892a09..66efecb 100644 --- a/README.markdown +++ b/README.markdown @@ -3,10 +3,11 @@ data_hacks Command line utilities for data analysis -Installing from source: +Installing: `pip install data_hacks` - pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks +Installing from github `pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks` +Installing from source `python setup.py install` histogram.py ------------ diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py index 383e0de..fe0739a 100644 --- a/data_hacks/histogram.py +++ b/data_hacks/histogram.py @@ -1,15 +1,26 @@ #!/bin/env python +# +# Copyright 2010 bit.ly +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + """ -Generate a text format histogram +Generate a text format histogram This is a loose port to python of the Perl version at http://www.pandamatak.com/people/anand/xfer/histo -created by Jehiah Czebotar 2010-09-27 -Copyright (c) 2010 bit.ly. All rights reserved. 
- http://github.com/bitly/data_hacks - """ import sys @@ -26,7 +37,9 @@ class MVSD(object): self.total_w = Decimal(0) # weight of items seen def add(self, x, w=1): - x = x * Decimal('1.0') + """ add another datapoint to the Mean / Variance / Standard Deviation""" + if not isinstance(x, Decimal): + x = Decimal(x) if not self.is_started: self.m = x self.ss = Decimal(0) @@ -64,23 +77,25 @@ def load_stream(input_stream): line = input_stream.readline() if not line: break - line = line.strip() - if not line: + clean_line = line.strip() + if not clean_line: # skip empty lines (ie: newlines) continue + if clean_line[0] in ['"', "'"]: + clean_line = clean_line.strip('"').strip("'") try: - yield Decimal(line) + yield Decimal(clean_line) except: - try: - line = line.strip('"').strip("'") - yield Decimal(line) - except: - print >>sys.stderr, "invalid line %r" % line + print >>sys.stderr, "invalid line %r" % line def histogram(stream, options): - # we can't iterate on stream because we need to get min/max first and then put it into buckets + """ + Loop over the stream and add each entry to the dataset, printing out at the end + + stream yields Decimal() + """ if not options.min or not options.max: - # glob the data here so we can do min/max on it + # glob the iterator here so we can do min/max on it data = list(stream) else: data = stream diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py index 2717486..78dd85b 100644 --- a/data_hacks/nintey_five_percent.py +++ b/data_hacks/nintey_five_percent.py @@ -1,12 +1,25 @@ #!/bin/env python +# +# Copyright 2010 bit.ly +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + """ Calculate the 95% time from a list of times given on stdin -created by Jehiah Czebotar 2010-09-27 -Copyright (c) 2010 bit.ly. All rights reserved. - http://github.com/bitly/data_hacks """ + import sys import os from decimal import Decimal diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py index 08883b4..47277ac 100644 --- a/data_hacks/run_for.py +++ b/data_hacks/run_for.py @@ -1,12 +1,25 @@ #!/bin/env python +# +# Copyright 2010 bit.ly +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + """ Pass through data for a specified amount of time -created by Jehiah Czebotar 2010-09-27 -Copyright (c) 2010 bit.ly. All rights reserved. - http://github.com/bitly/data_hacks """ + import time import sys import os diff --git a/data_hacks/sample.py b/data_hacks/sample.py index a57bd43..04ecab1 100644 --- a/data_hacks/sample.py +++ b/data_hacks/sample.py @@ -1,10 +1,22 @@ #!/bin/env python +# +# Copyright 2010 bit.ly +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + """ Pass through a sampled percentage of data -created by Jehiah Czebotar 2010-09-27 -Copyright (c) 2010 bit.ly. All rights reserved. - http://github.com/bitly/data_hacks """ @@ -13,13 +25,6 @@ import random from optparse import OptionParser from decimal import Decimal -def usage(): - print """ - usage: - cat data | sample.py 10% | sort | uniq -c - cat data | sample.py 1/50 | sort | uniq -c -""" - def run(sample_rate): input_stream = sys.stdin while True: @@ -44,9 +49,8 @@ def get_sample_rate(rate_string): return rate if __name__ == "__main__": - parser = OptionParser() - parser.usage = "cat data | %prog [options] [sample_rate]" - parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true") + parser = OptionParser(usage="cat data | %prog [options] [sample_rate]") + parser.add_option("--verbose", dest="verbose", default=False, action="store_true") (options, args) = parser.parse_args() if not args or sys.stdin.isatty(): |