summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- README.markdown 5
-rw-r--r-- data_hacks/histogram.py 47
-rw-r--r-- data_hacks/nintey_five_percent.py 19
-rw-r--r-- data_hacks/run_for.py 19
-rw-r--r-- data_hacks/sample.py 30
5 files changed, 83 insertions, 37 deletions
diff --git a/README.markdown b/README.markdown
index e892a09..66efecb 100644
--- a/README.markdown
+++ b/README.markdown
@@ -3,10 +3,11 @@ data_hacks
Command line utilities for data analysis
-Installing from source:
+Installing: `pip install data_hacks`
- pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks
+Installing from github `pip install -e git://github.com/bitly/data_hacks.git#egg=data_hacks`
+Installing from source `python setup.py install`
histogram.py
------------
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 383e0de..fe0739a 100644
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -1,15 +1,26 @@
#!/bin/env python
+#
+# Copyright 2010 bit.ly
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
"""
-Generate a text format histogram
+Generate a text format histogram
This is a loose port to python of the Perl version at
http://www.pandamatak.com/people/anand/xfer/histo
-created by Jehiah Czebotar 2010-09-27
-Copyright (c) 2010 bit.ly. All rights reserved.
-
http://github.com/bitly/data_hacks
-
"""
import sys
@@ -26,7 +37,9 @@ class MVSD(object):
self.total_w = Decimal(0) # weight of items seen
def add(self, x, w=1):
- x = x * Decimal('1.0')
+ """ add another datapoint to the Mean / Variance / Standard Deviation"""
+ if not isinstance(x, Decimal):
+ x = Decimal(x)
if not self.is_started:
self.m = x
self.ss = Decimal(0)
@@ -64,23 +77,25 @@ def load_stream(input_stream):
line = input_stream.readline()
if not line:
break
- line = line.strip()
- if not line:
+ clean_line = line.strip()
+ if not clean_line:
# skip empty lines (ie: newlines)
continue
+ if clean_line[0] in ['"', "'"]:
+ clean_line = clean_line.strip('"').strip("'")
try:
- yield Decimal(line)
+ yield Decimal(clean_line)
except:
- try:
- line = line.strip('"').strip("'")
- yield Decimal(line)
- except:
- print >>sys.stderr, "invalid line %r" % line
+ print >>sys.stderr, "invalid line %r" % line
def histogram(stream, options):
- # we can't iterate on stream because we need to get min/max first and then put it into buckets
+ """
+ Loop over the stream and add each entry to the dataset, printing out at the end
+
+ stream yields Decimal()
+ """
if not options.min or not options.max:
- # glob the data here so we can do min/max on it
+ # glob the iterator here so we can do min/max on it
data = list(stream)
else:
data = stream
diff --git a/data_hacks/nintey_five_percent.py b/data_hacks/nintey_five_percent.py
index 2717486..78dd85b 100644
--- a/data_hacks/nintey_five_percent.py
+++ b/data_hacks/nintey_five_percent.py
@@ -1,12 +1,25 @@
#!/bin/env python
+#
+# Copyright 2010 bit.ly
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
"""
Calculate the 95% time from a list of times given on stdin
-created by Jehiah Czebotar 2010-09-27
-Copyright (c) 2010 bit.ly. All rights reserved.
-
http://github.com/bitly/data_hacks
"""
+
import sys
import os
from decimal import Decimal
diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py
index 08883b4..47277ac 100644
--- a/data_hacks/run_for.py
+++ b/data_hacks/run_for.py
@@ -1,12 +1,25 @@
#!/bin/env python
+#
+# Copyright 2010 bit.ly
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
"""
Pass through data for a specified amount of time
-created by Jehiah Czebotar 2010-09-27
-Copyright (c) 2010 bit.ly. All rights reserved.
-
http://github.com/bitly/data_hacks
"""
+
import time
import sys
import os
diff --git a/data_hacks/sample.py b/data_hacks/sample.py
index a57bd43..04ecab1 100644
--- a/data_hacks/sample.py
+++ b/data_hacks/sample.py
@@ -1,10 +1,22 @@
#!/bin/env python
+#
+# Copyright 2010 bit.ly
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
"""
Pass through a sampled percentage of data
-created by Jehiah Czebotar 2010-09-27
-Copyright (c) 2010 bit.ly. All rights reserved.
-
http://github.com/bitly/data_hacks
"""
@@ -13,13 +25,6 @@ import random
from optparse import OptionParser
from decimal import Decimal
-def usage():
- print """
- usage:
- cat data | sample.py 10% | sort | uniq -c
- cat data | sample.py 1/50 | sort | uniq -c
-"""
-
def run(sample_rate):
input_stream = sys.stdin
while True:
@@ -44,9 +49,8 @@ def get_sample_rate(rate_string):
return rate
if __name__ == "__main__":
- parser = OptionParser()
- parser.usage = "cat data | %prog [options] [sample_rate]"
- parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true")
+ parser = OptionParser(usage="cat data | %prog [options] [sample_rate]")
+ parser.add_option("--verbose", dest="verbose", default=False, action="store_true")
(options, args) = parser.parse_args()
if not args or sys.stdin.isatty():