#!/usr/bin/env python
# The MIT License (MIT)
#
# Copyright (c) 2015 Sam Zaydel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# Progname: sample
# Created: 08/17/2015
# Author: Sam Zaydel szaydel@gmail.com
#
# Description:
# This simple utility is meant to ease randomly sampling some lines from a
# larger file, using python's random module.
# There is nothing fancy here: no choice of sampling algorithms, etc.
#
# To run this utility, specify the path to the file as the first argument,
# optionally followed by --count=<NUM>; without it 10 lines are returned,
# similar to the head or tail commands.
#
# This implementation is not ideal, because it can take a lot of memory due to
# a less than frugal design, so your mileage will surely vary.
#
import os, random, sys
# Number of lines sampled when no --count= argument is supplied.
DEFAULT_COUNT = 10
# Smallest sample size that rand_sample will act on.
MIN_SIZE = 1
# Size in bytes of the buffer used when reading the file (1 MiB).
BUFFER_SIZE = 1024 * 1024
def usage():
    """Write a one-line usage summary for this program to standard error."""
    program = sys.argv[0]
    message = "Usage: %s </path/to/population/data> [--count=<NUM>]\n" % program
    sys.stderr.write(message)
def rand_sample(fn, size):
    """
    Return a random sample of lines from a file, kept in file order.

    Arguments:
        fn: path to the file holding the population, one item per line.
        size: number of lines to sample.

    Returns:
        One string made of the sampled lines, each terminated by a newline.
        The empty string is returned when size is below MIN_SIZE or when the
        file contains no lines. When size exceeds the number of lines in the
        file, every line is returned.

    Raises:
        IOError/OSError: if the file cannot be opened or read.

    The whole file is held in memory while sampling.
    """
    if size < MIN_SIZE:
        return ""

    # Read-only access is all that is needed; asking for write access would
    # needlessly reject files the user is only allowed to read.
    # Iterating by line (rather than splitting fixed-size chunks) keeps lines
    # that straddle a buffer boundary intact.
    with open(fn, "r", BUFFER_SIZE) as f:
        lines = f.readlines()

    if not lines:
        return ""

    # Keep the output newline-terminated even when the file's last line is not.
    if not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the population size.
    count = min(size, len(lines))

    # Sorting the drawn indices preserves the original order of the lines.
    chosen = sorted(random.sample(range(len(lines)), count))
    return "".join(lines[i] for i in chosen)
def main(argv):
    """
    Run the command-line interface and return a process exit status.

    Arguments:
        argv: full argument vector; argv[1] is the path to the population
            file, and any later "--count=<NUM>" argument sets the sample size.

    Returns:
        0 on success, 1 when the arguments are unusable or sampling fails.
    """
    if len(argv) < 2:
        usage()
        return 1

    path = argv[1]
    if not os.path.exists(path):
        sys.stderr.write("Error: Path to source file is not valid!\n")
        return 1

    cnt = DEFAULT_COUNT
    # Only arguments after the path are options; the program name and the
    # path itself must not be interpreted as "--count=...".
    for arg in argv[2:]:
        key, sep, value = arg.partition("=")
        if not sep or key != "--count":
            # Unrecognised arguments are ignored.
            continue
        try:
            cnt = int(value)
        except ValueError:
            # Report a bad count instead of silently using the default.
            sys.stderr.write(
                "Error: --count expects an integer, got %r\n" % value
            )
            return 1

    try:
        sample = rand_sample(path, cnt)
    except (IOError, OSError, ValueError) as err:
        sys.stderr.write("Error: %s\n" % err)
        return 1

    # Write nothing when there is no sample; formatting None would otherwise
    # print the literal text "None".
    if sample:
        sys.stdout.write(sample)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))