Mercurial > repos > xuebing > sharplabtool
diff sampline.py @ 14:76e1b1b21cce default tip
Deleted selected files
author | xuebing |
---|---|
date | Tue, 13 Mar 2012 19:05:10 -0400 |
parents | 292186c14b08 |
children |
line wrap: on
line diff
--- a/sampline.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -#!/usr/bin/env python - -""" -Sampling random records from a file. Each record is defined by a fixed number of lines. - -Usage: sampline.py [options] - -Options: - -h, --help show this help message and exit - -r, --replacement Sampling with replacement - -i INPUT, --input=INPUT - Input file - -o OUTPUT, --output=OUTPUT - Output file - -k NSAMPLE, --nSample=NSAMPLE - (required) number of records to be sampled/output - -m RECSIZE, --recSize=RECSIZE - (default=1) number of lines spanned by each record - -n NSKIP, --nSkip=NSKIP - (default=0) number of comment lines to skip at the - beginning - -example: - python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement -""" - -import optparse, string, random,sys,math,itertools - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - parser.add_option( "-r", "--replacement", action="store_true", dest="replacement",default=False, - help="Sampling with replacement" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - parser.add_option("-k","--nSample", type='int',dest="nSample",default=None, - help="(required) number of records to be sampled/output" ) - parser.add_option("-m","--recSize", type='int',dest="recSize",default=1, - help="(default=1) number of lines spanned by each record" ) - parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0, - help="(default=0) number of comment lines to skip at the beginning") - options, args = parser.parse_args() - #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" - - sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement) - -def sample_wr(population, k): - "Chooses k random elements (with replacement) from a population" - n = len(population) - _random, _int = random.random, int # speed hack - return [_int(_random() * n) for i in itertools.repeat(None, k)] - -# num of lines -def readinput(filename): - try: - f = open (filename) - except: - print >> sys.stderr, "can't open file "+str(filename) - sys.exit(0) - - nline = 0 - for line in f: - nline = nline + 1 - f.close() - return nline - -def sampline(infile,outfile,nSample,recSize,nSkip,replacement): - # sample nSample records from file - # each record contains recSize lines - # skip the top nSkip lines - - nLine = readinput(infile) - print 'num of lines in input: ',nLine - print 'avoid sampling the first ',nSkip,' lines' - print 'lines per record: ',recSize - - if (nLine-nSkip) % recSize: - print >> sys.stderr, "the number of lines is not dividable by record size!" - sys.exit(0) - - nTotalRecords = (nLine-nSkip) / recSize - print "total number of records: ",nTotalRecords - - if replacement or nTotalRecords < nSample: - sel = sample_wr(range(nTotalRecords),nSample) - else: - sel = random.sample(range(nTotalRecords),nSample) - - #print len(sel), sorted(sel) - - # output - try: - fout = open (outfile,'w') - except: - print >> sys.stderr, "can't open file "+str(outfile) - sys.exit(0) - fin = open(infile) - n = 0 # index of line - rec = "" # to store all content of a record - nrepeat = 0 # number of times a record is sampled - curr_rec = -1 - for line in fin: - if n < nSkip: - n = n + 1 - fout.write(line) - continue - - if not (n-nSkip) % recSize:# a new record - # print the previous sampled record - for i in range(nrepeat): - fout.write(rec) - curr_rec = (n-nSkip)/recSize - nrepeat = sel.count(curr_rec) - if nrepeat: # sampled - rec = line - #print curr_rec,nrepeat - elif (n-nSkip)/recSize == curr_rec: - rec = rec + line - n = n + 1 - # if the last record is selected - if curr_rec == nTotalRecords-1: - for i in range(nrepeat): - fout.write(rec) - fin.close() - fout.close() - - -if __name__ == "__main__": main()