Mercurial > repos > pjbriggs > pal_finder
view fastq_subset.py @ 9:52dbe2089d14 draft default tip
Version 0.02.04.8 (update fastq subsetting).
author | pjbriggs |
---|---|
date | Wed, 04 Jul 2018 06:05:52 -0400 |
parents | 4e625d3672ba |
children |
line wrap: on
line source
#!/usr/bin/env python import argparse import random import gzip CHUNKSIZE = 102400 def getlines(filen): """ Efficiently fetch lines from a file one by one Generator function implementing an efficient method of reading lines sequentially from a file, attempting to minimise the number of read operations and performing the line splitting in memory: >>> for line in getlines(filen): >>> ...do something... Input file can be gzipped; this function should handle this invisibly provided the file names ends with '.gz'. Arguments: filen (str): path of the file to read lines from Yields: String: next line of text from the file, with any newline character removed. """ if filen.split('.')[-1] == 'gz': fp = gzip.open(filen,'rb') else: fp = open(filen,'rb') # Read in data in chunks buf = '' lines = [] while True: # Grab a chunk of data data = fp.read(CHUNKSIZE) # Check for EOF if not data: break # Add to buffer and split into lines buf = buf + data if buf[0] == '\n': buf = buf[1:] if buf[-1] != '\n': i = buf.rfind('\n') if i == -1: continue else: lines = buf[:i].split('\n') buf = buf[i+1:] else: lines = buf[:-1].split('\n') buf = '' # Return the lines one at a time for line in lines: yield line def count_reads(fastq): """ Count number of reads in a Fastq file """ n = 0 with open(fastq,'r') as fq: while True: buf = fq.read() n += buf.count('\n') if buf == "": break return n/4 def fastq_subset(fastq_in,fastq_out,indices): """ Output a subset of reads from a Fastq file The reads to output are specifed by a list of integer indices; only reads at those positions in the input file will be written to the output. """ with open(fastq_out,'w') as fq_out: # Current index i = 0 # Read count n = 0 # Read contents rd = [] # Iterate through the file for ii,line in enumerate(getlines(fastq_in),start=1): rd.append(line) if ii%4 == 0: # Got a complete read try: # If read index matches the current index # then output the read if n == indices[i]: fq_out.write("%s\n" % '\n'.join(rd)) i += 1 # Update for next read n += 1 rd = [] except IndexError: # Subset complete return # End of file: check nothing was left over if rd: raise Exception("Incomplete read at file end: %s" % rd) if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("fastq_r1") p.add_argument("fastq_r2") p.add_argument("-n", dest="subset_size", default=None, help="subset size") p.add_argument("-s", dest="seed", type=int, default=None, help="seed for random number generator") args = p.parse_args() print "Processing fastq pair:" print "\t%s" % args.fastq_r1 print "\t%s" % args.fastq_r2 nreads = count_reads(args.fastq_r1) print "Counted %d reads in %s" % (nreads,args.fastq_r1) if args.subset_size is not None: subset_size = float(args.subset_size) if subset_size < 1.0: subset_size = int(nreads*subset_size) else: subset_size = int(subset_size) print "Extracting subset of reads: %s" % subset_size if args.seed is not None: print "Random number generator seed: %d" % args.seed random.seed(args.seed) subset = sorted(random.sample(xrange(nreads),subset_size)) fastq_subset(args.fastq_r1,"subset_r1.fq",subset) fastq_subset(args.fastq_r2,"subset_r2.fq",subset)