Mercurial > repos > pjbriggs > pal_finder
diff fastq_subset.py @ 9:52dbe2089d14 draft default tip
Version 0.02.04.8 (update fastq subsetting).
author | pjbriggs |
---|---|
date | Wed, 04 Jul 2018 06:05:52 -0400 |
parents | 4e625d3672ba |
children |
line wrap: on
line diff
--- a/fastq_subset.py Wed May 16 07:39:16 2018 -0400 +++ b/fastq_subset.py Wed Jul 04 06:05:52 2018 -0400 @@ -2,7 +2,63 @@ import argparse import random -from Bio.SeqIO.QualityIO import FastqGeneralIterator +import gzip + +CHUNKSIZE = 102400 + +def getlines(filen): + """ + Efficiently fetch lines from a file one by one + + Generator function implementing an efficient + method of reading lines sequentially from a file, + attempting to minimise the number of read operations + and performing the line splitting in memory: + + >>> for line in getlines(filen): + >>> ...do something... + + Input file can be gzipped; this function should + handle this invisibly provided the file names ends + with '.gz'. + + Arguments: + filen (str): path of the file to read lines from + + Yields: + String: next line of text from the file, with any + newline character removed. + """ + if filen.split('.')[-1] == 'gz': + fp = gzip.open(filen,'rb') + else: + fp = open(filen,'rb') + # Read in data in chunks + buf = '' + lines = [] + while True: + # Grab a chunk of data + data = fp.read(CHUNKSIZE) + # Check for EOF + if not data: + break + # Add to buffer and split into lines + buf = buf + data + if buf[0] == '\n': + buf = buf[1:] + if buf[-1] != '\n': + i = buf.rfind('\n') + if i == -1: + continue + else: + lines = buf[:i].split('\n') + buf = buf[i+1:] + else: + lines = buf[:-1].split('\n') + buf = '' + # Return the lines one at a time + for line in lines: + yield line def count_reads(fastq): """ @@ -25,16 +81,34 @@ positions in the input file will be written to the output. """ - with open(fastq_in,'r') as fq_in: - fq_out = open(fastq_out,'w') + with open(fastq_out,'w') as fq_out: + # Current index i = 0 - for title,seq,qual in FastqGeneralIterator(fq_in): - if i in indices: - fq_out.write("@%s\n%s\n+\n%s\n" % (title, - seq, - qual)) - i += 1 - fq_out.close() + # Read count + n = 0 + # Read contents + rd = [] + # Iterate through the file + for ii,line in enumerate(getlines(fastq_in),start=1): + rd.append(line) + if ii%4 == 0: + # Got a complete read + try: + # If read index matches the current index + # then output the read + if n == indices[i]: + fq_out.write("%s\n" % '\n'.join(rd)) + i += 1 + # Update for next read + n += 1 + rd = [] + except IndexError: + # Subset complete + return + # End of file: check nothing was left over + if rd: + raise Exception("Incomplete read at file end: %s" + % rd) if __name__ == "__main__": @@ -69,6 +143,6 @@ if args.seed is not None: print "Random number generator seed: %d" % args.seed random.seed(args.seed) - subset = random.sample(xrange(nreads),subset_size) + subset = sorted(random.sample(xrange(nreads),subset_size)) fastq_subset(args.fastq_r1,"subset_r1.fq",subset) fastq_subset(args.fastq_r2,"subset_r2.fq",subset)