pal_finder: fastq_subset.py comparison

comparison fastq_subset.py @ 9:52dbe2089d14 draft default tip

Version 0.02.04.8 (update fastq subsetting).

author	pjbriggs
date	Wed, 04 Jul 2018 06:05:52 -0400
parents	4e625d3672ba
children

comparison

equal deleted inserted replaced

-:4e625d3672ba
+:52dbe2089d14
 #!/usr/bin/env python
 import argparse
 import random
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
+import gzip
# Number of bytes requested from the file in each read operation
CHUNKSIZE = 102400


def _as_native_str(data):
    """
    Return 'data' as the interpreter's native 'str' type

    Under Python 2 byte strings are already 'str' and are
    returned untouched; under Python 3 the bytes are decoded
    as UTF-8 (a superset of the ASCII used by Fastq files).
    """
    if isinstance(data, str):
        return data
    return data.decode('utf-8')


def getlines(filen):
    """
    Efficiently fetch lines from a file one by one

    Generator function implementing an efficient
    method of reading lines sequentially from a file,
    attempting to minimise the number of read operations
    and performing the line splitting in memory:

    >>> for line in getlines(filen):
    >>>     ...do something...

    Input file can be gzipped; this function should
    handle this invisibly provided the file name ends
    with '.gz'.

    Every line in the file is returned, including blank
    lines and a final line which isn't terminated by a
    newline. The underlying file is closed once the
    generator is exhausted, closed or garbage collected
    (for example when the caller stops iterating early).

    Arguments:
      filen (str): path of the file to read lines from

    Yields:
      String: next line of text from the file, with any
        newline character removed.
    """
    if filen.split('.')[-1] == 'gz':
        fp = gzip.open(filen, 'rb')
    else:
        fp = open(filen, 'rb')
    try:
        # Unprocessed data carried over between chunks; kept as
        # bytes so that a multi-byte character split across two
        # chunks is never decoded in pieces
        buf = b''
        while True:
            # Grab a chunk of data
            data = fp.read(CHUNKSIZE)
            # Check for EOF
            if not data:
                break
            buf += data
            # Everything up to the last newline consists of
            # complete lines; the remainder is an incomplete
            # line which has to wait for more data
            i = buf.rfind(b'\n')
            if i == -1:
                continue
            complete = buf[:i]
            buf = buf[i+1:]
            # Return the lines one at a time
            for line in _as_native_str(complete).split('\n'):
                yield line
        # Anything left over is a final line without a
        # terminating newline
        if buf:
            yield _as_native_str(buf)
    finally:
        fp.close()
 def count_reads(fastq):
 """
 Count number of reads in a Fastq file
 """
 The reads to output are specified by a list
 of integer indices; only reads at those
 positions in the input file will be written
 to the output.
 """
-with open(fastq_in,'r') as fq_in:
+with open(fastq_out,'w') as fq_out:
-fq_out = open(fastq_out,'w')
+# Current index
 i = 0
-for title,seq,qual in FastqGeneralIterator(fq_in):
+# Read count
-if i in indices:
+n = 0
-fq_out.write("@%s\n%s\n+\n%s\n" % (title,
+# Read contents
-seq,
+rd = []
-qual))
+# Iterate through the file
-i += 1
+for ii,line in enumerate(getlines(fastq_in),start=1):
-fq_out.close()
+rd.append(line)
+if ii%4 == 0:
+# Got a complete read
+try:
+# If read index matches the current index
+# then output the read
+if n == indices[i]:
+fq_out.write("%s\n" % '\n'.join(rd))
+i += 1
+# Update for next read
+n += 1
+rd = []
+except IndexError:
+# Subset complete
+return
+# End of file: check nothing was left over
+if rd:
+raise Exception("Incomplete read at file end: %s"
+% rd)
 if __name__ == "__main__":
 p = argparse.ArgumentParser()
 p.add_argument("fastq_r1")
 subset_size = int(subset_size)
 print "Extracting subset of reads: %s" % subset_size
 if args.seed is not None:
 print "Random number generator seed: %d" % args.seed
 random.seed(args.seed)
-subset = random.sample(xrange(nreads),subset_size)
+subset = sorted(random.sample(xrange(nreads),subset_size))
 fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
 fastq_subset(args.fastq_r2,"subset_r2.fq",subset)

Mercurial > repos > pjbriggs > pal_finder

comparison fastq_subset.py @ 9:52dbe2089d14 draft default tip