Mercurial > repos > pjbriggs > pal_finder
comparison fastq_subset.py @ 9:52dbe2089d14 draft default tip
Version 0.02.04.8 (update fastq subsetting).
| author | pjbriggs |
|---|---|
| date | Wed, 04 Jul 2018 06:05:52 -0400 |
| parents | 4e625d3672ba |
| children |
comparison
equal
deleted
inserted
replaced
| 8:4e625d3672ba | 9:52dbe2089d14 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import random | 4 import random |
| 5 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 5 import gzip |
| 6 | |
| 7 CHUNKSIZE = 102400 | |
| 8 | |
| 9 def getlines(filen): | |
| 10 """ | |
| 11 Efficiently fetch lines from a file one by one | |
| 12 | |
| 13 Generator function implementing an efficient | |
| 14 method of reading lines sequentially from a file, | |
| 15 attempting to minimise the number of read operations | |
| 16 and performing the line splitting in memory: | |
| 17 | |
| 18 >>> for line in getlines(filen): | |
| 19 >>> ...do something... | |
| 20 | |
| 21 Input file can be gzipped; this function should | |
| 22 handle this invisibly provided the file names ends | |
| 23 with '.gz'. | |
| 24 | |
| 25 Arguments: | |
| 26 filen (str): path of the file to read lines from | |
| 27 | |
| 28 Yields: | |
| 29 String: next line of text from the file, with any | |
| 30 newline character removed. | |
| 31 """ | |
| 32 if filen.split('.')[-1] == 'gz': | |
| 33 fp = gzip.open(filen,'rb') | |
| 34 else: | |
| 35 fp = open(filen,'rb') | |
| 36 # Read in data in chunks | |
| 37 buf = '' | |
| 38 lines = [] | |
| 39 while True: | |
| 40 # Grab a chunk of data | |
| 41 data = fp.read(CHUNKSIZE) | |
| 42 # Check for EOF | |
| 43 if not data: | |
| 44 break | |
| 45 # Add to buffer and split into lines | |
| 46 buf = buf + data | |
| 47 if buf[0] == '\n': | |
| 48 buf = buf[1:] | |
| 49 if buf[-1] != '\n': | |
| 50 i = buf.rfind('\n') | |
| 51 if i == -1: | |
| 52 continue | |
| 53 else: | |
| 54 lines = buf[:i].split('\n') | |
| 55 buf = buf[i+1:] | |
| 56 else: | |
| 57 lines = buf[:-1].split('\n') | |
| 58 buf = '' | |
| 59 # Return the lines one at a time | |
| 60 for line in lines: | |
| 61 yield line | |
| 6 | 62 |
| 7 def count_reads(fastq): | 63 def count_reads(fastq): |
| 8 """ | 64 """ |
| 9 Count number of reads in a Fastq file | 65 Count number of reads in a Fastq file |
| 10 """ | 66 """ |
| 23 The reads to output are specifed by a list | 79 The reads to output are specifed by a list |
| 24 of integer indices; only reads at those | 80 of integer indices; only reads at those |
| 25 positions in the input file will be written | 81 positions in the input file will be written |
| 26 to the output. | 82 to the output. |
| 27 """ | 83 """ |
| 28 with open(fastq_in,'r') as fq_in: | 84 with open(fastq_out,'w') as fq_out: |
| 29 fq_out = open(fastq_out,'w') | 85 # Current index |
| 30 i = 0 | 86 i = 0 |
| 31 for title,seq,qual in FastqGeneralIterator(fq_in): | 87 # Read count |
| 32 if i in indices: | 88 n = 0 |
| 33 fq_out.write("@%s\n%s\n+\n%s\n" % (title, | 89 # Read contents |
| 34 seq, | 90 rd = [] |
| 35 qual)) | 91 # Iterate through the file |
| 36 i += 1 | 92 for ii,line in enumerate(getlines(fastq_in),start=1): |
| 37 fq_out.close() | 93 rd.append(line) |
| 94 if ii%4 == 0: | |
| 95 # Got a complete read | |
| 96 try: | |
| 97 # If read index matches the current index | |
| 98 # then output the read | |
| 99 if n == indices[i]: | |
| 100 fq_out.write("%s\n" % '\n'.join(rd)) | |
| 101 i += 1 | |
| 102 # Update for next read | |
| 103 n += 1 | |
| 104 rd = [] | |
| 105 except IndexError: | |
| 106 # Subset complete | |
| 107 return | |
| 108 # End of file: check nothing was left over | |
| 109 if rd: | |
| 110 raise Exception("Incomplete read at file end: %s" | |
| 111 % rd) | |
| 38 | 112 |
| 39 if __name__ == "__main__": | 113 if __name__ == "__main__": |
| 40 | 114 |
| 41 p = argparse.ArgumentParser() | 115 p = argparse.ArgumentParser() |
| 42 p.add_argument("fastq_r1") | 116 p.add_argument("fastq_r1") |
| 67 subset_size = int(subset_size) | 141 subset_size = int(subset_size) |
| 68 print "Extracting subset of reads: %s" % subset_size | 142 print "Extracting subset of reads: %s" % subset_size |
| 69 if args.seed is not None: | 143 if args.seed is not None: |
| 70 print "Random number generator seed: %d" % args.seed | 144 print "Random number generator seed: %d" % args.seed |
| 71 random.seed(args.seed) | 145 random.seed(args.seed) |
| 72 subset = random.sample(xrange(nreads),subset_size) | 146 subset = sorted(random.sample(xrange(nreads),subset_size)) |
| 73 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) | 147 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) |
| 74 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) | 148 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) |
