Mercurial > repos > pjbriggs > pal_finder
comparison fastq_subset.py @ 9:52dbe2089d14 draft default tip
Version 0.02.04.8 (update fastq subsetting).
author | pjbriggs |
---|---|
date | Wed, 04 Jul 2018 06:05:52 -0400 |
parents | 4e625d3672ba |
children |
comparison
equal
deleted
inserted
replaced
8:4e625d3672ba | 9:52dbe2089d14 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import random | 4 import random |
5 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 5 import gzip |
6 | |
7 CHUNKSIZE = 102400 | |
8 | |
9 def getlines(filen): | |
10 """ | |
11 Efficiently fetch lines from a file one by one | |
12 | |
13 Generator function implementing an efficient | |
14 method of reading lines sequentially from a file, | |
15 attempting to minimise the number of read operations | |
16 and performing the line splitting in memory: | |
17 | |
18 >>> for line in getlines(filen): | |
19 >>> ...do something... | |
20 | |
21 Input file can be gzipped; this function should | |
22 handle this invisibly provided the file names ends | |
23 with '.gz'. | |
24 | |
25 Arguments: | |
26 filen (str): path of the file to read lines from | |
27 | |
28 Yields: | |
29 String: next line of text from the file, with any | |
30 newline character removed. | |
31 """ | |
32 if filen.split('.')[-1] == 'gz': | |
33 fp = gzip.open(filen,'rb') | |
34 else: | |
35 fp = open(filen,'rb') | |
36 # Read in data in chunks | |
37 buf = '' | |
38 lines = [] | |
39 while True: | |
40 # Grab a chunk of data | |
41 data = fp.read(CHUNKSIZE) | |
42 # Check for EOF | |
43 if not data: | |
44 break | |
45 # Add to buffer and split into lines | |
46 buf = buf + data | |
47 if buf[0] == '\n': | |
48 buf = buf[1:] | |
49 if buf[-1] != '\n': | |
50 i = buf.rfind('\n') | |
51 if i == -1: | |
52 continue | |
53 else: | |
54 lines = buf[:i].split('\n') | |
55 buf = buf[i+1:] | |
56 else: | |
57 lines = buf[:-1].split('\n') | |
58 buf = '' | |
59 # Return the lines one at a time | |
60 for line in lines: | |
61 yield line | |
6 | 62 |
7 def count_reads(fastq): | 63 def count_reads(fastq): |
8 """ | 64 """ |
9 Count number of reads in a Fastq file | 65 Count number of reads in a Fastq file |
10 """ | 66 """ |
23 The reads to output are specifed by a list | 79 The reads to output are specifed by a list |
24 of integer indices; only reads at those | 80 of integer indices; only reads at those |
25 positions in the input file will be written | 81 positions in the input file will be written |
26 to the output. | 82 to the output. |
27 """ | 83 """ |
28 with open(fastq_in,'r') as fq_in: | 84 with open(fastq_out,'w') as fq_out: |
29 fq_out = open(fastq_out,'w') | 85 # Current index |
30 i = 0 | 86 i = 0 |
31 for title,seq,qual in FastqGeneralIterator(fq_in): | 87 # Read count |
32 if i in indices: | 88 n = 0 |
33 fq_out.write("@%s\n%s\n+\n%s\n" % (title, | 89 # Read contents |
34 seq, | 90 rd = [] |
35 qual)) | 91 # Iterate through the file |
36 i += 1 | 92 for ii,line in enumerate(getlines(fastq_in),start=1): |
37 fq_out.close() | 93 rd.append(line) |
94 if ii%4 == 0: | |
95 # Got a complete read | |
96 try: | |
97 # If read index matches the current index | |
98 # then output the read | |
99 if n == indices[i]: | |
100 fq_out.write("%s\n" % '\n'.join(rd)) | |
101 i += 1 | |
102 # Update for next read | |
103 n += 1 | |
104 rd = [] | |
105 except IndexError: | |
106 # Subset complete | |
107 return | |
108 # End of file: check nothing was left over | |
109 if rd: | |
110 raise Exception("Incomplete read at file end: %s" | |
111 % rd) | |
38 | 112 |
39 if __name__ == "__main__": | 113 if __name__ == "__main__": |
40 | 114 |
41 p = argparse.ArgumentParser() | 115 p = argparse.ArgumentParser() |
42 p.add_argument("fastq_r1") | 116 p.add_argument("fastq_r1") |
67 subset_size = int(subset_size) | 141 subset_size = int(subset_size) |
68 print "Extracting subset of reads: %s" % subset_size | 142 print "Extracting subset of reads: %s" % subset_size |
69 if args.seed is not None: | 143 if args.seed is not None: |
70 print "Random number generator seed: %d" % args.seed | 144 print "Random number generator seed: %d" % args.seed |
71 random.seed(args.seed) | 145 random.seed(args.seed) |
72 subset = random.sample(xrange(nreads),subset_size) | 146 subset = sorted(random.sample(xrange(nreads),subset_size)) |
73 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) | 147 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) |
74 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) | 148 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) |