Mercurial > repos > pjbriggs > pal_finder
annotate fastq_subset.py @ 9:52dbe2089d14 draft default tip
Version 0.02.04.8 (update fastq subsetting).
author | pjbriggs |
---|---|
date | Wed, 04 Jul 2018 06:05:52 -0400 |
parents | 4e625d3672ba |
children |
rev | line source |
---|---|
8
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
1 #!/usr/bin/env python |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
2 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
3 import argparse |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
4 import random |
9 | 5 import gzip |
6 | |
7 CHUNKSIZE = 102400 | |
8 | |
9 def getlines(filen): | |
10 """ | |
11 Efficiently fetch lines from a file one by one | |
12 | |
13 Generator function implementing an efficient | |
14 method of reading lines sequentially from a file, | |
15 attempting to minimise the number of read operations | |
16 and performing the line splitting in memory: | |
17 | |
18 >>> for line in getlines(filen): | |
19 >>> ...do something... | |
20 | |
21 Input file can be gzipped; this function should | |
22 handle this invisibly provided the file names ends | |
23 with '.gz'. | |
24 | |
25 Arguments: | |
26 filen (str): path of the file to read lines from | |
27 | |
28 Yields: | |
29 String: next line of text from the file, with any | |
30 newline character removed. | |
31 """ | |
32 if filen.split('.')[-1] == 'gz': | |
33 fp = gzip.open(filen,'rb') | |
34 else: | |
35 fp = open(filen,'rb') | |
36 # Read in data in chunks | |
37 buf = '' | |
38 lines = [] | |
39 while True: | |
40 # Grab a chunk of data | |
41 data = fp.read(CHUNKSIZE) | |
42 # Check for EOF | |
43 if not data: | |
44 break | |
45 # Add to buffer and split into lines | |
46 buf = buf + data | |
47 if buf[0] == '\n': | |
48 buf = buf[1:] | |
49 if buf[-1] != '\n': | |
50 i = buf.rfind('\n') | |
51 if i == -1: | |
52 continue | |
53 else: | |
54 lines = buf[:i].split('\n') | |
55 buf = buf[i+1:] | |
56 else: | |
57 lines = buf[:-1].split('\n') | |
58 buf = '' | |
59 # Return the lines one at a time | |
60 for line in lines: | |
61 yield line | |
8
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
62 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
63 def count_reads(fastq): |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
64 """ |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
65 Count number of reads in a Fastq file |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
66 """ |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
67 n = 0 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
68 with open(fastq,'r') as fq: |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
69 while True: |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
70 buf = fq.read() |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
71 n += buf.count('\n') |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
72 if buf == "": break |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
73 return n/4 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
74 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
75 def fastq_subset(fastq_in,fastq_out,indices): |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
76 """ |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
77 Output a subset of reads from a Fastq file |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
78 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
79 The reads to output are specifed by a list |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
80 of integer indices; only reads at those |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
81 positions in the input file will be written |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
82 to the output. |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
83 """ |
9 | 84 with open(fastq_out,'w') as fq_out: |
85 # Current index | |
8
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
86 i = 0 |
9 | 87 # Read count |
88 n = 0 | |
89 # Read contents | |
90 rd = [] | |
91 # Iterate through the file | |
92 for ii,line in enumerate(getlines(fastq_in),start=1): | |
93 rd.append(line) | |
94 if ii%4 == 0: | |
95 # Got a complete read | |
96 try: | |
97 # If read index matches the current index | |
98 # then output the read | |
99 if n == indices[i]: | |
100 fq_out.write("%s\n" % '\n'.join(rd)) | |
101 i += 1 | |
102 # Update for next read | |
103 n += 1 | |
104 rd = [] | |
105 except IndexError: | |
106 # Subset complete | |
107 return | |
108 # End of file: check nothing was left over | |
109 if rd: | |
110 raise Exception("Incomplete read at file end: %s" | |
111 % rd) | |
8
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
112 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
113 if __name__ == "__main__": |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
114 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
115 p = argparse.ArgumentParser() |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
116 p.add_argument("fastq_r1") |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
117 p.add_argument("fastq_r2") |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
118 p.add_argument("-n", |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
119 dest="subset_size", |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
120 default=None, |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
121 help="subset size") |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
122 p.add_argument("-s", |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
123 dest="seed", |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
124 type=int, |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
125 default=None, |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
126 help="seed for random number generator") |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
127 args = p.parse_args() |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
128 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
129 print "Processing fastq pair:" |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
130 print "\t%s" % args.fastq_r1 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
131 print "\t%s" % args.fastq_r2 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
132 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
133 nreads = count_reads(args.fastq_r1) |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
134 print "Counted %d reads in %s" % (nreads,args.fastq_r1) |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
135 |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
136 if args.subset_size is not None: |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
137 subset_size = float(args.subset_size) |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
138 if subset_size < 1.0: |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
139 subset_size = int(nreads*subset_size) |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
140 else: |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
141 subset_size = int(subset_size) |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
142 print "Extracting subset of reads: %s" % subset_size |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
143 if args.seed is not None: |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
144 print "Random number generator seed: %d" % args.seed |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
145 random.seed(args.seed) |
9 | 146 subset = sorted(random.sample(xrange(nreads),subset_size)) |
8
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
147 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) |
4e625d3672ba
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
pjbriggs
parents:
diff
changeset
|
148 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) |