comparison tools/sample_seqs/sample_seqs.py @ 2:da64f6a9e32b draft

Uploaded v0.2.0, adds desired count mode
author peterjc
date Fri, 06 Mar 2015 11:48:09 -0500
parents 3a807e5ea6c8
children 02c13ef1a669
comparison
equal deleted inserted replaced
1:16ecf25d521f 2:da64f6a9e32b
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. 2 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
3 3
4 This tool is a short Python script which requires Biopython 1.62 or later 4 This tool is a short Python script which requires Biopython 1.62 or later
5 for SFF file support. If you use this tool in scientific work leading to a 5 for sequence parsing. If you use this tool in scientific work leading to a
6 publication, please cite the Biopython application note: 6 publication, please cite the Biopython application note:
7 7
8 Cock et al 2009. Biopython: freely available Python tools for computational 8 Cock et al 2009. Biopython: freely available Python tools for computational
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
11 11
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute 12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
14 See accompanying text file for licence details (MIT license). 14 See accompanying text file for licence details (MIT license).
15 15
16 This is version 0.1.0 of the script, use -v or --version to get the version. 16 Use -v or --version to get the version, -h or --help for help.
17 """ 17 """
18 import os 18 import os
19 import sys 19 import sys
20 20 from optparse import OptionParser
21 def stop_err(msg, err=1): 21
22
23 def sys_exit(msg, err=1):
22 sys.stderr.write(msg.rstrip() + "\n") 24 sys.stderr.write(msg.rstrip() + "\n")
23 sys.exit(err) 25 sys.exit(err)
24 26
25 if "-v" in sys.argv or "--version" in sys.argv: 27 #Parse Command Line
26 print("v0.1.0") 28 usage = """Use as follows:
29
30 $ python sample_seqs.py [options]
31
32 e.g. Sample 20% of the reads:
33
34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
35
36 This samples uniformly though the file, rather than at random, and therefore
37 should be reproducible.
38 """
39 parser = OptionParser(usage=usage)
40 parser.add_option('-i', '--input', dest='input',
41 default=None, help='Input sequences filename',
42 metavar="FILE")
43 parser.add_option('-f', '--format', dest='format',
44 default=None,
45 help='Input sequence format (e.g. fasta, fastq, sff)')
46 parser.add_option('-o', '--output', dest='output',
47 default=None, help='Output sampled sequenced filename',
48 metavar="FILE")
49 parser.add_option('-p', '--percent', dest='percent',
50 default=None,
51 help='Take this percent of the reads')
52 parser.add_option('-n', '--everyn', dest='everyn',
53 default=None,
54 help='Take every N-th read')
55 parser.add_option('-c', '--count', dest='count',
56 default=None,
57 help='Take exactly N reads')
58 parser.add_option("--interleaved", dest="interleaved",
59 default=False, action="store_true",
60 help="Input is interleaved reads, preserve the pairings")
61 parser.add_option("-v", "--version", dest="version",
62 default=False, action="store_true",
63 help="Show version and quit")
64 options, args = parser.parse_args()
65
66 if options.version:
67 print("v0.2.0")
27 sys.exit(0) 68 sys.exit(0)
28 69
29 #Parse Command Line 70 in_file = options.input
30 if len(sys.argv) < 5: 71 out_file = options.output
31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...") 72 interleaved = options.interleaved
32 seq_format, in_file, out_file, mode = sys.argv[1:5] 73
74 if not in_file:
75 sys_exit("Require an input filename")
33 if in_file != "/dev/stdin" and not os.path.isfile(in_file): 76 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
34 stop_err("Missing input file %r" % in_file) 77 sys_exit("Missing input file %r" % in_file)
35 78 if not out_file:
36 if mode == "everyNth": 79 sys_exit("Require an output filename")
37 if len(sys.argv) != 6: 80 if not options.format:
38 stop_err("If using everyNth, just need argument N (integer, at least 2)") 81 sys_exit("Require the sequence format")
39 try: 82 seq_format = options.format.lower()
40 N = int(sys.argv[5]) 83
84
85 def count_fasta(filename):
86 from Bio.SeqIO.FastaIO import SimpleFastaParser
87 count = 0
88 with open(filename) as handle:
89 for title, seq in SimpleFastaParser(handle):
90 count += 1
91 return count
92
93
94 def count_fastq(filename):
95 from Bio.SeqIO.QualityIO import FastqGeneralIterator
96 count = 0
97 with open(filename) as handle:
98 for title, seq, qual in FastqGeneralIterator(handle):
99 count += 1
100 return count
101
102
103 def count_sff(filename):
104 from Bio import SeqIO
105 # If the SFF file has a built in index (which is normal),
106 # this will be parsed and is the quicker than scanning
107 # the whole file.
108 return len(SeqIO.index(filename, "sff"))
109
110
111 def count_sequences(filename, format):
112 if seq_format == "sff":
113 return count_sff(filename)
114 elif seq_format == "fasta":
115 return count_fasta(filename)
116 elif seq_format.startswith("fastq"):
117 return count_fastq(filename)
118 else:
119 sys_exit("Unsupported file type %r" % seq_format)
120
121
122 if options.percent and options.everyn:
123 sys_exit("Cannot combine -p and -n options")
124 elif options.everyn and options.count:
125 sys_exit("Cannot combine -p and -c options")
126 elif options.percent and options.count:
127 sys_exit("Cannot combine -n and -c options")
128 elif options.everyn:
129 try:
130 N = int(options.everyn)
41 except: 131 except:
42 stop_err("Bad N argument %r" % sys.argv[5]) 132 sys_exit("Bad -n argument %r" % options.everyn)
43 if N < 2: 133 if N < 2:
44 stop_err("Bad N argument %r" % sys.argv[5]) 134 sys_exit("Bad -n argument %r" % options.everyn)
45 if (N % 10) == 1: 135 if (N % 10) == 1:
46 sys.stderr.write("Sampling every %ist sequence\n" % N) 136 sys.stderr.write("Sampling every %ist sequence\n" % N)
47 elif (N % 10) == 2: 137 elif (N % 10) == 2:
48 sys.stderr.write("Sampling every %ind sequence\n" % N) 138 sys.stderr.write("Sampling every %ind sequence\n" % N)
49 elif (N % 10) == 3: 139 elif (N % 10) == 3:
55 count = 0 145 count = 0
56 for record in iterator: 146 for record in iterator:
57 count += 1 147 count += 1
58 if count % N == 1: 148 if count % N == 1:
59 yield record 149 yield record
60 elif mode == "percentage": 150 elif options.percent:
61 if len(sys.argv) != 6: 151 try:
62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)") 152 percent = float(options.percent) / 100.0
63 try:
64 percent = float(sys.argv[5]) / 100.0
65 except: 153 except:
66 stop_err("Bad percent argument %r" % sys.argv[5]) 154 sys_exit("Bad -p percent argument %r" % options.percent)
67 if percent <= 0.0 or 1.0 <= percent: 155 if percent <= 0.0 or 1.0 <= percent:
68 stop_err("Bad percent argument %r" % sys.argv[5]) 156 sys_exit("Bad -p percent argument %r" % options.percent)
69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) 157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
70 def sampler(iterator): 158 def sampler(iterator):
71 global percent 159 global percent
72 count = 0 160 count = 0
73 taken = 0 161 taken = 0
74 for record in iterator: 162 for record in iterator:
75 count += 1 163 count += 1
76 if percent * count > taken: 164 if percent * count > taken:
77 taken += 1 165 taken += 1
78 yield record 166 yield record
167 elif options.count:
168 try:
169 N = int(options.count)
170 except:
171 sys_exit("Bad -c count argument %r" % options.count)
172 if N < 1:
173 sys_exit("Bad -c count argument %r" % options.count)
174 total = count_sequences(in_file, seq_format)
175 print("Input file has %i sequences" % total)
176 if interleaved:
177 # Paired
178 if total % 2:
179 sys_exit("Paired mode, but input file has an odd number of sequences: %i"
180 % total)
181 elif N > total // 2:
182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
183 % (N, total // 2, total))
184 total = total // 2
185 if N == 1:
186 sys.stderr.write("Sampling just first sequence pair!\n")
187 elif N == total:
188 sys.stderr.write("Taking all the sequence pairs\n")
189 else:
190 sys.stderr.write("Sampling %i sequence pairs\n" % N)
191 else:
192 # Not paired
193 if total < N:
194 sys_exit("Requested %i sequences, but file only has %i." % (N, total))
195 if N == 1:
196 sys.stderr.write("Sampling just first sequence!\n")
197 elif N == total:
198 sys.stderr.write("Taking all the sequences\n")
199 else:
200 sys.stderr.write("Sampling %i sequences\n" % N)
201 if N == total:
202 def sampler(iterator):
203 """Dummy filter to filter nothing, taking everything."""
204 global N
205 taken = 0
206 for record in iterator:
207 taken += 1
208 yield record
209 assert taken == N, "Picked %i, wanted %i" % (taken, N)
210 else:
211 def sampler(iterator):
212 # Mimic the percentage sampler, with double check on final count
213 global N, total
214 # Do we need a floating point fudge factor epsilon?
215 # i.e. What if percentage comes out slighty too low, and
216 # we could end up missing last few desired sequences?
217 percentage = float(N) / float(total)
218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f"
219 # % (N, total, percentage * 100.0))
220 count = 0
221 taken = 0
222 for record in iterator:
223 count += 1
224 # Do we need the extra upper bound?
225 if percentage * count > taken and taken < N:
226 taken += 1
227 yield record
228 elif total - count + 1 <= N - taken:
229 # remaining records (incuding this one) <= what we still need.
230 # This is a safey check for floating point edge cases where
231 # we need to take all remaining sequences to meet target
232 taken += 1
233 yield record
234 assert taken == N, "Picked %i, wanted %i" % (taken, N)
79 else: 235 else:
80 stop_err("Unsupported mode %r" % mode) 236 sys_exit("Must use either -n, -p or -c")
237
238
239 def pair(iterator):
240 """Quick and dirty pair batched iterator."""
241 while True:
242 a = next(iterator)
243 b = next(iterator)
244 if not b:
245 assert not a, "Odd number of records?"
246 break
247 yield (a, b)
248
81 249
82 def raw_fasta_iterator(handle): 250 def raw_fasta_iterator(handle):
83 """Yields raw FASTA records as multi-line strings.""" 251 """Yields raw FASTA records as multi-line strings."""
84 while True: 252 while True:
85 line = handle.readline() 253 line = handle.readline()
111 line = handle.readline() 279 line = handle.readline()
112 yield "".join(lines) 280 yield "".join(lines)
113 if not line: 281 if not line:
114 return # StopIteration 282 return # StopIteration
115 283
116 def fasta_filter(in_file, out_file, iterator_filter): 284 def fasta_filter(in_file, out_file, iterator_filter, inter):
117 count = 0 285 count = 0
118 #Galaxy now requires Python 2.5+ so can use with statements, 286 #Galaxy now requires Python 2.5+ so can use with statements,
119 with open(in_file) as in_handle: 287 with open(in_file) as in_handle:
120 with open(out_file, "w") as pos_handle: 288 with open(out_file, "w") as pos_handle:
121 for record in iterator_filter(raw_fasta_iterator(in_handle)): 289 if inter:
122 count += 1 290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
123 pos_handle.write(record) 291 count += 1
124 return count 292 pos_handle.write(r1)
125 293 pos_handle.write(r2)
126 try: 294 else:
127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter 295 for record in iterator_filter(raw_fasta_iterator(in_handle)):
128 def fastq_filter(in_file, out_file, iterator_filter): 296 count += 1
129 count = 0 297 pos_handle.write(record)
130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter 298 return count
131 reader = fastqReader(open(in_file, "rU")) 299
132 writer = fastqWriter(open(out_file, "w")) 300
133 for record in iterator_filter(reader): 301 from Bio.SeqIO.QualityIO import FastqGeneralIterator
134 count += 1 302 def fastq_filter(in_file, out_file, iterator_filter, inter):
135 writer.write(record) 303 count = 0
136 writer.close() 304 with open(in_file) as in_handle:
137 reader.close() 305 with open(out_file, "w") as pos_handle:
138 return count 306 if inter:
139 except ImportError: 307 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))):
140 from Bio.SeqIO.QualityIO import FastqGeneralIterator 308 count += 1
141 def fastq_filter(in_file, out_file, iterator_filter): 309 pos_handle.write("@%s\n%s\n+\n%s\n" % r1)
142 count = 0 310 pos_handle.write("@%s\n%s\n+\n%s\n" % r2)
143 with open(in_file) as in_handle: 311 else:
144 with open(out_file, "w") as pos_handle:
145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): 312 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
146 count += 1 313 count += 1
147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 314 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
148 return count 315 return count
149 316
150 def sff_filter(in_file, out_file, iterator_filter): 317
318 def sff_filter(in_file, out_file, iterator_filter, inter):
151 count = 0 319 count = 0
152 try: 320 try:
153 from Bio.SeqIO.SffIO import SffIterator, SffWriter 321 from Bio.SeqIO.SffIO import SffIterator, SffWriter
154 except ImportError: 322 except ImportError:
155 stop_err("SFF filtering requires Biopython 1.54 or later") 323 sys_exit("SFF filtering requires Biopython 1.54 or later")
156 try: 324 try:
157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
158 except ImportError: 326 except ImportError:
159 #Prior to Biopython 1.56 this was a private function 327 #Prior to Biopython 1.56 this was a private function
160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest 328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
165 manifest = None 333 manifest = None
166 in_handle.seek(0) 334 in_handle.seek(0)
167 with open(out_file, "wb") as out_handle: 335 with open(out_file, "wb") as out_handle:
168 writer = SffWriter(out_handle, xml=manifest) 336 writer = SffWriter(out_handle, xml=manifest)
169 in_handle.seek(0) #start again after getting manifest 337 in_handle.seek(0) #start again after getting manifest
170 count = writer.write_file(iterator_filter(SffIterator(in_handle))) 338 if inter:
171 #count = writer.write_file(SffIterator(in_handle)) 339 from itertools import chain
172 return count 340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
173 341 assert count % 2 == 0, "Odd number of records? %i" % count
174 if seq_format.lower()=="sff": 342 count /= 2
175 count = sff_filter(in_file, out_file, sampler) 343 else:
176 elif seq_format.lower()=="fasta": 344 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
177 count = fasta_filter(in_file, out_file, sampler) 345 #count = writer.write_file(SffIterator(in_handle))
178 elif seq_format.lower().startswith("fastq"): 346 return count
179 count = fastq_filter(in_file, out_file, sampler) 347
348 if seq_format == "sff":
349 count = sff_filter(in_file, out_file, sampler, interleaved)
350 elif seq_format == "fasta":
351 count = fasta_filter(in_file, out_file, sampler, interleaved)
352 elif seq_format.startswith("fastq"):
353 count = fastq_filter(in_file, out_file, sampler, interleaved)
180 else: 354 else:
181 stop_err("Unsupported file type %r" % seq_format) 355 sys_exit("Unsupported file type %r" % seq_format)
182 356
183 sys.stderr.write("Sampled %i records\n" % count) 357 if interleaved:
358 sys.stderr.write("Selected %i pairs\n" % count)
359 else:
360 sys.stderr.write("Selected %i records\n" % count)