Mercurial > repos > peterjc > sample_seqs
comparison tools/sample_seqs/sample_seqs.py @ 2:da64f6a9e32b draft
Uploaded v0.2.0, adds desired count mode
| author | peterjc |
|---|---|
| date | Fri, 06 Mar 2015 11:48:09 -0500 |
| parents | 3a807e5ea6c8 |
| children | 02c13ef1a669 |
comparison
equal
deleted
inserted
replaced
| 1:16ecf25d521f | 2:da64f6a9e32b |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. | 2 """Sub-sample sequence from a FASTA, FASTQ or SFF file. |
| 3 | 3 |
| 4 This tool is a short Python script which requires Biopython 1.62 or later | 4 This tool is a short Python script which requires Biopython 1.62 or later |
| 5 for SFF file support. If you use this tool in scientific work leading to a | 5 for sequence parsing. If you use this tool in scientific work leading to a |
| 6 publication, please cite the Biopython application note: | 6 publication, please cite the Biopython application note: |
| 7 | 7 |
| 8 Cock et al 2009. Biopython: freely available Python tools for computational | 8 Cock et al 2009. Biopython: freely available Python tools for computational |
| 9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
| 10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
| 11 | 11 |
| 12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute | 12 This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute |
| 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | 13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
| 14 See accompanying text file for licence details (MIT license). | 14 See accompanying text file for licence details (MIT license). |
| 15 | 15 |
| 16 This is version 0.1.0 of the script, use -v or --version to get the version. | 16 Use -v or --version to get the version, -h or --help for help. |
| 17 """ | 17 """ |
| 18 import os | 18 import os |
| 19 import sys | 19 import sys |
| 20 | 20 from optparse import OptionParser |
| 21 def stop_err(msg, err=1): | 21 |
| 22 | |
| 23 def sys_exit(msg, err=1): | |
| 22 sys.stderr.write(msg.rstrip() + "\n") | 24 sys.stderr.write(msg.rstrip() + "\n") |
| 23 sys.exit(err) | 25 sys.exit(err) |
| 24 | 26 |
| 25 if "-v" in sys.argv or "--version" in sys.argv: | 27 #Parse Command Line |
| 26 print("v0.1.0") | 28 usage = """Use as follows: |
| 29 | |
| 30 $ python sample_seqs.py [options] | |
| 31 | |
| 32 e.g. Sample 20% of the reads: | |
| 33 | |
| 34 $ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq | |
| 35 | |
| 36 This samples uniformly though the file, rather than at random, and therefore | |
| 37 should be reproducible. | |
| 38 """ | |
| 39 parser = OptionParser(usage=usage) | |
| 40 parser.add_option('-i', '--input', dest='input', | |
| 41 default=None, help='Input sequences filename', | |
| 42 metavar="FILE") | |
| 43 parser.add_option('-f', '--format', dest='format', | |
| 44 default=None, | |
| 45 help='Input sequence format (e.g. fasta, fastq, sff)') | |
| 46 parser.add_option('-o', '--output', dest='output', | |
| 47 default=None, help='Output sampled sequenced filename', | |
| 48 metavar="FILE") | |
| 49 parser.add_option('-p', '--percent', dest='percent', | |
| 50 default=None, | |
| 51 help='Take this percent of the reads') | |
| 52 parser.add_option('-n', '--everyn', dest='everyn', | |
| 53 default=None, | |
| 54 help='Take every N-th read') | |
| 55 parser.add_option('-c', '--count', dest='count', | |
| 56 default=None, | |
| 57 help='Take exactly N reads') | |
| 58 parser.add_option("--interleaved", dest="interleaved", | |
| 59 default=False, action="store_true", | |
| 60 help="Input is interleaved reads, preserve the pairings") | |
| 61 parser.add_option("-v", "--version", dest="version", | |
| 62 default=False, action="store_true", | |
| 63 help="Show version and quit") | |
| 64 options, args = parser.parse_args() | |
| 65 | |
| 66 if options.version: | |
| 67 print("v0.2.0") | |
| 27 sys.exit(0) | 68 sys.exit(0) |
| 28 | 69 |
| 29 #Parse Command Line | 70 in_file = options.input |
| 30 if len(sys.argv) < 5: | 71 out_file = options.output |
| 31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...") | 72 interleaved = options.interleaved |
| 32 seq_format, in_file, out_file, mode = sys.argv[1:5] | 73 |
| 74 if not in_file: | |
| 75 sys_exit("Require an input filename") | |
| 33 if in_file != "/dev/stdin" and not os.path.isfile(in_file): | 76 if in_file != "/dev/stdin" and not os.path.isfile(in_file): |
| 34 stop_err("Missing input file %r" % in_file) | 77 sys_exit("Missing input file %r" % in_file) |
| 35 | 78 if not out_file: |
| 36 if mode == "everyNth": | 79 sys_exit("Require an output filename") |
| 37 if len(sys.argv) != 6: | 80 if not options.format: |
| 38 stop_err("If using everyNth, just need argument N (integer, at least 2)") | 81 sys_exit("Require the sequence format") |
| 39 try: | 82 seq_format = options.format.lower() |
| 40 N = int(sys.argv[5]) | 83 |
| 84 | |
| 85 def count_fasta(filename): | |
| 86 from Bio.SeqIO.FastaIO import SimpleFastaParser | |
| 87 count = 0 | |
| 88 with open(filename) as handle: | |
| 89 for title, seq in SimpleFastaParser(handle): | |
| 90 count += 1 | |
| 91 return count | |
| 92 | |
| 93 | |
| 94 def count_fastq(filename): | |
| 95 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
| 96 count = 0 | |
| 97 with open(filename) as handle: | |
| 98 for title, seq, qual in FastqGeneralIterator(handle): | |
| 99 count += 1 | |
| 100 return count | |
| 101 | |
| 102 | |
| 103 def count_sff(filename): | |
| 104 from Bio import SeqIO | |
| 105 # If the SFF file has a built in index (which is normal), | |
| 106 # this will be parsed and is the quicker than scanning | |
| 107 # the whole file. | |
| 108 return len(SeqIO.index(filename, "sff")) | |
| 109 | |
| 110 | |
| 111 def count_sequences(filename, format): | |
| 112 if seq_format == "sff": | |
| 113 return count_sff(filename) | |
| 114 elif seq_format == "fasta": | |
| 115 return count_fasta(filename) | |
| 116 elif seq_format.startswith("fastq"): | |
| 117 return count_fastq(filename) | |
| 118 else: | |
| 119 sys_exit("Unsupported file type %r" % seq_format) | |
| 120 | |
| 121 | |
| 122 if options.percent and options.everyn: | |
| 123 sys_exit("Cannot combine -p and -n options") | |
| 124 elif options.everyn and options.count: | |
| 125 sys_exit("Cannot combine -p and -c options") | |
| 126 elif options.percent and options.count: | |
| 127 sys_exit("Cannot combine -n and -c options") | |
| 128 elif options.everyn: | |
| 129 try: | |
| 130 N = int(options.everyn) | |
| 41 except: | 131 except: |
| 42 stop_err("Bad N argument %r" % sys.argv[5]) | 132 sys_exit("Bad -n argument %r" % options.everyn) |
| 43 if N < 2: | 133 if N < 2: |
| 44 stop_err("Bad N argument %r" % sys.argv[5]) | 134 sys_exit("Bad -n argument %r" % options.everyn) |
| 45 if (N % 10) == 1: | 135 if (N % 10) == 1: |
| 46 sys.stderr.write("Sampling every %ist sequence\n" % N) | 136 sys.stderr.write("Sampling every %ist sequence\n" % N) |
| 47 elif (N % 10) == 2: | 137 elif (N % 10) == 2: |
| 48 sys.stderr.write("Sampling every %ind sequence\n" % N) | 138 sys.stderr.write("Sampling every %ind sequence\n" % N) |
| 49 elif (N % 10) == 3: | 139 elif (N % 10) == 3: |
| 55 count = 0 | 145 count = 0 |
| 56 for record in iterator: | 146 for record in iterator: |
| 57 count += 1 | 147 count += 1 |
| 58 if count % N == 1: | 148 if count % N == 1: |
| 59 yield record | 149 yield record |
| 60 elif mode == "percentage": | 150 elif options.percent: |
| 61 if len(sys.argv) != 6: | 151 try: |
| 62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)") | 152 percent = float(options.percent) / 100.0 |
| 63 try: | |
| 64 percent = float(sys.argv[5]) / 100.0 | |
| 65 except: | 153 except: |
| 66 stop_err("Bad percent argument %r" % sys.argv[5]) | 154 sys_exit("Bad -p percent argument %r" % options.percent) |
| 67 if percent <= 0.0 or 1.0 <= percent: | 155 if percent <= 0.0 or 1.0 <= percent: |
| 68 stop_err("Bad percent argument %r" % sys.argv[5]) | 156 sys_exit("Bad -p percent argument %r" % options.percent) |
| 69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) | 157 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) |
| 70 def sampler(iterator): | 158 def sampler(iterator): |
| 71 global percent | 159 global percent |
| 72 count = 0 | 160 count = 0 |
| 73 taken = 0 | 161 taken = 0 |
| 74 for record in iterator: | 162 for record in iterator: |
| 75 count += 1 | 163 count += 1 |
| 76 if percent * count > taken: | 164 if percent * count > taken: |
| 77 taken += 1 | 165 taken += 1 |
| 78 yield record | 166 yield record |
| 167 elif options.count: | |
| 168 try: | |
| 169 N = int(options.count) | |
| 170 except: | |
| 171 sys_exit("Bad -c count argument %r" % options.count) | |
| 172 if N < 1: | |
| 173 sys_exit("Bad -c count argument %r" % options.count) | |
| 174 total = count_sequences(in_file, seq_format) | |
| 175 print("Input file has %i sequences" % total) | |
| 176 if interleaved: | |
| 177 # Paired | |
| 178 if total % 2: | |
| 179 sys_exit("Paired mode, but input file has an odd number of sequences: %i" | |
| 180 % total) | |
| 181 elif N > total // 2: | |
| 182 sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." | |
| 183 % (N, total // 2, total)) | |
| 184 total = total // 2 | |
| 185 if N == 1: | |
| 186 sys.stderr.write("Sampling just first sequence pair!\n") | |
| 187 elif N == total: | |
| 188 sys.stderr.write("Taking all the sequence pairs\n") | |
| 189 else: | |
| 190 sys.stderr.write("Sampling %i sequence pairs\n" % N) | |
| 191 else: | |
| 192 # Not paired | |
| 193 if total < N: | |
| 194 sys_exit("Requested %i sequences, but file only has %i." % (N, total)) | |
| 195 if N == 1: | |
| 196 sys.stderr.write("Sampling just first sequence!\n") | |
| 197 elif N == total: | |
| 198 sys.stderr.write("Taking all the sequences\n") | |
| 199 else: | |
| 200 sys.stderr.write("Sampling %i sequences\n" % N) | |
| 201 if N == total: | |
| 202 def sampler(iterator): | |
| 203 """Dummy filter to filter nothing, taking everything.""" | |
| 204 global N | |
| 205 taken = 0 | |
| 206 for record in iterator: | |
| 207 taken += 1 | |
| 208 yield record | |
| 209 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
| 210 else: | |
| 211 def sampler(iterator): | |
| 212 # Mimic the percentage sampler, with double check on final count | |
| 213 global N, total | |
| 214 # Do we need a floating point fudge factor epsilon? | |
| 215 # i.e. What if percentage comes out slighty too low, and | |
| 216 # we could end up missing last few desired sequences? | |
| 217 percentage = float(N) / float(total) | |
| 218 #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" | |
| 219 # % (N, total, percentage * 100.0)) | |
| 220 count = 0 | |
| 221 taken = 0 | |
| 222 for record in iterator: | |
| 223 count += 1 | |
| 224 # Do we need the extra upper bound? | |
| 225 if percentage * count > taken and taken < N: | |
| 226 taken += 1 | |
| 227 yield record | |
| 228 elif total - count + 1 <= N - taken: | |
| 229 # remaining records (incuding this one) <= what we still need. | |
| 230 # This is a safey check for floating point edge cases where | |
| 231 # we need to take all remaining sequences to meet target | |
| 232 taken += 1 | |
| 233 yield record | |
| 234 assert taken == N, "Picked %i, wanted %i" % (taken, N) | |
| 79 else: | 235 else: |
| 80 stop_err("Unsupported mode %r" % mode) | 236 sys_exit("Must use either -n, -p or -c") |
| 237 | |
| 238 | |
| 239 def pair(iterator): | |
| 240 """Quick and dirty pair batched iterator.""" | |
| 241 while True: | |
| 242 a = next(iterator) | |
| 243 b = next(iterator) | |
| 244 if not b: | |
| 245 assert not a, "Odd number of records?" | |
| 246 break | |
| 247 yield (a, b) | |
| 248 | |
| 81 | 249 |
| 82 def raw_fasta_iterator(handle): | 250 def raw_fasta_iterator(handle): |
| 83 """Yields raw FASTA records as multi-line strings.""" | 251 """Yields raw FASTA records as multi-line strings.""" |
| 84 while True: | 252 while True: |
| 85 line = handle.readline() | 253 line = handle.readline() |
| 111 line = handle.readline() | 279 line = handle.readline() |
| 112 yield "".join(lines) | 280 yield "".join(lines) |
| 113 if not line: | 281 if not line: |
| 114 return # StopIteration | 282 return # StopIteration |
| 115 | 283 |
| 116 def fasta_filter(in_file, out_file, iterator_filter): | 284 def fasta_filter(in_file, out_file, iterator_filter, inter): |
| 117 count = 0 | 285 count = 0 |
| 118 #Galaxy now requires Python 2.5+ so can use with statements, | 286 #Galaxy now requires Python 2.5+ so can use with statements, |
| 119 with open(in_file) as in_handle: | 287 with open(in_file) as in_handle: |
| 120 with open(out_file, "w") as pos_handle: | 288 with open(out_file, "w") as pos_handle: |
| 121 for record in iterator_filter(raw_fasta_iterator(in_handle)): | 289 if inter: |
| 122 count += 1 | 290 for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))): |
| 123 pos_handle.write(record) | 291 count += 1 |
| 124 return count | 292 pos_handle.write(r1) |
| 125 | 293 pos_handle.write(r2) |
| 126 try: | 294 else: |
| 127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 295 for record in iterator_filter(raw_fasta_iterator(in_handle)): |
| 128 def fastq_filter(in_file, out_file, iterator_filter): | 296 count += 1 |
| 129 count = 0 | 297 pos_handle.write(record) |
| 130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 298 return count |
| 131 reader = fastqReader(open(in_file, "rU")) | 299 |
| 132 writer = fastqWriter(open(out_file, "w")) | 300 |
| 133 for record in iterator_filter(reader): | 301 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
| 134 count += 1 | 302 def fastq_filter(in_file, out_file, iterator_filter, inter): |
| 135 writer.write(record) | 303 count = 0 |
| 136 writer.close() | 304 with open(in_file) as in_handle: |
| 137 reader.close() | 305 with open(out_file, "w") as pos_handle: |
| 138 return count | 306 if inter: |
| 139 except ImportError: | 307 for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))): |
| 140 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 308 count += 1 |
| 141 def fastq_filter(in_file, out_file, iterator_filter): | 309 pos_handle.write("@%s\n%s\n+\n%s\n" % r1) |
| 142 count = 0 | 310 pos_handle.write("@%s\n%s\n+\n%s\n" % r2) |
| 143 with open(in_file) as in_handle: | 311 else: |
| 144 with open(out_file, "w") as pos_handle: | |
| 145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): | 312 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): |
| 146 count += 1 | 313 count += 1 |
| 147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 314 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
| 148 return count | 315 return count |
| 149 | 316 |
| 150 def sff_filter(in_file, out_file, iterator_filter): | 317 |
| 318 def sff_filter(in_file, out_file, iterator_filter, inter): | |
| 151 count = 0 | 319 count = 0 |
| 152 try: | 320 try: |
| 153 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 321 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
| 154 except ImportError: | 322 except ImportError: |
| 155 stop_err("SFF filtering requires Biopython 1.54 or later") | 323 sys_exit("SFF filtering requires Biopython 1.54 or later") |
| 156 try: | 324 try: |
| 157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 325 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
| 158 except ImportError: | 326 except ImportError: |
| 159 #Prior to Biopython 1.56 this was a private function | 327 #Prior to Biopython 1.56 this was a private function |
| 160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 328 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
| 165 manifest = None | 333 manifest = None |
| 166 in_handle.seek(0) | 334 in_handle.seek(0) |
| 167 with open(out_file, "wb") as out_handle: | 335 with open(out_file, "wb") as out_handle: |
| 168 writer = SffWriter(out_handle, xml=manifest) | 336 writer = SffWriter(out_handle, xml=manifest) |
| 169 in_handle.seek(0) #start again after getting manifest | 337 in_handle.seek(0) #start again after getting manifest |
| 170 count = writer.write_file(iterator_filter(SffIterator(in_handle))) | 338 if inter: |
| 171 #count = writer.write_file(SffIterator(in_handle)) | 339 from itertools import chain |
| 172 return count | 340 count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) |
| 173 | 341 assert count % 2 == 0, "Odd number of records? %i" % count |
| 174 if seq_format.lower()=="sff": | 342 count /= 2 |
| 175 count = sff_filter(in_file, out_file, sampler) | 343 else: |
| 176 elif seq_format.lower()=="fasta": | 344 count = writer.write_file(iterator_filter(SffIterator(in_handle))) |
| 177 count = fasta_filter(in_file, out_file, sampler) | 345 #count = writer.write_file(SffIterator(in_handle)) |
| 178 elif seq_format.lower().startswith("fastq"): | 346 return count |
| 179 count = fastq_filter(in_file, out_file, sampler) | 347 |
| 348 if seq_format == "sff": | |
| 349 count = sff_filter(in_file, out_file, sampler, interleaved) | |
| 350 elif seq_format == "fasta": | |
| 351 count = fasta_filter(in_file, out_file, sampler, interleaved) | |
| 352 elif seq_format.startswith("fastq"): | |
| 353 count = fastq_filter(in_file, out_file, sampler, interleaved) | |
| 180 else: | 354 else: |
| 181 stop_err("Unsupported file type %r" % seq_format) | 355 sys_exit("Unsupported file type %r" % seq_format) |
| 182 | 356 |
| 183 sys.stderr.write("Sampled %i records\n" % count) | 357 if interleaved: |
| 358 sys.stderr.write("Selected %i pairs\n" % count) | |
| 359 else: | |
| 360 sys.stderr.write("Selected %i records\n" % count) |
