Mercurial > repos > peterjc > seq_filter_by_mapping
comparison tools/seq_filter_by_mapping/seq_filter_by_mapping.py @ 2:48e71dfd51b3 draft
v0.0.5 Depend on Biopython 1.67 from Tool Shed or (Bio)conda
author | peterjc |
---|---|
date | Wed, 10 May 2017 13:16:44 -0400 |
parents | 8ff0ac66f1a3 |
children | 481b0a925e66 |
comparison
equal
deleted
inserted
replaced
1:8ff0ac66f1a3 | 2:48e71dfd51b3 |
---|---|
16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | 16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
17 See accompanying text file for licence details (MIT license). | 17 See accompanying text file for licence details (MIT license). |
18 | 18 |
19 Use -v or --version to get the version, -h or --help for help. | 19 Use -v or --version to get the version, -h or --help for help. |
20 """ | 20 """ |
21 | |
21 import os | 22 import os |
22 import sys | |
23 import re | 23 import re |
24 import subprocess | 24 import subprocess |
25 import sys | |
26 | |
25 from optparse import OptionParser | 27 from optparse import OptionParser |
26 | 28 |
27 def sys_exit(msg, err=1): | 29 # Parse Command Line |
28 sys.stderr.write(msg.rstrip() + "\n") | |
29 sys.exit(err) | |
30 | |
31 #Parse Command Line | |
32 usage = """Use as follows: | 30 usage = """Use as follows: |
33 | 31 |
34 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] | 32 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] |
35 | 33 |
36 e.g. Positive matches using column one from a single BAM file: | 34 e.g. Positive matches using column one from a single BAM file: |
62 help="Show version and quit") | 60 help="Show version and quit") |
63 | 61 |
64 options, args = parser.parse_args() | 62 options, args = parser.parse_args() |
65 | 63 |
66 if options.version: | 64 if options.version: |
67 print "v0.0.3" | 65 print "v0.0.5" |
68 sys.exit(0) | 66 sys.exit(0) |
69 | 67 |
70 in_file = options.input | 68 in_file = options.input |
71 seq_format = options.format | 69 seq_format = options.format |
72 out_positive_file = options.output_positive | 70 out_positive_file = options.output_positive |
73 out_negative_file = options.output_negative | 71 out_negative_file = options.output_negative |
74 pair_mode = options.pair_mode | 72 pair_mode = options.pair_mode |
75 | 73 |
76 if in_file is None or not os.path.isfile(in_file): | 74 if in_file is None or not os.path.isfile(in_file): |
77 sys_exit("Missing input file: %r" % in_file) | 75 sys.exit("Missing input file: %r" % in_file) |
78 if out_positive_file is None and out_negative_file is None: | 76 if out_positive_file is None and out_negative_file is None: |
79 sys_exit("Neither output file requested") | 77 sys.exit("Neither output file requested") |
80 if seq_format is None: | 78 if seq_format is None: |
81 sys_exit("Missing sequence format") | 79 sys.exit("Missing sequence format") |
82 if pair_mode not in ["lax", "strict"]: | 80 if pair_mode not in ["lax", "strict"]: |
83 sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) | 81 sys.exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) |
84 for mapping in args: | 82 for mapping in args: |
85 if not os.path.isfile(mapping): | 83 if not os.path.isfile(mapping): |
86 sys_exit("Mapping file %r not found" % mapping) | 84 sys.exit("Mapping file %r not found" % mapping) |
87 if not args: | 85 if not args: |
88 sys_exit("At least one SAM/BAM mapping file is required") | 86 sys.exit("At least one SAM/BAM mapping file is required") |
89 | 87 |
90 | 88 |
91 #Cope with three widely used suffix naming convensions, | 89 # Cope with three widely used suffix naming convensions, |
92 #Illumina: /1 or /2 | 90 # Illumina: /1 or /2 |
93 #Forward/revered: .f or .r | 91 # Forward/revered: .f or .r |
94 #Sanger, e.g. .p1k and .q1k | 92 # Sanger, e.g. .p1k and .q1k |
95 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | 93 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html |
96 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | 94 # re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") |
97 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | 95 # re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") |
98 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") | 96 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") |
99 assert re_suffix.search("demo.f") | 97 assert re_suffix.search("demo.f") |
100 assert re_suffix.search("demo.s1") | 98 assert re_suffix.search("demo.s1") |
101 assert re_suffix.search("demo.f1k") | 99 assert re_suffix.search("demo.f1k") |
102 assert re_suffix.search("demo.p1") | 100 assert re_suffix.search("demo.p1") |
105 assert re_suffix.search("demo/2") | 103 assert re_suffix.search("demo/2") |
106 assert re_suffix.search("demo.r") | 104 assert re_suffix.search("demo.r") |
107 assert re_suffix.search("demo.q1") | 105 assert re_suffix.search("demo.q1") |
108 assert re_suffix.search("demo.q1lk") | 106 assert re_suffix.search("demo.q1lk") |
109 | 107 |
108 | |
110 def clean_name(name): | 109 def clean_name(name): |
111 """Remove suffix.""" | 110 """Remove suffix.""" |
112 match = re_suffix.search(name) | 111 match = re_suffix.search(name) |
113 if match: | 112 if match: |
114 # Use the fact this is a suffix, and regular expression will be | 113 # Use the fact this is a suffix, and regular expression will be |
115 # anchored to the end of the name: | 114 # anchored to the end of the name: |
116 return name[:match.start()] | 115 return name[:match.start()] |
117 else: | 116 else: |
118 # Nothing to do | 117 # Nothing to do |
119 return name | 118 return name |
119 | |
120 | |
120 assert clean_name("foo/1") == "foo" | 121 assert clean_name("foo/1") == "foo" |
121 assert clean_name("foo/2") == "foo" | 122 assert clean_name("foo/2") == "foo" |
122 assert clean_name("bar.f") == "bar" | 123 assert clean_name("bar.f") == "bar" |
123 assert clean_name("bar.r") == "bar" | 124 assert clean_name("bar.r") == "bar" |
124 assert clean_name("baz.p1") == "baz" | 125 assert clean_name("baz.p1") == "baz" |
125 assert clean_name("baz.q2") == "baz" | 126 assert clean_name("baz.q2") == "baz" |
126 | 127 |
127 mapped_chars = { '>' :'__gt__', | 128 mapped_chars = { |
128 '<' :'__lt__', | 129 '>': '__gt__', |
129 "'" :'__sq__', | 130 '<': '__lt__', |
130 '"' :'__dq__', | 131 "'": '__sq__', |
131 '[' :'__ob__', | 132 '"': '__dq__', |
132 ']' :'__cb__', | 133 '[': '__ob__', |
133 '{' :'__oc__', | 134 ']': '__cb__', |
134 '}' :'__cc__', | 135 '{': '__oc__', |
135 '@' : '__at__', | 136 '}': '__cc__', |
136 '\n' : '__cn__', | 137 '@': '__at__', |
137 '\r' : '__cr__', | 138 '\n': '__cn__', |
138 '\t' : '__tc__', | 139 '\r': '__cr__', |
139 '#' : '__pd__' | 140 '\t': '__tc__', |
140 } | 141 '#': '__pd__', |
142 } | |
143 | |
141 | 144 |
142 def load_mapping_ids(filename, pair_mode, ids): | 145 def load_mapping_ids(filename, pair_mode, ids): |
143 """Parse SAM/BAM file, updating given set of ids. | 146 """Parse SAM/BAM file, updating given set of ids. |
144 | 147 |
145 Parses BAM files via call out to samtools view command. | 148 Parses BAM files via call out to samtools view command. |
187 stdout, stderr = child.communicate() | 190 stdout, stderr = child.communicate() |
188 assert child.returncode is not None | 191 assert child.returncode is not None |
189 if child.returncode: | 192 if child.returncode: |
190 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, | 193 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, |
191 filename, stderr) | 194 filename, stderr) |
192 sys_exit(msg.strip(), child.returncode) | 195 sys.exit(msg.strip(), child.returncode) |
193 else: | 196 else: |
194 handle.close() | 197 handle.close() |
195 | 198 |
196 | 199 |
197 # Read mapping file(s) and record all mapped identifiers | 200 # Read mapping file(s) and record all mapped identifiers |
202 # more than just qname (need /1 or /2 indicator) | 205 # more than just qname (need /1 or /2 indicator) |
203 print("Loaded %i mapped IDs" % (len(ids))) | 206 print("Loaded %i mapped IDs" % (len(ids))) |
204 if len(ids) < 10: | 207 if len(ids) < 10: |
205 print("Looking for %s" % ", ".join(sorted(ids))) | 208 print("Looking for %s" % ", ".join(sorted(ids))) |
206 | 209 |
210 | |
207 def crude_fasta_iterator(handle): | 211 def crude_fasta_iterator(handle): |
208 """Yields tuples, record ID and the full record as a string.""" | 212 """Yields tuples, record ID and the full record as a string.""" |
209 while True: | 213 while True: |
210 line = handle.readline() | 214 line = handle.readline() |
211 if line == "": | 215 if line == "": |
212 return # Premature end of file, or just empty? | 216 return # Premature end of file, or just empty? |
213 if line[0] == ">": | 217 if line[0] == ">": |
214 break | 218 break |
215 | 219 |
216 no_id_warned = False | 220 no_id_warned = False |
217 while True: | 221 while True: |
234 break | 238 break |
235 lines.append(line) | 239 lines.append(line) |
236 line = handle.readline() | 240 line = handle.readline() |
237 yield id, "".join(lines) | 241 yield id, "".join(lines) |
238 if not line: | 242 if not line: |
239 return # StopIteration | 243 return # StopIteration |
240 | 244 |
241 | 245 |
242 def fasta_filter(in_file, pos_file, neg_file, wanted): | 246 def fasta_filter(in_file, pos_file, neg_file, wanted): |
243 """FASTA filter producing 60 character line wrapped outout.""" | 247 """FASTA filter producing 60 character line wrapped outout.""" |
244 pos_count = neg_count = 0 | 248 pos_count = neg_count = 0 |
245 #Galaxy now requires Python 2.5+ so can use with statements, | 249 # Galaxy now requires Python 2.5+ so can use with statements, |
246 with open(in_file) as in_handle: | 250 with open(in_file) as in_handle: |
247 #Doing the if statement outside the loop for speed | 251 # Doing the if statement outside the loop for speed |
248 #(with the downside of three very similar loops). | 252 # (with the downside of three very similar loops). |
249 if pos_file is not None and neg_file is not None: | 253 if pos_file is not None and neg_file is not None: |
250 print "Generating two FASTA files" | 254 print "Generating two FASTA files" |
251 with open(pos_file, "w") as pos_handle: | 255 with open(pos_file, "w") as pos_handle: |
252 with open(neg_file, "w") as neg_handle: | 256 with open(neg_file, "w") as neg_handle: |
253 for identifier, record in crude_fasta_iterator(in_handle): | 257 for identifier, record in crude_fasta_iterator(in_handle): |
282 def fastq_filter(in_file, pos_file, neg_file, wanted): | 286 def fastq_filter(in_file, pos_file, neg_file, wanted): |
283 """FASTQ filter.""" | 287 """FASTQ filter.""" |
284 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 288 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
285 pos_count = neg_count = 0 | 289 pos_count = neg_count = 0 |
286 handle = open(in_file, "r") | 290 handle = open(in_file, "r") |
287 if out_positive_file is not None and out_negative_file is not None: | 291 if pos_file is not None and neg_file is not None: |
288 print "Generating two FASTQ files" | 292 print "Generating two FASTQ files" |
289 positive_handle = open(out_positive_file, "w") | 293 positive_handle = open(pos_file, "w") |
290 negative_handle = open(out_negative_file, "w") | 294 negative_handle = open(neg_file, "w") |
291 print in_file | 295 print in_file |
292 for title, seq, qual in FastqGeneralIterator(handle): | 296 for title, seq, qual in FastqGeneralIterator(handle): |
293 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) | 297 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) |
294 if clean_name(title.split(None, 1)[0]) in ids: | 298 if clean_name(title.split(None, 1)[0]) in wanted: |
295 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 299 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
296 pos_count += 1 | 300 pos_count += 1 |
297 else: | 301 else: |
298 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 302 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
299 neg_count += 1 | 303 neg_count += 1 |
300 positive_handle.close() | 304 positive_handle.close() |
301 negative_handle.close() | 305 negative_handle.close() |
302 elif out_positive_file is not None: | 306 elif pos_file is not None: |
303 print "Generating matching FASTQ file" | 307 print "Generating matching FASTQ file" |
304 positive_handle = open(out_positive_file, "w") | 308 positive_handle = open(pos_file, "w") |
305 for title, seq, qual in FastqGeneralIterator(handle): | 309 for title, seq, qual in FastqGeneralIterator(handle): |
306 if clean_name(title.split(None, 1)[0]) in ids: | 310 if clean_name(title.split(None, 1)[0]) in wanted: |
307 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 311 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
308 pos_count += 1 | 312 pos_count += 1 |
309 else: | 313 else: |
310 neg_count += 1 | 314 neg_count += 1 |
311 positive_handle.close() | 315 positive_handle.close() |
312 elif out_negative_file is not None: | 316 elif neg_file is not None: |
313 print "Generating non-matching FASTQ file" | 317 print "Generating non-matching FASTQ file" |
314 negative_handle = open(out_negative_file, "w") | 318 negative_handle = open(neg_file, "w") |
315 for title, seq, qual in FastqGeneralIterator(handle): | 319 for title, seq, qual in FastqGeneralIterator(handle): |
316 if clean_name(title.split(None, 1)[0]) in ids: | 320 if clean_name(title.split(None, 1)[0]) in wanted: |
317 pos_count += 1 | 321 pos_count += 1 |
318 else: | 322 else: |
319 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | 323 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) |
320 neg_count += 1 | 324 neg_count += 1 |
321 negative_handle.close() | 325 negative_handle.close() |
326 def sff_filter(in_file, pos_file, neg_file, wanted): | 330 def sff_filter(in_file, pos_file, neg_file, wanted): |
327 """SFF filter.""" | 331 """SFF filter.""" |
328 try: | 332 try: |
329 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 333 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
330 except ImportError: | 334 except ImportError: |
331 sys_exit("SFF filtering requires Biopython 1.54 or later") | 335 sys.exit("SFF filtering requires Biopython 1.54 or later") |
332 | 336 |
333 try: | 337 try: |
334 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 338 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
335 except ImportError: | 339 except ImportError: |
336 #Prior to Biopython 1.56 this was a private function | 340 # Prior to Biopython 1.56 this was a private function |
337 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 341 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
338 | 342 |
339 in_handle = open(in_file, "rb") #must be binary mode! | 343 in_handle = open(in_file, "rb") # must be binary mode! |
340 try: | 344 try: |
341 manifest = ReadRocheXmlManifest(in_handle) | 345 manifest = ReadRocheXmlManifest(in_handle) |
342 except ValueError: | 346 except ValueError: |
343 manifest = None | 347 manifest = None |
344 | 348 |
345 #This makes two passes though the SFF file with isn't so efficient, | 349 # This makes two passes though the SFF file with isn't so efficient, |
346 #but this makes the code simple. | 350 # but this makes the code simple. |
347 pos_count = neg_count = 0 | 351 pos_count = neg_count = 0 |
348 if out_positive_file is not None: | 352 if pos_file is not None: |
349 out_handle = open(out_positive_file, "wb") | 353 out_handle = open(pos_file, "wb") |
350 writer = SffWriter(out_handle, xml=manifest) | 354 writer = SffWriter(out_handle, xml=manifest) |
351 in_handle.seek(0) #start again after getting manifest | 355 in_handle.seek(0) # start again after getting manifest |
352 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) | 356 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) |
353 out_handle.close() | 357 out_handle.close() |
354 if out_negative_file is not None: | 358 if neg_file is not None: |
355 out_handle = open(out_negative_file, "wb") | 359 out_handle = open(neg_file, "wb") |
356 writer = SffWriter(out_handle, xml=manifest) | 360 writer = SffWriter(out_handle, xml=manifest) |
357 in_handle.seek(0) #start again | 361 in_handle.seek(0) # start again |
358 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) | 362 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) |
359 out_handle.close() | 363 out_handle.close() |
360 #And we're done | 364 # And we're done |
361 in_handle.close() | 365 in_handle.close() |
362 return pos_count, neg_count | 366 return pos_count, neg_count |
363 | 367 |
364 | 368 |
365 if seq_format.lower()=="sff": | 369 if seq_format.lower() == "sff": |
366 sequence_filter = sff_filter | 370 sequence_filter = sff_filter |
367 elif seq_format.lower()=="fasta": | 371 elif seq_format.lower() == "fasta": |
368 sequence_filter = fasta_filter | 372 sequence_filter = fasta_filter |
369 elif seq_format.lower().startswith("fastq"): | 373 elif seq_format.lower().startswith("fastq"): |
370 sequence_filter = fastq_filter | 374 sequence_filter = fastq_filter |
371 else: | 375 else: |
372 sys_exit("Unsupported file type %r" % seq_format) | 376 sys.exit("Unsupported file type %r" % seq_format) |
373 | 377 |
374 pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) | 378 pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) |
375 print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) | 379 print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) |
376 fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) | 380 fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) |
377 print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction)) | 381 print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction)) |