comparison tools/seq_filter_by_mapping/seq_filter_by_mapping.py @ 2:48e71dfd51b3 draft

v0.0.5 Depend on Biopython 1.67 from Tool Shed or (Bio)conda
author peterjc
date Wed, 10 May 2017 13:16:44 -0400
parents 8ff0ac66f1a3
children 481b0a925e66
comparison
equal deleted inserted replaced
1:8ff0ac66f1a3 2:48e71dfd51b3
16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. 16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
17 See accompanying text file for licence details (MIT license). 17 See accompanying text file for licence details (MIT license).
18 18
19 Use -v or --version to get the version, -h or --help for help. 19 Use -v or --version to get the version, -h or --help for help.
20 """ 20 """
21
21 import os 22 import os
22 import sys
23 import re 23 import re
24 import subprocess 24 import subprocess
25 import sys
26
25 from optparse import OptionParser 27 from optparse import OptionParser
26 28
27 def sys_exit(msg, err=1): 29 # Parse Command Line
28 sys.stderr.write(msg.rstrip() + "\n")
29 sys.exit(err)
30
31 #Parse Command Line
32 usage = """Use as follows: 30 usage = """Use as follows:
33 31
34 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] 32 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
35 33
36 e.g. Positive matches using column one from a single BAM file: 34 e.g. Positive matches using column one from a single BAM file:
62 help="Show version and quit") 60 help="Show version and quit")
63 61
64 options, args = parser.parse_args() 62 options, args = parser.parse_args()
65 63
66 if options.version: 64 if options.version:
67 print "v0.0.3" 65 print "v0.0.5"
68 sys.exit(0) 66 sys.exit(0)
69 67
70 in_file = options.input 68 in_file = options.input
71 seq_format = options.format 69 seq_format = options.format
72 out_positive_file = options.output_positive 70 out_positive_file = options.output_positive
73 out_negative_file = options.output_negative 71 out_negative_file = options.output_negative
74 pair_mode = options.pair_mode 72 pair_mode = options.pair_mode
75 73
76 if in_file is None or not os.path.isfile(in_file): 74 if in_file is None or not os.path.isfile(in_file):
77 sys_exit("Missing input file: %r" % in_file) 75 sys.exit("Missing input file: %r" % in_file)
78 if out_positive_file is None and out_negative_file is None: 76 if out_positive_file is None and out_negative_file is None:
79 sys_exit("Neither output file requested") 77 sys.exit("Neither output file requested")
80 if seq_format is None: 78 if seq_format is None:
81 sys_exit("Missing sequence format") 79 sys.exit("Missing sequence format")
82 if pair_mode not in ["lax", "strict"]: 80 if pair_mode not in ["lax", "strict"]:
83 sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) 81 sys.exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
84 for mapping in args: 82 for mapping in args:
85 if not os.path.isfile(mapping): 83 if not os.path.isfile(mapping):
86 sys_exit("Mapping file %r not found" % mapping) 84 sys.exit("Mapping file %r not found" % mapping)
87 if not args: 85 if not args:
88 sys_exit("At least one SAM/BAM mapping file is required") 86 sys.exit("At least one SAM/BAM mapping file is required")
89 87
90 88
91 #Cope with three widely used suffix naming convensions, 89 # Cope with three widely used suffix naming convensions,
92 #Illumina: /1 or /2 90 # Illumina: /1 or /2
93 #Forward/revered: .f or .r 91 # Forward/revered: .f or .r
94 #Sanger, e.g. .p1k and .q1k 92 # Sanger, e.g. .p1k and .q1k
95 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html 93 # See http://staden.sourceforge.net/manual/pregap4_unix_50.html
96 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") 94 # re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
97 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") 95 # re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
98 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") 96 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
99 assert re_suffix.search("demo.f") 97 assert re_suffix.search("demo.f")
100 assert re_suffix.search("demo.s1") 98 assert re_suffix.search("demo.s1")
101 assert re_suffix.search("demo.f1k") 99 assert re_suffix.search("demo.f1k")
102 assert re_suffix.search("demo.p1") 100 assert re_suffix.search("demo.p1")
105 assert re_suffix.search("demo/2") 103 assert re_suffix.search("demo/2")
106 assert re_suffix.search("demo.r") 104 assert re_suffix.search("demo.r")
107 assert re_suffix.search("demo.q1") 105 assert re_suffix.search("demo.q1")
108 assert re_suffix.search("demo.q1lk") 106 assert re_suffix.search("demo.q1lk")
109 107
108
110 def clean_name(name): 109 def clean_name(name):
111 """Remove suffix.""" 110 """Remove suffix."""
112 match = re_suffix.search(name) 111 match = re_suffix.search(name)
113 if match: 112 if match:
114 # Use the fact this is a suffix, and regular expression will be 113 # Use the fact this is a suffix, and regular expression will be
115 # anchored to the end of the name: 114 # anchored to the end of the name:
116 return name[:match.start()] 115 return name[:match.start()]
117 else: 116 else:
118 # Nothing to do 117 # Nothing to do
119 return name 118 return name
119
120
120 assert clean_name("foo/1") == "foo" 121 assert clean_name("foo/1") == "foo"
121 assert clean_name("foo/2") == "foo" 122 assert clean_name("foo/2") == "foo"
122 assert clean_name("bar.f") == "bar" 123 assert clean_name("bar.f") == "bar"
123 assert clean_name("bar.r") == "bar" 124 assert clean_name("bar.r") == "bar"
124 assert clean_name("baz.p1") == "baz" 125 assert clean_name("baz.p1") == "baz"
125 assert clean_name("baz.q2") == "baz" 126 assert clean_name("baz.q2") == "baz"
126 127
127 mapped_chars = { '>' :'__gt__', 128 mapped_chars = {
128 '<' :'__lt__', 129 '>': '__gt__',
129 "'" :'__sq__', 130 '<': '__lt__',
130 '"' :'__dq__', 131 "'": '__sq__',
131 '[' :'__ob__', 132 '"': '__dq__',
132 ']' :'__cb__', 133 '[': '__ob__',
133 '{' :'__oc__', 134 ']': '__cb__',
134 '}' :'__cc__', 135 '{': '__oc__',
135 '@' : '__at__', 136 '}': '__cc__',
136 '\n' : '__cn__', 137 '@': '__at__',
137 '\r' : '__cr__', 138 '\n': '__cn__',
138 '\t' : '__tc__', 139 '\r': '__cr__',
139 '#' : '__pd__' 140 '\t': '__tc__',
140 } 141 '#': '__pd__',
142 }
143
141 144
142 def load_mapping_ids(filename, pair_mode, ids): 145 def load_mapping_ids(filename, pair_mode, ids):
143 """Parse SAM/BAM file, updating given set of ids. 146 """Parse SAM/BAM file, updating given set of ids.
144 147
145 Parses BAM files via call out to samtools view command. 148 Parses BAM files via call out to samtools view command.
187 stdout, stderr = child.communicate() 190 stdout, stderr = child.communicate()
188 assert child.returncode is not None 191 assert child.returncode is not None
189 if child.returncode: 192 if child.returncode:
190 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, 193 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
191 filename, stderr) 194 filename, stderr)
192 sys_exit(msg.strip(), child.returncode) 195 sys.exit(msg.strip(), child.returncode)
193 else: 196 else:
194 handle.close() 197 handle.close()
195 198
196 199
197 # Read mapping file(s) and record all mapped identifiers 200 # Read mapping file(s) and record all mapped identifiers
202 # more than just qname (need /1 or /2 indicator) 205 # more than just qname (need /1 or /2 indicator)
203 print("Loaded %i mapped IDs" % (len(ids))) 206 print("Loaded %i mapped IDs" % (len(ids)))
204 if len(ids) < 10: 207 if len(ids) < 10:
205 print("Looking for %s" % ", ".join(sorted(ids))) 208 print("Looking for %s" % ", ".join(sorted(ids)))
206 209
210
207 def crude_fasta_iterator(handle): 211 def crude_fasta_iterator(handle):
208 """Yields tuples, record ID and the full record as a string.""" 212 """Yields tuples, record ID and the full record as a string."""
209 while True: 213 while True:
210 line = handle.readline() 214 line = handle.readline()
211 if line == "": 215 if line == "":
212 return # Premature end of file, or just empty? 216 return # Premature end of file, or just empty?
213 if line[0] == ">": 217 if line[0] == ">":
214 break 218 break
215 219
216 no_id_warned = False 220 no_id_warned = False
217 while True: 221 while True:
234 break 238 break
235 lines.append(line) 239 lines.append(line)
236 line = handle.readline() 240 line = handle.readline()
237 yield id, "".join(lines) 241 yield id, "".join(lines)
238 if not line: 242 if not line:
239 return # StopIteration 243 return # StopIteration
240 244
241 245
242 def fasta_filter(in_file, pos_file, neg_file, wanted): 246 def fasta_filter(in_file, pos_file, neg_file, wanted):
243 """FASTA filter producing 60 character line wrapped outout.""" 247 """FASTA filter producing 60 character line wrapped outout."""
244 pos_count = neg_count = 0 248 pos_count = neg_count = 0
245 #Galaxy now requires Python 2.5+ so can use with statements, 249 # Galaxy now requires Python 2.5+ so can use with statements,
246 with open(in_file) as in_handle: 250 with open(in_file) as in_handle:
247 #Doing the if statement outside the loop for speed 251 # Doing the if statement outside the loop for speed
248 #(with the downside of three very similar loops). 252 # (with the downside of three very similar loops).
249 if pos_file is not None and neg_file is not None: 253 if pos_file is not None and neg_file is not None:
250 print "Generating two FASTA files" 254 print "Generating two FASTA files"
251 with open(pos_file, "w") as pos_handle: 255 with open(pos_file, "w") as pos_handle:
252 with open(neg_file, "w") as neg_handle: 256 with open(neg_file, "w") as neg_handle:
253 for identifier, record in crude_fasta_iterator(in_handle): 257 for identifier, record in crude_fasta_iterator(in_handle):
282 def fastq_filter(in_file, pos_file, neg_file, wanted): 286 def fastq_filter(in_file, pos_file, neg_file, wanted):
283 """FASTQ filter.""" 287 """FASTQ filter."""
284 from Bio.SeqIO.QualityIO import FastqGeneralIterator 288 from Bio.SeqIO.QualityIO import FastqGeneralIterator
285 pos_count = neg_count = 0 289 pos_count = neg_count = 0
286 handle = open(in_file, "r") 290 handle = open(in_file, "r")
287 if out_positive_file is not None and out_negative_file is not None: 291 if pos_file is not None and neg_file is not None:
288 print "Generating two FASTQ files" 292 print "Generating two FASTQ files"
289 positive_handle = open(out_positive_file, "w") 293 positive_handle = open(pos_file, "w")
290 negative_handle = open(out_negative_file, "w") 294 negative_handle = open(neg_file, "w")
291 print in_file 295 print in_file
292 for title, seq, qual in FastqGeneralIterator(handle): 296 for title, seq, qual in FastqGeneralIterator(handle):
293 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) 297 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
294 if clean_name(title.split(None, 1)[0]) in ids: 298 if clean_name(title.split(None, 1)[0]) in wanted:
295 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 299 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
296 pos_count += 1 300 pos_count += 1
297 else: 301 else:
298 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 302 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
299 neg_count += 1 303 neg_count += 1
300 positive_handle.close() 304 positive_handle.close()
301 negative_handle.close() 305 negative_handle.close()
302 elif out_positive_file is not None: 306 elif pos_file is not None:
303 print "Generating matching FASTQ file" 307 print "Generating matching FASTQ file"
304 positive_handle = open(out_positive_file, "w") 308 positive_handle = open(pos_file, "w")
305 for title, seq, qual in FastqGeneralIterator(handle): 309 for title, seq, qual in FastqGeneralIterator(handle):
306 if clean_name(title.split(None, 1)[0]) in ids: 310 if clean_name(title.split(None, 1)[0]) in wanted:
307 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 311 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
308 pos_count += 1 312 pos_count += 1
309 else: 313 else:
310 neg_count += 1 314 neg_count += 1
311 positive_handle.close() 315 positive_handle.close()
312 elif out_negative_file is not None: 316 elif neg_file is not None:
313 print "Generating non-matching FASTQ file" 317 print "Generating non-matching FASTQ file"
314 negative_handle = open(out_negative_file, "w") 318 negative_handle = open(neg_file, "w")
315 for title, seq, qual in FastqGeneralIterator(handle): 319 for title, seq, qual in FastqGeneralIterator(handle):
316 if clean_name(title.split(None, 1)[0]) in ids: 320 if clean_name(title.split(None, 1)[0]) in wanted:
317 pos_count += 1 321 pos_count += 1
318 else: 322 else:
319 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 323 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
320 neg_count += 1 324 neg_count += 1
321 negative_handle.close() 325 negative_handle.close()
326 def sff_filter(in_file, pos_file, neg_file, wanted): 330 def sff_filter(in_file, pos_file, neg_file, wanted):
327 """SFF filter.""" 331 """SFF filter."""
328 try: 332 try:
329 from Bio.SeqIO.SffIO import SffIterator, SffWriter 333 from Bio.SeqIO.SffIO import SffIterator, SffWriter
330 except ImportError: 334 except ImportError:
331 sys_exit("SFF filtering requires Biopython 1.54 or later") 335 sys.exit("SFF filtering requires Biopython 1.54 or later")
332 336
333 try: 337 try:
334 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 338 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
335 except ImportError: 339 except ImportError:
336 #Prior to Biopython 1.56 this was a private function 340 # Prior to Biopython 1.56 this was a private function
337 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest 341 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
338 342
339 in_handle = open(in_file, "rb") #must be binary mode! 343 in_handle = open(in_file, "rb") # must be binary mode!
340 try: 344 try:
341 manifest = ReadRocheXmlManifest(in_handle) 345 manifest = ReadRocheXmlManifest(in_handle)
342 except ValueError: 346 except ValueError:
343 manifest = None 347 manifest = None
344 348
345 #This makes two passes though the SFF file with isn't so efficient, 349 # This makes two passes though the SFF file with isn't so efficient,
346 #but this makes the code simple. 350 # but this makes the code simple.
347 pos_count = neg_count = 0 351 pos_count = neg_count = 0
348 if out_positive_file is not None: 352 if pos_file is not None:
349 out_handle = open(out_positive_file, "wb") 353 out_handle = open(pos_file, "wb")
350 writer = SffWriter(out_handle, xml=manifest) 354 writer = SffWriter(out_handle, xml=manifest)
351 in_handle.seek(0) #start again after getting manifest 355 in_handle.seek(0) # start again after getting manifest
352 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) 356 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
353 out_handle.close() 357 out_handle.close()
354 if out_negative_file is not None: 358 if neg_file is not None:
355 out_handle = open(out_negative_file, "wb") 359 out_handle = open(neg_file, "wb")
356 writer = SffWriter(out_handle, xml=manifest) 360 writer = SffWriter(out_handle, xml=manifest)
357 in_handle.seek(0) #start again 361 in_handle.seek(0) # start again
358 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) 362 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
359 out_handle.close() 363 out_handle.close()
360 #And we're done 364 # And we're done
361 in_handle.close() 365 in_handle.close()
362 return pos_count, neg_count 366 return pos_count, neg_count
363 367
364 368
365 if seq_format.lower()=="sff": 369 if seq_format.lower() == "sff":
366 sequence_filter = sff_filter 370 sequence_filter = sff_filter
367 elif seq_format.lower()=="fasta": 371 elif seq_format.lower() == "fasta":
368 sequence_filter = fasta_filter 372 sequence_filter = fasta_filter
369 elif seq_format.lower().startswith("fastq"): 373 elif seq_format.lower().startswith("fastq"):
370 sequence_filter = fastq_filter 374 sequence_filter = fastq_filter
371 else: 375 else:
372 sys_exit("Unsupported file type %r" % seq_format) 376 sys.exit("Unsupported file type %r" % seq_format)
373 377
374 pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) 378 pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids)
375 print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) 379 print("%i mapped and %i unmapped reads." % (pos_count, neg_count))
376 fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) 380 fraction = float(pos_count) * 100.0 / float(pos_count + neg_count)
377 print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction)) 381 print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction))