0
|
1 #!/usr/bin/env python
|
|
2 """Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information.
|
|
3
|
|
4 When filtering an SFF file, any Roche XML manifest in the input file is
|
|
5 preserved in both output files.
|
|
6
|
|
7 This tool is a short Python script which requires Biopython 1.54 or later
|
|
8 for SFF file support. If you use this tool in scientific work leading to a
|
|
9 publication, please cite the Biopython application note:
|
|
10
|
|
11 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
14
|
|
15 This script is copyright 2014 by Peter Cock, The James Hutton Institute
|
|
16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
|
|
17 See accompanying text file for licence details (MIT license).
|
|
18
|
|
19 Use -v or --version to get the version, -h or --help for help.
|
|
20 """
|
|
21 import os
|
|
22 import sys
|
|
23 import re
|
|
24 import subprocess
|
|
25 from optparse import OptionParser
|
|
26
|
|
27 def sys_exit(msg, err=1):
|
|
28 sys.stderr.write(msg.rstrip() + "\n")
|
|
29 sys.exit(err)
|
|
30
|
|
31 #Parse Command Line
|
|
32 usage = """Use as follows:
|
|
33
|
|
34 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
|
|
35
|
|
36 e.g. Positive matches using column one from a single BAM file:
|
|
37
|
|
38 $ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam
|
|
39
|
|
40 Multiple SAM/BAM mapping files may be given.
|
|
41 """
|
|
42 parser = OptionParser(usage=usage)
|
|
43 parser.add_option('-i', '--input', dest='input',
|
|
44 default=None, help='Input sequences filename',
|
|
45 metavar="FILE")
|
|
46 parser.add_option('-f', '--format', dest='format',
|
|
47 default=None,
|
|
48 help='Input sequence format (e.g. fasta, fastq, sff)')
|
|
49 parser.add_option('-p', '--positive', dest='output_positive',
|
|
50 default=None,
|
|
51 help='Output filename for mapping reads',
|
|
52 metavar="FILE")
|
|
53 parser.add_option('-n', '--negative', dest='output_negative',
|
|
54 default=None,
|
|
55 help='Output filename for non-mapping reads',
|
|
56 metavar="FILE")
|
|
57 parser.add_option("-m", "--pair-mode", dest="pair_mode",
|
|
58 default="lax",
|
|
59 help="How to treat paired reads (lax or strict, default lax)")
|
|
60 parser.add_option("-v", "--version", dest="version",
|
|
61 default=False, action="store_true",
|
|
62 help="Show version and quit")
|
|
63
|
|
64 options, args = parser.parse_args()
|
|
65
|
|
66 if options.version:
|
|
67 print "v0.0.2"
|
|
68 sys.exit(0)
|
|
69
|
|
70 in_file = options.input
|
|
71 seq_format = options.format
|
|
72 out_positive_file = options.output_positive
|
|
73 out_negative_file = options.output_negative
|
|
74 pair_mode = options.pair_mode
|
|
75
|
|
76 if in_file is None or not os.path.isfile(in_file):
|
|
77 sys_exit("Missing input file: %r" % in_file)
|
|
78 if out_positive_file is None and out_negative_file is None:
|
|
79 sys_exit("Neither output file requested")
|
|
80 if seq_format is None:
|
|
81 sys_exit("Missing sequence format")
|
|
82 if pair_mode not in ["lax", "strict"]:
|
|
83 sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
|
|
84 for mapping in args:
|
|
85 if not os.path.isfile(mapping):
|
|
86 sys_exit("Mapping file %r not found" % mapping)
|
|
87 if not args:
|
|
88 sys_exit("At least one SAM/BAM mapping file is required")
|
|
89
|
|
90
|
|
91 #Cope with three widely used suffix naming convensions,
|
|
92 #Illumina: /1 or /2
|
|
93 #Forward/revered: .f or .r
|
|
94 #Sanger, e.g. .p1k and .q1k
|
|
95 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html
|
|
96 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
|
|
97 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
|
|
98 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
|
|
99 assert re_suffix.search("demo.f")
|
|
100 assert re_suffix.search("demo.s1")
|
|
101 assert re_suffix.search("demo.f1k")
|
|
102 assert re_suffix.search("demo.p1")
|
|
103 assert re_suffix.search("demo.p1k")
|
|
104 assert re_suffix.search("demo.p1lk")
|
|
105 assert re_suffix.search("demo/2")
|
|
106 assert re_suffix.search("demo.r")
|
|
107 assert re_suffix.search("demo.q1")
|
|
108 assert re_suffix.search("demo.q1lk")
|
|
109
|
|
110 def clean_name(name):
|
|
111 """Remove suffix."""
|
|
112 match = re_suffix.search(name)
|
|
113 if match:
|
|
114 # Use the fact this is a suffix, and regular expression will be
|
|
115 # anchored to the end of the name:
|
|
116 return name[:match.start()]
|
|
117 else:
|
|
118 # Nothing to do
|
|
119 return name
|
|
120 assert clean_name("foo/1") == "foo"
|
|
121 assert clean_name("foo/2") == "foo"
|
|
122 assert clean_name("bar.f") == "bar"
|
|
123 assert clean_name("bar.r") == "bar"
|
|
124 assert clean_name("baz.p1") == "baz"
|
|
125 assert clean_name("baz.q2") == "baz"
|
|
126
|
|
127 mapped_chars = { '>' :'__gt__',
|
|
128 '<' :'__lt__',
|
|
129 "'" :'__sq__',
|
|
130 '"' :'__dq__',
|
|
131 '[' :'__ob__',
|
|
132 ']' :'__cb__',
|
|
133 '{' :'__oc__',
|
|
134 '}' :'__cc__',
|
|
135 '@' : '__at__',
|
|
136 '\n' : '__cn__',
|
|
137 '\r' : '__cr__',
|
|
138 '\t' : '__tc__',
|
|
139 '#' : '__pd__'
|
|
140 }
|
|
141
|
|
142 def load_mapping_ids(filename, pair_mode, ids):
|
|
143 """Parse SAM/BAM file, updating given set of ids.
|
|
144
|
|
145 Parses BAM files via call out to samtools view command.
|
|
146 """
|
|
147 handle = open(filename, "rb")
|
|
148 magic = handle.read(4)
|
|
149 if magic == b"\x1f\x8b\x08\x04":
|
|
150 # Presumably a BAM file then...
|
|
151 handle.close()
|
|
152 # Call samtools view, don't need header so no -h added:
|
|
153 child = subprocess.Popen(["samtools", "view", filename],
|
|
154 stdin=None,
|
|
155 stdout=subprocess.PIPE,
|
|
156 stderr=subprocess.PIPE)
|
|
157 handle = child.stdout
|
|
158 else:
|
|
159 # Presumably a SAM file...
|
|
160 child = None
|
|
161 handle.seek(0)
|
|
162 # Handle should now contain SAM records
|
|
163 for line in handle:
|
|
164 # Ignore header lines
|
|
165 if line[0] != "@":
|
|
166 qname, flag, rest = line.split("\t", 2)
|
|
167 flag = int(flag)
|
|
168 if pair_mode == "lax":
|
|
169 # If either read or its partner is mapped, take it!
|
|
170 # Being lazy, since we will look at both reads
|
|
171 # can just check if (either) has 0x4 clear.
|
|
172 if not (flag & 0x4):
|
|
173 ids.add(qname)
|
|
174 elif pair_mode == "strict":
|
|
175 # For paired reads, require BOTH be mapped.
|
|
176 if (flag & 0x4):
|
|
177 # This is unmapped, ignore it
|
|
178 pass
|
|
179 elif not (flag & 0x1):
|
|
180 # Unpaired (& mapped) - take it
|
|
181 ids.add(qname)
|
|
182 elif not (flag & 0x8):
|
|
183 # Paired and partner also mapped, good
|
|
184 ids.add(qname)
|
|
185 if child:
|
|
186 # Check terminated normally.
|
|
187 stdout, stderr = child.communicate()
|
|
188 assert child.returncode is not None
|
|
189 if child.returncode:
|
|
190 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
|
|
191 filename, stderr)
|
|
192 sys_exit(msg.strip(), child.returncode)
|
|
193 else:
|
|
194 handle.close()
|
|
195
|
|
196
|
|
197 # Read mapping file(s) and record all mapped identifiers
|
|
198 ids = set()
|
|
199 for filename in args:
|
|
200 load_mapping_ids(filename, pair_mode, ids)
|
|
201 # TODO - If want to support naive paired mode, have to record
|
|
202 # more than just qname (need /1 or /2 indicator)
|
|
203 print("Loaded %i mapped IDs" % (len(ids)))
|
|
204 if len(ids) < 10:
|
|
205 print("Looking for %s" % ", ".join(sorted(ids)))
|
|
206
|
|
207 def crude_fasta_iterator(handle):
|
|
208 """Yields tuples, record ID and the full record as a string."""
|
|
209 while True:
|
|
210 line = handle.readline()
|
|
211 if line == "":
|
|
212 return # Premature end of file, or just empty?
|
|
213 if line[0] == ">":
|
|
214 break
|
|
215
|
|
216 no_id_warned = False
|
|
217 while True:
|
|
218 if line[0] != ">":
|
|
219 raise ValueError(
|
|
220 "Records in Fasta files should start with '>' character")
|
|
221 try:
|
|
222 id = line[1:].split(None, 1)[0]
|
|
223 except IndexError:
|
|
224 if not no_id_warned:
|
|
225 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
|
|
226 no_id_warned = True
|
|
227 id = None
|
|
228 lines = [line]
|
|
229 line = handle.readline()
|
|
230 while True:
|
|
231 if not line:
|
|
232 break
|
|
233 if line[0] == ">":
|
|
234 break
|
|
235 lines.append(line)
|
|
236 line = handle.readline()
|
|
237 yield id, "".join(lines)
|
|
238 if not line:
|
|
239 return # StopIteration
|
|
240
|
|
241
|
|
242 def fasta_filter(in_file, pos_file, neg_file, wanted):
|
|
243 """FASTA filter producing 60 character line wrapped outout."""
|
|
244 pos_count = neg_count = 0
|
|
245 #Galaxy now requires Python 2.5+ so can use with statements,
|
|
246 with open(in_file) as in_handle:
|
|
247 #Doing the if statement outside the loop for speed
|
|
248 #(with the downside of three very similar loops).
|
|
249 if pos_file is not None and neg_file is not None:
|
|
250 print "Generating two FASTA files"
|
|
251 with open(pos_file, "w") as pos_handle:
|
|
252 with open(neg_file, "w") as neg_handle:
|
|
253 for identifier, record in crude_fasta_iterator(in_handle):
|
|
254 if clean_name(identifier) in wanted:
|
|
255 pos_handle.write(record)
|
|
256 pos_count += 1
|
|
257 else:
|
|
258 neg_handle.write(record)
|
|
259 neg_count += 1
|
|
260 elif pos_file is not None:
|
|
261 print "Generating matching FASTA file"
|
|
262 with open(pos_file, "w") as pos_handle:
|
|
263 for identifier, record in crude_fasta_iterator(in_handle):
|
|
264 if clean_name(identifier) in wanted:
|
|
265 pos_handle.write(record)
|
|
266 pos_count += 1
|
|
267 else:
|
|
268 neg_count += 1
|
|
269 else:
|
|
270 print "Generating non-matching FASTA file"
|
|
271 assert neg_file is not None
|
|
272 with open(neg_file, "w") as neg_handle:
|
|
273 for identifier, record in crude_fasta_iterator(in_handle):
|
|
274 if clean_name(identifier) in wanted:
|
|
275 pos_count += 1
|
|
276 else:
|
|
277 neg_handle.write(record)
|
|
278 neg_count += 1
|
|
279 return pos_count, neg_count
|
|
280
|
|
281
|
|
282 def fastq_filter(in_file, pos_file, neg_file, wanted):
|
|
283 """FASTQ filter."""
|
|
284 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
285 handle = open(in_file, "r")
|
|
286 if out_positive_file is not None and out_negative_file is not None:
|
|
287 print "Generating two FASTQ files"
|
|
288 positive_handle = open(out_positive_file, "w")
|
|
289 negative_handle = open(out_negative_file, "w")
|
|
290 print in_file
|
|
291 for title, seq, qual in FastqGeneralIterator(handle):
|
|
292 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
|
|
293 if clean_name(title.split(None, 1)[0]) in ids:
|
|
294 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
|
|
295 else:
|
|
296 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
|
|
297 positive_handle.close()
|
|
298 negative_handle.close()
|
|
299 elif out_positive_file is not None:
|
|
300 print "Generating matching FASTQ file"
|
|
301 positive_handle = open(out_positive_file, "w")
|
|
302 for title, seq, qual in FastqGeneralIterator(handle):
|
|
303 if clean_name(title.split(None, 1)[0]) in ids:
|
|
304 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
|
|
305 positive_handle.close()
|
|
306 elif out_negative_file is not None:
|
|
307 print "Generating non-matching FASTQ file"
|
|
308 negative_handle = open(out_negative_file, "w")
|
|
309 for title, seq, qual in FastqGeneralIterator(handle):
|
|
310 if clean_name(title.split(None, 1)[0]) not in ids:
|
|
311 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
|
|
312 negative_handle.close()
|
|
313 handle.close()
|
|
314 # This does not currently bother to record record counts (faster)
|
|
315
|
|
316 def sff_filter(in_file, pos_file, neg_file, wanted):
|
|
317 """SFF filter."""
|
|
318 try:
|
|
319 from Bio.SeqIO.SffIO import SffIterator, SffWriter
|
|
320 except ImportError:
|
|
321 sys_exit("SFF filtering requires Biopython 1.54 or later")
|
|
322
|
|
323 try:
|
|
324 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
|
|
325 except ImportError:
|
|
326 #Prior to Biopython 1.56 this was a private function
|
|
327 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
|
|
328
|
|
329 in_handle = open(in_file, "rb") #must be binary mode!
|
|
330 try:
|
|
331 manifest = ReadRocheXmlManifest(in_handle)
|
|
332 except ValueError:
|
|
333 manifest = None
|
|
334
|
|
335 #This makes two passes though the SFF file with isn't so efficient,
|
|
336 #but this makes the code simple.
|
|
337 pos_count = neg_count = 0
|
|
338 if out_positive_file is not None:
|
|
339 out_handle = open(out_positive_file, "wb")
|
|
340 writer = SffWriter(out_handle, xml=manifest)
|
|
341 in_handle.seek(0) #start again after getting manifest
|
|
342 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
|
|
343 out_handle.close()
|
|
344 if out_negative_file is not None:
|
|
345 out_handle = open(out_negative_file, "wb")
|
|
346 writer = SffWriter(out_handle, xml=manifest)
|
|
347 in_handle.seek(0) #start again
|
|
348 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
|
|
349 out_handle.close()
|
|
350 #And we're done
|
|
351 in_handle.close()
|
|
352 return pos_count, neg_count
|
|
353
|
|
354
|
|
355 if seq_format.lower()=="sff":
|
|
356 # Now write filtered SFF file based on IDs wanted
|
|
357 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
|
|
358 # At the time of writing, Galaxy doesn't show SFF file read counts,
|
|
359 # so it is useful to put them in stdout and thus shown in job info.
|
|
360 print "%i with and %i without specified IDs" % (pos_count, neg_count)
|
|
361 elif seq_format.lower()=="fasta":
|
|
362 # Write filtered FASTA file based on IDs from tabular file
|
|
363 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
|
|
364 print "%i with and %i without specified IDs" % (pos_count, neg_count)
|
|
365 elif seq_format.lower().startswith("fastq"):
|
|
366 #Write filtered FASTQ file based on IDs from mapping file
|
|
367 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
|
|
368 # This does not currently track the counts
|
|
369 else:
|
|
370 sys_exit("Unsupported file type %r" % seq_format)
|