comparison tools/seq_filter_by_mapping/seq_filter_by_mapping.py @ 0:1d773da0ccf0 draft

Uploaded v0.0.2, fixed some error messages
author peterjc
date Tue, 27 Jan 2015 08:31:13 -0500
parents
children 8ff0ac66f1a3
comparison
equal deleted inserted replaced
-1:000000000000 0:1d773da0ccf0
1 #!/usr/bin/env python
2 """Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information.
3
4 When filtering an SFF file, any Roche XML manifest in the input file is
5 preserved in both output files.
6
7 This tool is a short Python script which requires Biopython 1.54 or later
8 for SFF file support. If you use this tool in scientific work leading to a
9 publication, please cite the Biopython application note:
10
11 Cock et al 2009. Biopython: freely available Python tools for computational
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
14
15 This script is copyright 2014 by Peter Cock, The James Hutton Institute
16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
17 See accompanying text file for licence details (MIT license).
18
19 Use -v or --version to get the version, -h or --help for help.
20 """
21 import os
22 import sys
23 import re
24 import subprocess
25 from optparse import OptionParser
26
27 def sys_exit(msg, err=1):
28 sys.stderr.write(msg.rstrip() + "\n")
29 sys.exit(err)
30
31 #Parse Command Line
32 usage = """Use as follows:
33
34 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
35
36 e.g. Positive matches using column one from a single BAM file:
37
38 $ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam
39
40 Multiple SAM/BAM mapping files may be given.
41 """
42 parser = OptionParser(usage=usage)
43 parser.add_option('-i', '--input', dest='input',
44 default=None, help='Input sequences filename',
45 metavar="FILE")
46 parser.add_option('-f', '--format', dest='format',
47 default=None,
48 help='Input sequence format (e.g. fasta, fastq, sff)')
49 parser.add_option('-p', '--positive', dest='output_positive',
50 default=None,
51 help='Output filename for mapping reads',
52 metavar="FILE")
53 parser.add_option('-n', '--negative', dest='output_negative',
54 default=None,
55 help='Output filename for non-mapping reads',
56 metavar="FILE")
57 parser.add_option("-m", "--pair-mode", dest="pair_mode",
58 default="lax",
59 help="How to treat paired reads (lax or strict, default lax)")
60 parser.add_option("-v", "--version", dest="version",
61 default=False, action="store_true",
62 help="Show version and quit")
63
64 options, args = parser.parse_args()
65
66 if options.version:
67 print "v0.0.2"
68 sys.exit(0)
69
70 in_file = options.input
71 seq_format = options.format
72 out_positive_file = options.output_positive
73 out_negative_file = options.output_negative
74 pair_mode = options.pair_mode
75
76 if in_file is None or not os.path.isfile(in_file):
77 sys_exit("Missing input file: %r" % in_file)
78 if out_positive_file is None and out_negative_file is None:
79 sys_exit("Neither output file requested")
80 if seq_format is None:
81 sys_exit("Missing sequence format")
82 if pair_mode not in ["lax", "strict"]:
83 sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
84 for mapping in args:
85 if not os.path.isfile(mapping):
86 sys_exit("Mapping file %r not found" % mapping)
87 if not args:
88 sys_exit("At least one SAM/BAM mapping file is required")
89
90
91 #Cope with three widely used suffix naming convensions,
92 #Illumina: /1 or /2
93 #Forward/revered: .f or .r
94 #Sanger, e.g. .p1k and .q1k
95 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html
96 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
97 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
98 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
99 assert re_suffix.search("demo.f")
100 assert re_suffix.search("demo.s1")
101 assert re_suffix.search("demo.f1k")
102 assert re_suffix.search("demo.p1")
103 assert re_suffix.search("demo.p1k")
104 assert re_suffix.search("demo.p1lk")
105 assert re_suffix.search("demo/2")
106 assert re_suffix.search("demo.r")
107 assert re_suffix.search("demo.q1")
108 assert re_suffix.search("demo.q1lk")
109
110 def clean_name(name):
111 """Remove suffix."""
112 match = re_suffix.search(name)
113 if match:
114 # Use the fact this is a suffix, and regular expression will be
115 # anchored to the end of the name:
116 return name[:match.start()]
117 else:
118 # Nothing to do
119 return name
120 assert clean_name("foo/1") == "foo"
121 assert clean_name("foo/2") == "foo"
122 assert clean_name("bar.f") == "bar"
123 assert clean_name("bar.r") == "bar"
124 assert clean_name("baz.p1") == "baz"
125 assert clean_name("baz.q2") == "baz"
126
127 mapped_chars = { '>' :'__gt__',
128 '<' :'__lt__',
129 "'" :'__sq__',
130 '"' :'__dq__',
131 '[' :'__ob__',
132 ']' :'__cb__',
133 '{' :'__oc__',
134 '}' :'__cc__',
135 '@' : '__at__',
136 '\n' : '__cn__',
137 '\r' : '__cr__',
138 '\t' : '__tc__',
139 '#' : '__pd__'
140 }
141
142 def load_mapping_ids(filename, pair_mode, ids):
143 """Parse SAM/BAM file, updating given set of ids.
144
145 Parses BAM files via call out to samtools view command.
146 """
147 handle = open(filename, "rb")
148 magic = handle.read(4)
149 if magic == b"\x1f\x8b\x08\x04":
150 # Presumably a BAM file then...
151 handle.close()
152 # Call samtools view, don't need header so no -h added:
153 child = subprocess.Popen(["samtools", "view", filename],
154 stdin=None,
155 stdout=subprocess.PIPE,
156 stderr=subprocess.PIPE)
157 handle = child.stdout
158 else:
159 # Presumably a SAM file...
160 child = None
161 handle.seek(0)
162 # Handle should now contain SAM records
163 for line in handle:
164 # Ignore header lines
165 if line[0] != "@":
166 qname, flag, rest = line.split("\t", 2)
167 flag = int(flag)
168 if pair_mode == "lax":
169 # If either read or its partner is mapped, take it!
170 # Being lazy, since we will look at both reads
171 # can just check if (either) has 0x4 clear.
172 if not (flag & 0x4):
173 ids.add(qname)
174 elif pair_mode == "strict":
175 # For paired reads, require BOTH be mapped.
176 if (flag & 0x4):
177 # This is unmapped, ignore it
178 pass
179 elif not (flag & 0x1):
180 # Unpaired (& mapped) - take it
181 ids.add(qname)
182 elif not (flag & 0x8):
183 # Paired and partner also mapped, good
184 ids.add(qname)
185 if child:
186 # Check terminated normally.
187 stdout, stderr = child.communicate()
188 assert child.returncode is not None
189 if child.returncode:
190 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
191 filename, stderr)
192 sys_exit(msg.strip(), child.returncode)
193 else:
194 handle.close()
195
196
197 # Read mapping file(s) and record all mapped identifiers
198 ids = set()
199 for filename in args:
200 load_mapping_ids(filename, pair_mode, ids)
201 # TODO - If want to support naive paired mode, have to record
202 # more than just qname (need /1 or /2 indicator)
203 print("Loaded %i mapped IDs" % (len(ids)))
204 if len(ids) < 10:
205 print("Looking for %s" % ", ".join(sorted(ids)))
206
207 def crude_fasta_iterator(handle):
208 """Yields tuples, record ID and the full record as a string."""
209 while True:
210 line = handle.readline()
211 if line == "":
212 return # Premature end of file, or just empty?
213 if line[0] == ">":
214 break
215
216 no_id_warned = False
217 while True:
218 if line[0] != ">":
219 raise ValueError(
220 "Records in Fasta files should start with '>' character")
221 try:
222 id = line[1:].split(None, 1)[0]
223 except IndexError:
224 if not no_id_warned:
225 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
226 no_id_warned = True
227 id = None
228 lines = [line]
229 line = handle.readline()
230 while True:
231 if not line:
232 break
233 if line[0] == ">":
234 break
235 lines.append(line)
236 line = handle.readline()
237 yield id, "".join(lines)
238 if not line:
239 return # StopIteration
240
241
242 def fasta_filter(in_file, pos_file, neg_file, wanted):
243 """FASTA filter producing 60 character line wrapped outout."""
244 pos_count = neg_count = 0
245 #Galaxy now requires Python 2.5+ so can use with statements,
246 with open(in_file) as in_handle:
247 #Doing the if statement outside the loop for speed
248 #(with the downside of three very similar loops).
249 if pos_file is not None and neg_file is not None:
250 print "Generating two FASTA files"
251 with open(pos_file, "w") as pos_handle:
252 with open(neg_file, "w") as neg_handle:
253 for identifier, record in crude_fasta_iterator(in_handle):
254 if clean_name(identifier) in wanted:
255 pos_handle.write(record)
256 pos_count += 1
257 else:
258 neg_handle.write(record)
259 neg_count += 1
260 elif pos_file is not None:
261 print "Generating matching FASTA file"
262 with open(pos_file, "w") as pos_handle:
263 for identifier, record in crude_fasta_iterator(in_handle):
264 if clean_name(identifier) in wanted:
265 pos_handle.write(record)
266 pos_count += 1
267 else:
268 neg_count += 1
269 else:
270 print "Generating non-matching FASTA file"
271 assert neg_file is not None
272 with open(neg_file, "w") as neg_handle:
273 for identifier, record in crude_fasta_iterator(in_handle):
274 if clean_name(identifier) in wanted:
275 pos_count += 1
276 else:
277 neg_handle.write(record)
278 neg_count += 1
279 return pos_count, neg_count
280
281
282 def fastq_filter(in_file, pos_file, neg_file, wanted):
283 """FASTQ filter."""
284 from Bio.SeqIO.QualityIO import FastqGeneralIterator
285 handle = open(in_file, "r")
286 if out_positive_file is not None and out_negative_file is not None:
287 print "Generating two FASTQ files"
288 positive_handle = open(out_positive_file, "w")
289 negative_handle = open(out_negative_file, "w")
290 print in_file
291 for title, seq, qual in FastqGeneralIterator(handle):
292 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
293 if clean_name(title.split(None, 1)[0]) in ids:
294 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
295 else:
296 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
297 positive_handle.close()
298 negative_handle.close()
299 elif out_positive_file is not None:
300 print "Generating matching FASTQ file"
301 positive_handle = open(out_positive_file, "w")
302 for title, seq, qual in FastqGeneralIterator(handle):
303 if clean_name(title.split(None, 1)[0]) in ids:
304 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
305 positive_handle.close()
306 elif out_negative_file is not None:
307 print "Generating non-matching FASTQ file"
308 negative_handle = open(out_negative_file, "w")
309 for title, seq, qual in FastqGeneralIterator(handle):
310 if clean_name(title.split(None, 1)[0]) not in ids:
311 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
312 negative_handle.close()
313 handle.close()
314 # This does not currently bother to record record counts (faster)
315
316 def sff_filter(in_file, pos_file, neg_file, wanted):
317 """SFF filter."""
318 try:
319 from Bio.SeqIO.SffIO import SffIterator, SffWriter
320 except ImportError:
321 sys_exit("SFF filtering requires Biopython 1.54 or later")
322
323 try:
324 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
325 except ImportError:
326 #Prior to Biopython 1.56 this was a private function
327 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
328
329 in_handle = open(in_file, "rb") #must be binary mode!
330 try:
331 manifest = ReadRocheXmlManifest(in_handle)
332 except ValueError:
333 manifest = None
334
335 #This makes two passes though the SFF file with isn't so efficient,
336 #but this makes the code simple.
337 pos_count = neg_count = 0
338 if out_positive_file is not None:
339 out_handle = open(out_positive_file, "wb")
340 writer = SffWriter(out_handle, xml=manifest)
341 in_handle.seek(0) #start again after getting manifest
342 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
343 out_handle.close()
344 if out_negative_file is not None:
345 out_handle = open(out_negative_file, "wb")
346 writer = SffWriter(out_handle, xml=manifest)
347 in_handle.seek(0) #start again
348 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
349 out_handle.close()
350 #And we're done
351 in_handle.close()
352 return pos_count, neg_count
353
354
355 if seq_format.lower()=="sff":
356 # Now write filtered SFF file based on IDs wanted
357 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
358 # At the time of writing, Galaxy doesn't show SFF file read counts,
359 # so it is useful to put them in stdout and thus shown in job info.
360 print "%i with and %i without specified IDs" % (pos_count, neg_count)
361 elif seq_format.lower()=="fasta":
362 # Write filtered FASTA file based on IDs from tabular file
363 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
364 print "%i with and %i without specified IDs" % (pos_count, neg_count)
365 elif seq_format.lower().startswith("fastq"):
366 #Write filtered FASTQ file based on IDs from mapping file
367 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
368 # This does not currently track the counts
369 else:
370 sys_exit("Unsupported file type %r" % seq_format)