Mercurial > repos > peterjc > seq_filter_by_mapping
comparison tools/seq_filter_by_mapping/seq_filter_by_mapping.py @ 0:1d773da0ccf0 draft
Uploaded v0.0.2, fixed some error messages
author | peterjc |
---|---|
date | Tue, 27 Jan 2015 08:31:13 -0500 |
parents | |
children | 8ff0ac66f1a3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1d773da0ccf0 |
---|---|
1 #!/usr/bin/env python | |
2 """Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information. | |
3 | |
4 When filtering an SFF file, any Roche XML manifest in the input file is | |
5 preserved in both output files. | |
6 | |
7 This tool is a short Python script which requires Biopython 1.54 or later | |
8 for SFF file support. If you use this tool in scientific work leading to a | |
9 publication, please cite the Biopython application note: | |
10 | |
11 Cock et al 2009. Biopython: freely available Python tools for computational | |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
14 | |
15 This script is copyright 2014 by Peter Cock, The James Hutton Institute | |
16 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | |
17 See accompanying text file for licence details (MIT license). | |
18 | |
19 Use -v or --version to get the version, -h or --help for help. | |
20 """ | |
21 import os | |
22 import sys | |
23 import re | |
24 import subprocess | |
25 from optparse import OptionParser | |
26 | |
27 def sys_exit(msg, err=1): | |
28 sys.stderr.write(msg.rstrip() + "\n") | |
29 sys.exit(err) | |
30 | |
31 #Parse Command Line | |
32 usage = """Use as follows: | |
33 | |
34 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] | |
35 | |
36 e.g. Positive matches using column one from a single BAM file: | |
37 | |
38 $ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam | |
39 | |
40 Multiple SAM/BAM mapping files may be given. | |
41 """ | |
42 parser = OptionParser(usage=usage) | |
43 parser.add_option('-i', '--input', dest='input', | |
44 default=None, help='Input sequences filename', | |
45 metavar="FILE") | |
46 parser.add_option('-f', '--format', dest='format', | |
47 default=None, | |
48 help='Input sequence format (e.g. fasta, fastq, sff)') | |
49 parser.add_option('-p', '--positive', dest='output_positive', | |
50 default=None, | |
51 help='Output filename for mapping reads', | |
52 metavar="FILE") | |
53 parser.add_option('-n', '--negative', dest='output_negative', | |
54 default=None, | |
55 help='Output filename for non-mapping reads', | |
56 metavar="FILE") | |
57 parser.add_option("-m", "--pair-mode", dest="pair_mode", | |
58 default="lax", | |
59 help="How to treat paired reads (lax or strict, default lax)") | |
60 parser.add_option("-v", "--version", dest="version", | |
61 default=False, action="store_true", | |
62 help="Show version and quit") | |
63 | |
64 options, args = parser.parse_args() | |
65 | |
66 if options.version: | |
67 print "v0.0.2" | |
68 sys.exit(0) | |
69 | |
70 in_file = options.input | |
71 seq_format = options.format | |
72 out_positive_file = options.output_positive | |
73 out_negative_file = options.output_negative | |
74 pair_mode = options.pair_mode | |
75 | |
76 if in_file is None or not os.path.isfile(in_file): | |
77 sys_exit("Missing input file: %r" % in_file) | |
78 if out_positive_file is None and out_negative_file is None: | |
79 sys_exit("Neither output file requested") | |
80 if seq_format is None: | |
81 sys_exit("Missing sequence format") | |
82 if pair_mode not in ["lax", "strict"]: | |
83 sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) | |
84 for mapping in args: | |
85 if not os.path.isfile(mapping): | |
86 sys_exit("Mapping file %r not found" % mapping) | |
87 if not args: | |
88 sys_exit("At least one SAM/BAM mapping file is required") | |
89 | |
90 | |
91 #Cope with three widely used suffix naming convensions, | |
92 #Illumina: /1 or /2 | |
93 #Forward/revered: .f or .r | |
94 #Sanger, e.g. .p1k and .q1k | |
95 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html | |
96 #re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") | |
97 #re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") | |
98 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") | |
99 assert re_suffix.search("demo.f") | |
100 assert re_suffix.search("demo.s1") | |
101 assert re_suffix.search("demo.f1k") | |
102 assert re_suffix.search("demo.p1") | |
103 assert re_suffix.search("demo.p1k") | |
104 assert re_suffix.search("demo.p1lk") | |
105 assert re_suffix.search("demo/2") | |
106 assert re_suffix.search("demo.r") | |
107 assert re_suffix.search("demo.q1") | |
108 assert re_suffix.search("demo.q1lk") | |
109 | |
110 def clean_name(name): | |
111 """Remove suffix.""" | |
112 match = re_suffix.search(name) | |
113 if match: | |
114 # Use the fact this is a suffix, and regular expression will be | |
115 # anchored to the end of the name: | |
116 return name[:match.start()] | |
117 else: | |
118 # Nothing to do | |
119 return name | |
120 assert clean_name("foo/1") == "foo" | |
121 assert clean_name("foo/2") == "foo" | |
122 assert clean_name("bar.f") == "bar" | |
123 assert clean_name("bar.r") == "bar" | |
124 assert clean_name("baz.p1") == "baz" | |
125 assert clean_name("baz.q2") == "baz" | |
126 | |
127 mapped_chars = { '>' :'__gt__', | |
128 '<' :'__lt__', | |
129 "'" :'__sq__', | |
130 '"' :'__dq__', | |
131 '[' :'__ob__', | |
132 ']' :'__cb__', | |
133 '{' :'__oc__', | |
134 '}' :'__cc__', | |
135 '@' : '__at__', | |
136 '\n' : '__cn__', | |
137 '\r' : '__cr__', | |
138 '\t' : '__tc__', | |
139 '#' : '__pd__' | |
140 } | |
141 | |
142 def load_mapping_ids(filename, pair_mode, ids): | |
143 """Parse SAM/BAM file, updating given set of ids. | |
144 | |
145 Parses BAM files via call out to samtools view command. | |
146 """ | |
147 handle = open(filename, "rb") | |
148 magic = handle.read(4) | |
149 if magic == b"\x1f\x8b\x08\x04": | |
150 # Presumably a BAM file then... | |
151 handle.close() | |
152 # Call samtools view, don't need header so no -h added: | |
153 child = subprocess.Popen(["samtools", "view", filename], | |
154 stdin=None, | |
155 stdout=subprocess.PIPE, | |
156 stderr=subprocess.PIPE) | |
157 handle = child.stdout | |
158 else: | |
159 # Presumably a SAM file... | |
160 child = None | |
161 handle.seek(0) | |
162 # Handle should now contain SAM records | |
163 for line in handle: | |
164 # Ignore header lines | |
165 if line[0] != "@": | |
166 qname, flag, rest = line.split("\t", 2) | |
167 flag = int(flag) | |
168 if pair_mode == "lax": | |
169 # If either read or its partner is mapped, take it! | |
170 # Being lazy, since we will look at both reads | |
171 # can just check if (either) has 0x4 clear. | |
172 if not (flag & 0x4): | |
173 ids.add(qname) | |
174 elif pair_mode == "strict": | |
175 # For paired reads, require BOTH be mapped. | |
176 if (flag & 0x4): | |
177 # This is unmapped, ignore it | |
178 pass | |
179 elif not (flag & 0x1): | |
180 # Unpaired (& mapped) - take it | |
181 ids.add(qname) | |
182 elif not (flag & 0x8): | |
183 # Paired and partner also mapped, good | |
184 ids.add(qname) | |
185 if child: | |
186 # Check terminated normally. | |
187 stdout, stderr = child.communicate() | |
188 assert child.returncode is not None | |
189 if child.returncode: | |
190 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, | |
191 filename, stderr) | |
192 sys_exit(msg.strip(), child.returncode) | |
193 else: | |
194 handle.close() | |
195 | |
196 | |
197 # Read mapping file(s) and record all mapped identifiers | |
198 ids = set() | |
199 for filename in args: | |
200 load_mapping_ids(filename, pair_mode, ids) | |
201 # TODO - If want to support naive paired mode, have to record | |
202 # more than just qname (need /1 or /2 indicator) | |
203 print("Loaded %i mapped IDs" % (len(ids))) | |
204 if len(ids) < 10: | |
205 print("Looking for %s" % ", ".join(sorted(ids))) | |
206 | |
207 def crude_fasta_iterator(handle): | |
208 """Yields tuples, record ID and the full record as a string.""" | |
209 while True: | |
210 line = handle.readline() | |
211 if line == "": | |
212 return # Premature end of file, or just empty? | |
213 if line[0] == ">": | |
214 break | |
215 | |
216 no_id_warned = False | |
217 while True: | |
218 if line[0] != ">": | |
219 raise ValueError( | |
220 "Records in Fasta files should start with '>' character") | |
221 try: | |
222 id = line[1:].split(None, 1)[0] | |
223 except IndexError: | |
224 if not no_id_warned: | |
225 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | |
226 no_id_warned = True | |
227 id = None | |
228 lines = [line] | |
229 line = handle.readline() | |
230 while True: | |
231 if not line: | |
232 break | |
233 if line[0] == ">": | |
234 break | |
235 lines.append(line) | |
236 line = handle.readline() | |
237 yield id, "".join(lines) | |
238 if not line: | |
239 return # StopIteration | |
240 | |
241 | |
242 def fasta_filter(in_file, pos_file, neg_file, wanted): | |
243 """FASTA filter producing 60 character line wrapped outout.""" | |
244 pos_count = neg_count = 0 | |
245 #Galaxy now requires Python 2.5+ so can use with statements, | |
246 with open(in_file) as in_handle: | |
247 #Doing the if statement outside the loop for speed | |
248 #(with the downside of three very similar loops). | |
249 if pos_file is not None and neg_file is not None: | |
250 print "Generating two FASTA files" | |
251 with open(pos_file, "w") as pos_handle: | |
252 with open(neg_file, "w") as neg_handle: | |
253 for identifier, record in crude_fasta_iterator(in_handle): | |
254 if clean_name(identifier) in wanted: | |
255 pos_handle.write(record) | |
256 pos_count += 1 | |
257 else: | |
258 neg_handle.write(record) | |
259 neg_count += 1 | |
260 elif pos_file is not None: | |
261 print "Generating matching FASTA file" | |
262 with open(pos_file, "w") as pos_handle: | |
263 for identifier, record in crude_fasta_iterator(in_handle): | |
264 if clean_name(identifier) in wanted: | |
265 pos_handle.write(record) | |
266 pos_count += 1 | |
267 else: | |
268 neg_count += 1 | |
269 else: | |
270 print "Generating non-matching FASTA file" | |
271 assert neg_file is not None | |
272 with open(neg_file, "w") as neg_handle: | |
273 for identifier, record in crude_fasta_iterator(in_handle): | |
274 if clean_name(identifier) in wanted: | |
275 pos_count += 1 | |
276 else: | |
277 neg_handle.write(record) | |
278 neg_count += 1 | |
279 return pos_count, neg_count | |
280 | |
281 | |
282 def fastq_filter(in_file, pos_file, neg_file, wanted): | |
283 """FASTQ filter.""" | |
284 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
285 handle = open(in_file, "r") | |
286 if out_positive_file is not None and out_negative_file is not None: | |
287 print "Generating two FASTQ files" | |
288 positive_handle = open(out_positive_file, "w") | |
289 negative_handle = open(out_negative_file, "w") | |
290 print in_file | |
291 for title, seq, qual in FastqGeneralIterator(handle): | |
292 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) | |
293 if clean_name(title.split(None, 1)[0]) in ids: | |
294 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
295 else: | |
296 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
297 positive_handle.close() | |
298 negative_handle.close() | |
299 elif out_positive_file is not None: | |
300 print "Generating matching FASTQ file" | |
301 positive_handle = open(out_positive_file, "w") | |
302 for title, seq, qual in FastqGeneralIterator(handle): | |
303 if clean_name(title.split(None, 1)[0]) in ids: | |
304 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
305 positive_handle.close() | |
306 elif out_negative_file is not None: | |
307 print "Generating non-matching FASTQ file" | |
308 negative_handle = open(out_negative_file, "w") | |
309 for title, seq, qual in FastqGeneralIterator(handle): | |
310 if clean_name(title.split(None, 1)[0]) not in ids: | |
311 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) | |
312 negative_handle.close() | |
313 handle.close() | |
314 # This does not currently bother to record record counts (faster) | |
315 | |
316 def sff_filter(in_file, pos_file, neg_file, wanted): | |
317 """SFF filter.""" | |
318 try: | |
319 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
320 except ImportError: | |
321 sys_exit("SFF filtering requires Biopython 1.54 or later") | |
322 | |
323 try: | |
324 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
325 except ImportError: | |
326 #Prior to Biopython 1.56 this was a private function | |
327 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
328 | |
329 in_handle = open(in_file, "rb") #must be binary mode! | |
330 try: | |
331 manifest = ReadRocheXmlManifest(in_handle) | |
332 except ValueError: | |
333 manifest = None | |
334 | |
335 #This makes two passes though the SFF file with isn't so efficient, | |
336 #but this makes the code simple. | |
337 pos_count = neg_count = 0 | |
338 if out_positive_file is not None: | |
339 out_handle = open(out_positive_file, "wb") | |
340 writer = SffWriter(out_handle, xml=manifest) | |
341 in_handle.seek(0) #start again after getting manifest | |
342 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) | |
343 out_handle.close() | |
344 if out_negative_file is not None: | |
345 out_handle = open(out_negative_file, "wb") | |
346 writer = SffWriter(out_handle, xml=manifest) | |
347 in_handle.seek(0) #start again | |
348 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) | |
349 out_handle.close() | |
350 #And we're done | |
351 in_handle.close() | |
352 return pos_count, neg_count | |
353 | |
354 | |
355 if seq_format.lower()=="sff": | |
356 # Now write filtered SFF file based on IDs wanted | |
357 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) | |
358 # At the time of writing, Galaxy doesn't show SFF file read counts, | |
359 # so it is useful to put them in stdout and thus shown in job info. | |
360 print "%i with and %i without specified IDs" % (pos_count, neg_count) | |
361 elif seq_format.lower()=="fasta": | |
362 # Write filtered FASTA file based on IDs from tabular file | |
363 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) | |
364 print "%i with and %i without specified IDs" % (pos_count, neg_count) | |
365 elif seq_format.lower().startswith("fastq"): | |
366 #Write filtered FASTQ file based on IDs from mapping file | |
367 fastq_filter(in_file, out_positive_file, out_negative_file, ids) | |
368 # This does not currently track the counts | |
369 else: | |
370 sys_exit("Unsupported file type %r" % seq_format) |