Mercurial > repos > peterjc > seq_primer_clip
comparison tools/primers/seq_primer_clip.py @ 0:945053d79e60 draft
Uploaded v0.0.8, first public release
author | peterjc |
---|---|
date | Mon, 29 Apr 2013 06:11:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:945053d79e60 |
---|---|
1 #!/usr/bin/env python | |
2 """Looks for the given primer sequences and clips matching SFF reads. | |
3 | |
4 Takes eight command line options, input read filename, input read format, | |
5 input primer FASTA filename, type of primers (forward, reverse or reverse- | |
6 complement), number of mismatches (currently only 0, 1 and 2 are supported), | |
7 minimum length to keep a read (after primer trimming), should primer-less | |
8 reads be kept (boolean), and finally the output sequence filename. | |
9 | |
10 Both the primer and read sequences can contain IUPAC ambiguity codes like N. | |
11 | |
12 This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not | |
13 supported. | |
14 | |
15 The mismatch parameter does not consider gapped alignemnts, however the | |
16 special case of missing bases at the very start or end of the read is handled. | |
17 e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG... | |
18 if one or more mismatches are allowed. | |
19 | |
20 This can also be used for stripping off (and optionally filtering on) barcodes. | |
21 | |
22 Note that only the trim/clip values in the SFF file are changed, not the flow | |
23 information of the full read sequence. | |
24 | |
25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | |
27 See accompanying text file for licence details (MIT/BSD style). | |
28 | |
29 This is version 0.0.8 of the script. Currently it uses Python's regular | |
30 expression engine for finding the primers, which for my needs is fast enough. | |
31 """ | |
32 import sys | |
33 import re | |
34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter | |
35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
36 | |
37 if "-v" in sys.argv or "--version" in sys.argv: | |
38 print "v0.0.5" | |
39 sys.exit(0) | |
40 | |
41 def stop_err(msg, err=1): | |
42 sys.stderr.write(msg) | |
43 sys.exit(err) | |
44 | |
45 try: | |
46 from Bio.Seq import reverse_complement | |
47 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
48 except ImportError: | |
49 stop_err("Requires Biopython 1.54 or later") | |
50 try: | |
51 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
52 except ImportError: | |
53 #Prior to Biopython 1.56 this was a private function | |
54 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
55 | |
56 #Parse Command Line | |
57 try: | |
58 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] | |
59 except ValueError: | |
60 stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
61 | |
62 if in_file == primer_fasta: | |
63 stop_err("Same file given as both primer sequences and sequences to clip!") | |
64 if in_file == out_file: | |
65 stop_err("Same file given as both sequences to clip and output!") | |
66 if primer_fasta == out_file: | |
67 stop_err("Same file given as both primer sequences and output!") | |
68 | |
69 try: | |
70 mm = int(mm) | |
71 except ValueError: | |
72 stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) | |
73 if mm < 0: | |
74 stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) | |
75 if mm not in [0,1,2]: | |
76 raise NotImplementedError | |
77 | |
78 try: | |
79 min_len = int(min_len) | |
80 except ValueError: | |
81 stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) | |
82 if min_len < 0: | |
83 stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) | |
84 | |
85 | |
86 if keep_negatives.lower() in ["true", "yes", "on"]: | |
87 keep_negatives = True | |
88 elif keep_negatives.lower() in ["false", "no", "off"]: | |
89 keep_negatives = False | |
90 else: | |
91 stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) | |
92 | |
93 | |
94 if primer_type.lower() == "forward": | |
95 forward = True | |
96 rc = False | |
97 elif primer_type.lower() == "reverse": | |
98 forward = False | |
99 rc = False | |
100 elif primer_type.lower() == "reverse-complement": | |
101 forward = False | |
102 rc = True | |
103 else: | |
104 stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type) | |
105 | |
106 | |
107 ambiguous_dna_values = { | |
108 "A": "A", | |
109 "C": "C", | |
110 "G": "G", | |
111 "T": "T", | |
112 "M": "ACM", | |
113 "R": "AGR", | |
114 "W": "ATW", | |
115 "S": "CGS", | |
116 "Y": "CTY", | |
117 "K": "GTK", | |
118 "V": "ACGMRSV", | |
119 "H": "ACTMWYH", | |
120 "D": "AGTRWKD", | |
121 "B": "CGTSYKB", | |
122 "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC] | |
123 "N": ".", | |
124 } | |
125 | |
126 ambiguous_dna_re = {} | |
127 for letter, values in ambiguous_dna_values.iteritems(): | |
128 if len(values) == 1: | |
129 ambiguous_dna_re[letter] = values | |
130 else: | |
131 ambiguous_dna_re[letter] = "[%s]" % values | |
132 | |
133 | |
134 def make_reg_ex(seq): | |
135 return "".join(ambiguous_dna_re[letter] for letter in seq) | |
136 | |
137 def make_reg_ex_mm(seq, mm): | |
138 if mm > 2: | |
139 raise NotImplementedError("At most 2 mismatches allowed!") | |
140 seq = seq.upper() | |
141 yield make_reg_ex(seq) | |
142 for i in range(1,mm+1): | |
143 #Missing first/last i bases at very start/end of sequence | |
144 for reg in make_reg_ex_mm(seq[i:], mm-i): | |
145 yield "^" + reg | |
146 for reg in make_reg_ex_mm(seq[:-i], mm-i): | |
147 yield "$" + reg | |
148 if mm >= 1: | |
149 for i,letter in enumerate(seq): | |
150 #We'll use a set to remove any duplicate patterns | |
151 #if letter not in "NX": | |
152 pattern = seq[:i] + "N" + seq[i+1:] | |
153 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ | |
154 % (pattern, len(pattern), seq, len(seq)) | |
155 yield make_reg_ex(pattern) | |
156 if mm >=2: | |
157 for i,letter in enumerate(seq): | |
158 #We'll use a set to remove any duplicate patterns | |
159 #if letter not in "NX": | |
160 for k,letter in enumerate(seq[i+1:]): | |
161 #We'll use a set to remove any duplicate patterns | |
162 #if letter not in "NX": | |
163 pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:] | |
164 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ | |
165 % (pattern, len(pattern), seq, len(seq)) | |
166 yield make_reg_ex(pattern) | |
167 | |
168 def load_primers_as_re(primer_fasta, mm, rc=False): | |
169 #Read primer file and record all specified sequences | |
170 primers = set() | |
171 in_handle = open(primer_fasta, "rU") | |
172 reader = fastaReader(in_handle) | |
173 count = 0 | |
174 for record in reader: | |
175 if rc: | |
176 seq = reverse_complement(record.sequence) | |
177 else: | |
178 seq = record.sequence | |
179 #primers.add(re.compile(make_reg_ex(seq))) | |
180 count += 1 | |
181 for pattern in make_reg_ex_mm(seq, mm): | |
182 primers.add(pattern) | |
183 in_handle.close() | |
184 #Use set to avoid duplicates, sort to have longest first | |
185 #(so more specific primers found before less specific ones) | |
186 primers = sorted(set(primers), key=lambda p: -len(p)) | |
187 return count, re.compile("|".join(primers)) #make one monster re! | |
188 | |
189 | |
190 | |
191 #Read primer file and record all specified sequences | |
192 count, primer = load_primers_as_re(primer_fasta, mm, rc) | |
193 print "%i primer sequences" % count | |
194 | |
195 short_neg = 0 | |
196 short_clipped = 0 | |
197 clipped = 0 | |
198 negs = 0 | |
199 | |
200 if seq_format.lower()=="sff": | |
201 #SFF is different because we just change the trim points | |
202 if forward: | |
203 def process(records): | |
204 global short_clipped, short_neg, clipped, negs | |
205 for record in records: | |
206 left_clip = record.annotations["clip_qual_left"] | |
207 right_clip = record.annotations["clip_qual_right"] | |
208 seq = str(record.seq)[left_clip:right_clip].upper() | |
209 result = primer.search(seq) | |
210 if result: | |
211 #Forward primer, take everything after it | |
212 #so move the left clip along | |
213 if len(seq) - result.end() >= min_len: | |
214 record.annotations["clip_qual_left"] = left_clip + result.end() | |
215 clipped += 1 | |
216 yield record | |
217 else: | |
218 short_clipped += 1 | |
219 elif keep_negatives: | |
220 if len(seq) >= min_len: | |
221 negs += 1 | |
222 yield record | |
223 else: | |
224 short_neg += 1 | |
225 else: | |
226 def process(records): | |
227 global short_clipped, short_neg, clipped, negs | |
228 for record in records: | |
229 left_clip = record.annotations["clip_qual_left"] | |
230 right_clip = record.annotations["clip_qual_right"] | |
231 seq = str(record.seq)[left_clip:right_clip].upper() | |
232 result = primer.search(seq) | |
233 if result: | |
234 #Reverse primer, take everything before it | |
235 #so move the right clip back | |
236 new_len = result.start() | |
237 if new_len >= min_len: | |
238 record.annotations["clip_qual_right"] = left_clip + new_len | |
239 clipped += 1 | |
240 yield record | |
241 else: | |
242 short_clipped += 1 | |
243 elif keep_negatives: | |
244 if len(seq) >= min_len: | |
245 negs += 1 | |
246 yield record | |
247 else: | |
248 short_neg += 1 | |
249 | |
250 in_handle = open(in_file, "rb") | |
251 try: | |
252 manifest = ReadRocheXmlManifest(in_handle) | |
253 except ValueError: | |
254 manifest = None | |
255 in_handle.seek(0) | |
256 out_handle = open(out_file, "wb") | |
257 writer = SffWriter(out_handle, xml=manifest) | |
258 writer.write_file(process(SffIterator(in_handle))) | |
259 #End of SFF code | |
260 elif seq_format.lower().startswith("fastq"): | |
261 in_handle = open(in_file, "rU") | |
262 out_handle = open(out_file, "w") | |
263 reader = fastqReader(in_handle) | |
264 writer = fastqWriter(out_handle) | |
265 if forward: | |
266 for record in reader: | |
267 seq = record.sequence.upper() | |
268 result = primer.search(seq) | |
269 if result: | |
270 #Forward primer, take everything after it | |
271 cut = result.end() | |
272 record.sequence = seq[cut:] | |
273 if len(record.sequence) >= min_len: | |
274 record.quality = record.quality[cut:] | |
275 clipped += 1 | |
276 writer.write(record) | |
277 else: | |
278 short_clipped += 1 | |
279 elif keep_negatives: | |
280 if len(record) >= min_len: | |
281 negs += 1 | |
282 writer.write(record) | |
283 else: | |
284 short_negs += 1 | |
285 else: | |
286 for record in reader: | |
287 seq = record.sequence.upper() | |
288 result = primer.search(seq) | |
289 if result: | |
290 #Reverse primer, take everything before it | |
291 cut = result.start() | |
292 record.sequence = seq[:cut] | |
293 if len(record.sequence) >= min_len: | |
294 record.quality = record.quality[:cut] | |
295 clipped += 1 | |
296 writer.write(record) | |
297 else: | |
298 short_clipped += 1 | |
299 elif keep_negatives: | |
300 if len(record) >= min_len: | |
301 negs += 1 | |
302 writer.write(record) | |
303 else: | |
304 short_negs += 1 | |
305 elif seq_format.lower()=="fasta": | |
306 in_handle = open(in_file, "rU") | |
307 out_handle = open(out_file, "w") | |
308 reader = fastaReader(in_handle) | |
309 writer = fastaWriter(out_handle) | |
310 #Following code is identical to that for FASTQ but without editing qualities | |
311 if forward: | |
312 for record in reader: | |
313 seq = record.sequence.upper() | |
314 result = primer.search(seq) | |
315 if result: | |
316 #Forward primer, take everything after it | |
317 cut = result.end() | |
318 record.sequence = seq[cut:] | |
319 if len(record.sequence) >= min_len: | |
320 clipped += 1 | |
321 writer.write(record) | |
322 else: | |
323 short_clipped += 1 | |
324 elif keep_negatives: | |
325 if len(record) >= min_len: | |
326 negs += 1 | |
327 writer.write(record) | |
328 else: | |
329 short_negs += 1 | |
330 else: | |
331 for record in reader: | |
332 seq = record.sequence.upper() | |
333 result = primer.search(seq) | |
334 if result: | |
335 #Reverse primer, take everything before it | |
336 cut = result.start() | |
337 record.sequence = seq[:cut] | |
338 if len(record.sequence) >= min_len: | |
339 clipped += 1 | |
340 writer.write(record) | |
341 else: | |
342 short_clipped += 1 | |
343 elif keep_negatives: | |
344 if len(record) >= min_len: | |
345 negs += 1 | |
346 writer.write(record) | |
347 else: | |
348 short_negs += 1 | |
349 else: | |
350 stop_err("Unsupported file type %r" % seq_format) | |
351 in_handle.close() | |
352 out_handle.close() | |
353 | |
354 print "Kept %i clipped reads," % clipped | |
355 print "discarded %i short." % short_clipped | |
356 if keep_negatives: | |
357 print "Kept %i non-matching reads," % negs | |
358 print "discarded %i short." % short_neg |