Mercurial > repos > peterjc > seq_primer_clip
comparison tools/primers/seq_primer_clip.py @ 0:945053d79e60 draft
Uploaded v0.0.8, first public release
| author | peterjc |
|---|---|
| date | Mon, 29 Apr 2013 06:11:00 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:945053d79e60 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Looks for the given primer sequences and clips matching SFF reads. | |
| 3 | |
| 4 Takes eight command line options, input read filename, input read format, | |
| 5 input primer FASTA filename, type of primers (forward, reverse or reverse- | |
| 6 complement), number of mismatches (currently only 0, 1 and 2 are supported), | |
| 7 minimum length to keep a read (after primer trimming), should primer-less | |
| 8 reads be kept (boolean), and finally the output sequence filename. | |
| 9 | |
| 10 Both the primer and read sequences can contain IUPAC ambiguity codes like N. | |
| 11 | |
| 12 This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not | |
| 13 supported. | |
| 14 | |
| 15 The mismatch parameter does not consider gapped alignemnts, however the | |
| 16 special case of missing bases at the very start or end of the read is handled. | |
| 17 e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG... | |
| 18 if one or more mismatches are allowed. | |
| 19 | |
| 20 This can also be used for stripping off (and optionally filtering on) barcodes. | |
| 21 | |
| 22 Note that only the trim/clip values in the SFF file are changed, not the flow | |
| 23 information of the full read sequence. | |
| 24 | |
| 25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
| 26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | |
| 27 See accompanying text file for licence details (MIT/BSD style). | |
| 28 | |
| 29 This is version 0.0.8 of the script. Currently it uses Python's regular | |
| 30 expression engine for finding the primers, which for my needs is fast enough. | |
| 31 """ | |
| 32 import sys | |
| 33 import re | |
| 34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter | |
| 35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | |
| 36 | |
| 37 if "-v" in sys.argv or "--version" in sys.argv: | |
| 38 print "v0.0.5" | |
| 39 sys.exit(0) | |
| 40 | |
| 41 def stop_err(msg, err=1): | |
| 42 sys.stderr.write(msg) | |
| 43 sys.exit(err) | |
| 44 | |
| 45 try: | |
| 46 from Bio.Seq import reverse_complement | |
| 47 from Bio.SeqIO.SffIO import SffIterator, SffWriter | |
| 48 except ImportError: | |
| 49 stop_err("Requires Biopython 1.54 or later") | |
| 50 try: | |
| 51 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | |
| 52 except ImportError: | |
| 53 #Prior to Biopython 1.56 this was a private function | |
| 54 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | |
| 55 | |
| 56 #Parse Command Line | |
| 57 try: | |
| 58 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] | |
| 59 except ValueError: | |
| 60 stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | |
| 61 | |
| 62 if in_file == primer_fasta: | |
| 63 stop_err("Same file given as both primer sequences and sequences to clip!") | |
| 64 if in_file == out_file: | |
| 65 stop_err("Same file given as both sequences to clip and output!") | |
| 66 if primer_fasta == out_file: | |
| 67 stop_err("Same file given as both primer sequences and output!") | |
| 68 | |
| 69 try: | |
| 70 mm = int(mm) | |
| 71 except ValueError: | |
| 72 stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) | |
| 73 if mm < 0: | |
| 74 stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) | |
| 75 if mm not in [0,1,2]: | |
| 76 raise NotImplementedError | |
| 77 | |
| 78 try: | |
| 79 min_len = int(min_len) | |
| 80 except ValueError: | |
| 81 stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) | |
| 82 if min_len < 0: | |
| 83 stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) | |
| 84 | |
| 85 | |
| 86 if keep_negatives.lower() in ["true", "yes", "on"]: | |
| 87 keep_negatives = True | |
| 88 elif keep_negatives.lower() in ["false", "no", "off"]: | |
| 89 keep_negatives = False | |
| 90 else: | |
| 91 stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) | |
| 92 | |
| 93 | |
| 94 if primer_type.lower() == "forward": | |
| 95 forward = True | |
| 96 rc = False | |
| 97 elif primer_type.lower() == "reverse": | |
| 98 forward = False | |
| 99 rc = False | |
| 100 elif primer_type.lower() == "reverse-complement": | |
| 101 forward = False | |
| 102 rc = True | |
| 103 else: | |
| 104 stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type) | |
| 105 | |
| 106 | |
| 107 ambiguous_dna_values = { | |
| 108 "A": "A", | |
| 109 "C": "C", | |
| 110 "G": "G", | |
| 111 "T": "T", | |
| 112 "M": "ACM", | |
| 113 "R": "AGR", | |
| 114 "W": "ATW", | |
| 115 "S": "CGS", | |
| 116 "Y": "CTY", | |
| 117 "K": "GTK", | |
| 118 "V": "ACGMRSV", | |
| 119 "H": "ACTMWYH", | |
| 120 "D": "AGTRWKD", | |
| 121 "B": "CGTSYKB", | |
| 122 "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC] | |
| 123 "N": ".", | |
| 124 } | |
| 125 | |
| 126 ambiguous_dna_re = {} | |
| 127 for letter, values in ambiguous_dna_values.iteritems(): | |
| 128 if len(values) == 1: | |
| 129 ambiguous_dna_re[letter] = values | |
| 130 else: | |
| 131 ambiguous_dna_re[letter] = "[%s]" % values | |
| 132 | |
| 133 | |
| 134 def make_reg_ex(seq): | |
| 135 return "".join(ambiguous_dna_re[letter] for letter in seq) | |
| 136 | |
| 137 def make_reg_ex_mm(seq, mm): | |
| 138 if mm > 2: | |
| 139 raise NotImplementedError("At most 2 mismatches allowed!") | |
| 140 seq = seq.upper() | |
| 141 yield make_reg_ex(seq) | |
| 142 for i in range(1,mm+1): | |
| 143 #Missing first/last i bases at very start/end of sequence | |
| 144 for reg in make_reg_ex_mm(seq[i:], mm-i): | |
| 145 yield "^" + reg | |
| 146 for reg in make_reg_ex_mm(seq[:-i], mm-i): | |
| 147 yield "$" + reg | |
| 148 if mm >= 1: | |
| 149 for i,letter in enumerate(seq): | |
| 150 #We'll use a set to remove any duplicate patterns | |
| 151 #if letter not in "NX": | |
| 152 pattern = seq[:i] + "N" + seq[i+1:] | |
| 153 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ | |
| 154 % (pattern, len(pattern), seq, len(seq)) | |
| 155 yield make_reg_ex(pattern) | |
| 156 if mm >=2: | |
| 157 for i,letter in enumerate(seq): | |
| 158 #We'll use a set to remove any duplicate patterns | |
| 159 #if letter not in "NX": | |
| 160 for k,letter in enumerate(seq[i+1:]): | |
| 161 #We'll use a set to remove any duplicate patterns | |
| 162 #if letter not in "NX": | |
| 163 pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:] | |
| 164 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ | |
| 165 % (pattern, len(pattern), seq, len(seq)) | |
| 166 yield make_reg_ex(pattern) | |
| 167 | |
| 168 def load_primers_as_re(primer_fasta, mm, rc=False): | |
| 169 #Read primer file and record all specified sequences | |
| 170 primers = set() | |
| 171 in_handle = open(primer_fasta, "rU") | |
| 172 reader = fastaReader(in_handle) | |
| 173 count = 0 | |
| 174 for record in reader: | |
| 175 if rc: | |
| 176 seq = reverse_complement(record.sequence) | |
| 177 else: | |
| 178 seq = record.sequence | |
| 179 #primers.add(re.compile(make_reg_ex(seq))) | |
| 180 count += 1 | |
| 181 for pattern in make_reg_ex_mm(seq, mm): | |
| 182 primers.add(pattern) | |
| 183 in_handle.close() | |
| 184 #Use set to avoid duplicates, sort to have longest first | |
| 185 #(so more specific primers found before less specific ones) | |
| 186 primers = sorted(set(primers), key=lambda p: -len(p)) | |
| 187 return count, re.compile("|".join(primers)) #make one monster re! | |
| 188 | |
| 189 | |
| 190 | |
| 191 #Read primer file and record all specified sequences | |
| 192 count, primer = load_primers_as_re(primer_fasta, mm, rc) | |
| 193 print "%i primer sequences" % count | |
| 194 | |
| 195 short_neg = 0 | |
| 196 short_clipped = 0 | |
| 197 clipped = 0 | |
| 198 negs = 0 | |
| 199 | |
| 200 if seq_format.lower()=="sff": | |
| 201 #SFF is different because we just change the trim points | |
| 202 if forward: | |
| 203 def process(records): | |
| 204 global short_clipped, short_neg, clipped, negs | |
| 205 for record in records: | |
| 206 left_clip = record.annotations["clip_qual_left"] | |
| 207 right_clip = record.annotations["clip_qual_right"] | |
| 208 seq = str(record.seq)[left_clip:right_clip].upper() | |
| 209 result = primer.search(seq) | |
| 210 if result: | |
| 211 #Forward primer, take everything after it | |
| 212 #so move the left clip along | |
| 213 if len(seq) - result.end() >= min_len: | |
| 214 record.annotations["clip_qual_left"] = left_clip + result.end() | |
| 215 clipped += 1 | |
| 216 yield record | |
| 217 else: | |
| 218 short_clipped += 1 | |
| 219 elif keep_negatives: | |
| 220 if len(seq) >= min_len: | |
| 221 negs += 1 | |
| 222 yield record | |
| 223 else: | |
| 224 short_neg += 1 | |
| 225 else: | |
| 226 def process(records): | |
| 227 global short_clipped, short_neg, clipped, negs | |
| 228 for record in records: | |
| 229 left_clip = record.annotations["clip_qual_left"] | |
| 230 right_clip = record.annotations["clip_qual_right"] | |
| 231 seq = str(record.seq)[left_clip:right_clip].upper() | |
| 232 result = primer.search(seq) | |
| 233 if result: | |
| 234 #Reverse primer, take everything before it | |
| 235 #so move the right clip back | |
| 236 new_len = result.start() | |
| 237 if new_len >= min_len: | |
| 238 record.annotations["clip_qual_right"] = left_clip + new_len | |
| 239 clipped += 1 | |
| 240 yield record | |
| 241 else: | |
| 242 short_clipped += 1 | |
| 243 elif keep_negatives: | |
| 244 if len(seq) >= min_len: | |
| 245 negs += 1 | |
| 246 yield record | |
| 247 else: | |
| 248 short_neg += 1 | |
| 249 | |
| 250 in_handle = open(in_file, "rb") | |
| 251 try: | |
| 252 manifest = ReadRocheXmlManifest(in_handle) | |
| 253 except ValueError: | |
| 254 manifest = None | |
| 255 in_handle.seek(0) | |
| 256 out_handle = open(out_file, "wb") | |
| 257 writer = SffWriter(out_handle, xml=manifest) | |
| 258 writer.write_file(process(SffIterator(in_handle))) | |
| 259 #End of SFF code | |
| 260 elif seq_format.lower().startswith("fastq"): | |
| 261 in_handle = open(in_file, "rU") | |
| 262 out_handle = open(out_file, "w") | |
| 263 reader = fastqReader(in_handle) | |
| 264 writer = fastqWriter(out_handle) | |
| 265 if forward: | |
| 266 for record in reader: | |
| 267 seq = record.sequence.upper() | |
| 268 result = primer.search(seq) | |
| 269 if result: | |
| 270 #Forward primer, take everything after it | |
| 271 cut = result.end() | |
| 272 record.sequence = seq[cut:] | |
| 273 if len(record.sequence) >= min_len: | |
| 274 record.quality = record.quality[cut:] | |
| 275 clipped += 1 | |
| 276 writer.write(record) | |
| 277 else: | |
| 278 short_clipped += 1 | |
| 279 elif keep_negatives: | |
| 280 if len(record) >= min_len: | |
| 281 negs += 1 | |
| 282 writer.write(record) | |
| 283 else: | |
| 284 short_negs += 1 | |
| 285 else: | |
| 286 for record in reader: | |
| 287 seq = record.sequence.upper() | |
| 288 result = primer.search(seq) | |
| 289 if result: | |
| 290 #Reverse primer, take everything before it | |
| 291 cut = result.start() | |
| 292 record.sequence = seq[:cut] | |
| 293 if len(record.sequence) >= min_len: | |
| 294 record.quality = record.quality[:cut] | |
| 295 clipped += 1 | |
| 296 writer.write(record) | |
| 297 else: | |
| 298 short_clipped += 1 | |
| 299 elif keep_negatives: | |
| 300 if len(record) >= min_len: | |
| 301 negs += 1 | |
| 302 writer.write(record) | |
| 303 else: | |
| 304 short_negs += 1 | |
| 305 elif seq_format.lower()=="fasta": | |
| 306 in_handle = open(in_file, "rU") | |
| 307 out_handle = open(out_file, "w") | |
| 308 reader = fastaReader(in_handle) | |
| 309 writer = fastaWriter(out_handle) | |
| 310 #Following code is identical to that for FASTQ but without editing qualities | |
| 311 if forward: | |
| 312 for record in reader: | |
| 313 seq = record.sequence.upper() | |
| 314 result = primer.search(seq) | |
| 315 if result: | |
| 316 #Forward primer, take everything after it | |
| 317 cut = result.end() | |
| 318 record.sequence = seq[cut:] | |
| 319 if len(record.sequence) >= min_len: | |
| 320 clipped += 1 | |
| 321 writer.write(record) | |
| 322 else: | |
| 323 short_clipped += 1 | |
| 324 elif keep_negatives: | |
| 325 if len(record) >= min_len: | |
| 326 negs += 1 | |
| 327 writer.write(record) | |
| 328 else: | |
| 329 short_negs += 1 | |
| 330 else: | |
| 331 for record in reader: | |
| 332 seq = record.sequence.upper() | |
| 333 result = primer.search(seq) | |
| 334 if result: | |
| 335 #Reverse primer, take everything before it | |
| 336 cut = result.start() | |
| 337 record.sequence = seq[:cut] | |
| 338 if len(record.sequence) >= min_len: | |
| 339 clipped += 1 | |
| 340 writer.write(record) | |
| 341 else: | |
| 342 short_clipped += 1 | |
| 343 elif keep_negatives: | |
| 344 if len(record) >= min_len: | |
| 345 negs += 1 | |
| 346 writer.write(record) | |
| 347 else: | |
| 348 short_negs += 1 | |
| 349 else: | |
| 350 stop_err("Unsupported file type %r" % seq_format) | |
| 351 in_handle.close() | |
| 352 out_handle.close() | |
| 353 | |
| 354 print "Kept %i clipped reads," % clipped | |
| 355 print "discarded %i short." % short_clipped | |
| 356 if keep_negatives: | |
| 357 print "Kept %i non-matching reads," % negs | |
| 358 print "discarded %i short." % short_neg |
