Mercurial > repos > galaxyp > translate_bed
comparison translate_bed.py @ 0:038ecf54cbec draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
author | galaxyp |
---|---|
date | Mon, 22 Jan 2018 13:59:27 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:038ecf54cbec |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2017, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # | |
12 #------------------------------------------------------------------------------ | |
13 """ | |
14 | |
15 from __future__ import print_function | |
16 | |
17 import argparse | |
18 import re | |
19 import sys | |
20 | |
21 from Bio.Seq import translate | |
22 | |
23 from bedutil import bed_from_line | |
24 | |
25 import digest | |
26 | |
27 from ensembl_rest import get_cdna | |
28 | |
29 from twobitreader import TwoBitFile | |
30 | |
31 | |
32 def __main__(): | |
33 parser = argparse.ArgumentParser( | |
34 description='Translate from BED') | |
35 parser.add_argument( | |
36 'input_bed', default=None, | |
37 help="BED to translate, '-' for stdin") | |
38 pg_seq = parser.add_argument_group('Genomic sequence source') | |
39 pg_seq.add_argument( | |
40 '-t', '--twobit', default=None, | |
41 help='Genome reference sequence in 2bit format') | |
42 pg_seq.add_argument( | |
43 '-c', '--column', type=int, default=None, | |
44 help='Column offset containing genomic sequence' + | |
45 'between start and stop (-1) for last column') | |
46 pg_out = parser.add_argument_group('Output options') | |
47 pg_out.add_argument( | |
48 '-f', '--fasta', default=None, | |
49 help='Path to output translations.fasta') | |
50 pg_out.add_argument( | |
51 '-b', '--bed', default=None, | |
52 help='Path to output translations.bed') | |
53 pg_bed = parser.add_argument_group('BED filter options') | |
54 pg_bed.add_argument( | |
55 '-E', '--ensembl', action='store_true', default=False, | |
56 help='Input BED is in 20 column Ensembl format') | |
57 pg_bed.add_argument( | |
58 '-R', '--regions', action='append', default=[], | |
59 help='Filter input by regions e.g.:' | |
60 + ' X,2:20000-25000,3:100-500+') | |
61 pg_bed.add_argument( | |
62 '-B', '--biotypes', action='append', default=[], | |
63 help='For Ensembl BED restrict translations to Ensembl biotypes') | |
64 pg_trans = parser.add_argument_group('Translation filter options') | |
65 pg_trans.add_argument( | |
66 '-m', '--min_length', type=int, default=10, | |
67 help='Minimum length of protein translation to report') | |
68 pg_trans.add_argument( | |
69 '-e', '--enzyme', default=None, | |
70 help='Digest translation with enzyme') | |
71 pg_trans.add_argument( | |
72 '-M', '--start_codon', action='store_true', default=False, | |
73 help='Trim translations to methionine start_codon') | |
74 pg_trans.add_argument( | |
75 '-C', '--cds', action='store_true', default=False, | |
76 help='Only translate CDS') | |
77 pg_trans.add_argument( | |
78 '-A', '--all', action='store_true', | |
79 help='Include CDS protein translations ') | |
80 pg_fmt = parser.add_argument_group('ID format options') | |
81 pg_fmt.add_argument( | |
82 '-r', '--reference', default='', | |
83 help='Genome Reference Name') | |
84 pg_fmt.add_argument( | |
85 '-D', '--fa_db', dest='fa_db', default=None, | |
86 help='Prefix DB identifier for fasta ID line, e.g. generic') | |
87 pg_fmt.add_argument( | |
88 '-s', '--fa_sep', dest='fa_sep', default='|', | |
89 help='fasta ID separator defaults to pipe char, ' + | |
90 'e.g. generic|ProtID|description') | |
91 pg_fmt.add_argument( | |
92 '-P', '--id_prefix', default='', | |
93 help='prefix for the sequence ID') | |
94 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') | |
95 parser.add_argument('-d', '--debug', action='store_true', help='Debug') | |
96 args = parser.parse_args() | |
97 | |
98 input_rdr = open(args.input_bed, 'r')\ | |
99 if args.input_bed != '-' else sys.stdin | |
100 fa_wtr = open(args.fasta, 'w')\ | |
101 if args.fasta is not None and args.fasta != '-' else sys.stdout | |
102 bed_wtr = open(args.bed, 'w') if args.bed is not None else None | |
103 | |
104 enzyme = digest.expasy_rules.get(args.enzyme, None) | |
105 | |
106 biotypea = [bt.strip() for biotype in args.biotypes | |
107 for bt in biotype.split(',')] | |
108 | |
109 twobit = TwoBitFile(args.twobit) if args.twobit else None | |
110 | |
111 selected_regions = dict() # chrom:(start, end) | |
112 region_pat = '^(?:chr)?([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' | |
113 if args.regions: | |
114 for entry in args.regions: | |
115 if not entry: | |
116 continue | |
117 regs = [x.strip() for x in entry.split(',') if x.strip()] | |
118 for reg in regs: | |
119 m = re.match(region_pat, reg) | |
120 if m: | |
121 (chrom, start, end, strand) = m.groups() | |
122 if chrom: | |
123 if chrom not in selected_regions: | |
124 selected_regions[chrom] = [] | |
125 selected_regions[chrom].append([start, end, strand]) | |
126 if args.debug: | |
127 print("selected_regions: %s" % selected_regions, file=sys.stderr) | |
128 | |
129 def filter_by_regions(bed): | |
130 if not selected_regions: | |
131 return True | |
132 ref = re.sub('^(?i)chr', '', bed.chrom) | |
133 if ref not in selected_regions: | |
134 return False | |
135 for reg in selected_regions[ref]: | |
136 (_start, _stop, _strand) = reg | |
137 start = int(_start) if _start else 0 | |
138 stop = int(_stop) if _stop else sys.maxint | |
139 if _strand and bed.strand != _strand: | |
140 continue | |
141 if bed.chromEnd >= start and bed.chromStart <= stop: | |
142 return True | |
143 return False | |
144 | |
145 translations = dict() # start : end : seq | |
146 | |
147 def unique_prot(tbed, seq): | |
148 if tbed.chromStart not in translations: | |
149 translations[tbed.chromStart] = dict() | |
150 translations[tbed.chromStart][tbed.chromEnd] = [] | |
151 translations[tbed.chromStart][tbed.chromEnd].append(seq) | |
152 elif tbed.chromEnd not in translations[tbed.chromStart]: | |
153 translations[tbed.chromStart][tbed.chromEnd] = [] | |
154 translations[tbed.chromStart][tbed.chromEnd].append(seq) | |
155 elif seq not in translations[tbed.chromStart][tbed.chromEnd]: | |
156 translations[tbed.chromStart][tbed.chromEnd].append(seq) | |
157 else: | |
158 return False | |
159 return True | |
160 | |
161 def get_sequence(chrom, start, end): | |
162 if twobit: | |
163 if chrom in twobit and 0 <= start < end < len(twobit[chrom]): | |
164 return twobit[chrom][start:end] | |
165 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom | |
166 if contig in twobit and 0 <= start < end < len(twobit[contig]): | |
167 return twobit[contig][start:end] | |
168 return None | |
169 | |
170 def write_translation(tbed, accession, peptide): | |
171 if args.id_prefix: | |
172 tbed.name = "%s%s" % (args.id_prefix, tbed.name) | |
173 probed = "%s\t%s\t%s\t%s%s" % (accession, peptide, | |
174 'unique', args.reference, | |
175 '\t.' * 9) | |
176 if bed_wtr: | |
177 bed_wtr.write("%s\t%s\n" % (str(tbed), probed)) | |
178 bed_wtr.flush() | |
179 location = "chromosome:%s:%s:%s:%s:%s"\ | |
180 % (args.reference, tbed.chrom, | |
181 tbed.thickStart, tbed.thickEnd, tbed.strand) | |
182 fa_desc = '%s%s' % (args.fa_sep, location) | |
183 fa_db = '%s%s' % (args.fa_db, args.fa_sep) if args.fa_db else '' | |
184 fa_id = ">%s%s%s\n" % (fa_db, tbed.name, fa_desc) | |
185 fa_wtr.write(fa_id) | |
186 fa_wtr.write(peptide) | |
187 fa_wtr.write("\n") | |
188 fa_wtr.flush() | |
189 | |
190 def translate_bed(bed): | |
191 translate_count = 0 | |
192 transcript_id = bed.name | |
193 refprot = None | |
194 if not bed.seq: | |
195 if twobit: | |
196 bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd) | |
197 else: | |
198 bed.cdna = get_cdna(transcript_id) | |
199 cdna = bed.get_cdna() | |
200 if cdna is not None: | |
201 cdna_len = len(cdna) | |
202 if args.cds or args.all: | |
203 try: | |
204 cds = bed.get_cds() | |
205 if cds: | |
206 if args.debug: | |
207 print("cdna:%s" % str(cdna), file=sys.stderr) | |
208 print("cds: %s" % str(cds), file=sys.stderr) | |
209 if len(cds) % 3 != 0: | |
210 cds = cds[:-(len(cds) % 3)] | |
211 refprot = translate(cds) if cds else None | |
212 except: | |
213 refprot = None | |
214 if args.cds: | |
215 if refprot: | |
216 tbed = bed.get_cds_bed() | |
217 if args.start_codon: | |
218 m = refprot.find('M') | |
219 if m < 0: | |
220 return 0 | |
221 elif m > 0: | |
222 bed.trim_cds(m*3) | |
223 refprot = refprot[m:] | |
224 stop = refprot.find('*') | |
225 if stop >= 0: | |
226 bed.trim_cds((stop - len(refprot)) * 3) | |
227 refprot = refprot[:stop] | |
228 if len(refprot) >= args.min_length: | |
229 write_translation(tbed, bed.name, refprot) | |
230 return 1 | |
231 return 0 | |
232 if args.debug: | |
233 print("%s\n" % (str(bed)), file=sys.stderr) | |
234 print("CDS: %s %d %d" % | |
235 (bed.strand, bed.cdna_offset_of_pos(bed.thickStart), | |
236 bed.cdna_offset_of_pos(bed.thickEnd)), | |
237 file=sys.stderr) | |
238 print("refprot: %s" % str(refprot), file=sys.stderr) | |
239 for offset in range(3): | |
240 seqend = cdna_len - (cdna_len - offset) % 3 | |
241 aaseq = translate(cdna[offset:seqend]) | |
242 aa_start = 0 | |
243 while aa_start < len(aaseq): | |
244 aa_end = aaseq.find('*', aa_start) | |
245 if aa_end < 0: | |
246 aa_end = len(aaseq) | |
247 prot = aaseq[aa_start:aa_end] | |
248 if args.start_codon: | |
249 m = prot.find('M') | |
250 aa_start += m if m >= 0 else aa_end | |
251 prot = aaseq[aa_start:aa_end] | |
252 if enzyme and refprot: | |
253 frags = digest._cleave(prot, enzyme) | |
254 for frag in reversed(frags): | |
255 if frag in refprot: | |
256 prot = prot[:prot.rfind(frag)] | |
257 else: | |
258 break | |
259 is_cds = refprot and prot in refprot | |
260 if args.debug: | |
261 print("is_cds: %s %s" % (str(is_cds), str(prot)), | |
262 file=sys.stderr) | |
263 if len(prot) < args.min_length: | |
264 pass | |
265 elif not args.all and is_cds: | |
266 pass | |
267 else: | |
268 tstart = aa_start*3+offset | |
269 tend = aa_end*3+offset | |
270 prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend) | |
271 tbed = bed.trim(tstart, tend) | |
272 if args.all or unique_prot(tbed, prot): | |
273 translate_count += 1 | |
274 tbed.name = prot_acc | |
275 write_translation(tbed, bed.name, prot) | |
276 aa_start = aa_end + 1 | |
277 return translate_count | |
278 | |
279 if input_rdr: | |
280 translation_count = 0 | |
281 transcript_count = 0 | |
282 for i, bedline in enumerate(input_rdr): | |
283 try: | |
284 bed = bed_from_line(bedline, ensembl=args.ensembl, | |
285 seq_column=args.column) | |
286 if bed is None: | |
287 continue | |
288 transcript_count += 1 | |
289 if bed.biotype and biotypea and bed.biotype not in biotypea: | |
290 continue | |
291 if filter_by_regions(bed): | |
292 translation_count += translate_bed(bed) | |
293 except Exception as e: | |
294 print("BED format Error: line %d: %s\n%s" | |
295 % (i, bedline, e), file=sys.stderr) | |
296 break | |
297 if args.debug or args.verbose: | |
298 print("transcripts: %d\ttranslations: %d" | |
299 % (transcript_count, translation_count), file=sys.stderr) | |
300 | |
301 | |
302 if __name__ == "__main__": | |
303 __main__() |