comparison translate_bed.py @ 0:038ecf54cbec draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
author galaxyp
date Mon, 22 Jan 2018 13:59:27 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:038ecf54cbec
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2017, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 from __future__ import print_function
16
17 import argparse
18 import re
19 import sys
20
21 from Bio.Seq import translate
22
23 from bedutil import bed_from_line
24
25 import digest
26
27 from ensembl_rest import get_cdna
28
29 from twobitreader import TwoBitFile
30
31
32 def __main__():
33 parser = argparse.ArgumentParser(
34 description='Translate from BED')
35 parser.add_argument(
36 'input_bed', default=None,
37 help="BED to translate, '-' for stdin")
38 pg_seq = parser.add_argument_group('Genomic sequence source')
39 pg_seq.add_argument(
40 '-t', '--twobit', default=None,
41 help='Genome reference sequence in 2bit format')
42 pg_seq.add_argument(
43 '-c', '--column', type=int, default=None,
44 help='Column offset containing genomic sequence' +
45 'between start and stop (-1) for last column')
46 pg_out = parser.add_argument_group('Output options')
47 pg_out.add_argument(
48 '-f', '--fasta', default=None,
49 help='Path to output translations.fasta')
50 pg_out.add_argument(
51 '-b', '--bed', default=None,
52 help='Path to output translations.bed')
53 pg_bed = parser.add_argument_group('BED filter options')
54 pg_bed.add_argument(
55 '-E', '--ensembl', action='store_true', default=False,
56 help='Input BED is in 20 column Ensembl format')
57 pg_bed.add_argument(
58 '-R', '--regions', action='append', default=[],
59 help='Filter input by regions e.g.:'
60 + ' X,2:20000-25000,3:100-500+')
61 pg_bed.add_argument(
62 '-B', '--biotypes', action='append', default=[],
63 help='For Ensembl BED restrict translations to Ensembl biotypes')
64 pg_trans = parser.add_argument_group('Translation filter options')
65 pg_trans.add_argument(
66 '-m', '--min_length', type=int, default=10,
67 help='Minimum length of protein translation to report')
68 pg_trans.add_argument(
69 '-e', '--enzyme', default=None,
70 help='Digest translation with enzyme')
71 pg_trans.add_argument(
72 '-M', '--start_codon', action='store_true', default=False,
73 help='Trim translations to methionine start_codon')
74 pg_trans.add_argument(
75 '-C', '--cds', action='store_true', default=False,
76 help='Only translate CDS')
77 pg_trans.add_argument(
78 '-A', '--all', action='store_true',
79 help='Include CDS protein translations ')
80 pg_fmt = parser.add_argument_group('ID format options')
81 pg_fmt.add_argument(
82 '-r', '--reference', default='',
83 help='Genome Reference Name')
84 pg_fmt.add_argument(
85 '-D', '--fa_db', dest='fa_db', default=None,
86 help='Prefix DB identifier for fasta ID line, e.g. generic')
87 pg_fmt.add_argument(
88 '-s', '--fa_sep', dest='fa_sep', default='|',
89 help='fasta ID separator defaults to pipe char, ' +
90 'e.g. generic|ProtID|description')
91 pg_fmt.add_argument(
92 '-P', '--id_prefix', default='',
93 help='prefix for the sequence ID')
94 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
95 parser.add_argument('-d', '--debug', action='store_true', help='Debug')
96 args = parser.parse_args()
97
98 input_rdr = open(args.input_bed, 'r')\
99 if args.input_bed != '-' else sys.stdin
100 fa_wtr = open(args.fasta, 'w')\
101 if args.fasta is not None and args.fasta != '-' else sys.stdout
102 bed_wtr = open(args.bed, 'w') if args.bed is not None else None
103
104 enzyme = digest.expasy_rules.get(args.enzyme, None)
105
106 biotypea = [bt.strip() for biotype in args.biotypes
107 for bt in biotype.split(',')]
108
109 twobit = TwoBitFile(args.twobit) if args.twobit else None
110
111 selected_regions = dict() # chrom:(start, end)
112 region_pat = '^(?:chr)?([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
113 if args.regions:
114 for entry in args.regions:
115 if not entry:
116 continue
117 regs = [x.strip() for x in entry.split(',') if x.strip()]
118 for reg in regs:
119 m = re.match(region_pat, reg)
120 if m:
121 (chrom, start, end, strand) = m.groups()
122 if chrom:
123 if chrom not in selected_regions:
124 selected_regions[chrom] = []
125 selected_regions[chrom].append([start, end, strand])
126 if args.debug:
127 print("selected_regions: %s" % selected_regions, file=sys.stderr)
128
129 def filter_by_regions(bed):
130 if not selected_regions:
131 return True
132 ref = re.sub('^(?i)chr', '', bed.chrom)
133 if ref not in selected_regions:
134 return False
135 for reg in selected_regions[ref]:
136 (_start, _stop, _strand) = reg
137 start = int(_start) if _start else 0
138 stop = int(_stop) if _stop else sys.maxint
139 if _strand and bed.strand != _strand:
140 continue
141 if bed.chromEnd >= start and bed.chromStart <= stop:
142 return True
143 return False
144
145 translations = dict() # start : end : seq
146
147 def unique_prot(tbed, seq):
148 if tbed.chromStart not in translations:
149 translations[tbed.chromStart] = dict()
150 translations[tbed.chromStart][tbed.chromEnd] = []
151 translations[tbed.chromStart][tbed.chromEnd].append(seq)
152 elif tbed.chromEnd not in translations[tbed.chromStart]:
153 translations[tbed.chromStart][tbed.chromEnd] = []
154 translations[tbed.chromStart][tbed.chromEnd].append(seq)
155 elif seq not in translations[tbed.chromStart][tbed.chromEnd]:
156 translations[tbed.chromStart][tbed.chromEnd].append(seq)
157 else:
158 return False
159 return True
160
161 def get_sequence(chrom, start, end):
162 if twobit:
163 if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
164 return twobit[chrom][start:end]
165 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
166 if contig in twobit and 0 <= start < end < len(twobit[contig]):
167 return twobit[contig][start:end]
168 return None
169
170 def write_translation(tbed, accession, peptide):
171 if args.id_prefix:
172 tbed.name = "%s%s" % (args.id_prefix, tbed.name)
173 probed = "%s\t%s\t%s\t%s%s" % (accession, peptide,
174 'unique', args.reference,
175 '\t.' * 9)
176 if bed_wtr:
177 bed_wtr.write("%s\t%s\n" % (str(tbed), probed))
178 bed_wtr.flush()
179 location = "chromosome:%s:%s:%s:%s:%s"\
180 % (args.reference, tbed.chrom,
181 tbed.thickStart, tbed.thickEnd, tbed.strand)
182 fa_desc = '%s%s' % (args.fa_sep, location)
183 fa_db = '%s%s' % (args.fa_db, args.fa_sep) if args.fa_db else ''
184 fa_id = ">%s%s%s\n" % (fa_db, tbed.name, fa_desc)
185 fa_wtr.write(fa_id)
186 fa_wtr.write(peptide)
187 fa_wtr.write("\n")
188 fa_wtr.flush()
189
190 def translate_bed(bed):
191 translate_count = 0
192 transcript_id = bed.name
193 refprot = None
194 if not bed.seq:
195 if twobit:
196 bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd)
197 else:
198 bed.cdna = get_cdna(transcript_id)
199 cdna = bed.get_cdna()
200 if cdna is not None:
201 cdna_len = len(cdna)
202 if args.cds or args.all:
203 try:
204 cds = bed.get_cds()
205 if cds:
206 if args.debug:
207 print("cdna:%s" % str(cdna), file=sys.stderr)
208 print("cds: %s" % str(cds), file=sys.stderr)
209 if len(cds) % 3 != 0:
210 cds = cds[:-(len(cds) % 3)]
211 refprot = translate(cds) if cds else None
212 except:
213 refprot = None
214 if args.cds:
215 if refprot:
216 tbed = bed.get_cds_bed()
217 if args.start_codon:
218 m = refprot.find('M')
219 if m < 0:
220 return 0
221 elif m > 0:
222 bed.trim_cds(m*3)
223 refprot = refprot[m:]
224 stop = refprot.find('*')
225 if stop >= 0:
226 bed.trim_cds((stop - len(refprot)) * 3)
227 refprot = refprot[:stop]
228 if len(refprot) >= args.min_length:
229 write_translation(tbed, bed.name, refprot)
230 return 1
231 return 0
232 if args.debug:
233 print("%s\n" % (str(bed)), file=sys.stderr)
234 print("CDS: %s %d %d" %
235 (bed.strand, bed.cdna_offset_of_pos(bed.thickStart),
236 bed.cdna_offset_of_pos(bed.thickEnd)),
237 file=sys.stderr)
238 print("refprot: %s" % str(refprot), file=sys.stderr)
239 for offset in range(3):
240 seqend = cdna_len - (cdna_len - offset) % 3
241 aaseq = translate(cdna[offset:seqend])
242 aa_start = 0
243 while aa_start < len(aaseq):
244 aa_end = aaseq.find('*', aa_start)
245 if aa_end < 0:
246 aa_end = len(aaseq)
247 prot = aaseq[aa_start:aa_end]
248 if args.start_codon:
249 m = prot.find('M')
250 aa_start += m if m >= 0 else aa_end
251 prot = aaseq[aa_start:aa_end]
252 if enzyme and refprot:
253 frags = digest._cleave(prot, enzyme)
254 for frag in reversed(frags):
255 if frag in refprot:
256 prot = prot[:prot.rfind(frag)]
257 else:
258 break
259 is_cds = refprot and prot in refprot
260 if args.debug:
261 print("is_cds: %s %s" % (str(is_cds), str(prot)),
262 file=sys.stderr)
263 if len(prot) < args.min_length:
264 pass
265 elif not args.all and is_cds:
266 pass
267 else:
268 tstart = aa_start*3+offset
269 tend = aa_end*3+offset
270 prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
271 tbed = bed.trim(tstart, tend)
272 if args.all or unique_prot(tbed, prot):
273 translate_count += 1
274 tbed.name = prot_acc
275 write_translation(tbed, bed.name, prot)
276 aa_start = aa_end + 1
277 return translate_count
278
279 if input_rdr:
280 translation_count = 0
281 transcript_count = 0
282 for i, bedline in enumerate(input_rdr):
283 try:
284 bed = bed_from_line(bedline, ensembl=args.ensembl,
285 seq_column=args.column)
286 if bed is None:
287 continue
288 transcript_count += 1
289 if bed.biotype and biotypea and bed.biotype not in biotypea:
290 continue
291 if filter_by_regions(bed):
292 translation_count += translate_bed(bed)
293 except Exception as e:
294 print("BED format Error: line %d: %s\n%s"
295 % (i, bedline, e), file=sys.stderr)
296 break
297 if args.debug or args.verbose:
298 print("transcripts: %d\ttranslations: %d"
299 % (transcript_count, translation_count), file=sys.stderr)
300
301
302 if __name__ == "__main__":
303 __main__()