0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 #This script is part of the pyPRADA package. It is used to predict the functional consequences of
|
|
4 #gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc.
|
|
5 #It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction
|
|
6 #for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion
|
|
7 #lead to reading shift of the CDS sequences.
|
|
8 #Two files are generated, brief/detail result.
|
|
9 #Result is purely predicted, no guarantee is assumed.
|
|
10 #Contact: Siyuan Zheng, szheng2@mdanderson.org
|
|
11
|
|
12 import os
|
|
13 import sys
|
|
14 import re
|
|
15 import time
|
|
16 import subprocess
|
|
17 import bioclass
|
|
18 from Bio import SeqIO
|
|
19 import ioprada
|
|
20
|
|
21 ######################################################################################
|
|
22 help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./
|
|
23 **parameters**
|
|
24 -conf see prada-fusion -conf
|
|
25 -i input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break"
|
|
26 Example:FGFR3 1808661 TACC3 1739325
|
|
27 -docstr outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt
|
|
28 -outdir output directory, default is ./
|
|
29 '''
|
|
30
|
|
31 args=sys.argv
|
|
32 if '-h' in args or '-help' in args or len(args)==1:
|
|
33 print help_menu
|
|
34 sys.exit(0)
|
|
35
|
|
36 if '-i' not in sys.argv:
|
|
37 sys.exit('Input file needed')
|
|
38 else:
|
|
39 i=sys.argv.index('-i')
|
|
40 datafilename=sys.argv[i+1]
|
|
41
|
|
42 if '-outdir' not in args:
|
|
43 outpath=os.path.abspath('./')
|
|
44 else:
|
|
45 i=args.index('-outdir')
|
|
46 outpath=os.path.abspath(args[i+1])
|
|
47 if os.path.exists(outpath):
|
|
48 print 'WARNING: outdir %s exists'%outpath
|
|
49 else:
|
|
50 os.mkdir(outpath)
|
|
51
|
|
52 if '-docstr' not in sys.argv:
|
|
53 sys.exit('docstring needed')
|
|
54 else:
|
|
55 i=sys.argv.index('-docstr')
|
|
56 outfile_prefix=sys.argv[i+1]
|
|
57 outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix)
|
|
58 outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix)
|
|
59
|
|
60 #PRADA executive path
|
|
61 prada_path=os.path.dirname(os.path.abspath(__file__)) ####
|
|
62 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line
|
|
63
|
|
64 if '-ref' in args:
|
|
65 i=args.index('-conf')
|
|
66 reffile=args[i+1]
|
|
67 if os.path.exists(reffile):
|
|
68 pass
|
|
69 else:
|
|
70 for pth in ref_search_path:
|
|
71 new_reffile='%s/%s'%(pth, os.path.basename(reffile))
|
|
72 if os.path.exists(new_reffile):
|
|
73 reffile=new_reffile
|
|
74 break
|
|
75 else:
|
|
76 sys.exit('ERROR: conf file %s not found'%reffile)
|
|
77 else:
|
|
78 reffile='%s/conf.txt'%prada_path
|
|
79 if not os.path.exists(reffile):
|
|
80 sys.exit('ERROR: No default conf.txt found and none specified')
|
|
81
|
|
82 #read reference files
|
|
83 refdict=ioprada.read_conf(reffile)
|
|
84 featurefile=refdict['--REF--']['feature_file']
|
|
85 cdsfile=refdict['--REF--']['cds_file']
|
|
86 txcatfile=refdict['--REF--']['txcat_file']
|
|
87
|
|
88 #Now print all input parameters
|
|
89 print 'CMD: %s'%('\t'.join(args))
|
|
90
|
|
91 ###########################################################################################
|
|
92 ##Read information from annotation files, for all genes
|
|
93 print 'Reading annotations'
|
|
94 #call functions in ioprada module //
|
|
95 txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
|
|
96 tx_cds=ioprada.read_cds(cdsfile)
|
|
97 tx_primary=ioprada.read_tx_cat(txcatfile)
|
|
98
|
|
99 ###########################################################################################
|
|
100 def maptranscript(gene,exon_end,part=''):
|
|
101 '''Find all transcripts of the gene having the fusion boundary.
|
|
102 gene is a Gene object,return a list of Transcript objects'''
|
|
103 if part not in ['5','3']:
|
|
104 raise Exception('part must be "5" or "3"')
|
|
105 txuse=[] ##
|
|
106 g=gene.name
|
|
107 strand=gene.strand
|
|
108 if part=='5':
|
|
109 tag='big' if strand=='1' else 'small'
|
|
110 else:
|
|
111 tag='small' if strand=='1' else 'big'
|
|
112 txs=gene.transcript
|
|
113 if tag=='small':
|
|
114 for tx in txs:
|
|
115 e_pos=[x.start for x in tx.exon]
|
|
116 if exon_end in e_pos:
|
|
117 txuse.append(tx)
|
|
118 elif tag=='big':
|
|
119 for tx in txs:
|
|
120 e_pos=[x.end for x in tx.exon]
|
|
121 if exon_end in e_pos:
|
|
122 txuse.append(tx)
|
|
123 return txuse
|
|
124
|
|
125 def overlap(r1,r2):
|
|
126 '''compare two ranges, return the overlapped'''
|
|
127 if r1[0] <= r2[0]:
|
|
128 ra,rb=r1[:],r2[:]
|
|
129 else:
|
|
130 ra,rb=r2[:],r1[:]
|
|
131 if rb[0] > ra[1]:
|
|
132 return []
|
|
133 else:
|
|
134 if rb[1] <= ra[1]:
|
|
135 return rb
|
|
136 else:
|
|
137 return [rb[0],ra[1]]
|
|
138
|
|
139 def br_front_len(tx,br,part):
|
|
140 '''No matter it is 5'/3' partner,calculate the ORF length before the break'''
|
|
141 global tx_cds
|
|
142 txname=tx.name
|
|
143 strand=tx.strand
|
|
144 cds_start,cds_end=[int(x) for x in tx_cds[txname]]
|
|
145 e_pos=[[x.start,x.end] for x in tx.exon]
|
|
146 if part=='5' and strand=='1':
|
|
147 posset=[x.end for x in tx.exon]
|
|
148 if br not in posset:
|
|
149 raise Exception('Br is not exon boundary')
|
|
150 if br < cds_start:
|
|
151 return '5UTR'
|
|
152 elif br > cds_end:
|
|
153 return '3UTR'
|
|
154 else:
|
|
155 chim_r=[cds_start,br]
|
|
156 ol=[overlap(x,chim_r) for x in e_pos]
|
|
157 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
|
|
158 return L
|
|
159 if part=='5' and strand=='-1':
|
|
160 posset=[x.start for x in tx.exon]
|
|
161 if br not in posset:
|
|
162 raise Exception('Br is not exon boundary')
|
|
163 if br > cds_end:
|
|
164 return '5UTR'
|
|
165 elif br < cds_start:
|
|
166 return '3UTR'
|
|
167 else:
|
|
168 chim_r=[br,cds_end]
|
|
169 ol=[overlap(x,chim_r) for x in e_pos]
|
|
170 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
|
|
171 return L
|
|
172 if part=='3' and strand=='1':
|
|
173 posset=[x.start for x in tx.exon]
|
|
174 if br not in posset:
|
|
175 raise Exception('Br is not exon boundary')
|
|
176 if br < cds_start:
|
|
177 return '5UTR'
|
|
178 elif br > cds_end:
|
|
179 return '3UTR'
|
|
180 else:
|
|
181 chim_r=[cds_start,br-1]
|
|
182 ol=[overlap(x,chim_r) for x in e_pos]
|
|
183 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
|
|
184 return L
|
|
185 if part=='3' and strand=='-1':
|
|
186 posset=[x.end for x in tx.exon]
|
|
187 if br not in posset:
|
|
188 raise Exception('Br is not exon boundary')
|
|
189 if br > cds_end:
|
|
190 return '5UTR'
|
|
191 elif br < cds_start:
|
|
192 return '3UTR'
|
|
193 else:
|
|
194 chim_r=[br+1,cds_end]
|
|
195 ol=[overlap(x,chim_r) for x in e_pos]
|
|
196 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
|
|
197 return L
|
|
198
|
|
199 def fusion_frame(gene_a,ga_br,gene_b,gb_br):
|
|
200 '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
|
|
201 or chr4.1808661.chr4.1741429'''
|
|
202 global genedb,txdb
|
|
203 ga_br=int(ga_br)
|
|
204 gb_br=int(gb_br)
|
|
205 ga_txs=maptranscript(genedb[gene_a],ga_br,part='5')
|
|
206 gb_txs=maptranscript(genedb[gene_b],gb_br,part='3')
|
|
207 res=[]
|
|
208 for ta in ga_txs:
|
|
209 for tb in gb_txs:
|
|
210 s1=br_front_len(ta,ga_br,'5')
|
|
211 s2=br_front_len(tb,gb_br,'3')
|
|
212 fusion_conseq='Unknown'
|
|
213 if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region
|
|
214 if s1%3==s2%3:
|
|
215 fusion_conseq='In-frame'
|
|
216 else:
|
|
217 fusion_conseq='Out-of-frame'
|
|
218 elif isinstance(s1,int) and not isinstance(s2,int):
|
|
219 fusion_conseq='CDS'+'-'+s2
|
|
220 elif isinstance(s2,int) and not isinstance(s1,int):
|
|
221 fusion_conseq=s1+'-'+'CDS'
|
|
222 else:
|
|
223 fusion_conseq=s1+'-'+s2
|
|
224 res.append((ta.name,tb.name,fusion_conseq))
|
|
225 if ga_txs==[] and gb_txs==[]:
|
|
226 res.append(['NA','NA','NA'])
|
|
227 elif ga_txs==[]:
|
|
228 for tb in gb_txs:
|
|
229 res.append(['NA',tb.name,'NA'])
|
|
230 elif gb_txs==[]:
|
|
231 for ta in ga_txs:
|
|
232 res.append([ta.name,'NA','NA'])
|
|
233 return res
|
|
234
|
|
235 ############################################################################
|
|
236 print 'Predicting...'
|
|
237 infile=open(datafilename)
|
|
238 outfile_detail=open(outfile_detail,'w')
|
|
239 outfile_brief=open(outfile_brief,'w')
|
|
240 detailM=[]
|
|
241 briefM=[]
|
|
242 for line in infile:
|
|
243 if not line.strip():
|
|
244 continue
|
|
245 geneA,ga_br,geneB,gb_br=line.split()
|
|
246 try:
|
|
247 M=fusion_frame(geneA,ga_br,geneB,gb_br)
|
|
248 except:
|
|
249 M=[]
|
|
250 ppcollected=0
|
|
251 for ccc in M:
|
|
252 a_cat,b_cat='alternative','alternative'
|
|
253 if tx_primary[geneA]==ccc[0]:
|
|
254 a_cat='primary'
|
|
255 if tx_primary[geneB]==ccc[1]:
|
|
256 b_cat='primary'
|
|
257 tag='%s_%s'%(a_cat,b_cat)
|
|
258 if ccc[0]=='NA' or ccc[1]=='NA':
|
|
259 tag='NA'
|
|
260 row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence
|
|
261 detailM.append(row_d)
|
|
262 if tag=='primary_primary':
|
|
263 row_b=[geneA,ga_br,geneB,gb_br,ccc[2]]
|
|
264 ppcollected=1
|
|
265 if ppcollected==0:
|
|
266 row_b=[geneA,ga_br,geneB,gb_br,'not-classified']
|
|
267 briefM.append(row_b)
|
|
268 infile.close()
|
|
269
|
|
270 for line in detailM:
|
|
271 outfile_detail.write('%s\n'%('\t'.join(line)))
|
|
272 outfile_detail.close()
|
|
273
|
|
274 for line in briefM:
|
|
275 outfile_brief.write('%s\n'%('\t'.join(line)))
|
|
276 outfile_brief.close()
|
|
277
|
|
278 print 'Done!'
|