Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-frame @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 #This script is part of the pyPRADA package. It is used to predict the functional consequences of | |
4 #gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc. | |
5 #It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction | |
6 #for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion | |
7 #lead to reading shift of the CDS sequences. | |
8 #Two files are generated, brief/detail result. | |
9 #Result is purely predicted, no guarantee is assumed. | |
10 #Contact: Siyuan Zheng, szheng2@mdanderson.org | |
11 | |
12 import os | |
13 import sys | |
14 import re | |
15 import time | |
16 import subprocess | |
17 import bioclass | |
18 from Bio import SeqIO | |
19 import ioprada | |
20 | |
21 ###################################################################################### | |
22 help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./ | |
23 **parameters** | |
24 -conf see prada-fusion -conf | |
25 -i input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break" | |
26 Example:FGFR3 1808661 TACC3 1739325 | |
27 -docstr outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt | |
28 -outdir output directory, default is ./ | |
29 ''' | |
30 | |
31 args=sys.argv | |
32 if '-h' in args or '-help' in args or len(args)==1: | |
33 print help_menu | |
34 sys.exit(0) | |
35 | |
36 if '-i' not in sys.argv: | |
37 sys.exit('Input file needed') | |
38 else: | |
39 i=sys.argv.index('-i') | |
40 datafilename=sys.argv[i+1] | |
41 | |
42 if '-outdir' not in args: | |
43 outpath=os.path.abspath('./') | |
44 else: | |
45 i=args.index('-outdir') | |
46 outpath=os.path.abspath(args[i+1]) | |
47 if os.path.exists(outpath): | |
48 print 'WARNING: outdir %s exists'%outpath | |
49 else: | |
50 os.mkdir(outpath) | |
51 | |
52 if '-docstr' not in sys.argv: | |
53 sys.exit('docstring needed') | |
54 else: | |
55 i=sys.argv.index('-docstr') | |
56 outfile_prefix=sys.argv[i+1] | |
57 outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix) | |
58 outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix) | |
59 | |
60 #PRADA executive path | |
61 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
62 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
63 | |
64 if '-ref' in args: | |
65 i=args.index('-conf') | |
66 reffile=args[i+1] | |
67 if os.path.exists(reffile): | |
68 pass | |
69 else: | |
70 for pth in ref_search_path: | |
71 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
72 if os.path.exists(new_reffile): | |
73 reffile=new_reffile | |
74 break | |
75 else: | |
76 sys.exit('ERROR: conf file %s not found'%reffile) | |
77 else: | |
78 reffile='%s/conf.txt'%prada_path | |
79 if not os.path.exists(reffile): | |
80 sys.exit('ERROR: No default conf.txt found and none specified') | |
81 | |
82 #read reference files | |
83 refdict=ioprada.read_conf(reffile) | |
84 featurefile=refdict['--REF--']['feature_file'] | |
85 cdsfile=refdict['--REF--']['cds_file'] | |
86 txcatfile=refdict['--REF--']['txcat_file'] | |
87 | |
88 #Now print all input parameters | |
89 print 'CMD: %s'%('\t'.join(args)) | |
90 | |
91 ########################################################################################### | |
92 ##Read information from annotation files, for all genes | |
93 print 'Reading annotations' | |
94 #call functions in ioprada module // | |
95 txdb,genedb=ioprada.read_feature(featurefile,verbose=False) | |
96 tx_cds=ioprada.read_cds(cdsfile) | |
97 tx_primary=ioprada.read_tx_cat(txcatfile) | |
98 | |
99 ########################################################################################### | |
100 def maptranscript(gene,exon_end,part=''): | |
101 '''Find all transcripts of the gene having the fusion boundary. | |
102 gene is a Gene object,return a list of Transcript objects''' | |
103 if part not in ['5','3']: | |
104 raise Exception('part must be "5" or "3"') | |
105 txuse=[] ## | |
106 g=gene.name | |
107 strand=gene.strand | |
108 if part=='5': | |
109 tag='big' if strand=='1' else 'small' | |
110 else: | |
111 tag='small' if strand=='1' else 'big' | |
112 txs=gene.transcript | |
113 if tag=='small': | |
114 for tx in txs: | |
115 e_pos=[x.start for x in tx.exon] | |
116 if exon_end in e_pos: | |
117 txuse.append(tx) | |
118 elif tag=='big': | |
119 for tx in txs: | |
120 e_pos=[x.end for x in tx.exon] | |
121 if exon_end in e_pos: | |
122 txuse.append(tx) | |
123 return txuse | |
124 | |
125 def overlap(r1,r2): | |
126 '''compare two ranges, return the overlapped''' | |
127 if r1[0] <= r2[0]: | |
128 ra,rb=r1[:],r2[:] | |
129 else: | |
130 ra,rb=r2[:],r1[:] | |
131 if rb[0] > ra[1]: | |
132 return [] | |
133 else: | |
134 if rb[1] <= ra[1]: | |
135 return rb | |
136 else: | |
137 return [rb[0],ra[1]] | |
138 | |
139 def br_front_len(tx,br,part): | |
140 '''No matter it is 5'/3' partner,calculate the ORF length before the break''' | |
141 global tx_cds | |
142 txname=tx.name | |
143 strand=tx.strand | |
144 cds_start,cds_end=[int(x) for x in tx_cds[txname]] | |
145 e_pos=[[x.start,x.end] for x in tx.exon] | |
146 if part=='5' and strand=='1': | |
147 posset=[x.end for x in tx.exon] | |
148 if br not in posset: | |
149 raise Exception('Br is not exon boundary') | |
150 if br < cds_start: | |
151 return '5UTR' | |
152 elif br > cds_end: | |
153 return '3UTR' | |
154 else: | |
155 chim_r=[cds_start,br] | |
156 ol=[overlap(x,chim_r) for x in e_pos] | |
157 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
158 return L | |
159 if part=='5' and strand=='-1': | |
160 posset=[x.start for x in tx.exon] | |
161 if br not in posset: | |
162 raise Exception('Br is not exon boundary') | |
163 if br > cds_end: | |
164 return '5UTR' | |
165 elif br < cds_start: | |
166 return '3UTR' | |
167 else: | |
168 chim_r=[br,cds_end] | |
169 ol=[overlap(x,chim_r) for x in e_pos] | |
170 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
171 return L | |
172 if part=='3' and strand=='1': | |
173 posset=[x.start for x in tx.exon] | |
174 if br not in posset: | |
175 raise Exception('Br is not exon boundary') | |
176 if br < cds_start: | |
177 return '5UTR' | |
178 elif br > cds_end: | |
179 return '3UTR' | |
180 else: | |
181 chim_r=[cds_start,br-1] | |
182 ol=[overlap(x,chim_r) for x in e_pos] | |
183 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
184 return L | |
185 if part=='3' and strand=='-1': | |
186 posset=[x.end for x in tx.exon] | |
187 if br not in posset: | |
188 raise Exception('Br is not exon boundary') | |
189 if br > cds_end: | |
190 return '5UTR' | |
191 elif br < cds_start: | |
192 return '3UTR' | |
193 else: | |
194 chim_r=[br+1,cds_end] | |
195 ol=[overlap(x,chim_r) for x in e_pos] | |
196 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
197 return L | |
198 | |
199 def fusion_frame(gene_a,ga_br,gene_b,gb_br): | |
200 '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 | |
201 or chr4.1808661.chr4.1741429''' | |
202 global genedb,txdb | |
203 ga_br=int(ga_br) | |
204 gb_br=int(gb_br) | |
205 ga_txs=maptranscript(genedb[gene_a],ga_br,part='5') | |
206 gb_txs=maptranscript(genedb[gene_b],gb_br,part='3') | |
207 res=[] | |
208 for ta in ga_txs: | |
209 for tb in gb_txs: | |
210 s1=br_front_len(ta,ga_br,'5') | |
211 s2=br_front_len(tb,gb_br,'3') | |
212 fusion_conseq='Unknown' | |
213 if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region | |
214 if s1%3==s2%3: | |
215 fusion_conseq='In-frame' | |
216 else: | |
217 fusion_conseq='Out-of-frame' | |
218 elif isinstance(s1,int) and not isinstance(s2,int): | |
219 fusion_conseq='CDS'+'-'+s2 | |
220 elif isinstance(s2,int) and not isinstance(s1,int): | |
221 fusion_conseq=s1+'-'+'CDS' | |
222 else: | |
223 fusion_conseq=s1+'-'+s2 | |
224 res.append((ta.name,tb.name,fusion_conseq)) | |
225 if ga_txs==[] and gb_txs==[]: | |
226 res.append(['NA','NA','NA']) | |
227 elif ga_txs==[]: | |
228 for tb in gb_txs: | |
229 res.append(['NA',tb.name,'NA']) | |
230 elif gb_txs==[]: | |
231 for ta in ga_txs: | |
232 res.append([ta.name,'NA','NA']) | |
233 return res | |
234 | |
235 ############################################################################ | |
236 print 'Predicting...' | |
237 infile=open(datafilename) | |
238 outfile_detail=open(outfile_detail,'w') | |
239 outfile_brief=open(outfile_brief,'w') | |
240 detailM=[] | |
241 briefM=[] | |
242 for line in infile: | |
243 if not line.strip(): | |
244 continue | |
245 geneA,ga_br,geneB,gb_br=line.split() | |
246 try: | |
247 M=fusion_frame(geneA,ga_br,geneB,gb_br) | |
248 except: | |
249 M=[] | |
250 ppcollected=0 | |
251 for ccc in M: | |
252 a_cat,b_cat='alternative','alternative' | |
253 if tx_primary[geneA]==ccc[0]: | |
254 a_cat='primary' | |
255 if tx_primary[geneB]==ccc[1]: | |
256 b_cat='primary' | |
257 tag='%s_%s'%(a_cat,b_cat) | |
258 if ccc[0]=='NA' or ccc[1]=='NA': | |
259 tag='NA' | |
260 row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence | |
261 detailM.append(row_d) | |
262 if tag=='primary_primary': | |
263 row_b=[geneA,ga_br,geneB,gb_br,ccc[2]] | |
264 ppcollected=1 | |
265 if ppcollected==0: | |
266 row_b=[geneA,ga_br,geneB,gb_br,'not-classified'] | |
267 briefM.append(row_b) | |
268 infile.close() | |
269 | |
270 for line in detailM: | |
271 outfile_detail.write('%s\n'%('\t'.join(line))) | |
272 outfile_detail.close() | |
273 | |
274 for line in briefM: | |
275 outfile_brief.write('%s\n'%('\t'.join(line))) | |
276 outfile_brief.close() | |
277 | |
278 print 'Done!' |