Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/prada-frame @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/prada-frame Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,278 @@ +#!/usr/bin/env python + +#This script is part of the pyPRADA package. It is used to predict the functional consequences of +#gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc. +#It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction +#for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion +#lead to reading shift of the CDS sequences. +#Two files are generated, brief/detail result. +#Result is purely predicted, no guarantee is assumed. +#Contact: Siyuan Zheng, szheng2@mdanderson.org + +import os +import sys +import re +import time +import subprocess +import bioclass +from Bio import SeqIO +import ioprada + +###################################################################################### +help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./ +**parameters** + -conf see prada-fusion -conf + -i input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break" + Example:FGFR3 1808661 TACC3 1739325 + -docstr outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt + -outdir output directory, default is ./ +''' + +args=sys.argv +if '-h' in args or '-help' in args or len(args)==1: + print help_menu + sys.exit(0) + +if '-i' not in sys.argv: + sys.exit('Input file needed') +else: + i=sys.argv.index('-i') + datafilename=sys.argv[i+1] + +if '-outdir' not in args: + outpath=os.path.abspath('./') +else: + i=args.index('-outdir') + outpath=os.path.abspath(args[i+1]) +if os.path.exists(outpath): + print 'WARNING: outdir %s exists'%outpath +else: + os.mkdir(outpath) + +if '-docstr' not in sys.argv: + sys.exit('docstring needed') +else: + i=sys.argv.index('-docstr') + outfile_prefix=sys.argv[i+1] + outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix) + outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix) + +#PRADA executive path +prada_path=os.path.dirname(os.path.abspath(__file__)) #### +ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line + +if '-ref' in args: + i=args.index('-conf') + reffile=args[i+1] + if os.path.exists(reffile): + pass + else: + for pth in ref_search_path: + new_reffile='%s/%s'%(pth, os.path.basename(reffile)) + if os.path.exists(new_reffile): + reffile=new_reffile + break + else: + sys.exit('ERROR: conf file %s not found'%reffile) +else: + reffile='%s/conf.txt'%prada_path + if not os.path.exists(reffile): + sys.exit('ERROR: No default conf.txt found and none specified') + +#read reference files +refdict=ioprada.read_conf(reffile) +featurefile=refdict['--REF--']['feature_file'] +cdsfile=refdict['--REF--']['cds_file'] +txcatfile=refdict['--REF--']['txcat_file'] + +#Now print all input parameters +print 'CMD: %s'%('\t'.join(args)) + +########################################################################################### +##Read information from annotation files, for all genes +print 'Reading annotations' +#call functions in ioprada module // +txdb,genedb=ioprada.read_feature(featurefile,verbose=False) +tx_cds=ioprada.read_cds(cdsfile) +tx_primary=ioprada.read_tx_cat(txcatfile) + +########################################################################################### +def maptranscript(gene,exon_end,part=''): + '''Find all transcripts of the gene having the fusion boundary. + gene is a Gene object,return a list of Transcript objects''' + if part not in ['5','3']: + raise Exception('part must be "5" or "3"') + txuse=[] ## + g=gene.name + strand=gene.strand + if part=='5': + tag='big' if strand=='1' else 'small' + else: + tag='small' if strand=='1' else 'big' + txs=gene.transcript + if tag=='small': + for tx in txs: + e_pos=[x.start for x in tx.exon] + if exon_end in e_pos: + txuse.append(tx) + elif tag=='big': + for tx in txs: + e_pos=[x.end for x in tx.exon] + if exon_end in e_pos: + txuse.append(tx) + return txuse + +def overlap(r1,r2): + '''compare two ranges, return the overlapped''' + if r1[0] <= r2[0]: + ra,rb=r1[:],r2[:] + else: + ra,rb=r2[:],r1[:] + if rb[0] > ra[1]: + return [] + else: + if rb[1] <= ra[1]: + return rb + else: + return [rb[0],ra[1]] + +def br_front_len(tx,br,part): + '''No matter it is 5'/3' partner,calculate the ORF length before the break''' + global tx_cds + txname=tx.name + strand=tx.strand + cds_start,cds_end=[int(x) for x in tx_cds[txname]] + e_pos=[[x.start,x.end] for x in tx.exon] + if part=='5' and strand=='1': + posset=[x.end for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br < cds_start: + return '5UTR' + elif br > cds_end: + return '3UTR' + else: + chim_r=[cds_start,br] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + if part=='5' and strand=='-1': + posset=[x.start for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br > cds_end: + return '5UTR' + elif br < cds_start: + return '3UTR' + else: + chim_r=[br,cds_end] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + if part=='3' and strand=='1': + posset=[x.start for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br < cds_start: + return '5UTR' + elif br > cds_end: + return '3UTR' + else: + chim_r=[cds_start,br-1] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + if part=='3' and strand=='-1': + posset=[x.end for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br > cds_end: + return '5UTR' + elif br < cds_start: + return '3UTR' + else: + chim_r=[br+1,cds_end] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + +def fusion_frame(gene_a,ga_br,gene_b,gb_br): + '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 + or chr4.1808661.chr4.1741429''' + global genedb,txdb + ga_br=int(ga_br) + gb_br=int(gb_br) + ga_txs=maptranscript(genedb[gene_a],ga_br,part='5') + gb_txs=maptranscript(genedb[gene_b],gb_br,part='3') + res=[] + for ta in ga_txs: + for tb in gb_txs: + s1=br_front_len(ta,ga_br,'5') + s2=br_front_len(tb,gb_br,'3') + fusion_conseq='Unknown' + if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region + if s1%3==s2%3: + fusion_conseq='In-frame' + else: + fusion_conseq='Out-of-frame' + elif isinstance(s1,int) and not isinstance(s2,int): + fusion_conseq='CDS'+'-'+s2 + elif isinstance(s2,int) and not isinstance(s1,int): + fusion_conseq=s1+'-'+'CDS' + else: + fusion_conseq=s1+'-'+s2 + res.append((ta.name,tb.name,fusion_conseq)) + if ga_txs==[] and gb_txs==[]: + res.append(['NA','NA','NA']) + elif ga_txs==[]: + for tb in gb_txs: + res.append(['NA',tb.name,'NA']) + elif gb_txs==[]: + for ta in ga_txs: + res.append([ta.name,'NA','NA']) + return res + +############################################################################ +print 'Predicting...' +infile=open(datafilename) +outfile_detail=open(outfile_detail,'w') +outfile_brief=open(outfile_brief,'w') +detailM=[] +briefM=[] +for line in infile: + if not line.strip(): + continue + geneA,ga_br,geneB,gb_br=line.split() + try: + M=fusion_frame(geneA,ga_br,geneB,gb_br) + except: + M=[] + ppcollected=0 + for ccc in M: + a_cat,b_cat='alternative','alternative' + if tx_primary[geneA]==ccc[0]: + a_cat='primary' + if tx_primary[geneB]==ccc[1]: + b_cat='primary' + tag='%s_%s'%(a_cat,b_cat) + if ccc[0]=='NA' or ccc[1]=='NA': + tag='NA' + row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence + detailM.append(row_d) + if tag=='primary_primary': + row_b=[geneA,ga_br,geneB,gb_br,ccc[2]] + ppcollected=1 + if ppcollected==0: + row_b=[geneA,ga_br,geneB,gb_br,'not-classified'] + briefM.append(row_b) +infile.close() + +for line in detailM: + outfile_detail.write('%s\n'%('\t'.join(line))) +outfile_detail.close() + +for line in briefM: + outfile_brief.write('%s\n'%('\t'.join(line))) +outfile_brief.close() + +print 'Done!'