prada: pyPRADA_1.2/prada-frame comparison

comparison pyPRADA_1.2/prada-frame @ 0:acc2ca1a3ba4

Uploaded

author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:acc2ca1a3ba4
+#!/usr/bin/env python
+#This script is part of the pyPRADA package. It is used to predict the functional consequences of
+#gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc.
+#It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction
+#for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion
+#lead to reading shift of the CDS sequences.
+#Two files are generated, brief/detail result.
+#Result is purely predicted, no guarantee is assumed.
+#Contact: Siyuan Zheng, szheng2@mdanderson.org
+import os
+import sys
+import re
+import time
+import subprocess
+import bioclass
+from Bio import SeqIO
+import ioprada
+######################################################################################
+help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./
+**parameters**
+-conf	see prada-fusion -conf
+-i		input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break"
+		Example:FGFR3 1808661 TACC3 1739325
+-docstr	outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt
+-outdir	output directory, default is ./
+'''
+args=sys.argv
+if '-h' in args or '-help' in args or len(args)==1:
+print help_menu
+sys.exit(0)
+if '-i' not in sys.argv:
+sys.exit('Input file needed')
+else:
+i=sys.argv.index('-i')
+datafilename=sys.argv[i+1]
+if '-outdir' not in args:
+outpath=os.path.abspath('./')
+else:
+i=args.index('-outdir')
+outpath=os.path.abspath(args[i+1])
+if os.path.exists(outpath):
+print 'WARNING: outdir %s exists'%outpath
+else:
+os.mkdir(outpath)
+if '-docstr' not in sys.argv:
+sys.exit('docstring needed')
+else:
+i=sys.argv.index('-docstr')
+outfile_prefix=sys.argv[i+1]
+outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix)
+outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix)
+#PRADA executive path
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+if '-ref' in args:
+i=args.index('-conf')
+reffile=args[i+1]
+if os.path.exists(reffile):
+pass
+else:
+for pth in ref_search_path:
+new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+if os.path.exists(new_reffile):
+reffile=new_reffile
+break
+else:
+sys.exit('ERROR: conf file %s not found'%reffile)
+else:
+reffile='%s/conf.txt'%prada_path
+if not os.path.exists(reffile):
+sys.exit('ERROR: No default conf.txt found and none specified')
+#read reference files
+refdict=ioprada.read_conf(reffile)
+featurefile=refdict['--REF--']['feature_file']
+cdsfile=refdict['--REF--']['cds_file']
+txcatfile=refdict['--REF--']['txcat_file']
+#Now print all input parameters
+print 'CMD: %s'%('\t'.join(args))
+###########################################################################################
+##Read information from annotation files, for all genes
+print 'Reading annotations'
+#call functions in ioprada module //
+txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
+tx_cds=ioprada.read_cds(cdsfile)
+tx_primary=ioprada.read_tx_cat(txcatfile)
+###########################################################################################
+def maptranscript(gene,exon_end,part=''):
+'''Find all transcripts of the gene having the fusion boundary.
+gene is a Gene object,return a list of Transcript objects'''
+if part not in ['5','3']:
+raise Exception('part must be "5" or "3"')
+txuse=[] ##
+g=gene.name
+strand=gene.strand
+if part=='5':
+tag='big' if strand=='1' else 'small'
+else:
+tag='small' if strand=='1' else 'big'
+txs=gene.transcript
+if tag=='small':
+for tx in txs:
+e_pos=[x.start for x in tx.exon]
+if exon_end in e_pos:
+txuse.append(tx)
+elif tag=='big':
+for tx in txs:
+e_pos=[x.end for x in tx.exon]
+if exon_end in e_pos:
+txuse.append(tx)
+return txuse
+def overlap(r1,r2):
+'''compare two ranges, return the overlapped'''
+if r1[0] <= r2[0]:
+ra,rb=r1[:],r2[:]
+else:
+ra,rb=r2[:],r1[:]
+if rb[0] > ra[1]:
+return []
+else:
+if rb[1] <= ra[1]:
+return rb
+else:
+return [rb[0],ra[1]]
+def br_front_len(tx,br,part):
+'''No matter it is 5'/3' partner,calculate the ORF length before the break'''
+global tx_cds
+txname=tx.name
+strand=tx.strand
+cds_start,cds_end=[int(x) for x in tx_cds[txname]]
+e_pos=[[x.start,x.end] for x in tx.exon]
+if part=='5' and strand=='1':
+posset=[x.end for x in tx.exon]
+if br not in posset:
+raise Exception('Br is not exon boundary')
+if br < cds_start:
+return '5UTR'
+elif br > cds_end:
+return '3UTR'
+else:
+chim_r=[cds_start,br]
+ol=[overlap(x,chim_r) for x in e_pos]
+L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+return L
+if part=='5' and strand=='-1':
+posset=[x.start for x in tx.exon]
+if br not in posset:
+raise Exception('Br is not exon boundary')
+if br > cds_end:
+return '5UTR'
+elif br < cds_start:
+return '3UTR'
+else:
+chim_r=[br,cds_end]
+ol=[overlap(x,chim_r) for x in e_pos]
+L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+return L
+if part=='3' and strand=='1':
+posset=[x.start for x in tx.exon]
+if br not in posset:
+raise Exception('Br is not exon boundary')
+if br < cds_start:
+return '5UTR'
+elif br > cds_end:
+return '3UTR'
+else:
+chim_r=[cds_start,br-1]
+ol=[overlap(x,chim_r) for x in e_pos]
+L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+return L
+if part=='3' and strand=='-1':
+posset=[x.end for x in tx.exon]
+if br not in posset:
+raise Exception('Br is not exon boundary')
+if br > cds_end:
+return '5UTR'
+elif br < cds_start:
+return '3UTR'
+else:
+chim_r=[br+1,cds_end]
+ol=[overlap(x,chim_r) for x in e_pos]
+L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+return L
+def fusion_frame(gene_a,ga_br,gene_b,gb_br):
+'''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
+or chr4.1808661.chr4.1741429'''
+global genedb,txdb
+ga_br=int(ga_br)
+gb_br=int(gb_br)
+ga_txs=maptranscript(genedb[gene_a],ga_br,part='5')
+gb_txs=maptranscript(genedb[gene_b],gb_br,part='3')
+res=[]
+for ta in ga_txs:
+for tb in gb_txs:
+s1=br_front_len(ta,ga_br,'5')
+s2=br_front_len(tb,gb_br,'3')
+fusion_conseq='Unknown'
+if isinstance(s1,int) and isinstance(s2,int):    #both br in CDS region
+if s1%3==s2%3:
+fusion_conseq='In-frame'
+else:
+fusion_conseq='Out-of-frame'
+elif isinstance(s1,int) and not isinstance(s2,int):
+fusion_conseq='CDS'+'-'+s2
+elif isinstance(s2,int) and not isinstance(s1,int):
+fusion_conseq=s1+'-'+'CDS'
+else:
+fusion_conseq=s1+'-'+s2
+res.append((ta.name,tb.name,fusion_conseq))
+if ga_txs==[] and gb_txs==[]:
+res.append(['NA','NA','NA'])
+elif ga_txs==[]:
+for tb in gb_txs:
+res.append(['NA',tb.name,'NA'])
+elif gb_txs==[]:
+for ta in ga_txs:
+res.append([ta.name,'NA','NA'])
+return res
+############################################################################
+print 'Predicting...'
+infile=open(datafilename)
+outfile_detail=open(outfile_detail,'w')
+outfile_brief=open(outfile_brief,'w')
+detailM=[]
+briefM=[]
+for line in infile:
+if not line.strip():
+continue
+geneA,ga_br,geneB,gb_br=line.split()
+try:
+M=fusion_frame(geneA,ga_br,geneB,gb_br)
+except:
+M=[]
+ppcollected=0
+for ccc in M:
+a_cat,b_cat='alternative','alternative'
+if tx_primary[geneA]==ccc[0]:
+a_cat='primary'
+if tx_primary[geneB]==ccc[1]:
+b_cat='primary'
+tag='%s_%s'%(a_cat,b_cat)
+if ccc[0]=='NA' or ccc[1]=='NA':
+tag='NA'
+row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence
+detailM.append(row_d)
+if tag=='primary_primary':
+row_b=[geneA,ga_br,geneB,gb_br,ccc[2]]
+ppcollected=1
+if ppcollected==0:
+row_b=[geneA,ga_br,geneB,gb_br,'not-classified']
+briefM.append(row_b)
+infile.close()
+for line in detailM:
+outfile_detail.write('%s\n'%('\t'.join(line)))
+outfile_detail.close()
+for line in briefM:
+outfile_brief.write('%s\n'%('\t'.join(line)))
+outfile_brief.close()
+print 'Done!'

Mercurial > repos > siyuan > prada

comparison pyPRADA_1.2/prada-frame @ 0:acc2ca1a3ba4