diff pyPRADA_1.2/prada-frame @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/prada-frame	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+#This script is part of the pyPRADA package. It is used to predict the functional consequences of 
+#gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc.
+#It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction
+#for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion
+#lead to reading shift of the CDS sequences.
+#Two files are generated, brief/detail result.
+#Result is purely predicted, no guarantee is assumed.   
+#Contact: Siyuan Zheng, szheng2@mdanderson.org
+
+import os
+import sys
+import re
+import time
+import subprocess
+import bioclass
+from Bio import SeqIO
+import ioprada
+
+######################################################################################
+help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./
+**parameters**
+    -conf	see prada-fusion -conf
+    -i		input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break"
+    		Example:FGFR3 1808661 TACC3 1739325
+    -docstr	outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt
+    -outdir	output directory, default is ./
+'''
+
+args=sys.argv
+if '-h' in args or '-help' in args or len(args)==1:
+    print help_menu
+    sys.exit(0)
+
+if '-i' not in sys.argv:
+    sys.exit('Input file needed')
+else:
+    i=sys.argv.index('-i')
+    datafilename=sys.argv[i+1]
+
+if '-outdir' not in args:
+    outpath=os.path.abspath('./')
+else:
+    i=args.index('-outdir')
+    outpath=os.path.abspath(args[i+1])
+if os.path.exists(outpath):
+    print 'WARNING: outdir %s exists'%outpath
+else:
+    os.mkdir(outpath)
+
+if '-docstr' not in sys.argv:
+    sys.exit('docstring needed')
+else:
+    i=sys.argv.index('-docstr')
+    outfile_prefix=sys.argv[i+1]
+    outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix)
+    outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix)
+
+#PRADA executive path
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+
+if '-ref' in args:
+    i=args.index('-conf')
+    reffile=args[i+1]
+    if os.path.exists(reffile):
+        pass
+    else:
+        for pth in ref_search_path:
+            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+            if os.path.exists(new_reffile):
+                reffile=new_reffile
+                break
+        else:
+            sys.exit('ERROR: conf file %s not found'%reffile)
+else:
+    reffile='%s/conf.txt'%prada_path
+    if not os.path.exists(reffile):
+        sys.exit('ERROR: No default conf.txt found and none specified')
+
+#read reference files
+refdict=ioprada.read_conf(reffile)
+featurefile=refdict['--REF--']['feature_file']
+cdsfile=refdict['--REF--']['cds_file']
+txcatfile=refdict['--REF--']['txcat_file']
+
+#Now print all input parameters
+print 'CMD: %s'%('\t'.join(args))
+
+###########################################################################################
+##Read information from annotation files, for all genes
+print 'Reading annotations'
+#call functions in ioprada module //
+txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
+tx_cds=ioprada.read_cds(cdsfile)
+tx_primary=ioprada.read_tx_cat(txcatfile)
+
+###########################################################################################
+def maptranscript(gene,exon_end,part=''):
+    '''Find all transcripts of the gene having the fusion boundary.
+    gene is a Gene object,return a list of Transcript objects'''
+    if part not in ['5','3']:
+        raise Exception('part must be "5" or "3"')
+    txuse=[] ##
+    g=gene.name
+    strand=gene.strand
+    if part=='5':
+        tag='big' if strand=='1' else 'small'
+    else:
+        tag='small' if strand=='1' else 'big'
+    txs=gene.transcript
+    if tag=='small':
+        for tx in txs:
+            e_pos=[x.start for x in tx.exon]
+            if exon_end in e_pos:
+                txuse.append(tx)
+    elif tag=='big':
+        for tx in txs:
+            e_pos=[x.end for x in tx.exon]
+            if exon_end in e_pos:
+                txuse.append(tx)
+    return txuse
+
+def overlap(r1,r2):
+    '''compare two ranges, return the overlapped'''
+    if r1[0] <= r2[0]:
+        ra,rb=r1[:],r2[:]
+    else:
+        ra,rb=r2[:],r1[:]
+    if rb[0] > ra[1]:
+        return []
+    else:
+        if rb[1] <= ra[1]:
+            return rb
+        else:
+            return [rb[0],ra[1]]
+
+def br_front_len(tx,br,part):
+    '''No matter it is 5'/3' partner,calculate the ORF length before the break'''
+    global tx_cds
+    txname=tx.name
+    strand=tx.strand
+    cds_start,cds_end=[int(x) for x in tx_cds[txname]]
+    e_pos=[[x.start,x.end] for x in tx.exon]
+    if part=='5' and strand=='1':
+        posset=[x.end for x in tx.exon] 
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br < cds_start:
+            return '5UTR'
+        elif br > cds_end:
+            return '3UTR'
+        else:
+           chim_r=[cds_start,br] 
+           ol=[overlap(x,chim_r) for x in e_pos]
+           L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+           return L
+    if part=='5' and strand=='-1':
+        posset=[x.start for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br > cds_end:
+            return '5UTR'
+        elif br < cds_start:
+            return '3UTR'
+        else:
+            chim_r=[br,cds_end]
+            ol=[overlap(x,chim_r) for x in e_pos]
+            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+            return L
+    if part=='3' and strand=='1':
+        posset=[x.start for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br < cds_start:
+            return '5UTR'
+        elif br > cds_end:
+            return '3UTR'
+        else:
+            chim_r=[cds_start,br-1]
+            ol=[overlap(x,chim_r) for x in e_pos]
+            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+            return L
+    if part=='3' and strand=='-1':
+        posset=[x.end for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br > cds_end:
+            return '5UTR'
+        elif br < cds_start:
+            return '3UTR'
+        else:
+            chim_r=[br+1,cds_end]
+            ol=[overlap(x,chim_r) for x in e_pos]
+            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+            return L
+
+def fusion_frame(gene_a,ga_br,gene_b,gb_br):
+    '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
+    or chr4.1808661.chr4.1741429'''
+    global genedb,txdb
+    ga_br=int(ga_br)
+    gb_br=int(gb_br)
+    ga_txs=maptranscript(genedb[gene_a],ga_br,part='5')
+    gb_txs=maptranscript(genedb[gene_b],gb_br,part='3')
+    res=[]
+    for ta in ga_txs:
+        for tb in gb_txs:
+            s1=br_front_len(ta,ga_br,'5')
+            s2=br_front_len(tb,gb_br,'3')
+            fusion_conseq='Unknown'
+            if isinstance(s1,int) and isinstance(s2,int):    #both br in CDS region
+                if s1%3==s2%3:
+                    fusion_conseq='In-frame'
+                else:
+                    fusion_conseq='Out-of-frame'
+            elif isinstance(s1,int) and not isinstance(s2,int):
+                fusion_conseq='CDS'+'-'+s2
+            elif isinstance(s2,int) and not isinstance(s1,int):
+                fusion_conseq=s1+'-'+'CDS'
+            else:
+                fusion_conseq=s1+'-'+s2
+            res.append((ta.name,tb.name,fusion_conseq))
+    if ga_txs==[] and gb_txs==[]:
+        res.append(['NA','NA','NA'])
+    elif ga_txs==[]:
+        for tb in gb_txs:
+            res.append(['NA',tb.name,'NA'])
+    elif gb_txs==[]:
+        for ta in ga_txs:
+            res.append([ta.name,'NA','NA'])
+    return res
+ 
+############################################################################
+print 'Predicting...'
+infile=open(datafilename)
+outfile_detail=open(outfile_detail,'w')
+outfile_brief=open(outfile_brief,'w')
+detailM=[]
+briefM=[]
+for line in infile:
+    if not line.strip():
+        continue
+    geneA,ga_br,geneB,gb_br=line.split()
+    try:
+        M=fusion_frame(geneA,ga_br,geneB,gb_br)
+    except:
+        M=[]
+    ppcollected=0
+    for ccc in M:
+        a_cat,b_cat='alternative','alternative'
+        if tx_primary[geneA]==ccc[0]:
+            a_cat='primary'
+        if tx_primary[geneB]==ccc[1]:
+            b_cat='primary'
+        tag='%s_%s'%(a_cat,b_cat)
+        if ccc[0]=='NA' or ccc[1]=='NA':
+            tag='NA'  
+        row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence
+        detailM.append(row_d)
+        if tag=='primary_primary':
+            row_b=[geneA,ga_br,geneB,gb_br,ccc[2]]
+            ppcollected=1
+    if ppcollected==0:
+        row_b=[geneA,ga_br,geneB,gb_br,'not-classified']
+    briefM.append(row_b) 
+infile.close()
+
+for line in detailM:
+    outfile_detail.write('%s\n'%('\t'.join(line)))
+outfile_detail.close()
+
+for line in briefM:
+    outfile_brief.write('%s\n'%('\t'.join(line)))
+outfile_brief.close()
+
+print 'Done!'