Mercurial > repos > siyuan > prada

#!/usr/bin/env python

#This script is part of the pyPRADA package. It is used to predict the functional consequences of
#gene fusions. Possible predictions include in-frame, out-of-frame, UTR-UTR, UTR-CDS, Unknown etc.
#It relies on the definition of transcript CDS/UTR boundaries from ENSEMBL, and gives a prediction
#for every pair of transcripts coded by the fusion genes. The whole point is to see if the fusion
#lead to reading shift of the CDS sequences.
#Two files are generated, brief/detail result.
#Result is purely predicted, no guarantee is assumed.
#Contact: Siyuan Zheng, szheng2@mdanderson.org

import os
import sys
import re
import time
import subprocess
import bioclass
from Bio import SeqIO
import ioprada

######################################################################################
help_menu='''\nUsage: prada-frame -conf xx.txt -i inputfile -docstr XX -outdir ./
**parameters**
    -conf	see prada-fusion -conf
    -i		input file, a tab/space delimited four-column file, each row like "GeneA GeneA_break GeneB GeneB_break"
    		Example:FGFR3 1808661 TACC3 1739325
    -docstr	outfile prefix. output to two files: XX.frame.brief.txt, XX.frame.detail.txt
    -outdir	output directory, default is ./
'''

args=sys.argv
if '-h' in args or '-help' in args or len(args)==1:
    print help_menu
    sys.exit(0)

if '-i' not in sys.argv:
    sys.exit('Input file needed')
else:
    i=sys.argv.index('-i')
    datafilename=sys.argv[i+1]

if '-outdir' not in args:
    outpath=os.path.abspath('./')
else:
    i=args.index('-outdir')
    outpath=os.path.abspath(args[i+1])
if os.path.exists(outpath):
    print 'WARNING: outdir %s exists'%outpath
else:
    os.mkdir(outpath)

if '-docstr' not in sys.argv:
    sys.exit('docstring needed')
else:
    i=sys.argv.index('-docstr')
    outfile_prefix=sys.argv[i+1]
    outfile_brief='%s/%s.frame.brief.txt'%(outpath,outfile_prefix)
    outfile_detail='%s/%s.frame.detail.txt'%(outpath,outfile_prefix)

#PRADA executive path
prada_path=os.path.dirname(os.path.abspath(__file__))   ####
ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line

if '-ref' in args:
    i=args.index('-conf')
    reffile=args[i+1]
    if os.path.exists(reffile):
        pass
    else:
        for pth in ref_search_path:
            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
            if os.path.exists(new_reffile):
                reffile=new_reffile
                break
        else:
            sys.exit('ERROR: conf file %s not found'%reffile)
else:
    reffile='%s/conf.txt'%prada_path
    if not os.path.exists(reffile):
        sys.exit('ERROR: No default conf.txt found and none specified')

#read reference files
refdict=ioprada.read_conf(reffile)
featurefile=refdict['--REF--']['feature_file']
cdsfile=refdict['--REF--']['cds_file']
txcatfile=refdict['--REF--']['txcat_file']

#Now print all input parameters
print 'CMD: %s'%('\t'.join(args))

###########################################################################################
##Read information from annotation files, for all genes
print 'Reading annotations'
#call functions in ioprada module //
txdb,genedb=ioprada.read_feature(featurefile,verbose=False)
tx_cds=ioprada.read_cds(cdsfile)
tx_primary=ioprada.read_tx_cat(txcatfile)

###########################################################################################
def maptranscript(gene,exon_end,part=''):
    '''Find all transcripts of the gene having the fusion boundary.
    gene is a Gene object,return a list of Transcript objects'''
    if part not in ['5','3']:
        raise Exception('part must be "5" or "3"')
    txuse=[] ##
    g=gene.name
    strand=gene.strand
    if part=='5':
        tag='big' if strand=='1' else 'small'
    else:
        tag='small' if strand=='1' else 'big'
    txs=gene.transcript
    if tag=='small':
        for tx in txs:
            e_pos=[x.start for x in tx.exon]
            if exon_end in e_pos:
                txuse.append(tx)
    elif tag=='big':
        for tx in txs:
            e_pos=[x.end for x in tx.exon]
            if exon_end in e_pos:
                txuse.append(tx)
    return txuse

def overlap(r1,r2):
    '''compare two ranges, return the overlapped'''
    if r1[0] <= r2[0]:
        ra,rb=r1[:],r2[:]
    else:
        ra,rb=r2[:],r1[:]
    if rb[0] > ra[1]:
        return []
    else:
        if rb[1] <= ra[1]:
            return rb
        else:
            return [rb[0],ra[1]]

def br_front_len(tx,br,part):
    '''No matter it is 5'/3' partner,calculate the ORF length before the break'''
    global tx_cds
    txname=tx.name
    strand=tx.strand
    cds_start,cds_end=[int(x) for x in tx_cds[txname]]
    e_pos=[[x.start,x.end] for x in tx.exon]
    if part=='5' and strand=='1':
        posset=[x.end for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br < cds_start:
            return '5UTR'
        elif br > cds_end:
            return '3UTR'
        else:
           chim_r=[cds_start,br]
           ol=[overlap(x,chim_r) for x in e_pos]
           L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
           return L
    if part=='5' and strand=='-1':
        posset=[x.start for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br > cds_end:
            return '5UTR'
        elif br < cds_start:
            return '3UTR'
        else:
            chim_r=[br,cds_end]
            ol=[overlap(x,chim_r) for x in e_pos]
            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
            return L
    if part=='3' and strand=='1':
        posset=[x.start for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br < cds_start:
            return '5UTR'
        elif br > cds_end:
            return '3UTR'
        else:
            chim_r=[cds_start,br-1]
            ol=[overlap(x,chim_r) for x in e_pos]
            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
            return L
    if part=='3' and strand=='-1':
        posset=[x.end for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br > cds_end:
            return '5UTR'
        elif br < cds_start:
            return '3UTR'
        else:
            chim_r=[br+1,cds_end]
            ol=[overlap(x,chim_r) for x in e_pos]
            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
            return L

def fusion_frame(gene_a,ga_br,gene_b,gb_br):
    '''gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
    or chr4.1808661.chr4.1741429'''
    global genedb,txdb
    ga_br=int(ga_br)
    gb_br=int(gb_br)
    ga_txs=maptranscript(genedb[gene_a],ga_br,part='5')
    gb_txs=maptranscript(genedb[gene_b],gb_br,part='3')
    res=[]
    for ta in ga_txs:
        for tb in gb_txs:
            s1=br_front_len(ta,ga_br,'5')
            s2=br_front_len(tb,gb_br,'3')
            fusion_conseq='Unknown'
            if isinstance(s1,int) and isinstance(s2,int):    #both br in CDS region
                if s1%3==s2%3:
                    fusion_conseq='In-frame'
                else:
                    fusion_conseq='Out-of-frame'
            elif isinstance(s1,int) and not isinstance(s2,int):
                fusion_conseq='CDS'+'-'+s2
            elif isinstance(s2,int) and not isinstance(s1,int):
                fusion_conseq=s1+'-'+'CDS'
            else:
                fusion_conseq=s1+'-'+s2
            res.append((ta.name,tb.name,fusion_conseq))
    if ga_txs==[] and gb_txs==[]:
        res.append(['NA','NA','NA'])
    elif ga_txs==[]:
        for tb in gb_txs:
            res.append(['NA',tb.name,'NA'])
    elif gb_txs==[]:
        for ta in ga_txs:
            res.append([ta.name,'NA','NA'])
    return res

############################################################################
print 'Predicting...'
infile=open(datafilename)
outfile_detail=open(outfile_detail,'w')
outfile_brief=open(outfile_brief,'w')
detailM=[]
briefM=[]
for line in infile:
    if not line.strip():
        continue
    geneA,ga_br,geneB,gb_br=line.split()
    try:
        M=fusion_frame(geneA,ga_br,geneB,gb_br)
    except:
        M=[]
    ppcollected=0
    for ccc in M:
        a_cat,b_cat='alternative','alternative'
        if tx_primary[geneA]==ccc[0]:
            a_cat='primary'
        if tx_primary[geneB]==ccc[1]:
            b_cat='primary'
        tag='%s_%s'%(a_cat,b_cat)
        if ccc[0]=='NA' or ccc[1]=='NA':
            tag='NA'
        row_d=[geneA,ga_br,geneB,gb_br,ccc[0],ccc[1],tag,ccc[2]] #genea,geneb,tag,consequence
        detailM.append(row_d)
        if tag=='primary_primary':
            row_b=[geneA,ga_br,geneB,gb_br,ccc[2]]
            ppcollected=1
    if ppcollected==0:
        row_b=[geneA,ga_br,geneB,gb_br,'not-classified']
    briefM.append(row_b)
infile.close()

for line in detailM:
    outfile_detail.write('%s\n'%('\t'.join(line)))
outfile_detail.close()

for line in briefM:
    outfile_brief.write('%s\n'%('\t'.join(line)))
outfile_brief.close()

print 'Done!'
author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children