view pyPRADA_1.2/privutils.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line source

#This module collects functions used in homology comparison and frame prediction

import subprocess 

def maptranscript(gene,exon_end,part=''):
    """
    Find all transcripts of the gene having the fusion boundary.
    "gene" is a Gene object.
    "exon_end" is the exon boundary that to be searched.
    "part" has to be '5' or '3'.
    Return a list of Transcript objects that has "exon_end" boundary. 
    """
    if part not in ['5','3']:
        raise Exception('part must be "5" or "3"')
    txuse=[] ##
    g=gene.name
    strand=gene.strand
    if part=='5':
        tag='big' if strand=='1' else 'small'
    else:
        tag='small' if strand=='1' else 'big'
    txs=gene.transcript
    if tag=='small':
        for tx in txs:
            e_pos=[x.start for x in tx.exon]
            if exon_end in e_pos:
                txuse.append(tx)
    elif tag=='big':
        for tx in txs:
            e_pos=[x.end for x in tx.exon]
            if exon_end in e_pos:
                txuse.append(tx)
    return txuse


def overlap(r1,r2):
    """compare two ranges, return the overlapped range"""
    if r1[0] <= r2[0]:
        ra,rb=r1[:],r2[:]
    else:
        ra,rb=r2[:],r1[:]
    if rb[0] > ra[1]:
        return []
    else:
        if rb[1] <= ra[1]:
            return rb
        else:
            return [rb[0],ra[1]]

def br_front_len(tx,br,part):
    '''No matter it is 5'/3' partner,calculate the ORF length before the break'''
    txname=tx.name
    strand=tx.strand
    cds_start=tx.cds_start
    cds_end=tx.cds_end
    e_pos=[[x.start,x.end] for x in tx.exon]
    if part=='5' and strand=='1':
        posset=[x.end for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br < cds_start:
            return '5UTR'
        elif br > cds_end:
            return '3UTR'
        else:
           chim_r=[cds_start,br]
           ol=[overlap(x,chim_r) for x in e_pos]
           L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
           return L
    if part=='5' and strand=='-1':
        posset=[x.start for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br > cds_end:
            return '5UTR'
        elif br < cds_start:
            return '3UTR'
        else:
            chim_r=[br,cds_end]
            ol=[overlap(x,chim_r) for x in e_pos]
            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
            return L
    if part=='3' and strand=='1':
        posset=[x.start for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br < cds_start:
            return '5UTR'
        elif br > cds_end:
            return '3UTR'
        else:
            chim_r=[cds_start,br-1]
            ol=[overlap(x,chim_r) for x in e_pos]
            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
            return L
    if part=='3' and strand=='-1':
        posset=[x.end for x in tx.exon]
        if br not in posset:
            raise Exception('Br is not exon boundary')
        if br > cds_end:
            return '5UTR'
        elif br < cds_start:
            return '3UTR'
        else:
            chim_r=[br+1,cds_end]
            ol=[overlap(x,chim_r) for x in e_pos]
            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
            return L

def fusion_frame(gene_a,ga_br,gene_b,gb_br):
    """gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
    or chr4.1808661.chr4.1741429
    gene_a, gene_b are Gene objects.
    """
    ga_br=int(ga_br)
    gb_br=int(gb_br)
    ga_txs=maptranscript(gene_a,ga_br,part='5')
    gb_txs=maptranscript(gene_b,gb_br,part='3')
    res=[]
    for ta in ga_txs:
        for tb in gb_txs:
            s1=br_front_len(ta,ga_br,'5')
            s2=br_front_len(tb,gb_br,'3')
            fusion_conseq='Unknown'
            if isinstance(s1,int) and isinstance(s2,int):    #both br in CDS region
                if s1%3==s2%3:
                    fusion_conseq='In-frame'
                else:
                    fusion_conseq='Out-of-frame'
            elif isinstance(s1,int) and not isinstance(s2,int):
                fusion_conseq='CDS'+'-'+s2
            elif isinstance(s2,int) and not isinstance(s1,int):
                fusion_conseq=s1+'-'+'CDS'
            else:
                fusion_conseq=s1+'-'+s2
            res.append((ta.name,tb.name,fusion_conseq))
    if ga_txs==[] and gb_txs==[]:
        res.append(['NA','NA','NA'])
    elif ga_txs==[]:
        for tb in gb_txs:
            res.append(['NA',tb.name,'NA'])
    elif gb_txs==[]:
        for ta in ga_txs:
            res.append([ta.name,'NA','NA'])
    return res


def seqblast(seqA,seqB,blastn=None):
    '''seqA,seqB:fasta files'''
    if blastn==None: blastn='blastn'
    cmdstr='%s -task=blastn  -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
    cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
    result=cmdout.stdout.read()
    best_align=result.split('\n')[0]
    if best_align=='':
        return None
    else:
        info=best_align.split('\t')
        identity=info[2].strip()
        align_len=info[3].strip()
        evalue=info[10].strip()
        bit_score=info[11].strip()
        return [identity,align_len,evalue,bit_score]