diff pyPRADA_1.2/privutils.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/privutils.py	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,165 @@
+#This module collects functions used in homology comparison and frame prediction
+
+import subprocess 
+
+def maptranscript(gene,exon_end,part=''):
+    """
+    Find all transcripts of the gene having the fusion boundary.
+    "gene" is a Gene object.
+    "exon_end" is the exon boundary that to be searched.
+    "part" has to be '5' or '3'.
+    Return a list of Transcript objects that has "exon_end" boundary. 
+    """
+    if part not in ['5','3']:
+        raise Exception('part must be "5" or "3"')
+    txuse=[] ##
+    g=gene.name
+    strand=gene.strand
+    if part=='5':
+        tag='big' if strand=='1' else 'small'
+    else:
+        tag='small' if strand=='1' else 'big'
+    txs=gene.transcript
+    if tag=='small':
+        for tx in txs:
+            e_pos=[x.start for x in tx.exon]
+            if exon_end in e_pos:
+                txuse.append(tx)
+    elif tag=='big':
+        for tx in txs:
+            e_pos=[x.end for x in tx.exon]
+            if exon_end in e_pos:
+                txuse.append(tx)
+    return txuse
+
+
+def overlap(r1,r2):
+    """compare two ranges, return the overlapped range"""
+    if r1[0] <= r2[0]:
+        ra,rb=r1[:],r2[:]
+    else:
+        ra,rb=r2[:],r1[:]
+    if rb[0] > ra[1]:
+        return []
+    else:
+        if rb[1] <= ra[1]:
+            return rb
+        else:
+            return [rb[0],ra[1]]
+
+def br_front_len(tx,br,part):
+    '''No matter it is 5'/3' partner,calculate the ORF length before the break'''
+    txname=tx.name
+    strand=tx.strand
+    cds_start=tx.cds_start
+    cds_end=tx.cds_end
+    e_pos=[[x.start,x.end] for x in tx.exon]
+    if part=='5' and strand=='1':
+        posset=[x.end for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br < cds_start:
+            return '5UTR'
+        elif br > cds_end:
+            return '3UTR'
+        else:
+           chim_r=[cds_start,br]
+           ol=[overlap(x,chim_r) for x in e_pos]
+           L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+           return L
+    if part=='5' and strand=='-1':
+        posset=[x.start for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br > cds_end:
+            return '5UTR'
+        elif br < cds_start:
+            return '3UTR'
+        else:
+            chim_r=[br,cds_end]
+            ol=[overlap(x,chim_r) for x in e_pos]
+            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+            return L
+    if part=='3' and strand=='1':
+        posset=[x.start for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br < cds_start:
+            return '5UTR'
+        elif br > cds_end:
+            return '3UTR'
+        else:
+            chim_r=[cds_start,br-1]
+            ol=[overlap(x,chim_r) for x in e_pos]
+            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+            return L
+    if part=='3' and strand=='-1':
+        posset=[x.end for x in tx.exon]
+        if br not in posset:
+            raise Exception('Br is not exon boundary')
+        if br > cds_end:
+            return '5UTR'
+        elif br < cds_start:
+            return '3UTR'
+        else:
+            chim_r=[br+1,cds_end]
+            ol=[overlap(x,chim_r) for x in e_pos]
+            L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
+            return L
+
+def fusion_frame(gene_a,ga_br,gene_b,gb_br):
+    """gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
+    or chr4.1808661.chr4.1741429
+    gene_a, gene_b are Gene objects.
+    """
+    ga_br=int(ga_br)
+    gb_br=int(gb_br)
+    ga_txs=maptranscript(gene_a,ga_br,part='5')
+    gb_txs=maptranscript(gene_b,gb_br,part='3')
+    res=[]
+    for ta in ga_txs:
+        for tb in gb_txs:
+            s1=br_front_len(ta,ga_br,'5')
+            s2=br_front_len(tb,gb_br,'3')
+            fusion_conseq='Unknown'
+            if isinstance(s1,int) and isinstance(s2,int):    #both br in CDS region
+                if s1%3==s2%3:
+                    fusion_conseq='In-frame'
+                else:
+                    fusion_conseq='Out-of-frame'
+            elif isinstance(s1,int) and not isinstance(s2,int):
+                fusion_conseq='CDS'+'-'+s2
+            elif isinstance(s2,int) and not isinstance(s1,int):
+                fusion_conseq=s1+'-'+'CDS'
+            else:
+                fusion_conseq=s1+'-'+s2
+            res.append((ta.name,tb.name,fusion_conseq))
+    if ga_txs==[] and gb_txs==[]:
+        res.append(['NA','NA','NA'])
+    elif ga_txs==[]:
+        for tb in gb_txs:
+            res.append(['NA',tb.name,'NA'])
+    elif gb_txs==[]:
+        for ta in ga_txs:
+            res.append([ta.name,'NA','NA'])
+    return res
+
+
+def seqblast(seqA,seqB,blastn=None):
+    '''seqA,seqB:fasta files'''
+    if blastn==None: blastn='blastn'
+    cmdstr='%s -task=blastn  -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
+    cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+    result=cmdout.stdout.read()
+    best_align=result.split('\n')[0]
+    if best_align=='':
+        return None
+    else:
+        info=best_align.split('\t')
+        identity=info[2].strip()
+        align_len=info[3].strip()
+        evalue=info[10].strip()
+        bit_score=info[11].strip()
+        return [identity,align_len,evalue,bit_score]
+
+