Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/privutils.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/privutils.py Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,165 @@ +#This module collects functions used in homology comparison and frame prediction + +import subprocess + +def maptranscript(gene,exon_end,part=''): + """ + Find all transcripts of the gene having the fusion boundary. + "gene" is a Gene object. + "exon_end" is the exon boundary that to be searched. + "part" has to be '5' or '3'. + Return a list of Transcript objects that has "exon_end" boundary. + """ + if part not in ['5','3']: + raise Exception('part must be "5" or "3"') + txuse=[] ## + g=gene.name + strand=gene.strand + if part=='5': + tag='big' if strand=='1' else 'small' + else: + tag='small' if strand=='1' else 'big' + txs=gene.transcript + if tag=='small': + for tx in txs: + e_pos=[x.start for x in tx.exon] + if exon_end in e_pos: + txuse.append(tx) + elif tag=='big': + for tx in txs: + e_pos=[x.end for x in tx.exon] + if exon_end in e_pos: + txuse.append(tx) + return txuse + + +def overlap(r1,r2): + """compare two ranges, return the overlapped range""" + if r1[0] <= r2[0]: + ra,rb=r1[:],r2[:] + else: + ra,rb=r2[:],r1[:] + if rb[0] > ra[1]: + return [] + else: + if rb[1] <= ra[1]: + return rb + else: + return [rb[0],ra[1]] + +def br_front_len(tx,br,part): + '''No matter it is 5'/3' partner,calculate the ORF length before the break''' + txname=tx.name + strand=tx.strand + cds_start=tx.cds_start + cds_end=tx.cds_end + e_pos=[[x.start,x.end] for x in tx.exon] + if part=='5' and strand=='1': + posset=[x.end for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br < cds_start: + return '5UTR' + elif br > cds_end: + return '3UTR' + else: + chim_r=[cds_start,br] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + if part=='5' and strand=='-1': + posset=[x.start for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br > cds_end: + return '5UTR' + elif br < cds_start: + return '3UTR' + else: + chim_r=[br,cds_end] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + if part=='3' and strand=='1': + posset=[x.start for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br < cds_start: + return '5UTR' + elif br > cds_end: + return '3UTR' + else: + chim_r=[cds_start,br-1] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + if part=='3' and strand=='-1': + posset=[x.end for x in tx.exon] + if br not in posset: + raise Exception('Br is not exon boundary') + if br > cds_end: + return '5UTR' + elif br < cds_start: + return '3UTR' + else: + chim_r=[br+1,cds_end] + ol=[overlap(x,chim_r) for x in e_pos] + L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) + return L + +def fusion_frame(gene_a,ga_br,gene_b,gb_br): + """gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 + or chr4.1808661.chr4.1741429 + gene_a, gene_b are Gene objects. + """ + ga_br=int(ga_br) + gb_br=int(gb_br) + ga_txs=maptranscript(gene_a,ga_br,part='5') + gb_txs=maptranscript(gene_b,gb_br,part='3') + res=[] + for ta in ga_txs: + for tb in gb_txs: + s1=br_front_len(ta,ga_br,'5') + s2=br_front_len(tb,gb_br,'3') + fusion_conseq='Unknown' + if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region + if s1%3==s2%3: + fusion_conseq='In-frame' + else: + fusion_conseq='Out-of-frame' + elif isinstance(s1,int) and not isinstance(s2,int): + fusion_conseq='CDS'+'-'+s2 + elif isinstance(s2,int) and not isinstance(s1,int): + fusion_conseq=s1+'-'+'CDS' + else: + fusion_conseq=s1+'-'+s2 + res.append((ta.name,tb.name,fusion_conseq)) + if ga_txs==[] and gb_txs==[]: + res.append(['NA','NA','NA']) + elif ga_txs==[]: + for tb in gb_txs: + res.append(['NA',tb.name,'NA']) + elif gb_txs==[]: + for ta in ga_txs: + res.append([ta.name,'NA','NA']) + return res + + +def seqblast(seqA,seqB,blastn=None): + '''seqA,seqB:fasta files''' + if blastn==None: blastn='blastn' + cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB) + cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + result=cmdout.stdout.read() + best_align=result.split('\n')[0] + if best_align=='': + return None + else: + info=best_align.split('\t') + identity=info[2].strip() + align_len=info[3].strip() + evalue=info[10].strip() + bit_score=info[11].strip() + return [identity,align_len,evalue,bit_score] + +