Mercurial > repos > siyuan > prada
view pyPRADA_1.2/privutils.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line source
#This module collects functions used in homology comparison and frame prediction import subprocess def maptranscript(gene,exon_end,part=''): """ Find all transcripts of the gene having the fusion boundary. "gene" is a Gene object. "exon_end" is the exon boundary that to be searched. "part" has to be '5' or '3'. Return a list of Transcript objects that has "exon_end" boundary. """ if part not in ['5','3']: raise Exception('part must be "5" or "3"') txuse=[] ## g=gene.name strand=gene.strand if part=='5': tag='big' if strand=='1' else 'small' else: tag='small' if strand=='1' else 'big' txs=gene.transcript if tag=='small': for tx in txs: e_pos=[x.start for x in tx.exon] if exon_end in e_pos: txuse.append(tx) elif tag=='big': for tx in txs: e_pos=[x.end for x in tx.exon] if exon_end in e_pos: txuse.append(tx) return txuse def overlap(r1,r2): """compare two ranges, return the overlapped range""" if r1[0] <= r2[0]: ra,rb=r1[:],r2[:] else: ra,rb=r2[:],r1[:] if rb[0] > ra[1]: return [] else: if rb[1] <= ra[1]: return rb else: return [rb[0],ra[1]] def br_front_len(tx,br,part): '''No matter it is 5'/3' partner,calculate the ORF length before the break''' txname=tx.name strand=tx.strand cds_start=tx.cds_start cds_end=tx.cds_end e_pos=[[x.start,x.end] for x in tx.exon] if part=='5' and strand=='1': posset=[x.end for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br < cds_start: return '5UTR' elif br > cds_end: return '3UTR' else: chim_r=[cds_start,br] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L if part=='5' and strand=='-1': posset=[x.start for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br > cds_end: return '5UTR' elif br < cds_start: return '3UTR' else: chim_r=[br,cds_end] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L if part=='3' and strand=='1': posset=[x.start for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br < cds_start: return '5UTR' elif br > cds_end: return '3UTR' else: chim_r=[cds_start,br-1] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L if part=='3' and strand=='-1': posset=[x.end for x in tx.exon] if br not in posset: raise Exception('Br is not exon boundary') if br > cds_end: return '5UTR' elif br < cds_start: return '3UTR' else: chim_r=[br+1,cds_end] ol=[overlap(x,chim_r) for x in e_pos] L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) return L def fusion_frame(gene_a,ga_br,gene_b,gb_br): """gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 or chr4.1808661.chr4.1741429 gene_a, gene_b are Gene objects. """ ga_br=int(ga_br) gb_br=int(gb_br) ga_txs=maptranscript(gene_a,ga_br,part='5') gb_txs=maptranscript(gene_b,gb_br,part='3') res=[] for ta in ga_txs: for tb in gb_txs: s1=br_front_len(ta,ga_br,'5') s2=br_front_len(tb,gb_br,'3') fusion_conseq='Unknown' if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region if s1%3==s2%3: fusion_conseq='In-frame' else: fusion_conseq='Out-of-frame' elif isinstance(s1,int) and not isinstance(s2,int): fusion_conseq='CDS'+'-'+s2 elif isinstance(s2,int) and not isinstance(s1,int): fusion_conseq=s1+'-'+'CDS' else: fusion_conseq=s1+'-'+s2 res.append((ta.name,tb.name,fusion_conseq)) if ga_txs==[] and gb_txs==[]: res.append(['NA','NA','NA']) elif ga_txs==[]: for tb in gb_txs: res.append(['NA',tb.name,'NA']) elif gb_txs==[]: for ta in ga_txs: res.append([ta.name,'NA','NA']) return res def seqblast(seqA,seqB,blastn=None): '''seqA,seqB:fasta files''' if blastn==None: blastn='blastn' cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB) cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT) result=cmdout.stdout.read() best_align=result.split('\n')[0] if best_align=='': return None else: info=best_align.split('\t') identity=info[2].strip() align_len=info[3].strip() evalue=info[10].strip() bit_score=info[11].strip() return [identity,align_len,evalue,bit_score]