Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/privutils.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #This module collects functions used in homology comparison and frame prediction | |
2 | |
3 import subprocess | |
4 | |
5 def maptranscript(gene,exon_end,part=''): | |
6 """ | |
7 Find all transcripts of the gene having the fusion boundary. | |
8 "gene" is a Gene object. | |
9 "exon_end" is the exon boundary that to be searched. | |
10 "part" has to be '5' or '3'. | |
11 Return a list of Transcript objects that has "exon_end" boundary. | |
12 """ | |
13 if part not in ['5','3']: | |
14 raise Exception('part must be "5" or "3"') | |
15 txuse=[] ## | |
16 g=gene.name | |
17 strand=gene.strand | |
18 if part=='5': | |
19 tag='big' if strand=='1' else 'small' | |
20 else: | |
21 tag='small' if strand=='1' else 'big' | |
22 txs=gene.transcript | |
23 if tag=='small': | |
24 for tx in txs: | |
25 e_pos=[x.start for x in tx.exon] | |
26 if exon_end in e_pos: | |
27 txuse.append(tx) | |
28 elif tag=='big': | |
29 for tx in txs: | |
30 e_pos=[x.end for x in tx.exon] | |
31 if exon_end in e_pos: | |
32 txuse.append(tx) | |
33 return txuse | |
34 | |
35 | |
36 def overlap(r1,r2): | |
37 """compare two ranges, return the overlapped range""" | |
38 if r1[0] <= r2[0]: | |
39 ra,rb=r1[:],r2[:] | |
40 else: | |
41 ra,rb=r2[:],r1[:] | |
42 if rb[0] > ra[1]: | |
43 return [] | |
44 else: | |
45 if rb[1] <= ra[1]: | |
46 return rb | |
47 else: | |
48 return [rb[0],ra[1]] | |
49 | |
50 def br_front_len(tx,br,part): | |
51 '''No matter it is 5'/3' partner,calculate the ORF length before the break''' | |
52 txname=tx.name | |
53 strand=tx.strand | |
54 cds_start=tx.cds_start | |
55 cds_end=tx.cds_end | |
56 e_pos=[[x.start,x.end] for x in tx.exon] | |
57 if part=='5' and strand=='1': | |
58 posset=[x.end for x in tx.exon] | |
59 if br not in posset: | |
60 raise Exception('Br is not exon boundary') | |
61 if br < cds_start: | |
62 return '5UTR' | |
63 elif br > cds_end: | |
64 return '3UTR' | |
65 else: | |
66 chim_r=[cds_start,br] | |
67 ol=[overlap(x,chim_r) for x in e_pos] | |
68 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
69 return L | |
70 if part=='5' and strand=='-1': | |
71 posset=[x.start for x in tx.exon] | |
72 if br not in posset: | |
73 raise Exception('Br is not exon boundary') | |
74 if br > cds_end: | |
75 return '5UTR' | |
76 elif br < cds_start: | |
77 return '3UTR' | |
78 else: | |
79 chim_r=[br,cds_end] | |
80 ol=[overlap(x,chim_r) for x in e_pos] | |
81 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
82 return L | |
83 if part=='3' and strand=='1': | |
84 posset=[x.start for x in tx.exon] | |
85 if br not in posset: | |
86 raise Exception('Br is not exon boundary') | |
87 if br < cds_start: | |
88 return '5UTR' | |
89 elif br > cds_end: | |
90 return '3UTR' | |
91 else: | |
92 chim_r=[cds_start,br-1] | |
93 ol=[overlap(x,chim_r) for x in e_pos] | |
94 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
95 return L | |
96 if part=='3' and strand=='-1': | |
97 posset=[x.end for x in tx.exon] | |
98 if br not in posset: | |
99 raise Exception('Br is not exon boundary') | |
100 if br > cds_end: | |
101 return '5UTR' | |
102 elif br < cds_start: | |
103 return '3UTR' | |
104 else: | |
105 chim_r=[br+1,cds_end] | |
106 ol=[overlap(x,chim_r) for x in e_pos] | |
107 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0]) | |
108 return L | |
109 | |
110 def fusion_frame(gene_a,ga_br,gene_b,gb_br): | |
111 """gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429 | |
112 or chr4.1808661.chr4.1741429 | |
113 gene_a, gene_b are Gene objects. | |
114 """ | |
115 ga_br=int(ga_br) | |
116 gb_br=int(gb_br) | |
117 ga_txs=maptranscript(gene_a,ga_br,part='5') | |
118 gb_txs=maptranscript(gene_b,gb_br,part='3') | |
119 res=[] | |
120 for ta in ga_txs: | |
121 for tb in gb_txs: | |
122 s1=br_front_len(ta,ga_br,'5') | |
123 s2=br_front_len(tb,gb_br,'3') | |
124 fusion_conseq='Unknown' | |
125 if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region | |
126 if s1%3==s2%3: | |
127 fusion_conseq='In-frame' | |
128 else: | |
129 fusion_conseq='Out-of-frame' | |
130 elif isinstance(s1,int) and not isinstance(s2,int): | |
131 fusion_conseq='CDS'+'-'+s2 | |
132 elif isinstance(s2,int) and not isinstance(s1,int): | |
133 fusion_conseq=s1+'-'+'CDS' | |
134 else: | |
135 fusion_conseq=s1+'-'+s2 | |
136 res.append((ta.name,tb.name,fusion_conseq)) | |
137 if ga_txs==[] and gb_txs==[]: | |
138 res.append(['NA','NA','NA']) | |
139 elif ga_txs==[]: | |
140 for tb in gb_txs: | |
141 res.append(['NA',tb.name,'NA']) | |
142 elif gb_txs==[]: | |
143 for ta in ga_txs: | |
144 res.append([ta.name,'NA','NA']) | |
145 return res | |
146 | |
147 | |
148 def seqblast(seqA,seqB,blastn=None): | |
149 '''seqA,seqB:fasta files''' | |
150 if blastn==None: blastn='blastn' | |
151 cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB) | |
152 cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT) | |
153 result=cmdout.stdout.read() | |
154 best_align=result.split('\n')[0] | |
155 if best_align=='': | |
156 return None | |
157 else: | |
158 info=best_align.split('\t') | |
159 identity=info[2].strip() | |
160 align_len=info[3].strip() | |
161 evalue=info[10].strip() | |
162 bit_score=info[11].strip() | |
163 return [identity,align_len,evalue,bit_score] | |
164 | |
165 |