annotate Contra/scripts/convert_gene_coordinate.py @ 15:f4345d10e1ad

Uploaded
author fcaramia
date Tue, 11 Dec 2012 18:52:28 -0500
parents 7564f3b1e675
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
1 # ----------------------------------------------------------------------#
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
2 # Copyright (c) 2011, Richard Lupat & Jason Li.
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
3 #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
4 # > Source License <
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
5 # This file is part of CONTRA.
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
6 #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
7 # CONTRA is free software: you can redistribute it and/or modify
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
8 # it under the terms of the GNU General Public License as published by
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
9 # the Free Software Foundation, either version 3 of the License, or
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
10 # (at your option) any later version.
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
11 #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
12 # CONTRA is distributed in the hope that it will be useful,
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
15 # GNU General Public License for more details.
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
16 #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
17 # You should have received a copy of the GNU General Public License
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
18 # along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
19 #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
20 #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
21 #-----------------------------------------------------------------------#
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
22 # Last Updated : 03 Oct 2011 11:00AM
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
23
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
24 import sys
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
25
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
26 def outputwrite(output, gene,chr,a,b,c,id,n,sOri, eOri):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
27 id = str(id)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
28 n = str(n)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
29 output.write(gene+"\t"+chr+"\t"+a+"\t"+b+"\t"+c+"\t"+id+"\t"+n+"\t"+sOri+"\t"+eOri+"\n")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
30
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
31 def outputwrite2(output2, chr,c,sOri, eOri):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
32 output2.write(chr+"\t"+sOri+"\t"+eOri+"\t"+c+"\n")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
33
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
34
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
35 def convertGeneCoordinate(targetList, bufLocFolder):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
36 inputfile2 = bufLocFolder + "chr/"
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
37 outputfile = bufLocFolder + "geneRefCoverage.txt"
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
38 outputfile2= bufLocFolder + "geneRefCoverage2.txt"
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
39
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
40 output= open(outputfile,"w")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
41 output2 = open(outputfile2 , "w")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
42
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
43 rn = 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
44 prevchr = ""
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
45 for target in targetList:
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
46 chr = target.chr
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
47 starts = target.oriStart.split(",")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
48 ends = target.oriEnd.split(",")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
49
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
50 if ((len(chr) > 5) or (chr[len(chr)-1] == "M")):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
51 continue
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
52
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
53 if (prevchr != chr):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
54 print chr #progress checking
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
55 prevchr = chr
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
56 t = 0
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
57 covFile = file.readlines(open(inputfile2+chr+".txt","r"))
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
58
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
59 for n in range(0,target.numberExon):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
60 if t >= len(covFile):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
61 break
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
62 cov = covFile[t].split()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
63 while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
64 if (int(cov[1]) > int(starts[n])):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
65 t-=1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
66 else:
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
67 t+=1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
68 cov = covFile[t].split()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
69
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
70 while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
71 # print output #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
72 if (rn == 1):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
73 prev = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
74 nID = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
75 if (nID != prev):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
76 rn = 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
77 ref1 = str(rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
78 ref2 = str(int(cov[2]) - int(starts[n]) + rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
79 outputwrite(output, target.gene,chr,ref1,ref2,cov[3],target.id,n,starts[n],cov[2])
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
80
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
81 outputwrite2(output2, chr,cov[3],starts[n],cov[2])
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
82
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
83
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
84 rn = int(ref2)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
85 prev = nID
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
86 # -- #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
87
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
88 # get to the next line of inputfile#
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
89 t+= 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
90 cov = covFile[t].split()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
91 starts[n] = cov[1]
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
92
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
93 #print output #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
94 if (t == 0) and (t >= len(covFile)):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
95 continue
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
96
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
97 if (rn == 1):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
98 prev = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
99 nID = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
100 if (nID != prev):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
101 rn = 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
102 ref1 = str(rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
103 ref2 = str(int(ends[n]) - int(starts[n]) + rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
104 outputwrite(output, target.gene, chr, ref1, ref2, cov[3], target.id, n, starts[n], ends[n])
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
105
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
106 outputwrite2(output2, chr, cov[3], starts[n], ends[n])
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
107
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
108
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
109 rn = int(ref2)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
110 prev = nID
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
111 # -- #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
112 output.close()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
113 output2.close()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
114
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
115
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
116 def convertGeneCoordinate2(targetList, bufLocFolder):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
117 inputfile2 = bufLocFolder + "chr/"
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
118 outputfile = bufLocFolder + "geneRefCoverage.txt"
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
119 outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt"
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
120
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
121 output= open(outputfile,"w")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
122 output_avg = open(outputfile_avg,"w")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
123
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
124 rn = 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
125 prevchr = ""
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
126 for target in targetList:
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
127 chr = target.chr
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
128 starts = target.oriStart.split(",")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
129 ends = target.oriEnd.split(",")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
130 target_ttl_readdepth = 0
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
131 starts_leftmost = starts[0]
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
132
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
133
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
134 if ((len(chr) > 5) or (chr[len(chr)-1] == "M")):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
135 continue
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
136
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
137 if (prevchr != chr):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
138 print chr #progress checking
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
139 prevchr = chr
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
140 t = 0
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
141 covFile = file.readlines(open(inputfile2+chr+".txt","r"))
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
142
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
143 for n in range(0,target.numberExon):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
144 if t >= len(covFile):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
145 break
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
146 cov = covFile[t].split()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
147 while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
148 if (int(cov[1]) > int(starts[n])): t-=1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
149 else:
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
150 t+=1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
151 cov = covFile[t].split()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
152
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
153 while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
154 # print output #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
155 if (rn == 1):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
156 prev = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
157 nID = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
158 if (nID != prev):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
159 rn = 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
160 ref1 = str(rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
161 ref2 = str(int(cov[2]) - int(starts[n]) + rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
162 outputwrite2(output, chr,cov[3],starts[n],cov[2])
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
163 tmprange=int(cov[2])-int(starts[n])+1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
164 target_ttl_readdepth+=int(cov[3])*tmprange
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
165 #target_length+=tmprange
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
166
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
167 rn = int(ref2)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
168 prev = nID
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
169 # -- #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
170
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
171 # get to the next line of inputfile#
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
172 t+= 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
173 cov = covFile[t].split()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
174 starts[n] = cov[1]
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
175
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
176 #print output #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
177 if (t == 0) and (t >= len(covFile)):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
178 continue
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
179
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
180 if (rn == 1):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
181 prev = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
182 nID = target.id
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
183 if (nID != prev):
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
184 rn = 1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
185 ref1 = str(rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
186 ref2 = str(int(ends[n]) - int(starts[n]) + rn)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
187 outputwrite2(output, chr, cov[3], starts[n], ends[n])
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
188 tmprange=int(ends[n])-int(starts[n])+1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
189 target_ttl_readdepth+=int(cov[3])*tmprange
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
190 #target_length+=tmprange
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
191 target_length = int(ends[n])-int(starts_leftmost)+1
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
192 output_avg.write("\t".join([chr,starts_leftmost,ends[n],str(target_ttl_readdepth/target_length),str(target_length)])+"\n")
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
193
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
194 rn = int(ref2)
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
195 prev = nID
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
196 # -- #
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
197 output.close()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
198 output_avg.close()
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
199
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
200
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
201
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
202
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
203
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
204
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
205
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
206
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
207
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
208
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
209
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
210
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
211
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
212
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
213
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
214
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
215
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
216
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
217
7564f3b1e675 Uploaded
fcaramia
parents:
diff changeset
218