Mercurial > repos > davidvanzessen > shm_csr
comparison gene_identification.py @ 26:ce25fb581ca3 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 05 Dec 2016 11:00:13 -0500 |
parents | 012a738edf5a |
children | 69baebd10e54 |
comparison
equal
deleted
inserted
replaced
25:8d1c4c75f81b | 26:ce25fb581ca3 |
---|---|
162 varsInCA = float(len(ca1.keys()) * 2) | 162 varsInCA = float(len(ca1.keys()) * 2) |
163 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice | 163 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice |
164 varsInCM = 0 | 164 varsInCM = 0 |
165 varsInCE = 0 | 165 varsInCE = 0 |
166 | 166 |
167 | 167 def round_int(val): |
168 return int(round(val)) | |
168 | 169 |
169 first = True | 170 first = True |
170 seq_write_count=0 | 171 seq_write_count=0 |
171 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence | 172 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence |
172 with open(output, 'w') as o: | 173 with open(output, 'w') as o: |
191 cehits = currentIDHits["ce_hits"] | 192 cehits = currentIDHits["ce_hits"] |
192 if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene | 193 if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene |
193 ca1hits = currentIDHits["ca1"] | 194 ca1hits = currentIDHits["ca1"] |
194 ca2hits = currentIDHits["ca2"] | 195 ca2hits = currentIDHits["ca2"] |
195 if ca1hits >= ca2hits: | 196 if ca1hits >= ca2hits: |
196 o.write(ID + "\tIGA1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 197 o.write(ID + "\tIGA1\t" + str(round_int(ca1hits / varsInCA * 100)) + "\t" + str(round_int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") |
197 else: | 198 else: |
198 o.write(ID + "\tIGA2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") | 199 o.write(ID + "\tIGA2\t" + str(round_int(ca2hits / varsInCA * 100)) + "\t" + str(round_int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") |
199 elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene | 200 elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene |
200 cg1hits = currentIDHits["cg1"] | 201 cg1hits = currentIDHits["cg1"] |
201 cg2hits = currentIDHits["cg2"] | 202 cg2hits = currentIDHits["cg2"] |
202 cg3hits = currentIDHits["cg3"] | 203 cg3hits = currentIDHits["cg3"] |
203 cg4hits = currentIDHits["cg4"] | 204 cg4hits = currentIDHits["cg4"] |
204 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene | 205 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene |
205 o.write(ID + "\tIGG1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 206 o.write(ID + "\tIGG1\t" + str(round_int(cg1hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
206 elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene | 207 elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene |
207 o.write(ID + "\tIGG2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 208 o.write(ID + "\tIGG2\t" + str(round_int(cg2hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
208 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene | 209 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene |
209 o.write(ID + "\tIGG3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 210 o.write(ID + "\tIGG3\t" + str(round_int(cg3hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
210 else: #cg4 gene | 211 else: #cg4 gene |
211 o.write(ID + "\tIGG4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") | 212 o.write(ID + "\tIGG4\t" + str(round_int(cg4hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") |
212 else: #its a cm or ce gene | 213 else: #its a cm or ce gene |
213 if cmhits >= cehits: | 214 if cmhits >= cehits: |
214 o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") | 215 o.write(ID + "\tIGM\t100\t" + str(round_int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") |
215 else: | 216 else: |
216 o.write(ID + "\tIGE\t100\t" + str(int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n") | 217 o.write(ID + "\tIGE\t100\t" + str(round_int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n") |
217 seq_write_count += 1 | 218 seq_write_count += 1 |
218 | 219 |
219 print "Time: %i" % (int(time.time() * 1000) - starttime) | 220 print "Time: %i" % (int(time.time() * 1000) - starttime) |
220 | 221 |
221 print "Number of sequences written to file:", seq_write_count | 222 print "Number of sequences written to file:", seq_write_count |