comparison gene_identification.py @ 26:ce25fb581ca3 draft

Uploaded
author davidvanzessen
date Mon, 05 Dec 2016 11:00:13 -0500
parents 012a738edf5a
children 69baebd10e54
comparison
equal deleted inserted replaced
25:8d1c4c75f81b 26:ce25fb581ca3
162 varsInCA = float(len(ca1.keys()) * 2) 162 varsInCA = float(len(ca1.keys()) * 2)
163 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice 163 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice
164 varsInCM = 0 164 varsInCM = 0
165 varsInCE = 0 165 varsInCE = 0
166 166
167 167 def round_int(val):
168 return int(round(val))
168 169
169 first = True 170 first = True
170 seq_write_count=0 171 seq_write_count=0
171 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence 172 with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
172 with open(output, 'w') as o: 173 with open(output, 'w') as o:
191 cehits = currentIDHits["ce_hits"] 192 cehits = currentIDHits["ce_hits"]
192 if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene 193 if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene
193 ca1hits = currentIDHits["ca1"] 194 ca1hits = currentIDHits["ca1"]
194 ca2hits = currentIDHits["ca2"] 195 ca2hits = currentIDHits["ca2"]
195 if ca1hits >= ca2hits: 196 if ca1hits >= ca2hits:
196 o.write(ID + "\tIGA1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 197 o.write(ID + "\tIGA1\t" + str(round_int(ca1hits / varsInCA * 100)) + "\t" + str(round_int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
197 else: 198 else:
198 o.write(ID + "\tIGA2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 199 o.write(ID + "\tIGA2\t" + str(round_int(ca2hits / varsInCA * 100)) + "\t" + str(round_int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
199 elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene 200 elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene
200 cg1hits = currentIDHits["cg1"] 201 cg1hits = currentIDHits["cg1"]
201 cg2hits = currentIDHits["cg2"] 202 cg2hits = currentIDHits["cg2"]
202 cg3hits = currentIDHits["cg3"] 203 cg3hits = currentIDHits["cg3"]
203 cg4hits = currentIDHits["cg4"] 204 cg4hits = currentIDHits["cg4"]
204 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene 205 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene
205 o.write(ID + "\tIGG1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 206 o.write(ID + "\tIGG1\t" + str(round_int(cg1hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
206 elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene 207 elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene
207 o.write(ID + "\tIGG2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 208 o.write(ID + "\tIGG2\t" + str(round_int(cg2hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
208 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene 209 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene
209 o.write(ID + "\tIGG3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 210 o.write(ID + "\tIGG3\t" + str(round_int(cg3hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
210 else: #cg4 gene 211 else: #cg4 gene
211 o.write(ID + "\tIGG4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 212 o.write(ID + "\tIGG4\t" + str(round_int(cg4hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
212 else: #its a cm or ce gene 213 else: #its a cm or ce gene
213 if cmhits >= cehits: 214 if cmhits >= cehits:
214 o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") 215 o.write(ID + "\tIGM\t100\t" + str(round_int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
215 else: 216 else:
216 o.write(ID + "\tIGE\t100\t" + str(int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n") 217 o.write(ID + "\tIGE\t100\t" + str(round_int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n")
217 seq_write_count += 1 218 seq_write_count += 1
218 219
219 print "Time: %i" % (int(time.time() * 1000) - starttime) 220 print "Time: %i" % (int(time.time() * 1000) - starttime)
220 221
221 print "Number of sequences written to file:", seq_write_count 222 print "Number of sequences written to file:", seq_write_count