comparison gene_identification.py @ 5:012a738edf5a draft

Uploaded
author davidvanzessen
date Tue, 01 Nov 2016 10:15:37 -0400
parents c33d93683a09
children ce25fb581ca3
comparison
equal deleted inserted replaced
4:477e95b098fd 5:012a738edf5a
45 #old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag 45 #old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
46 46
47 #lambda/kappa reference sequence 47 #lambda/kappa reference sequence
48 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", 48 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
49 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc", 49 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
50 "ce": "gcctccacacagagcccatccgtcttccccttgacccgctgctgcaaaaacattccctcc",
50 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence 51 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
51 52
52 compiledregex = {"ca": [], 53 compiledregex = {"ca": [],
53 "cg": [], 54 "cg": [],
55 "ce": [],
54 "cm": []} 56 "cm": []}
55 57
56 #lambda/kappa reference sequence variable nucleotides 58 #lambda/kappa reference sequence variable nucleotides
57 ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'} 59 ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'}
58 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} 60 ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'}
100 compiledregex["cg"].append((re.compile(result), varsInResult)) 102 compiledregex["cg"].append((re.compile(result), varsInResult))
101 103
102 for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2): 104 for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):
103 compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False)) 105 compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))
104 106
107 for i in range(0, len(searchstrings["ce"]) - chunklength, chunklength / 2):
108 compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False))
105 109
106 110
107 def removeAndReturnMaxIndex(x): #simplifies a list comprehension 111 def removeAndReturnMaxIndex(x): #simplifies a list comprehension
108 m = max(x) 112 m = max(x)
109 index = x.index(m) 113 index = x.index(m)
112 116
113 117
114 start_location = dict() 118 start_location = dict()
115 hits = dict() 119 hits = dict()
116 alltotal = 0 120 alltotal = 0
117 for key in compiledregex.keys(): #for ca/cg/cm 121 for key in compiledregex.keys(): #for ca/cg/cm/ce
118 regularexpressions = compiledregex[key] #get the compiled regular expressions 122 regularexpressions = compiledregex[key] #get the compiled regular expressions
119 for ID in dic.keys()[0:]: #for every ID 123 for ID in dic.keys()[0:]: #for every ID
120 if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists 124 if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists
121 hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0} 125 hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ce_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0}
122 currentIDHits = hits[ID] 126 currentIDHits = hits[ID]
123 seq = dic[ID] 127 seq = dic[ID]
124 lastindex = 0 128 lastindex = 0
125 start_zero = len(searchstrings[key]) #allows the reference sequence to start before search sequence (start_locations of < 0) 129 start_zero = len(searchstrings[key]) #allows the reference sequence to start before search sequence (start_locations of < 0)
126 start = [0] * (len(seq) + start_zero) 130 start = [0] * (len(seq) + start_zero)
128 relativeStartLocation = lastindex - (chunklength / 2) * i 132 relativeStartLocation = lastindex - (chunklength / 2) * i
129 if relativeStartLocation >= len(seq): 133 if relativeStartLocation >= len(seq):
130 break 134 break
131 regex, hasVar = regexp 135 regex, hasVar = regexp
132 matches = regex.finditer(seq[lastindex:]) 136 matches = regex.finditer(seq[lastindex:])
133 for match in matches: #for every match with the current regex, only uses the first hit 137 for match in matches: #for every match with the current regex, only uses the first hit because of the break at the end of this loop
134 lastindex += match.start() 138 lastindex += match.start()
135 start[relativeStartLocation + start_zero] += 1 139 start[relativeStartLocation + start_zero] += 1
136 if hasVar: #if the regex has a variable nt in it 140 if hasVar: #if the regex has a variable nt in it
137 chunkstart = chunklength / 2 * i #where in the reference does this chunk start 141 chunkstart = chunklength / 2 * i #where in the reference does this chunk start
138 chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end 142 chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end
142 elif key == "cg": #just calculate the variable nt score for 'cg', cheaper 146 elif key == "cg": #just calculate the variable nt score for 'cg', cheaper
143 currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]]) 147 currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]])
144 currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]]) 148 currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]])
145 currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]]) 149 currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]])
146 currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]]) 150 currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])
147 else: #key == "cm" #no variable regions in 'cm' 151 else: #key == "cm" #no variable regions in 'cm' or 'ce'
148 pass 152 pass
149 break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped 153 break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
150 else: #only runs if there were no hits 154 else: #only runs if there were no hits
151 continue 155 continue
152 #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] 156 #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
153 currentIDHits[key + "_hits"] += 1 157 currentIDHits[key + "_hits"] += 1
154 start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1]) 158 start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])
155 #start_location[ID + "_" + key] = str(start.index(max(start))) 159 #start_location[ID + "_" + key] = str(start.index(max(start)))
156 160
157 161
158 chunksInCA = len(compiledregex["ca"])
159 chunksInCG = len(compiledregex["cg"])
160 chunksInCM = len(compiledregex["cm"])
161 requiredChunkPercentage = 0.7
162 varsInCA = float(len(ca1.keys()) * 2) 162 varsInCA = float(len(ca1.keys()) * 2)
163 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice 163 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice
164 varsInCM = 0 164 varsInCM = 0
165 varsInCE = 0
165 166
166 167
167 168
168 first = True 169 first = True
169 seq_write_count=0 170 seq_write_count=0
181 ID = linesplt[1] 182 ID = linesplt[1]
182 currentIDHits = hits[ID] 183 currentIDHits = hits[ID]
183 possibleca = float(len(compiledregex["ca"])) 184 possibleca = float(len(compiledregex["ca"]))
184 possiblecg = float(len(compiledregex["cg"])) 185 possiblecg = float(len(compiledregex["cg"]))
185 possiblecm = float(len(compiledregex["cm"])) 186 possiblecm = float(len(compiledregex["cm"]))
187 possiblece = float(len(compiledregex["ce"]))
186 cahits = currentIDHits["ca_hits"] 188 cahits = currentIDHits["ca_hits"]
187 cghits = currentIDHits["cg_hits"] 189 cghits = currentIDHits["cg_hits"]
188 cmhits = currentIDHits["cm_hits"] 190 cmhits = currentIDHits["cm_hits"]
189 if cahits >= cghits and cahits >= cmhits: #its a ca gene 191 cehits = currentIDHits["ce_hits"]
192 if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene
190 ca1hits = currentIDHits["ca1"] 193 ca1hits = currentIDHits["ca1"]
191 ca2hits = currentIDHits["ca2"] 194 ca2hits = currentIDHits["ca2"]
192 if ca1hits >= ca2hits: 195 if ca1hits >= ca2hits:
193 o.write(ID + "\tIGA1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 196 o.write(ID + "\tIGA1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
194 else: 197 else:
195 o.write(ID + "\tIGA2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") 198 o.write(ID + "\tIGA2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
196 elif cghits >= cahits and cghits >= cmhits: #its a cg gene 199 elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene
197 cg1hits = currentIDHits["cg1"] 200 cg1hits = currentIDHits["cg1"]
198 cg2hits = currentIDHits["cg2"] 201 cg2hits = currentIDHits["cg2"]
199 cg3hits = currentIDHits["cg3"] 202 cg3hits = currentIDHits["cg3"]
200 cg4hits = currentIDHits["cg4"] 203 cg4hits = currentIDHits["cg4"]
201 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene 204 if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene
204 o.write(ID + "\tIGG2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 207 o.write(ID + "\tIGG2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
205 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene 208 elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene
206 o.write(ID + "\tIGG3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 209 o.write(ID + "\tIGG3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
207 else: #cg4 gene 210 else: #cg4 gene
208 o.write(ID + "\tIGG4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 211 o.write(ID + "\tIGG4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
209 else: #its a cm gene 212 else: #its a cm or ce gene
210 o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") 213 if cmhits >= cehits:
214 o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
215 else:
216 o.write(ID + "\tIGE\t100\t" + str(int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n")
211 seq_write_count += 1 217 seq_write_count += 1
212 218
213 print "Time: %i" % (int(time.time() * 1000) - starttime) 219 print "Time: %i" % (int(time.time() * 1000) - starttime)
214 220
215 print "Number of sequences written to file:", seq_write_count 221 print "Number of sequences written to file:", seq_write_count