Mercurial > repos > davidvanzessen > shm_csr
diff gene_identification.py @ 5:012a738edf5a draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 01 Nov 2016 10:15:37 -0400 |
parents | c33d93683a09 |
children | ce25fb581ca3 |
line wrap: on
line diff
--- a/gene_identification.py Mon Oct 31 05:05:26 2016 -0400 +++ b/gene_identification.py Tue Nov 01 10:15:37 2016 -0400 @@ -47,10 +47,12 @@ #lambda/kappa reference sequence searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg", "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc", + "ce": "gcctccacacagagcccatccgtcttccccttgacccgctgctgcaaaaacattccctcc", "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence compiledregex = {"ca": [], "cg": [], + "ce": [], "cm": []} #lambda/kappa reference sequence variable nucleotides @@ -102,6 +104,8 @@ for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2): compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False)) +for i in range(0, len(searchstrings["ce"]) - chunklength, chunklength / 2): + compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False)) def removeAndReturnMaxIndex(x): #simplifies a list comprehension @@ -114,11 +118,11 @@ start_location = dict() hits = dict() alltotal = 0 -for key in compiledregex.keys(): #for ca/cg/cm +for key in compiledregex.keys(): #for ca/cg/cm/ce regularexpressions = compiledregex[key] #get the compiled regular expressions for ID in dic.keys()[0:]: #for every ID if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists - hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0} + hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ce_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0} currentIDHits = hits[ID] seq = dic[ID] lastindex = 0 @@ -130,7 +134,7 @@ break regex, hasVar = regexp matches = regex.finditer(seq[lastindex:]) - for match in matches: #for every match with the current regex, only uses the first hit + for match in matches: #for every match with the current regex, only uses the first hit because of the break at the end of this loop lastindex += match.start() start[relativeStartLocation + start_zero] += 1 if hasVar: #if the regex has a variable nt in it @@ -144,7 +148,7 @@ currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]]) currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]]) currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]]) - else: #key == "cm" #no variable regions in 'cm' + else: #key == "cm" #no variable regions in 'cm' or 'ce' pass break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped else: #only runs if there were no hits @@ -155,13 +159,10 @@ #start_location[ID + "_" + key] = str(start.index(max(start))) -chunksInCA = len(compiledregex["ca"]) -chunksInCG = len(compiledregex["cg"]) -chunksInCM = len(compiledregex["cm"]) -requiredChunkPercentage = 0.7 varsInCA = float(len(ca1.keys()) * 2) varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice varsInCM = 0 +varsInCE = 0 @@ -183,17 +184,19 @@ possibleca = float(len(compiledregex["ca"])) possiblecg = float(len(compiledregex["cg"])) possiblecm = float(len(compiledregex["cm"])) + possiblece = float(len(compiledregex["ce"])) cahits = currentIDHits["ca_hits"] cghits = currentIDHits["cg_hits"] cmhits = currentIDHits["cm_hits"] - if cahits >= cghits and cahits >= cmhits: #its a ca gene + cehits = currentIDHits["ce_hits"] + if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene ca1hits = currentIDHits["ca1"] ca2hits = currentIDHits["ca2"] if ca1hits >= ca2hits: o.write(ID + "\tIGA1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") else: o.write(ID + "\tIGA2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n") - elif cghits >= cahits and cghits >= cmhits: #its a cg gene + elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene cg1hits = currentIDHits["cg1"] cg2hits = currentIDHits["cg2"] cg3hits = currentIDHits["cg3"] @@ -206,8 +209,11 @@ o.write(ID + "\tIGG3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") else: #cg4 gene o.write(ID + "\tIGG4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n") - else: #its a cm gene - o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n") + else: #its a cm or ce gene + if cmhits >= cehits: + o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n") + else: + o.write(ID + "\tIGE\t100\t" + str(int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n") seq_write_count += 1 print "Time: %i" % (int(time.time() * 1000) - starttime)