comparison shm_csr.py @ 62:aa8d37bd1930 draft

Uploaded
author davidvanzessen
date Tue, 05 Dec 2017 10:57:13 -0500
parents c5295dd10dfc
children 8728284105ee
comparison
equal deleted inserted replaced
61:275e759e7985 62:aa8d37bd1930
21 outfile = args.output 21 outfile = args.output
22 22
23 genedic = dict() 23 genedic = dict()
24 24
25 mutationdic = dict() 25 mutationdic = dict()
26 mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") 26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?")
28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
29 mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
27 NAMatchResult = (None, None, None, None, None, None, '') 30 NAMatchResult = (None, None, None, None, None, None, '')
28 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} 31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
29 linecount = 0 32 linecount = 0
30 33
31 IDIndex = 0 34 IDIndex = 0
76 continue 79 continue
77 linecount += 1 80 linecount += 1
78 linesplt = line.split("\t") 81 linesplt = line.split("\t")
79 ID = linesplt[IDIndex] 82 ID = linesplt[IDIndex]
80 genedic[ID] = linesplt[best_matchIndex] 83 genedic[ID] = linesplt[best_matchIndex]
84
85 mutationdic[ID + "_FR1"] = []
86 if len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader":
87 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x]
88
89 mutationdic[ID + "_CDR1"] = []
90 if len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]:
91 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
92
93 mutationdic[ID + "_FR2"] = []
94 if len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", "CDR1"]:
95 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
96
97 mutationdic[ID + "_CDR2"] = []
98 if len(linesplt[cdr2Index]) > 5:
99 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
100
101 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
102
103 mutationdic[ID + "_FR3"] = []
104 if len(linesplt[fr3Index]) > 5:
105 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
106
81 try: 107 try:
82 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] 108 pass
83 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
84 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
85 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
86 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
87 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
88 except Exception as e: 109 except Exception as e:
89 print "Something went wrong while processing this line:" 110 print "Something went wrong while processing this line:"
111 print "line:", linecount
112 print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index]
113 print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index]
114 print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index]
115 print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index]
116 print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index]
117 print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic
118 print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic
119 print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic
120 print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic
121 print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic
90 print linesplt 122 print linesplt
91 print linecount
92 print e 123 print e
93 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 124 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
94 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 125 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
95 126
96 cdr1Length = len(linesplt[cdr1LengthIndex]) 127 cdr1Length = len(linesplt[cdr1LengthIndex])
107 fr1LengthDict[ID] = fr1Length 138 fr1LengthDict[ID] = fr1Length
108 fr2LengthDict[ID] = fr2Length 139 fr2LengthDict[ID] = fr2Length
109 fr3LengthDict[ID] = fr3Length 140 fr3LengthDict[ID] = fr3Length
110 141
111 IDlist += [ID] 142 IDlist += [ID]
112 143 print "len(mutationdic) =", len(mutationdic)
144
145 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle:
146 for ID, lst in mutationdic.iteritems():
147 for mut in lst:
148 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut])))
113 149
114 #tandem mutation stuff 150 #tandem mutation stuff
115 tandem_frequency = defaultdict(int) 151 tandem_frequency = defaultdict(int)
116 mutation_frequency = defaultdict(int) 152 mutation_frequency = defaultdict(int)
117 153
220 with open(tandem_freq_file, 'w') as o: 256 with open(tandem_freq_file, 'w') as o:
221 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) 257 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row])))
222 258
223 #print mutationList, linecount 259 #print mutationList, linecount
224 260
225 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent 261 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] and i[5] != ";" else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent
226 if AALength < 60: 262 if AALength < 60:
227 AALength = 64 263 AALength = 64
228 264
229 AA_mutation = [0] * AALength 265 AA_mutation = [0] * AALength
230 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} 266 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
336 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] 372 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
337 TW = [(int(x), int(y), z) for (x, y, z) in 373 TW = [(int(x), int(y), z) for (x, y, z) in
338 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] 374 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
339 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 375 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
340 376
377 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "RGYW.txt"), 'a') as out_handle:
378 for hotspot in RGYW:
379 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in hotspot])))
380
341 mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 381 mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
342 for mutation in mutationList: 382 for mutation in mutationList:
343 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation 383 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
344 mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) 384 mutation_in_RGYW = any(((start <= int(where) <= end) for (start, end, region) in RGYW))
345 mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) 385 mutation_in_WRCY = any(((start <= int(where) <= end) for (start, end, region) in WRCY))
346 mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) 386 mutation_in_WA = any(((start <= int(where) <= end) for (start, end, region) in WA))
347 mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) 387 mutation_in_TW = any(((start <= int(where) <= end) for (start, end, region) in TW))
348 388
349 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) 389 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
350 390
351 if in_how_many_motifs > 0: 391 if in_how_many_motifs > 0:
352 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs 392 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs