comparison shm_csr.py @ 83:729738462297 draft

"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
author rhpvorderman
date Wed, 15 Sep 2021 12:24:06 +0000
parents b6f9a640e098
children 6809c63d9161
comparison
equal deleted inserted replaced
82:a103134ee6e0 83:729738462297
24 24
25 mutationdic = dict() 25 mutationdic = dict()
26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") 26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") 27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?")
28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") 28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
29 mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") 29 mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?")
30 NAMatchResult = (None, None, None, None, None, None, '') 30 NAMatchResult = (None, None, None, None, None, None, '')
31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} 31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
32 linecount = 0 32 linecount = 0
33 33
34 IDIndex = 0 34 IDIndex = 0
57 fr3SeqIndex = 0 57 fr3SeqIndex = 0
58 58
59 tandem_sum_by_class = defaultdict(int) 59 tandem_sum_by_class = defaultdict(int)
60 expected_tandem_sum_by_class = defaultdict(float) 60 expected_tandem_sum_by_class = defaultdict(float)
61 61
62 with open(infile, 'ru') as i: 62 with open(infile, 'r') as i:
63 for line in i: 63 for line in i:
64 if first: 64 if first:
65 linesplt = line.split("\t") 65 linesplt = line.split("\t")
66 IDIndex = linesplt.index("Sequence.ID") 66 IDIndex = linesplt.index("Sequence.ID")
67 best_matchIndex = linesplt.index("best_match") 67 best_matchIndex = linesplt.index("best_match")
128 fr1LengthDict[ID] = fr1Length 128 fr1LengthDict[ID] = fr1Length
129 fr2LengthDict[ID] = fr2Length 129 fr2LengthDict[ID] = fr2Length
130 fr3LengthDict[ID] = fr3Length 130 fr3LengthDict[ID] = fr3Length
131 131
132 IDlist += [ID] 132 IDlist += [ID]
133 print "len(mutationdic) =", len(mutationdic) 133 print("len(mutationdic) =", len(mutationdic))
134 134
135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: 135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle:
136 for ID, lst in mutationdic.iteritems(): 136 for ID, lst in mutationdic.items():
137 for mut in lst: 137 for mut in lst:
138 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) 138 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut])))
139 139
140 #tandem mutation stuff 140 #tandem mutation stuff
141 tandem_frequency = defaultdict(int) 141 tandem_frequency = defaultdict(int)
228 tandem_frequency[str(tandem_mut[1])] += 1 228 tandem_frequency[str(tandem_mut[1])] += 1
229 #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)]) 229 #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)])
230 230
231 tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") 231 tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt")
232 with open(tandem_freq_file, 'w') as o: 232 with open(tandem_freq_file, 'w') as o:
233 for frq in sorted([int(x) for x in tandem_frequency.keys()]): 233 for frq in sorted([int(x) for x in list(tandem_frequency.keys())]):
234 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) 234 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)]))
235 235
236 tandem_row = [] 236 tandem_row = []
237 genes_extra = list(genes) 237 genes_extra = list(genes)
238 genes_extra.append("all") 238 genes_extra.append("all")
254 254
255 AA_mutation = [0] * AALength 255 AA_mutation = [0] * AALength
256 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} 256 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
257 AA_mutation_empty = AA_mutation[:] 257 AA_mutation_empty = AA_mutation[:]
258 258
259 print "AALength:", AALength 259 print("AALength:", AALength)
260 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" 260 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
261 with open(aa_mutations_by_id_file, 'w') as o: 261 with open(aa_mutations_by_id_file, 'w') as o:
262 o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") 262 o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
263 for ID in mutationListByID.keys(): 263 for ID in list(mutationListByID.keys()):
264 AA_mutation_for_ID = AA_mutation_empty[:] 264 AA_mutation_for_ID = AA_mutation_empty[:]
265 for mutation in mutationListByID[ID]: 265 for mutation in mutationListByID[ID]:
266 if mutation[4] and mutation[5] != ";": 266 if mutation[4] and mutation[5] != ";":
267 AA_mutation_position = int(mutation[4]) 267 AA_mutation_position = int(mutation[4])
268 try: 268 try:
269 AA_mutation[AA_mutation_position] += 1 269 AA_mutation[AA_mutation_position] += 1
270 AA_mutation_for_ID[AA_mutation_position] += 1 270 AA_mutation_for_ID[AA_mutation_position] += 1
271 except Exception as e: 271 except Exception as e:
272 print e 272 print(e)
273 print mutation 273 print(mutation)
274 sys.exit() 274 sys.exit()
275 clss = genedic[ID][:3] 275 clss = genedic[ID][:3]
276 AA_mutation_dic[clss][AA_mutation_position] += 1 276 AA_mutation_dic[clss][AA_mutation_position] += 1
277 o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") 277 o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
278 278
279 279
280 280
281 #absent AA stuff 281 #absent AA stuff
282 absentAACDR1Dic = defaultdict(list) 282 absentAACDR1Dic = defaultdict(list)
283 absentAACDR1Dic[5] = range(29,36) 283 absentAACDR1Dic[5] = list(range(29,36))
284 absentAACDR1Dic[6] = range(29,35) 284 absentAACDR1Dic[6] = list(range(29,35))
285 absentAACDR1Dic[7] = range(30,35) 285 absentAACDR1Dic[7] = list(range(30,35))
286 absentAACDR1Dic[8] = range(30,34) 286 absentAACDR1Dic[8] = list(range(30,34))
287 absentAACDR1Dic[9] = range(31,34) 287 absentAACDR1Dic[9] = list(range(31,34))
288 absentAACDR1Dic[10] = range(31,33) 288 absentAACDR1Dic[10] = list(range(31,33))
289 absentAACDR1Dic[11] = [32] 289 absentAACDR1Dic[11] = [32]
290 290
291 absentAACDR2Dic = defaultdict(list) 291 absentAACDR2Dic = defaultdict(list)
292 absentAACDR2Dic[0] = range(55,65) 292 absentAACDR2Dic[0] = list(range(55,65))
293 absentAACDR2Dic[1] = range(56,65) 293 absentAACDR2Dic[1] = list(range(56,65))
294 absentAACDR2Dic[2] = range(56,64) 294 absentAACDR2Dic[2] = list(range(56,64))
295 absentAACDR2Dic[3] = range(57,64) 295 absentAACDR2Dic[3] = list(range(57,64))
296 absentAACDR2Dic[4] = range(57,63) 296 absentAACDR2Dic[4] = list(range(57,63))
297 absentAACDR2Dic[5] = range(58,63) 297 absentAACDR2Dic[5] = list(range(58,63))
298 absentAACDR2Dic[6] = range(58,62) 298 absentAACDR2Dic[6] = list(range(58,62))
299 absentAACDR2Dic[7] = range(59,62) 299 absentAACDR2Dic[7] = list(range(59,62))
300 absentAACDR2Dic[8] = range(59,61) 300 absentAACDR2Dic[8] = list(range(59,61))
301 absentAACDR2Dic[9] = [60] 301 absentAACDR2Dic[9] = [60]
302 302
303 absentAA = [len(IDlist)] * (AALength-1) 303 absentAA = [len(IDlist)] * (AALength-1)
304 for k, cdr1Length in cdr1LengthDic.iteritems(): 304 for k, cdr1Length in cdr1LengthDic.items():
305 for c in absentAACDR1Dic[cdr1Length]: 305 for c in absentAACDR1Dic[cdr1Length]:
306 absentAA[c] -= 1 306 absentAA[c] -= 1
307 307
308 for k, cdr2Length in cdr2LengthDic.iteritems(): 308 for k, cdr2Length in cdr2LengthDic.items():
309 for c in absentAACDR2Dic[cdr2Length]: 309 for c in absentAACDR2Dic[cdr2Length]:
310 absentAA[c] -= 1 310 absentAA[c] -= 1
311 311
312 312
313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" 313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
323 for c in absentAACDR2Dic[cdr2Length]: 323 for c in absentAACDR2Dic[cdr2Length]:
324 absentAAbyID[c] -= 1 324 absentAAbyID[c] -= 1
325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") 325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
326 326
327 if linecount == 0: 327 if linecount == 0:
328 print "No data, exiting" 328 print("No data, exiting")
329 with open(outfile, 'w') as o: 329 with open(outfile, 'w') as o:
330 o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) 330 o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) 331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
332 o.write("WA (%)," + ("0,0,0\n" * len(genes))) 332 o.write("WA (%)," + ("0,0,0\n" * len(genes)))
333 o.write("TW (%)," + ("0,0,0\n" * len(genes))) 333 o.write("TW (%)," + ("0,0,0\n" * len(genes)))
334 import sys
335
336 sys.exit() 334 sys.exit()
337 335
338 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") 336 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
339 RGYWCount = {} 337 RGYWCount = {}
340 WRCYCount = {} 338 WRCYCount = {}
345 ataIndex = 0 343 ataIndex = 0
346 tatIndex = 0 344 tatIndex = 0
347 aggctatIndex = 0 345 aggctatIndex = 0
348 atagcctIndex = 0 346 atagcctIndex = 0
349 first = True 347 first = True
350 with open(infile, 'ru') as i: 348 with open(infile, 'r') as i:
351 for line in i: 349 for line in i:
352 if first: 350 if first:
353 linesplt = line.split("\t") 351 linesplt = line.split("\t")
354 ataIndex = linesplt.index("X.a.t.a") 352 ataIndex = linesplt.index("X.a.t.a")
355 tatIndex = linesplt.index("t.a.t.") 353 tatIndex = linesplt.index("t.a.t.")
410 408
411 with open(mutations_in_motifs_file, 'a') as out_handle: 409 with open(mutations_in_motifs_file, 'a') as out_handle:
412 motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} 410 motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}
413 for mutation in mutationList: 411 for mutation in mutationList:
414 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation 412 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
415 for motif in motif_dic.keys(): 413 for motif in list(motif_dic.keys()):
416 414
417 for start, end, region in motif_dic[motif]: 415 for start, end, region in motif_dic[motif]:
418 if start <= int(where) <= end: 416 if start <= int(where) <= end:
419 out_handle.write("{0}\n".format( 417 out_handle.write("{0}\n".format(
420 "\t".join([ 418 "\t".join([
458 456
459 directory = outfile[:outfile.rfind("/") + 1] 457 directory = outfile[:outfile.rfind("/") + 1]
460 value = 0 458 value = 0
461 valuedic = dict() 459 valuedic = dict()
462 460
463 for fname in funcs.keys(): 461 for fname in list(funcs.keys()):
464 for gene in genes: 462 for gene in genes:
465 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: 463 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
466 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) 464 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
467 with open(directory + "all_" + fname + "_value.txt", 'r') as v: 465 with open(directory + "all_" + fname + "_value.txt", 'r') as v:
468 valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) 466 valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
475 return (str(x), str(y), z) 473 return (str(x), str(y), z)
476 474
477 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} 475 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
478 arr = ["RGYW", "WRCY", "WA", "TW"] 476 arr = ["RGYW", "WRCY", "WA", "TW"]
479 477
480 for fname in funcs.keys(): 478 for fname in list(funcs.keys()):
481 func = funcs[fname] 479 func = funcs[fname]
482 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" 480 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
483 with open(foutfile, 'w') as o: 481 with open(foutfile, 'w') as o:
484 for typ in arr: 482 for typ in arr:
485 o.write(typ + " (%)") 483 o.write(typ + " (%)")
487 for gene in genes: 485 for gene in genes:
488 geneMatcher = geneMatchers[gene] 486 geneMatcher = geneMatchers[gene]
489 if valuedic[gene + "_" + fname] is 0: 487 if valuedic[gene + "_" + fname] is 0:
490 o.write(",0,0,0") 488 o.write(",0,0,0")
491 else: 489 else:
492 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) 490 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname)
493 o.write("," + x + "," + y + "," + z) 491 o.write("," + x + "," + y + "," + z)
494 x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) 492 x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname)
495 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) 493 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
496 o.write("," + x + "," + y + "," + z + "\n") 494 o.write("," + x + "," + y + "," + z + "\n")
497 495
498 496
499 # for testing 497 # for testing