Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.py @ 83:729738462297 draft
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
author | rhpvorderman |
---|---|
date | Wed, 15 Sep 2021 12:24:06 +0000 |
parents | b6f9a640e098 |
children | 6809c63d9161 |
comparison
equal
deleted
inserted
replaced
82:a103134ee6e0 | 83:729738462297 |
---|---|
24 | 24 |
25 mutationdic = dict() | 25 mutationdic = dict() |
26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") | 26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") |
27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") | 27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") |
28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") | 28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") |
29 mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") | 29 mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?") |
30 NAMatchResult = (None, None, None, None, None, None, '') | 30 NAMatchResult = (None, None, None, None, None, None, '') |
31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} | 31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} |
32 linecount = 0 | 32 linecount = 0 |
33 | 33 |
34 IDIndex = 0 | 34 IDIndex = 0 |
57 fr3SeqIndex = 0 | 57 fr3SeqIndex = 0 |
58 | 58 |
59 tandem_sum_by_class = defaultdict(int) | 59 tandem_sum_by_class = defaultdict(int) |
60 expected_tandem_sum_by_class = defaultdict(float) | 60 expected_tandem_sum_by_class = defaultdict(float) |
61 | 61 |
62 with open(infile, 'ru') as i: | 62 with open(infile, 'r') as i: |
63 for line in i: | 63 for line in i: |
64 if first: | 64 if first: |
65 linesplt = line.split("\t") | 65 linesplt = line.split("\t") |
66 IDIndex = linesplt.index("Sequence.ID") | 66 IDIndex = linesplt.index("Sequence.ID") |
67 best_matchIndex = linesplt.index("best_match") | 67 best_matchIndex = linesplt.index("best_match") |
128 fr1LengthDict[ID] = fr1Length | 128 fr1LengthDict[ID] = fr1Length |
129 fr2LengthDict[ID] = fr2Length | 129 fr2LengthDict[ID] = fr2Length |
130 fr3LengthDict[ID] = fr3Length | 130 fr3LengthDict[ID] = fr3Length |
131 | 131 |
132 IDlist += [ID] | 132 IDlist += [ID] |
133 print "len(mutationdic) =", len(mutationdic) | 133 print("len(mutationdic) =", len(mutationdic)) |
134 | 134 |
135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: | 135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: |
136 for ID, lst in mutationdic.iteritems(): | 136 for ID, lst in mutationdic.items(): |
137 for mut in lst: | 137 for mut in lst: |
138 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) | 138 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) |
139 | 139 |
140 #tandem mutation stuff | 140 #tandem mutation stuff |
141 tandem_frequency = defaultdict(int) | 141 tandem_frequency = defaultdict(int) |
228 tandem_frequency[str(tandem_mut[1])] += 1 | 228 tandem_frequency[str(tandem_mut[1])] += 1 |
229 #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)]) | 229 #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)]) |
230 | 230 |
231 tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") | 231 tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") |
232 with open(tandem_freq_file, 'w') as o: | 232 with open(tandem_freq_file, 'w') as o: |
233 for frq in sorted([int(x) for x in tandem_frequency.keys()]): | 233 for frq in sorted([int(x) for x in list(tandem_frequency.keys())]): |
234 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) | 234 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) |
235 | 235 |
236 tandem_row = [] | 236 tandem_row = [] |
237 genes_extra = list(genes) | 237 genes_extra = list(genes) |
238 genes_extra.append("all") | 238 genes_extra.append("all") |
254 | 254 |
255 AA_mutation = [0] * AALength | 255 AA_mutation = [0] * AALength |
256 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} | 256 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} |
257 AA_mutation_empty = AA_mutation[:] | 257 AA_mutation_empty = AA_mutation[:] |
258 | 258 |
259 print "AALength:", AALength | 259 print("AALength:", AALength) |
260 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" | 260 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" |
261 with open(aa_mutations_by_id_file, 'w') as o: | 261 with open(aa_mutations_by_id_file, 'w') as o: |
262 o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") | 262 o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") |
263 for ID in mutationListByID.keys(): | 263 for ID in list(mutationListByID.keys()): |
264 AA_mutation_for_ID = AA_mutation_empty[:] | 264 AA_mutation_for_ID = AA_mutation_empty[:] |
265 for mutation in mutationListByID[ID]: | 265 for mutation in mutationListByID[ID]: |
266 if mutation[4] and mutation[5] != ";": | 266 if mutation[4] and mutation[5] != ";": |
267 AA_mutation_position = int(mutation[4]) | 267 AA_mutation_position = int(mutation[4]) |
268 try: | 268 try: |
269 AA_mutation[AA_mutation_position] += 1 | 269 AA_mutation[AA_mutation_position] += 1 |
270 AA_mutation_for_ID[AA_mutation_position] += 1 | 270 AA_mutation_for_ID[AA_mutation_position] += 1 |
271 except Exception as e: | 271 except Exception as e: |
272 print e | 272 print(e) |
273 print mutation | 273 print(mutation) |
274 sys.exit() | 274 sys.exit() |
275 clss = genedic[ID][:3] | 275 clss = genedic[ID][:3] |
276 AA_mutation_dic[clss][AA_mutation_position] += 1 | 276 AA_mutation_dic[clss][AA_mutation_position] += 1 |
277 o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") | 277 o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") |
278 | 278 |
279 | 279 |
280 | 280 |
281 #absent AA stuff | 281 #absent AA stuff |
282 absentAACDR1Dic = defaultdict(list) | 282 absentAACDR1Dic = defaultdict(list) |
283 absentAACDR1Dic[5] = range(29,36) | 283 absentAACDR1Dic[5] = list(range(29,36)) |
284 absentAACDR1Dic[6] = range(29,35) | 284 absentAACDR1Dic[6] = list(range(29,35)) |
285 absentAACDR1Dic[7] = range(30,35) | 285 absentAACDR1Dic[7] = list(range(30,35)) |
286 absentAACDR1Dic[8] = range(30,34) | 286 absentAACDR1Dic[8] = list(range(30,34)) |
287 absentAACDR1Dic[9] = range(31,34) | 287 absentAACDR1Dic[9] = list(range(31,34)) |
288 absentAACDR1Dic[10] = range(31,33) | 288 absentAACDR1Dic[10] = list(range(31,33)) |
289 absentAACDR1Dic[11] = [32] | 289 absentAACDR1Dic[11] = [32] |
290 | 290 |
291 absentAACDR2Dic = defaultdict(list) | 291 absentAACDR2Dic = defaultdict(list) |
292 absentAACDR2Dic[0] = range(55,65) | 292 absentAACDR2Dic[0] = list(range(55,65)) |
293 absentAACDR2Dic[1] = range(56,65) | 293 absentAACDR2Dic[1] = list(range(56,65)) |
294 absentAACDR2Dic[2] = range(56,64) | 294 absentAACDR2Dic[2] = list(range(56,64)) |
295 absentAACDR2Dic[3] = range(57,64) | 295 absentAACDR2Dic[3] = list(range(57,64)) |
296 absentAACDR2Dic[4] = range(57,63) | 296 absentAACDR2Dic[4] = list(range(57,63)) |
297 absentAACDR2Dic[5] = range(58,63) | 297 absentAACDR2Dic[5] = list(range(58,63)) |
298 absentAACDR2Dic[6] = range(58,62) | 298 absentAACDR2Dic[6] = list(range(58,62)) |
299 absentAACDR2Dic[7] = range(59,62) | 299 absentAACDR2Dic[7] = list(range(59,62)) |
300 absentAACDR2Dic[8] = range(59,61) | 300 absentAACDR2Dic[8] = list(range(59,61)) |
301 absentAACDR2Dic[9] = [60] | 301 absentAACDR2Dic[9] = [60] |
302 | 302 |
303 absentAA = [len(IDlist)] * (AALength-1) | 303 absentAA = [len(IDlist)] * (AALength-1) |
304 for k, cdr1Length in cdr1LengthDic.iteritems(): | 304 for k, cdr1Length in cdr1LengthDic.items(): |
305 for c in absentAACDR1Dic[cdr1Length]: | 305 for c in absentAACDR1Dic[cdr1Length]: |
306 absentAA[c] -= 1 | 306 absentAA[c] -= 1 |
307 | 307 |
308 for k, cdr2Length in cdr2LengthDic.iteritems(): | 308 for k, cdr2Length in cdr2LengthDic.items(): |
309 for c in absentAACDR2Dic[cdr2Length]: | 309 for c in absentAACDR2Dic[cdr2Length]: |
310 absentAA[c] -= 1 | 310 absentAA[c] -= 1 |
311 | 311 |
312 | 312 |
313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" | 313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" |
323 for c in absentAACDR2Dic[cdr2Length]: | 323 for c in absentAACDR2Dic[cdr2Length]: |
324 absentAAbyID[c] -= 1 | 324 absentAAbyID[c] -= 1 |
325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") | 325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") |
326 | 326 |
327 if linecount == 0: | 327 if linecount == 0: |
328 print "No data, exiting" | 328 print("No data, exiting") |
329 with open(outfile, 'w') as o: | 329 with open(outfile, 'w') as o: |
330 o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) | 330 o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) |
331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) | 331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) |
332 o.write("WA (%)," + ("0,0,0\n" * len(genes))) | 332 o.write("WA (%)," + ("0,0,0\n" * len(genes))) |
333 o.write("TW (%)," + ("0,0,0\n" * len(genes))) | 333 o.write("TW (%)," + ("0,0,0\n" * len(genes))) |
334 import sys | |
335 | |
336 sys.exit() | 334 sys.exit() |
337 | 335 |
338 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") | 336 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") |
339 RGYWCount = {} | 337 RGYWCount = {} |
340 WRCYCount = {} | 338 WRCYCount = {} |
345 ataIndex = 0 | 343 ataIndex = 0 |
346 tatIndex = 0 | 344 tatIndex = 0 |
347 aggctatIndex = 0 | 345 aggctatIndex = 0 |
348 atagcctIndex = 0 | 346 atagcctIndex = 0 |
349 first = True | 347 first = True |
350 with open(infile, 'ru') as i: | 348 with open(infile, 'r') as i: |
351 for line in i: | 349 for line in i: |
352 if first: | 350 if first: |
353 linesplt = line.split("\t") | 351 linesplt = line.split("\t") |
354 ataIndex = linesplt.index("X.a.t.a") | 352 ataIndex = linesplt.index("X.a.t.a") |
355 tatIndex = linesplt.index("t.a.t.") | 353 tatIndex = linesplt.index("t.a.t.") |
410 | 408 |
411 with open(mutations_in_motifs_file, 'a') as out_handle: | 409 with open(mutations_in_motifs_file, 'a') as out_handle: |
412 motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} | 410 motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} |
413 for mutation in mutationList: | 411 for mutation in mutationList: |
414 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation | 412 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation |
415 for motif in motif_dic.keys(): | 413 for motif in list(motif_dic.keys()): |
416 | 414 |
417 for start, end, region in motif_dic[motif]: | 415 for start, end, region in motif_dic[motif]: |
418 if start <= int(where) <= end: | 416 if start <= int(where) <= end: |
419 out_handle.write("{0}\n".format( | 417 out_handle.write("{0}\n".format( |
420 "\t".join([ | 418 "\t".join([ |
458 | 456 |
459 directory = outfile[:outfile.rfind("/") + 1] | 457 directory = outfile[:outfile.rfind("/") + 1] |
460 value = 0 | 458 value = 0 |
461 valuedic = dict() | 459 valuedic = dict() |
462 | 460 |
463 for fname in funcs.keys(): | 461 for fname in list(funcs.keys()): |
464 for gene in genes: | 462 for gene in genes: |
465 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: | 463 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: |
466 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) | 464 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) |
467 with open(directory + "all_" + fname + "_value.txt", 'r') as v: | 465 with open(directory + "all_" + fname + "_value.txt", 'r') as v: |
468 valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) | 466 valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) |
475 return (str(x), str(y), z) | 473 return (str(x), str(y), z) |
476 | 474 |
477 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} | 475 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} |
478 arr = ["RGYW", "WRCY", "WA", "TW"] | 476 arr = ["RGYW", "WRCY", "WA", "TW"] |
479 | 477 |
480 for fname in funcs.keys(): | 478 for fname in list(funcs.keys()): |
481 func = funcs[fname] | 479 func = funcs[fname] |
482 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" | 480 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" |
483 with open(foutfile, 'w') as o: | 481 with open(foutfile, 'w') as o: |
484 for typ in arr: | 482 for typ in arr: |
485 o.write(typ + " (%)") | 483 o.write(typ + " (%)") |
487 for gene in genes: | 485 for gene in genes: |
488 geneMatcher = geneMatchers[gene] | 486 geneMatcher = geneMatchers[gene] |
489 if valuedic[gene + "_" + fname] is 0: | 487 if valuedic[gene + "_" + fname] is 0: |
490 o.write(",0,0,0") | 488 o.write(",0,0,0") |
491 else: | 489 else: |
492 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) | 490 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname) |
493 o.write("," + x + "," + y + "," + z) | 491 o.write("," + x + "," + y + "," + z) |
494 x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) | 492 x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname) |
495 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) | 493 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) |
496 o.write("," + x + "," + y + "," + z + "\n") | 494 o.write("," + x + "," + y + "," + z + "\n") |
497 | 495 |
498 | 496 |
499 # for testing | 497 # for testing |