Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.py @ 83:729738462297 draft
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
| author | rhpvorderman |
|---|---|
| date | Wed, 15 Sep 2021 12:24:06 +0000 |
| parents | b6f9a640e098 |
| children | 6809c63d9161 |
comparison
equal
deleted
inserted
replaced
| 82:a103134ee6e0 | 83:729738462297 |
|---|---|
| 24 | 24 |
| 25 mutationdic = dict() | 25 mutationdic = dict() |
| 26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") | 26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") |
| 27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") | 27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") |
| 28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") | 28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") |
| 29 mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") | 29 mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?") |
| 30 NAMatchResult = (None, None, None, None, None, None, '') | 30 NAMatchResult = (None, None, None, None, None, None, '') |
| 31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} | 31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} |
| 32 linecount = 0 | 32 linecount = 0 |
| 33 | 33 |
| 34 IDIndex = 0 | 34 IDIndex = 0 |
| 57 fr3SeqIndex = 0 | 57 fr3SeqIndex = 0 |
| 58 | 58 |
| 59 tandem_sum_by_class = defaultdict(int) | 59 tandem_sum_by_class = defaultdict(int) |
| 60 expected_tandem_sum_by_class = defaultdict(float) | 60 expected_tandem_sum_by_class = defaultdict(float) |
| 61 | 61 |
| 62 with open(infile, 'ru') as i: | 62 with open(infile, 'r') as i: |
| 63 for line in i: | 63 for line in i: |
| 64 if first: | 64 if first: |
| 65 linesplt = line.split("\t") | 65 linesplt = line.split("\t") |
| 66 IDIndex = linesplt.index("Sequence.ID") | 66 IDIndex = linesplt.index("Sequence.ID") |
| 67 best_matchIndex = linesplt.index("best_match") | 67 best_matchIndex = linesplt.index("best_match") |
| 128 fr1LengthDict[ID] = fr1Length | 128 fr1LengthDict[ID] = fr1Length |
| 129 fr2LengthDict[ID] = fr2Length | 129 fr2LengthDict[ID] = fr2Length |
| 130 fr3LengthDict[ID] = fr3Length | 130 fr3LengthDict[ID] = fr3Length |
| 131 | 131 |
| 132 IDlist += [ID] | 132 IDlist += [ID] |
| 133 print "len(mutationdic) =", len(mutationdic) | 133 print("len(mutationdic) =", len(mutationdic)) |
| 134 | 134 |
| 135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: | 135 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: |
| 136 for ID, lst in mutationdic.iteritems(): | 136 for ID, lst in mutationdic.items(): |
| 137 for mut in lst: | 137 for mut in lst: |
| 138 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) | 138 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) |
| 139 | 139 |
| 140 #tandem mutation stuff | 140 #tandem mutation stuff |
| 141 tandem_frequency = defaultdict(int) | 141 tandem_frequency = defaultdict(int) |
| 228 tandem_frequency[str(tandem_mut[1])] += 1 | 228 tandem_frequency[str(tandem_mut[1])] += 1 |
| 229 #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)]) | 229 #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)]) |
| 230 | 230 |
| 231 tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") | 231 tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") |
| 232 with open(tandem_freq_file, 'w') as o: | 232 with open(tandem_freq_file, 'w') as o: |
| 233 for frq in sorted([int(x) for x in tandem_frequency.keys()]): | 233 for frq in sorted([int(x) for x in list(tandem_frequency.keys())]): |
| 234 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) | 234 o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) |
| 235 | 235 |
| 236 tandem_row = [] | 236 tandem_row = [] |
| 237 genes_extra = list(genes) | 237 genes_extra = list(genes) |
| 238 genes_extra.append("all") | 238 genes_extra.append("all") |
| 254 | 254 |
| 255 AA_mutation = [0] * AALength | 255 AA_mutation = [0] * AALength |
| 256 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} | 256 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} |
| 257 AA_mutation_empty = AA_mutation[:] | 257 AA_mutation_empty = AA_mutation[:] |
| 258 | 258 |
| 259 print "AALength:", AALength | 259 print("AALength:", AALength) |
| 260 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" | 260 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" |
| 261 with open(aa_mutations_by_id_file, 'w') as o: | 261 with open(aa_mutations_by_id_file, 'w') as o: |
| 262 o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") | 262 o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") |
| 263 for ID in mutationListByID.keys(): | 263 for ID in list(mutationListByID.keys()): |
| 264 AA_mutation_for_ID = AA_mutation_empty[:] | 264 AA_mutation_for_ID = AA_mutation_empty[:] |
| 265 for mutation in mutationListByID[ID]: | 265 for mutation in mutationListByID[ID]: |
| 266 if mutation[4] and mutation[5] != ";": | 266 if mutation[4] and mutation[5] != ";": |
| 267 AA_mutation_position = int(mutation[4]) | 267 AA_mutation_position = int(mutation[4]) |
| 268 try: | 268 try: |
| 269 AA_mutation[AA_mutation_position] += 1 | 269 AA_mutation[AA_mutation_position] += 1 |
| 270 AA_mutation_for_ID[AA_mutation_position] += 1 | 270 AA_mutation_for_ID[AA_mutation_position] += 1 |
| 271 except Exception as e: | 271 except Exception as e: |
| 272 print e | 272 print(e) |
| 273 print mutation | 273 print(mutation) |
| 274 sys.exit() | 274 sys.exit() |
| 275 clss = genedic[ID][:3] | 275 clss = genedic[ID][:3] |
| 276 AA_mutation_dic[clss][AA_mutation_position] += 1 | 276 AA_mutation_dic[clss][AA_mutation_position] += 1 |
| 277 o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") | 277 o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") |
| 278 | 278 |
| 279 | 279 |
| 280 | 280 |
| 281 #absent AA stuff | 281 #absent AA stuff |
| 282 absentAACDR1Dic = defaultdict(list) | 282 absentAACDR1Dic = defaultdict(list) |
| 283 absentAACDR1Dic[5] = range(29,36) | 283 absentAACDR1Dic[5] = list(range(29,36)) |
| 284 absentAACDR1Dic[6] = range(29,35) | 284 absentAACDR1Dic[6] = list(range(29,35)) |
| 285 absentAACDR1Dic[7] = range(30,35) | 285 absentAACDR1Dic[7] = list(range(30,35)) |
| 286 absentAACDR1Dic[8] = range(30,34) | 286 absentAACDR1Dic[8] = list(range(30,34)) |
| 287 absentAACDR1Dic[9] = range(31,34) | 287 absentAACDR1Dic[9] = list(range(31,34)) |
| 288 absentAACDR1Dic[10] = range(31,33) | 288 absentAACDR1Dic[10] = list(range(31,33)) |
| 289 absentAACDR1Dic[11] = [32] | 289 absentAACDR1Dic[11] = [32] |
| 290 | 290 |
| 291 absentAACDR2Dic = defaultdict(list) | 291 absentAACDR2Dic = defaultdict(list) |
| 292 absentAACDR2Dic[0] = range(55,65) | 292 absentAACDR2Dic[0] = list(range(55,65)) |
| 293 absentAACDR2Dic[1] = range(56,65) | 293 absentAACDR2Dic[1] = list(range(56,65)) |
| 294 absentAACDR2Dic[2] = range(56,64) | 294 absentAACDR2Dic[2] = list(range(56,64)) |
| 295 absentAACDR2Dic[3] = range(57,64) | 295 absentAACDR2Dic[3] = list(range(57,64)) |
| 296 absentAACDR2Dic[4] = range(57,63) | 296 absentAACDR2Dic[4] = list(range(57,63)) |
| 297 absentAACDR2Dic[5] = range(58,63) | 297 absentAACDR2Dic[5] = list(range(58,63)) |
| 298 absentAACDR2Dic[6] = range(58,62) | 298 absentAACDR2Dic[6] = list(range(58,62)) |
| 299 absentAACDR2Dic[7] = range(59,62) | 299 absentAACDR2Dic[7] = list(range(59,62)) |
| 300 absentAACDR2Dic[8] = range(59,61) | 300 absentAACDR2Dic[8] = list(range(59,61)) |
| 301 absentAACDR2Dic[9] = [60] | 301 absentAACDR2Dic[9] = [60] |
| 302 | 302 |
| 303 absentAA = [len(IDlist)] * (AALength-1) | 303 absentAA = [len(IDlist)] * (AALength-1) |
| 304 for k, cdr1Length in cdr1LengthDic.iteritems(): | 304 for k, cdr1Length in cdr1LengthDic.items(): |
| 305 for c in absentAACDR1Dic[cdr1Length]: | 305 for c in absentAACDR1Dic[cdr1Length]: |
| 306 absentAA[c] -= 1 | 306 absentAA[c] -= 1 |
| 307 | 307 |
| 308 for k, cdr2Length in cdr2LengthDic.iteritems(): | 308 for k, cdr2Length in cdr2LengthDic.items(): |
| 309 for c in absentAACDR2Dic[cdr2Length]: | 309 for c in absentAACDR2Dic[cdr2Length]: |
| 310 absentAA[c] -= 1 | 310 absentAA[c] -= 1 |
| 311 | 311 |
| 312 | 312 |
| 313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" | 313 aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" |
| 323 for c in absentAACDR2Dic[cdr2Length]: | 323 for c in absentAACDR2Dic[cdr2Length]: |
| 324 absentAAbyID[c] -= 1 | 324 absentAAbyID[c] -= 1 |
| 325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") | 325 o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") |
| 326 | 326 |
| 327 if linecount == 0: | 327 if linecount == 0: |
| 328 print "No data, exiting" | 328 print("No data, exiting") |
| 329 with open(outfile, 'w') as o: | 329 with open(outfile, 'w') as o: |
| 330 o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) | 330 o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) |
| 331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) | 331 o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) |
| 332 o.write("WA (%)," + ("0,0,0\n" * len(genes))) | 332 o.write("WA (%)," + ("0,0,0\n" * len(genes))) |
| 333 o.write("TW (%)," + ("0,0,0\n" * len(genes))) | 333 o.write("TW (%)," + ("0,0,0\n" * len(genes))) |
| 334 import sys | |
| 335 | |
| 336 sys.exit() | 334 sys.exit() |
| 337 | 335 |
| 338 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") | 336 hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") |
| 339 RGYWCount = {} | 337 RGYWCount = {} |
| 340 WRCYCount = {} | 338 WRCYCount = {} |
| 345 ataIndex = 0 | 343 ataIndex = 0 |
| 346 tatIndex = 0 | 344 tatIndex = 0 |
| 347 aggctatIndex = 0 | 345 aggctatIndex = 0 |
| 348 atagcctIndex = 0 | 346 atagcctIndex = 0 |
| 349 first = True | 347 first = True |
| 350 with open(infile, 'ru') as i: | 348 with open(infile, 'r') as i: |
| 351 for line in i: | 349 for line in i: |
| 352 if first: | 350 if first: |
| 353 linesplt = line.split("\t") | 351 linesplt = line.split("\t") |
| 354 ataIndex = linesplt.index("X.a.t.a") | 352 ataIndex = linesplt.index("X.a.t.a") |
| 355 tatIndex = linesplt.index("t.a.t.") | 353 tatIndex = linesplt.index("t.a.t.") |
| 410 | 408 |
| 411 with open(mutations_in_motifs_file, 'a') as out_handle: | 409 with open(mutations_in_motifs_file, 'a') as out_handle: |
| 412 motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} | 410 motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} |
| 413 for mutation in mutationList: | 411 for mutation in mutationList: |
| 414 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation | 412 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation |
| 415 for motif in motif_dic.keys(): | 413 for motif in list(motif_dic.keys()): |
| 416 | 414 |
| 417 for start, end, region in motif_dic[motif]: | 415 for start, end, region in motif_dic[motif]: |
| 418 if start <= int(where) <= end: | 416 if start <= int(where) <= end: |
| 419 out_handle.write("{0}\n".format( | 417 out_handle.write("{0}\n".format( |
| 420 "\t".join([ | 418 "\t".join([ |
| 458 | 456 |
| 459 directory = outfile[:outfile.rfind("/") + 1] | 457 directory = outfile[:outfile.rfind("/") + 1] |
| 460 value = 0 | 458 value = 0 |
| 461 valuedic = dict() | 459 valuedic = dict() |
| 462 | 460 |
| 463 for fname in funcs.keys(): | 461 for fname in list(funcs.keys()): |
| 464 for gene in genes: | 462 for gene in genes: |
| 465 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: | 463 with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: |
| 466 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) | 464 valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) |
| 467 with open(directory + "all_" + fname + "_value.txt", 'r') as v: | 465 with open(directory + "all_" + fname + "_value.txt", 'r') as v: |
| 468 valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) | 466 valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) |
| 475 return (str(x), str(y), z) | 473 return (str(x), str(y), z) |
| 476 | 474 |
| 477 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} | 475 dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} |
| 478 arr = ["RGYW", "WRCY", "WA", "TW"] | 476 arr = ["RGYW", "WRCY", "WA", "TW"] |
| 479 | 477 |
| 480 for fname in funcs.keys(): | 478 for fname in list(funcs.keys()): |
| 481 func = funcs[fname] | 479 func = funcs[fname] |
| 482 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" | 480 foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" |
| 483 with open(foutfile, 'w') as o: | 481 with open(foutfile, 'w') as o: |
| 484 for typ in arr: | 482 for typ in arr: |
| 485 o.write(typ + " (%)") | 483 o.write(typ + " (%)") |
| 487 for gene in genes: | 485 for gene in genes: |
| 488 geneMatcher = geneMatchers[gene] | 486 geneMatcher = geneMatchers[gene] |
| 489 if valuedic[gene + "_" + fname] is 0: | 487 if valuedic[gene + "_" + fname] is 0: |
| 490 o.write(",0,0,0") | 488 o.write(",0,0,0") |
| 491 else: | 489 else: |
| 492 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) | 490 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname) |
| 493 o.write("," + x + "," + y + "," + z) | 491 o.write("," + x + "," + y + "," + z) |
| 494 x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) | 492 x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname) |
| 495 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) | 493 #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) |
| 496 o.write("," + x + "," + y + "," + z + "\n") | 494 o.write("," + x + "," + y + "," + z + "\n") |
| 497 | 495 |
| 498 | 496 |
| 499 # for testing | 497 # for testing |
