Mercurial > repos > davidvanzessen > shm_csr
diff shm_csr.py @ 83:729738462297 draft
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"
author | rhpvorderman |
---|---|
date | Wed, 15 Sep 2021 12:24:06 +0000 |
parents | b6f9a640e098 |
children | 6809c63d9161 |
line wrap: on
line diff
--- a/shm_csr.py Thu Feb 25 10:32:32 2021 +0000 +++ b/shm_csr.py Wed Sep 15 12:24:06 2021 +0000 @@ -26,7 +26,7 @@ mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") - mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") + mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?") NAMatchResult = (None, None, None, None, None, None, '') geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} linecount = 0 @@ -59,7 +59,7 @@ tandem_sum_by_class = defaultdict(int) expected_tandem_sum_by_class = defaultdict(float) - with open(infile, 'ru') as i: + with open(infile, 'r') as i: for line in i: if first: linesplt = line.split("\t") @@ -130,10 +130,10 @@ fr3LengthDict[ID] = fr3Length IDlist += [ID] - print "len(mutationdic) =", len(mutationdic) + print("len(mutationdic) =", len(mutationdic)) with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: - for ID, lst in mutationdic.iteritems(): + for ID, lst in mutationdic.items(): for mut in lst: out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) @@ -230,7 +230,7 @@ tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") with open(tandem_freq_file, 'w') as o: - for frq in sorted([int(x) for x in tandem_frequency.keys()]): + for frq in sorted([int(x) for x in list(tandem_frequency.keys())]): o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) tandem_row = [] @@ -256,11 +256,11 @@ AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} AA_mutation_empty = AA_mutation[:] - print "AALength:", AALength + print("AALength:", AALength) aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" with open(aa_mutations_by_id_file, 'w') as o: o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") - for ID in mutationListByID.keys(): + for ID in list(mutationListByID.keys()): AA_mutation_for_ID = AA_mutation_empty[:] for mutation in mutationListByID[ID]: if mutation[4] and mutation[5] != ";": @@ -269,8 +269,8 @@ AA_mutation[AA_mutation_position] += 1 AA_mutation_for_ID[AA_mutation_position] += 1 except Exception as e: - print e - print mutation + print(e) + print(mutation) sys.exit() clss = genedic[ID][:3] AA_mutation_dic[clss][AA_mutation_position] += 1 @@ -280,32 +280,32 @@ #absent AA stuff absentAACDR1Dic = defaultdict(list) - absentAACDR1Dic[5] = range(29,36) - absentAACDR1Dic[6] = range(29,35) - absentAACDR1Dic[7] = range(30,35) - absentAACDR1Dic[8] = range(30,34) - absentAACDR1Dic[9] = range(31,34) - absentAACDR1Dic[10] = range(31,33) + absentAACDR1Dic[5] = list(range(29,36)) + absentAACDR1Dic[6] = list(range(29,35)) + absentAACDR1Dic[7] = list(range(30,35)) + absentAACDR1Dic[8] = list(range(30,34)) + absentAACDR1Dic[9] = list(range(31,34)) + absentAACDR1Dic[10] = list(range(31,33)) absentAACDR1Dic[11] = [32] absentAACDR2Dic = defaultdict(list) - absentAACDR2Dic[0] = range(55,65) - absentAACDR2Dic[1] = range(56,65) - absentAACDR2Dic[2] = range(56,64) - absentAACDR2Dic[3] = range(57,64) - absentAACDR2Dic[4] = range(57,63) - absentAACDR2Dic[5] = range(58,63) - absentAACDR2Dic[6] = range(58,62) - absentAACDR2Dic[7] = range(59,62) - absentAACDR2Dic[8] = range(59,61) + absentAACDR2Dic[0] = list(range(55,65)) + absentAACDR2Dic[1] = list(range(56,65)) + absentAACDR2Dic[2] = list(range(56,64)) + absentAACDR2Dic[3] = list(range(57,64)) + absentAACDR2Dic[4] = list(range(57,63)) + absentAACDR2Dic[5] = list(range(58,63)) + absentAACDR2Dic[6] = list(range(58,62)) + absentAACDR2Dic[7] = list(range(59,62)) + absentAACDR2Dic[8] = list(range(59,61)) absentAACDR2Dic[9] = [60] absentAA = [len(IDlist)] * (AALength-1) - for k, cdr1Length in cdr1LengthDic.iteritems(): + for k, cdr1Length in cdr1LengthDic.items(): for c in absentAACDR1Dic[cdr1Length]: absentAA[c] -= 1 - for k, cdr2Length in cdr2LengthDic.iteritems(): + for k, cdr2Length in cdr2LengthDic.items(): for c in absentAACDR2Dic[cdr2Length]: absentAA[c] -= 1 @@ -325,14 +325,12 @@ o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") if linecount == 0: - print "No data, exiting" + print("No data, exiting") with open(outfile, 'w') as o: o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) o.write("WA (%)," + ("0,0,0\n" * len(genes))) o.write("TW (%)," + ("0,0,0\n" * len(genes))) - import sys - sys.exit() hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") @@ -347,7 +345,7 @@ aggctatIndex = 0 atagcctIndex = 0 first = True - with open(infile, 'ru') as i: + with open(infile, 'r') as i: for line in i: if first: linesplt = line.split("\t") @@ -412,7 +410,7 @@ motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW} for mutation in mutationList: frm, where, to, AAfrm, AAwhere, AAto, junk = mutation - for motif in motif_dic.keys(): + for motif in list(motif_dic.keys()): for start, end, region in motif_dic[motif]: if start <= int(where) <= end: @@ -460,7 +458,7 @@ value = 0 valuedic = dict() - for fname in funcs.keys(): + for fname in list(funcs.keys()): for gene in genes: with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) @@ -477,7 +475,7 @@ dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} arr = ["RGYW", "WRCY", "WA", "TW"] - for fname in funcs.keys(): + for fname in list(funcs.keys()): func = funcs[fname] foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" with open(foutfile, 'w') as o: @@ -489,9 +487,9 @@ if valuedic[gene + "_" + fname] is 0: o.write(",0,0,0") else: - x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) + x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname) o.write("," + x + "," + y + "," + z) - x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) + x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname) #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) o.write("," + x + "," + y + "," + z + "\n")