Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.py @ 62:aa8d37bd1930 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Tue, 05 Dec 2017 10:57:13 -0500 |
| parents | c5295dd10dfc |
| children | 8728284105ee |
comparison
equal
deleted
inserted
replaced
| 61:275e759e7985 | 62:aa8d37bd1930 |
|---|---|
| 21 outfile = args.output | 21 outfile = args.output |
| 22 | 22 |
| 23 genedic = dict() | 23 genedic = dict() |
| 24 | 24 |
| 25 mutationdic = dict() | 25 mutationdic = dict() |
| 26 mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") | 26 mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") |
| 27 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") | |
| 28 mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") | |
| 29 mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") | |
| 27 NAMatchResult = (None, None, None, None, None, None, '') | 30 NAMatchResult = (None, None, None, None, None, None, '') |
| 28 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} | 31 geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} |
| 29 linecount = 0 | 32 linecount = 0 |
| 30 | 33 |
| 31 IDIndex = 0 | 34 IDIndex = 0 |
| 76 continue | 79 continue |
| 77 linecount += 1 | 80 linecount += 1 |
| 78 linesplt = line.split("\t") | 81 linesplt = line.split("\t") |
| 79 ID = linesplt[IDIndex] | 82 ID = linesplt[IDIndex] |
| 80 genedic[ID] = linesplt[best_matchIndex] | 83 genedic[ID] = linesplt[best_matchIndex] |
| 84 | |
| 85 mutationdic[ID + "_FR1"] = [] | |
| 86 if len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader": | |
| 87 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] | |
| 88 | |
| 89 mutationdic[ID + "_CDR1"] = [] | |
| 90 if len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]: | |
| 91 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] | |
| 92 | |
| 93 mutationdic[ID + "_FR2"] = [] | |
| 94 if len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", "CDR1"]: | |
| 95 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] | |
| 96 | |
| 97 mutationdic[ID + "_CDR2"] = [] | |
| 98 if len(linesplt[cdr2Index]) > 5: | |
| 99 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] | |
| 100 | |
| 101 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | |
| 102 | |
| 103 mutationdic[ID + "_FR3"] = [] | |
| 104 if len(linesplt[fr3Index]) > 5: | |
| 105 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] | |
| 106 | |
| 81 try: | 107 try: |
| 82 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] | 108 pass |
| 83 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else [] | |
| 84 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else [] | |
| 85 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else [] | |
| 86 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | |
| 87 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] | |
| 88 except Exception as e: | 109 except Exception as e: |
| 89 print "Something went wrong while processing this line:" | 110 print "Something went wrong while processing this line:" |
| 111 print "line:", linecount | |
| 112 print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index] | |
| 113 print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index] | |
| 114 print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index] | |
| 115 print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index] | |
| 116 print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index] | |
| 117 print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic | |
| 118 print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic | |
| 119 print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic | |
| 120 print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic | |
| 121 print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic | |
| 90 print linesplt | 122 print linesplt |
| 91 print linecount | |
| 92 print e | 123 print e |
| 93 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 124 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
| 94 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 125 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
| 95 | 126 |
| 96 cdr1Length = len(linesplt[cdr1LengthIndex]) | 127 cdr1Length = len(linesplt[cdr1LengthIndex]) |
| 107 fr1LengthDict[ID] = fr1Length | 138 fr1LengthDict[ID] = fr1Length |
| 108 fr2LengthDict[ID] = fr2Length | 139 fr2LengthDict[ID] = fr2Length |
| 109 fr3LengthDict[ID] = fr3Length | 140 fr3LengthDict[ID] = fr3Length |
| 110 | 141 |
| 111 IDlist += [ID] | 142 IDlist += [ID] |
| 112 | 143 print "len(mutationdic) =", len(mutationdic) |
| 144 | |
| 145 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: | |
| 146 for ID, lst in mutationdic.iteritems(): | |
| 147 for mut in lst: | |
| 148 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) | |
| 113 | 149 |
| 114 #tandem mutation stuff | 150 #tandem mutation stuff |
| 115 tandem_frequency = defaultdict(int) | 151 tandem_frequency = defaultdict(int) |
| 116 mutation_frequency = defaultdict(int) | 152 mutation_frequency = defaultdict(int) |
| 117 | 153 |
| 220 with open(tandem_freq_file, 'w') as o: | 256 with open(tandem_freq_file, 'w') as o: |
| 221 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) | 257 o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) |
| 222 | 258 |
| 223 #print mutationList, linecount | 259 #print mutationList, linecount |
| 224 | 260 |
| 225 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent | 261 AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] and i[5] != ";" else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent |
| 226 if AALength < 60: | 262 if AALength < 60: |
| 227 AALength = 64 | 263 AALength = 64 |
| 228 | 264 |
| 229 AA_mutation = [0] * AALength | 265 AA_mutation = [0] * AALength |
| 230 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} | 266 AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} |
| 336 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] | 372 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] |
| 337 TW = [(int(x), int(y), z) for (x, y, z) in | 373 TW = [(int(x), int(y), z) for (x, y, z) in |
| 338 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] | 374 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] |
| 339 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 | 375 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 |
| 340 | 376 |
| 377 with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "RGYW.txt"), 'a') as out_handle: | |
| 378 for hotspot in RGYW: | |
| 379 out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in hotspot]))) | |
| 380 | |
| 341 mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 381 mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
| 342 for mutation in mutationList: | 382 for mutation in mutationList: |
| 343 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation | 383 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation |
| 344 mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) | 384 mutation_in_RGYW = any(((start <= int(where) <= end) for (start, end, region) in RGYW)) |
| 345 mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) | 385 mutation_in_WRCY = any(((start <= int(where) <= end) for (start, end, region) in WRCY)) |
| 346 mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) | 386 mutation_in_WA = any(((start <= int(where) <= end) for (start, end, region) in WA)) |
| 347 mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) | 387 mutation_in_TW = any(((start <= int(where) <= end) for (start, end, region) in TW)) |
| 348 | 388 |
| 349 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) | 389 in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) |
| 350 | 390 |
| 351 if in_how_many_motifs > 0: | 391 if in_how_many_motifs > 0: |
| 352 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs | 392 RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs |
