Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.py @ 1:faae21ba5c63 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 25 Oct 2016 07:28:43 -0400 |
parents | c33d93683a09 |
children | 275ab5175fd6 |
comparison
equal
deleted
inserted
replaced
0:c33d93683a09 | 1:faae21ba5c63 |
---|---|
5 | 5 |
6 parser = argparse.ArgumentParser() | 6 parser = argparse.ArgumentParser() |
7 parser.add_argument("--input", | 7 parser.add_argument("--input", |
8 help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") | 8 help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") |
9 parser.add_argument("--genes", help="The genes available in the 'best_match' column") | 9 parser.add_argument("--genes", help="The genes available in the 'best_match' column") |
10 parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?") | 10 parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2']) |
11 parser.add_argument("--output", help="Output file") | 11 parser.add_argument("--output", help="Output file") |
12 | 12 |
13 args = parser.parse_args() | 13 args = parser.parse_args() |
14 | 14 |
15 infile = args.input | 15 infile = args.input |
16 genes = str(args.genes).split(",") | 16 genes = str(args.genes).split(",") |
17 print "includefr1 =", args.includefr1 | 17 empty_region_filter = args.empty_region_filter |
18 include_fr1 = True if args.includefr1 == "yes" else False | |
19 outfile = args.output | 18 outfile = args.output |
20 | 19 |
21 genedic = dict() | 20 genedic = dict() |
22 | 21 |
23 mutationdic = dict() | 22 mutationdic = dict() |
57 linecount += 1 | 56 linecount += 1 |
58 linesplt = line.split("\t") | 57 linesplt = line.split("\t") |
59 ID = linesplt[IDIndex] | 58 ID = linesplt[IDIndex] |
60 genedic[ID] = linesplt[best_matchIndex] | 59 genedic[ID] = linesplt[best_matchIndex] |
61 try: | 60 try: |
62 if linesplt[fr1Index] != "NA": | 61 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] |
63 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] | 62 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else [] |
64 else: | 63 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else [] |
65 mutationdic[ID + "_FR1"] = [] | 64 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else [] |
66 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else [] | |
67 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else [] | |
68 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else [] | |
69 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] | 65 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] |
70 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] | 66 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] |
71 except e: | 67 except Exception as e: |
68 print "Something went wrong while processing this line:" | |
72 print linesplt | 69 print linesplt |
73 print linecount | 70 print linecount |
74 print e | 71 print e |
75 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 72 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
76 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | 73 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
187 first = False | 184 first = False |
188 continue | 185 continue |
189 linesplt = line.split("\t") | 186 linesplt = line.split("\t") |
190 gene = linesplt[best_matchIndex] | 187 gene = linesplt[best_matchIndex] |
191 ID = linesplt[IDIndex] | 188 ID = linesplt[IDIndex] |
192 if ID == "ca2": | |
193 print linesplt | |
194 RGYW = [(int(x), int(y), z) for (x, y, z) in | 189 RGYW = [(int(x), int(y), z) for (x, y, z) in |
195 [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] | 190 [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] |
196 WRCY = [(int(x), int(y), z) for (x, y, z) in | 191 WRCY = [(int(x), int(y), z) for (x, y, z) in |
197 [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] | 192 [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] |
198 WA = [(int(x), int(y), z) for (x, y, z) in | 193 WA = [(int(x), int(y), z) for (x, y, z) in |
199 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] | 194 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] |
200 TW = [(int(x), int(y), z) for (x, y, z) in | 195 TW = [(int(x), int(y), z) for (x, y, z) in |
201 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] | 196 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] |
202 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 | 197 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 |
203 | 198 |
204 mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ | 199 mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] |
205 ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] | |
206 for mutation in mutationList: | 200 for mutation in mutationList: |
207 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation | 201 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation |
208 mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) | 202 mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) |
209 mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) | 203 mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) |
210 mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) | 204 mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) |
269 with open(foutfile, 'w') as o: | 263 with open(foutfile, 'w') as o: |
270 for typ in arr: | 264 for typ in arr: |
271 o.write(typ + " (%)") | 265 o.write(typ + " (%)") |
272 curr = dic[typ] | 266 curr = dic[typ] |
273 for gene in genes: | 267 for gene in genes: |
274 geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop.... | 268 geneMatcher = geneMatchers[gene] |
275 if valuedic[gene + "_" + fname] is 0: | 269 if valuedic[gene + "_" + fname] is 0: |
276 o.write(",0,0,0") | 270 o.write(",0,0,0") |
277 else: | 271 else: |
278 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) | 272 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) |
279 o.write("," + x + "," + y + "," + z) | 273 o.write("," + x + "," + y + "," + z) |