comparison shm_csr.py @ 1:faae21ba5c63 draft

Uploaded
author davidvanzessen
date Tue, 25 Oct 2016 07:28:43 -0400
parents c33d93683a09
children 275ab5175fd6
comparison
equal deleted inserted replaced
0:c33d93683a09 1:faae21ba5c63
5 5
6 parser = argparse.ArgumentParser() 6 parser = argparse.ArgumentParser()
7 parser.add_argument("--input", 7 parser.add_argument("--input",
8 help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") 8 help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
9 parser.add_argument("--genes", help="The genes available in the 'best_match' column") 9 parser.add_argument("--genes", help="The genes available in the 'best_match' column")
10 parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?") 10 parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
11 parser.add_argument("--output", help="Output file") 11 parser.add_argument("--output", help="Output file")
12 12
13 args = parser.parse_args() 13 args = parser.parse_args()
14 14
15 infile = args.input 15 infile = args.input
16 genes = str(args.genes).split(",") 16 genes = str(args.genes).split(",")
17 print "includefr1 =", args.includefr1 17 empty_region_filter = args.empty_region_filter
18 include_fr1 = True if args.includefr1 == "yes" else False
19 outfile = args.output 18 outfile = args.output
20 19
21 genedic = dict() 20 genedic = dict()
22 21
23 mutationdic = dict() 22 mutationdic = dict()
57 linecount += 1 56 linecount += 1
58 linesplt = line.split("\t") 57 linesplt = line.split("\t")
59 ID = linesplt[IDIndex] 58 ID = linesplt[IDIndex]
60 genedic[ID] = linesplt[best_matchIndex] 59 genedic[ID] = linesplt[best_matchIndex]
61 try: 60 try:
62 if linesplt[fr1Index] != "NA": 61 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
63 mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] 62 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
64 else: 63 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
65 mutationdic[ID + "_FR1"] = [] 64 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
66 mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []
67 mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []
68 mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []
69 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] 65 mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
70 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] 66 mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
71 except e: 67 except Exception as e:
68 print "Something went wrong while processing this line:"
72 print linesplt 69 print linesplt
73 print linecount 70 print linecount
74 print e 71 print e
75 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 72 mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
76 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] 73 mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
187 first = False 184 first = False
188 continue 185 continue
189 linesplt = line.split("\t") 186 linesplt = line.split("\t")
190 gene = linesplt[best_matchIndex] 187 gene = linesplt[best_matchIndex]
191 ID = linesplt[IDIndex] 188 ID = linesplt[IDIndex]
192 if ID == "ca2":
193 print linesplt
194 RGYW = [(int(x), int(y), z) for (x, y, z) in 189 RGYW = [(int(x), int(y), z) for (x, y, z) in
195 [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] 190 [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
196 WRCY = [(int(x), int(y), z) for (x, y, z) in 191 WRCY = [(int(x), int(y), z) for (x, y, z) in
197 [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] 192 [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
198 WA = [(int(x), int(y), z) for (x, y, z) in 193 WA = [(int(x), int(y), z) for (x, y, z) in
199 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] 194 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
200 TW = [(int(x), int(y), z) for (x, y, z) in 195 TW = [(int(x), int(y), z) for (x, y, z) in
201 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] 196 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
202 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 197 RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
203 198
204 mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ 199 mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
205 ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
206 for mutation in mutationList: 200 for mutation in mutationList:
207 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation 201 frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
208 mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) 202 mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
209 mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) 203 mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
210 mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) 204 mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
269 with open(foutfile, 'w') as o: 263 with open(foutfile, 'w') as o:
270 for typ in arr: 264 for typ in arr:
271 o.write(typ + " (%)") 265 o.write(typ + " (%)")
272 curr = dic[typ] 266 curr = dic[typ]
273 for gene in genes: 267 for gene in genes:
274 geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop.... 268 geneMatcher = geneMatchers[gene]
275 if valuedic[gene + "_" + fname] is 0: 269 if valuedic[gene + "_" + fname] is 0:
276 o.write(",0,0,0") 270 o.write(",0,0,0")
277 else: 271 else:
278 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) 272 x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
279 o.write("," + x + "," + y + "," + z) 273 o.write("," + x + "," + y + "," + z)