shm_csr: shm_csr.py comparison

comparison shm_csr.py @ 1:faae21ba5c63 draft

Uploaded

author	davidvanzessen
date	Tue, 25 Oct 2016 07:28:43 -0400
parents	c33d93683a09
children	275ab5175fd6

comparison

equal deleted inserted replaced

-:c33d93683a09
+:faae21ba5c63
 parser = argparse.ArgumentParser()
 parser.add_argument("--input",
 					help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
 parser.add_argument("--genes", help="The genes available in the 'best_match' column")
-parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?")
+parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
 parser.add_argument("--output", help="Output file")
 args = parser.parse_args()
 infile = args.input
 genes = str(args.genes).split(",")
-print "includefr1 =", args.includefr1
+empty_region_filter = args.empty_region_filter
-include_fr1 = True if args.includefr1 == "yes" else False
 outfile = args.output
 genedic = dict()
 mutationdic = dict()
 		linecount += 1
 		linesplt = line.split("\t")
 		ID = linesplt[IDIndex]
 		genedic[ID] = linesplt[best_matchIndex]
 		try:
-			if linesplt[fr1Index] != "NA":
+			mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
-				mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
+			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
-			else:
+			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
-				mutationdic[ID + "_FR1"] = []
+			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
-			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []
-			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []
-			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []
 			mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
 			mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
-		except e:
+		except Exception as e:
+			print "Something went wrong while processing this line:"
 			print linesplt
 			print linecount
 			print e
 		mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 		mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 			first = False
 			continue
 		linesplt = line.split("\t")
 		gene = linesplt[best_matchIndex]
 		ID = linesplt[IDIndex]
-		if ID == "ca2":
-			print linesplt
 		RGYW = [(int(x), int(y), z) for (x, y, z) in
 				[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
 		WRCY = [(int(x), int(y), z) for (x, y, z) in
 				[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
 		WA = [(int(x), int(y), z) for (x, y, z) in
 			  [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
 		TW = [(int(x), int(y), z) for (x, y, z) in
 			  [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
 		RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
-		mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[
+		mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
-			ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 		for mutation in mutationList:
 			frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
 			mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
 			mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
 			mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
 	with open(foutfile, 'w') as o:
 		for typ in arr:
 			o.write(typ + " (%)")
 			curr = dic[typ]
 			for gene in genes:
-				geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....
+				geneMatcher = geneMatchers[gene]
 				if valuedic[gene + "_" + fname] is 0:
 					o.write(",0,0,0")
 				else:
 					x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
 					o.write("," + x + "," + y + "," + z)

Mercurial > repos > davidvanzessen > shm_csr

comparison shm_csr.py @ 1:faae21ba5c63 draft