shm_csr: shm_csr.py comparison

comparison shm_csr.py @ 62:aa8d37bd1930 draft

Uploaded

author	davidvanzessen
date	Tue, 05 Dec 2017 10:57:13 -0500
parents	c5295dd10dfc
children	8728284105ee

comparison

Legend: equal | deleted | inserted | replaced

-:275e759e7985
+:aa8d37bd1930
 	outfile = args.output
 	genedic = dict()
 	mutationdic = dict()
-	mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+	mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+	mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?")
+	mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
+	mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
 	NAMatchResult = (None, None, None, None, None, None, '')
 	geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
 	linecount = 0
 	IDIndex = 0
 				continue
 			linecount += 1
 			linesplt = line.split("\t")
 			ID = linesplt[IDIndex]
 			genedic[ID] = linesplt[best_matchIndex]
+			mutationdic[ID + "_FR1"] = []
+			if len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader":
+				mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x]
+			mutationdic[ID + "_CDR1"] = []
+			if len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]:
+				mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]
+			mutationdic[ID + "_FR2"] = []
+			if len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", "CDR1"]:
+				mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]
+			mutationdic[ID + "_CDR2"] = []
+			if len(linesplt[cdr2Index]) > 5:
+				mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]
+			mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
+			mutationdic[ID + "_FR3"] = []
+			if len(linesplt[fr3Index]) > 5:
+				mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
 			try:
-				mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
+				pass
-				mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
-				mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
-				mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
-				mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
-				mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
 			except Exception as e:
 				print "Something went wrong while processing this line:"
+				print "line:", linecount
+				print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index]
+				print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index]
+				print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index]
+				print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index]
+				print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index]
+				print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic
+				print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic
+				print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic
+				print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic
+				print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic
 				print linesplt
-				print linecount
 				print e
 			mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 			mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 			cdr1Length = len(linesplt[cdr1LengthIndex])
 			fr1LengthDict[ID] = fr1Length
 			fr2LengthDict[ID] = fr2Length
 			fr3LengthDict[ID] = fr3Length
 			IDlist += [ID]
+	print "len(mutationdic) =", len(mutationdic)
+	with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle:
+		for ID, lst in mutationdic.iteritems():
+			for mut in lst:
+				out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut])))
 	#tandem mutation stuff
 	tandem_frequency = defaultdict(int)
 	mutation_frequency = defaultdict(int)
 	with open(tandem_freq_file, 'w') as o:
 		o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row])))
 	#print mutationList, linecount
-	AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent
+	AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] and i[5] != ";" else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent
 	if AALength < 60:
 		AALength = 64
 	AA_mutation = [0] * AALength
 	AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
 				[hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
 			TW = [(int(x), int(y), z) for (x, y, z) in
 				[hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
 			RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
+			with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "RGYW.txt"), 'a') as out_handle:
+				for hotspot in RGYW:
+					out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in hotspot])))
 			mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 			for mutation in mutationList:
 				frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
-				mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
+				mutation_in_RGYW = any(((start <= int(where) <= end) for (start, end, region) in RGYW))
-				mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
+				mutation_in_WRCY = any(((start <= int(where) <= end) for (start, end, region) in WRCY))
-				mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
+				mutation_in_WA = any(((start <= int(where) <= end) for (start, end, region) in WA))
-				mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])
+				mutation_in_TW = any(((start <= int(where) <= end) for (start, end, region) in TW))
 				in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
 				if in_how_many_motifs > 0:
 					RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs

Mercurial > repos > davidvanzessen > shm_csr

comparison shm_csr.py @ 62:aa8d37bd1930 draft