shm_csr: shm_csr.py comparison

comparison shm_csr.py @ 83:729738462297 draft

"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"

author	rhpvorderman
date	Wed, 15 Sep 2021 12:24:06 +0000
parents	b6f9a640e098
children	6809c63d9161

comparison

Legend: equal | deleted | inserted | replaced

-:a103134ee6e0
+:729738462297
 	mutationdic = dict()
 	mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
 	mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?")
 	mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
-	mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
+	mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?")
 	NAMatchResult = (None, None, None, None, None, None, '')
 	geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
 	linecount = 0
 	IDIndex = 0
 	fr3SeqIndex = 0
 	tandem_sum_by_class = defaultdict(int)
 	expected_tandem_sum_by_class = defaultdict(float)
-	with open(infile, 'ru') as i:
+	with open(infile, 'r') as i:
 		for line in i:
 			if first:
 				linesplt = line.split("\t")
 				IDIndex = linesplt.index("Sequence.ID")
 				best_matchIndex = linesplt.index("best_match")
 			fr1LengthDict[ID] = fr1Length
 			fr2LengthDict[ID] = fr2Length
 			fr3LengthDict[ID] = fr3Length
 			IDlist += [ID]
-	print "len(mutationdic) =", len(mutationdic)
+	print("len(mutationdic) =", len(mutationdic))
 	with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle:
-		for ID, lst in mutationdic.iteritems():
+		for ID, lst in mutationdic.items():
 			for mut in lst:
 				out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut])))
 	#tandem mutation stuff
 	tandem_frequency = defaultdict(int)
 				tandem_frequency[str(tandem_mut[1])] += 1
 			#print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)])
 	tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt")
 	with open(tandem_freq_file, 'w') as o:
-		for frq in sorted([int(x) for x in tandem_frequency.keys()]):
+		for frq in sorted([int(x) for x in list(tandem_frequency.keys())]):
 			o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)]))
 	tandem_row = []
 	genes_extra = list(genes)
 	genes_extra.append("all")
 	AA_mutation = [0] * AALength
 	AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
 	AA_mutation_empty = AA_mutation[:]
-	print "AALength:", AALength
+	print("AALength:", AALength)
 	aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
 	with open(aa_mutations_by_id_file, 'w') as o:
 		o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
-		for ID in mutationListByID.keys():
+		for ID in list(mutationListByID.keys()):
 			AA_mutation_for_ID = AA_mutation_empty[:]
 			for mutation in mutationListByID[ID]:
 				if mutation[4] and mutation[5] != ";":
 					AA_mutation_position = int(mutation[4])
 					try:
 						AA_mutation[AA_mutation_position] += 1
 						AA_mutation_for_ID[AA_mutation_position] += 1
 					except Exception as e:
-						print e
+						print(e)
-						print mutation
+						print(mutation)
 						sys.exit()
 					clss = genedic[ID][:3]
 					AA_mutation_dic[clss][AA_mutation_position] += 1
 			o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
 	#absent AA stuff
 	absentAACDR1Dic = defaultdict(list)
-	absentAACDR1Dic[5] = range(29,36)
+	absentAACDR1Dic[5] = list(range(29,36))
-	absentAACDR1Dic[6] = range(29,35)
+	absentAACDR1Dic[6] = list(range(29,35))
-	absentAACDR1Dic[7] = range(30,35)
+	absentAACDR1Dic[7] = list(range(30,35))
-	absentAACDR1Dic[8] = range(30,34)
+	absentAACDR1Dic[8] = list(range(30,34))
-	absentAACDR1Dic[9] = range(31,34)
+	absentAACDR1Dic[9] = list(range(31,34))
-	absentAACDR1Dic[10] = range(31,33)
+	absentAACDR1Dic[10] = list(range(31,33))
 	absentAACDR1Dic[11] = [32]
 	absentAACDR2Dic = defaultdict(list)
-	absentAACDR2Dic[0] = range(55,65)
+	absentAACDR2Dic[0] = list(range(55,65))
-	absentAACDR2Dic[1] = range(56,65)
+	absentAACDR2Dic[1] = list(range(56,65))
-	absentAACDR2Dic[2] = range(56,64)
+	absentAACDR2Dic[2] = list(range(56,64))
-	absentAACDR2Dic[3] = range(57,64)
+	absentAACDR2Dic[3] = list(range(57,64))
-	absentAACDR2Dic[4] = range(57,63)
+	absentAACDR2Dic[4] = list(range(57,63))
-	absentAACDR2Dic[5] = range(58,63)
+	absentAACDR2Dic[5] = list(range(58,63))
-	absentAACDR2Dic[6] = range(58,62)
+	absentAACDR2Dic[6] = list(range(58,62))
-	absentAACDR2Dic[7] = range(59,62)
+	absentAACDR2Dic[7] = list(range(59,62))
-	absentAACDR2Dic[8] = range(59,61)
+	absentAACDR2Dic[8] = list(range(59,61))
 	absentAACDR2Dic[9] = [60]
 	absentAA = [len(IDlist)] * (AALength-1)
-	for k, cdr1Length in cdr1LengthDic.iteritems():
+	for k, cdr1Length in cdr1LengthDic.items():
 		for c in absentAACDR1Dic[cdr1Length]:
 			absentAA[c] -= 1
-	for k, cdr2Length in cdr2LengthDic.iteritems():
+	for k, cdr2Length in cdr2LengthDic.items():
 		for c in absentAACDR2Dic[cdr2Length]:
 			absentAA[c] -= 1
 	aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
 			for c in absentAACDR2Dic[cdr2Length]:
 				absentAAbyID[c] -= 1
 			o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
 	if linecount == 0:
-		print "No data, exiting"
+		print("No data, exiting")
 		with open(outfile, 'w') as o:
 			o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
 			o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
 			o.write("WA (%)," + ("0,0,0\n" * len(genes)))
 			o.write("TW (%)," + ("0,0,0\n" * len(genes)))
-		import sys
 		sys.exit()
 	hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
 	RGYWCount = {}
 	WRCYCount = {}
 	ataIndex = 0
 	tatIndex = 0
 	aggctatIndex = 0
 	atagcctIndex = 0
 	first = True
-	with open(infile, 'ru') as i:
+	with open(infile, 'r') as i:
 		for line in i:
 			if first:
 				linesplt = line.split("\t")
 				ataIndex = linesplt.index("X.a.t.a")
 				tatIndex = linesplt.index("t.a.t.")
 			with open(mutations_in_motifs_file, 'a') as out_handle:
 				motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}
 				for mutation in mutationList:
 					frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
-					for motif in motif_dic.keys():
+					for motif in list(motif_dic.keys()):
 						for start, end, region in motif_dic[motif]:
 							if start <= int(where) <= end:
 								out_handle.write("{0}\n".format(
 									"\t".join([
 	directory = outfile[:outfile.rfind("/") + 1]
 	value = 0
 	valuedic = dict()
-	for fname in funcs.keys():
+	for fname in list(funcs.keys()):
 		for gene in genes:
 			with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
 				valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
 		with open(directory + "all_" + fname + "_value.txt", 'r') as v:
 			valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
 		return (str(x), str(y), z)
 	dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
 	arr = ["RGYW", "WRCY", "WA", "TW"]
-	for fname in funcs.keys():
+	for fname in list(funcs.keys()):
 		func = funcs[fname]
 		foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
 		with open(foutfile, 'w') as o:
 			for typ in arr:
 				o.write(typ + " (%)")
 				for gene in genes:
 					geneMatcher = geneMatchers[gene]
 					if valuedic[gene + "_" + fname] is 0:
 						o.write(",0,0,0")
 					else:
-						x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
+						x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname)
 						o.write("," + x + "," + y + "," + z)
-				x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
+				x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname)
 				#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
 				o.write("," + x + "," + y + "," + z + "\n")
 	# for testing

Mercurial > repos > davidvanzessen > shm_csr

comparison shm_csr.py @ 83:729738462297 draft