variant_analyzer2: read2mut.py comparison

comparison read2mut.py @ 63:f0fc93b7945c draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8

author	mheinzl
date	Thu, 18 Mar 2021 10:07:50 +0000
parents	66c1245436b9
children	fd342f5a97d9

comparison

equal deleted inserted replaced

-:66c1245436b9
+:f0fc93b7945c
 Looks for reads with mutation at known
 positions and calculates frequencies and stats.
 =======  ==========  =================  ================================
 Version  Date        Author             Description
-0.2.1    2019-10-27  Gundula Povysil    -
+0.2.2    2019-10-27  Gundula Povysil    -
 =======  ==========  =================  ================================
 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam
 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json
 if len(value) < thresh:
 pure_tags_dict_short[key] = value
 else:
 pure_tags_dict_short = pure_tags_dict
-# whole_array = []
-# for k in pure_tags_dict.values():
-#    if len(k) != 0:
-#        keys = k.keys()
-#        if len(keys) > 1:
-#            for k1 in keys:
-#                whole_array.append(k1)
-#        else:
-#            whole_array.append(keys[0])
 csv_data = open(outputFile_csv, "wb")
 csv_writer = csv.writer(csv_data, delimiter=",")
 # output summary with threshold
 workbook = xlsxwriter.Workbook(outfile)
 counter_tier24 = 0
 counter_tier31 = 0
 counter_tier32 = 0
 counter_tier25 = 0
 counter_tier4 = 0
-# if chimera_correction:
-#    counter_tier43 = 0
 counter_tier51 = 0
 counter_tier52 = 0
 counter_tier53 = 0
 counter_tier54 = 0
 counter_tier55 = 0
 counter_tier7 = 0
 row = 1
 tier_dict = {}
 chimera_dict = {}
-#change_tier_after_print = {}
 for key1, value1 in sorted(mut_dict.items()):
 counts_mut = 0
 chimeric_tag_list = []
 chimeric_tag = {}
 if key1 in pure_tags_dict_short.keys():
 change_tier_after_print = []
 i = np.where(np.array(['#'.join(str(i) for i in z)
 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
 ref = mut_array[i, 2]
 alt = mut_array[i, 3]
 dcs_median = cvrg_dict[key1][2]
 	softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
 	dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
 	dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
 	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
 	if any(ij is True for ij in softclipped_both_ends_idx1):
-		print(softclipped_both_ends_idx1)
 		for nr, indx in enumerate(softclipped_both_ends_idx1):
 			if indx:
 				if dist_start_read1[nr] <= dist_end_read1[nr]:
 					dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
 				else:
 	softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
 	dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
 	dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
 	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
 	if any(ij is True for ij in softclipped_both_ends_idx4):
-		print(softclipped_both_ends_idx4)
 		for nr, indx in enumerate(softclipped_both_ends_idx4):
 			if indx:
 				if dist_start_read4[nr] <= dist_end_read4[nr]:
 					dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
 				else:
 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
 	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
 if any(ij is True for ij in softclipped_both_ends_idx2):
-	print(softclipped_both_ends_idx2)
 for nr, indx in enumerate(softclipped_both_ends_idx2):
 if indx:
 if dist_start_read2[nr] <= dist_end_read2[nr]:
 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
 else:
 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
 	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
 if any(ij is True for ij in softclipped_both_ends_idx3):
-	print(softclipped_both_ends_idx3)
 for nr, indx in enumerate(softclipped_both_ends_idx3):
 if indx:
 if dist_start_read3[nr] <= dist_end_read3[nr]:
 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
 else:
 min_tag_array2 = array2[min_index]  # get whole tag with min HD
 min_value = dist.min()
 # calculate HD of "b" to all "b's" or "a" to all "a's"
 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
 for e in min_tag_half2])
 dist2 = dist_second_half.max()
 max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
 max_tag = min_tag_array2[max_index]
 # tags which have identical parts:
 else:
 chimeric_dcs_high_tiers += high_tiers
 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
 # write to file
 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4
 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]])
 correct_tier = False
 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0:
 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"]
 tier_dict[key1]["tier 4"] = 0
 correct_tier = True
 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"),
 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
-("Tier 2.5", "variants at the start or end of the read and recurring mutation on this position in tier 1.1-2.4"),
+("Tier 2.5", "variants at the start or end of the read (ignoring variant position tier 1.1-2.4) and recurring mutation on this position in tier 1.1-2.4"),
 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
 ("Tier 4", "variants at the start or end of the reads"),
 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"),
 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"),

Mercurial > repos > mheinzl > variant_analyzer2

comparison read2mut.py @ 63:f0fc93b7945c draft