diff read2mut.py @ 63:f0fc93b7945c draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Thu, 18 Mar 2021 10:07:50 +0000
parents 66c1245436b9
children fd342f5a97d9
line wrap: on
line diff
--- a/read2mut.py	Wed Mar 17 13:24:40 2021 +0000
+++ b/read2mut.py	Thu Mar 18 10:07:50 2021 +0000
@@ -10,7 +10,7 @@
 
 =======  ==========  =================  ================================
 Version  Date        Author             Description
-0.2.1    2019-10-27  Gundula Povysil    -
+0.2.2    2019-10-27  Gundula Povysil    -
 =======  ==========  =================  ================================
 
 
@@ -269,16 +269,6 @@
     else:
         pure_tags_dict_short = pure_tags_dict
 
-    # whole_array = []
-    # for k in pure_tags_dict.values():
-    #    if len(k) != 0:
-    #        keys = k.keys()
-    #        if len(keys) > 1:
-    #            for k1 in keys:
-    #                whole_array.append(k1)
-    #        else:
-    #            whole_array.append(keys[0])
-
     csv_data = open(outputFile_csv, "wb")
     csv_writer = csv.writer(csv_data, delimiter=",")
 
@@ -321,8 +311,6 @@
     counter_tier32 = 0
     counter_tier25 = 0
     counter_tier4 = 0
-    # if chimera_correction:
-    #    counter_tier43 = 0
     counter_tier51 = 0
     counter_tier52 = 0
     counter_tier53 = 0
@@ -334,14 +322,12 @@
     row = 1
     tier_dict = {}
     chimera_dict = {}
-    #change_tier_after_print = {}
     for key1, value1 in sorted(mut_dict.items()):
         counts_mut = 0
         chimeric_tag_list = []
         chimeric_tag = {}
         if key1 in pure_tags_dict_short.keys():
             change_tier_after_print = []
-
             i = np.where(np.array(['#'.join(str(i) for i in z)
                                    for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
             ref = mut_array[i, 2]
@@ -640,7 +626,6 @@
                         	dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
                         	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
                         	if any(ij is True for ij in softclipped_both_ends_idx1):
-                        		print(softclipped_both_ends_idx1)
                         		for nr, indx in enumerate(softclipped_both_ends_idx1):
                         			if indx:
                         				if dist_start_read1[nr] <= dist_end_read1[nr]:
@@ -661,7 +646,6 @@
                         	dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
                         	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
                         	if any(ij is True for ij in softclipped_both_ends_idx4):
-                        		print(softclipped_both_ends_idx4)
                         		for nr, indx in enumerate(softclipped_both_ends_idx4):
                         			if indx:
                         				if dist_start_read4[nr] <= dist_end_read4[nr]:
@@ -682,7 +666,6 @@
                             dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
                         	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
                             if any(ij is True for ij in softclipped_both_ends_idx2):
-                            	print(softclipped_both_ends_idx2)
                                 for nr, indx in enumerate(softclipped_both_ends_idx2):
                                     if indx:
                                         if dist_start_read2[nr] <= dist_end_read2[nr]:
@@ -703,7 +686,6 @@
                             dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
                         	# if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
                             if any(ij is True for ij in softclipped_both_ends_idx3):
-                            	print(softclipped_both_ends_idx3)
                                 for nr, indx in enumerate(softclipped_both_ends_idx3):
                                     if indx:
                                         if dist_start_read3[nr] <= dist_end_read3[nr]:
@@ -1091,7 +1073,6 @@
                                 # calculate HD of "b" to all "b's" or "a" to all "a's"
                                 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
                                                              for e in min_tag_half2])
-
                                 dist2 = dist_second_half.max()
                                 max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
                                 max_tag = min_tag_array2[max_index]
@@ -1170,12 +1151,9 @@
                 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
 
             # write to file
-            
             # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4
             sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]])
-
             correct_tier = False
-
             if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0:
                 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"]
                 tier_dict[key1]["tier 4"] = 0
@@ -1317,7 +1295,7 @@
                          ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
                          ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
                          ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
-                         ("Tier 2.5", "variants at the start or end of the read and recurring mutation on this position in tier 1.1-2.4"),
+                         ("Tier 2.5", "variants at the start or end of the read (ignoring variant position tier 1.1-2.4) and recurring mutation on this position in tier 1.1-2.4"),
                          ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
                          ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
                          ("Tier 4", "variants at the start or end of the reads"),