diff read2mut.py @ 84:e46d5e377760 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author mheinzl
date Fri, 19 Aug 2022 11:23:37 +0000
parents 8cec772c0bf1
children d1cd4cd9f18d
line wrap: on
line diff
--- a/read2mut.py	Fri Aug 05 08:23:34 2022 +0000
+++ b/read2mut.py	Fri Aug 19 11:23:37 2022 +0000
@@ -67,6 +67,9 @@
                         help='Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the read.')
     parser.add_argument('--reads_threshold',  type=float, default=1.0,
                         help='Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. Default: 1.0, means all reads of a family have to be softclipped.')
+    parser.add_argument('--refalttiers', action="store_true",
+                        help='Store also information about the reference allele.')
+
     return parser
 
 
@@ -87,6 +90,7 @@
     outfile2 = args.outputFile2
     outfile3 = args.outputFile3
     outputFile_csv = args.outputFile_csv
+    refalttiers = args.refalttiers
 
     thresh = args.thresh
     phred_score = args.phred
@@ -425,8 +429,8 @@
         counts_mut = 0
         chimeric_tag_list = []
         chimeric_tag = {}
+
         if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()):  # ref or alt
-
             # if key1 not in np.array(['#'.join(str(i) for i in z)
             #                        for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]):
             #     continue
@@ -478,6 +482,9 @@
                     elif key2[:-5] in tag_dict_ref.keys():
                         variant_type = "ref"
 
+                    if refalttiers is False and variant_type == "ref":  # if we only want information about alt tiers, skip all refs
+                        continue
+
                     if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
                         total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
                         if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
@@ -682,8 +689,6 @@
                     #     used_keys_ref.append(key2[:-5])
                     counts_mut += 1
 
-                    if key2[:-5] == "CTTGAACACGTCAGCATGCATGGAGA":
-                        print(key1, variant_type, (alt1f + alt2f + alt3f + alt4f) > 0.5, (ref1f + ref2f + ref3f + ref4f) > 0.5)
                     if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)):
                         if variant_type == "alt":
                             tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f
@@ -1345,10 +1350,9 @@
                                 ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
                                 ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
                                 ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+                            row += 3
                         else:
-                            change_tier_after_print.append((line, line2, trimmed_actual_high_tier))
-
-                        row += 3
+                            change_tier_after_print.append((line, line2, trimmed_actual_high_tier))    
 
             if chimera_correction:
                 chimeric_dcs_high_tiers = 0
@@ -1416,20 +1420,32 @@
                     row += 3
 
     # sheet 2
-    if chimera_correction:
-        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-                        'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
-                        'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
-                        'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
-                        'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
-                        )
+    if refalttiers:
+        if chimera_correction:
+            header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+                            'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
+                            'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
+                            'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
+                            'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
+                            )
+        else:
+            header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+                            'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
+                            'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
+                            'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
+                            'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
+                            )
     else:
-        header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
-                        'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
-                        'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)',
-                        'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)',
-                        'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)'
-                        )
+        if chimera_correction:
+            header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+                            'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
+                            'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)'
+                            )
+        else:
+            header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
+                            'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
+                            'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)'
+                            )
     ws2.write_row(0, 0, header_line2)
     row = 0
 
@@ -1458,7 +1474,10 @@
             if sum(used_tiers) == 0:  # skip mutations that are filtered by the VA in the first place
                 continue
             lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
-            lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))])
+            if refalttiers:
+                lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))])
+            else:
+                lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))])
             if chimera_correction:
                 chimeras_all = chimera_dict[key1][1]
                 new_alt = sum(used_tiers[0:7]) - chimeras_all
@@ -1469,18 +1488,29 @@
                 lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)])
             lst.extend([alt_count, safe_div(alt_count, cvrg)])
             lst.extend(used_tiers)
-            lst.extend(used_tiers_ref)
+            if refalttiers:
+                lst.extend(used_tiers_ref)
             # lst.extend(cum_af)
             lst = tuple(lst)
             ws2.write_row(row + 1, 0, lst)
-            if chimera_correction:
-                ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)})
-                ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)})
-                ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)})
+            if refalttiers:
+                if chimera_correction:
+                    ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)})
+                    ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)})
+                    ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)})
+                else:
+                    ws2.conditional_format('K{}:L{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$K$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'K{}:L{} K1:L1 AB{}:AC{} AB1:AC1'.format(row + 2, row + 2, row + 2, row + 2)})
+                    ws2.conditional_format('M{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'M{}:Q{} M1:Q1 AD{}:AH{} AD1:AH1'.format(row + 2, row + 2, row + 2, row + 2)})
+                    ws2.conditional_format('R{}:AA{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'R{}:AA{} R1:AA1 AI{}:AR{} AI1:AR1'.format(row + 2, row + 2, row + 2, row + 2)})
             else:
-                ws2.conditional_format('K{}:L{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$K$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'K{}:L{} K1:L1 AB{}:AC{} AB1:AC1'.format(row + 2, row + 2, row + 2, row + 2)})
-                ws2.conditional_format('M{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'M{}:Q{} M1:Q1 AD{}:AH{} AD1:AH1'.format(row + 2, row + 2, row + 2, row + 2)})
-                ws2.conditional_format('R{}:AA{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'R{}:AA{} R1:AA1 AI{}:AR{} AI1:AR1'.format(row + 2, row + 2, row + 2, row + 2)})
+                if chimera_correction:
+                    ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)})
+                    ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)})
+                    ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)})
+                else:
+                    ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
+                    ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)})
+                    ws2.conditional_format('Q{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$Q$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'Q{}:Z{} Q1:Z1'.format(row + 2, row + 2)})
             row += 1
 
     # sheet 3
@@ -1612,9 +1642,9 @@
         ex = examples_tiers[i]
         for k in range(len(ex)):
             ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
-        ws3.conditional_format('M{}:N{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), 
-                               {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 
-                                'format': format13, 
+        ws3.conditional_format('M{}:N{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
+                               {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),
+                                'format': format13,
                                 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
         ws3.conditional_format('M{}:N{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
                                {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),