Mercurial repository: mheinzl / variant_analyzer2
changeset 43:d21960b45a6b (draft)
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
| author | mheinzl |
|---|---|
| date | Tue, 02 Mar 2021 15:32:41 +0000 |
| parents | da224c392a54 |
| children | 29226ceba7cd |
| files | mut2read.py mut2read.xml mut2sscs.py mut2sscs.xml read2mut.py read2mut.xml test-data/Variant_Analyzer_allele_frequencies_test.xlsx test-data/Variant_Analyzer_summary_test.xlsx test-data/Variant_Analyzer_test.xlsx test-data/Variant_Analyzer_tiers_test.xlsx va_macros.xml |
| diffstat | 11 files changed, 525 insertions(+), 213 deletions(-) |
--- a/mut2read.py Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2read.py Tue Mar 02 15:32:41 2021 +0000
@@ -11,7 +11,7 @@
 ======= ========== ================= ================================
 Version Date       Author            Description
-2.0.0   2020-10-30 Gundula Povysil   -
+0.2.1   2019-10-27 Gundula Povysil   -
 ======= ========== ================= ================================
 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json
@@ -62,6 +62,7 @@
         sys.exit("Error: Could not find '{}'".format(file3))
 
     # read dcs bam file
+    # pysam.index(file2)
     bam = pysam.AlignmentFile(file2, "rb")
 
     # get tags
@@ -73,8 +74,14 @@
         stop_pos = variant.start
         #chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
         ref = variant.REF
-        alt = variant.ALT[0]
+        if len(variant.ALT) == 0:
+            continue
+        else:
+            alt = variant.ALT[0]
+        print(alt)
         chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
+
+
         dcs_len = []
         if len(ref) == len(alt):
             for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
@@ -145,3 +152,4 @@
 
 if __name__ == '__main__':
     sys.exit(mut2read(sys.argv))
+
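The functional change in mut2read.py above is the guard on variant.ALT: VCF records with an empty ALT list are now skipped instead of crashing on ALT[0], while the chrom#pos#REF#ALT lookup key is built as before. A minimal sketch of that pattern, assuming a cyvcf2-readable VCF at a hypothetical path:

```python
from cyvcf2 import VCF

# "variants.vcf" is a hypothetical path; the tool passes its real input as sys.argv[1].
for variant in VCF("variants.vcf"):
    if len(variant.ALT) == 0:   # records without an ALT allele would raise IndexError on ALT[0]
        continue
    alt = variant.ALT[0]
    # cyvcf2's variant.start is 0-based; the key mirrors the one built in mut2read.py
    chrom_stop_pos = "#".join([str(variant.CHROM), str(variant.start), variant.REF, alt])
    print(chrom_stop_pos)
```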
--- a/mut2read.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2read.xml Tue Mar 02 15:32:41 2021 +0000
@@ -4,7 +4,12 @@
     <macros>
         <import>va_macros.xml</import>
     </macros>
-    <expand macro="requirements"/>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
+        <requirement type="package" version="0.15">pysam</requirement>
+        <requirement type="package" version="0.11.6">cyvcf2</requirement>
+    </requirements>
     <command><![CDATA[
         ln -s '$file2' bam_input.bam &&
         ln -s '${file2.metadata.bam_index}' bam_input.bam.bai &&
@@ -27,10 +32,10 @@
     </outputs>
     <tests>
         <test>
-            <param name="file1" value="FreeBayes_test.vcf"/>
+            <param name="file1" value="FreeBayes_test.vcf" lines_diff="2"/>
             <param name="file2" value="DCS_test.bam"/>
             <param name="file3" value="Aligned_Families_test.tabular"/>
-            <output name="output_fastq" file="Interesting_Reads_test.fastq"/>
+            <output name="output_fastq" file="Interesting_Reads_test.fastq" lines_diff="136"/>
             <output name="output_json" file="tag_count_dict_test.json" lines_diff="2"/>
         </test>
     </tests>
--- a/mut2sscs.py Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2sscs.py Tue Mar 02 15:32:41 2021 +0000
@@ -11,7 +11,7 @@
 ======= ========== ================= ================================
 Version Date       Author            Description
-2.0.0   2020-10-30 Gundula Povysil   -
+0.2.1   2019-10-27 Gundula Povysil   -
 ======= ========== ================= ================================
 USAGE: python mut2sscs.py DCS_Mutations.tabular SSCS.bam SSCS_counts.json
@@ -25,6 +25,7 @@
 import os
 import sys
 
+import numpy as np
 import pysam
 from cyvcf2 import VCF
 
@@ -55,6 +56,7 @@
         sys.exit("Error: Could not find '{}'".format(file2))
 
     # read SSCS bam file
+    # pysam.index(file2)
    bam = pysam.AlignmentFile(file2, "rb")
 
     # get tags
@@ -66,10 +68,14 @@
         stop_pos = variant.start
         #chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
         ref = variant.REF
-        alt = variant.ALT[0]
+        if len(variant.ALT) == 0:
+            continue
+        else:
+            alt = variant.ALT[0]
         chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
 
         if len(ref) == len(alt):
+
             for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
                 if pileupcolumn.reference_pos == stop_pos:
                     count_alt = 0
@@ -131,3 +137,4 @@
 
 if __name__ == '__main__':
     sys.exit(mut2sscs(sys.argv))
+
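mut2sscs.py gets the same empty-ALT guard and otherwise keeps tallying reference and alternative bases in a pysam pileup over the SSCS BAM. A condensed sketch of that pileup pattern, assuming an indexed BAM at a hypothetical path and one illustrative 0-based position:

```python
import pysam

# Hypothetical inputs; the Galaxy wrapper symlinks the real SSCS BAM (plus .bai) as bam_input.bam.
bam = pysam.AlignmentFile("SSCS.bam", "rb")
chrom, stop_pos, ref, alt = "chr5", 11067, "C", "G"   # illustrative values only

count_ref = count_alt = 0
for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
    if pileupcolumn.reference_pos != stop_pos:
        continue
    for pileupread in pileupcolumn.pileups:
        # deletions and reference skips carry no query base at this column
        if pileupread.is_del or pileupread.is_refskip:
            continue
        nuc = pileupread.alignment.query_sequence[pileupread.query_position]
        if nuc == ref:
            count_ref += 1
        elif nuc == alt:
            count_alt += 1
print("ref={} alt={}".format(count_ref, count_alt))
```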
--- a/mut2sscs.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2sscs.xml Tue Mar 02 15:32:41 2021 +0000
@@ -4,7 +4,12 @@
     <macros>
         <import>va_macros.xml</import>
     </macros>
-    <expand macro="requirements"/>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
+        <requirement type="package" version="0.15">pysam</requirement>
+        <requirement type="package" version="0.11.6">cyvcf2</requirement>
+    </requirements>
     <command><![CDATA[
         ln -s '$file2' bam_input.bam &&
         ln -s '${file2.metadata.bam_index}' bam_input.bam.bai &&
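Both XML wrappers replace the shared requirements macro with an explicitly pinned Python 2.7 stack (matplotlib 1.4.0, pysam 0.15, cyvcf2 0.11.6). The pin matters for the read2mut.py change further down, which swaps map for itertools.imap when computing Hamming distances between tag halves; imap exists only on Python 2. A toy illustration of that distance, with purely hypothetical tag sequences:

```python
import itertools
import operator

tag_a = "AAAAAGATGC"   # hypothetical first half of a sample tag
tag_b = "AAAATGATGC"   # hypothetical candidate half from another family

# read2mut.py now uses itertools.imap(); on Python 3 the built-in map is already lazy.
pairwise_ne = itertools.imap if hasattr(itertools, "imap") else map
hamming_distance = sum(pairwise_ne(operator.ne, tag_a, tag_b))
print(hamming_distance)   # -> 1 (one mismatching base)
```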
--- a/read2mut.py Wed Feb 24 14:20:17 2021 +0000 +++ b/read2mut.py Tue Mar 02 15:32:41 2021 +0000 @@ -10,27 +10,26 @@ ======= ========== ================= ================================ Version Date Author Description -2.0.0 2020-10-30 Gundula Povysil - +0.2.1 2019-10-27 Gundula Povysil - ======= ========== ================= ================================ USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam --inputJson tag_count_dict.json --sscsJson SSCS_counts.json - --outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim5 10 --trim3 10 --chimera_correction + --outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim 10 --chimera_correction """ from __future__ import division import argparse -import csv +import itertools import json import operator import os import re import sys - import numpy as np import pysam import xlsxwriter @@ -49,8 +48,6 @@ help='JSON file with SSCS counts collected by mut2sscs.py.') parser.add_argument('--outputFile', help='Output xlsx file with summary of mutations.') - parser.add_argument('--outputFile_csv', - help='Output csv file with summary of mutations.') parser.add_argument('--outputFile2', help='Output xlsx file with allele frequencies of mutations.') parser.add_argument('--outputFile3', @@ -59,12 +56,14 @@ help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.') parser.add_argument('--phred', type=int, default=20, help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.') - parser.add_argument('--trim5', type=int, default=10, - help='Integer threshold for assigning mutations at the beginning of the reads to lower tier. Default 10.') - parser.add_argument('--trim3', type=int, default=10, - help='Integer threshold for assigning mutations at the end of the reads to lower tier. Default 10.') + parser.add_argument('--trim', type=int, default=10, + help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.') parser.add_argument('--chimera_correction', action="store_true", help='Count chimeric variants and correct the variant frequencies') + parser.add_argument('--softclipping_dist', type=int, default=15, + help='Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the read.') + parser.add_argument('--reads_threshold', type=float, default=1.0, + help='Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. 
Default: 1.0, means all reads of a family have to be softclipped.') return parser @@ -84,12 +83,12 @@ outfile = args.outputFile outfile2 = args.outputFile2 outfile3 = args.outputFile3 - outputFile_csv = args.outputFile_csv thresh = args.thresh phred_score = args.phred - trim5 = args.trim5 - trim3 = args.trim3 + trim = args.trim chimera_correction = args.chimera_correction + thr = args.softclipping_dist + threshold_reads = args.reads_threshold if os.path.isfile(file1) is False: sys.exit("Error: Could not find '{}'".format(file1)) @@ -101,10 +100,10 @@ sys.exit("Error: thresh is '{}', but only non-negative integers allowed".format(thresh)) if phred_score < 0: sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score)) - if trim5 < 0: - sys.exit("Error: trim5 is '{}', but only non-negative integers allowed".format(trim5)) - if trim3 < 0: - sys.exit("Error: trim3 is '{}', but only non-negative integers allowed".format(trim3)) + if trim < 0: + sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh)) + if thr <= 0: + sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr)) # load dicts with open(json_file, "r") as f: @@ -114,6 +113,7 @@ (mut_pos_dict, ref_pos_dict) = json.load(f) # read bam file + # pysam.index(file2) bam = pysam.AlignmentFile(file2, "rb") # create mut_dict @@ -121,15 +121,21 @@ mut_read_pos_dict = {} mut_read_dict = {} reads_dict = {} + mut_read_cigar_dict = {} i = 0 mut_array = [] - for variant in VCF(file1): + for count, variant in enumerate(VCF(file1)): + #if count == 2000: + # break chrom = variant.CHROM stop_pos = variant.start #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) ref = variant.REF - alt = variant.ALT[0] + if len(variant.ALT) == 0: + continue + else: + alt = variant.ALT[0] chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt if len(ref) == len(alt): @@ -138,6 +144,7 @@ mut_dict[chrom_stop_pos] = {} mut_read_pos_dict[chrom_stop_pos] = {} reads_dict[chrom_stop_pos] = {} + mut_read_cigar_dict[chrom_stop_pos] = {} for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): if pileupcolumn.reference_pos == stop_pos: @@ -148,8 +155,8 @@ count_other = 0 count_lowq = 0 n = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) + #print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), + # "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) for pileupread in pileupcolumn.pileups: n += 1 if not pileupread.is_del and not pileupread.is_refskip: @@ -165,14 +172,15 @@ else: mut_dict[chrom_stop_pos][tag][nuc] = 1 if tag not in mut_read_pos_dict[chrom_stop_pos]: - mut_read_pos_dict[chrom_stop_pos][tag] = np.array(pileupread.query_position) + 1 - reads_dict[chrom_stop_pos][tag] = len(pileupread.alignment.query_sequence) + mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] + reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] + mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] + + #alignedRefPositions = pileupread.get_reference_positions()[0] else: - mut_read_pos_dict[chrom_stop_pos][tag] = np.append( - mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1) - reads_dict[chrom_stop_pos][tag] = np.append( - reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence)) - + 
mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) + reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) + mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) if nuc == alt: count_alt += 1 if tag not in mut_read_dict: @@ -191,10 +199,9 @@ else: count_indel += 1 - print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) - else: - print("indels are currently not evaluated") - + #print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) + #else: + # print("indels are currently not evaluated") mut_array = np.array(mut_array) for read in bam.fetch(until_eof=True): if read.is_unmapped: @@ -214,6 +221,10 @@ # create pure_tags_dict pure_tags_dict = {} for key1, value1 in sorted(mut_dict.items()): + #if len(np.where(np.array(['#'.join(str(i) for i in z) + # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0: + # continue + i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] ref = mut_array[i, 2] @@ -237,8 +248,15 @@ else: pure_tags_dict_short = pure_tags_dict - csv_data = open(outputFile_csv, "wb") - csv_writer = csv.writer(csv_data, delimiter=",") + # whole_array = [] + # for k in pure_tags_dict.values(): + # if len(k) != 0: + # keys = k.keys() + # if len(keys) > 1: + # for k1 in keys: + # whole_array.append(k1) + # else: + # whole_array.append(keys[0]) # output summary with threshold workbook = xlsxwriter.Workbook(outfile) @@ -268,7 +286,7 @@ 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba', 'in phase', 'chimeric tag') ws1.write_row(0, 0, header_line) - csv_writer.writerow(header_line) + counter_tier11 = 0 counter_tier12 = 0 counter_tier21 = 0 @@ -277,15 +295,24 @@ counter_tier24 = 0 counter_tier31 = 0 counter_tier32 = 0 - counter_tier41 = 0 - counter_tier42 = 0 - counter_tier5 = 0 + counter_tier33 = 0 + counter_tier4 = 0 + # if chimera_correction: + # counter_tier43 = 0 + counter_tier51 = 0 + counter_tier52 = 0 + counter_tier53 = 0 + counter_tier54 = 0 + counter_tier55 = 0 counter_tier6 = 0 + counter_tier7 = 0 + row = 1 tier_dict = {} chimera_dict = {} for key1, value1 in sorted(mut_dict.items()): counts_mut = 0 + chimeric_tag_list = [] chimeric_tag = {} if key1 in pure_tags_dict_short.keys(): i = np.where(np.array(['#'.join(str(i) for i in z) @@ -293,11 +320,12 @@ ref = mut_array[i, 2] alt = mut_array[i, 3] dcs_median = cvrg_dict[key1][2] - whole_array = list(pure_tags_dict_short[key1].keys()) + whole_array = pure_tags_dict_short[key1].keys() tier_dict[key1] = {} - values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), - ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0), ("tier 6", 0)] + values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), + ("tier 3.2", 0), ("tier 3.3", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), + ("tier 6", 0), ("tier 7", 0)] for k, v in values_tier_dict: tier_dict[key1][k] = v @@ -326,11 +354,15 @@ 
total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na'] + # na1f = na1/total1 else: + # na1 = na1f = 0 na1 = 0 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ'] + # lowq1f = lowq1 / total1 else: + # lowq1 = lowq1f = 0 lowq1 = 0 if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref] @@ -365,11 +397,15 @@ total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na'] + # na2f = na2 / total2 else: + # na2 = na2f = 0 na2 = 0 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ'] + # lowq2f = lowq2 / total2 else: + # lowq2 = lowq2f = 0 lowq2 = 0 if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref] @@ -404,11 +440,15 @@ total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na'] + # na3f = na3 / total3 else: + # na3 = na3f = 0 na3 = 0 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ'] + # lowq3f = lowq3 / total3 else: + # lowq3 = lowq3f = 0 lowq3 = 0 if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref] @@ -439,11 +479,15 @@ total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na'] + # na4f = na4 / total4 else: + # na4 = na4f = 0 na4 = 0 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ'] + # lowq4f = lowq4 / total4 else: + # lowq4 = lowq4f = 0 lowq4 = 0 if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref] @@ -472,19 +516,38 @@ read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1 read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0 - + cigars_dcs1 = cigars_dcs2 = cigars_dcs3 = cigars_dcs4 = [] + pos_read1 = pos_read2 = pos_read3 = pos_read4 = [] + end_read1 = end_read2 = end_read3 = end_read4 = [] if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): - read_pos1 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']) - read_len_median1 = np.median(reads_dict[key1][key2[:-5] + '.ab.1']) + read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) + read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) + cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'] + #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']) + pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1'] + #print(cigars_dcs1) + end_read1 = reads_dict[key1][key2[:-5] + '.ab.1'] if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): - read_pos2 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']) - read_len_median2 = np.median(reads_dict[key1][key2[:-5] + '.ab.2']) + read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) + read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) + cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2'] + pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2'] + end_read2 = reads_dict[key1][key2[:-5] + '.ab.2'] if key2[:-5] + '.ba.1' in 
mut_read_pos_dict[key1].keys(): - read_pos3 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']) - read_len_median3 = np.median(reads_dict[key1][key2[:-5] + '.ba.1']) + read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) + read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) + cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1'] + pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1'] + end_read3 = reads_dict[key1][key2[:-5] + '.ba.1'] if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): - read_pos4 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']) - read_len_median4 = np.median(reads_dict[key1][key2[:-5] + '.ba.2']) + read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) + read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) + #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']) + cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'] + + pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2'] + #print(cigars_dcs4) + end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] used_keys.append(key2[:-5]) counts_mut += 1 @@ -515,145 +578,370 @@ details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) - trimmed_five = False - trimmed_three = False + trimmed = False contradictory = False + softclipped_mutation_allMates = False + softclipped_mutation_oneOfTwoMates = False + softclipped_mutation_oneOfTwoSSCS = False + softclipped_mutation_oneMate = False + softclipped_mutation_oneMateOneSSCS = False + print() + print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3) + dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = [] + dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = [] + ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False + ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False - if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & all(float(ij) == 0. 
for ij in [alt1ff, alt4ff]))): + # mate 1 - SSCS ab + softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] + ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads + + if any(ij is True for ij in softclipped_idx1): + softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] + softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] + softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] + dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] + dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] + + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + if any(ij is True for ij in softclipped_both_ends_idx1): + print(softclipped_both_ends_idx1) + for nr, indx in enumerate(softclipped_both_ends_idx1): + if indx: + if dist_start_read1[nr] <= dist_end_read1[nr]: + dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number + else: + dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number + ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads + ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads + print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1) + + # mate 1 - SSCS ba + softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] + ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads + if any(ij is True for ij in softclipped_idx4): + softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] + softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] + softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] + dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] + dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] + + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + if any(ij is True for ij in softclipped_both_ends_idx4): + print(softclipped_both_ends_idx4) + for nr, indx in enumerate(softclipped_both_ends_idx4): + if indx: + if dist_start_read4[nr] <= dist_end_read4[nr]: + dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number + else: + dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number + ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads + ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x 
in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads + print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4) + + # mate 2 - SSCS ab + softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] + #print(sum(softclipped_idx2)) + ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads + if any(ij is True for ij in softclipped_idx2): + softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] + softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] + softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] + dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] + dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] + + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + if any(ij is True for ij in softclipped_both_ends_idx2): + print(softclipped_both_ends_idx2) + for nr, indx in enumerate(softclipped_both_ends_idx2): + if indx: + if dist_start_read2[nr] <= dist_end_read2[nr]: + dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number + else: + dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number + ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads + #print(ratio_dist_end2) + #print([True if x <= thr else False for x in ratio_dist_end2]) + ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads + print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2) + + # mate 2 - SSCS ba + softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] + ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads + if any(ij is True for ij in softclipped_idx3): + softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] + softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] + softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] + dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] + dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] + + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + if any(ij is True for ij in softclipped_both_ends_idx3): + print(softclipped_both_ends_idx3) + for nr, indx in enumerate(softclipped_both_ends_idx3): + if indx: + if dist_start_read3[nr] <= dist_end_read3[nr]: + dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh + else: + 
dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number + #print([True if x <= thr else False for x in dist_start_read3]) + ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads + ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads + print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3) + + if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant + all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | + (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & + all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): alt1ff = 0 alt4ff = 0 alt2ff = 0 alt3ff = 0 - trimmed_five = False - trimmed_three = False + trimmed = False contradictory = True + # softclipping tiers + # information of both mates available --> all reads for both mates and SSCS are softclipped + elif (ratio1 & ratio4 & ratio2 & ratio3 & + (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & + all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold + softclipped_mutation_allMates = True + softclipped_mutation_oneOfTwoMates = False + softclipped_mutation_oneOfTwoSSCS = False + softclipped_mutation_oneMate = False + softclipped_mutation_oneMateOneSSCS = False + alt1ff = 0 + alt4ff = 0 + alt2ff = 0 + alt3ff = 0 + trimmed = False + contradictory = False + print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates) + # information of both mates available --> only one mate softclipped + elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | + (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & + all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold + softclipped_mutation_allMates = False + softclipped_mutation_oneOfTwoMates = True + softclipped_mutation_oneOfTwoSSCS = False + softclipped_mutation_oneMate = False + softclipped_mutation_oneMateOneSSCS = False + alt1ff = 0 + alt4ff = 0 + alt2ff = 0 + alt3ff = 0 + trimmed = False + contradictory = False + print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates) + # information of both mates available --> only one mate softclipped + elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & + ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & + all(float(ij) > 0. 
for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold + softclipped_mutation_allMates = False + softclipped_mutation_oneOfTwoMates = False + softclipped_mutation_oneOfTwoSSCS = True + softclipped_mutation_oneMate = False + softclipped_mutation_oneMateOneSSCS = False + alt1ff = 0 + alt4ff = 0 + alt2ff = 0 + alt3ff = 0 + trimmed = False + contradictory = False + print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff]) + # information of one mate available --> all reads of one mate are softclipped + elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & + all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | + (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & + all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold + #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) & + # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) | + # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) & + # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))): + softclipped_mutation_allMates = False + softclipped_mutation_oneOfTwoMates = False + softclipped_mutation_oneOfTwoSSCS = False + softclipped_mutation_oneMate = True + softclipped_mutation_oneMateOneSSCS = False + alt1ff = 0 + alt4ff = 0 + alt2ff = 0 + alt3ff = 0 + trimmed = False + contradictory = False + print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate) + # information of one mate available --> only one SSCS is softclipped + elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & + (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | + (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & + (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. 
for ij in [alt2ff, alt3ff])))): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold + #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) | + # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) | + # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) | + # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))): + softclipped_mutation_allMates = False + softclipped_mutation_oneOfTwoMates = False + softclipped_mutation_oneOfTwoSSCS = False + softclipped_mutation_oneMate = False + softclipped_mutation_oneMateOneSSCS = True + alt1ff = 0 + alt4ff = 0 + alt2ff = 0 + alt3ff = 0 + trimmed = False + contradictory = False + print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS) + else: - if ((read_pos1 >= 0) and (read_pos1 <= trim5)): + if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): beg1 = total1new total1new = 0 alt1ff = 0 alt1f = 0 - trimmed_five = True + trimmed = True - if ((read_pos1 >= 0) and (abs(read_len_median1 - read_pos1) <= trim3)): - beg1 = total1new - total1new = 0 - alt1ff = 0 - alt1f = 0 - trimmed_three = True - - if ((read_pos4 >= 0) and (read_pos4 <= trim5)): + if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): beg4 = total4new total4new = 0 alt4ff = 0 alt4f = 0 - trimmed_five = True + trimmed = True - if ((read_pos4 >= 0) and (abs(read_len_median4 - read_pos4) <= trim3)): - beg4 = total4new - total4new = 0 - alt4ff = 0 - alt4f = 0 - trimmed_three = True - - if ((read_pos2 >= 0) and (read_pos2 <= trim5)): + if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): beg2 = total2new total2new = 0 alt2ff = 0 alt2f = 0 - trimmed_five = True - - if ((read_pos2 >= 0) and (abs(read_len_median2 - read_pos2) <= trim3)): - beg2 = total2new - total2new = 0 - alt2ff = 0 - alt2f = 0 - trimmed_three = True + trimmed = True - if ((read_pos3 >= 0) and (read_pos3 <= trim5)): - beg3 = total3new - total3new = 0 - alt3ff = 0 - alt3f = 0 - trimmed_five = True - - if ((read_pos3 >= 0) and (abs(read_len_median3 - read_pos3) <= trim3)): + if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): beg3 = total3new total3new = 0 alt3ff = 0 alt3f = 0 - trimmed_three = True - + trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) # assign tiers - if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & + all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + (all(int(ij) >= 3 for ij in [total2new, total3new]) & + all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): tier = "1.1" counter_tier11 += 1 tier_dict[key1]["tier 1.1"] += 1 - elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) - & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, 
alt4ff])): + elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & + any(int(ij) >= 3 for ij in [total1new, total4new]) & + any(int(ij) >= 3 for ij in [total2new, total3new]) & + all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): tier = "1.2" counter_tier12 += 1 tier_dict[key1]["tier 1.2"] += 1 - elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) - | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & + any(int(ij) >= 3 for ij in [total1new, total4new]) & + all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + (all(int(ij) >= 1 for ij in [total2new, total3new]) & + any(int(ij) >= 3 for ij in [total2new, total3new]) & + all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): tier = "2.1" counter_tier21 += 1 tier_dict[key1]["tier 2.1"] += 1 - elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & + all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): tier = "2.2" counter_tier22 += 1 tier_dict[key1]["tier 2.2"] += 1 - elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) - | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): + elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & + any(int(ij) >= 3 for ij in [total2new, total3new]) & + all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & + any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) | + (all(int(ij) >= 1 for ij in [total2new, total3new]) & + any(int(ij) >= 3 for ij in [total1new, total4new]) & + all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & + any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): tier = "2.3" counter_tier23 += 1 tier_dict[key1]["tier 2.3"] += 1 - elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) - | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & + all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + (all(int(ij) >= 1 for ij in [total2new, total3new]) & + all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): tier = "2.4" counter_tier24 += 1 tier_dict[key1]["tier 2.4"] += 1 - elif ((len(pure_tags_dict_short[key1]) > 1) & (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + elif ((len(pure_tags_dict_short[key1]) > 1) & + (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | + all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): tier = "3.1" counter_tier31 += 1 tier_dict[key1]["tier 3.1"] += 1 - elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) - | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + elif ((all(int(ij) >= 1 for ij in [total1new, 
total4new]) & + all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) | + (all(int(ij) >= 1 for ij in [total2new, total3new]) & + all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): tier = "3.2" counter_tier32 += 1 tier_dict[key1]["tier 3.2"] += 1 - elif trimmed_five: - tier = "4.1" - counter_tier41 += 1 - tier_dict[key1]["tier 4.1"] += 1 + elif (trimmed) and (len(pure_tags_dict_short[key1]) > 1): + tier = "3.3" + counter_tier33 += 1 + tier_dict[key1]["tier 3.3"] += 1 + + elif (trimmed): + tier = "4" + counter_tier4 += 1 + tier_dict[key1]["tier 4"] += 1 + + elif softclipped_mutation_allMates: + tier = "5.1" + counter_tier51 += 1 + tier_dict[key1]["tier 5.1"] += 1 - elif trimmed_three: - tier = "4.2" - counter_tier42 += 1 - tier_dict[key1]["tier 4.2"] += 1 + elif softclipped_mutation_oneOfTwoMates: + tier = "5.2" + counter_tier52 += 1 + tier_dict[key1]["tier 5.2"] += 1 + + elif softclipped_mutation_oneOfTwoSSCS: + tier = "5.3" + counter_tier53 += 1 + tier_dict[key1]["tier 5.3"] += 1 - elif contradictory: - tier = "5" - counter_tier5 += 1 - tier_dict[key1]["tier 5"] += 1 - else: + elif softclipped_mutation_oneMate: + tier = "5.4" + counter_tier54 += 1 + tier_dict[key1]["tier 5.4"] += 1 + + elif softclipped_mutation_oneMateOneSSCS: + tier = "5.5" + counter_tier55 += 1 + tier_dict[key1]["tier 5.5"] += 1 + + elif (contradictory): tier = "6" counter_tier6 += 1 tier_dict[key1]["tier 6"] += 1 + else: + tier = "7" + counter_tier7 += 1 + tier_dict[key1]["tier 7"] += 1 + chrom, pos, ref_a, alt_a = re.split(r'\#', key1) - var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt]) + var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) sample_tag = key2[:-5] array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time # exclude identical tag from array2, to prevent comparison to itself @@ -682,14 +970,14 @@ half1_mate2 = array2_half2 half2_mate2 = array2_half # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" - dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2]) + dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2]) min_index = np.where(dist == dist.min()) # get index of min HD # get all "b's" of the tag or all "a's" of the tag with minimum HD min_tag_half2 = half2_mate2[min_index] min_tag_array2 = array2[min_index] # get whole tag with min HD min_value = dist.min() # calculate HD of "b" to all "b's" or "a" to all "a's" - dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e)) + dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) for e in min_tag_half2]) dist2 = dist_second_half.max() @@ -700,7 +988,14 @@ if min_value == 0 or dist2 == 0: min_tags_list_zeros.append(tag) chimera_tags.append(max_tag) + # chimeric = True + # else: + # chimeric = False + # if mate_b is False: + # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric) + # else: + # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric) i += 1 chimera_tags = [x for x in chimera_tags if x != []] chimera_tags_new = [] @@ -733,28 +1028,26 @@ read_pos3 = read_len_median3 = None line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) ws1.write_row(row, 0, line) - 
csv_writer.writerow(line) line = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) ws1.write_row(row + 1, 0, line) - csv_writer.writerow(line) ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)}) + 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1), 'format': format3, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)}) + 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)}) + 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + row += 3 - if chimera_correction: chimeric_dcs_high_tiers = 0 chimeric_dcs = 0 @@ -767,23 +1060,19 @@ else: chimeric_dcs_high_tiers += high_tiers chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) - #csv_data.close() - # sheet 2 if chimera_correction: header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', - 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6') + 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', + 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', + 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6') else: header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', - 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6') + 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', + 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 
1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6') ws2.write_row(0, 0, header_line2) - #ws2.conditional_format('J1', {'type': 'formula', 'criteria': 'containing', 'value': 'tier 1.1', 'format': format1, 'multi_range': 'J1:K1'}) - row = 0 for key1, value1 in sorted(tier_dict.items()): @@ -797,7 +1086,7 @@ alt_count = cvrg_dict[key1][1] cvrg = ref_count + alt_count - var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt]) + var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) lst = [var_id, cvrg] used_tiers = [] cum_af = [] @@ -807,6 +1096,8 @@ if len(used_tiers) > 1: cum = safe_div(sum(used_tiers), cvrg) cum_af.append(cum) + if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place + continue lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) if chimera_correction: chimeras_all = chimera_dict[key1][0] @@ -816,14 +1107,14 @@ fraction_chimeras = 0. new_cvrg = cvrg * (1. - fraction_chimeras) lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)]) - lst.extend([(cvrg - sum(used_tiers[-6:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-6:])))]) + lst.extend([(cvrg - sum(used_tiers[-11:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-11:])))]) if chimera_correction: chimeras_all = chimera_dict[key1][1] new_alt = sum(used_tiers[0:6]) - chimeras_all fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6]))) if fraction_chimeras is None: fraction_chimeras = 0. - new_cvrg = (cvrg - sum(used_tiers[-6:])) * (1. - fraction_chimeras) + new_cvrg = (cvrg - sum(used_tiers[-11:])) * (1. - fraction_chimeras) lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)]) lst.extend([alt_count, safe_div(alt_count, cvrg)]) lst.extend(used_tiers) @@ -833,18 +1124,19 @@ if chimera_correction: ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)}) ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)}) - ws2.conditional_format('V{}:AA{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AA{} V1:AA1'.format(row + 2, row + 2)}) + ws2.conditional_format('V{}:AF{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AF{} V1:AF1'.format(row + 2, row + 2)}) else: ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)}) - ws2.conditional_format('P{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:U{} P1:U1'.format(row + 2, row + 2)}) + ws2.conditional_format('P{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:Z{} P1:Z1'.format(row + 2, row + 2)}) row += 1 # sheet 3 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21), ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", 
counter_tier24), - ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41), - ("tier 4.2", counter_tier42), ("tier 5", counter_tier5), ("tier 6", counter_tier6)] + ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 3.3", counter_tier33), ("tier 4", counter_tier), + ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52), + ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)] header = ("tier", "count") ws3.write_row(0, 0, header) @@ -854,15 +1146,15 @@ ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), {'type': 'formula', 'criteria': '=OR($A${}="tier 1.1", $A${}="tier 1.2")'.format(i + 2, i + 2), - 'format': format13}) + 'format': format1}) ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), {'type': 'formula', 'criteria': '=OR($A${}="tier 2.1", $A${}="tier 2.2", $A${}="tier 2.3", $A${}="tier 2.4")'.format(i + 2, i + 2, i + 2, i + 2), - 'format': format33}) + 'format': format3}) ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), {'type': 'formula', 'criteria': '=$A${}>="3"'.format(i + 2), - 'format': format23}) + 'format': format2}) description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), @@ -872,88 +1164,87 @@ ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. 
base) and minimal FS>=1 for both SSCS in at least one mate"),
- ("Tier 4.1", "variants at the beginning of the reads"),
- ("Tier 4.2", "variants at the end of the reads"),
- ("Tier 5", "mates with contradictory information"),
+ ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
+ ("Tier 5.1", "variant is close to softclipping in both mates"),
+ ("Tier 5.2", "variant is close to softclipping in one of the mates"),
+ ("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"),
+ ("Tier 5.4", "variant is close to softclipping in one mate (no information of the second mate)"),
+ ("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate)"),
 ("Tier 6", "remaining variants")]
- examples_tiers = [[("chr5-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
+ examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
 "33", "43", "33", "43", "0", "0", "33", "43", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "AAAAATGCGTAGAAATATGC", "ab2.ba1", "268", "268", "270", "288", "289", "11", "34", "10", "27", "0", "0", "10", "27", "0", "0", "1", "1", "0", "0", "1", "7", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
+ [("Chr5:5-20000-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
 "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "1", "6", "47170", "41149", "", ""),
 ("", "", "CTATGACCCGTGAGCCCATG", "ab2.ba1", "77", "132", "233", "200", "290", "4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "1", "6", "47170", "41149", "", "")],
- [("chr5-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
 "2", "8", "2", "8", "0", "0", "2", "8", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "AAAAAAACATCATACACCCA", "ab2.ba1", None, None, None, None, "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "ATCAGCCATGGCTATTATTG", "ab2.ba1", "153", "164", "217", "260", "289", "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
+ [("Chr5:5-20000-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
 "289", "0", "5", "0", "5", "0", "0", "0", "5", None, None, None, "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "ATCAATATGGCCTCGCCACG", "ab2.ba1", "202", "255", "277", "290", "289", "1", "3", "1", "3", "0", "0", "1", "3", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
 "1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
 ("", "", "ATCAGCCATGGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
+ [("Chr5:5-20000-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
 "3", "3", "3", "2", "3", "1", "0", "1", "1", "0.5", "0", "0.5", "0", "0", "0", "1", "0", "0", "3", "3", "47170", "41149", "", ""),
 ("", "", "ATGCCTACCTCATTTGTCGT", "ab2.ba1", None, "274", None, "288", "290", "0", "3", "0", "2", "0", "1", "0", "1", None, "0.5", None, "0.5", "0", "0", "0", "1", "0", "0", "3", "3", "47170", "41149", "", "")],
- [("chr5-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
+ [("Chr5:5-20000-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
 "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1", "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", ""),
 ("", "", "ACAACATCACGTATTCAGGT", "ab2.ba1", "35", "35", "240", "258", "271", "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1", "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", "")],
- [("chr5-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "1", "100", "255", "276", "269",
- "5", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
+ [("Chr5:5-20000-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "0", "100", "255", "276", "269",
+ "5", "6", "0", "6", "0", "0", "5", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
 ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None, "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
- [("chr5-13983-G-C", "4.2", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "20", "270", "255", "276", "269",
- "5", "6", "5", "0", "0", "0", "5", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "6", "1", "1", "5348", "5350", "", ""),
- ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
- "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
- "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
- [("chr5-13963-T-C", "5", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
+ [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", ""),
 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263", "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
- [("chr5-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
+ [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)],
+ [("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", ""),
 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None, "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
- start_row = 15
+ start_row = 20
 ws3.write(start_row, 0, "Description of tiers with examples")
 ws3.write_row(start_row + 1, 0, header_line)
 row = 0
@@ -962,23 +1253,22 @@
 ex = examples_tiers[i]
 for k in range(len(ex)):
 ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
- ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format33,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2), 'format': format23,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
 row += 3
 workbook.close()
 workbook2.close()
 workbook3.close()
- csv_data.close()
-
 if __name__ == '__main__':
 sys.exit(read2mut(sys.argv))
+
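The tier sheet written above colour-codes its rows with xlsxwriter's formula-type conditional formats, where a single criteria formula drives the formatting of several disjoint cell ranges via the 'multi_range' option. Below is a minimal, self-contained sketch of that mechanism only; the file name, colours and example rows are invented for illustration and are not produced by read2mut.py.

# Minimal sketch of xlsxwriter formula-based conditional formatting with
# 'multi_range'; the workbook name, colours and rows here are placeholders.
import xlsxwriter

workbook = xlsxwriter.Workbook("tiers_demo.xlsx")
ws = workbook.add_worksheet()

# One colour per tier group, analogous to format13/format23/format33 above.
green = workbook.add_format({"bg_color": "#C6EFCE"})
red = workbook.add_format({"bg_color": "#FFC7CE"})

ws.write_row(0, 0, ["variant", "tier"])
ws.write_row(1, 0, ["chr5-11068-C-G", "1.1"])
ws.write_row(2, 0, ["chr5-10776-G-T", "3.1"])

# The criteria formula looks at the tier in column B; the format is applied
# to every cell listed in 'multi_range', which may span disjoint areas.
ws.conditional_format("A2:B2", {"type": "formula",
                                "criteria": '=OR($B$2="1.1", $B$2="1.2")',
                                "format": green,
                                "multi_range": "A2:B2 D2:E2"})
ws.conditional_format("A3:B3", {"type": "formula",
                                "criteria": '=$B$3>="3"',
                                "format": red,
                                "multi_range": "A3:B3 D3:E3"})
workbook.close()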
--- a/read2mut.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/read2mut.xml Tue Mar 02 15:32:41 2021 +0000
@@ -1,12 +1,16 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="read2mut" name="Call specific mutations in reads:" version="2.0.4" profile="17.01">
+<tool id="read2mut" name="Call specific mutations in reads:" version="2.1.0" profile="19.01">
 <description>Looks for reads with mutations at known positions and calculates frequencies and stats.</description>
 <macros>
 <import>va_macros.xml</import>
 </macros>
- <expand macro="requirements">
+ <requirements>
+ <requirement type="package" version="2.7">python</requirement>
+ <requirement type="package" version="1.4.0">matplotlib</requirement>
+ <requirement type="package" version="0.15">pysam</requirement>
 <requirement type="package" version="1.1.0">xlsxwriter</requirement>
- </expand>
+ <requirement type="package" version="0.11.6">cyvcf2</requirement>
+ </requirements>
 <command><![CDATA[
 ln -s '$file2' bam_input.bam &&
 ln -s '${file2.metadata.bam_index}' bam_input.bam.bai &&
@@ -17,11 +21,11 @@
 --sscsJson '$file4'
 --thresh '$thresh'
 --phred '$phred'
- --trim5 '$trim5'
- --trim3 '$trim3'
+ --trim '$trim'
 $chimera_correction
+ --softclipping_dist '$softclipping_dist'
+ --reads_threshold '$reads_threshold'
 --outputFile '$output_xlsx'
- --outputFile_csv '$outputFile_csv'
 --outputFile2 '$output_xlsx2'
 --outputFile3 '$output_xlsx3'
 ]]>
@@ -33,13 +37,13 @@
 <param name="file4" type="data" format="json" label="JSON File with SSCS tag stats" optional="false" help="JSON file generated by DCS mutations to SSCS stats."/>
 <param name="thresh" type="integer" label="Tag count threshold" value="0" help="Integer threshold for displaying mutations. Only mutations occurring in DCS of less than thresh tags are displayed. Default of 0 displays all."/>
 <param name="phred" type="integer" label="Phred quality score threshold" min="0" max="41" value="20" help="Integer threshold for Phred quality score. Only reads higher than this threshold are considered. Default = 20."/>
- <param name="trim5" type="integer" label="Trimming threshold at 5' end of reads" value="10" help="Integer threshold for assigning mutations at the beginning of reads to a lower tier. Default 10."/>
- <param name="trim3" type="integer" label="Trimming threshold at 3' end of reads" value="10" help="Integer threshold for assigning mutations at the end of reads to a lower tier. Default 10."/>
+ <param name="trim" type="integer" label="Trimming threshold" value="10" help="Integer threshold for assigning mutations at the start and end of reads to a lower tier. Default = 10."/>
 <param name="chimera_correction" type="boolean" label="Apply chimera correction?" truevalue="--chimera_correction" falsevalue="" checked="False" help="Count chimeric variants and correct the variant frequencies."/>
+ <param name="softclipping_dist" type="integer" label="Distance between artifact and softclipping of the reads" min="1" value="15" help="Count a mutation as an artifact if it lies within this distance of the softclipped part of the reads. Default = 15."/>
+ <param name="reads_threshold" type="float" label="Minimum percentage of softclipped reads in a family" min="0.0" max="1.0" value="1.0" help="Float specifying the minimum fraction of softclipped reads in a family for it to be considered in the softclipping tiers. Default = 1.0, i.e. all reads of a family have to be softclipped."/>
 </inputs>
 <outputs>
 <data name="output_xlsx" format="xlsx" label="${tool.name} on ${on_string}: XLSX summary"/>
- <data name="outputFile_csv" format="csv" label="${tool.name} on ${on_string}: CSV summary"/>
 <data name="output_xlsx2" format="xlsx" label="${tool.name} on ${on_string}: XLSX allele frequencies"/>
 <data name="output_xlsx3" format="xlsx" label="${tool.name} on ${on_string}: XLSX tiers"/>
 </outputs>
@@ -51,10 +55,11 @@
 <param name="file4" value="SSCS_counts_test.json"/>
 <param name="thresh" value="0"/>
 <param name="phred" value="20"/>
- <param name="trim5" value="10"/>
- <param name="trim3" value="10"/>
+ <param name="trim" value="10"/>
+ <param name="chimera_correction"/>
+ <param name="softclipping_dist" value="15"/>
+ <param name="reads_threshold" value="1.0"/>
 <output name="output_xlsx" file="Variant_Analyzer_summary_test.xlsx" decompress="true" lines_diff="10"/>
- <output name="outputFile_csv" file="Variant_Analyzer_summary_test.csv" decompress="true" lines_diff="10"/>
 <output name="output_xlsx2" file="Variant_Analyzer_allele_frequencies_test.xlsx" decompress="true" lines_diff="10"/>
 <output name="output_xlsx3" file="Variant_Analyzer_tiers_test.xlsx" decompress="true" lines_diff="10"/>
 </test>
@@ -70,7 +75,7 @@
 **Input**
 **Dataset 1:** VCF file with duplex consensus sequence (DCS) mutations. E.g.
-generated by the `FreeBayes <https://arxiv.org/abs/1207.3907>`_ or `LoFreq <https://academic.oup.com/nar/article/40/22/11189/1152727>`_ variant caller.
+generated by the `FreeBayes variant caller <https://arxiv.org/abs/1207.3907>`_.
 **Dataset 2:** BAM file of aligned raw reads. This file can be obtained by the tool `Map with BWA-MEM <https://arxiv.org/abs/1303.3997>`_.
@@ -86,7 +91,7 @@
 **Output**
 The output consists of three XLSX files containing frequency stats for DCS mutations based
-on information from the raw reads and a CSV file containing the summary information without color-coding. In addition to that a tier based
+on information from the raw reads. In addition, a tier-based
 classification is provided based on the amount of support for a true variant call.
 ]]>
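The new softclipping_dist and reads_threshold parameters describe when a variant is assigned to the softclipping tiers (5.1-5.5): the variant has to lie close to a soft-clipped end of the reads, and at least a given fraction of the reads in a family must be soft-clipped. The sketch below only illustrates that kind of check with pysam; it is a hypothetical helper written for illustration, not read2mut.py's actual implementation, and all function names and file paths in it are made up.

# Hypothetical sketch of a soft-clipping proximity check in the spirit of
# --softclipping_dist and --reads_threshold; not the tool's real code.
import pysam


def is_near_softclip(read, variant_pos, softclipping_dist=15):
    """True if variant_pos lies within softclipping_dist bases of a
    soft-clipped end of this read's aligned portion."""
    if read.cigartuples is None:
        return False
    left_op = read.cigartuples[0]
    right_op = read.cigartuples[-1]
    # CIGAR operation 4 = soft clip (S).
    if left_op[0] == 4 and abs(variant_pos - read.reference_start) <= softclipping_dist:
        return True
    if right_op[0] == 4 and abs(read.reference_end - 1 - variant_pos) <= softclipping_dist:
        return True
    return False


def family_is_softclipped(reads, variant_pos, softclipping_dist=15, reads_threshold=1.0):
    """True if at least reads_threshold (a fraction, 0.0-1.0) of the family's
    reads carry the variant close to a soft-clipped end."""
    if not reads:
        return False
    near = sum(is_near_softclip(r, variant_pos, softclipping_dist) for r in reads)
    return near / float(len(reads)) >= reads_threshold


# Example usage against an indexed BAM (paths and coordinates are placeholders):
# bam = pysam.AlignmentFile("Interesting_Reads.trim.bam", "rb")
# reads = [r for r in bam.fetch("chr5", 11067, 11069) if not r.is_unmapped]
# print(family_is_softclipped(reads, 11067, softclipping_dist=15, reads_threshold=1.0))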
--- a/va_macros.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/va_macros.xml Tue Mar 02 15:32:41 2021 +0000
@@ -1,21 +1,13 @@
 <macros>
 <xml name="citation">
- <citations>
- <citation type="bibtex">
- @misc{duplex,
- author = {Povysil, Gundula and Heinzl, Monika and Salazar, Renato and Stoler, Nicholas and Nekrutenko, Anton and Tiemann-Boege, Irene},
- year = {2020},
- title = {{Increased yields of duplex sequencing data by a series of quality control tools (manuscript)}}
- }
- </citation>
- </citations>
- </xml>
- <xml name="requirements">
- <requirements>
- <requirement type="package" version="3.1.2">matplotlib</requirement>
- <requirement type="package" version="0.15">pysam</requirement>
- <requirement type="package" version="0.11.6">cyvcf2</requirement>
- <yield/>
- </requirements>
- </xml>
+ <citations>
+ <citation type="bibtex">
+ @misc{duplex,
+ author = {Povysil, Gundula and Heinzl, Monika and Salazar, Renato and Stoler, Nicholas and Nekrutenko, Anton and Tiemann-Boege, Irene},
+ year = {2019},
+ title = {{Variant Analyzer: a quality control for variant calling in duplex sequencing data (manuscript)}}
+ }
+ </citation>
+ </citations>
+</xml>
 </macros>
\ No newline at end of file