Mercurial > repos > mheinzl > variant_analyzer2
view mut2read.py @ 83:8cec772c0bf1 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Fri, 05 Aug 2022 08:23:34 +0000 |
parents | fdfe9a919ff7 |
children | e46d5e377760 |
line wrap: on
line source
#!/usr/bin/env python """mut2read.py Author -- Gundula Povysil Contact -- povysil@bioinf.jku.at Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file. Creates fastq file of reads of tags with mutation. ======= ========== ================= ================================ Version Date Author Description 0.2.1 2019-10-27 Gundula Povysil - ======= ========== ================= ================================ USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json """ import argparse import json import os import re import sys import numpy as np import pysam from cyvcf2 import VCF def make_argparser(): parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') parser.add_argument('--mutFile', help='VCF file with DCS mutations.') parser.add_argument('--bamFile', help='BAM file with aligned DCS reads.') parser.add_argument('--familiesFile', help='TABULAR file with aligned families.') parser.add_argument('--outputFastq', help='Output FASTQ file of reads with mutations.') parser.add_argument('--outputJson', help='Output JSON file to store collected data.') return parser def mut2read(argv): parser = make_argparser() args = parser.parse_args(argv[1:]) file1 = args.mutFile file2 = args.bamFile file3 = args.familiesFile outfile = args.outputFastq json_file = args.outputJson if os.path.isfile(file1) is False: sys.exit("Error: Could not find '{}'".format(file1)) if os.path.isfile(file2) is False: sys.exit("Error: Could not find '{}'".format(file2)) if os.path.isfile(file3) is False: sys.exit("Error: Could not find '{}'".format(file3)) # read dcs bam file # pysam.index(file2) bam = pysam.AlignmentFile(file2, "rb") # get tags tag_dict = {} cvrg_dict = {} tag_dict_ref = {} for variant in VCF(file1): chrom = variant.CHROM stop_pos = variant.start ref = variant.REF if len(variant.ALT) == 0: continue else: alt = variant.ALT[0] alt = alt.upper() ref = ref.upper() if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) continue chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt dcs_len = [] for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): if pileupcolumn.reference_pos == stop_pos: count_alt = 0 count_ref = 0 count_indel = 0 count_n = 0 count_other = 0 count_lowq = 0 for pileupread in pileupcolumn.pileups: if not pileupread.is_refskip: if pileupread.is_del: p = pileupread.query_position_or_next e = p + len(alt) - 1 else: p = pileupread.query_position e = p + len(alt) s = p split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) if len(ref) < len(alt): if "I" in split_cigar: all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] for ai in all_insertions: # if multiple insertions in DCS ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion nuc = pileupread.alignment.query_sequence[s:e] break else: nuc = pileupread.alignment.query_sequence[s] else: nuc = pileupread.alignment.query_sequence[s] elif len(ref) > len(alt): ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] if "D" in split_cigar: all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] for di, ai in enumerate(all_deletions): # if multiple insertions in DCS if di > 0: # more than 1 deletion, don't count previous deletion to position all_deletions_mod = split_cigar[:ai - 1] prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] else: # first deletion in read, sum all previous (mis)matches and insertions to position del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): nuc = pileupread.alignment.query_sequence[s:e] if nuc == "": nuc = str(alt) break else: nuc = pileupread.alignment.query_sequence[s:s + len(ref)] elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] if nuc.upper() == ref[:len(nuc)]: nuc = str(ref) else: nuc = pileupread.alignment.query_sequence[s:s + len(ref)] else: # SNV: query position is None if is_del or is_refskip is set. nuc = pileupread.alignment.query_sequence[s] nuc = nuc.upper() tag = pileupread.alignment.query_name if "_" in tag: tag = re.split('_', tag)[0] if nuc == alt: count_alt += 1 if tag in tag_dict: tag_dict[tag][chrom_stop_pos] = alt else: tag_dict[tag] = {} tag_dict[tag][chrom_stop_pos] = alt elif nuc == ref: count_ref += 1 if tag in tag_dict_ref: tag_dict_ref[tag][chrom_stop_pos] = ref else: tag_dict_ref[tag] = {} tag_dict_ref[tag][chrom_stop_pos] = ref elif nuc == "N": count_n += 1 elif nuc == "lowQ": count_lowq += 1 else: count_other += 1 dcs_len.append(len(pileupread.alignment.query_sequence)) dcs_median = np.median(np.array(dcs_len)) cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq, dcs_median)) bam.close() with open(json_file, "w") as f: json.dump((tag_dict, cvrg_dict, tag_dict_ref), f) # create fastq from aligned reads with open(outfile, 'w') as out: with open(file3, 'r') as families: for line in families: line = line.rstrip('\n') splits = line.split('\t') tag = splits[0] if tag in tag_dict or tag in tag_dict_ref: str1 = splits[4] curr_seq = str1.replace("-", "") str2 = splits[5] curr_qual = str2.replace(" ", "") out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") out.write(curr_seq + "\n") out.write("+" + "\n") out.write(curr_qual + "\n") if __name__ == '__main__': sys.exit(mut2read(sys.argv))