variant_analyzer2: mut2read.py comparison

comparison mut2read.py @ 78:fdfe9a919ff7 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty

author	mheinzl
date	Fri, 22 Jul 2022 09:19:44 +0000
parents	6ccff403db8a
children	e46d5e377760

comparison

equal deleted inserted replaced

-:1797e461d674
+:fdfe9a919ff7
 """
 import argparse
 import json
 import os
+import re
 import sys
 import numpy as np
 import pysam
 from cyvcf2 import VCF
 bam = pysam.AlignmentFile(file2, "rb")
 # get tags
 tag_dict = {}
 cvrg_dict = {}
+tag_dict_ref = {}
 for variant in VCF(file1):
 chrom = variant.CHROM
 stop_pos = variant.start
 ref = variant.REF
 if len(variant.ALT) == 0:
 continue
 else:
 alt = variant.ALT[0]
+alt = alt.upper()
+ref = ref.upper()
+if "N" in alt:  # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV)
+continue
 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
 dcs_len = []
-if len(ref) == len(alt):
+for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
-for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
+if pileupcolumn.reference_pos == stop_pos:
-if pileupcolumn.reference_pos == stop_pos:
+count_alt = 0
-count_alt = 0
+count_ref = 0
-count_ref = 0
+count_indel = 0
-count_indel = 0
+count_n = 0
-count_n = 0
+count_other = 0
-count_other = 0
+count_lowq = 0
-count_lowq = 0
+for pileupread in pileupcolumn.pileups:
-print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
+if not pileupread.is_refskip:
-"difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
+if pileupread.is_del:
-for pileupread in pileupcolumn.pileups:
+p = pileupread.query_position_or_next
-if not pileupread.is_del and not pileupread.is_refskip:
+e = p + len(alt) - 1
-# query position is None if is_del or is_refskip is set.
-nuc = pileupread.alignment.query_sequence[pileupread.query_position]
-dcs_len.append(len(pileupread.alignment.query_sequence))
-if nuc == alt:
-count_alt += 1
-tag = pileupread.alignment.query_name
-if tag in tag_dict:
-tag_dict[tag][chrom_stop_pos] = alt
-else:
-tag_dict[tag] = {}
-tag_dict[tag][chrom_stop_pos] = alt
-elif nuc == ref:
-count_ref += 1
-elif nuc == "N":
-count_n += 1
-elif nuc == "lowQ":
-count_lowq += 1
-else:
-count_other += 1
 else:
-count_indel += 1
+p = pileupread.query_position
-dcs_median = np.median(np.array(dcs_len))
+e = p + len(alt)
-cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
+s = p
+split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring)
-print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
+if len(ref) < len(alt):
-(pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
+if "I" in split_cigar:
-count_indel, count_lowq, dcs_median))
+all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
-else:
+for ai in all_insertions:  # if multiple insertions in DCS
-print("indels are currently not evaluated")
+ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+ins_count = split_cigar[ai - 1]  # nr of insertions should match with alt allele
+if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count):  # if both the position in the read and the length of the insertion match
+nuc = pileupread.alignment.query_sequence[s:e]
+break
+else:
+nuc = pileupread.alignment.query_sequence[s]
+else:
+nuc = pileupread.alignment.query_sequence[s]
+elif len(ref) > len(alt):
+ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)]
+if "D" in split_cigar:
+all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
+for di, ai in enumerate(all_deletions):  # if multiple deletions in DCS
+if di > 0:  # more than 1 deletion, don't count previous deletion to position
+all_deletions_mod = split_cigar[:ai - 1]
+prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
+split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
+del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
+else:  # first deletion in read, sum all previous (mis)matches and insertions to position
+del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+del_count = split_cigar[ai - 1]  # nr of deletions should match with ref allele
+if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count):
+nuc = pileupread.alignment.query_sequence[s:e]
+if nuc == "":
+nuc = str(alt)
+break
+else:
+nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+elif len(ref_positions) < len(ref):  # DCS has the reference, but the position is at the very end of the DCS, so not all of the reference positions are present
+nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)]
+if nuc.upper() == ref[:len(nuc)]:
+nuc = str(ref)
+else:
+nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+else:  # SNV: query position is None if is_del or is_refskip is set.
+nuc = pileupread.alignment.query_sequence[s]
+nuc = nuc.upper()
+tag = pileupread.alignment.query_name
+if "_" in tag:
+tag = re.split('_', tag)[0]
+if nuc == alt:
+count_alt += 1
+if tag in tag_dict:
+tag_dict[tag][chrom_stop_pos] = alt
+else:
+tag_dict[tag] = {}
+tag_dict[tag][chrom_stop_pos] = alt
+elif nuc == ref:
+count_ref += 1
+if tag in tag_dict_ref:
+tag_dict_ref[tag][chrom_stop_pos] = ref
+else:
+tag_dict_ref[tag] = {}
+tag_dict_ref[tag][chrom_stop_pos] = ref
+elif nuc == "N":
+count_n += 1
+elif nuc == "lowQ":
+count_lowq += 1
+else:
+count_other += 1
+dcs_len.append(len(pileupread.alignment.query_sequence))
+dcs_median = np.median(np.array(dcs_len))
+cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
+print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
+(pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
+count_indel, count_lowq, dcs_median))
 bam.close()
 with open(json_file, "w") as f:
-json.dump((tag_dict, cvrg_dict), f)
+json.dump((tag_dict, cvrg_dict, tag_dict_ref), f)
 # create fastq from aligned reads
 with open(outfile, 'w') as out:
 with open(file3, 'r') as families:
 for line in families:
 line = line.rstrip('\n')
 splits = line.split('\t')
 tag = splits[0]
-if tag in tag_dict:
+if tag in tag_dict or tag in tag_dict_ref:
 str1 = splits[4]
 curr_seq = str1.replace("-", "")
 str2 = splits[5]
 curr_qual = str2.replace(" ", "")
 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n")
 out.write(curr_seq + "\n")
 out.write("+" + "\n")
 out.write(curr_qual + "\n")

Mercurial > repos > mheinzl > variant_analyzer2

comparison mut2read.py @ 78:fdfe9a919ff7 draft