comparison mut2read.py @ 78:fdfe9a919ff7 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author mheinzl
date Fri, 22 Jul 2022 09:19:44 +0000
parents 6ccff403db8a
children e46d5e377760
comparison
equal deleted inserted replaced
77:1797e461d674 78:fdfe9a919ff7
18 """ 18 """
19 19
20 import argparse 20 import argparse
21 import json 21 import json
22 import os 22 import os
23 import re
23 import sys 24 import sys
24 25
25 import numpy as np 26 import numpy as np
26 import pysam 27 import pysam
27 from cyvcf2 import VCF 28 from cyvcf2 import VCF
66 bam = pysam.AlignmentFile(file2, "rb") 67 bam = pysam.AlignmentFile(file2, "rb")
67 68
68 # get tags 69 # get tags
69 tag_dict = {} 70 tag_dict = {}
70 cvrg_dict = {} 71 cvrg_dict = {}
72 tag_dict_ref = {}
71 73
72 for variant in VCF(file1): 74 for variant in VCF(file1):
73 chrom = variant.CHROM 75 chrom = variant.CHROM
74 stop_pos = variant.start 76 stop_pos = variant.start
75 ref = variant.REF 77 ref = variant.REF
76 if len(variant.ALT) == 0: 78 if len(variant.ALT) == 0:
77 continue 79 continue
78 else: 80 else:
79 alt = variant.ALT[0] 81 alt = variant.ALT[0]
82 alt = alt.upper()
83 ref = ref.upper()
84 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV)
85 continue
80 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt 86 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
81 dcs_len = [] 87 dcs_len = []
82 if len(ref) == len(alt): 88 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
83 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): 89 if pileupcolumn.reference_pos == stop_pos:
84 if pileupcolumn.reference_pos == stop_pos: 90 count_alt = 0
85 count_alt = 0 91 count_ref = 0
86 count_ref = 0 92 count_indel = 0
87 count_indel = 0 93 count_n = 0
88 count_n = 0 94 count_other = 0
89 count_other = 0 95 count_lowq = 0
90 count_lowq = 0 96 for pileupread in pileupcolumn.pileups:
91 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), 97 if not pileupread.is_refskip:
92 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) 98 if pileupread.is_del:
93 for pileupread in pileupcolumn.pileups: 99 p = pileupread.query_position_or_next
94 if not pileupread.is_del and not pileupread.is_refskip: 100 e = p + len(alt) - 1
95 # query position is None if is_del or is_refskip is set.
96 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
97 dcs_len.append(len(pileupread.alignment.query_sequence))
98 if nuc == alt:
99 count_alt += 1
100 tag = pileupread.alignment.query_name
101 if tag in tag_dict:
102 tag_dict[tag][chrom_stop_pos] = alt
103 else:
104 tag_dict[tag] = {}
105 tag_dict[tag][chrom_stop_pos] = alt
106 elif nuc == ref:
107 count_ref += 1
108 elif nuc == "N":
109 count_n += 1
110 elif nuc == "lowQ":
111 count_lowq += 1
112 else:
113 count_other += 1
114 else: 101 else:
115 count_indel += 1 102 p = pileupread.query_position
116 dcs_median = np.median(np.array(dcs_len)) 103 e = p + len(alt)
117 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) 104 s = p
118 105 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring)
119 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % 106 if len(ref) < len(alt):
120 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, 107 if "I" in split_cigar:
121 count_indel, count_lowq, dcs_median)) 108 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
122 else: 109 for ai in all_insertions: # if multiple insertions in DCS
123 print("indels are currently not evaluated") 110 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
111 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele
112 if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion
113 nuc = pileupread.alignment.query_sequence[s:e]
114 break
115 else:
116 nuc = pileupread.alignment.query_sequence[s]
117 else:
118 nuc = pileupread.alignment.query_sequence[s]
119 elif len(ref) > len(alt):
120 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)]
121 if "D" in split_cigar:
122 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
123 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS
124 if di > 0: # more than 1 deletion, don't count previous deletion to position
125 all_deletions_mod = split_cigar[:ai - 1]
126 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
127 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
128 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
129 else: # first deletion in read, sum all previous (mis)matches and insertions to position
130 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
131 del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele
132 if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count):
133 nuc = pileupread.alignment.query_sequence[s:e]
134 if nuc == "":
135 nuc = str(alt)
136 break
137 else:
138 nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
139 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there
140 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)]
141 if nuc.upper() == ref[:len(nuc)]:
142 nuc = str(ref)
143 else:
144 nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
145 else: # SNV: query position is None if is_del or is_refskip is set.
146 nuc = pileupread.alignment.query_sequence[s]
147
148 nuc = nuc.upper()
149 tag = pileupread.alignment.query_name
150 if "_" in tag:
151 tag = re.split('_', tag)[0]
152
153 if nuc == alt:
154 count_alt += 1
155 if tag in tag_dict:
156 tag_dict[tag][chrom_stop_pos] = alt
157 else:
158 tag_dict[tag] = {}
159 tag_dict[tag][chrom_stop_pos] = alt
160 elif nuc == ref:
161 count_ref += 1
162 if tag in tag_dict_ref:
163 tag_dict_ref[tag][chrom_stop_pos] = ref
164 else:
165 tag_dict_ref[tag] = {}
166 tag_dict_ref[tag][chrom_stop_pos] = ref
167 elif nuc == "N":
168 count_n += 1
169 elif nuc == "lowQ":
170 count_lowq += 1
171 else:
172 count_other += 1
173 dcs_len.append(len(pileupread.alignment.query_sequence))
174
175 dcs_median = np.median(np.array(dcs_len))
176 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
177 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
178 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
179 count_indel, count_lowq, dcs_median))
124 bam.close() 180 bam.close()
125 181
126 with open(json_file, "w") as f: 182 with open(json_file, "w") as f:
127 json.dump((tag_dict, cvrg_dict), f) 183 json.dump((tag_dict, cvrg_dict, tag_dict_ref), f)
128 184
129 # create fastq from aligned reads 185 # create fastq from aligned reads
130 with open(outfile, 'w') as out: 186 with open(outfile, 'w') as out:
131 with open(file3, 'r') as families: 187 with open(file3, 'r') as families:
132 for line in families: 188 for line in families:
133 line = line.rstrip('\n') 189 line = line.rstrip('\n')
134 splits = line.split('\t') 190 splits = line.split('\t')
135 tag = splits[0] 191 tag = splits[0]
136 192
137 if tag in tag_dict: 193 if tag in tag_dict or tag in tag_dict_ref:
138 str1 = splits[4] 194 str1 = splits[4]
139 curr_seq = str1.replace("-", "") 195 curr_seq = str1.replace("-", "")
140 str2 = splits[5] 196 str2 = splits[5]
141 curr_qual = str2.replace(" ", "") 197 curr_qual = str2.replace(" ", "")
142
143 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") 198 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n")
144 out.write(curr_seq + "\n") 199 out.write(curr_seq + "\n")
145 out.write("+" + "\n") 200 out.write("+" + "\n")
146 out.write(curr_qual + "\n") 201 out.write(curr_qual + "\n")
147 202