Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2read.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
| author | mheinzl |
|---|---|
| date | Fri, 22 Jul 2022 09:19:44 +0000 |
| parents | 6ccff403db8a |
| children | e46d5e377760 |
comparison
equal
deleted
inserted
replaced
| 77:1797e461d674 | 78:fdfe9a919ff7 |
|---|---|
| 18 """ | 18 """ |
| 19 | 19 |
| 20 import argparse | 20 import argparse |
| 21 import json | 21 import json |
| 22 import os | 22 import os |
| 23 import re | |
| 23 import sys | 24 import sys |
| 24 | 25 |
| 25 import numpy as np | 26 import numpy as np |
| 26 import pysam | 27 import pysam |
| 27 from cyvcf2 import VCF | 28 from cyvcf2 import VCF |
| 66 bam = pysam.AlignmentFile(file2, "rb") | 67 bam = pysam.AlignmentFile(file2, "rb") |
| 67 | 68 |
| 68 # get tags | 69 # get tags |
| 69 tag_dict = {} | 70 tag_dict = {} |
| 70 cvrg_dict = {} | 71 cvrg_dict = {} |
| 72 tag_dict_ref = {} | |
| 71 | 73 |
| 72 for variant in VCF(file1): | 74 for variant in VCF(file1): |
| 73 chrom = variant.CHROM | 75 chrom = variant.CHROM |
| 74 stop_pos = variant.start | 76 stop_pos = variant.start |
| 75 ref = variant.REF | 77 ref = variant.REF |
| 76 if len(variant.ALT) == 0: | 78 if len(variant.ALT) == 0: |
| 77 continue | 79 continue |
| 78 else: | 80 else: |
| 79 alt = variant.ALT[0] | 81 alt = variant.ALT[0] |
| 82 alt = alt.upper() | |
| 83 ref = ref.upper() | |
| 84 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) | |
| 85 continue | |
| 80 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt | 86 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt |
| 81 dcs_len = [] | 87 dcs_len = [] |
| 82 if len(ref) == len(alt): | 88 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): |
| 83 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | 89 if pileupcolumn.reference_pos == stop_pos: |
| 84 if pileupcolumn.reference_pos == stop_pos: | 90 count_alt = 0 |
| 85 count_alt = 0 | 91 count_ref = 0 |
| 86 count_ref = 0 | 92 count_indel = 0 |
| 87 count_indel = 0 | 93 count_n = 0 |
| 88 count_n = 0 | 94 count_other = 0 |
| 89 count_other = 0 | 95 count_lowq = 0 |
| 90 count_lowq = 0 | 96 for pileupread in pileupcolumn.pileups: |
| 91 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | 97 if not pileupread.is_refskip: |
| 92 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | 98 if pileupread.is_del: |
| 93 for pileupread in pileupcolumn.pileups: | 99 p = pileupread.query_position_or_next |
| 94 if not pileupread.is_del and not pileupread.is_refskip: | 100 e = p + len(alt) - 1 |
| 95 # query position is None if is_del or is_refskip is set. | |
| 96 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | |
| 97 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
| 98 if nuc == alt: | |
| 99 count_alt += 1 | |
| 100 tag = pileupread.alignment.query_name | |
| 101 if tag in tag_dict: | |
| 102 tag_dict[tag][chrom_stop_pos] = alt | |
| 103 else: | |
| 104 tag_dict[tag] = {} | |
| 105 tag_dict[tag][chrom_stop_pos] = alt | |
| 106 elif nuc == ref: | |
| 107 count_ref += 1 | |
| 108 elif nuc == "N": | |
| 109 count_n += 1 | |
| 110 elif nuc == "lowQ": | |
| 111 count_lowq += 1 | |
| 112 else: | |
| 113 count_other += 1 | |
| 114 else: | 101 else: |
| 115 count_indel += 1 | 102 p = pileupread.query_position |
| 116 dcs_median = np.median(np.array(dcs_len)) | 103 e = p + len(alt) |
| 117 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | 104 s = p |
| 118 | 105 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) |
| 119 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % | 106 if len(ref) < len(alt): |
| 120 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, | 107 if "I" in split_cigar: |
| 121 count_indel, count_lowq, dcs_median)) | 108 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] |
| 122 else: | 109 for ai in all_insertions: # if multiple insertions in DCS |
| 123 print("indels are currently not evaluated") | 110 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] |
| 111 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele | |
| 112 if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion | |
| 113 nuc = pileupread.alignment.query_sequence[s:e] | |
| 114 break | |
| 115 else: | |
| 116 nuc = pileupread.alignment.query_sequence[s] | |
| 117 else: | |
| 118 nuc = pileupread.alignment.query_sequence[s] | |
| 119 elif len(ref) > len(alt): | |
| 120 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] | |
| 121 if "D" in split_cigar: | |
| 122 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] | |
| 123 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS | |
| 124 if di > 0: # more than 1 deletion, don't count previous deletion to position | |
| 125 all_deletions_mod = split_cigar[:ai - 1] | |
| 126 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] | |
| 127 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] | |
| 128 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] | |
| 129 else: # first deletion in read, sum all previous (mis)matches and insertions to position | |
| 130 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
| 131 del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele | |
| 132 if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): | |
| 133 nuc = pileupread.alignment.query_sequence[s:e] | |
| 134 if nuc == "": | |
| 135 nuc = str(alt) | |
| 136 break | |
| 137 else: | |
| 138 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
| 139 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there | |
| 140 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] | |
| 141 if nuc.upper() == ref[:len(nuc)]: | |
| 142 nuc = str(ref) | |
| 143 else: | |
| 144 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
| 145 else: # SNV: query position is None if is_del or is_refskip is set. | |
| 146 nuc = pileupread.alignment.query_sequence[s] | |
| 147 | |
| 148 nuc = nuc.upper() | |
| 149 tag = pileupread.alignment.query_name | |
| 150 if "_" in tag: | |
| 151 tag = re.split('_', tag)[0] | |
| 152 | |
| 153 if nuc == alt: | |
| 154 count_alt += 1 | |
| 155 if tag in tag_dict: | |
| 156 tag_dict[tag][chrom_stop_pos] = alt | |
| 157 else: | |
| 158 tag_dict[tag] = {} | |
| 159 tag_dict[tag][chrom_stop_pos] = alt | |
| 160 elif nuc == ref: | |
| 161 count_ref += 1 | |
| 162 if tag in tag_dict_ref: | |
| 163 tag_dict_ref[tag][chrom_stop_pos] = ref | |
| 164 else: | |
| 165 tag_dict_ref[tag] = {} | |
| 166 tag_dict_ref[tag][chrom_stop_pos] = ref | |
| 167 elif nuc == "N": | |
| 168 count_n += 1 | |
| 169 elif nuc == "lowQ": | |
| 170 count_lowq += 1 | |
| 171 else: | |
| 172 count_other += 1 | |
| 173 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
| 174 | |
| 175 dcs_median = np.median(np.array(dcs_len)) | |
| 176 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | |
| 177 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % | |
| 178 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, | |
| 179 count_indel, count_lowq, dcs_median)) | |
| 124 bam.close() | 180 bam.close() |
| 125 | 181 |
| 126 with open(json_file, "w") as f: | 182 with open(json_file, "w") as f: |
| 127 json.dump((tag_dict, cvrg_dict), f) | 183 json.dump((tag_dict, cvrg_dict, tag_dict_ref), f) |
| 128 | 184 |
| 129 # create fastq from aligned reads | 185 # create fastq from aligned reads |
| 130 with open(outfile, 'w') as out: | 186 with open(outfile, 'w') as out: |
| 131 with open(file3, 'r') as families: | 187 with open(file3, 'r') as families: |
| 132 for line in families: | 188 for line in families: |
| 133 line = line.rstrip('\n') | 189 line = line.rstrip('\n') |
| 134 splits = line.split('\t') | 190 splits = line.split('\t') |
| 135 tag = splits[0] | 191 tag = splits[0] |
| 136 | 192 |
| 137 if tag in tag_dict: | 193 if tag in tag_dict or tag in tag_dict_ref: |
| 138 str1 = splits[4] | 194 str1 = splits[4] |
| 139 curr_seq = str1.replace("-", "") | 195 curr_seq = str1.replace("-", "") |
| 140 str2 = splits[5] | 196 str2 = splits[5] |
| 141 curr_qual = str2.replace(" ", "") | 197 curr_qual = str2.replace(" ", "") |
| 142 | |
| 143 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") | 198 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") |
| 144 out.write(curr_seq + "\n") | 199 out.write(curr_seq + "\n") |
| 145 out.write("+" + "\n") | 200 out.write("+" + "\n") |
| 146 out.write(curr_qual + "\n") | 201 out.write(curr_qual + "\n") |
| 147 | 202 |
