Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2sscs.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
| author | mheinzl |
|---|---|
| date | Fri, 22 Jul 2022 09:19:44 +0000 |
| parents | 6ccff403db8a |
| children |
comparison
equal
deleted
inserted
replaced
| 77:1797e461d674 | 78:fdfe9a919ff7 |
|---|---|
| 21 from __future__ import division | 21 from __future__ import division |
| 22 | 22 |
| 23 import argparse | 23 import argparse |
| 24 import json | 24 import json |
| 25 import os | 25 import os |
| 26 import re | |
| 26 import sys | 27 import sys |
| 27 | 28 |
| 28 import numpy as np | 29 import numpy as np |
| 29 import pysam | 30 import pysam |
| 30 from cyvcf2 import VCF | 31 from cyvcf2 import VCF |
| 69 ref = variant.REF | 70 ref = variant.REF |
| 70 if len(variant.ALT) == 0: | 71 if len(variant.ALT) == 0: |
| 71 continue | 72 continue |
| 72 else: | 73 else: |
| 73 alt = variant.ALT[0] | 74 alt = variant.ALT[0] |
| 75 alt = alt.upper() | |
| 76 ref = ref.upper() | |
| 77 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) | |
| 78 continue | |
| 74 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt | 79 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt |
| 75 if len(ref) == len(alt): | 80 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): |
| 76 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): | 81 if pileupcolumn.reference_pos == stop_pos: |
| 77 if pileupcolumn.reference_pos == stop_pos: | 82 count_alt = 0 |
| 78 count_alt = 0 | 83 count_ref = 0 |
| 79 count_ref = 0 | 84 count_indel = 0 |
| 80 count_indel = 0 | 85 for pileupread in pileupcolumn.pileups: |
| 81 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | 86 if not pileupread.is_refskip: |
| 82 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | 87 tag = pileupread.alignment.query_name |
| 83 for pileupread in pileupcolumn.pileups: | 88 abba = tag[-2:] |
| 84 if not pileupread.is_del and not pileupread.is_refskip: | 89 if pileupread.is_del: |
| 85 tag = pileupread.alignment.query_name | 90 p = pileupread.query_position_or_next |
| 86 abba = tag[-2:] | 91 e = p + len(alt) - 1 |
| 87 # query position is None if is_del or is_refskip is set. | 92 else: |
| 88 if pileupread.alignment.query_sequence[pileupread.query_position] == alt: | 93 p = pileupread.query_position |
| 89 count_alt += 1 | 94 e = p + len(alt) |
| 90 if chrom_stop_pos in mut_pos_dict: | 95 tag = pileupread.alignment.query_name |
| 91 if abba in mut_pos_dict[chrom_stop_pos]: | 96 if "_" in tag: |
| 92 mut_pos_dict[chrom_stop_pos][abba] += 1 | 97 tag = re.split('_', tag)[0] |
| 98 s = p | |
| 99 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) | |
| 100 if len(ref) < len(alt): # insertion | |
| 101 if "I" in split_cigar: | |
| 102 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] | |
| 103 for ai in all_insertions: # if multiple insertions in DCS | |
| 104 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
| 105 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele | |
| 106 if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): | |
| 107 nuc = pileupread.alignment.query_sequence[s:e] | |
| 108 break | |
| 93 else: | 109 else: |
| 94 mut_pos_dict[chrom_stop_pos][abba] = 1 | 110 nuc = pileupread.alignment.query_sequence[s] |
| 111 else: | |
| 112 nuc = pileupread.alignment.query_sequence[s] | |
| 113 elif len(ref) > len(alt): # deletion | |
| 114 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] | |
| 115 if "D" in split_cigar: | |
| 116 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] | |
| 117 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS | |
| 118 if di > 0: # more than 1 deletion, don't count previous deletion to position | |
| 119 all_deletions_mod = split_cigar[:ai - 1] | |
| 120 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] | |
| 121 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] | |
| 122 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] | |
| 123 else: # first deletion in read, sum all previous (mis)matches and insertions to position | |
| 124 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
| 125 del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele | |
| 126 if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): | |
| 127 nuc = pileupread.alignment.query_sequence[s:e] | |
| 128 if nuc == "": | |
| 129 nuc = str(alt) | |
| 130 break | |
| 131 else: | |
| 132 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
| 133 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there | |
| 134 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] | |
| 135 if nuc.upper() == ref[:len(nuc)]: | |
| 136 nuc = str(ref) | |
| 137 else: | |
| 138 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
| 139 else: # SNV: query position is None if is_del or is_refskip is set. | |
| 140 nuc = pileupread.alignment.query_sequence[s] | |
| 141 | |
| 142 nuc = nuc.upper() | |
| 143 | |
| 144 if nuc == alt: | |
| 145 count_alt += 1 | |
| 146 if chrom_stop_pos in mut_pos_dict: | |
| 147 if abba in mut_pos_dict[chrom_stop_pos]: | |
| 148 mut_pos_dict[chrom_stop_pos][abba] += 1 | |
| 95 else: | 149 else: |
| 96 mut_pos_dict[chrom_stop_pos] = {} | |
| 97 mut_pos_dict[chrom_stop_pos][abba] = 1 | 150 mut_pos_dict[chrom_stop_pos][abba] = 1 |
| 98 if chrom_stop_pos not in ref_pos_dict: | 151 else: |
| 99 ref_pos_dict[chrom_stop_pos] = {} | 152 mut_pos_dict[chrom_stop_pos] = {} |
| 100 ref_pos_dict[chrom_stop_pos][abba] = 0 | 153 mut_pos_dict[chrom_stop_pos][abba] = 1 |
| 101 | 154 if chrom_stop_pos not in ref_pos_dict: |
| 102 elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: | 155 ref_pos_dict[chrom_stop_pos] = {} |
| 103 count_ref += 1 | 156 ref_pos_dict[chrom_stop_pos][abba] = 0 |
| 104 if chrom_stop_pos in ref_pos_dict: | 157 elif nuc == ref: |
| 105 if abba in ref_pos_dict[chrom_stop_pos]: | 158 count_ref += 1 |
| 106 ref_pos_dict[chrom_stop_pos][abba] += 1 | 159 if chrom_stop_pos in ref_pos_dict: |
| 107 else: | 160 if abba in ref_pos_dict[chrom_stop_pos]: |
| 108 ref_pos_dict[chrom_stop_pos][abba] = 1 | 161 ref_pos_dict[chrom_stop_pos][abba] += 1 |
| 109 else: | 162 else: |
| 110 ref_pos_dict[chrom_stop_pos] = {} | |
| 111 ref_pos_dict[chrom_stop_pos][abba] = 1 | 163 ref_pos_dict[chrom_stop_pos][abba] = 1 |
| 112 else: | 164 else: |
| 113 count_indel += 1 | 165 ref_pos_dict[chrom_stop_pos] = {} |
| 166 ref_pos_dict[chrom_stop_pos][abba] = 1 | |
| 167 else: | |
| 168 count_indel += 1 | |
| 169 print("coverage at pos %s = %s, ref = %s, alt = %s, other = %s,\n" % | |
| 170 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) | |
| 114 | 171 |
| 115 print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % | 172 # if mutation is in DCS file but not in SSCS, then set counts to NA |
| 116 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) | 173 if chrom_stop_pos not in mut_pos_dict.keys(): |
| 117 | 174 mut_pos_dict[chrom_stop_pos] = {} |
| 118 # if mutation is in DCS file but not in SSCS, then set counts to NA | 175 mut_pos_dict[chrom_stop_pos]["ab"] = 0 |
| 119 if chrom_stop_pos not in mut_pos_dict.keys(): | 176 mut_pos_dict[chrom_stop_pos]["ba"] = 0 |
| 120 mut_pos_dict[chrom_stop_pos] = {} | 177 ref_pos_dict[chrom_stop_pos] = {} |
| 121 mut_pos_dict[chrom_stop_pos]["ab"] = 0 | 178 ref_pos_dict[chrom_stop_pos]["ab"] = 0 |
| 122 mut_pos_dict[chrom_stop_pos]["ba"] = 0 | 179 ref_pos_dict[chrom_stop_pos]["ba"] = 0 |
| 123 ref_pos_dict[chrom_stop_pos] = {} | |
| 124 ref_pos_dict[chrom_stop_pos]["ab"] = 0 | |
| 125 ref_pos_dict[chrom_stop_pos]["ba"] = 0 | |
| 126 else: | |
| 127 print("indels are currently not evaluated") | |
| 128 bam.close() | 180 bam.close() |
| 129 | 181 |
| 130 # save counts | 182 # save counts |
| 131 with open(sscs_counts_json, "w") as f: | 183 with open(sscs_counts_json, "w") as f: |
| 132 json.dump((mut_pos_dict, ref_pos_dict), f) | 184 json.dump((mut_pos_dict, ref_pos_dict), f) |
