Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2sscs.py @ 0:e5953c54cfb5 draft
planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Sun, 04 Oct 2020 17:19:39 +0000 |
parents | |
children | 11a2a34f8a2b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e5953c54cfb5 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """mut2sscs.py | |
4 | |
5 Author -- Gundula Povysil | |
6 Contact -- povysil@bioinf.jku.at | |
7 | |
8 Takes a tabular file with mutations from DCS and a BAM file of SSCS as input | |
9 and extracts all tags of reads that carry the mutation. | |
10 Calculates statistics about number of ab/ba/duplex per mutation. | |
11 | |
12 ======= ========== ================= ================================ | |
13 Version Date Author Description | |
14 0.2.1 2019-10-27 Gundula Povysil - | |
15 ======= ========== ================= ================================ | |
16 | |
17 USAGE: python mut2sscs.py --mutFile DCS_Mutations.tabular --bamFile SSCS.bam --outputJson SSCS_counts.json | |
18 | |
19 """ | |
20 | |
21 from __future__ import division | |
22 | |
23 import argparse | |
24 import json | |
25 import os | |
26 import sys | |
27 | |
28 import numpy as np | |
29 import pysam | |
30 | |
31 | |
def make_argparser():
    """Build the command-line parser for mut2sscs.

    Returns:
        argparse.ArgumentParser: parser exposing the options ``--mutFile``,
        ``--bamFile`` and ``--outputJson``.
    """
    parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.')
    # All three options are mandatory. Marking them as required makes argparse
    # print a usage error when one is missing, instead of handing ``None`` to
    # the file-handling code further down.
    parser.add_argument('--mutFile', required=True,
                        help='TABULAR file with DCS mutations.')
    parser.add_argument('--bamFile', required=True,
                        help='BAM file with aligned SSCS reads.')
    parser.add_argument('--outputJson', required=True,
                        help='Output JSON file to store SSCS counts.')
    return parser
41 | |
42 | |
def _count_tag(counts, position_key, strand):
    """Increment counts[position_key][strand], creating missing entries."""
    per_strand = counts.setdefault(position_key, {})
    per_strand[strand] = per_strand.get(strand, 0) + 1


def mut2sscs(argv):
    """Count reads per tag suffix that carry the alt or ref base of each mutation.

    For every row of the mutation file the BAM file is piled up at the
    mutation position. Each read that is neither a deletion nor a reference
    skip at that position is classified by the base it shows there, and a
    counter keyed by the last two characters of the read name is incremented
    in either the alt or the ref dictionary. Both dictionaries are written as
    a two-element JSON array ``[mut_counts, ref_counts]``.

    Args:
        argv: full argument vector including the program name; ``argv[1:]``
            is parsed with the parser from ``make_argparser``.

    Returns:
        None.

    Raises:
        SystemExit: if the mutation file or the BAM file does not exist, or
            if no output path was given.
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])

    file1 = args.mutFile
    file2 = args.bamFile
    sscs_counts_json = args.outputJson

    # ``not fileN`` guards against an omitted option (None), which would make
    # os.path.isfile raise a TypeError on Python 3.
    if not file1 or not os.path.isfile(file1):
        sys.exit("Error: Could not find '{}'".format(file1))

    if not file2 or not os.path.isfile(file2):
        sys.exit("Error: Could not find '{}'".format(file2))

    # Fail before the (potentially long) pileup rather than after it.
    if not sscs_counts_json:
        sys.exit("Error: No output JSON file given (--outputJson)")

    # 1. read mut file
    with open(file1, 'r') as mut:
        mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)

    # genfromtxt returns a 1-D array when the file holds a single data row and
    # an empty array when it holds none; normalise both cases so that the loop
    # below can always iterate over rows.
    if mut_array.size == 0:
        mut_rows = []
    else:
        mut_rows = np.atleast_2d(mut_array)
    n_mutations = len(mut_rows)

    # 2. read SSCS bam file
    # The BAM index is expected to exist already; it is not created here.
    bam = pysam.AlignmentFile(file2, "rb")

    # position key ("chrom#pos") -> {tag suffix -> number of reads}
    mut_pos_dict = {}
    ref_pos_dict = {}
    try:
        for m, row in enumerate(mut_rows):
            print(str(m + 1) + " of " + str(n_mutations))
            # Columns 1, 2, 9 and 10 (0-based) are used as chromosome,
            # position, reference base and alternative base.
            # Plain str/int are used instead of NumPy scalars so that pysam
            # receives the contig name itself rather than a raw byte buffer.
            chrom = str(row[1])
            stop_pos = int(row[2])
            chrom_stop_pos = chrom + "#" + str(stop_pos)
            ref = str(row[9])
            alt = str(row[10])

            # Clamp the window start so that a mutation at position 1 does not
            # request a negative coordinate.
            window_start = max(stop_pos - 2, 0)
            for pileupcolumn in bam.pileup(chrom, window_start, stop_pos, max_depth=1000000000):
                # Only the column matching the mutation position is analysed
                # (reference_pos is compared against stop_pos - 1).
                if pileupcolumn.reference_pos != stop_pos - 1:
                    continue

                count_alt = 0
                count_ref = 0
                count_indel = 0
                pileups = pileupcolumn.pileups
                print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileups),
                      "difference= ", len(pileups) - pileupcolumn.n)
                for pileupread in pileups:
                    # query position is None if is_del or is_refskip is set.
                    if pileupread.is_del or pileupread.is_refskip:
                        continue
                    tag = pileupread.alignment.query_name
                    abba = tag[-2:]
                    base = pileupread.alignment.query_sequence[pileupread.query_position]
                    if base == alt:
                        count_alt += 1
                        _count_tag(mut_pos_dict, chrom_stop_pos, abba)
                    elif base == ref:
                        count_ref += 1
                        _count_tag(ref_pos_dict, chrom_stop_pos, abba)
                    else:
                        # Any base that is neither ref nor alt is counted here.
                        count_indel += 1

                print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" %
                      (pileupcolumn.reference_pos, count_ref + count_alt, count_ref, count_alt, count_indel))

            # If no read carrying the alt base was recorded for this position,
            # store zero counts for "ab" and "ba" in both dictionaries.
            # NOTE(review): this also replaces any ref counts collected above
            # for the position; kept as in the original, confirm it is intended.
            if chrom_stop_pos not in mut_pos_dict:
                mut_pos_dict[chrom_stop_pos] = {"ab": 0, "ba": 0}
                ref_pos_dict[chrom_stop_pos] = {"ab": 0, "ba": 0}
    finally:
        # Release the BAM handle even if the pileup fails part-way through.
        bam.close()

    # save counts
    with open(sscs_counts_json, "w") as f:
        json.dump((mut_pos_dict, ref_pos_dict), f)
130 | |
131 | |
if __name__ == '__main__':
    # Script entry point: run the tool on the process arguments and hand its
    # return value to sys.exit as the exit status.
    exit_status = mut2sscs(sys.argv)
    sys.exit(exit_status)