Mercurial > repos > iuc > gemini_qc
comparison gemini_mafify.py @ 7:86e46972e183 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
author | iuc |
---|---|
date | Tue, 10 Mar 2020 06:14:22 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:137a3e07062e | 7:86e46972e183 |
---|---|
1 import string | |
2 import sys | |
3 | |
4 | |
5 so_to_maf = { | |
6 'splice_acceptor_variant': 'Splice_Site', | |
7 'splice_donor_variant': 'Splice_Site', | |
8 'transcript_ablation': 'Splice_Site', | |
9 'exon_loss_variant': 'Splice_Site', | |
10 'stop_gained': 'Nonsense_Mutation', | |
11 'stop_lost': 'Nonstop_Mutation', | |
12 'frameshift_variant': 'Frame_Shift_', | |
13 'initiator_codon_variant': 'Translation_Start_Site', | |
14 'start_lost': 'Translation_Start_Site', | |
15 'inframe_insertion': 'In_Frame_Ins', | |
16 'inframe_deletion': 'In_Frame_Del', | |
17 'conservative_inframe_insertion': 'In_Frame_Ins', | |
18 'conservative_inframe_deletion': 'In_Frame_Del', | |
19 'disruptive_inframe_insertion': 'In_Frame_Ins', | |
20 'disruptive_inframe_deletion': 'In_Frame_Del', | |
21 'missense_variant': 'Missense_Mutation', | |
22 'coding_sequence_variant': 'Missense_Mutation', | |
23 'conservative_missense_variant': 'Missense_Mutation', | |
24 'rare_amino_acid_variant': 'Missense_Mutation', | |
25 'transcript_amplification': 'Intron', | |
26 'intron_variant': 'Intron', | |
27 'INTRAGENIC': 'Intron', | |
28 'intragenic_variant': 'Intron', | |
29 'splice_region_variant': 'Splice_Region', | |
30 'mature_miRNA_variant': 'RNA', | |
31 'exon_variant': 'RNA', | |
32 'non_coding_exon_variant': 'RNA', | |
33 'non_coding_transcript_exon_variant': 'RNA', | |
34 'non_coding_transcript_variant': 'RNA', | |
35 'nc_transcript_variant': 'RNA', | |
36 'stop_retained_variant': 'Silent', | |
37 'synonymous_variant': 'Silent', | |
38 'NMD_transcript_variant': 'Silent', | |
39 'incomplete_terminal_codon_variant': 'Silent', | |
40 '5_prime_UTR_variant': "5'UTR", | |
41 '5_prime_UTR_premature_start_codon_gain_variant': "5'UTR", | |
42 '3_prime_UTR_variant': "3'UTR", | |
43 'intergenic_variant': 'IGR', | |
44 'intergenic_region': 'IGR', | |
45 'regulatory_region_variant': 'IGR', | |
46 'regulatory_region': 'IGR', | |
47 'TF_binding_site_variant': 'IGR', | |
48 'upstream_gene_variant': "5'Flank", | |
49 'downstream_gene_variant': "3'Flank", | |
50 } | |
51 | |
52 | |
53 class VariantEffect(): | |
54 def __init__(self, variant_type): | |
55 self.variant_type = variant_type.capitalize() | |
56 assert self.variant_type in ['Snp', 'Ins', 'Del'] | |
57 | |
58 def __getitem__(self, so_effect): | |
59 if so_effect not in so_to_maf or ( | |
60 'frame' in so_effect and self.variant_type == 'Snp' | |
61 ): | |
62 return 'Targeted_Region' | |
63 | |
64 ret = so_to_maf[so_effect] | |
65 if ret == 'Frame_Shift_': | |
66 ret += self.variant_type | |
67 return ret | |
68 | |
69 | |
70 infile = sys.argv[1] | |
71 if len(sys.argv) > 2: | |
72 tumor_sample_name = sys.argv[2] | |
73 if len(sys.argv) > 3: | |
74 normal_sample_name = sys.argv[3] | |
75 | |
76 start_pos_idx = None | |
77 ref_idx = None | |
78 alt_idx = None | |
79 variant_type_idx = None | |
80 variant_classification_idx = None | |
81 gt_alt_depths_idx = {} | |
82 gt_ref_depths_idx = {} | |
83 gts_idx = {} | |
84 samples = set() | |
85 required_fields = [ | |
86 'Hugo_Symbol', | |
87 'NCBI_Build', | |
88 'Variant_Type', | |
89 'Variant_Classification', | |
90 'Tumor_Sample_Barcode', | |
91 'HGVSp_Short' | |
92 ] | |
93 | |
94 | |
95 with open(infile) as data_in: | |
96 cols = data_in.readline().rstrip().split('\t') | |
97 for field in required_fields: | |
98 if field not in cols: | |
99 raise IndexError( | |
100 'Cannot generate valid MAF without the following input ' | |
101 'columns: {0}.\n' | |
102 'Missing column: "{1}"' | |
103 .format(required_fields, field) | |
104 ) | |
105 for i, col in enumerate(cols): | |
106 if col == 'Variant_Type': | |
107 variant_type_idx = i | |
108 elif col == 'Variant_Classification': | |
109 variant_classification_idx = i | |
110 elif col == 'Start_Position': | |
111 start_pos_idx = i | |
112 elif col == 'Reference_Allele': | |
113 ref_idx = i | |
114 elif col == 'alt': | |
115 alt_idx = i | |
116 else: | |
117 column, _, sample = col.partition('.') | |
118 if sample: | |
119 if column == 'gt_alt_depths': | |
120 gt_alt_depths_idx[sample] = i | |
121 elif column == 'gt_ref_depths': | |
122 gt_ref_depths_idx[sample] = i | |
123 elif column == 'gts': | |
124 gts_idx[sample] = i | |
125 else: | |
126 # not a recognized sample-specific column | |
127 continue | |
128 samples.add(sample) | |
129 | |
130 if ref_idx is None: | |
131 raise IndexError('Input file does not have a column "Reference_Allele".') | |
132 | |
133 if not tumor_sample_name: | |
134 if normal_sample_name: | |
135 raise ValueError( | |
136 'Normal sample name requires the tumor sample name to be ' | |
137 'specified, too.' | |
138 ) | |
139 if len(samples) > 1: | |
140 raise ValueError( | |
141 'A tumor sample name is required with more than one sample ' | |
142 'in the input.' | |
143 ) | |
144 if samples: | |
145 # There is a single sample with genotype data. | |
146 # Assume its the tumor sample. | |
147 tumor_sample_name = next(iter(samples)) | |
148 else: | |
149 if tumor_sample_name not in samples: | |
150 raise ValueError( | |
151 'Could not find information about the specified tumor sample ' | |
152 'in the input.' | |
153 ) | |
154 if tumor_sample_name == normal_sample_name: | |
155 raise ValueError( | |
156 'Need different names for the normal and the tumor sample.' | |
157 ) | |
158 | |
159 if normal_sample_name and normal_sample_name not in samples: | |
160 raise ValueError( | |
161 'Could not find information about the specified normal sample ' | |
162 'in the input.' | |
163 ) | |
164 | |
165 # All input data checks passed! | |
166 # Now extract just the relevant index numbers for the tumor/normal pair | |
167 gts_idx = ( | |
168 gts_idx.get(tumor_sample_name, alt_idx), | |
169 gts_idx.get(normal_sample_name) | |
170 ) | |
171 gt_alt_depths_idx = ( | |
172 gt_alt_depths_idx.get(tumor_sample_name), | |
173 gt_alt_depths_idx.get(normal_sample_name) | |
174 ) | |
175 gt_ref_depths_idx = ( | |
176 gt_ref_depths_idx.get(tumor_sample_name), | |
177 gt_ref_depths_idx.get(normal_sample_name) | |
178 ) | |
179 | |
180 # Echo all MAF column names | |
181 cols_to_print = [] | |
182 for n in range(len(cols)): | |
183 if n in gts_idx: | |
184 continue | |
185 if n in gt_alt_depths_idx: | |
186 continue | |
187 if n in gt_ref_depths_idx: | |
188 continue | |
189 if n != alt_idx: | |
190 cols_to_print.append(n) | |
191 | |
192 print('\t'.join([cols[n] for n in cols_to_print])) | |
193 | |
194 for line in data_in: | |
195 cols = line.rstrip().split('\t') | |
196 | |
197 gt_alt_depths = [ | |
198 int(cols[ad_idx]) if ad_idx else '' | |
199 for ad_idx in gt_alt_depths_idx | |
200 ] | |
201 gt_ref_depths = [ | |
202 int(cols[rd_idx]) if rd_idx else '' | |
203 for rd_idx in gt_ref_depths_idx | |
204 ] | |
205 | |
206 gts = [ | |
207 ['', ''], | |
208 ['', ''] | |
209 ] | |
210 for n, gt_idx in enumerate(gts_idx): | |
211 if gt_idx: | |
212 gt_sep = '/' if '/' in cols[gt_idx] else '|' | |
213 allele1, _, allele2 = [ | |
214 '' if allele == '.' else allele | |
215 for allele in cols[gt_idx].partition(gt_sep) | |
216 ] | |
217 # follow cBioportal recommendation to leave allele1 empty | |
218 # when information is not avaliable | |
219 if not allele2: | |
220 gts[n] = [allele2, allele1] | |
221 else: | |
222 gts[n] = [allele1, allele2] | |
223 if not gts: | |
224 gts = [['', ''], ['', '']] | |
225 | |
226 if cols[variant_type_idx].lower() in ['ins', 'del']: | |
227 # transform VCF-style indel representations into MAF ones | |
228 ref_allele = cols[ref_idx] | |
229 for n, nucs in enumerate( | |
230 zip( | |
231 ref_allele, | |
232 *[allele for gt in gts for allele in gt if allele] | |
233 ) | |
234 ): | |
235 if any(nuc != nucs[0] for nuc in nucs[1:]): | |
236 break | |
237 else: | |
238 n += 1 | |
239 if n > 0: | |
240 cols[ref_idx] = cols[ref_idx][n:] or '-' | |
241 for gt in gts: | |
242 for idx, allele in enumerate(gt): | |
243 if allele: | |
244 gt[idx] = allele[n:] or '-' | |
245 if cols[ref_idx] == '-': | |
246 n -= 1 | |
247 cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n) | |
248 | |
249 # in-place substitution of so_effect with MAF effect | |
250 cols[variant_classification_idx] = VariantEffect( | |
251 cols[variant_type_idx] | |
252 )[cols[variant_classification_idx]] | |
253 ret_line = '\t'.join([cols[n] for n in cols_to_print]) | |
254 | |
255 field_formatters = { | |
256 'tumor_seq_allele1': gts[0][0], | |
257 'tumor_seq_allele2': gts[0][1], | |
258 'match_norm_seq_allele1': gts[1][0], | |
259 'match_norm_seq_allele2': gts[1][1], | |
260 't_alt_count': gt_alt_depths[0], | |
261 'n_alt_count': gt_alt_depths[1], | |
262 't_ref_count': gt_ref_depths[0], | |
263 'n_ref_count': gt_ref_depths[1], | |
264 } | |
265 | |
266 print( | |
267 # use safe_substitute here to avoid key errors with column content | |
268 # looking like unknown placeholders | |
269 string.Template(ret_line).safe_substitute(field_formatters) | |
270 ) |