comparison gemini_mafify.py @ 7:9199331c3421 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
author iuc
date Tue, 10 Mar 2020 06:21:36 -0400
parents
children
comparison
equal deleted inserted replaced
6:d5ec6384a3be 7:9199331c3421
1 import string
2 import sys
3
4
5 so_to_maf = {
6 'splice_acceptor_variant': 'Splice_Site',
7 'splice_donor_variant': 'Splice_Site',
8 'transcript_ablation': 'Splice_Site',
9 'exon_loss_variant': 'Splice_Site',
10 'stop_gained': 'Nonsense_Mutation',
11 'stop_lost': 'Nonstop_Mutation',
12 'frameshift_variant': 'Frame_Shift_',
13 'initiator_codon_variant': 'Translation_Start_Site',
14 'start_lost': 'Translation_Start_Site',
15 'inframe_insertion': 'In_Frame_Ins',
16 'inframe_deletion': 'In_Frame_Del',
17 'conservative_inframe_insertion': 'In_Frame_Ins',
18 'conservative_inframe_deletion': 'In_Frame_Del',
19 'disruptive_inframe_insertion': 'In_Frame_Ins',
20 'disruptive_inframe_deletion': 'In_Frame_Del',
21 'missense_variant': 'Missense_Mutation',
22 'coding_sequence_variant': 'Missense_Mutation',
23 'conservative_missense_variant': 'Missense_Mutation',
24 'rare_amino_acid_variant': 'Missense_Mutation',
25 'transcript_amplification': 'Intron',
26 'intron_variant': 'Intron',
27 'INTRAGENIC': 'Intron',
28 'intragenic_variant': 'Intron',
29 'splice_region_variant': 'Splice_Region',
30 'mature_miRNA_variant': 'RNA',
31 'exon_variant': 'RNA',
32 'non_coding_exon_variant': 'RNA',
33 'non_coding_transcript_exon_variant': 'RNA',
34 'non_coding_transcript_variant': 'RNA',
35 'nc_transcript_variant': 'RNA',
36 'stop_retained_variant': 'Silent',
37 'synonymous_variant': 'Silent',
38 'NMD_transcript_variant': 'Silent',
39 'incomplete_terminal_codon_variant': 'Silent',
40 '5_prime_UTR_variant': "5'UTR",
41 '5_prime_UTR_premature_start_codon_gain_variant': "5'UTR",
42 '3_prime_UTR_variant': "3'UTR",
43 'intergenic_variant': 'IGR',
44 'intergenic_region': 'IGR',
45 'regulatory_region_variant': 'IGR',
46 'regulatory_region': 'IGR',
47 'TF_binding_site_variant': 'IGR',
48 'upstream_gene_variant': "5'Flank",
49 'downstream_gene_variant': "3'Flank",
50 }
51
52
53 class VariantEffect():
54 def __init__(self, variant_type):
55 self.variant_type = variant_type.capitalize()
56 assert self.variant_type in ['Snp', 'Ins', 'Del']
57
58 def __getitem__(self, so_effect):
59 if so_effect not in so_to_maf or (
60 'frame' in so_effect and self.variant_type == 'Snp'
61 ):
62 return 'Targeted_Region'
63
64 ret = so_to_maf[so_effect]
65 if ret == 'Frame_Shift_':
66 ret += self.variant_type
67 return ret
68
69
70 infile = sys.argv[1]
71 if len(sys.argv) > 2:
72 tumor_sample_name = sys.argv[2]
73 if len(sys.argv) > 3:
74 normal_sample_name = sys.argv[3]
75
76 start_pos_idx = None
77 ref_idx = None
78 alt_idx = None
79 variant_type_idx = None
80 variant_classification_idx = None
81 gt_alt_depths_idx = {}
82 gt_ref_depths_idx = {}
83 gts_idx = {}
84 samples = set()
85 required_fields = [
86 'Hugo_Symbol',
87 'NCBI_Build',
88 'Variant_Type',
89 'Variant_Classification',
90 'Tumor_Sample_Barcode',
91 'HGVSp_Short'
92 ]
93
94
95 with open(infile) as data_in:
96 cols = data_in.readline().rstrip().split('\t')
97 for field in required_fields:
98 if field not in cols:
99 raise IndexError(
100 'Cannot generate valid MAF without the following input '
101 'columns: {0}.\n'
102 'Missing column: "{1}"'
103 .format(required_fields, field)
104 )
105 for i, col in enumerate(cols):
106 if col == 'Variant_Type':
107 variant_type_idx = i
108 elif col == 'Variant_Classification':
109 variant_classification_idx = i
110 elif col == 'Start_Position':
111 start_pos_idx = i
112 elif col == 'Reference_Allele':
113 ref_idx = i
114 elif col == 'alt':
115 alt_idx = i
116 else:
117 column, _, sample = col.partition('.')
118 if sample:
119 if column == 'gt_alt_depths':
120 gt_alt_depths_idx[sample] = i
121 elif column == 'gt_ref_depths':
122 gt_ref_depths_idx[sample] = i
123 elif column == 'gts':
124 gts_idx[sample] = i
125 else:
126 # not a recognized sample-specific column
127 continue
128 samples.add(sample)
129
130 if ref_idx is None:
131 raise IndexError('Input file does not have a column "Reference_Allele".')
132
133 if not tumor_sample_name:
134 if normal_sample_name:
135 raise ValueError(
136 'Normal sample name requires the tumor sample name to be '
137 'specified, too.'
138 )
139 if len(samples) > 1:
140 raise ValueError(
141 'A tumor sample name is required with more than one sample '
142 'in the input.'
143 )
144 if samples:
145 # There is a single sample with genotype data.
146 # Assume its the tumor sample.
147 tumor_sample_name = next(iter(samples))
148 else:
149 if tumor_sample_name not in samples:
150 raise ValueError(
151 'Could not find information about the specified tumor sample '
152 'in the input.'
153 )
154 if tumor_sample_name == normal_sample_name:
155 raise ValueError(
156 'Need different names for the normal and the tumor sample.'
157 )
158
159 if normal_sample_name and normal_sample_name not in samples:
160 raise ValueError(
161 'Could not find information about the specified normal sample '
162 'in the input.'
163 )
164
165 # All input data checks passed!
166 # Now extract just the relevant index numbers for the tumor/normal pair
167 gts_idx = (
168 gts_idx.get(tumor_sample_name, alt_idx),
169 gts_idx.get(normal_sample_name)
170 )
171 gt_alt_depths_idx = (
172 gt_alt_depths_idx.get(tumor_sample_name),
173 gt_alt_depths_idx.get(normal_sample_name)
174 )
175 gt_ref_depths_idx = (
176 gt_ref_depths_idx.get(tumor_sample_name),
177 gt_ref_depths_idx.get(normal_sample_name)
178 )
179
180 # Echo all MAF column names
181 cols_to_print = []
182 for n in range(len(cols)):
183 if n in gts_idx:
184 continue
185 if n in gt_alt_depths_idx:
186 continue
187 if n in gt_ref_depths_idx:
188 continue
189 if n != alt_idx:
190 cols_to_print.append(n)
191
192 print('\t'.join([cols[n] for n in cols_to_print]))
193
194 for line in data_in:
195 cols = line.rstrip().split('\t')
196
197 gt_alt_depths = [
198 int(cols[ad_idx]) if ad_idx else ''
199 for ad_idx in gt_alt_depths_idx
200 ]
201 gt_ref_depths = [
202 int(cols[rd_idx]) if rd_idx else ''
203 for rd_idx in gt_ref_depths_idx
204 ]
205
206 gts = [
207 ['', ''],
208 ['', '']
209 ]
210 for n, gt_idx in enumerate(gts_idx):
211 if gt_idx:
212 gt_sep = '/' if '/' in cols[gt_idx] else '|'
213 allele1, _, allele2 = [
214 '' if allele == '.' else allele
215 for allele in cols[gt_idx].partition(gt_sep)
216 ]
217 # follow cBioportal recommendation to leave allele1 empty
218 # when information is not avaliable
219 if not allele2:
220 gts[n] = [allele2, allele1]
221 else:
222 gts[n] = [allele1, allele2]
223 if not gts:
224 gts = [['', ''], ['', '']]
225
226 if cols[variant_type_idx].lower() in ['ins', 'del']:
227 # transform VCF-style indel representations into MAF ones
228 ref_allele = cols[ref_idx]
229 for n, nucs in enumerate(
230 zip(
231 ref_allele,
232 *[allele for gt in gts for allele in gt if allele]
233 )
234 ):
235 if any(nuc != nucs[0] for nuc in nucs[1:]):
236 break
237 else:
238 n += 1
239 if n > 0:
240 cols[ref_idx] = cols[ref_idx][n:] or '-'
241 for gt in gts:
242 for idx, allele in enumerate(gt):
243 if allele:
244 gt[idx] = allele[n:] or '-'
245 if cols[ref_idx] == '-':
246 n -= 1
247 cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n)
248
249 # in-place substitution of so_effect with MAF effect
250 cols[variant_classification_idx] = VariantEffect(
251 cols[variant_type_idx]
252 )[cols[variant_classification_idx]]
253 ret_line = '\t'.join([cols[n] for n in cols_to_print])
254
255 field_formatters = {
256 'tumor_seq_allele1': gts[0][0],
257 'tumor_seq_allele2': gts[0][1],
258 'match_norm_seq_allele1': gts[1][0],
259 'match_norm_seq_allele2': gts[1][1],
260 't_alt_count': gt_alt_depths[0],
261 'n_alt_count': gt_alt_depths[1],
262 't_ref_count': gt_ref_depths[0],
263 'n_ref_count': gt_ref_depths[1],
264 }
265
266 print(
267 # use safe_substitute here to avoid key errors with column content
268 # looking like unknown placeholders
269 string.Template(ret_line).safe_substitute(field_formatters)
270 )