Mercurial > repos > iuc > gemini_lof_sieve
comparison gemini_mafify.py @ 7:2969a1f43b7d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246"
| author | iuc |
|---|---|
| date | Tue, 10 Mar 2020 06:15:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 6:295814260d95 | 7:2969a1f43b7d |
|---|---|
| 1 import string | |
| 2 import sys | |
| 3 | |
| 4 | |
| 5 so_to_maf = { | |
| 6 'splice_acceptor_variant': 'Splice_Site', | |
| 7 'splice_donor_variant': 'Splice_Site', | |
| 8 'transcript_ablation': 'Splice_Site', | |
| 9 'exon_loss_variant': 'Splice_Site', | |
| 10 'stop_gained': 'Nonsense_Mutation', | |
| 11 'stop_lost': 'Nonstop_Mutation', | |
| 12 'frameshift_variant': 'Frame_Shift_', | |
| 13 'initiator_codon_variant': 'Translation_Start_Site', | |
| 14 'start_lost': 'Translation_Start_Site', | |
| 15 'inframe_insertion': 'In_Frame_Ins', | |
| 16 'inframe_deletion': 'In_Frame_Del', | |
| 17 'conservative_inframe_insertion': 'In_Frame_Ins', | |
| 18 'conservative_inframe_deletion': 'In_Frame_Del', | |
| 19 'disruptive_inframe_insertion': 'In_Frame_Ins', | |
| 20 'disruptive_inframe_deletion': 'In_Frame_Del', | |
| 21 'missense_variant': 'Missense_Mutation', | |
| 22 'coding_sequence_variant': 'Missense_Mutation', | |
| 23 'conservative_missense_variant': 'Missense_Mutation', | |
| 24 'rare_amino_acid_variant': 'Missense_Mutation', | |
| 25 'transcript_amplification': 'Intron', | |
| 26 'intron_variant': 'Intron', | |
| 27 'INTRAGENIC': 'Intron', | |
| 28 'intragenic_variant': 'Intron', | |
| 29 'splice_region_variant': 'Splice_Region', | |
| 30 'mature_miRNA_variant': 'RNA', | |
| 31 'exon_variant': 'RNA', | |
| 32 'non_coding_exon_variant': 'RNA', | |
| 33 'non_coding_transcript_exon_variant': 'RNA', | |
| 34 'non_coding_transcript_variant': 'RNA', | |
| 35 'nc_transcript_variant': 'RNA', | |
| 36 'stop_retained_variant': 'Silent', | |
| 37 'synonymous_variant': 'Silent', | |
| 38 'NMD_transcript_variant': 'Silent', | |
| 39 'incomplete_terminal_codon_variant': 'Silent', | |
| 40 '5_prime_UTR_variant': "5'UTR", | |
| 41 '5_prime_UTR_premature_start_codon_gain_variant': "5'UTR", | |
| 42 '3_prime_UTR_variant': "3'UTR", | |
| 43 'intergenic_variant': 'IGR', | |
| 44 'intergenic_region': 'IGR', | |
| 45 'regulatory_region_variant': 'IGR', | |
| 46 'regulatory_region': 'IGR', | |
| 47 'TF_binding_site_variant': 'IGR', | |
| 48 'upstream_gene_variant': "5'Flank", | |
| 49 'downstream_gene_variant': "3'Flank", | |
| 50 } | |
| 51 | |
| 52 | |
| 53 class VariantEffect(): | |
| 54 def __init__(self, variant_type): | |
| 55 self.variant_type = variant_type.capitalize() | |
| 56 assert self.variant_type in ['Snp', 'Ins', 'Del'] | |
| 57 | |
| 58 def __getitem__(self, so_effect): | |
| 59 if so_effect not in so_to_maf or ( | |
| 60 'frame' in so_effect and self.variant_type == 'Snp' | |
| 61 ): | |
| 62 return 'Targeted_Region' | |
| 63 | |
| 64 ret = so_to_maf[so_effect] | |
| 65 if ret == 'Frame_Shift_': | |
| 66 ret += self.variant_type | |
| 67 return ret | |
| 68 | |
| 69 | |
| 70 infile = sys.argv[1] | |
| 71 if len(sys.argv) > 2: | |
| 72 tumor_sample_name = sys.argv[2] | |
| 73 if len(sys.argv) > 3: | |
| 74 normal_sample_name = sys.argv[3] | |
| 75 | |
| 76 start_pos_idx = None | |
| 77 ref_idx = None | |
| 78 alt_idx = None | |
| 79 variant_type_idx = None | |
| 80 variant_classification_idx = None | |
| 81 gt_alt_depths_idx = {} | |
| 82 gt_ref_depths_idx = {} | |
| 83 gts_idx = {} | |
| 84 samples = set() | |
| 85 required_fields = [ | |
| 86 'Hugo_Symbol', | |
| 87 'NCBI_Build', | |
| 88 'Variant_Type', | |
| 89 'Variant_Classification', | |
| 90 'Tumor_Sample_Barcode', | |
| 91 'HGVSp_Short' | |
| 92 ] | |
| 93 | |
| 94 | |
| 95 with open(infile) as data_in: | |
| 96 cols = data_in.readline().rstrip().split('\t') | |
| 97 for field in required_fields: | |
| 98 if field not in cols: | |
| 99 raise IndexError( | |
| 100 'Cannot generate valid MAF without the following input ' | |
| 101 'columns: {0}.\n' | |
| 102 'Missing column: "{1}"' | |
| 103 .format(required_fields, field) | |
| 104 ) | |
| 105 for i, col in enumerate(cols): | |
| 106 if col == 'Variant_Type': | |
| 107 variant_type_idx = i | |
| 108 elif col == 'Variant_Classification': | |
| 109 variant_classification_idx = i | |
| 110 elif col == 'Start_Position': | |
| 111 start_pos_idx = i | |
| 112 elif col == 'Reference_Allele': | |
| 113 ref_idx = i | |
| 114 elif col == 'alt': | |
| 115 alt_idx = i | |
| 116 else: | |
| 117 column, _, sample = col.partition('.') | |
| 118 if sample: | |
| 119 if column == 'gt_alt_depths': | |
| 120 gt_alt_depths_idx[sample] = i | |
| 121 elif column == 'gt_ref_depths': | |
| 122 gt_ref_depths_idx[sample] = i | |
| 123 elif column == 'gts': | |
| 124 gts_idx[sample] = i | |
| 125 else: | |
| 126 # not a recognized sample-specific column | |
| 127 continue | |
| 128 samples.add(sample) | |
| 129 | |
| 130 if ref_idx is None: | |
| 131 raise IndexError('Input file does not have a column "Reference_Allele".') | |
| 132 | |
| 133 if not tumor_sample_name: | |
| 134 if normal_sample_name: | |
| 135 raise ValueError( | |
| 136 'Normal sample name requires the tumor sample name to be ' | |
| 137 'specified, too.' | |
| 138 ) | |
| 139 if len(samples) > 1: | |
| 140 raise ValueError( | |
| 141 'A tumor sample name is required with more than one sample ' | |
| 142 'in the input.' | |
| 143 ) | |
| 144 if samples: | |
| 145 # There is a single sample with genotype data. | |
| 146 # Assume its the tumor sample. | |
| 147 tumor_sample_name = next(iter(samples)) | |
| 148 else: | |
| 149 if tumor_sample_name not in samples: | |
| 150 raise ValueError( | |
| 151 'Could not find information about the specified tumor sample ' | |
| 152 'in the input.' | |
| 153 ) | |
| 154 if tumor_sample_name == normal_sample_name: | |
| 155 raise ValueError( | |
| 156 'Need different names for the normal and the tumor sample.' | |
| 157 ) | |
| 158 | |
| 159 if normal_sample_name and normal_sample_name not in samples: | |
| 160 raise ValueError( | |
| 161 'Could not find information about the specified normal sample ' | |
| 162 'in the input.' | |
| 163 ) | |
| 164 | |
| 165 # All input data checks passed! | |
| 166 # Now extract just the relevant index numbers for the tumor/normal pair | |
| 167 gts_idx = ( | |
| 168 gts_idx.get(tumor_sample_name, alt_idx), | |
| 169 gts_idx.get(normal_sample_name) | |
| 170 ) | |
| 171 gt_alt_depths_idx = ( | |
| 172 gt_alt_depths_idx.get(tumor_sample_name), | |
| 173 gt_alt_depths_idx.get(normal_sample_name) | |
| 174 ) | |
| 175 gt_ref_depths_idx = ( | |
| 176 gt_ref_depths_idx.get(tumor_sample_name), | |
| 177 gt_ref_depths_idx.get(normal_sample_name) | |
| 178 ) | |
| 179 | |
| 180 # Echo all MAF column names | |
| 181 cols_to_print = [] | |
| 182 for n in range(len(cols)): | |
| 183 if n in gts_idx: | |
| 184 continue | |
| 185 if n in gt_alt_depths_idx: | |
| 186 continue | |
| 187 if n in gt_ref_depths_idx: | |
| 188 continue | |
| 189 if n != alt_idx: | |
| 190 cols_to_print.append(n) | |
| 191 | |
| 192 print('\t'.join([cols[n] for n in cols_to_print])) | |
| 193 | |
| 194 for line in data_in: | |
| 195 cols = line.rstrip().split('\t') | |
| 196 | |
| 197 gt_alt_depths = [ | |
| 198 int(cols[ad_idx]) if ad_idx else '' | |
| 199 for ad_idx in gt_alt_depths_idx | |
| 200 ] | |
| 201 gt_ref_depths = [ | |
| 202 int(cols[rd_idx]) if rd_idx else '' | |
| 203 for rd_idx in gt_ref_depths_idx | |
| 204 ] | |
| 205 | |
| 206 gts = [ | |
| 207 ['', ''], | |
| 208 ['', ''] | |
| 209 ] | |
| 210 for n, gt_idx in enumerate(gts_idx): | |
| 211 if gt_idx: | |
| 212 gt_sep = '/' if '/' in cols[gt_idx] else '|' | |
| 213 allele1, _, allele2 = [ | |
| 214 '' if allele == '.' else allele | |
| 215 for allele in cols[gt_idx].partition(gt_sep) | |
| 216 ] | |
| 217 # follow cBioportal recommendation to leave allele1 empty | |
| 218 # when information is not avaliable | |
| 219 if not allele2: | |
| 220 gts[n] = [allele2, allele1] | |
| 221 else: | |
| 222 gts[n] = [allele1, allele2] | |
| 223 if not gts: | |
| 224 gts = [['', ''], ['', '']] | |
| 225 | |
| 226 if cols[variant_type_idx].lower() in ['ins', 'del']: | |
| 227 # transform VCF-style indel representations into MAF ones | |
| 228 ref_allele = cols[ref_idx] | |
| 229 for n, nucs in enumerate( | |
| 230 zip( | |
| 231 ref_allele, | |
| 232 *[allele for gt in gts for allele in gt if allele] | |
| 233 ) | |
| 234 ): | |
| 235 if any(nuc != nucs[0] for nuc in nucs[1:]): | |
| 236 break | |
| 237 else: | |
| 238 n += 1 | |
| 239 if n > 0: | |
| 240 cols[ref_idx] = cols[ref_idx][n:] or '-' | |
| 241 for gt in gts: | |
| 242 for idx, allele in enumerate(gt): | |
| 243 if allele: | |
| 244 gt[idx] = allele[n:] or '-' | |
| 245 if cols[ref_idx] == '-': | |
| 246 n -= 1 | |
| 247 cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n) | |
| 248 | |
| 249 # in-place substitution of so_effect with MAF effect | |
| 250 cols[variant_classification_idx] = VariantEffect( | |
| 251 cols[variant_type_idx] | |
| 252 )[cols[variant_classification_idx]] | |
| 253 ret_line = '\t'.join([cols[n] for n in cols_to_print]) | |
| 254 | |
| 255 field_formatters = { | |
| 256 'tumor_seq_allele1': gts[0][0], | |
| 257 'tumor_seq_allele2': gts[0][1], | |
| 258 'match_norm_seq_allele1': gts[1][0], | |
| 259 'match_norm_seq_allele2': gts[1][1], | |
| 260 't_alt_count': gt_alt_depths[0], | |
| 261 'n_alt_count': gt_alt_depths[1], | |
| 262 't_ref_count': gt_ref_depths[0], | |
| 263 'n_ref_count': gt_ref_depths[1], | |
| 264 } | |
| 265 | |
| 266 print( | |
| 267 # use safe_substitute here to avoid key errors with column content | |
| 268 # looking like unknown placeholders | |
| 269 string.Template(ret_line).safe_substitute(field_formatters) | |
| 270 ) |
