# HG changeset patch # User iuc # Date 1583835295 14400 # Node ID da74170c55c74bc93f7fed548553de8477ac4f91 # Parent 840fb4850be36d39e6fba28c722ec8c84b3a54d7 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 5ea789e5342c3ad1afd2e0068c88f2b6dc4f7246" diff -r 840fb4850be3 -r da74170c55c7 gemini_mafify.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gemini_mafify.py Tue Mar 10 06:14:55 2020 -0400 @@ -0,0 +1,270 @@ +import string +import sys + + +so_to_maf = { + 'splice_acceptor_variant': 'Splice_Site', + 'splice_donor_variant': 'Splice_Site', + 'transcript_ablation': 'Splice_Site', + 'exon_loss_variant': 'Splice_Site', + 'stop_gained': 'Nonsense_Mutation', + 'stop_lost': 'Nonstop_Mutation', + 'frameshift_variant': 'Frame_Shift_', + 'initiator_codon_variant': 'Translation_Start_Site', + 'start_lost': 'Translation_Start_Site', + 'inframe_insertion': 'In_Frame_Ins', + 'inframe_deletion': 'In_Frame_Del', + 'conservative_inframe_insertion': 'In_Frame_Ins', + 'conservative_inframe_deletion': 'In_Frame_Del', + 'disruptive_inframe_insertion': 'In_Frame_Ins', + 'disruptive_inframe_deletion': 'In_Frame_Del', + 'missense_variant': 'Missense_Mutation', + 'coding_sequence_variant': 'Missense_Mutation', + 'conservative_missense_variant': 'Missense_Mutation', + 'rare_amino_acid_variant': 'Missense_Mutation', + 'transcript_amplification': 'Intron', + 'intron_variant': 'Intron', + 'INTRAGENIC': 'Intron', + 'intragenic_variant': 'Intron', + 'splice_region_variant': 'Splice_Region', + 'mature_miRNA_variant': 'RNA', + 'exon_variant': 'RNA', + 'non_coding_exon_variant': 'RNA', + 'non_coding_transcript_exon_variant': 'RNA', + 'non_coding_transcript_variant': 'RNA', + 'nc_transcript_variant': 'RNA', + 'stop_retained_variant': 'Silent', + 'synonymous_variant': 'Silent', + 'NMD_transcript_variant': 'Silent', + 'incomplete_terminal_codon_variant': 'Silent', + '5_prime_UTR_variant': "5'UTR", + '5_prime_UTR_premature_start_codon_gain_variant': "5'UTR", + '3_prime_UTR_variant': "3'UTR", + 'intergenic_variant': 'IGR', + 'intergenic_region': 'IGR', + 'regulatory_region_variant': 'IGR', + 'regulatory_region': 'IGR', + 'TF_binding_site_variant': 'IGR', + 'upstream_gene_variant': "5'Flank", + 'downstream_gene_variant': "3'Flank", +} + + +class VariantEffect(): + def __init__(self, variant_type): + self.variant_type = variant_type.capitalize() + assert self.variant_type in ['Snp', 'Ins', 'Del'] + + def __getitem__(self, so_effect): + if so_effect not in so_to_maf or ( + 'frame' in so_effect and self.variant_type == 'Snp' + ): + return 'Targeted_Region' + + ret = so_to_maf[so_effect] + if ret == 'Frame_Shift_': + ret += self.variant_type + return ret + + +infile = sys.argv[1] +if len(sys.argv) > 2: + tumor_sample_name = sys.argv[2] +if len(sys.argv) > 3: + normal_sample_name = sys.argv[3] + +start_pos_idx = None +ref_idx = None +alt_idx = None +variant_type_idx = None +variant_classification_idx = None +gt_alt_depths_idx = {} +gt_ref_depths_idx = {} +gts_idx = {} +samples = set() +required_fields = [ + 'Hugo_Symbol', + 'NCBI_Build', + 'Variant_Type', + 'Variant_Classification', + 'Tumor_Sample_Barcode', + 'HGVSp_Short' +] + + +with open(infile) as data_in: + cols = data_in.readline().rstrip().split('\t') + for field in required_fields: + if field not in cols: + raise IndexError( + 'Cannot generate valid MAF without the following input ' + 'columns: {0}.\n' + 'Missing column: "{1}"' + .format(required_fields, field) + ) + for i, col in enumerate(cols): + if col == 'Variant_Type': + variant_type_idx = i + elif col == 'Variant_Classification': + variant_classification_idx = i + elif col == 'Start_Position': + start_pos_idx = i + elif col == 'Reference_Allele': + ref_idx = i + elif col == 'alt': + alt_idx = i + else: + column, _, sample = col.partition('.') + if sample: + if column == 'gt_alt_depths': + gt_alt_depths_idx[sample] = i + elif column == 'gt_ref_depths': + gt_ref_depths_idx[sample] = i + elif column == 'gts': + gts_idx[sample] = i + else: + # not a recognized sample-specific column + continue + samples.add(sample) + + if ref_idx is None: + raise IndexError('Input file does not have a column "Reference_Allele".') + + if not tumor_sample_name: + if normal_sample_name: + raise ValueError( + 'Normal sample name requires the tumor sample name to be ' + 'specified, too.' + ) + if len(samples) > 1: + raise ValueError( + 'A tumor sample name is required with more than one sample ' + 'in the input.' + ) + if samples: + # There is a single sample with genotype data. + # Assume its the tumor sample. + tumor_sample_name = next(iter(samples)) + else: + if tumor_sample_name not in samples: + raise ValueError( + 'Could not find information about the specified tumor sample ' + 'in the input.' + ) + if tumor_sample_name == normal_sample_name: + raise ValueError( + 'Need different names for the normal and the tumor sample.' + ) + + if normal_sample_name and normal_sample_name not in samples: + raise ValueError( + 'Could not find information about the specified normal sample ' + 'in the input.' + ) + + # All input data checks passed! + # Now extract just the relevant index numbers for the tumor/normal pair + gts_idx = ( + gts_idx.get(tumor_sample_name, alt_idx), + gts_idx.get(normal_sample_name) + ) + gt_alt_depths_idx = ( + gt_alt_depths_idx.get(tumor_sample_name), + gt_alt_depths_idx.get(normal_sample_name) + ) + gt_ref_depths_idx = ( + gt_ref_depths_idx.get(tumor_sample_name), + gt_ref_depths_idx.get(normal_sample_name) + ) + + # Echo all MAF column names + cols_to_print = [] + for n in range(len(cols)): + if n in gts_idx: + continue + if n in gt_alt_depths_idx: + continue + if n in gt_ref_depths_idx: + continue + if n != alt_idx: + cols_to_print.append(n) + + print('\t'.join([cols[n] for n in cols_to_print])) + + for line in data_in: + cols = line.rstrip().split('\t') + + gt_alt_depths = [ + int(cols[ad_idx]) if ad_idx else '' + for ad_idx in gt_alt_depths_idx + ] + gt_ref_depths = [ + int(cols[rd_idx]) if rd_idx else '' + for rd_idx in gt_ref_depths_idx + ] + + gts = [ + ['', ''], + ['', ''] + ] + for n, gt_idx in enumerate(gts_idx): + if gt_idx: + gt_sep = '/' if '/' in cols[gt_idx] else '|' + allele1, _, allele2 = [ + '' if allele == '.' else allele + for allele in cols[gt_idx].partition(gt_sep) + ] + # follow cBioportal recommendation to leave allele1 empty + # when information is not avaliable + if not allele2: + gts[n] = [allele2, allele1] + else: + gts[n] = [allele1, allele2] + if not gts: + gts = [['', ''], ['', '']] + + if cols[variant_type_idx].lower() in ['ins', 'del']: + # transform VCF-style indel representations into MAF ones + ref_allele = cols[ref_idx] + for n, nucs in enumerate( + zip( + ref_allele, + *[allele for gt in gts for allele in gt if allele] + ) + ): + if any(nuc != nucs[0] for nuc in nucs[1:]): + break + else: + n += 1 + if n > 0: + cols[ref_idx] = cols[ref_idx][n:] or '-' + for gt in gts: + for idx, allele in enumerate(gt): + if allele: + gt[idx] = allele[n:] or '-' + if cols[ref_idx] == '-': + n -= 1 + cols[start_pos_idx] = str(int(cols[start_pos_idx]) + n) + + # in-place substitution of so_effect with MAF effect + cols[variant_classification_idx] = VariantEffect( + cols[variant_type_idx] + )[cols[variant_classification_idx]] + ret_line = '\t'.join([cols[n] for n in cols_to_print]) + + field_formatters = { + 'tumor_seq_allele1': gts[0][0], + 'tumor_seq_allele2': gts[0][1], + 'match_norm_seq_allele1': gts[1][0], + 'match_norm_seq_allele2': gts[1][1], + 't_alt_count': gt_alt_depths[0], + 'n_alt_count': gt_alt_depths[1], + 't_ref_count': gt_ref_depths[0], + 'n_ref_count': gt_ref_depths[1], + } + + print( + # use safe_substitute here to avoid key errors with column content + # looking like unknown placeholders + string.Template(ret_line).safe_substitute(field_formatters) + ) diff -r 840fb4850be3 -r da74170c55c7 gemini_query.xml --- a/gemini_query.xml Fri Jan 24 17:31:00 2020 -0500 +++ b/gemini_query.xml Tue Mar 10 06:14:55 2020 -0400 @@ -1,4 +1,4 @@ - + Querying the GEMINI database gemini_macros.xml @@ -27,6 +27,13 @@ + + + + @@ -69,7 +76,7 @@ #else: affected #end if - #else: + #elif str($query.oformat.report.format) != 'maf': --format ${query.oformat.report.format} #end if @@ -77,11 +84,36 @@ ## build the SQL query string from its components #if str($query.oformat.report.format) in ('vcf', 'tped'): #set $cols = "*" + #elif str($query.oformat.report.format) == 'maf': + #if str($query.oformat.report.tumor_sample_name): + #set $gt_string = 'gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}'.format(str($query.oformat.report.tumor_sample_name)) + #if str($query.oformat.report.normal_sample_name): + #set $gt_string = $gt_string + ', gt_alt_depths.{0}, gt_ref_depths.{0}, gts.{0}'.format(str($query.oformat.report.normal_sample_name)) + #end if + #else: + #set $gt_string = '(gt_alt_depths).(*), (gt_ref_depths).(*), (gts).(*)' + #end if + #if str($query.oformat.report.mutation_status.status_select) == 'custom': + ## Need to quote the user-specified mutation status for the SQL query + #set $mutation_status = '"%s"' % str($query.oformat.report.mutation_status.status_custom) + #elif str($query.oformat.report.mutation_status.status_select) == 'expression': + ## For custom expressions, it is up to the user to ensure valid syntax + #set $mutation_status = str($query.oformat.report.mutation_status.status_expression) + #else: + ## The user selected a fixed value from the list, but + ## it still needs quoting. + #set $mutation_status = '"%s"' % str($query.oformat.report.mutation_status.status_select) + #end if + #set $cols = 'ifnull(g1.gene, "unknown") AS Hugo_Symbol, ifnull(ifnull(g2.entrez_id, g1.entrez_id), "") AS Entrez_Gene_Id, "" AS Center, "37" AS NCBI_Build, replace(v.chrom, "chr", "") AS Chromosome, v.start + 1 AS Start_Position, v.end AS End_Position, "" as Strand, v.impact_so AS Variant_Classification, ifnull(nullif(v.type, "indel"), v.sub_type) AS Variant_Type, v.ref AS Reference_Allele, "${tumor_seq_allele1}" AS Tumor_Seq_Allele1, "${tumor_seq_allele2}" AS Tumor_Seq_Allele2, ifnull(v.rs_ids, ifnull(nullif(ifnull(nullif(v.in_omim = 0 AND v.cosmic_ids IS NULL AND v.max_aaf_all = -1, 1), "novel"), 0), "")) AS dbSNP_RS, "" AS dbSNP_Val_Status, printf("%s", "' + str($query.oformat.report.tumor_sample_id) + '") AS Tumor_Sample_Barcode, printf("%s", "' + str($query.oformat.report.norm_sample_id) + '") AS Matched_Norm_Sample_Barcode, "${match_norm_seq_allele1}" AS Match_Norm_Seq_Allele1, "${match_norm_seq_allele2}" AS Match_Norm_Seq_Allele2, "" AS Tumor_Validation_Allele1, "" AS Tumor_Validation_Allele2, "" AS Match_Norm_Validation_Allele1, "" AS Match_Norm_Validation_Allele2, "" AS Verification_Status, "" AS Validation_Status, ' + $mutation_status + ' AS Mutation_Status, "" AS Sequencing_Phase, "" AS Sequence_Source, "" AS Validation_Method, "" AS Score, "" AS BAM_File, "" AS Sequencer, ifnull(nullif(v.aa_change, ""), "p.=") AS HGVSp_Short, "${t_alt_count}" AS t_alt_count, "${t_ref_count}" AS t_ref_count, "${n_alt_count}" AS n_alt_count, "${n_ref_count}" AS n_ref_count, v.alt, ' + $gt_string #else: #set $report = $query.oformat.report.report @SET_COLS@ #end if #set $q = "SELECT %s FROM variants" % $cols + #if str($query.oformat.report.format) == 'maf': + #set $q = $q + ' v LEFT JOIN (SELECT DISTINCT gene, is_hgnc, hgnc_id, entrez_id, chrom FROM gene_detailed) g1 ON v.gene = g1.gene AND v.chrom = g1.chrom LEFT JOIN (SELECT DISTINCT gene, is_hgnc, hgnc_id, entrez_id, transcript, chrom, ensembl_gene_id FROM gene_detailed) g2 ON g1.gene = g2.gene AND (v.transcript = g2.transcript OR v.transcript=g2.ensembl_gene_id)' + #end if + #set $where_clause_elements = [] #if str($query.filter).strip(): #silent $where_clause_elements.append(str($query.filter).strip()) @@ -95,6 +127,9 @@ #if $where_clause_elements: #set $q = $q + " WHERE " + " AND ".join($where_clause_elements) #end if + #if str($query.oformat.report.format) == 'maf': + #set $q = $q + " GROUP BY v.variant_id" + #end if #if str($query.oformat.report.order_by).strip(): #set $q = $q + " ORDER BY " + str($query.oformat.report.order_by).strip() + str($query.oformat.report.sort_order) #end if @@ -108,6 +143,9 @@ @MULTILN_SQL_EXPR_TO_CMDLN@ '$infile' + #if str($query.oformat.report.format) == 'maf': + > temp.txt && python '$__tool_directory__/gemini_mafify.py' temp.txt '${query.oformat.report.tumor_sample_name}' '${query.oformat.report.normal_sample_name}' + #end if > '$outfile' ]]> @@ -136,6 +174,7 @@ + @@ -194,6 +233,50 @@ + + + + + + value.strip() + + + + + + + + + + + + + + + + + + + + value.strip() + + + + + + value.strip() + + + + + @@ -222,6 +305,7 @@ + @@ -255,6 +339,11 @@ + + + + + @@ -285,6 +374,19 @@ + + + + + +
+ + + + +
+ +