# HG changeset patch # User jjohnson # Date 1363111154 14400 # Node ID cdbdac66d6b59037077e88aece171db7c4a324a1 Uploaded diff -r 000000000000 -r cdbdac66d6b5 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Mar 12 13:59:14 2013 -0400 @@ -0,0 +1,4 @@ +snpeff_cds_report reports the protein coding changes that result from frame shift and non_synonymous variants. +It requires the input to be from the MMuFF (Missense Mutation and Frameshift Finder) workflow. +Specifically it requires the input from snpEff ( http://snpeff.sourceforge.net/ ) set with an Ensembl genome build, +which will annotate variants with Ensembl transctipt IDs. diff -r 000000000000 -r cdbdac66d6b5 snpEff_cds_report.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snpEff_cds_report.py Tue Mar 12 13:59:14 2013 -0400 @@ -0,0 +1,801 @@ +#!/usr/bin/python +import sys,os,tempfile,re +import httplib, urllib +import optparse +#import vcfClass +#from vcfClass import * +#import tools +#from tools import * + +debug = False +cds_anchor = 'cds_variation' +aa_anchor = 'aa_variation' + +codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", + "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", + "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", + "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", + "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", + "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", + "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", + "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", + "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", + "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", + "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", + "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", + "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", + "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", + "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", + "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} + +def reverseComplement(seq) : + rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper() + return rev + +def translate(seq) : + rna = seq.upper().replace('T','U') + aa = [] + for i in range(0,len(rna) - 2, 3): + aa.append(codon_map[rna[i:i+3]]) + return ''.join(aa) + +""" +SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand +Queries that request a sequence always return the sequence in the first column regardless of the order of attributes. +""" +query_ccds_template = """ + + + + + + + + + +""" +ccds_filter = '' +query_template = """ + + + + + + __CCDS_FILTER_HERE__ + + + + + + + + + + + + + + + + + + + + + +""" +""" +Columns(19): +['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)'] + + 0 cDNA sequence + 1 Ensembl Gene ID + 2 Ensembl Transcript ID + 3 Strand +- 4 Transcript Start (bp) +- 5 Transcript End (bp) + 6 Exon Chr Start (bp) + 7 Exon Chr End (bp) + 8 CDS Start + 9 CDS End + 10 CDS Length +- 11 Exon Rank in Transcript + 12 5' UTR Start + 13 5' UTR End + 14 3' UTR Start + 15 3' UTR End +- 16 Ensembl Protein ID +- 17 Gene Start (bp) +- 18 Gene End (bp) +""" + +# return {transcript_id : ccds_id} +def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids): + ccds_dict = dict() + if transcript_ids: + query = query_ccds_template + query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) + query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query) + params = urllib.urlencode({'query':query}) + headers = {"Accept": "text/plain"} + if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query) + try: + if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) + conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') + conn.request("POST", "/biomart/martservice", params, headers) + response = conn.getresponse() + data = response.read() + if len(data) > 0: + # if debug: print >> sys.stdout, "%s\n\n" % data + lines = data.split('\n') + for line in lines: + fields = line.split('\t') + if len(fields) == 2: + (transcript_id,ccds) = fields + ccds_dict[transcript_id] = ccds + if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines) + except Exception, e: + sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) ) + return ccds_dict + +def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True): + query = query_template + query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) + query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query) + query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query) + return query + +# return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence] +def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True): + transcript_id = snpEffect.transcript + chr_pos = snpEffect.pos + query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds) + if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query) + params = urllib.urlencode({'query':query}) + headers = {"Accept": "text/plain"} + pos = snpEffect.pos + cds_pos = None # zero based offset + coding_sequence = '' + cds_ref = snpEffect.ref + cds_alt = snpEffect.alt + try: + if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) + conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') + conn.request("POST", "/biomart/martservice", params, headers) + response = conn.getresponse() + data = response.read() + if len(data) > 0: + # if debug: print >> sys.stdout, "%s\n\n" % data + lines = data.split('\n') + # Use the header line to determine the order of fields + line = lines[0] + # if debug: print >> sys.stdout, "Headers:\n%s\n" % line + colnames = line.split('\t') + # for i in range(len(colnames)): + # + if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames) + for line in lines[1:]: + if line == None or len(line) < 2: + continue + field = line.split('\t') + if len(field) != len(colnames): + continue + if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line) + if field[10] == None or len(field[10]) < 1: + continue + if debug: + for i in range(len(colnames)): + print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i]) + snpEffect.gene_id = field[1] + strand = '+' if int(field[3]) > 0 else '-' + snpEffect.strand = strand + # coding_sequence is first + if len(field) > 10 and re.match('[ATGCN]+',field[0]): + if field[10] == None or len(field[10]) < 1: + continue + cdna_seq = field[0] + in_utr = False + # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence + utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-')) + utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-')) + utr_5_lens = [] + for i in range(len(utr_5_starts)): + utr_5_start = int(utr_5_starts[i]) + utr_5_end = int(utr_5_ends[i]) + utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1) + if utr_5_start <= pos <= utr_5_end : + in_utr = True + if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end); + break + utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-')) + utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-')) + for i in range(len(utr_3_starts)): + utr_3_start = int(utr_3_starts[i]) + utr_3_end = int(utr_3_ends[i]) + if utr_3_start <= pos <= utr_3_end : + in_utr = True + if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end); + break + # Get coding sequence from cdna + cdna_length = int(field[10]) + cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']')) + cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']')) + cdna_lens = [] + for i in range(len(cdna_coding_start)): + cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1) + if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) + cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet + for i in range(len(cdna_coding_start)): + if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1] + coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]] + snpEffect.cds = coding_sequence + if coding_sequence and len(coding_sequence) >= 3: + snpEffect.cds_stop_codon = coding_sequence[-3:] + if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence) + if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) + if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens) + if not in_utr: + exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-')) + exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-')) + exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-')) + exon_lens = [] + for i in range(len(exon_start)): + exon_lens.append(exon_end[i] - exon_start[i] + 1) + if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens) + if debug: + bp_tot = 0 + for i in range(len(exon_start)): + exon_len = exon_end[i] - exon_start[i] + 1 + bp_tot += exon_len + print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot) + cds_idx = 0 + for i in range(len(exon_start)): + # Does this exon have cds bases - i.e. not entirely in UTR + if len(utr_5_ends) > 0: + if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]: + continue + if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]: + continue + exon_len = exon_end[i] - exon_start[i] + 1 + if exon_start[i] <= pos <= exon_end[i]: + if strand == '+': + if cds_idx: # find offset from start of cdna_coding and exon + offset = pos - exon_start[i] + else: # find offset from end of cdna_coding and exon + offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) ) + cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset + else: # negative strand + cds_ref = reverseComplement(snpEffect.ref) + cds_alt = reverseComplement(snpEffect.alt) + offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos + cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1) + snpEffect.cds_pos = cds_pos + snpEffect.cds_ref = cds_ref + snpEffect.cds_alt = cds_alt + return snpEffect + else: + cds_idx += 1 + except Exception, e: + sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) ) + finally: + if conn != None : + conn.close() + return None + +""" + Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) + FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) +""" +class SnpEffect( object ): + report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation'] + def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False): + self.chrom = chrom + self.pos = int(pos) + self.ref = ref + self.alt = alt + self.freq = float(freq) if freq else None + self.depth = int(depth) if depth else None + # From SnpEffect VCF String + self.effect = None + self.effect_impact = None + self.functional_class = None + self.codon_change = None + self.amino_acid_change = None + self.amino_acid_length = None + self.gene_name = None + self.gene_id = None + self.gene_biotype = None + self.coding = None + self.transcript = None + self.transcript_ids = None + self.exon = None + self.cds_stop_codon = None + # retrieve from ensembl + self.strand = None + self.cds_pos = None + self.cds_ref = None + self.cds_alt = None + self.cds = None + self.aa_pos = None + self.aa_len = None + self.alt_stop_codon = None + def chrPos(self): + return "%s:%s" % (self.chrom,self.pos) + def setEffect(self, effect, snpEffVersion = None): + if snpEffVersion and snpEffVersion == '2': + (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9] + else: # SnpEffVersion 3 # adds Amino_Acid_length field + (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10] + self.amino_acid_length = amino_acid_length + self.effect_impact = effect_impact + self.functional_class = functional_class + self.codon_change = codon_change + self.amino_acid_change = amino_acid_change + self.gene_name = gene_name + self.gene_biotype = gene_biotype + self.coding = coding + self.transcript = transcript + self.exon = exon + def parseEffect(self, effect, snpEffVersion = None): + (eff,effs) = effect.rstrip(')').split('(') + self.effect = eff + eff_fields = effs.split('|') + if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields) + self.setEffect(eff_fields, snpEffVersion) + def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False): + getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds) + return len(self.cds) > 0 if self.cds else False + def score(self): + return self.freq * self.depth if self.freq and self.depth else -1 + def getCodingSequence(self): + return self.cds + def getVariantSequence(self): + if self.cds and self.cds_pos and self.cds_ref and self.cds_alt: + return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):] + else: + if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds) + return None + def getCodingAminoSequence(self): + seq = translate(self.cds) if self.cds else None + if seq: + self.aa_len = len(seq) + return seq + def getVariantAminoSequence(self): + variant_seq = self.getVariantSequence() + return translate(variant_seq) if variant_seq else None + def getVariantPeptide(self,toStopCodon=True): + (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() + if var_aa: + if toStopCodon: + end_pos = var_aa_end + else: + end_pos = len(var_aa) + novel_peptide = var_aa[var_aa_pos:end_pos] + return novel_peptide + return None + # [preAA,varAA,postAA, varPeptide, varOffset, subAA] + def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True): + (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() + if var_aa: + start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0 + if toStopCodon: + end_pos = var_aa_end + else: + end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end + try: + varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' + if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos) + mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA) + preAA = var_aa[start_pos:var_aa_pos] # N-term side + postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side + novel_peptide = var_aa[start_pos:end_pos] + return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation] + except Exception, e: + sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) ) + return None + # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop] + # positions are 0-based array indexes + def getSequences(self): + coding_dna = self.getCodingSequence() + coding_aa = self.getCodingAminoSequence() + var_dna = self.getVariantSequence() + var_aa = self.getVariantAminoSequence() + var_aa_pos = None + var_aa_end = None + if var_aa: + var_aa_pos = self.cds_pos / 3 + for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))): + if var_aa[j] != coding_aa[j]: + var_aa_pos = j + break + self.aa_pos = var_aa_pos + var_stop = var_aa.find('*',var_aa_pos) + if var_stop < 0: + var_aa_end = len(var_aa) + else: + var_aa_end = var_stop + 1 # include the stop codon + stop_pos = var_stop * 3 + self.alt_stop_codon = var_dna[stop_pos:stop_pos+3] + return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end] + def inHomopolymer(self,polyA_limit): + if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region + ## check if base changes at cds_pos involve A or T + bdiff = None # the difference between the cds_ref and cds_alt + boff = 0 # the offset to the difference + if len(self.cds_ref) < len(self.cds_alt): + bdiff = re.sub(self.cds_ref,'',self.cds_alt) + boff = self.cds_alt.find(bdiff) + elif len(self.cds_alt) < len(self.cds_ref): + bdiff = re.sub(self.cds_alt,'',self.cds_ref) + boff = self.cds_ref.find(bdiff) + if bdiff: + ## check the number of similar base neighboring the change + alt_seq = self.getVariantSequence() + subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))] + ## adjust match length if the cps_pos is within polyA_limit form sequence end + match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit)) + if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff) + ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times + if bdiff.find('A') >= 0: + pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq)) + match = re.match(pat,subseq) + if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') + if match: + return True + if bdiff.find('T') >= 0: + pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq)) + match = re.match(pat,subseq) + if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') + if match: + if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups()) + return True + return False + def getReportHeader(): + return report_headings + def getReportFields(self): + gene_name = self.gene_name if self.gene_name else '' + cds_ref = self.cds_ref if self.cds_ref else '' + cds_alt = self.cds_alt if self.cds_alt else '' + hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False) + if debug: print >> sys.stdout, "HGVS %s" % hgvs + var_aa = self.getVariantPeptide(toStopCodon=True) + freq = "%2.2f" % self.freq if self.freq else '' + depth = "%d" % self.depth if self.depth else '' + aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else '' + aa_len = "%d" % self.aa_len if self.aa_len else '' + gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript + stop_codon = self.alt_stop_codon if self.alt_stop_codon else '' + chrpos = self.chrPos() + aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5] + return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else ''] + +def __main__(): + + def printReportTsv(output_file, snpEffects): + if output_file: + print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings) + for snpEffect in snpEffects: + (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields() + if not snpEffect.effect == 'FRAME_SHIFT': + (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) + novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA) + print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide]) + + """ + http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111 + http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970 + http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774 + """ + def printReportHtml(output_file, detail_dir, snpEffects): + if output_file: + print >> output_file, '\n' + print >> output_file, '\n' + print >> output_file, '\n' + for snpEffect in snpEffects: + (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields() + refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html' + aa_ref = '#'.join([refname,aa_anchor]) + refpath = os.path.join(detail_dir,refname) + try: + ref_file = open(refpath,'w') + printEffDetailHtml(ref_file,snpEffect) + ref_file.close() + except Exception, e: + sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) ) + if snpEffect.effect == 'FRAME_SHIFT': + print >> output_file, '\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide) + else: + (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) + print >> output_file, '\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA) + print >> output_file, '
GeneVariant PositionReferenceVariationPenetrance PercentSequencing DepthTranscript DetailsAA PositionAA ChangeAA Length Stop CodonAA Variation
%s%s%s%s%s%s%s%s%s%s%s%s
%s%s%s%s%s%s%s%s%s%s%s%s%s%s
\n' + print >> output_file, '\n' + + def printEffDetailHtml(output_file,snpEffect,wrap_len=60): + (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() + seq_len = len(coding_seq) + print >> output_file, '\n' + print >> output_file, '\n' + print >> output_file, '\n' % snpEffect.gene_name + print >> output_file, '\n' % (snpEffect.cds_ref,snpEffect.cds_alt) + print >> output_file, '\n' % (snpEffect.gene_id,snpEffect.transcript) + print >> output_file, '\n' % snpEffect.strand + print >> output_file, '\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1)) + print >> output_file, '
Gene:%s
Allele:%s/%s
Transcript:%s|%s
Transcript Strand:%s
Position in transcript:%d
\n' + if alt_aa: + alt_aa_pos = pos / 3 + if coding_aa: + for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))): + if alt_aa[j] != coding_aa[j]: + alt_aa_pos = j + break + alt_aa_end = 0 + for j in range(alt_aa_pos,len(alt_aa)): + if alt_aa[j] == '*': + alt_aa_end = j + break + seq = [] + for j in range(1,wrap_len+1): + seq.append('-' if j % 10 else '|') + hrule = ''.join(seq) + print >> output_file, '' + for i in range(0,seq_len,wrap_len): + e = min(i + wrap_len,seq_len) + sa = i/3 + ea = (i+wrap_len)/3 + print >> output_file, "\n" % (hrule) + print >> output_file, "" % (i+1) + for j in range(i,i + wrap_len): + if j < len(coding_seq): + if pos == j: + print >> output_file, "" % (cds_anchor,coding_seq[j]) + elif pos <= j < pos + len(ref): + print >> output_file, "" % coding_seq[j] + else: + print >> output_file, "" % coding_seq[j] + else: + print >> output_file, "" + print >> output_file, "\n" % e + if coding_aa: + print >> output_file, "" % (sa+1) + for j in range(sa,ea): + if j < len(coding_aa): + print >> output_file, "" % coding_aa[j] + else: + print >> output_file, "" + print >> output_file, "\n" % ea + if alt_aa: + print >> output_file, "" % (sa+1) + for j in range(sa,ea): + if j < len(alt_aa): + if alt_aa_pos == j: + print >> output_file, "" % (aa_anchor,alt_aa[j]) + elif alt_aa_pos < j <= alt_aa_end: + print >> output_file, "" % alt_aa[j] + else: + print >> output_file, "" % alt_aa[j] + else: + print >> output_file, "" + print >> output_file, "\n" % ea + print >> output_file, "" % (i+1) + for j in range(i,i + wrap_len): + if j < len(alt_seq): + if pos <= j < pos + len(alt): + print >> output_file, "" % alt_seq[j] + else: + print >> output_file, "" % alt_seq[j] + else: + print >> output_file, "" + print >> output_file, "\n" % e + print >> output_file, "
%s
%d%s%s%s%d
%d%s%d
%d%s%s%s%d
%d%s%s%d
" + print >> output_file, "" + + def printSeqText(output_file,snpEffect, wrap_len=120): + nw = 6 + (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() + if coding_seq: + seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0) + rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|' + for i in range(0,seq_len,wrap_len): + e = min(i + wrap_len,seq_len) + sa = i/3 + ea = e/3 + print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'') + print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e) + if coding_aa: + print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea) + if alt_aa: + print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea) + if alt_seq: + print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e) + + # Parse the command line options + usage = "Usage: snpEff_cds_report.py filter [options]" + parser = optparse.OptionParser(usage = usage) + parser.add_option("-i", "--in", + action="store", type="string", + dest="vcfFile", help="input vcf file") + parser.add_option("-o", "--out", + action="store", type="string", + dest="output", help="output report file") + parser.add_option("-H", "--html_report", + action="store", type="string", + dest="html_file", help="html output report file") + parser.add_option("-D", "--html_dir", + action="store", type="string", + dest="html_dir", help="html output report file") + parser.add_option("-t", "--tsv_file", + action="store", type="string", + dest="tsv_file", help="TAB Separated Value (.tsv) output report file") + parser.add_option("-e", "--ensembl_host", + action="store", type="string", + dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org') + parser.add_option("-f", "--effects_filter", + action="store", type="string", default=None, + dest="effects_filter", help="a list of effects to filter") + parser.add_option("-O", "--ensembl_dataset", + action="store", type="string", + dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl') + parser.add_option("-p", "--polyA_limit", + action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' ) + parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' ) + parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' ) + parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) + + (options, args) = parser.parse_args() + + debug = options.debug + ensembl_host = options.ensembl_host + + # Check that a single vcf file is given. + if options.vcfFile == None: + parser.print_help() + print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report " + exit(1) + outputFile = None + outputHtml = None + detailDir = None + outputTsv = None + tmpHtmlName = None + tmpHtml = None + effects_list = [] + # Set the output file to stdout if no output file was specified. + if options.output == None and options.html_file == None and options.tsv_file == None: + outputFile = sys.stdout + if options.output != None: + output = os.path.abspath(options.output) + outputFile = open(output, 'w') + if options.tsv_file != None: + output = os.path.abspath(options.tsv_file) + outputTsv = open(output, 'w') + if options.html_file != None: + output = os.path.abspath(options.html_file) + outputHtml = open(output, 'w') + if options.html_dir == None: + detailDir = os.path.dirname(os.path.abspath(output)) + else: + detailDir = options.html_dir + if detailDir: + if not os.path.isdir(detailDir): + os.makedirs(detailDir) + if options.effects_filter: + eff_codes = options.effects_filter.split(',') + for val in eff_codes: + code = val.strip() + if code: + effects_list.append(code) + + # + # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) + try: + snpEffects = [] + homopolymers = [] + polya_count = 0 + polya_list = [] + fh = open( options.vcfFile ) + while True: + line = fh.readline() + if not line: + break #EOF + if line: + if outputFile: + print >> outputFile, "%s\n" % line + line = line.strip() + if len(line) < 1: + continue + elif line.startswith('#'): + # check for SnpEffVersion + """ + ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" + ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" + """ + if line.startswith('##SnpEffVersion='): + m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line) + if m: + SnpEffVersion = m.group(2) + # check for SnpEff Info Description + """ + ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" + ##INFO= + ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" + ##INFO= + """ + pass + else: + if debug: print >> sys.stdout, "%s\n" % line + freq = None + depth = None + fields = line.split('\t') + (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] + if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) + if options.debug: print(fields) + for idx,alt in enumerate(alts.split(',')): + if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts) + if not re.match('^[ATCG]*$',alt): + print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt) + for info_item in info.split(';'): + try: + (key,val) = info_item.split('=',1) + if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333; + freqs = info_item.split(',') + (k,v) = freqs[idx].split('=',1) + freq = v + elif key == 'DP': + depth = val + elif key == 'EFF': + transcript_ids = [] + effects = val.split(',') + ccds_dict = None + for effect in effects: + (eff,effs) = effect.rstrip(')').split('(') + eff_fields = effs.split('|') + if SnpEffVersion == '2': + """ + Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) + EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) + """ + (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9] + else: # SnpEffVersion 3 # adds Amino_Acid_length field + """ + Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) + FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) + """ + (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] + if transcript: + transcript_ids.append(transcript) + if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids) + if options.with_ccds: + ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids) + if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict) + for effect in effects: + snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth) + snpEffect.transcript_ids = transcript_ids + snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion) + if effects_list and not snpEffect.effect in effects_list: + continue + if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict): + if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds): + if snpEffect.cds_pos: + if snpEffect.inHomopolymer(options.polyA_limit): + homopolymers.append(snpEffect) + else: + snpEffects.append(snpEffect) + if outputFile: + print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields()) + printSeqText(outputFile,snpEffect) + if snpEffect.cds and snpEffect.cds_pos and not options.all_effects: + ## Just report the first sequence returned for a vcf line + break + except Exception, e: + sys.stderr.write( "line: %s err: %s\n" % (line,e) ) + print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers) + # Sort snpEffects + snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True) + if outputHtml: + printReportHtml(outputHtml, detailDir, snpEffects) + if outputTsv: + printReportTsv(outputTsv,snpEffects) + except Exception, e: + print(str(e)) + +if __name__ == "__main__": __main__() + diff -r 000000000000 -r cdbdac66d6b5 snpEff_cds_report.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snpEff_cds_report.xml Tue Mar 12 13:59:14 2013 -0400 @@ -0,0 +1,191 @@ + + Report Variant coding sequence changes for SnpEffects + + snpEff_cds_report.py --in $snp_effect_vcf + #if len($ensembl_host.__str__.strip) > 0: + --ensembl_host ${ensembl_host}.archive.ensembl.org + #end if + #if len($ensembl_dataset.__str__.strip) > 0: + --ensembl_dataset $ensembl_dataset + #end if + #if len($polya.__str__.strip) > 0: + --polyA_limit $polya + #end if + #if $effects_filter + --effects_filter=$effects_filter + #end if + $all_effects + $with_ccds + #if $report_format.__str__.find('html') >= 0: + --html_report $html_report + --html_dir $html_report.extra_files_path + #end if + #if $report_format.__str__.find('tsv') >= 0: + --tsv_file $tsv_report + #end if + #if $report_format.__str__.find('detailed') >= 0: + --out $text_report + #end if + + + + + ^([jfmajsond][aepuco][nbrynlgptvc]20[0-9][0-9])?$ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 'html' in report_format + + + 'tsv' in report_format + + + 'detailed' in report_format + + + + + + + + + + + + + + + + + + + + + + + +**SnpEff Variant Detection Ensembl Report** + +Generates the variant sequence and its translation for the variations reported by SnpEffect_. +The SnpEffect_ output must be in the VCF format, and must include Ensembl Transcript IDs. SnpSift_ can be used to filter the SnpEffect_ VCF output. + +The sequences are retrieved from a biomart server using the xml query interface with the Ensembl Transcript ID as the query key. + +.. _SnpEffect: http://snpeff.sourceforge.net/ +.. _SnpSift: http://snpeff.sourceforge.net/SnpSift.html + + + diff -r 000000000000 -r cdbdac66d6b5 test-data/snpeff.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/snpeff.vcf Tue Mar 12 13:59:14 2013 -0400 @@ -0,0 +1,7 @@ +##SnpEffVersion="SnpEff 3.1k (build 2012-12-17), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 0 -no downstream,intergenic,intron,upstream,utr -stats /galaxy/PRODUCTION/database/files/000/246/dataset_246290.dat GRCh37.66 /galaxy/PRODUCTION/database/files/000/246/dataset_246288.dat " +##INFO= +chr1 46871931 . GC G . PASS SAF=0.333333;DP=6;EFF=EXON(MODIFIER|||||FAAH|processed_transcript|CODING|ENST00000489366|2),EXON(MODIFIER|||||FAAH|processed_transcript|CODING|ENST00000493735|6),FRAME_SHIFT(HIGH||-|-281|579|FAAH|protein_coding|CODING|ENST00000243167|7) +chr1 54291558 . G GA . PASS SAF=0.333333;DP=12;EFF=EXON(MODIFIER|||||TMEM48|processed_transcript|CODING|ENST00000480952|4),FRAME_SHIFT(HIGH||-/T|-169?|542|TMEM48|protein_coding|CODING|ENST00000540001|5),FRAME_SHIFT(HIGH||-/T|-169?|557|TMEM48|protein_coding|CODING|ENST00000360494|5),FRAME_SHIFT(HIGH||-/T|-169?|674|TMEM48|protein_coding|CODING|ENST00000371429|5),FRAME_SHIFT(HIGH||-/T|-54?|559|TMEM48|protein_coding|CODING|ENST00000234725|4) +chr1 109461327 . G A,GA . PASS SAF=0.333333,SAF=0.333333;DP=6;EFF=FRAME_SHIFT(HIGH||-/A|-453?|684|GPSM2|protein_coding|CODING|ENST00000264126|12),FRAME_SHIFT(HIGH||-/A|-453?|684|GPSM2|protein_coding|CODING|ENST00000406462|13),SYNONYMOUS_CODING(LOW|SILENT|ggG/ggA|G452|684|GPSM2|protein_coding|CODING|ENST00000264126|),SYNONYMOUS_CODING(LOW|SILENT|ggG/ggA|G452|684|GPSM2|protein_coding|CODING|ENST00000406462|) +