0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 #
|
|
4 #------------------------------------------------------------------------------
|
|
5 # University of Minnesota
|
|
6 # Copyright 2013, Regents of the University of Minnesota
|
|
7 #------------------------------------------------------------------------------
|
|
8 # Author:
|
|
9 #
|
|
10 # James E Johnson
|
|
11 #
|
|
12 #------------------------------------------------------------------------------
|
|
13 """
|
|
14
|
|
15
|
|
16 """
|
|
17 This tool takes a SnpEff VCF file and an Ensembl pep.all.fa file ( e.g. Homo_sapiens.GRCh37.73.pep.all.fa )
|
|
18 It outputs a peptide fasta file with the variant peptide sequence that result from NON_SYNONYMOUS_CODING effects
|
|
19
|
|
20 """
|
|
21
|
|
22 import sys,re,os.path
|
|
23 import tempfile
|
|
24 import optparse
|
|
25 from optparse import OptionParser
|
|
26 import logging
|
|
27
|
|
28 ## dictionary for Amino Acid Abbreviations
|
|
29 aa_abbrev_dict = dict()
|
|
30 aa_abbrev_dict['Phe'] = 'F'
|
|
31 aa_abbrev_dict['Leu'] = 'L'
|
|
32 aa_abbrev_dict['Ser'] = 'S'
|
|
33 aa_abbrev_dict['Tyr'] = 'Y'
|
|
34 aa_abbrev_dict['Cys'] = 'C'
|
|
35 aa_abbrev_dict['Trp'] = 'W'
|
|
36 aa_abbrev_dict['Pro'] = 'P'
|
|
37 aa_abbrev_dict['His'] = 'H'
|
|
38 aa_abbrev_dict['Gln'] = 'Q'
|
|
39 aa_abbrev_dict['Arg'] = 'R'
|
|
40 aa_abbrev_dict['Ile'] = 'I'
|
|
41 aa_abbrev_dict['Met'] = 'M'
|
|
42 aa_abbrev_dict['Thr'] = 'T'
|
|
43 aa_abbrev_dict['Asn'] = 'N'
|
|
44 aa_abbrev_dict['Lys'] = 'K'
|
|
45 aa_abbrev_dict['Val'] = 'V'
|
|
46 aa_abbrev_dict['Ala'] = 'A'
|
|
47 aa_abbrev_dict['Asp'] = 'D'
|
|
48 aa_abbrev_dict['Glu'] = 'E'
|
|
49 aa_abbrev_dict['Gly'] = 'G'
|
|
50
|
|
51 ## Get the peptide ID and sequence a given ID
|
|
52 def get_sequence(id,seq_file):
|
|
53 fh = open(seq_file, 'r')
|
|
54 try:
|
|
55 for (ln,line) in enumerate(fh):
|
|
56 if line.find(id) >= 0:
|
|
57 fields = line.split('\t')
|
|
58 return ( ' '.join(fields[0:-1]),fields[-1].rstrip() if fields and len(fields) > 0 else None )
|
|
59 except Exception, e:
|
|
60 print >> sys.stderr, "failed: %s" % e
|
|
61 finally:
|
|
62 fh.close()
|
|
63
|
|
64 def fasta_to_tabular(fasta_file,tabular_file):
|
|
65 inFile = open(fasta_file,'r')
|
|
66 outFile = open(tabular_file,'w')
|
|
67 for i, line in enumerate( inFile ):
|
|
68 line = line.rstrip( '\r\n' )
|
|
69 if not line or line.startswith( '#' ):
|
|
70 continue
|
|
71 if line.startswith( '>' ):
|
|
72 #Don't want any existing tabs to trigger extra columns:
|
|
73 line = line.replace('\t', ' ')
|
|
74 if i > 0:
|
|
75 outFile.write('\n')
|
|
76 outFile.write(line[1:])
|
|
77 outFile.write('\t')
|
|
78 else:
|
|
79 outFile.write(line)
|
|
80 if i > 0:
|
|
81 outFile.write('\n')
|
|
82 if inFile:
|
|
83 inFile.close()
|
|
84 if outFile:
|
|
85 outFile.close()
|
|
86
|
|
87 def __main__():
|
|
88 #Parse Command Line
|
|
89 parser = optparse.OptionParser()
|
|
90 parser.add_option( '-i', '--input', dest='input', help='The input snpeff vcf file with HGVS annotations (else read from stdin)' )
|
|
91 parser.add_option( '-o', '--output', dest='output', help='The output fasta (else write to stdout)' )
|
|
92 parser.add_option( '-p', '--protein_fasta', dest='protein_fasta', default=None, help='The Esembl protein fasta in tabular format' )
|
|
93 parser.add_option( '-l', '--leading_aa_num', dest='leading_aa_num', type='int', default=None, help='leading number of AAs to output' )
|
|
94 parser.add_option( '-t', '--trailing_aa_num', dest='trailing_aa_num', type='int', default=None, help='trailing number of AAs to output' )
|
|
95 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
|
|
96 (options, args) = parser.parse_args()
|
|
97
|
|
98 # need protein_fasta file
|
|
99 fastaFile = options.protein_fasta
|
|
100 if options.protein_fasta == None:
|
|
101 print >> sys.stderr, "Ensembl protein_fasta tabular file required"
|
|
102 exit(4)
|
|
103 else:
|
|
104 # determine if fasta is already in tabular format
|
|
105 is_tabular = False
|
|
106 standard_aa = '^[AC-IK-WY]+$'
|
|
107 standard_na = '^[ACGTN]+$'
|
|
108 inFile = open(fastaFile,'r')
|
|
109 try:
|
|
110 nseq = 0
|
|
111 for i, line in enumerate( inFile ):
|
|
112 line = line.rstrip( '\r\n' )
|
|
113 if not line or line.startswith( '#' ):
|
|
114 continue
|
|
115 fields = line.split('\t')
|
|
116 if len(fields) < 2:
|
|
117 is_tabular = False
|
|
118 if line[0] != '>':
|
|
119 print >> sys.stderr, "failed: %s does not appear to be a fasta file" % fastaFile
|
|
120 exit(4)
|
|
121 break
|
|
122 if re.match('^[A-Z]+$',fields[-1].upper()):
|
|
123 is_tabular = True
|
|
124 nseq += 1
|
|
125 else:
|
|
126 if line[0] != '>':
|
|
127 print >> sys.stderr, "failed: %s does not appear to be a fasta file" % fastaFile
|
|
128 exit(4)
|
|
129 if nseq > 10:
|
|
130 break
|
|
131 finally:
|
|
132 if inFile:
|
|
133 inFile.close()
|
|
134 if not is_tabular:
|
|
135 fastaFile = tempfile.NamedTemporaryFile(prefix='pep_fasta_',suffix=".tab",dir=os.getcwd()).name
|
|
136 fasta_to_tabular(options.protein_fasta,fastaFile)
|
|
137 # vcf input
|
|
138 if options.input != None:
|
|
139 try:
|
|
140 inputPath = os.path.abspath(options.input)
|
|
141 inputFile = open(inputPath, 'r')
|
|
142 except Exception, e:
|
|
143 print >> sys.stderr, "failed: %s" % e
|
|
144 exit(2)
|
|
145 else:
|
|
146 inputFile = sys.stdin
|
|
147 # output
|
|
148 if options.output != None:
|
|
149 try:
|
|
150 outputPath = os.path.abspath(options.output)
|
|
151 outputFile = open(outputPath, 'w')
|
|
152 except Exception, e:
|
|
153 print >> sys.stderr, "failed: %s" % e
|
|
154 exit(3)
|
|
155 else:
|
|
156 outputFile = sys.stdout
|
|
157 ## Amino_Acid_Change notations
|
|
158 # G528R
|
|
159 # p.Gly528Arg/c.1582G>C
|
|
160 aa_change_regex = '([A-Z])(\d+)([A-Z])' # G528R
|
|
161 aa_hgvs_regex = 'p\.([A-Z][a-z][a-z])(\d+)([A-Z][a-z][a-z])(/c\.(\d+)([ACGTN])>([ACGTN]))' # p.Gly528Arg/c.1582G>C
|
|
162 # Save VCF file header, not currently used
|
|
163 vcf_header = []
|
|
164 reading_entries = False
|
|
165 try:
|
|
166 for linenum,line in enumerate(inputFile):
|
|
167 ## print >> sys.stderr, "%d: %s\n" % (linenum,line)
|
|
168 if line.startswith('##'):
|
|
169 vcf_header.append(line)
|
|
170 # May need to check SnpEff version in the header, the EFF info changed between versions 2 and 3
|
|
171 ##SnpEffVersion
|
|
172 elif line.startswith('#CHROM'):
|
|
173 reading_entries = True
|
|
174 else:
|
|
175 fields = line.split('\t')
|
|
176 # This is the current format of the EFF entry:
|
|
177 # EFF=missense(MODERATE|MISSENSE|Ggg/Cgg|G528R|802|SCNN1D|protein_coding|CODING|ENST00000379116|12|1);OICR=(ENST00000379116|1808)
|
|
178 # If this becomes variable, will need to dynamically pattern this on the defintion in the vcf header:
|
|
179 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )' ">
|
|
180 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
|
|
181 for info_item in info.split(';'):
|
|
182 try:
|
|
183 if info_item.find('=') < 0:
|
|
184 continue
|
|
185 (key,val) = info_item.split('=',1)
|
|
186 if key == 'EFF':
|
|
187 effects = val.split(',')
|
|
188 for effect in effects:
|
|
189 (eff,effs) = effect.rstrip(')').split('(')
|
|
190 if not eff == 'NON_SYNONYMOUS_CODING':
|
|
191 continue
|
|
192 eff_fields = effs.split('|')
|
|
193 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
|
|
194 if transcript:
|
|
195 aa_pos = None # 1-based position
|
|
196 alt_aa = '_'
|
|
197 # parse aa_change
|
|
198 # get AA change position and alternate Animo Acid
|
|
199 sap = aa_change
|
|
200 m = re.match(aa_change_regex,aa_change)
|
|
201 if m:
|
|
202 aa_pos = int(m.groups()[1])
|
|
203 alt_aa = m.groups()[2]
|
|
204 else:
|
|
205 m = re.match(aa_hgvs_regex,aa_change)
|
|
206 if m:
|
|
207 aa_pos = int(m.groups()[1])
|
|
208 ref_aa = aa_abbrev_dict[m.groups()[0]]
|
|
209 alt_aa = aa_abbrev_dict[m.groups()[2]]
|
|
210 sap = "%s%d%s" % (ref_aa,aa_pos,alt_aa)
|
|
211 if not aa_pos:
|
|
212 continue
|
|
213 # get AA sequence
|
|
214 aa_offset = aa_pos - 1
|
|
215 (pep_id,pep_seq) = get_sequence(transcript,fastaFile)
|
|
216 if not pep_seq:
|
|
217 continue
|
|
218 start_pos = max(aa_offset - options.leading_aa_num, 0) if options.leading_aa_num else 0
|
|
219 end_pos = min(aa_offset + options.trailing_aa_num + 1, len(pep_seq)) if options.trailing_aa_num else len(pep_seq)
|
|
220 # transform sequence
|
|
221 alt_seq = pep_seq[start_pos:aa_offset] + alt_aa + pep_seq[aa_offset+1:end_pos]
|
|
222 # >ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:1:22778472 codon_change:Gtg/Atg sap:V885M
|
|
223 pep_id = re.sub('pep:[a-z]*','pep:sap',pep_id)
|
|
224 hdr = ">%s snp_location:%s:%s codon_change:%s sap:%s\n" % (pep_id, chrom, pos, codon_change, sap)
|
|
225 outputFile.write(hdr)
|
|
226 if options.debug:
|
|
227 trimmed_seq = pep_seq[start_pos:end_pos]
|
|
228 outputFile.write(trimmed_seq)
|
|
229 outputFile.write('\n')
|
|
230 outputFile.write(alt_seq)
|
|
231 outputFile.write('\n')
|
|
232 except Exception, e:
|
|
233 print >> sys.stderr, "failed: %s" % e
|
|
234 except Exception, e:
|
|
235 print >> sys.stderr, "failed: %s" % e
|
|
236 exit(1)
|
|
237
|
|
238 if __name__ == "__main__" : __main__()
|
|
239
|