Mercurial > repos > jjohnson > snpeff_cds_report
annotate snpEff_cds_report.py @ 1:29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Fri, 19 Apr 2013 10:49:05 -0500 |
parents | cdbdac66d6b5 |
children | 15a54fa11ad7 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/python |
2 import sys,os,tempfile,re | |
3 import httplib, urllib | |
4 import optparse | |
5 #import vcfClass | |
6 #from vcfClass import * | |
7 #import tools | |
8 #from tools import * | |
9 | |
10 debug = False | |
11 cds_anchor = 'cds_variation' | |
12 aa_anchor = 'aa_variation' | |
13 | |
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", | |
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", | |
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", | |
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", | |
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", | |
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", | |
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | |
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", | |
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", | |
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", | |
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", | |
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", | |
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", | |
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", | |
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", | |
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | |
30 | |
31 def reverseComplement(seq) : | |
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper() | |
33 return rev | |
34 | |
35 def translate(seq) : | |
36 rna = seq.upper().replace('T','U') | |
37 aa = [] | |
38 for i in range(0,len(rna) - 2, 3): | |
39 aa.append(codon_map[rna[i:i+3]]) | |
40 return ''.join(aa) | |
41 | |
42 """ | |
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand | |
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes. | |
45 """ | |
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?> | |
47 <!DOCTYPE Query> | |
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > | |
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
51 <Filter name = "with_ccds" excluded = "0"/> | |
52 <Attribute name = "ensembl_transcript_id" /> | |
53 <Attribute name = "ccds" /> | |
54 </Dataset> | |
55 </Query> | |
56 """ | |
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>' | |
58 query_template = """<?xml version="1.0" encoding="UTF-8"?> | |
59 <!DOCTYPE Query> | |
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" > | |
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
63 <Filter name = "biotype" value = "protein_coding"/> | |
64 __CCDS_FILTER_HERE__ | |
65 <Attribute name = "cdna" /> | |
66 <Attribute name = "ensembl_gene_id" /> | |
67 <Attribute name = "ensembl_transcript_id" /> | |
68 <Attribute name = "strand" /> | |
69 <Attribute name = "transcript_start" /> | |
70 <Attribute name = "transcript_end"/> | |
71 <Attribute name = "exon_chrom_start"/> | |
72 <Attribute name = "exon_chrom_end"/> | |
73 <Attribute name = "cdna_coding_start" /> | |
74 <Attribute name = "cdna_coding_end" /> | |
75 <Attribute name = "cds_length"/> | |
76 <Attribute name = "rank"/> | |
77 <Attribute name = "5_utr_start" /> | |
78 <Attribute name = "5_utr_end" /> | |
79 <Attribute name = "3_utr_start" /> | |
80 <Attribute name = "3_utr_end" /> | |
81 <Attribute name = "ensembl_peptide_id"/> | |
82 <Attribute name = "start_position" /> | |
83 <Attribute name = "end_position" /> | |
84 </Dataset> | |
85 </Query> | |
86 """ | |
87 """ | |
88 Columns(19): | |
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)'] | |
90 | |
91 0 cDNA sequence | |
92 1 Ensembl Gene ID | |
93 2 Ensembl Transcript ID | |
94 3 Strand | |
95 - 4 Transcript Start (bp) | |
96 - 5 Transcript End (bp) | |
97 6 Exon Chr Start (bp) | |
98 7 Exon Chr End (bp) | |
99 8 CDS Start | |
100 9 CDS End | |
101 10 CDS Length | |
102 - 11 Exon Rank in Transcript | |
103 12 5' UTR Start | |
104 13 5' UTR End | |
105 14 3' UTR Start | |
106 15 3' UTR End | |
107 - 16 Ensembl Protein ID | |
108 - 17 Gene Start (bp) | |
109 - 18 Gene End (bp) | |
110 """ | |
111 | |
112 # return {transcript_id : ccds_id} | |
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids): | |
114 ccds_dict = dict() | |
115 if transcript_ids: | |
116 query = query_ccds_template | |
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query) | |
119 params = urllib.urlencode({'query':query}) | |
120 headers = {"Accept": "text/plain"} | |
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query) | |
122 try: | |
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
125 conn.request("POST", "/biomart/martservice", params, headers) | |
126 response = conn.getresponse() | |
127 data = response.read() | |
128 if len(data) > 0: | |
129 # if debug: print >> sys.stdout, "%s\n\n" % data | |
130 lines = data.split('\n') | |
131 for line in lines: | |
132 fields = line.split('\t') | |
133 if len(fields) == 2: | |
134 (transcript_id,ccds) = fields | |
135 ccds_dict[transcript_id] = ccds | |
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines) | |
137 except Exception, e: | |
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) ) | |
139 return ccds_dict | |
140 | |
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True): | |
142 query = query_template | |
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query) | |
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query) | |
146 return query | |
147 | |
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence] | |
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True): | |
150 transcript_id = snpEffect.transcript | |
151 chr_pos = snpEffect.pos | |
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds) | |
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query) | |
154 params = urllib.urlencode({'query':query}) | |
155 headers = {"Accept": "text/plain"} | |
156 pos = snpEffect.pos | |
157 cds_pos = None # zero based offset | |
158 coding_sequence = '' | |
159 cds_ref = snpEffect.ref | |
160 cds_alt = snpEffect.alt | |
161 try: | |
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
164 conn.request("POST", "/biomart/martservice", params, headers) | |
165 response = conn.getresponse() | |
166 data = response.read() | |
167 if len(data) > 0: | |
168 # if debug: print >> sys.stdout, "%s\n\n" % data | |
169 lines = data.split('\n') | |
170 # Use the header line to determine the order of fields | |
171 line = lines[0] | |
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line | |
173 colnames = line.split('\t') | |
174 # for i in range(len(colnames)): | |
175 # | |
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames) | |
177 for line in lines[1:]: | |
178 if line == None or len(line) < 2: | |
179 continue | |
180 field = line.split('\t') | |
181 if len(field) != len(colnames): | |
182 continue | |
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line) | |
184 if field[10] == None or len(field[10]) < 1: | |
185 continue | |
186 if debug: | |
187 for i in range(len(colnames)): | |
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i]) | |
189 snpEffect.gene_id = field[1] | |
190 strand = '+' if int(field[3]) > 0 else '-' | |
191 snpEffect.strand = strand | |
192 # coding_sequence is first | |
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]): | |
194 if field[10] == None or len(field[10]) < 1: | |
195 continue | |
196 cdna_seq = field[0] | |
197 in_utr = False | |
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence | |
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-')) | |
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-')) | |
201 utr_5_lens = [] | |
202 for i in range(len(utr_5_starts)): | |
203 utr_5_start = int(utr_5_starts[i]) | |
204 utr_5_end = int(utr_5_ends[i]) | |
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1) | |
206 if utr_5_start <= pos <= utr_5_end : | |
207 in_utr = True | |
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end); | |
209 break | |
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-')) | |
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-')) | |
212 for i in range(len(utr_3_starts)): | |
213 utr_3_start = int(utr_3_starts[i]) | |
214 utr_3_end = int(utr_3_ends[i]) | |
215 if utr_3_start <= pos <= utr_3_end : | |
216 in_utr = True | |
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end); | |
218 break | |
219 # Get coding sequence from cdna | |
220 cdna_length = int(field[10]) | |
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']')) | |
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']')) | |
223 cdna_lens = [] | |
224 for i in range(len(cdna_coding_start)): | |
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1) | |
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet | |
228 for i in range(len(cdna_coding_start)): | |
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1] | |
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]] | |
231 snpEffect.cds = coding_sequence | |
232 if coding_sequence and len(coding_sequence) >= 3: | |
233 snpEffect.cds_stop_codon = coding_sequence[-3:] | |
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence) | |
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens) | |
237 if not in_utr: | |
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-')) | |
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-')) | |
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-')) | |
241 exon_lens = [] | |
242 for i in range(len(exon_start)): | |
243 exon_lens.append(exon_end[i] - exon_start[i] + 1) | |
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens) | |
245 if debug: | |
246 bp_tot = 0 | |
247 for i in range(len(exon_start)): | |
248 exon_len = exon_end[i] - exon_start[i] + 1 | |
249 bp_tot += exon_len | |
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot) | |
251 cds_idx = 0 | |
252 for i in range(len(exon_start)): | |
253 # Does this exon have cds bases - i.e. not entirely in UTR | |
254 if len(utr_5_ends) > 0: | |
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]: | |
256 continue | |
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]: | |
258 continue | |
259 exon_len = exon_end[i] - exon_start[i] + 1 | |
260 if exon_start[i] <= pos <= exon_end[i]: | |
261 if strand == '+': | |
262 if cds_idx: # find offset from start of cdna_coding and exon | |
263 offset = pos - exon_start[i] | |
264 else: # find offset from end of cdna_coding and exon | |
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) ) | |
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset | |
267 else: # negative strand | |
268 cds_ref = reverseComplement(snpEffect.ref) | |
269 cds_alt = reverseComplement(snpEffect.alt) | |
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos | |
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1) | |
272 snpEffect.cds_pos = cds_pos | |
273 snpEffect.cds_ref = cds_ref | |
274 snpEffect.cds_alt = cds_alt | |
275 return snpEffect | |
276 else: | |
277 cds_idx += 1 | |
278 except Exception, e: | |
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) ) | |
280 finally: | |
281 if conn != None : | |
282 conn.close() | |
283 return None | |
284 | |
285 """ | |
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
288 """ | |
289 class SnpEffect( object ): | |
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation'] | |
291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False): | |
292 self.chrom = chrom | |
293 self.pos = int(pos) | |
294 self.ref = ref | |
295 self.alt = alt | |
296 self.freq = float(freq) if freq else None | |
297 self.depth = int(depth) if depth else None | |
298 # From SnpEffect VCF String | |
299 self.effect = None | |
300 self.effect_impact = None | |
301 self.functional_class = None | |
302 self.codon_change = None | |
303 self.amino_acid_change = None | |
304 self.amino_acid_length = None | |
305 self.gene_name = None | |
306 self.gene_id = None | |
307 self.gene_biotype = None | |
308 self.coding = None | |
309 self.transcript = None | |
310 self.transcript_ids = None | |
311 self.exon = None | |
312 self.cds_stop_codon = None | |
313 # retrieve from ensembl | |
314 self.strand = None | |
315 self.cds_pos = None | |
316 self.cds_ref = None | |
317 self.cds_alt = None | |
318 self.cds = None | |
319 self.aa_pos = None | |
320 self.aa_len = None | |
321 self.alt_stop_codon = None | |
322 def chrPos(self): | |
323 return "%s:%s" % (self.chrom,self.pos) | |
324 def setEffect(self, effect, snpEffVersion = None): | |
325 if snpEffVersion and snpEffVersion == '2': | |
326 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9] | |
327 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
328 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10] | |
329 self.amino_acid_length = amino_acid_length | |
330 self.effect_impact = effect_impact | |
331 self.functional_class = functional_class | |
332 self.codon_change = codon_change | |
333 self.amino_acid_change = amino_acid_change | |
334 self.gene_name = gene_name | |
335 self.gene_biotype = gene_biotype | |
336 self.coding = coding | |
337 self.transcript = transcript | |
338 self.exon = exon | |
339 def parseEffect(self, effect, snpEffVersion = None): | |
340 (eff,effs) = effect.rstrip(')').split('(') | |
341 self.effect = eff | |
342 eff_fields = effs.split('|') | |
343 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields) | |
344 self.setEffect(eff_fields, snpEffVersion) | |
345 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False): | |
346 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds) | |
347 return len(self.cds) > 0 if self.cds else False | |
348 def score(self): | |
349 return self.freq * self.depth if self.freq and self.depth else -1 | |
350 def getCodingSequence(self): | |
351 return self.cds | |
352 def getVariantSequence(self): | |
353 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt: | |
354 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):] | |
355 else: | |
356 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds) | |
357 return None | |
358 def getCodingAminoSequence(self): | |
359 seq = translate(self.cds) if self.cds else None | |
360 if seq: | |
361 self.aa_len = len(seq) | |
362 return seq | |
363 def getVariantAminoSequence(self): | |
364 variant_seq = self.getVariantSequence() | |
365 return translate(variant_seq) if variant_seq else None | |
366 def getVariantPeptide(self,toStopCodon=True): | |
367 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() | |
368 if var_aa: | |
369 if toStopCodon: | |
370 end_pos = var_aa_end | |
371 else: | |
372 end_pos = len(var_aa) | |
373 novel_peptide = var_aa[var_aa_pos:end_pos] | |
374 return novel_peptide | |
375 return None | |
376 # [preAA,varAA,postAA, varPeptide, varOffset, subAA] | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
377 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True): |
0 | 378 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() |
379 if var_aa: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
380 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0 |
0 | 381 if toStopCodon: |
382 end_pos = var_aa_end | |
383 else: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
384 end_offset = end_count + 1 |
0 | 385 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end |
386 try: | |
387 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' | |
388 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos) | |
389 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA) | |
390 preAA = var_aa[start_pos:var_aa_pos] # N-term side | |
391 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side | |
392 novel_peptide = var_aa[start_pos:end_pos] | |
393 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation] | |
394 except Exception, e: | |
395 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) ) | |
396 return None | |
397 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop] | |
398 # positions are 0-based array indexes | |
399 def getSequences(self): | |
400 coding_dna = self.getCodingSequence() | |
401 coding_aa = self.getCodingAminoSequence() | |
402 var_dna = self.getVariantSequence() | |
403 var_aa = self.getVariantAminoSequence() | |
404 var_aa_pos = None | |
405 var_aa_end = None | |
406 if var_aa: | |
407 var_aa_pos = self.cds_pos / 3 | |
408 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))): | |
409 if var_aa[j] != coding_aa[j]: | |
410 var_aa_pos = j | |
411 break | |
412 self.aa_pos = var_aa_pos | |
413 var_stop = var_aa.find('*',var_aa_pos) | |
414 if var_stop < 0: | |
415 var_aa_end = len(var_aa) | |
416 else: | |
417 var_aa_end = var_stop + 1 # include the stop codon | |
418 stop_pos = var_stop * 3 | |
419 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3] | |
420 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end] | |
421 def inHomopolymer(self,polyA_limit): | |
422 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region | |
423 ## check if base changes at cds_pos involve A or T | |
424 bdiff = None # the difference between the cds_ref and cds_alt | |
425 boff = 0 # the offset to the difference | |
426 if len(self.cds_ref) < len(self.cds_alt): | |
427 bdiff = re.sub(self.cds_ref,'',self.cds_alt) | |
428 boff = self.cds_alt.find(bdiff) | |
429 elif len(self.cds_alt) < len(self.cds_ref): | |
430 bdiff = re.sub(self.cds_alt,'',self.cds_ref) | |
431 boff = self.cds_ref.find(bdiff) | |
432 if bdiff: | |
433 ## check the number of similar base neighboring the change | |
434 alt_seq = self.getVariantSequence() | |
435 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))] | |
436 ## adjust match length if the cps_pos is within polyA_limit form sequence end | |
437 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit)) | |
438 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff) | |
439 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times | |
440 if bdiff.find('A') >= 0: | |
441 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
442 match = re.match(pat,subseq) | |
443 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
444 if match: | |
445 return True | |
446 if bdiff.find('T') >= 0: | |
447 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
448 match = re.match(pat,subseq) | |
449 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
450 if match: | |
451 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups()) | |
452 return True | |
453 return False | |
454 def getReportHeader(): | |
455 return report_headings | |
456 def getReportFields(self): | |
457 gene_name = self.gene_name if self.gene_name else '' | |
458 cds_ref = self.cds_ref if self.cds_ref else '' | |
459 cds_alt = self.cds_alt if self.cds_alt else '' | |
460 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
461 if debug: print >> sys.stdout, "HGVS %s" % hgvs | |
462 var_aa = self.getVariantPeptide(toStopCodon=True) | |
463 freq = "%2.2f" % self.freq if self.freq else '' | |
464 depth = "%d" % self.depth if self.depth else '' | |
465 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else '' | |
466 aa_len = "%d" % self.aa_len if self.aa_len else '' | |
467 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript | |
468 stop_codon = self.alt_stop_codon if self.alt_stop_codon else '' | |
469 chrpos = self.chrPos() | |
470 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5] | |
471 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else ''] | |
472 | |
473 def __main__(): | |
474 | |
475 def printReportTsv(output_file, snpEffects): | |
476 if output_file: | |
477 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings) | |
478 for snpEffect in snpEffects: | |
479 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields() | |
480 if not snpEffect.effect == 'FRAME_SHIFT': | |
481 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
482 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA) | |
483 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide]) | |
484 | |
485 """ | |
486 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111 | |
487 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970 | |
488 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774 | |
489 """ | |
490 def printReportHtml(output_file, detail_dir, snpEffects): | |
491 if output_file: | |
492 print >> output_file, '<HTML><BODY>\n' | |
493 print >> output_file, '<TABLE BORDER="1">\n' | |
494 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n' | |
495 for snpEffect in snpEffects: | |
496 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields() | |
497 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html' | |
498 aa_ref = '#'.join([refname,aa_anchor]) | |
499 refpath = os.path.join(detail_dir,refname) | |
500 try: | |
501 ref_file = open(refpath,'w') | |
502 printEffDetailHtml(ref_file,snpEffect) | |
503 ref_file.close() | |
504 except Exception, e: | |
505 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) ) | |
506 if snpEffect.effect == 'FRAME_SHIFT': | |
507 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide) | |
508 else: | |
509 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
510 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA) | |
511 print >> output_file, '</TABLE>\n' | |
512 print >> output_file, '</BODY></HTML>\n' | |
513 | |
514 def printEffDetailHtml(output_file,snpEffect,wrap_len=60): | |
515 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
516 seq_len = len(coding_seq) | |
517 print >> output_file, '<HTML><BODY>\n' | |
518 print >> output_file, '<TABLE>\n' | |
519 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name | |
520 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt) | |
521 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript) | |
522 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand | |
523 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1)) | |
524 print >> output_file, '</TABLE>\n' | |
525 if alt_aa: | |
526 alt_aa_pos = pos / 3 | |
527 if coding_aa: | |
528 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))): | |
529 if alt_aa[j] != coding_aa[j]: | |
530 alt_aa_pos = j | |
531 break | |
532 alt_aa_end = 0 | |
533 for j in range(alt_aa_pos,len(alt_aa)): | |
534 if alt_aa[j] == '*': | |
535 alt_aa_end = j | |
536 break | |
537 seq = [] | |
538 for j in range(1,wrap_len+1): | |
539 seq.append('-' if j % 10 else '|') | |
540 hrule = '</TD><TD>'.join(seq) | |
541 print >> output_file, '<TABLE cellspacing="0">' | |
542 for i in range(0,seq_len,wrap_len): | |
543 e = min(i + wrap_len,seq_len) | |
544 sa = i/3 | |
545 ea = (i+wrap_len)/3 | |
546 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule) | |
547 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
548 for j in range(i,i + wrap_len): | |
549 if j < len(coding_seq): | |
550 if pos == j: | |
551 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j]) | |
552 elif pos <= j < pos + len(ref): | |
553 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j] | |
554 else: | |
555 print >> output_file, "<TD>%s</TD>" % coding_seq[j] | |
556 else: | |
557 print >> output_file, "<TD></TD>" | |
558 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
559 if coding_aa: | |
560 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
561 for j in range(sa,ea): | |
562 if j < len(coding_aa): | |
563 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j] | |
564 else: | |
565 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
566 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
567 if alt_aa: | |
568 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
569 for j in range(sa,ea): | |
570 if j < len(alt_aa): | |
571 if alt_aa_pos == j: | |
572 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j]) | |
573 elif alt_aa_pos < j <= alt_aa_end: | |
574 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j] | |
575 else: | |
576 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j] | |
577 else: | |
578 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
579 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
580 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
581 for j in range(i,i + wrap_len): | |
582 if j < len(alt_seq): | |
583 if pos <= j < pos + len(alt): | |
584 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j] | |
585 else: | |
586 print >> output_file, "<TD>%s</TD>" % alt_seq[j] | |
587 else: | |
588 print >> output_file, "<TD></TD>" | |
589 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
590 print >> output_file, "</TABLE>" | |
591 print >> output_file, "</BODY></HTML>" | |
592 | |
593 def printSeqText(output_file,snpEffect, wrap_len=120): | |
594 nw = 6 | |
595 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
596 if coding_seq: | |
597 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0) | |
598 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|' | |
599 for i in range(0,seq_len,wrap_len): | |
600 e = min(i + wrap_len,seq_len) | |
601 sa = i/3 | |
602 ea = e/3 | |
603 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'') | |
604 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e) | |
605 if coding_aa: | |
606 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea) | |
607 if alt_aa: | |
608 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea) | |
609 if alt_seq: | |
610 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e) | |
611 | |
612 # Parse the command line options | |
613 usage = "Usage: snpEff_cds_report.py filter [options]" | |
614 parser = optparse.OptionParser(usage = usage) | |
615 parser.add_option("-i", "--in", | |
616 action="store", type="string", | |
617 dest="vcfFile", help="input vcf file") | |
618 parser.add_option("-o", "--out", | |
619 action="store", type="string", | |
620 dest="output", help="output report file") | |
621 parser.add_option("-H", "--html_report", | |
622 action="store", type="string", | |
623 dest="html_file", help="html output report file") | |
624 parser.add_option("-D", "--html_dir", | |
625 action="store", type="string", | |
626 dest="html_dir", help="html output report file") | |
627 parser.add_option("-t", "--tsv_file", | |
628 action="store", type="string", | |
629 dest="tsv_file", help="TAB Separated Value (.tsv) output report file") | |
630 parser.add_option("-e", "--ensembl_host", | |
631 action="store", type="string", | |
632 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org') | |
633 parser.add_option("-f", "--effects_filter", | |
634 action="store", type="string", default=None, | |
635 dest="effects_filter", help="a list of effects to filter") | |
636 parser.add_option("-O", "--ensembl_dataset", | |
637 action="store", type="string", | |
638 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl') | |
639 parser.add_option("-p", "--polyA_limit", | |
640 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' ) | |
641 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' ) | |
642 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' ) | |
643 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
644 | |
645 (options, args) = parser.parse_args() | |
646 | |
647 debug = options.debug | |
648 ensembl_host = options.ensembl_host | |
649 | |
650 # Check that a single vcf file is given. | |
651 if options.vcfFile == None: | |
652 parser.print_help() | |
653 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report " | |
654 exit(1) | |
655 outputFile = None | |
656 outputHtml = None | |
657 detailDir = None | |
658 outputTsv = None | |
659 tmpHtmlName = None | |
660 tmpHtml = None | |
661 effects_list = [] | |
662 # Set the output file to stdout if no output file was specified. | |
663 if options.output == None and options.html_file == None and options.tsv_file == None: | |
664 outputFile = sys.stdout | |
665 if options.output != None: | |
666 output = os.path.abspath(options.output) | |
667 outputFile = open(output, 'w') | |
668 if options.tsv_file != None: | |
669 output = os.path.abspath(options.tsv_file) | |
670 outputTsv = open(output, 'w') | |
671 if options.html_file != None: | |
672 output = os.path.abspath(options.html_file) | |
673 outputHtml = open(output, 'w') | |
674 if options.html_dir == None: | |
675 detailDir = os.path.dirname(os.path.abspath(output)) | |
676 else: | |
677 detailDir = options.html_dir | |
678 if detailDir: | |
679 if not os.path.isdir(detailDir): | |
680 os.makedirs(detailDir) | |
681 if options.effects_filter: | |
682 eff_codes = options.effects_filter.split(',') | |
683 for val in eff_codes: | |
684 code = val.strip() | |
685 if code: | |
686 effects_list.append(code) | |
687 | |
688 # | |
689 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
690 try: | |
691 snpEffects = [] | |
692 homopolymers = [] | |
693 polya_count = 0 | |
694 polya_list = [] | |
695 fh = open( options.vcfFile ) | |
696 while True: | |
697 line = fh.readline() | |
698 if not line: | |
699 break #EOF | |
700 if line: | |
701 if outputFile: | |
702 print >> outputFile, "%s\n" % line | |
703 line = line.strip() | |
704 if len(line) < 1: | |
705 continue | |
706 elif line.startswith('#'): | |
707 # check for SnpEffVersion | |
708 """ | |
709 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
710 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
711 """ | |
712 if line.startswith('##SnpEffVersion='): | |
713 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line) | |
714 if m: | |
715 SnpEffVersion = m.group(2) | |
716 # check for SnpEff Info Description | |
717 """ | |
718 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
719 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
720 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
721 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
722 """ | |
723 pass | |
724 else: | |
725 if debug: print >> sys.stdout, "%s\n" % line | |
726 freq = None | |
727 depth = None | |
728 fields = line.split('\t') | |
729 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] | |
730 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) | |
731 if options.debug: print(fields) | |
732 for idx,alt in enumerate(alts.split(',')): | |
733 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts) | |
734 if not re.match('^[ATCG]*$',alt): | |
735 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt) | |
736 for info_item in info.split(';'): | |
737 try: | |
738 (key,val) = info_item.split('=',1) | |
739 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333; | |
740 freqs = info_item.split(',') | |
741 (k,v) = freqs[idx].split('=',1) | |
742 freq = v | |
743 elif key == 'DP': | |
744 depth = val | |
745 elif key == 'EFF': | |
746 transcript_ids = [] | |
747 effects = val.split(',') | |
748 ccds_dict = None | |
749 for effect in effects: | |
750 (eff,effs) = effect.rstrip(')').split('(') | |
751 eff_fields = effs.split('|') | |
752 if SnpEffVersion == '2': | |
753 """ | |
754 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
755 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
756 """ | |
757 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9] | |
758 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
759 """ | |
760 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
761 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
762 """ | |
763 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] | |
764 if transcript: | |
765 transcript_ids.append(transcript) | |
766 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids) | |
767 if options.with_ccds: | |
768 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids) | |
769 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict) | |
770 for effect in effects: | |
771 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth) | |
772 snpEffect.transcript_ids = transcript_ids | |
773 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion) | |
774 if effects_list and not snpEffect.effect in effects_list: | |
775 continue | |
776 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict): | |
777 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds): | |
778 if snpEffect.cds_pos: | |
779 if snpEffect.inHomopolymer(options.polyA_limit): | |
780 homopolymers.append(snpEffect) | |
781 else: | |
782 snpEffects.append(snpEffect) | |
783 if outputFile: | |
784 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields()) | |
785 printSeqText(outputFile,snpEffect) | |
786 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects: | |
787 ## Just report the first sequence returned for a vcf line | |
788 break | |
789 except Exception, e: | |
790 sys.stderr.write( "line: %s err: %s\n" % (line,e) ) | |
791 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers) | |
792 # Sort snpEffects | |
793 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True) | |
794 if outputHtml: | |
795 printReportHtml(outputHtml, detailDir, snpEffects) | |
796 if outputTsv: | |
797 printReportTsv(outputTsv,snpEffects) | |
798 except Exception, e: | |
799 print(str(e)) | |
800 | |
801 if __name__ == "__main__": __main__() | |
802 |