Mercurial > repos > jjohnson > snpeff_cds_report
annotate snpEff_cds_report.py @ 7:fbb6510186df default tip
when codon has ambiguous nucleotide translate to X
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Mon, 15 Sep 2014 06:13:47 -0500 |
parents | a64ef0611117 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/python |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
2 ## version 1.2 |
0 | 3 import sys,os,tempfile,re |
4 import httplib, urllib | |
5 import optparse | |
6 #import vcfClass | |
7 #from vcfClass import * | |
8 #import tools | |
9 #from tools import * | |
10 | |
11 debug = False | |
12 cds_anchor = 'cds_variation' | |
13 aa_anchor = 'aa_variation' | |
14 | |
15 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", | |
16 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", | |
17 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", | |
18 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", | |
19 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", | |
20 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", | |
21 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | |
22 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", | |
23 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", | |
24 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", | |
25 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", | |
26 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", | |
27 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", | |
28 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", | |
29 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", | |
30 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | |
31 | |
32 def reverseComplement(seq) : | |
33 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper() | |
34 return rev | |
35 | |
36 def translate(seq) : | |
37 rna = seq.upper().replace('T','U') | |
38 aa = [] | |
39 for i in range(0,len(rna) - 2, 3): | |
7
fbb6510186df
when codon has ambiguous nucleotide translate to X
Jim Johnson <jj@umn.edu>
parents:
6
diff
changeset
|
40 codon = rna[i:i+3] |
fbb6510186df
when codon has ambiguous nucleotide translate to X
Jim Johnson <jj@umn.edu>
parents:
6
diff
changeset
|
41 aa.append(codon_map[codon] if codon in codon_map else 'X') |
0 | 42 return ''.join(aa) |
43 | |
44 """ | |
45 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand | |
46 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes. | |
47 """ | |
48 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?> | |
49 <!DOCTYPE Query> | |
50 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > | |
51 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
52 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
53 <Filter name = "with_ccds" excluded = "0"/> | |
54 <Attribute name = "ensembl_transcript_id" /> | |
55 <Attribute name = "ccds" /> | |
56 </Dataset> | |
57 </Query> | |
58 """ | |
59 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>' | |
60 query_template = """<?xml version="1.0" encoding="UTF-8"?> | |
61 <!DOCTYPE Query> | |
62 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" > | |
63 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
64 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
65 <Filter name = "biotype" value = "protein_coding"/> | |
66 __CCDS_FILTER_HERE__ | |
67 <Attribute name = "cdna" /> | |
68 <Attribute name = "ensembl_gene_id" /> | |
69 <Attribute name = "ensembl_transcript_id" /> | |
70 <Attribute name = "strand" /> | |
71 <Attribute name = "transcript_start" /> | |
72 <Attribute name = "transcript_end"/> | |
73 <Attribute name = "exon_chrom_start"/> | |
74 <Attribute name = "exon_chrom_end"/> | |
75 <Attribute name = "cdna_coding_start" /> | |
76 <Attribute name = "cdna_coding_end" /> | |
77 <Attribute name = "cds_length"/> | |
78 <Attribute name = "rank"/> | |
79 <Attribute name = "5_utr_start" /> | |
80 <Attribute name = "5_utr_end" /> | |
81 <Attribute name = "3_utr_start" /> | |
82 <Attribute name = "3_utr_end" /> | |
83 <Attribute name = "ensembl_peptide_id"/> | |
84 <Attribute name = "start_position" /> | |
85 <Attribute name = "end_position" /> | |
86 </Dataset> | |
87 </Query> | |
88 """ | |
89 """ | |
90 Columns(19): | |
91 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)'] | |
92 | |
93 0 cDNA sequence | |
94 1 Ensembl Gene ID | |
95 2 Ensembl Transcript ID | |
96 3 Strand | |
97 - 4 Transcript Start (bp) | |
98 - 5 Transcript End (bp) | |
99 6 Exon Chr Start (bp) | |
100 7 Exon Chr End (bp) | |
101 8 CDS Start | |
102 9 CDS End | |
103 10 CDS Length | |
104 - 11 Exon Rank in Transcript | |
105 12 5' UTR Start | |
106 13 5' UTR End | |
107 14 3' UTR Start | |
108 15 3' UTR End | |
109 - 16 Ensembl Protein ID | |
110 - 17 Gene Start (bp) | |
111 - 18 Gene End (bp) | |
112 """ | |
113 | |
114 # return {transcript_id : ccds_id} | |
115 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids): | |
116 ccds_dict = dict() | |
117 if transcript_ids: | |
118 query = query_ccds_template | |
119 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
120 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query) | |
121 params = urllib.urlencode({'query':query}) | |
122 headers = {"Accept": "text/plain"} | |
123 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query) | |
124 try: | |
125 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
126 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
127 conn.request("POST", "/biomart/martservice", params, headers) | |
128 response = conn.getresponse() | |
129 data = response.read() | |
130 if len(data) > 0: | |
131 # if debug: print >> sys.stdout, "%s\n\n" % data | |
132 lines = data.split('\n') | |
133 for line in lines: | |
134 fields = line.split('\t') | |
135 if len(fields) == 2: | |
136 (transcript_id,ccds) = fields | |
137 ccds_dict[transcript_id] = ccds | |
138 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines) | |
139 except Exception, e: | |
140 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) ) | |
141 return ccds_dict | |
142 | |
143 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True): | |
144 query = query_template | |
145 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
146 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query) | |
147 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query) | |
148 return query | |
149 | |
150 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence] | |
151 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True): | |
152 transcript_id = snpEffect.transcript | |
153 chr_pos = snpEffect.pos | |
154 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds) | |
155 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query) | |
156 params = urllib.urlencode({'query':query}) | |
157 headers = {"Accept": "text/plain"} | |
158 pos = snpEffect.pos | |
159 cds_pos = None # zero based offset | |
160 coding_sequence = '' | |
161 cds_ref = snpEffect.ref | |
162 cds_alt = snpEffect.alt | |
163 try: | |
164 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
165 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
166 conn.request("POST", "/biomart/martservice", params, headers) | |
167 response = conn.getresponse() | |
168 data = response.read() | |
169 if len(data) > 0: | |
170 # if debug: print >> sys.stdout, "%s\n\n" % data | |
171 lines = data.split('\n') | |
172 # Use the header line to determine the order of fields | |
173 line = lines[0] | |
174 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line | |
175 colnames = line.split('\t') | |
176 # for i in range(len(colnames)): | |
177 # | |
178 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames) | |
179 for line in lines[1:]: | |
180 if line == None or len(line) < 2: | |
181 continue | |
182 field = line.split('\t') | |
183 if len(field) != len(colnames): | |
184 continue | |
185 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line) | |
186 if field[10] == None or len(field[10]) < 1: | |
187 continue | |
188 if debug: | |
189 for i in range(len(colnames)): | |
190 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i]) | |
191 snpEffect.gene_id = field[1] | |
192 strand = '+' if int(field[3]) > 0 else '-' | |
193 snpEffect.strand = strand | |
194 # coding_sequence is first | |
195 if len(field) > 10 and re.match('[ATGCN]+',field[0]): | |
196 if field[10] == None or len(field[10]) < 1: | |
197 continue | |
198 cdna_seq = field[0] | |
199 in_utr = False | |
200 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence | |
201 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-')) | |
202 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-')) | |
203 utr_5_lens = [] | |
204 for i in range(len(utr_5_starts)): | |
205 utr_5_start = int(utr_5_starts[i]) | |
206 utr_5_end = int(utr_5_ends[i]) | |
207 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1) | |
208 if utr_5_start <= pos <= utr_5_end : | |
209 in_utr = True | |
210 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end); | |
211 break | |
212 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-')) | |
213 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-')) | |
214 for i in range(len(utr_3_starts)): | |
215 utr_3_start = int(utr_3_starts[i]) | |
216 utr_3_end = int(utr_3_ends[i]) | |
217 if utr_3_start <= pos <= utr_3_end : | |
218 in_utr = True | |
219 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end); | |
220 break | |
221 # Get coding sequence from cdna | |
222 cdna_length = int(field[10]) | |
223 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']')) | |
224 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']')) | |
225 cdna_lens = [] | |
226 for i in range(len(cdna_coding_start)): | |
227 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1) | |
228 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
229 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet | |
230 for i in range(len(cdna_coding_start)): | |
231 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1] | |
232 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]] | |
233 snpEffect.cds = coding_sequence | |
234 if coding_sequence and len(coding_sequence) >= 3: | |
235 snpEffect.cds_stop_codon = coding_sequence[-3:] | |
236 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence) | |
237 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
238 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens) | |
239 if not in_utr: | |
240 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-')) | |
241 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-')) | |
242 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-')) | |
243 exon_lens = [] | |
244 for i in range(len(exon_start)): | |
245 exon_lens.append(exon_end[i] - exon_start[i] + 1) | |
246 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens) | |
247 if debug: | |
248 bp_tot = 0 | |
249 for i in range(len(exon_start)): | |
250 exon_len = exon_end[i] - exon_start[i] + 1 | |
251 bp_tot += exon_len | |
252 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot) | |
253 cds_idx = 0 | |
254 for i in range(len(exon_start)): | |
255 # Does this exon have cds bases - i.e. not entirely in UTR | |
256 if len(utr_5_ends) > 0: | |
257 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]: | |
258 continue | |
259 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]: | |
260 continue | |
261 exon_len = exon_end[i] - exon_start[i] + 1 | |
262 if exon_start[i] <= pos <= exon_end[i]: | |
263 if strand == '+': | |
264 if cds_idx: # find offset from start of cdna_coding and exon | |
265 offset = pos - exon_start[i] | |
266 else: # find offset from end of cdna_coding and exon | |
267 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) ) | |
268 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset | |
269 else: # negative strand | |
270 cds_ref = reverseComplement(snpEffect.ref) | |
271 cds_alt = reverseComplement(snpEffect.alt) | |
272 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos | |
273 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1) | |
274 snpEffect.cds_pos = cds_pos | |
275 snpEffect.cds_ref = cds_ref | |
276 snpEffect.cds_alt = cds_alt | |
277 return snpEffect | |
278 else: | |
279 cds_idx += 1 | |
280 except Exception, e: | |
281 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) ) | |
282 finally: | |
283 if conn != None : | |
284 conn.close() | |
285 return None | |
286 | |
287 """ | |
288 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
289 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
290 """ | |
291 class SnpEffect( object ): | |
6
a64ef0611117
Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents:
5
diff
changeset
|
292 report_headings = ['Gene','Variant Position','Reference','Variation','Prevalence','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation'] |
0 | 293 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False): |
294 self.chrom = chrom | |
295 self.pos = int(pos) | |
296 self.ref = ref | |
297 self.alt = alt | |
298 self.freq = float(freq) if freq else None | |
299 self.depth = int(depth) if depth else None | |
300 # From SnpEffect VCF String | |
301 self.effect = None | |
302 self.effect_impact = None | |
303 self.functional_class = None | |
304 self.codon_change = None | |
305 self.amino_acid_change = None | |
306 self.amino_acid_length = None | |
307 self.gene_name = None | |
308 self.gene_id = None | |
309 self.gene_biotype = None | |
310 self.coding = None | |
311 self.transcript = None | |
312 self.transcript_ids = None | |
313 self.exon = None | |
314 self.cds_stop_codon = None | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
315 self.cds_stop_pos = None |
0 | 316 # retrieve from ensembl |
317 self.strand = None | |
318 self.cds_pos = None | |
319 self.cds_ref = None | |
320 self.cds_alt = None | |
321 self.cds = None | |
322 self.aa_pos = None | |
323 self.aa_len = None | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
324 self.alt_stop_pos = None |
0 | 325 self.alt_stop_codon = None |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
326 self.alt_stop_region = None ## includes base before and after alt_stop_codon |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
327 self.use_eff_aa_change = False |
0 | 328 def chrPos(self): |
329 return "%s:%s" % (self.chrom,self.pos) | |
330 def setEffect(self, effect, snpEffVersion = None): | |
331 if snpEffVersion and snpEffVersion == '2': | |
332 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9] | |
333 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
334 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10] | |
335 self.amino_acid_length = amino_acid_length | |
336 self.effect_impact = effect_impact | |
337 self.functional_class = functional_class | |
338 self.codon_change = codon_change | |
339 self.amino_acid_change = amino_acid_change | |
340 self.gene_name = gene_name | |
341 self.gene_biotype = gene_biotype | |
342 self.coding = coding | |
343 self.transcript = transcript | |
344 self.exon = exon | |
345 def parseEffect(self, effect, snpEffVersion = None): | |
346 (eff,effs) = effect.rstrip(')').split('(') | |
347 self.effect = eff | |
348 eff_fields = effs.split('|') | |
349 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields) | |
350 self.setEffect(eff_fields, snpEffVersion) | |
351 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False): | |
352 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds) | |
353 return len(self.cds) > 0 if self.cds else False | |
354 def score(self): | |
355 return self.freq * self.depth if self.freq and self.depth else -1 | |
356 def getCodingSequence(self): | |
357 return self.cds | |
358 def getVariantSequence(self): | |
359 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt: | |
360 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):] | |
361 else: | |
362 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds) | |
363 return None | |
364 def getCodingAminoSequence(self): | |
365 seq = translate(self.cds) if self.cds else None | |
366 if seq: | |
367 self.aa_len = len(seq) | |
368 return seq | |
369 def getVariantAminoSequence(self): | |
370 variant_seq = self.getVariantSequence() | |
371 return translate(variant_seq) if variant_seq else None | |
372 def getVariantPeptide(self,toStopCodon=True): | |
373 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() | |
374 if var_aa: | |
375 if toStopCodon: | |
376 end_pos = var_aa_end | |
377 else: | |
378 end_pos = len(var_aa) | |
379 novel_peptide = var_aa[var_aa_pos:end_pos] | |
380 return novel_peptide | |
381 return None | |
382 # [preAA,varAA,postAA, varPeptide, varOffset, subAA] | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
383 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True): |
0 | 384 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() |
385 if var_aa: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
386 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0 |
0 | 387 if toStopCodon: |
388 end_pos = var_aa_end | |
389 else: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
390 end_offset = end_count + 1 |
0 | 391 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end |
392 try: | |
393 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' | |
394 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos) | |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
395 mutation = "%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA) |
0 | 396 preAA = var_aa[start_pos:var_aa_pos] # N-term side |
397 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side | |
398 novel_peptide = var_aa[start_pos:end_pos] | |
399 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation] | |
400 except Exception, e: | |
401 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) ) | |
402 return None | |
403 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop] | |
404 # positions are 0-based array indexes | |
405 def getSequences(self): | |
406 coding_dna = self.getCodingSequence() | |
407 coding_aa = self.getCodingAminoSequence() | |
408 var_dna = self.getVariantSequence() | |
409 var_aa = self.getVariantAminoSequence() | |
410 var_aa_pos = None | |
411 var_aa_end = None | |
412 if var_aa: | |
413 var_aa_pos = self.cds_pos / 3 | |
414 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))): | |
415 if var_aa[j] != coding_aa[j]: | |
416 var_aa_pos = j | |
417 break | |
418 self.aa_pos = var_aa_pos | |
419 var_stop = var_aa.find('*',var_aa_pos) | |
420 if var_stop < 0: | |
421 var_aa_end = len(var_aa) | |
422 else: | |
423 var_aa_end = var_stop + 1 # include the stop codon | |
424 stop_pos = var_stop * 3 | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
425 self.alt_stop_pos = stop_pos |
0 | 426 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3] |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
427 self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \ |
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
428 self.alt_stop_codon + \ |
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
429 ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3 else '') |
0 | 430 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end] |
431 def inHomopolymer(self,polyA_limit): | |
432 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region | |
433 ## check if base changes at cds_pos involve A or T | |
434 bdiff = None # the difference between the cds_ref and cds_alt | |
435 boff = 0 # the offset to the difference | |
436 if len(self.cds_ref) < len(self.cds_alt): | |
437 bdiff = re.sub(self.cds_ref,'',self.cds_alt) | |
438 boff = self.cds_alt.find(bdiff) | |
439 elif len(self.cds_alt) < len(self.cds_ref): | |
440 bdiff = re.sub(self.cds_alt,'',self.cds_ref) | |
441 boff = self.cds_ref.find(bdiff) | |
442 if bdiff: | |
443 ## check the number of similar base neighboring the change | |
444 alt_seq = self.getVariantSequence() | |
445 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))] | |
446 ## adjust match length if the cps_pos is within polyA_limit form sequence end | |
447 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit)) | |
448 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff) | |
449 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times | |
450 if bdiff.find('A') >= 0: | |
451 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
452 match = re.match(pat,subseq) | |
453 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
454 if match: | |
455 return True | |
456 if bdiff.find('T') >= 0: | |
457 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
458 match = re.match(pat,subseq) | |
459 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
460 if match: | |
461 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups()) | |
462 return True | |
463 return False | |
464 def getReportHeader(): | |
465 return report_headings | |
466 def getReportFields(self): | |
467 gene_name = self.gene_name if self.gene_name else '' | |
468 cds_ref = self.cds_ref if self.cds_ref else '' | |
469 cds_alt = self.cds_alt if self.cds_alt else '' | |
470 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
471 if debug: print >> sys.stdout, "HGVS %s" % hgvs | |
472 var_aa = self.getVariantPeptide(toStopCodon=True) | |
473 freq = "%2.2f" % self.freq if self.freq else '' | |
474 depth = "%d" % self.depth if self.depth else '' | |
475 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else '' | |
476 aa_len = "%d" % self.aa_len if self.aa_len else '' | |
477 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript | |
478 stop_codon = self.alt_stop_codon if self.alt_stop_codon else '' | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
479 stop_region = self.alt_stop_region if self.alt_stop_region else '' |
0 | 480 chrpos = self.chrPos() |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
481 aa_change = self.amino_acid_change if (self.use_eff_aa_change and self.amino_acid_change) else hgvs[5] |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
482 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else ''] |
0 | 483 |
484 def __main__(): | |
485 | |
486 def printReportTsv(output_file, snpEffects): | |
487 if output_file: | |
488 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings) | |
489 for snpEffect in snpEffects: | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
490 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields() |
0 | 491 if not snpEffect.effect == 'FRAME_SHIFT': |
492 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
493 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA) | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
494 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide]) |
0 | 495 |
496 """ | |
497 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111 | |
498 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970 | |
499 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774 | |
500 """ | |
501 def printReportHtml(output_file, detail_dir, snpEffects): | |
502 if output_file: | |
503 print >> output_file, '<HTML><BODY>\n' | |
504 print >> output_file, '<TABLE BORDER="1">\n' | |
6
a64ef0611117
Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents:
5
diff
changeset
|
505 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Prevalence</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n' |
0 | 506 for snpEffect in snpEffects: |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
507 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields() |
0 | 508 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html' |
509 aa_ref = '#'.join([refname,aa_anchor]) | |
510 refpath = os.path.join(detail_dir,refname) | |
511 try: | |
512 ref_file = open(refpath,'w') | |
513 printEffDetailHtml(ref_file,snpEffect) | |
514 ref_file.close() | |
515 except Exception, e: | |
516 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) ) | |
517 if snpEffect.effect == 'FRAME_SHIFT': | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
518 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide) |
0 | 519 else: |
520 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
521 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA) |
0 | 522 print >> output_file, '</TABLE>\n' |
523 print >> output_file, '</BODY></HTML>\n' | |
524 | |
525 def printEffDetailHtml(output_file,snpEffect,wrap_len=60): | |
526 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
527 seq_len = len(coding_seq) | |
528 print >> output_file, '<HTML><BODY>\n' | |
529 print >> output_file, '<TABLE>\n' | |
530 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name | |
531 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt) | |
532 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript) | |
533 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand | |
534 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1)) | |
535 print >> output_file, '</TABLE>\n' | |
536 if alt_aa: | |
537 alt_aa_pos = pos / 3 | |
538 if coding_aa: | |
539 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))): | |
540 if alt_aa[j] != coding_aa[j]: | |
541 alt_aa_pos = j | |
542 break | |
543 alt_aa_end = 0 | |
544 for j in range(alt_aa_pos,len(alt_aa)): | |
545 if alt_aa[j] == '*': | |
546 alt_aa_end = j | |
547 break | |
548 seq = [] | |
549 for j in range(1,wrap_len+1): | |
550 seq.append('-' if j % 10 else '|') | |
551 hrule = '</TD><TD>'.join(seq) | |
552 print >> output_file, '<TABLE cellspacing="0">' | |
553 for i in range(0,seq_len,wrap_len): | |
554 e = min(i + wrap_len,seq_len) | |
555 sa = i/3 | |
556 ea = (i+wrap_len)/3 | |
557 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule) | |
558 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
559 for j in range(i,i + wrap_len): | |
560 if j < len(coding_seq): | |
561 if pos == j: | |
562 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j]) | |
563 elif pos <= j < pos + len(ref): | |
564 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j] | |
565 else: | |
566 print >> output_file, "<TD>%s</TD>" % coding_seq[j] | |
567 else: | |
568 print >> output_file, "<TD></TD>" | |
569 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
570 if coding_aa: | |
571 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
572 for j in range(sa,ea): | |
573 if j < len(coding_aa): | |
574 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j] | |
575 else: | |
576 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
577 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
578 if alt_aa: | |
579 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
580 for j in range(sa,ea): | |
581 if j < len(alt_aa): | |
582 if alt_aa_pos == j: | |
583 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j]) | |
584 elif alt_aa_pos < j <= alt_aa_end: | |
585 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j] | |
586 else: | |
587 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j] | |
588 else: | |
589 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
590 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
591 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
592 for j in range(i,i + wrap_len): | |
593 if j < len(alt_seq): | |
594 if pos <= j < pos + len(alt): | |
595 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j] | |
596 else: | |
597 print >> output_file, "<TD>%s</TD>" % alt_seq[j] | |
598 else: | |
599 print >> output_file, "<TD></TD>" | |
600 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
601 print >> output_file, "</TABLE>" | |
602 print >> output_file, "</BODY></HTML>" | |
603 | |
604 def printSeqText(output_file,snpEffect, wrap_len=120): | |
605 nw = 6 | |
606 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
607 if coding_seq: | |
608 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0) | |
609 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|' | |
610 for i in range(0,seq_len,wrap_len): | |
611 e = min(i + wrap_len,seq_len) | |
612 sa = i/3 | |
613 ea = e/3 | |
614 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'') | |
615 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e) | |
616 if coding_aa: | |
617 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea) | |
618 if alt_aa: | |
619 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea) | |
620 if alt_seq: | |
621 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e) | |
622 | |
623 # Parse the command line options | |
624 usage = "Usage: snpEff_cds_report.py filter [options]" | |
625 parser = optparse.OptionParser(usage = usage) | |
626 parser.add_option("-i", "--in", | |
627 action="store", type="string", | |
628 dest="vcfFile", help="input vcf file") | |
629 parser.add_option("-o", "--out", | |
630 action="store", type="string", | |
631 dest="output", help="output report file") | |
632 parser.add_option("-H", "--html_report", | |
633 action="store", type="string", | |
634 dest="html_file", help="html output report file") | |
635 parser.add_option("-D", "--html_dir", | |
636 action="store", type="string", | |
637 dest="html_dir", help="html output report file") | |
638 parser.add_option("-t", "--tsv_file", | |
639 action="store", type="string", | |
640 dest="tsv_file", help="TAB Separated Value (.tsv) output report file") | |
641 parser.add_option("-e", "--ensembl_host", | |
642 action="store", type="string", | |
643 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org') | |
644 parser.add_option("-f", "--effects_filter", | |
645 action="store", type="string", default=None, | |
646 dest="effects_filter", help="a list of effects to filter") | |
647 parser.add_option("-O", "--ensembl_dataset", | |
648 action="store", type="string", | |
649 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl') | |
650 parser.add_option("-p", "--polyA_limit", | |
651 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' ) | |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
652 parser.add_option('-S', '--snpeff_aa_change', dest='snpeff_aa_change', action='store_true', default=False, help='Report the SnpEff Amino_Acid_change field' ) |
0 | 653 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' ) |
654 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' ) | |
655 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
656 | |
657 (options, args) = parser.parse_args() | |
658 | |
659 debug = options.debug | |
660 ensembl_host = options.ensembl_host | |
661 | |
662 # Check that a single vcf file is given. | |
663 if options.vcfFile == None: | |
664 parser.print_help() | |
665 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report " | |
666 exit(1) | |
667 outputFile = None | |
668 outputHtml = None | |
669 detailDir = None | |
670 outputTsv = None | |
671 tmpHtmlName = None | |
672 tmpHtml = None | |
673 effects_list = [] | |
674 # Set the output file to stdout if no output file was specified. | |
675 if options.output == None and options.html_file == None and options.tsv_file == None: | |
676 outputFile = sys.stdout | |
677 if options.output != None: | |
678 output = os.path.abspath(options.output) | |
679 outputFile = open(output, 'w') | |
680 if options.tsv_file != None: | |
681 output = os.path.abspath(options.tsv_file) | |
682 outputTsv = open(output, 'w') | |
683 if options.html_file != None: | |
684 output = os.path.abspath(options.html_file) | |
685 outputHtml = open(output, 'w') | |
686 if options.html_dir == None: | |
687 detailDir = os.path.dirname(os.path.abspath(output)) | |
688 else: | |
689 detailDir = options.html_dir | |
690 if detailDir: | |
691 if not os.path.isdir(detailDir): | |
692 os.makedirs(detailDir) | |
693 if options.effects_filter: | |
694 eff_codes = options.effects_filter.split(',') | |
695 for val in eff_codes: | |
696 code = val.strip() | |
697 if code: | |
698 effects_list.append(code) | |
699 | |
700 # | |
701 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
702 try: | |
703 snpEffects = [] | |
704 homopolymers = [] | |
705 polya_count = 0 | |
706 polya_list = [] | |
707 fh = open( options.vcfFile ) | |
708 while True: | |
709 line = fh.readline() | |
710 if not line: | |
711 break #EOF | |
712 if line: | |
713 if outputFile: | |
714 print >> outputFile, "%s\n" % line | |
715 line = line.strip() | |
716 if len(line) < 1: | |
717 continue | |
718 elif line.startswith('#'): | |
719 # check for SnpEffVersion | |
720 """ | |
721 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
722 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
723 """ | |
724 if line.startswith('##SnpEffVersion='): | |
725 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line) | |
726 if m: | |
727 SnpEffVersion = m.group(2) | |
728 # check for SnpEff Info Description | |
729 """ | |
730 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
731 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
732 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
733 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
734 """ | |
735 pass | |
736 else: | |
737 if debug: print >> sys.stdout, "%s\n" % line | |
738 freq = None | |
739 depth = None | |
740 fields = line.split('\t') | |
741 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] | |
742 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) | |
743 if options.debug: print(fields) | |
744 for idx,alt in enumerate(alts.split(',')): | |
745 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts) | |
746 if not re.match('^[ATCG]*$',alt): | |
747 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt) | |
748 for info_item in info.split(';'): | |
749 try: | |
4
429a6b4df5e5
Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
750 if info_item.find('=') < 0: |
429a6b4df5e5
Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
751 continue |
0 | 752 (key,val) = info_item.split('=',1) |
753 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333; | |
754 freqs = info_item.split(',') | |
755 (k,v) = freqs[idx].split('=',1) | |
756 freq = v | |
757 elif key == 'DP': | |
758 depth = val | |
759 elif key == 'EFF': | |
760 transcript_ids = [] | |
761 effects = val.split(',') | |
762 ccds_dict = None | |
763 for effect in effects: | |
764 (eff,effs) = effect.rstrip(')').split('(') | |
765 eff_fields = effs.split('|') | |
766 if SnpEffVersion == '2': | |
767 """ | |
768 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
769 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
770 """ | |
771 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9] | |
772 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
773 """ | |
774 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
775 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
776 """ | |
777 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] | |
778 if transcript: | |
779 transcript_ids.append(transcript) | |
780 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids) | |
781 if options.with_ccds: | |
782 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids) | |
783 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict) | |
784 for effect in effects: | |
785 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth) | |
786 snpEffect.transcript_ids = transcript_ids | |
787 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion) | |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
788 if options.snpeff_aa_change: |
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
789 snpEffect.use_eff_aa_change = True |
0 | 790 if effects_list and not snpEffect.effect in effects_list: |
791 continue | |
792 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict): | |
793 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds): | |
794 if snpEffect.cds_pos: | |
795 if snpEffect.inHomopolymer(options.polyA_limit): | |
796 homopolymers.append(snpEffect) | |
797 else: | |
798 snpEffects.append(snpEffect) | |
799 if outputFile: | |
800 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields()) | |
801 printSeqText(outputFile,snpEffect) | |
802 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects: | |
803 ## Just report the first sequence returned for a vcf line | |
804 break | |
805 except Exception, e: | |
806 sys.stderr.write( "line: %s err: %s\n" % (line,e) ) | |
807 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers) | |
808 # Sort snpEffects | |
809 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True) | |
810 if outputHtml: | |
811 printReportHtml(outputHtml, detailDir, snpEffects) | |
812 if outputTsv: | |
813 printReportTsv(outputTsv,snpEffects) | |
814 except Exception, e: | |
815 print(str(e)) | |
816 | |
817 if __name__ == "__main__": __main__() | |
818 |