Mercurial > repos > jjohnson > snpeff_cds_report
annotate snpEff_cds_report.py @ 6:a64ef0611117
Change field header from "Penetrance" to "Prevalence"
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 13 Jun 2013 11:37:34 -0500 |
parents | 85b933b7d231 |
children | fbb6510186df |
rev | line source |
---|---|
0 | 1 #!/usr/bin/python |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
2 ## version 1.2 |
0 | 3 import sys,os,tempfile,re |
4 import httplib, urllib | |
5 import optparse | |
6 #import vcfClass | |
7 #from vcfClass import * | |
8 #import tools | |
9 #from tools import * | |
10 | |
11 debug = False | |
12 cds_anchor = 'cds_variation' | |
13 aa_anchor = 'aa_variation' | |
14 | |
15 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", | |
16 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", | |
17 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", | |
18 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", | |
19 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", | |
20 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", | |
21 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | |
22 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", | |
23 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", | |
24 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", | |
25 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", | |
26 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", | |
27 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", | |
28 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", | |
29 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", | |
30 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | |
31 | |
32 def reverseComplement(seq) : | |
33 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper() | |
34 return rev | |
35 | |
36 def translate(seq) : | |
37 rna = seq.upper().replace('T','U') | |
38 aa = [] | |
39 for i in range(0,len(rna) - 2, 3): | |
40 aa.append(codon_map[rna[i:i+3]]) | |
41 return ''.join(aa) | |
42 | |
43 """ | |
44 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand | |
45 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes. | |
46 """ | |
47 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?> | |
48 <!DOCTYPE Query> | |
49 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > | |
50 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
51 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
52 <Filter name = "with_ccds" excluded = "0"/> | |
53 <Attribute name = "ensembl_transcript_id" /> | |
54 <Attribute name = "ccds" /> | |
55 </Dataset> | |
56 </Query> | |
57 """ | |
58 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>' | |
59 query_template = """<?xml version="1.0" encoding="UTF-8"?> | |
60 <!DOCTYPE Query> | |
61 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" > | |
62 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
63 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
64 <Filter name = "biotype" value = "protein_coding"/> | |
65 __CCDS_FILTER_HERE__ | |
66 <Attribute name = "cdna" /> | |
67 <Attribute name = "ensembl_gene_id" /> | |
68 <Attribute name = "ensembl_transcript_id" /> | |
69 <Attribute name = "strand" /> | |
70 <Attribute name = "transcript_start" /> | |
71 <Attribute name = "transcript_end"/> | |
72 <Attribute name = "exon_chrom_start"/> | |
73 <Attribute name = "exon_chrom_end"/> | |
74 <Attribute name = "cdna_coding_start" /> | |
75 <Attribute name = "cdna_coding_end" /> | |
76 <Attribute name = "cds_length"/> | |
77 <Attribute name = "rank"/> | |
78 <Attribute name = "5_utr_start" /> | |
79 <Attribute name = "5_utr_end" /> | |
80 <Attribute name = "3_utr_start" /> | |
81 <Attribute name = "3_utr_end" /> | |
82 <Attribute name = "ensembl_peptide_id"/> | |
83 <Attribute name = "start_position" /> | |
84 <Attribute name = "end_position" /> | |
85 </Dataset> | |
86 </Query> | |
87 """ | |
88 """ | |
89 Columns(19): | |
90 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)'] | |
91 | |
92 0 cDNA sequence | |
93 1 Ensembl Gene ID | |
94 2 Ensembl Transcript ID | |
95 3 Strand | |
96 - 4 Transcript Start (bp) | |
97 - 5 Transcript End (bp) | |
98 6 Exon Chr Start (bp) | |
99 7 Exon Chr End (bp) | |
100 8 CDS Start | |
101 9 CDS End | |
102 10 CDS Length | |
103 - 11 Exon Rank in Transcript | |
104 12 5' UTR Start | |
105 13 5' UTR End | |
106 14 3' UTR Start | |
107 15 3' UTR End | |
108 - 16 Ensembl Protein ID | |
109 - 17 Gene Start (bp) | |
110 - 18 Gene End (bp) | |
111 """ | |
112 | |
113 # return {transcript_id : ccds_id} | |
114 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids): | |
115 ccds_dict = dict() | |
116 if transcript_ids: | |
117 query = query_ccds_template | |
118 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
119 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query) | |
120 params = urllib.urlencode({'query':query}) | |
121 headers = {"Accept": "text/plain"} | |
122 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query) | |
123 try: | |
124 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
125 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
126 conn.request("POST", "/biomart/martservice", params, headers) | |
127 response = conn.getresponse() | |
128 data = response.read() | |
129 if len(data) > 0: | |
130 # if debug: print >> sys.stdout, "%s\n\n" % data | |
131 lines = data.split('\n') | |
132 for line in lines: | |
133 fields = line.split('\t') | |
134 if len(fields) == 2: | |
135 (transcript_id,ccds) = fields | |
136 ccds_dict[transcript_id] = ccds | |
137 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines) | |
138 except Exception, e: | |
139 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) ) | |
140 return ccds_dict | |
141 | |
142 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True): | |
143 query = query_template | |
144 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
145 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query) | |
146 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query) | |
147 return query | |
148 | |
149 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence] | |
150 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True): | |
151 transcript_id = snpEffect.transcript | |
152 chr_pos = snpEffect.pos | |
153 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds) | |
154 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query) | |
155 params = urllib.urlencode({'query':query}) | |
156 headers = {"Accept": "text/plain"} | |
157 pos = snpEffect.pos | |
158 cds_pos = None # zero based offset | |
159 coding_sequence = '' | |
160 cds_ref = snpEffect.ref | |
161 cds_alt = snpEffect.alt | |
162 try: | |
163 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
164 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
165 conn.request("POST", "/biomart/martservice", params, headers) | |
166 response = conn.getresponse() | |
167 data = response.read() | |
168 if len(data) > 0: | |
169 # if debug: print >> sys.stdout, "%s\n\n" % data | |
170 lines = data.split('\n') | |
171 # Use the header line to determine the order of fields | |
172 line = lines[0] | |
173 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line | |
174 colnames = line.split('\t') | |
175 # for i in range(len(colnames)): | |
176 # | |
177 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames) | |
178 for line in lines[1:]: | |
179 if line == None or len(line) < 2: | |
180 continue | |
181 field = line.split('\t') | |
182 if len(field) != len(colnames): | |
183 continue | |
184 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line) | |
185 if field[10] == None or len(field[10]) < 1: | |
186 continue | |
187 if debug: | |
188 for i in range(len(colnames)): | |
189 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i]) | |
190 snpEffect.gene_id = field[1] | |
191 strand = '+' if int(field[3]) > 0 else '-' | |
192 snpEffect.strand = strand | |
193 # coding_sequence is first | |
194 if len(field) > 10 and re.match('[ATGCN]+',field[0]): | |
195 if field[10] == None or len(field[10]) < 1: | |
196 continue | |
197 cdna_seq = field[0] | |
198 in_utr = False | |
199 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence | |
200 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-')) | |
201 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-')) | |
202 utr_5_lens = [] | |
203 for i in range(len(utr_5_starts)): | |
204 utr_5_start = int(utr_5_starts[i]) | |
205 utr_5_end = int(utr_5_ends[i]) | |
206 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1) | |
207 if utr_5_start <= pos <= utr_5_end : | |
208 in_utr = True | |
209 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end); | |
210 break | |
211 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-')) | |
212 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-')) | |
213 for i in range(len(utr_3_starts)): | |
214 utr_3_start = int(utr_3_starts[i]) | |
215 utr_3_end = int(utr_3_ends[i]) | |
216 if utr_3_start <= pos <= utr_3_end : | |
217 in_utr = True | |
218 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end); | |
219 break | |
220 # Get coding sequence from cdna | |
221 cdna_length = int(field[10]) | |
222 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']')) | |
223 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']')) | |
224 cdna_lens = [] | |
225 for i in range(len(cdna_coding_start)): | |
226 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1) | |
227 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
228 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet | |
229 for i in range(len(cdna_coding_start)): | |
230 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1] | |
231 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]] | |
232 snpEffect.cds = coding_sequence | |
233 if coding_sequence and len(coding_sequence) >= 3: | |
234 snpEffect.cds_stop_codon = coding_sequence[-3:] | |
235 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence) | |
236 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
237 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens) | |
238 if not in_utr: | |
239 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-')) | |
240 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-')) | |
241 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-')) | |
242 exon_lens = [] | |
243 for i in range(len(exon_start)): | |
244 exon_lens.append(exon_end[i] - exon_start[i] + 1) | |
245 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens) | |
246 if debug: | |
247 bp_tot = 0 | |
248 for i in range(len(exon_start)): | |
249 exon_len = exon_end[i] - exon_start[i] + 1 | |
250 bp_tot += exon_len | |
251 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot) | |
252 cds_idx = 0 | |
253 for i in range(len(exon_start)): | |
254 # Does this exon have cds bases - i.e. not entirely in UTR | |
255 if len(utr_5_ends) > 0: | |
256 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]: | |
257 continue | |
258 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]: | |
259 continue | |
260 exon_len = exon_end[i] - exon_start[i] + 1 | |
261 if exon_start[i] <= pos <= exon_end[i]: | |
262 if strand == '+': | |
263 if cds_idx: # find offset from start of cdna_coding and exon | |
264 offset = pos - exon_start[i] | |
265 else: # find offset from end of cdna_coding and exon | |
266 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) ) | |
267 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset | |
268 else: # negative strand | |
269 cds_ref = reverseComplement(snpEffect.ref) | |
270 cds_alt = reverseComplement(snpEffect.alt) | |
271 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos | |
272 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1) | |
273 snpEffect.cds_pos = cds_pos | |
274 snpEffect.cds_ref = cds_ref | |
275 snpEffect.cds_alt = cds_alt | |
276 return snpEffect | |
277 else: | |
278 cds_idx += 1 | |
279 except Exception, e: | |
280 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) ) | |
281 finally: | |
282 if conn != None : | |
283 conn.close() | |
284 return None | |
285 | |
286 """ | |
287 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
288 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
289 """ | |
290 class SnpEffect( object ): | |
6
a64ef0611117
Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents:
5
diff
changeset
|
291 report_headings = ['Gene','Variant Position','Reference','Variation','Prevalence','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation'] |
0 | 292 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False): |
293 self.chrom = chrom | |
294 self.pos = int(pos) | |
295 self.ref = ref | |
296 self.alt = alt | |
297 self.freq = float(freq) if freq else None | |
298 self.depth = int(depth) if depth else None | |
299 # From SnpEffect VCF String | |
300 self.effect = None | |
301 self.effect_impact = None | |
302 self.functional_class = None | |
303 self.codon_change = None | |
304 self.amino_acid_change = None | |
305 self.amino_acid_length = None | |
306 self.gene_name = None | |
307 self.gene_id = None | |
308 self.gene_biotype = None | |
309 self.coding = None | |
310 self.transcript = None | |
311 self.transcript_ids = None | |
312 self.exon = None | |
313 self.cds_stop_codon = None | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
314 self.cds_stop_pos = None |
0 | 315 # retrieve from ensembl |
316 self.strand = None | |
317 self.cds_pos = None | |
318 self.cds_ref = None | |
319 self.cds_alt = None | |
320 self.cds = None | |
321 self.aa_pos = None | |
322 self.aa_len = None | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
323 self.alt_stop_pos = None |
0 | 324 self.alt_stop_codon = None |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
325 self.alt_stop_region = None ## includes base before and after alt_stop_codon |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
326 self.use_eff_aa_change = False |
0 | 327 def chrPos(self): |
328 return "%s:%s" % (self.chrom,self.pos) | |
329 def setEffect(self, effect, snpEffVersion = None): | |
330 if snpEffVersion and snpEffVersion == '2': | |
331 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9] | |
332 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
333 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10] | |
334 self.amino_acid_length = amino_acid_length | |
335 self.effect_impact = effect_impact | |
336 self.functional_class = functional_class | |
337 self.codon_change = codon_change | |
338 self.amino_acid_change = amino_acid_change | |
339 self.gene_name = gene_name | |
340 self.gene_biotype = gene_biotype | |
341 self.coding = coding | |
342 self.transcript = transcript | |
343 self.exon = exon | |
344 def parseEffect(self, effect, snpEffVersion = None): | |
345 (eff,effs) = effect.rstrip(')').split('(') | |
346 self.effect = eff | |
347 eff_fields = effs.split('|') | |
348 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields) | |
349 self.setEffect(eff_fields, snpEffVersion) | |
350 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False): | |
351 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds) | |
352 return len(self.cds) > 0 if self.cds else False | |
353 def score(self): | |
354 return self.freq * self.depth if self.freq and self.depth else -1 | |
355 def getCodingSequence(self): | |
356 return self.cds | |
357 def getVariantSequence(self): | |
358 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt: | |
359 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):] | |
360 else: | |
361 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds) | |
362 return None | |
363 def getCodingAminoSequence(self): | |
364 seq = translate(self.cds) if self.cds else None | |
365 if seq: | |
366 self.aa_len = len(seq) | |
367 return seq | |
368 def getVariantAminoSequence(self): | |
369 variant_seq = self.getVariantSequence() | |
370 return translate(variant_seq) if variant_seq else None | |
371 def getVariantPeptide(self,toStopCodon=True): | |
372 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() | |
373 if var_aa: | |
374 if toStopCodon: | |
375 end_pos = var_aa_end | |
376 else: | |
377 end_pos = len(var_aa) | |
378 novel_peptide = var_aa[var_aa_pos:end_pos] | |
379 return novel_peptide | |
380 return None | |
381 # [preAA,varAA,postAA, varPeptide, varOffset, subAA] | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
382 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True): |
0 | 383 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() |
384 if var_aa: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
385 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0 |
0 | 386 if toStopCodon: |
387 end_pos = var_aa_end | |
388 else: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
389 end_offset = end_count + 1 |
0 | 390 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end |
391 try: | |
392 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' | |
393 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos) | |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
394 mutation = "%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA) |
0 | 395 preAA = var_aa[start_pos:var_aa_pos] # N-term side |
396 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side | |
397 novel_peptide = var_aa[start_pos:end_pos] | |
398 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation] | |
399 except Exception, e: | |
400 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) ) | |
401 return None | |
402 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop] | |
403 # positions are 0-based array indexes | |
404 def getSequences(self): | |
405 coding_dna = self.getCodingSequence() | |
406 coding_aa = self.getCodingAminoSequence() | |
407 var_dna = self.getVariantSequence() | |
408 var_aa = self.getVariantAminoSequence() | |
409 var_aa_pos = None | |
410 var_aa_end = None | |
411 if var_aa: | |
412 var_aa_pos = self.cds_pos / 3 | |
413 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))): | |
414 if var_aa[j] != coding_aa[j]: | |
415 var_aa_pos = j | |
416 break | |
417 self.aa_pos = var_aa_pos | |
418 var_stop = var_aa.find('*',var_aa_pos) | |
419 if var_stop < 0: | |
420 var_aa_end = len(var_aa) | |
421 else: | |
422 var_aa_end = var_stop + 1 # include the stop codon | |
423 stop_pos = var_stop * 3 | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
424 self.alt_stop_pos = stop_pos |
0 | 425 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3] |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
426 self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \ |
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
427 self.alt_stop_codon + \ |
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
428 ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3 else '') |
0 | 429 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end] |
430 def inHomopolymer(self,polyA_limit): | |
431 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region | |
432 ## check if base changes at cds_pos involve A or T | |
433 bdiff = None # the difference between the cds_ref and cds_alt | |
434 boff = 0 # the offset to the difference | |
435 if len(self.cds_ref) < len(self.cds_alt): | |
436 bdiff = re.sub(self.cds_ref,'',self.cds_alt) | |
437 boff = self.cds_alt.find(bdiff) | |
438 elif len(self.cds_alt) < len(self.cds_ref): | |
439 bdiff = re.sub(self.cds_alt,'',self.cds_ref) | |
440 boff = self.cds_ref.find(bdiff) | |
441 if bdiff: | |
442 ## check the number of similar base neighboring the change | |
443 alt_seq = self.getVariantSequence() | |
444 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))] | |
445 ## adjust match length if the cps_pos is within polyA_limit form sequence end | |
446 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit)) | |
447 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff) | |
448 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times | |
449 if bdiff.find('A') >= 0: | |
450 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
451 match = re.match(pat,subseq) | |
452 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
453 if match: | |
454 return True | |
455 if bdiff.find('T') >= 0: | |
456 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
457 match = re.match(pat,subseq) | |
458 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
459 if match: | |
460 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups()) | |
461 return True | |
462 return False | |
463 def getReportHeader(): | |
464 return report_headings | |
465 def getReportFields(self): | |
466 gene_name = self.gene_name if self.gene_name else '' | |
467 cds_ref = self.cds_ref if self.cds_ref else '' | |
468 cds_alt = self.cds_alt if self.cds_alt else '' | |
469 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
470 if debug: print >> sys.stdout, "HGVS %s" % hgvs | |
471 var_aa = self.getVariantPeptide(toStopCodon=True) | |
472 freq = "%2.2f" % self.freq if self.freq else '' | |
473 depth = "%d" % self.depth if self.depth else '' | |
474 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else '' | |
475 aa_len = "%d" % self.aa_len if self.aa_len else '' | |
476 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript | |
477 stop_codon = self.alt_stop_codon if self.alt_stop_codon else '' | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
478 stop_region = self.alt_stop_region if self.alt_stop_region else '' |
0 | 479 chrpos = self.chrPos() |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
480 aa_change = self.amino_acid_change if (self.use_eff_aa_change and self.amino_acid_change) else hgvs[5] |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
481 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else ''] |
0 | 482 |
483 def __main__(): | |
484 | |
485 def printReportTsv(output_file, snpEffects): | |
486 if output_file: | |
487 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings) | |
488 for snpEffect in snpEffects: | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
489 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields() |
0 | 490 if not snpEffect.effect == 'FRAME_SHIFT': |
491 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
492 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA) | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
493 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide]) |
0 | 494 |
495 """ | |
496 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111 | |
497 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970 | |
498 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774 | |
499 """ | |
500 def printReportHtml(output_file, detail_dir, snpEffects): | |
501 if output_file: | |
502 print >> output_file, '<HTML><BODY>\n' | |
503 print >> output_file, '<TABLE BORDER="1">\n' | |
6
a64ef0611117
Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents:
5
diff
changeset
|
504 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Prevalence</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n' |
0 | 505 for snpEffect in snpEffects: |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
506 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields() |
0 | 507 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html' |
508 aa_ref = '#'.join([refname,aa_anchor]) | |
509 refpath = os.path.join(detail_dir,refname) | |
510 try: | |
511 ref_file = open(refpath,'w') | |
512 printEffDetailHtml(ref_file,snpEffect) | |
513 ref_file.close() | |
514 except Exception, e: | |
515 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) ) | |
516 if snpEffect.effect == 'FRAME_SHIFT': | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
517 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide) |
0 | 518 else: |
519 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
520 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA) |
0 | 521 print >> output_file, '</TABLE>\n' |
522 print >> output_file, '</BODY></HTML>\n' | |
523 | |
524 def printEffDetailHtml(output_file,snpEffect,wrap_len=60): | |
525 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
526 seq_len = len(coding_seq) | |
527 print >> output_file, '<HTML><BODY>\n' | |
528 print >> output_file, '<TABLE>\n' | |
529 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name | |
530 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt) | |
531 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript) | |
532 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand | |
533 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1)) | |
534 print >> output_file, '</TABLE>\n' | |
535 if alt_aa: | |
536 alt_aa_pos = pos / 3 | |
537 if coding_aa: | |
538 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))): | |
539 if alt_aa[j] != coding_aa[j]: | |
540 alt_aa_pos = j | |
541 break | |
542 alt_aa_end = 0 | |
543 for j in range(alt_aa_pos,len(alt_aa)): | |
544 if alt_aa[j] == '*': | |
545 alt_aa_end = j | |
546 break | |
547 seq = [] | |
548 for j in range(1,wrap_len+1): | |
549 seq.append('-' if j % 10 else '|') | |
550 hrule = '</TD><TD>'.join(seq) | |
551 print >> output_file, '<TABLE cellspacing="0">' | |
552 for i in range(0,seq_len,wrap_len): | |
553 e = min(i + wrap_len,seq_len) | |
554 sa = i/3 | |
555 ea = (i+wrap_len)/3 | |
556 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule) | |
557 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
558 for j in range(i,i + wrap_len): | |
559 if j < len(coding_seq): | |
560 if pos == j: | |
561 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j]) | |
562 elif pos <= j < pos + len(ref): | |
563 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j] | |
564 else: | |
565 print >> output_file, "<TD>%s</TD>" % coding_seq[j] | |
566 else: | |
567 print >> output_file, "<TD></TD>" | |
568 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
569 if coding_aa: | |
570 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
571 for j in range(sa,ea): | |
572 if j < len(coding_aa): | |
573 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j] | |
574 else: | |
575 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
576 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
577 if alt_aa: | |
578 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
579 for j in range(sa,ea): | |
580 if j < len(alt_aa): | |
581 if alt_aa_pos == j: | |
582 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j]) | |
583 elif alt_aa_pos < j <= alt_aa_end: | |
584 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j] | |
585 else: | |
586 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j] | |
587 else: | |
588 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
589 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
590 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
591 for j in range(i,i + wrap_len): | |
592 if j < len(alt_seq): | |
593 if pos <= j < pos + len(alt): | |
594 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j] | |
595 else: | |
596 print >> output_file, "<TD>%s</TD>" % alt_seq[j] | |
597 else: | |
598 print >> output_file, "<TD></TD>" | |
599 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
600 print >> output_file, "</TABLE>" | |
601 print >> output_file, "</BODY></HTML>" | |
602 | |
603 def printSeqText(output_file,snpEffect, wrap_len=120): | |
604 nw = 6 | |
605 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
606 if coding_seq: | |
607 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0) | |
608 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|' | |
609 for i in range(0,seq_len,wrap_len): | |
610 e = min(i + wrap_len,seq_len) | |
611 sa = i/3 | |
612 ea = e/3 | |
613 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'') | |
614 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e) | |
615 if coding_aa: | |
616 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea) | |
617 if alt_aa: | |
618 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea) | |
619 if alt_seq: | |
620 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e) | |
621 | |
622 # Parse the command line options | |
623 usage = "Usage: snpEff_cds_report.py filter [options]" | |
624 parser = optparse.OptionParser(usage = usage) | |
625 parser.add_option("-i", "--in", | |
626 action="store", type="string", | |
627 dest="vcfFile", help="input vcf file") | |
628 parser.add_option("-o", "--out", | |
629 action="store", type="string", | |
630 dest="output", help="output report file") | |
631 parser.add_option("-H", "--html_report", | |
632 action="store", type="string", | |
633 dest="html_file", help="html output report file") | |
634 parser.add_option("-D", "--html_dir", | |
635 action="store", type="string", | |
636 dest="html_dir", help="html output report file") | |
637 parser.add_option("-t", "--tsv_file", | |
638 action="store", type="string", | |
639 dest="tsv_file", help="TAB Separated Value (.tsv) output report file") | |
640 parser.add_option("-e", "--ensembl_host", | |
641 action="store", type="string", | |
642 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org') | |
643 parser.add_option("-f", "--effects_filter", | |
644 action="store", type="string", default=None, | |
645 dest="effects_filter", help="a list of effects to filter") | |
646 parser.add_option("-O", "--ensembl_dataset", | |
647 action="store", type="string", | |
648 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl') | |
649 parser.add_option("-p", "--polyA_limit", | |
650 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' ) | |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
651 parser.add_option('-S', '--snpeff_aa_change', dest='snpeff_aa_change', action='store_true', default=False, help='Report the SnpEff Amino_Acid_change field' ) |
0 | 652 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' ) |
653 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' ) | |
654 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
655 | |
656 (options, args) = parser.parse_args() | |
657 | |
658 debug = options.debug | |
659 ensembl_host = options.ensembl_host | |
660 | |
661 # Check that a single vcf file is given. | |
662 if options.vcfFile == None: | |
663 parser.print_help() | |
664 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report " | |
665 exit(1) | |
666 outputFile = None | |
667 outputHtml = None | |
668 detailDir = None | |
669 outputTsv = None | |
670 tmpHtmlName = None | |
671 tmpHtml = None | |
672 effects_list = [] | |
673 # Set the output file to stdout if no output file was specified. | |
674 if options.output == None and options.html_file == None and options.tsv_file == None: | |
675 outputFile = sys.stdout | |
676 if options.output != None: | |
677 output = os.path.abspath(options.output) | |
678 outputFile = open(output, 'w') | |
679 if options.tsv_file != None: | |
680 output = os.path.abspath(options.tsv_file) | |
681 outputTsv = open(output, 'w') | |
682 if options.html_file != None: | |
683 output = os.path.abspath(options.html_file) | |
684 outputHtml = open(output, 'w') | |
685 if options.html_dir == None: | |
686 detailDir = os.path.dirname(os.path.abspath(output)) | |
687 else: | |
688 detailDir = options.html_dir | |
689 if detailDir: | |
690 if not os.path.isdir(detailDir): | |
691 os.makedirs(detailDir) | |
692 if options.effects_filter: | |
693 eff_codes = options.effects_filter.split(',') | |
694 for val in eff_codes: | |
695 code = val.strip() | |
696 if code: | |
697 effects_list.append(code) | |
698 | |
699 # | |
700 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
701 try: | |
702 snpEffects = [] | |
703 homopolymers = [] | |
704 polya_count = 0 | |
705 polya_list = [] | |
706 fh = open( options.vcfFile ) | |
707 while True: | |
708 line = fh.readline() | |
709 if not line: | |
710 break #EOF | |
711 if line: | |
712 if outputFile: | |
713 print >> outputFile, "%s\n" % line | |
714 line = line.strip() | |
715 if len(line) < 1: | |
716 continue | |
717 elif line.startswith('#'): | |
718 # check for SnpEffVersion | |
719 """ | |
720 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
721 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
722 """ | |
723 if line.startswith('##SnpEffVersion='): | |
724 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line) | |
725 if m: | |
726 SnpEffVersion = m.group(2) | |
727 # check for SnpEff Info Description | |
728 """ | |
729 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
730 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
731 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
732 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
733 """ | |
734 pass | |
735 else: | |
736 if debug: print >> sys.stdout, "%s\n" % line | |
737 freq = None | |
738 depth = None | |
739 fields = line.split('\t') | |
740 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] | |
741 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) | |
742 if options.debug: print(fields) | |
743 for idx,alt in enumerate(alts.split(',')): | |
744 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts) | |
745 if not re.match('^[ATCG]*$',alt): | |
746 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt) | |
747 for info_item in info.split(';'): | |
748 try: | |
4
429a6b4df5e5
Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
749 if info_item.find('=') < 0: |
429a6b4df5e5
Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
750 continue |
0 | 751 (key,val) = info_item.split('=',1) |
752 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333; | |
753 freqs = info_item.split(',') | |
754 (k,v) = freqs[idx].split('=',1) | |
755 freq = v | |
756 elif key == 'DP': | |
757 depth = val | |
758 elif key == 'EFF': | |
759 transcript_ids = [] | |
760 effects = val.split(',') | |
761 ccds_dict = None | |
762 for effect in effects: | |
763 (eff,effs) = effect.rstrip(')').split('(') | |
764 eff_fields = effs.split('|') | |
765 if SnpEffVersion == '2': | |
766 """ | |
767 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
768 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
769 """ | |
770 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9] | |
771 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
772 """ | |
773 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
774 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
775 """ | |
776 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] | |
777 if transcript: | |
778 transcript_ids.append(transcript) | |
779 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids) | |
780 if options.with_ccds: | |
781 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids) | |
782 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict) | |
783 for effect in effects: | |
784 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth) | |
785 snpEffect.transcript_ids = transcript_ids | |
786 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion) | |
5
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
787 if options.snpeff_aa_change: |
85b933b7d231
Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents:
4
diff
changeset
|
788 snpEffect.use_eff_aa_change = True |
0 | 789 if effects_list and not snpEffect.effect in effects_list: |
790 continue | |
791 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict): | |
792 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds): | |
793 if snpEffect.cds_pos: | |
794 if snpEffect.inHomopolymer(options.polyA_limit): | |
795 homopolymers.append(snpEffect) | |
796 else: | |
797 snpEffects.append(snpEffect) | |
798 if outputFile: | |
799 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields()) | |
800 printSeqText(outputFile,snpEffect) | |
801 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects: | |
802 ## Just report the first sequence returned for a vcf line | |
803 break | |
804 except Exception, e: | |
805 sys.stderr.write( "line: %s err: %s\n" % (line,e) ) | |
806 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers) | |
807 # Sort snpEffects | |
808 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True) | |
809 if outputHtml: | |
810 printReportHtml(outputHtml, detailDir, snpEffects) | |
811 if outputTsv: | |
812 printReportTsv(outputTsv,snpEffects) | |
813 except Exception, e: | |
814 print(str(e)) | |
815 | |
816 if __name__ == "__main__": __main__() | |
817 |