annotate snpEff_cds_report.py @ 6:a64ef0611117

Change field header from "Penetrance" to "Prevalence"
author Jim Johnson <jj@umn.edu>
date Thu, 13 Jun 2013 11:37:34 -0500
parents 85b933b7d231
children fbb6510186df
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/python
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
2 ## version 1.2
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
3 import sys,os,tempfile,re
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
4 import httplib, urllib
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
5 import optparse
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
6 #import vcfClass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
7 #from vcfClass import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
8 #import tools
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
9 #from tools import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
10
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
11 debug = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
12 cds_anchor = 'cds_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
13 aa_anchor = 'aa_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
14
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
15 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
16 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
17 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
18 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
19 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
20 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
21 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
22 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
23 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
24 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
25 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
26 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
27 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
28 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
29 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
30 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
31
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
32 def reverseComplement(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
33 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
34 return rev
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
35
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
36 def translate(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
37 rna = seq.upper().replace('T','U')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
38 aa = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
39 for i in range(0,len(rna) - 2, 3):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
40 aa.append(codon_map[rna[i:i+3]])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
41 return ''.join(aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
42
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
43 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
44 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
45 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
46 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
47 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
48 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
49 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
50 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
51 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
52 <Filter name = "with_ccds" excluded = "0"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
53 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
54 <Attribute name = "ccds" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
55 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
56 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
57 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
58 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
59 query_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
60 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
61 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
62 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
63 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
64 <Filter name = "biotype" value = "protein_coding"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
65 __CCDS_FILTER_HERE__
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
66 <Attribute name = "cdna" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
67 <Attribute name = "ensembl_gene_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
68 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
69 <Attribute name = "strand" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
70 <Attribute name = "transcript_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
71 <Attribute name = "transcript_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
72 <Attribute name = "exon_chrom_start"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
73 <Attribute name = "exon_chrom_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
74 <Attribute name = "cdna_coding_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
75 <Attribute name = "cdna_coding_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
76 <Attribute name = "cds_length"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
77 <Attribute name = "rank"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
78 <Attribute name = "5_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
79 <Attribute name = "5_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
80 <Attribute name = "3_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
81 <Attribute name = "3_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
82 <Attribute name = "ensembl_peptide_id"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
83 <Attribute name = "start_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
84 <Attribute name = "end_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
85 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
86 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
87 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
88 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
89 Columns(19):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
90 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
91
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
92 0 cDNA sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
93 1 Ensembl Gene ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
94 2 Ensembl Transcript ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
95 3 Strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
96 - 4 Transcript Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
97 - 5 Transcript End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
98 6 Exon Chr Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
99 7 Exon Chr End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
100 8 CDS Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
101 9 CDS End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
102 10 CDS Length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
103 - 11 Exon Rank in Transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
104 12 5' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
105 13 5' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
106 14 3' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
107 15 3' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
108 - 16 Ensembl Protein ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
109 - 17 Gene Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
110 - 18 Gene End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
111 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
112
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
113 # return {transcript_id : ccds_id}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
114 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
115 ccds_dict = dict()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
116 if transcript_ids:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
117 query = query_ccds_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
118 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
119 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
120 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
121 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
122 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
123 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
124 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
125 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
126 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
127 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
128 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
129 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
130 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
131 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
132 for line in lines:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
133 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
134 if len(fields) == 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
135 (transcript_id,ccds) = fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
136 ccds_dict[transcript_id] = ccds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
137 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
138 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
139 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
140 return ccds_dict
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
141
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
142 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
143 query = query_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
144 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
145 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
146 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
147 return query
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
148
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
149 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
150 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
151 transcript_id = snpEffect.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
152 chr_pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
153 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
154 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
155 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
156 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
157 pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
158 cds_pos = None # zero based offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
159 coding_sequence = ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
160 cds_ref = snpEffect.ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
161 cds_alt = snpEffect.alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
162 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
163 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
164 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
165 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
166 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
167 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
168 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
169 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
170 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
171 # Use the header line to determine the order of fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
172 line = lines[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
173 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
174 colnames = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
175 # for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
176 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
177 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
178 for line in lines[1:]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
179 if line == None or len(line) < 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
180 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
181 field = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
182 if len(field) != len(colnames):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
183 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
184 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
185 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
186 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
187 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
188 for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
189 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
190 snpEffect.gene_id = field[1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
191 strand = '+' if int(field[3]) > 0 else '-'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
192 snpEffect.strand = strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
193 # coding_sequence is first
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
194 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
195 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
196 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
197 cdna_seq = field[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
198 in_utr = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
199 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
200 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
201 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
202 utr_5_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
203 for i in range(len(utr_5_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
204 utr_5_start = int(utr_5_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
205 utr_5_end = int(utr_5_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
206 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
207 if utr_5_start <= pos <= utr_5_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
208 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
209 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
210 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
211 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
212 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
213 for i in range(len(utr_3_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
214 utr_3_start = int(utr_3_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
215 utr_3_end = int(utr_3_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
216 if utr_3_start <= pos <= utr_3_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
217 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
218 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
219 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
220 # Get coding sequence from cdna
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
221 cdna_length = int(field[10])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
222 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
223 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
224 cdna_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
225 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
226 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
227 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
228 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
229 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
230 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
231 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
232 snpEffect.cds = coding_sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
233 if coding_sequence and len(coding_sequence) >= 3:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
234 snpEffect.cds_stop_codon = coding_sequence[-3:]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
235 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
236 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
237 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
238 if not in_utr:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
239 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
240 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
241 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
242 exon_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
243 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
244 exon_lens.append(exon_end[i] - exon_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
245 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
246 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
247 bp_tot = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
248 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
249 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
250 bp_tot += exon_len
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
251 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
252 cds_idx = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
253 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
254 # Does this exon have cds bases - i.e. not entirely in UTR
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
255 if len(utr_5_ends) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
256 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
257 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
258 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
259 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
260 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
261 if exon_start[i] <= pos <= exon_end[i]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
262 if strand == '+':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
263 if cds_idx: # find offset from start of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
264 offset = pos - exon_start[i]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
265 else: # find offset from end of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
266 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
267 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
268 else: # negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
269 cds_ref = reverseComplement(snpEffect.ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
270 cds_alt = reverseComplement(snpEffect.alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
271 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
272 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
273 snpEffect.cds_pos = cds_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
274 snpEffect.cds_ref = cds_ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
275 snpEffect.cds_alt = cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
276 return snpEffect
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
277 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
278 cds_idx += 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
279 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
280 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
281 finally:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
282 if conn != None :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
283 conn.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
284 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
285
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
286 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
287 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
288 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
289 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
290 class SnpEffect( object ):
6
a64ef0611117 Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents: 5
diff changeset
291 report_headings = ['Gene','Variant Position','Reference','Variation','Prevalence','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation']
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
292 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
293 self.chrom = chrom
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
294 self.pos = int(pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
295 self.ref = ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
296 self.alt = alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
297 self.freq = float(freq) if freq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
298 self.depth = int(depth) if depth else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
299 # From SnpEffect VCF String
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
300 self.effect = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
301 self.effect_impact = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
302 self.functional_class = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
303 self.codon_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
304 self.amino_acid_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
305 self.amino_acid_length = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
306 self.gene_name = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
307 self.gene_id = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
308 self.gene_biotype = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
309 self.coding = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
310 self.transcript = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
311 self.transcript_ids = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
312 self.exon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
313 self.cds_stop_codon = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
314 self.cds_stop_pos = None
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
315 # retrieve from ensembl
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
316 self.strand = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
317 self.cds_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
318 self.cds_ref = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
319 self.cds_alt = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
320 self.cds = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
321 self.aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
322 self.aa_len = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
323 self.alt_stop_pos = None
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
324 self.alt_stop_codon = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
325 self.alt_stop_region = None ## includes base before and after alt_stop_codon
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
326 self.use_eff_aa_change = False
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
327 def chrPos(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
328 return "%s:%s" % (self.chrom,self.pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
329 def setEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
330 if snpEffVersion and snpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
331 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
332 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
333 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
334 self.amino_acid_length = amino_acid_length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
335 self.effect_impact = effect_impact
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
336 self.functional_class = functional_class
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
337 self.codon_change = codon_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
338 self.amino_acid_change = amino_acid_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
339 self.gene_name = gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
340 self.gene_biotype = gene_biotype
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
341 self.coding = coding
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
342 self.transcript = transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
343 self.exon = exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
344 def parseEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
345 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
346 self.effect = eff
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
347 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
348 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
349 self.setEffect(eff_fields, snpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
350 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
351 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
352 return len(self.cds) > 0 if self.cds else False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
353 def score(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
354 return self.freq * self.depth if self.freq and self.depth else -1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
355 def getCodingSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
356 return self.cds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
357 def getVariantSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
358 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
359 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
360 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
361 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
362 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
363 def getCodingAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
364 seq = translate(self.cds) if self.cds else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
365 if seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
366 self.aa_len = len(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
367 return seq
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
368 def getVariantAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
369 variant_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
370 return translate(variant_seq) if variant_seq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
371 def getVariantPeptide(self,toStopCodon=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
372 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
373 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
374 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
375 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
376 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
377 end_pos = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
378 novel_peptide = var_aa[var_aa_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
379 return novel_peptide
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
380 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
381 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
382 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True):
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
383 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
384 if var_aa:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
385 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
386 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
387 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
388 else:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
389 end_offset = end_count + 1
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
390 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
391 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
392 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
393 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
394 mutation = "%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
395 preAA = var_aa[start_pos:var_aa_pos] # N-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
396 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
397 novel_peptide = var_aa[start_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
398 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
399 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
400 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
401 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
402 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
403 # positions are 0-based array indexes
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
404 def getSequences(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
405 coding_dna = self.getCodingSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
406 coding_aa = self.getCodingAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
407 var_dna = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
408 var_aa = self.getVariantAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
409 var_aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
410 var_aa_end = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
411 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
412 var_aa_pos = self.cds_pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
413 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
414 if var_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
415 var_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
416 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
417 self.aa_pos = var_aa_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
418 var_stop = var_aa.find('*',var_aa_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
419 if var_stop < 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
420 var_aa_end = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
421 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
422 var_aa_end = var_stop + 1 # include the stop codon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
423 stop_pos = var_stop * 3
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
424 self.alt_stop_pos = stop_pos
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
425 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
426 self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
427 self.alt_stop_codon + \
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
428 ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3 else '')
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
429 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
430 def inHomopolymer(self,polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
431 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
432 ## check if base changes at cds_pos involve A or T
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
433 bdiff = None # the difference between the cds_ref and cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
434 boff = 0 # the offset to the difference
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
435 if len(self.cds_ref) < len(self.cds_alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
436 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
437 boff = self.cds_alt.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
438 elif len(self.cds_alt) < len(self.cds_ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
439 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
440 boff = self.cds_ref.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
441 if bdiff:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
442 ## check the number of similar base neighboring the change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
443 alt_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
444 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
445 ## adjust match length if the cps_pos is within polyA_limit form sequence end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
446 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
447 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
448 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
449 if bdiff.find('A') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
450 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
451 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
452 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
453 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
454 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
455 if bdiff.find('T') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
456 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
457 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
458 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
459 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
460 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
461 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
462 return False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
463 def getReportHeader():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
464 return report_headings
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
465 def getReportFields(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
466 gene_name = self.gene_name if self.gene_name else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
467 cds_ref = self.cds_ref if self.cds_ref else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
468 cds_alt = self.cds_alt if self.cds_alt else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
469 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
470 if debug: print >> sys.stdout, "HGVS %s" % hgvs
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
471 var_aa = self.getVariantPeptide(toStopCodon=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
472 freq = "%2.2f" % self.freq if self.freq else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
473 depth = "%d" % self.depth if self.depth else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
474 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
475 aa_len = "%d" % self.aa_len if self.aa_len else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
476 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
477 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
478 stop_region = self.alt_stop_region if self.alt_stop_region else ''
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
479 chrpos = self.chrPos()
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
480 aa_change = self.amino_acid_change if (self.use_eff_aa_change and self.amino_acid_change) else hgvs[5]
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
481 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else '']
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
482
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
483 def __main__():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
484
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
485 def printReportTsv(output_file, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
486 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
487 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
488 for snpEffect in snpEffects:
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
489 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
490 if not snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
491 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
492 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
493 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide])
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
494
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
495 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
496 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
497 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
498 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
499 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
500 def printReportHtml(output_file, detail_dir, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
501 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
502 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
503 print >> output_file, '<TABLE BORDER="1">\n'
6
a64ef0611117 Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents: 5
diff changeset
504 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Prevalence</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
505 for snpEffect in snpEffects:
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
506 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
507 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
508 aa_ref = '#'.join([refname,aa_anchor])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
509 refpath = os.path.join(detail_dir,refname)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
510 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
511 ref_file = open(refpath,'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
512 printEffDetailHtml(ref_file,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
513 ref_file.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
514 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
515 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
516 if snpEffect.effect == 'FRAME_SHIFT':
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
517 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
518 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
519 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
520 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
521 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
522 print >> output_file, '</BODY></HTML>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
523
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
524 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
525 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
526 seq_len = len(coding_seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
527 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
528 print >> output_file, '<TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
529 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
530 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
531 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
532 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
533 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
534 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
535 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
536 alt_aa_pos = pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
537 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
538 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
539 if alt_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
540 alt_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
541 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
542 alt_aa_end = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
543 for j in range(alt_aa_pos,len(alt_aa)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
544 if alt_aa[j] == '*':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
545 alt_aa_end = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
546 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
547 seq = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
548 for j in range(1,wrap_len+1):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
549 seq.append('-' if j % 10 else '|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
550 hrule = '</TD><TD>'.join(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
551 print >> output_file, '<TABLE cellspacing="0">'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
552 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
553 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
554 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
555 ea = (i+wrap_len)/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
556 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
557 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
558 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
559 if j < len(coding_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
560 if pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
561 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
562 elif pos <= j < pos + len(ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
563 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
564 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
565 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
566 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
567 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
568 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
569 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
570 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
571 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
572 if j < len(coding_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
573 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
574 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
575 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
576 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
577 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
578 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
579 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
580 if j < len(alt_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
581 if alt_aa_pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
582 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
583 elif alt_aa_pos < j <= alt_aa_end:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
584 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
585 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
586 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
587 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
588 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
589 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
590 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
591 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
592 if j < len(alt_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
593 if pos <= j < pos + len(alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
594 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
595 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
596 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
597 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
598 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
599 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
600 print >> output_file, "</TABLE>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
601 print >> output_file, "</BODY></HTML>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
602
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
603 def printSeqText(output_file,snpEffect, wrap_len=120):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
604 nw = 6
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
605 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
606 if coding_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
607 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
608 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
609 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
610 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
611 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
612 ea = e/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
613 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
614 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
615 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
616 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
617 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
618 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
619 if alt_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
620 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
621
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
622 # Parse the command line options
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
623 usage = "Usage: snpEff_cds_report.py filter [options]"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
624 parser = optparse.OptionParser(usage = usage)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
625 parser.add_option("-i", "--in",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
626 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
627 dest="vcfFile", help="input vcf file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
628 parser.add_option("-o", "--out",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
629 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
630 dest="output", help="output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
631 parser.add_option("-H", "--html_report",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
632 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
633 dest="html_file", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
634 parser.add_option("-D", "--html_dir",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
635 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
636 dest="html_dir", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
637 parser.add_option("-t", "--tsv_file",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
638 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
639 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
640 parser.add_option("-e", "--ensembl_host",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
641 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
642 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
643 parser.add_option("-f", "--effects_filter",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
644 action="store", type="string", default=None,
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
645 dest="effects_filter", help="a list of effects to filter")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
646 parser.add_option("-O", "--ensembl_dataset",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
647 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
648 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
649 parser.add_option("-p", "--polyA_limit",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
650 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
651 parser.add_option('-S', '--snpeff_aa_change', dest='snpeff_aa_change', action='store_true', default=False, help='Report the SnpEff Amino_Acid_change field' )
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
652 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
653 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
654 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
655
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
656 (options, args) = parser.parse_args()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
657
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
658 debug = options.debug
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
659 ensembl_host = options.ensembl_host
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
660
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
661 # Check that a single vcf file is given.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
662 if options.vcfFile == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
663 parser.print_help()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
664 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
665 exit(1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
666 outputFile = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
667 outputHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
668 detailDir = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
669 outputTsv = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
670 tmpHtmlName = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
671 tmpHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
672 effects_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
673 # Set the output file to stdout if no output file was specified.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
674 if options.output == None and options.html_file == None and options.tsv_file == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
675 outputFile = sys.stdout
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
676 if options.output != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
677 output = os.path.abspath(options.output)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
678 outputFile = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
679 if options.tsv_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
680 output = os.path.abspath(options.tsv_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
681 outputTsv = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
682 if options.html_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
683 output = os.path.abspath(options.html_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
684 outputHtml = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
685 if options.html_dir == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
686 detailDir = os.path.dirname(os.path.abspath(output))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
687 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
688 detailDir = options.html_dir
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
689 if detailDir:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
690 if not os.path.isdir(detailDir):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
691 os.makedirs(detailDir)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
692 if options.effects_filter:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
693 eff_codes = options.effects_filter.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
694 for val in eff_codes:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
695 code = val.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
696 if code:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
697 effects_list.append(code)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
698
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
699 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
700 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
701 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
702 snpEffects = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
703 homopolymers = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
704 polya_count = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
705 polya_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
706 fh = open( options.vcfFile )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
707 while True:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
708 line = fh.readline()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
709 if not line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
710 break #EOF
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
711 if line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
712 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
713 print >> outputFile, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
714 line = line.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
715 if len(line) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
716 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
717 elif line.startswith('#'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
718 # check for SnpEffVersion
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
719 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
720 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
721 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
722 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
723 if line.startswith('##SnpEffVersion='):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
724 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
725 if m:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
726 SnpEffVersion = m.group(2)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
727 # check for SnpEff Info Description
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
728 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
729 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
730 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
731 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
732 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
733 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
734 pass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
735 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
736 if debug: print >> sys.stdout, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
737 freq = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
738 depth = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
739 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
740 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
741 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
742 if options.debug: print(fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
743 for idx,alt in enumerate(alts.split(',')):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
744 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
745 if not re.match('^[ATCG]*$',alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
746 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
747 for info_item in info.split(';'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
748 try:
4
429a6b4df5e5 Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
749 if info_item.find('=') < 0:
429a6b4df5e5 Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
750 continue
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
751 (key,val) = info_item.split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
752 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
753 freqs = info_item.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
754 (k,v) = freqs[idx].split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
755 freq = v
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
756 elif key == 'DP':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
757 depth = val
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
758 elif key == 'EFF':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
759 transcript_ids = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
760 effects = val.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
761 ccds_dict = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
762 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
763 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
764 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
765 if SnpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
766 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
767 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
768 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
769 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
770 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
771 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
772 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
773 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
774 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
775 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
776 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
777 if transcript:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
778 transcript_ids.append(transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
779 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
780 if options.with_ccds:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
781 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
782 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
783 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
784 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
785 snpEffect.transcript_ids = transcript_ids
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
786 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
787 if options.snpeff_aa_change:
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
788 snpEffect.use_eff_aa_change = True
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
789 if effects_list and not snpEffect.effect in effects_list:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
790 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
791 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
792 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
793 if snpEffect.cds_pos:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
794 if snpEffect.inHomopolymer(options.polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
795 homopolymers.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
796 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
797 snpEffects.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
798 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
799 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
800 printSeqText(outputFile,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
801 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
802 ## Just report the first sequence returned for a vcf line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
803 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
804 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
805 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
806 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
807 # Sort snpEffects
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
808 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
809 if outputHtml:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
810 printReportHtml(outputHtml, detailDir, snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
811 if outputTsv:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
812 printReportTsv(outputTsv,snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
813 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
814 print(str(e))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
815
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
816 if __name__ == "__main__": __main__()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
817