annotate snpEff_cds_report.py @ 7:fbb6510186df default tip

when codon has ambiguous nucleotide translate to X
author Jim Johnson <jj@umn.edu>
date Mon, 15 Sep 2014 06:13:47 -0500
parents a64ef0611117
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/python
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
2 ## version 1.2
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
3 import sys,os,tempfile,re
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
4 import httplib, urllib
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
5 import optparse
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
6 #import vcfClass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
7 #from vcfClass import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
8 #import tools
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
9 #from tools import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
10
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
11 debug = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
12 cds_anchor = 'cds_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
13 aa_anchor = 'aa_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
14
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
15 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
16 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
17 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
18 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
19 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
20 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
21 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
22 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
23 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
24 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
25 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
26 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
27 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
28 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
29 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
30 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
31
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
32 def reverseComplement(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
33 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
34 return rev
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
35
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
36 def translate(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
37 rna = seq.upper().replace('T','U')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
38 aa = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
39 for i in range(0,len(rna) - 2, 3):
7
fbb6510186df when codon has ambiguous nucleotide translate to X
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
40 codon = rna[i:i+3]
fbb6510186df when codon has ambiguous nucleotide translate to X
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
41 aa.append(codon_map[codon] if codon in codon_map else 'X')
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
42 return ''.join(aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
43
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
44 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
45 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
46 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
47 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
48 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
49 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
50 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
51 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
52 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
53 <Filter name = "with_ccds" excluded = "0"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
54 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
55 <Attribute name = "ccds" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
56 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
57 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
58 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
59 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
60 query_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
61 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
62 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
63 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
64 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
65 <Filter name = "biotype" value = "protein_coding"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
66 __CCDS_FILTER_HERE__
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
67 <Attribute name = "cdna" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
68 <Attribute name = "ensembl_gene_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
69 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
70 <Attribute name = "strand" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
71 <Attribute name = "transcript_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
72 <Attribute name = "transcript_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
73 <Attribute name = "exon_chrom_start"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
74 <Attribute name = "exon_chrom_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
75 <Attribute name = "cdna_coding_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
76 <Attribute name = "cdna_coding_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
77 <Attribute name = "cds_length"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
78 <Attribute name = "rank"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
79 <Attribute name = "5_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
80 <Attribute name = "5_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
81 <Attribute name = "3_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
82 <Attribute name = "3_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
83 <Attribute name = "ensembl_peptide_id"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
84 <Attribute name = "start_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
85 <Attribute name = "end_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
86 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
87 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
88 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
89 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
90 Columns(19):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
91 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
92
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
93 0 cDNA sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
94 1 Ensembl Gene ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
95 2 Ensembl Transcript ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
96 3 Strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
97 - 4 Transcript Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
98 - 5 Transcript End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
99 6 Exon Chr Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
100 7 Exon Chr End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
101 8 CDS Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
102 9 CDS End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
103 10 CDS Length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
104 - 11 Exon Rank in Transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
105 12 5' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
106 13 5' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
107 14 3' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
108 15 3' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
109 - 16 Ensembl Protein ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
110 - 17 Gene Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
111 - 18 Gene End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
112 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
113
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
114 # return {transcript_id : ccds_id}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
115 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
116 ccds_dict = dict()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
117 if transcript_ids:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
118 query = query_ccds_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
119 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
120 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
121 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
122 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
123 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
124 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
125 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
126 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
127 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
128 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
129 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
130 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
131 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
132 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
133 for line in lines:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
134 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
135 if len(fields) == 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
136 (transcript_id,ccds) = fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
137 ccds_dict[transcript_id] = ccds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
138 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
139 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
140 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
141 return ccds_dict
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
142
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
143 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
144 query = query_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
145 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
146 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
147 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
148 return query
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
149
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
150 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
151 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
152 transcript_id = snpEffect.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
153 chr_pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
154 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
155 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
156 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
157 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
158 pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
159 cds_pos = None # zero based offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
160 coding_sequence = ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
161 cds_ref = snpEffect.ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
162 cds_alt = snpEffect.alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
163 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
164 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
165 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
166 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
167 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
168 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
169 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
170 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
171 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
172 # Use the header line to determine the order of fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
173 line = lines[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
174 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
175 colnames = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
176 # for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
177 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
178 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
179 for line in lines[1:]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
180 if line == None or len(line) < 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
181 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
182 field = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
183 if len(field) != len(colnames):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
184 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
185 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
186 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
187 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
188 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
189 for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
190 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
191 snpEffect.gene_id = field[1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
192 strand = '+' if int(field[3]) > 0 else '-'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
193 snpEffect.strand = strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
194 # coding_sequence is first
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
195 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
196 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
197 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
198 cdna_seq = field[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
199 in_utr = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
200 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
201 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
202 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
203 utr_5_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
204 for i in range(len(utr_5_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
205 utr_5_start = int(utr_5_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
206 utr_5_end = int(utr_5_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
207 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
208 if utr_5_start <= pos <= utr_5_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
209 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
210 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
211 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
212 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
213 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
214 for i in range(len(utr_3_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
215 utr_3_start = int(utr_3_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
216 utr_3_end = int(utr_3_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
217 if utr_3_start <= pos <= utr_3_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
218 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
219 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
220 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
221 # Get coding sequence from cdna
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
222 cdna_length = int(field[10])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
223 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
224 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
225 cdna_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
226 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
227 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
228 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
229 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
230 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
231 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
232 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
233 snpEffect.cds = coding_sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
234 if coding_sequence and len(coding_sequence) >= 3:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
235 snpEffect.cds_stop_codon = coding_sequence[-3:]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
236 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
237 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
238 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
239 if not in_utr:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
240 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
241 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
242 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
243 exon_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
244 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
245 exon_lens.append(exon_end[i] - exon_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
246 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
247 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
248 bp_tot = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
249 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
250 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
251 bp_tot += exon_len
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
252 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
253 cds_idx = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
254 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
255 # Does this exon have cds bases - i.e. not entirely in UTR
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
256 if len(utr_5_ends) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
257 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
258 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
259 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
260 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
261 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
262 if exon_start[i] <= pos <= exon_end[i]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
263 if strand == '+':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
264 if cds_idx: # find offset from start of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
265 offset = pos - exon_start[i]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
266 else: # find offset from end of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
267 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
268 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
269 else: # negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
270 cds_ref = reverseComplement(snpEffect.ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
271 cds_alt = reverseComplement(snpEffect.alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
272 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
273 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
274 snpEffect.cds_pos = cds_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
275 snpEffect.cds_ref = cds_ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
276 snpEffect.cds_alt = cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
277 return snpEffect
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
278 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
279 cds_idx += 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
280 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
281 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
282 finally:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
283 if conn != None :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
284 conn.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
285 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
286
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
287 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
288 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
289 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
290 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
291 class SnpEffect( object ):
6
a64ef0611117 Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents: 5
diff changeset
292 report_headings = ['Gene','Variant Position','Reference','Variation','Prevalence','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation']
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
293 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
294 self.chrom = chrom
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
295 self.pos = int(pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
296 self.ref = ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
297 self.alt = alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
298 self.freq = float(freq) if freq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
299 self.depth = int(depth) if depth else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
300 # From SnpEffect VCF String
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
301 self.effect = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
302 self.effect_impact = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
303 self.functional_class = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
304 self.codon_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
305 self.amino_acid_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
306 self.amino_acid_length = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
307 self.gene_name = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
308 self.gene_id = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
309 self.gene_biotype = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
310 self.coding = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
311 self.transcript = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
312 self.transcript_ids = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
313 self.exon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
314 self.cds_stop_codon = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
315 self.cds_stop_pos = None
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
316 # retrieve from ensembl
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
317 self.strand = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
318 self.cds_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
319 self.cds_ref = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
320 self.cds_alt = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
321 self.cds = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
322 self.aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
323 self.aa_len = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
324 self.alt_stop_pos = None
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
325 self.alt_stop_codon = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
326 self.alt_stop_region = None ## includes base before and after alt_stop_codon
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
327 self.use_eff_aa_change = False
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
328 def chrPos(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
329 return "%s:%s" % (self.chrom,self.pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
330 def setEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
331 if snpEffVersion and snpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
332 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
333 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
334 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
335 self.amino_acid_length = amino_acid_length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
336 self.effect_impact = effect_impact
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
337 self.functional_class = functional_class
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
338 self.codon_change = codon_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
339 self.amino_acid_change = amino_acid_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
340 self.gene_name = gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
341 self.gene_biotype = gene_biotype
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
342 self.coding = coding
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
343 self.transcript = transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
344 self.exon = exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
345 def parseEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
346 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
347 self.effect = eff
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
348 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
349 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
350 self.setEffect(eff_fields, snpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
351 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
352 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
353 return len(self.cds) > 0 if self.cds else False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
354 def score(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
355 return self.freq * self.depth if self.freq and self.depth else -1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
356 def getCodingSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
357 return self.cds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
358 def getVariantSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
359 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
360 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
361 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
362 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
363 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
364 def getCodingAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
365 seq = translate(self.cds) if self.cds else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
366 if seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
367 self.aa_len = len(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
368 return seq
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
369 def getVariantAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
370 variant_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
371 return translate(variant_seq) if variant_seq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
372 def getVariantPeptide(self,toStopCodon=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
373 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
374 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
375 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
376 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
377 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
378 end_pos = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
379 novel_peptide = var_aa[var_aa_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
380 return novel_peptide
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
381 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
382 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
383 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True):
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
384 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
385 if var_aa:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
386 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
387 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
388 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
389 else:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
390 end_offset = end_count + 1
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
391 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
392 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
393 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
394 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
395 mutation = "%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
396 preAA = var_aa[start_pos:var_aa_pos] # N-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
397 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
398 novel_peptide = var_aa[start_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
399 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
400 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
401 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
402 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
403 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
404 # positions are 0-based array indexes
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
405 def getSequences(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
406 coding_dna = self.getCodingSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
407 coding_aa = self.getCodingAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
408 var_dna = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
409 var_aa = self.getVariantAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
410 var_aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
411 var_aa_end = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
412 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
413 var_aa_pos = self.cds_pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
414 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
415 if var_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
416 var_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
417 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
418 self.aa_pos = var_aa_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
419 var_stop = var_aa.find('*',var_aa_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
420 if var_stop < 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
421 var_aa_end = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
422 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
423 var_aa_end = var_stop + 1 # include the stop codon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
424 stop_pos = var_stop * 3
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
425 self.alt_stop_pos = stop_pos
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
426 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
427 self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
428 self.alt_stop_codon + \
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
429 ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3 else '')
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
430 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
431 def inHomopolymer(self,polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
432 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
433 ## check if base changes at cds_pos involve A or T
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
434 bdiff = None # the difference between the cds_ref and cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
435 boff = 0 # the offset to the difference
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
436 if len(self.cds_ref) < len(self.cds_alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
437 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
438 boff = self.cds_alt.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
439 elif len(self.cds_alt) < len(self.cds_ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
440 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
441 boff = self.cds_ref.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
442 if bdiff:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
443 ## check the number of similar base neighboring the change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
444 alt_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
445 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
446 ## adjust match length if the cps_pos is within polyA_limit form sequence end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
447 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
448 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
449 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
450 if bdiff.find('A') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
451 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
452 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
453 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
454 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
455 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
456 if bdiff.find('T') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
457 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
458 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
459 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
460 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
461 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
462 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
463 return False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
464 def getReportHeader():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
465 return report_headings
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
466 def getReportFields(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
467 gene_name = self.gene_name if self.gene_name else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
468 cds_ref = self.cds_ref if self.cds_ref else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
469 cds_alt = self.cds_alt if self.cds_alt else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
470 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
471 if debug: print >> sys.stdout, "HGVS %s" % hgvs
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
472 var_aa = self.getVariantPeptide(toStopCodon=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
473 freq = "%2.2f" % self.freq if self.freq else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
474 depth = "%d" % self.depth if self.depth else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
475 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
476 aa_len = "%d" % self.aa_len if self.aa_len else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
477 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
478 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
479 stop_region = self.alt_stop_region if self.alt_stop_region else ''
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
480 chrpos = self.chrPos()
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
481 aa_change = self.amino_acid_change if (self.use_eff_aa_change and self.amino_acid_change) else hgvs[5]
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
482 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else '']
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
483
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
484 def __main__():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
485
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
486 def printReportTsv(output_file, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
487 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
488 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
489 for snpEffect in snpEffects:
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
490 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
491 if not snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
492 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
493 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
494 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide])
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
495
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
496 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
497 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
498 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
499 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
500 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
501 def printReportHtml(output_file, detail_dir, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
502 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
503 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
504 print >> output_file, '<TABLE BORDER="1">\n'
6
a64ef0611117 Change field header from "Penetrance" to "Prevalence"
Jim Johnson <jj@umn.edu>
parents: 5
diff changeset
505 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Prevalence</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
506 for snpEffect in snpEffects:
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
507 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
508 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
509 aa_ref = '#'.join([refname,aa_anchor])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
510 refpath = os.path.join(detail_dir,refname)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
511 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
512 ref_file = open(refpath,'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
513 printEffDetailHtml(ref_file,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
514 ref_file.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
515 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
516 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
517 if snpEffect.effect == 'FRAME_SHIFT':
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
518 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
519 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
520 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
521 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
522 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
523 print >> output_file, '</BODY></HTML>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
524
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
525 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
526 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
527 seq_len = len(coding_seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
528 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
529 print >> output_file, '<TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
530 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
531 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
532 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
533 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
534 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
535 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
536 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
537 alt_aa_pos = pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
538 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
539 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
540 if alt_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
541 alt_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
542 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
543 alt_aa_end = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
544 for j in range(alt_aa_pos,len(alt_aa)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
545 if alt_aa[j] == '*':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
546 alt_aa_end = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
547 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
548 seq = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
549 for j in range(1,wrap_len+1):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
550 seq.append('-' if j % 10 else '|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
551 hrule = '</TD><TD>'.join(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
552 print >> output_file, '<TABLE cellspacing="0">'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
553 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
554 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
555 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
556 ea = (i+wrap_len)/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
557 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
558 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
559 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
560 if j < len(coding_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
561 if pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
562 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
563 elif pos <= j < pos + len(ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
564 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
565 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
566 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
567 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
568 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
569 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
570 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
571 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
572 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
573 if j < len(coding_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
574 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
575 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
576 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
577 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
578 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
579 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
580 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
581 if j < len(alt_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
582 if alt_aa_pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
583 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
584 elif alt_aa_pos < j <= alt_aa_end:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
585 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
586 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
587 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
588 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
589 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
590 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
591 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
592 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
593 if j < len(alt_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
594 if pos <= j < pos + len(alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
595 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
596 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
597 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
598 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
599 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
600 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
601 print >> output_file, "</TABLE>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
602 print >> output_file, "</BODY></HTML>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
603
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
604 def printSeqText(output_file,snpEffect, wrap_len=120):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
605 nw = 6
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
606 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
607 if coding_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
608 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
609 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
610 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
611 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
612 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
613 ea = e/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
614 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
615 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
616 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
617 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
618 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
619 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
620 if alt_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
621 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
622
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
623 # Parse the command line options
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
624 usage = "Usage: snpEff_cds_report.py filter [options]"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
625 parser = optparse.OptionParser(usage = usage)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
626 parser.add_option("-i", "--in",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
627 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
628 dest="vcfFile", help="input vcf file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
629 parser.add_option("-o", "--out",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
630 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
631 dest="output", help="output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
632 parser.add_option("-H", "--html_report",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
633 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
634 dest="html_file", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
635 parser.add_option("-D", "--html_dir",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
636 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
637 dest="html_dir", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
638 parser.add_option("-t", "--tsv_file",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
639 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
640 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
641 parser.add_option("-e", "--ensembl_host",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
642 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
643 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
644 parser.add_option("-f", "--effects_filter",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
645 action="store", type="string", default=None,
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
646 dest="effects_filter", help="a list of effects to filter")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
647 parser.add_option("-O", "--ensembl_dataset",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
648 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
649 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
650 parser.add_option("-p", "--polyA_limit",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
651 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
652 parser.add_option('-S', '--snpeff_aa_change', dest='snpeff_aa_change', action='store_true', default=False, help='Report the SnpEff Amino_Acid_change field' )
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
653 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
654 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
655 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
656
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
657 (options, args) = parser.parse_args()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
658
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
659 debug = options.debug
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
660 ensembl_host = options.ensembl_host
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
661
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
662 # Check that a single vcf file is given.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
663 if options.vcfFile == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
664 parser.print_help()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
665 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
666 exit(1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
667 outputFile = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
668 outputHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
669 detailDir = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
670 outputTsv = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
671 tmpHtmlName = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
672 tmpHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
673 effects_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
674 # Set the output file to stdout if no output file was specified.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
675 if options.output == None and options.html_file == None and options.tsv_file == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
676 outputFile = sys.stdout
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
677 if options.output != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
678 output = os.path.abspath(options.output)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
679 outputFile = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
680 if options.tsv_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
681 output = os.path.abspath(options.tsv_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
682 outputTsv = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
683 if options.html_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
684 output = os.path.abspath(options.html_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
685 outputHtml = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
686 if options.html_dir == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
687 detailDir = os.path.dirname(os.path.abspath(output))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
688 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
689 detailDir = options.html_dir
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
690 if detailDir:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
691 if not os.path.isdir(detailDir):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
692 os.makedirs(detailDir)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
693 if options.effects_filter:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
694 eff_codes = options.effects_filter.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
695 for val in eff_codes:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
696 code = val.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
697 if code:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
698 effects_list.append(code)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
699
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
700 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
701 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
702 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
703 snpEffects = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
704 homopolymers = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
705 polya_count = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
706 polya_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
707 fh = open( options.vcfFile )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
708 while True:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
709 line = fh.readline()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
710 if not line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
711 break #EOF
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
712 if line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
713 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
714 print >> outputFile, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
715 line = line.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
716 if len(line) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
717 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
718 elif line.startswith('#'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
719 # check for SnpEffVersion
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
720 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
721 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
722 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
723 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
724 if line.startswith('##SnpEffVersion='):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
725 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
726 if m:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
727 SnpEffVersion = m.group(2)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
728 # check for SnpEff Info Description
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
729 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
730 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
731 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
732 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
733 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
734 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
735 pass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
736 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
737 if debug: print >> sys.stdout, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
738 freq = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
739 depth = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
740 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
741 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
742 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
743 if options.debug: print(fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
744 for idx,alt in enumerate(alts.split(',')):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
745 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
746 if not re.match('^[ATCG]*$',alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
747 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
748 for info_item in info.split(';'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
749 try:
4
429a6b4df5e5 Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
750 if info_item.find('=') < 0:
429a6b4df5e5 Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
751 continue
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
752 (key,val) = info_item.split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
753 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
754 freqs = info_item.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
755 (k,v) = freqs[idx].split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
756 freq = v
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
757 elif key == 'DP':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
758 depth = val
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
759 elif key == 'EFF':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
760 transcript_ids = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
761 effects = val.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
762 ccds_dict = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
763 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
764 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
765 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
766 if SnpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
767 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
768 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
769 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
770 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
771 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
772 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
773 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
774 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
775 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
776 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
777 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
778 if transcript:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
779 transcript_ids.append(transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
780 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
781 if options.with_ccds:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
782 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
783 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
784 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
785 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
786 snpEffect.transcript_ids = transcript_ids
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
787 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
5
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
788 if options.snpeff_aa_change:
85b933b7d231 Make the reporting of the SnpEff Amino_Acid_change field an option
Jim Johnson <jj@umn.edu>
parents: 4
diff changeset
789 snpEffect.use_eff_aa_change = True
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
790 if effects_list and not snpEffect.effect in effects_list:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
791 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
792 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
793 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
794 if snpEffect.cds_pos:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
795 if snpEffect.inHomopolymer(options.polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
796 homopolymers.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
797 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
798 snpEffects.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
799 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
800 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
801 printSeqText(outputFile,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
802 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
803 ## Just report the first sequence returned for a vcf line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
804 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
805 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
806 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
807 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
808 # Sort snpEffects
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
809 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
810 if outputHtml:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
811 printReportHtml(outputHtml, detailDir, snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
812 if outputTsv:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
813 printReportTsv(outputTsv,snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
814 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
815 print(str(e))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
816
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
817 if __name__ == "__main__": __main__()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
818