annotate snpEff_cds_report.py @ 1:29b286896c50

Change the default number of Amino Acids to display after a missense to 10.
author Jim Johnson <jj@umn.edu>
date Fri, 19 Apr 2013 10:49:05 -0500
parents cdbdac66d6b5
children 15a54fa11ad7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/python
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
2 import sys,os,tempfile,re
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
3 import httplib, urllib
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
4 import optparse
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
5 #import vcfClass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
6 #from vcfClass import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
7 #import tools
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
8 #from tools import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
9
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
10 debug = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
11 cds_anchor = 'cds_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
12 aa_anchor = 'aa_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
13
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
30
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
31 def reverseComplement(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
33 return rev
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
34
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
35 def translate(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
36 rna = seq.upper().replace('T','U')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
37 aa = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
38 for i in range(0,len(rna) - 2, 3):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
39 aa.append(codon_map[rna[i:i+3]])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
40 return ''.join(aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
41
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
42 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
45 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
47 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
51 <Filter name = "with_ccds" excluded = "0"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
52 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
53 <Attribute name = "ccds" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
54 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
55 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
56 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
58 query_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
59 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
63 <Filter name = "biotype" value = "protein_coding"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
64 __CCDS_FILTER_HERE__
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
65 <Attribute name = "cdna" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
66 <Attribute name = "ensembl_gene_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
67 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
68 <Attribute name = "strand" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
69 <Attribute name = "transcript_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
70 <Attribute name = "transcript_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
71 <Attribute name = "exon_chrom_start"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
72 <Attribute name = "exon_chrom_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
73 <Attribute name = "cdna_coding_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
74 <Attribute name = "cdna_coding_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
75 <Attribute name = "cds_length"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
76 <Attribute name = "rank"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
77 <Attribute name = "5_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
78 <Attribute name = "5_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
79 <Attribute name = "3_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
80 <Attribute name = "3_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
81 <Attribute name = "ensembl_peptide_id"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
82 <Attribute name = "start_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
83 <Attribute name = "end_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
84 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
85 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
86 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
87 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
88 Columns(19):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
90
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
91 0 cDNA sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
92 1 Ensembl Gene ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
93 2 Ensembl Transcript ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
94 3 Strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
95 - 4 Transcript Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
96 - 5 Transcript End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
97 6 Exon Chr Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
98 7 Exon Chr End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
99 8 CDS Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
100 9 CDS End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
101 10 CDS Length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
102 - 11 Exon Rank in Transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
103 12 5' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
104 13 5' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
105 14 3' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
106 15 3' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
107 - 16 Ensembl Protein ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
108 - 17 Gene Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
109 - 18 Gene End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
110 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
112 # return {transcript_id : ccds_id}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
114 ccds_dict = dict()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
115 if transcript_ids:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
116 query = query_ccds_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
119 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
120 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
122 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
125 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
126 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
127 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
128 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
129 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
130 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
131 for line in lines:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
132 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
133 if len(fields) == 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
134 (transcript_id,ccds) = fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
135 ccds_dict[transcript_id] = ccds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
137 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
139 return ccds_dict
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
140
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
142 query = query_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
146 return query
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
147
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
150 transcript_id = snpEffect.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
151 chr_pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
154 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
155 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
156 pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
157 cds_pos = None # zero based offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
158 coding_sequence = ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
159 cds_ref = snpEffect.ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
160 cds_alt = snpEffect.alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
161 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
164 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
165 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
166 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
167 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
168 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
169 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
170 # Use the header line to determine the order of fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
171 line = lines[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
173 colnames = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
174 # for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
175 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
177 for line in lines[1:]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
178 if line == None or len(line) < 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
179 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
180 field = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
181 if len(field) != len(colnames):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
182 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
184 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
185 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
186 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
187 for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
189 snpEffect.gene_id = field[1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
190 strand = '+' if int(field[3]) > 0 else '-'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
191 snpEffect.strand = strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
192 # coding_sequence is first
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
194 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
195 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
196 cdna_seq = field[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
197 in_utr = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
201 utr_5_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
202 for i in range(len(utr_5_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
203 utr_5_start = int(utr_5_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
204 utr_5_end = int(utr_5_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
206 if utr_5_start <= pos <= utr_5_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
207 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
209 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
212 for i in range(len(utr_3_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
213 utr_3_start = int(utr_3_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
214 utr_3_end = int(utr_3_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
215 if utr_3_start <= pos <= utr_3_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
216 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
218 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
219 # Get coding sequence from cdna
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
220 cdna_length = int(field[10])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
223 cdna_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
224 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
228 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
231 snpEffect.cds = coding_sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
232 if coding_sequence and len(coding_sequence) >= 3:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
233 snpEffect.cds_stop_codon = coding_sequence[-3:]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
237 if not in_utr:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
241 exon_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
242 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
243 exon_lens.append(exon_end[i] - exon_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
245 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
246 bp_tot = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
247 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
248 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
249 bp_tot += exon_len
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
251 cds_idx = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
252 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
253 # Does this exon have cds bases - i.e. not entirely in UTR
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
254 if len(utr_5_ends) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
256 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
258 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
259 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
260 if exon_start[i] <= pos <= exon_end[i]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
261 if strand == '+':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
262 if cds_idx: # find offset from start of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
263 offset = pos - exon_start[i]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
264 else: # find offset from end of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
267 else: # negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
268 cds_ref = reverseComplement(snpEffect.ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
269 cds_alt = reverseComplement(snpEffect.alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
272 snpEffect.cds_pos = cds_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
273 snpEffect.cds_ref = cds_ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
274 snpEffect.cds_alt = cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
275 return snpEffect
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
276 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
277 cds_idx += 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
278 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
280 finally:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
281 if conn != None :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
282 conn.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
283 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
284
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
285 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
288 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
289 class SnpEffect( object ):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
292 self.chrom = chrom
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
293 self.pos = int(pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
294 self.ref = ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
295 self.alt = alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
296 self.freq = float(freq) if freq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
297 self.depth = int(depth) if depth else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
298 # From SnpEffect VCF String
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
299 self.effect = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
300 self.effect_impact = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
301 self.functional_class = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
302 self.codon_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
303 self.amino_acid_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
304 self.amino_acid_length = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
305 self.gene_name = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
306 self.gene_id = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
307 self.gene_biotype = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
308 self.coding = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
309 self.transcript = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
310 self.transcript_ids = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
311 self.exon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
312 self.cds_stop_codon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
313 # retrieve from ensembl
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
314 self.strand = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
315 self.cds_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
316 self.cds_ref = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
317 self.cds_alt = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
318 self.cds = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
319 self.aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
320 self.aa_len = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
321 self.alt_stop_codon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
322 def chrPos(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
323 return "%s:%s" % (self.chrom,self.pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
324 def setEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
325 if snpEffVersion and snpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
326 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
327 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
328 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
329 self.amino_acid_length = amino_acid_length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
330 self.effect_impact = effect_impact
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
331 self.functional_class = functional_class
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
332 self.codon_change = codon_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
333 self.amino_acid_change = amino_acid_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
334 self.gene_name = gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
335 self.gene_biotype = gene_biotype
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
336 self.coding = coding
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
337 self.transcript = transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
338 self.exon = exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
339 def parseEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
340 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
341 self.effect = eff
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
342 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
343 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
344 self.setEffect(eff_fields, snpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
345 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
346 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
347 return len(self.cds) > 0 if self.cds else False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
348 def score(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
349 return self.freq * self.depth if self.freq and self.depth else -1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
350 def getCodingSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
351 return self.cds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
352 def getVariantSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
353 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
354 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
355 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
356 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
357 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
358 def getCodingAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
359 seq = translate(self.cds) if self.cds else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
360 if seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
361 self.aa_len = len(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
362 return seq
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
363 def getVariantAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
364 variant_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
365 return translate(variant_seq) if variant_seq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
366 def getVariantPeptide(self,toStopCodon=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
367 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
368 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
369 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
370 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
371 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
372 end_pos = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
373 novel_peptide = var_aa[var_aa_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
374 return novel_peptide
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
375 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
376 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
377 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True):
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
378 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
379 if var_aa:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
380 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
381 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
382 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
383 else:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
384 end_offset = end_count + 1
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
385 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
386 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
387 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
388 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
389 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
390 preAA = var_aa[start_pos:var_aa_pos] # N-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
391 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
392 novel_peptide = var_aa[start_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
393 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
394 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
395 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
396 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
397 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
398 # positions are 0-based array indexes
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
399 def getSequences(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
400 coding_dna = self.getCodingSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
401 coding_aa = self.getCodingAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
402 var_dna = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
403 var_aa = self.getVariantAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
404 var_aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
405 var_aa_end = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
406 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
407 var_aa_pos = self.cds_pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
408 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
409 if var_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
410 var_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
411 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
412 self.aa_pos = var_aa_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
413 var_stop = var_aa.find('*',var_aa_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
414 if var_stop < 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
415 var_aa_end = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
416 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
417 var_aa_end = var_stop + 1 # include the stop codon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
418 stop_pos = var_stop * 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
419 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
420 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
421 def inHomopolymer(self,polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
422 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
423 ## check if base changes at cds_pos involve A or T
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
424 bdiff = None # the difference between the cds_ref and cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
425 boff = 0 # the offset to the difference
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
426 if len(self.cds_ref) < len(self.cds_alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
427 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
428 boff = self.cds_alt.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
429 elif len(self.cds_alt) < len(self.cds_ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
430 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
431 boff = self.cds_ref.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
432 if bdiff:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
433 ## check the number of similar base neighboring the change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
434 alt_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
435 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
436 ## adjust match length if the cps_pos is within polyA_limit form sequence end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
437 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
438 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
439 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
440 if bdiff.find('A') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
441 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
442 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
443 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
444 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
445 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
446 if bdiff.find('T') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
447 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
448 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
449 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
450 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
451 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
452 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
453 return False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
454 def getReportHeader():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
455 return report_headings
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
456 def getReportFields(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
457 gene_name = self.gene_name if self.gene_name else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
458 cds_ref = self.cds_ref if self.cds_ref else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
459 cds_alt = self.cds_alt if self.cds_alt else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
460 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
461 if debug: print >> sys.stdout, "HGVS %s" % hgvs
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
462 var_aa = self.getVariantPeptide(toStopCodon=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
463 freq = "%2.2f" % self.freq if self.freq else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
464 depth = "%d" % self.depth if self.depth else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
465 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
466 aa_len = "%d" % self.aa_len if self.aa_len else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
467 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
468 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
469 chrpos = self.chrPos()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
470 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
471 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else '']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
472
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
473 def __main__():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
474
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
475 def printReportTsv(output_file, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
476 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
477 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
478 for snpEffect in snpEffects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
479 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
480 if not snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
481 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
482 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
483 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
484
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
485 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
486 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
487 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
488 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
489 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
490 def printReportHtml(output_file, detail_dir, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
491 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
492 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
493 print >> output_file, '<TABLE BORDER="1">\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
494 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
495 for snpEffect in snpEffects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
496 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
497 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
498 aa_ref = '#'.join([refname,aa_anchor])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
499 refpath = os.path.join(detail_dir,refname)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
500 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
501 ref_file = open(refpath,'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
502 printEffDetailHtml(ref_file,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
503 ref_file.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
504 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
505 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
506 if snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
507 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
508 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
509 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
510 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
511 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
512 print >> output_file, '</BODY></HTML>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
513
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
514 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
515 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
516 seq_len = len(coding_seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
517 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
518 print >> output_file, '<TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
519 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
520 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
521 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
522 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
523 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
524 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
525 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
526 alt_aa_pos = pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
527 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
528 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
529 if alt_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
530 alt_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
531 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
532 alt_aa_end = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
533 for j in range(alt_aa_pos,len(alt_aa)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
534 if alt_aa[j] == '*':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
535 alt_aa_end = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
536 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
537 seq = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
538 for j in range(1,wrap_len+1):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
539 seq.append('-' if j % 10 else '|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
540 hrule = '</TD><TD>'.join(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
541 print >> output_file, '<TABLE cellspacing="0">'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
542 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
543 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
544 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
545 ea = (i+wrap_len)/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
546 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
547 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
548 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
549 if j < len(coding_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
550 if pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
551 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
552 elif pos <= j < pos + len(ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
553 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
554 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
555 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
556 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
557 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
558 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
559 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
560 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
561 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
562 if j < len(coding_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
563 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
564 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
565 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
566 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
567 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
568 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
569 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
570 if j < len(alt_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
571 if alt_aa_pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
572 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
573 elif alt_aa_pos < j <= alt_aa_end:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
574 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
575 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
576 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
577 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
578 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
579 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
580 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
581 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
582 if j < len(alt_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
583 if pos <= j < pos + len(alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
584 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
585 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
586 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
587 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
588 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
589 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
590 print >> output_file, "</TABLE>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
591 print >> output_file, "</BODY></HTML>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
592
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
593 def printSeqText(output_file,snpEffect, wrap_len=120):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
594 nw = 6
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
595 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
596 if coding_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
597 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
598 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
599 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
600 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
601 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
602 ea = e/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
603 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
604 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
605 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
606 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
607 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
608 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
609 if alt_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
610 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
611
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
612 # Parse the command line options
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
613 usage = "Usage: snpEff_cds_report.py filter [options]"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
614 parser = optparse.OptionParser(usage = usage)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
615 parser.add_option("-i", "--in",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
616 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
617 dest="vcfFile", help="input vcf file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
618 parser.add_option("-o", "--out",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
619 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
620 dest="output", help="output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
621 parser.add_option("-H", "--html_report",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
622 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
623 dest="html_file", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
624 parser.add_option("-D", "--html_dir",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
625 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
626 dest="html_dir", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
627 parser.add_option("-t", "--tsv_file",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
628 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
629 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
630 parser.add_option("-e", "--ensembl_host",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
631 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
632 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
633 parser.add_option("-f", "--effects_filter",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
634 action="store", type="string", default=None,
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
635 dest="effects_filter", help="a list of effects to filter")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
636 parser.add_option("-O", "--ensembl_dataset",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
637 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
638 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
639 parser.add_option("-p", "--polyA_limit",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
640 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
641 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
642 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
643 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
644
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
645 (options, args) = parser.parse_args()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
646
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
647 debug = options.debug
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
648 ensembl_host = options.ensembl_host
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
649
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
650 # Check that a single vcf file is given.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
651 if options.vcfFile == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
652 parser.print_help()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
653 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
654 exit(1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
655 outputFile = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
656 outputHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
657 detailDir = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
658 outputTsv = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
659 tmpHtmlName = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
660 tmpHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
661 effects_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
662 # Set the output file to stdout if no output file was specified.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
663 if options.output == None and options.html_file == None and options.tsv_file == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
664 outputFile = sys.stdout
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
665 if options.output != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
666 output = os.path.abspath(options.output)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
667 outputFile = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
668 if options.tsv_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
669 output = os.path.abspath(options.tsv_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
670 outputTsv = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
671 if options.html_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
672 output = os.path.abspath(options.html_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
673 outputHtml = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
674 if options.html_dir == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
675 detailDir = os.path.dirname(os.path.abspath(output))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
676 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
677 detailDir = options.html_dir
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
678 if detailDir:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
679 if not os.path.isdir(detailDir):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
680 os.makedirs(detailDir)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
681 if options.effects_filter:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
682 eff_codes = options.effects_filter.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
683 for val in eff_codes:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
684 code = val.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
685 if code:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
686 effects_list.append(code)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
687
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
688 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
689 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
690 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
691 snpEffects = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
692 homopolymers = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
693 polya_count = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
694 polya_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
695 fh = open( options.vcfFile )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
696 while True:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
697 line = fh.readline()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
698 if not line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
699 break #EOF
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
700 if line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
701 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
702 print >> outputFile, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
703 line = line.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
704 if len(line) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
705 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
706 elif line.startswith('#'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
707 # check for SnpEffVersion
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
708 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
709 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
710 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
711 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
712 if line.startswith('##SnpEffVersion='):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
713 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
714 if m:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
715 SnpEffVersion = m.group(2)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
716 # check for SnpEff Info Description
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
717 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
718 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
719 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
720 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
721 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
722 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
723 pass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
724 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
725 if debug: print >> sys.stdout, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
726 freq = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
727 depth = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
728 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
729 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
730 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
731 if options.debug: print(fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
732 for idx,alt in enumerate(alts.split(',')):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
733 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
734 if not re.match('^[ATCG]*$',alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
735 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
736 for info_item in info.split(';'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
737 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
738 (key,val) = info_item.split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
739 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
740 freqs = info_item.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
741 (k,v) = freqs[idx].split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
742 freq = v
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
743 elif key == 'DP':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
744 depth = val
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
745 elif key == 'EFF':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
746 transcript_ids = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
747 effects = val.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
748 ccds_dict = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
749 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
750 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
751 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
752 if SnpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
753 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
754 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
755 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
756 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
757 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
758 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
759 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
760 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
761 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
762 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
763 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
764 if transcript:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
765 transcript_ids.append(transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
766 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
767 if options.with_ccds:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
768 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
769 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
770 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
771 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
772 snpEffect.transcript_ids = transcript_ids
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
773 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
774 if effects_list and not snpEffect.effect in effects_list:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
775 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
776 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
777 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
778 if snpEffect.cds_pos:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
779 if snpEffect.inHomopolymer(options.polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
780 homopolymers.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
781 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
782 snpEffects.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
783 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
784 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
785 printSeqText(outputFile,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
786 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
787 ## Just report the first sequence returned for a vcf line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
788 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
789 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
790 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
791 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
792 # Sort snpEffects
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
793 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
794 if outputHtml:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
795 printReportHtml(outputHtml, detailDir, snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
796 if outputTsv:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
797 printReportTsv(outputTsv,snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
798 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
799 print(str(e))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
800
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
801 if __name__ == "__main__": __main__()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
802