annotate trips_create_new_organism/create_annotation_sqlite.py @ 2:4af7eb738348 draft

Uploaded
author triasteran
date Fri, 25 Feb 2022 12:06:51 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
1 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
2 # and produces an sqlite file which can be uploaded to Trips-Viz
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
3 # All co-ordinates produced are 1 based
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
6 import sys
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
7 import re
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
8 import sqlite3
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
9 import intervaltree
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
10 from intervaltree import Interval, IntervalTree
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
11 import itertools
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
12
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
13
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
14
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Command-line arguments:
#   sys.argv[1]  annotation file; this should be a GTF or GFF3 file
#   sys.argv[2]  transcriptomic fasta file
#   sys.argv[3]  pseudo UTR length (see pseudo_utr_len below)
#   sys.argv[4]  an example of a transcript_id from the annotation file, e.g ENST000000123456
#   sys.argv[5]  an example of a gene name from the annotation file

# This value is used to create UTRs of this length, useful when looking at
# transcriptomes without annotated UTRs.
pseudo_utr_len = int(sys.argv[3])
user_transcript_id = sys.argv[4]
user_gene_name = sys.argv[5]
# Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
TRAN_VERSION = True

# For transcripts and genes: the text found immediately before/after an
# identifier, and the (lower-cased) feature types of the lines it was seen on.
delimiters = {
    "transcripts": {"before": [], "after": [], "annot_types": ["cds", "utr"]},
    "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]},
}

punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"]

# A "before" delimiter shorter than this is never recorded.
_MIN_DELIMITER_LEN = 5


def _add_unique(values, value):
    """Append value to the list values unless it is already present."""
    if value not in values:
        values.append(value)


def _trim_before_delimiter(text, piece_index):
    """Shorten the text preceding an identifier to a compact label.

    For each punctuation mark in turn, the text is split on that mark and
    replaced by the piece at piece_index, provided that piece is at least
    _MIN_DELIMITER_LEN characters long.
    """
    for mark in punctuation:
        if mark in text:
            piece = text.split(mark)[piece_index]
            if len(piece) >= _MIN_DELIMITER_LEN:
                text = piece
    return text


# Find delimiters in the annotation and fasta files using the user_transcript_id
# and user_gene_name examples given by user.
with open(sys.argv[1], "r") as annotation_file, open(sys.argv[2], "r") as fasta_file:
    for line in annotation_file:
        has_transcript = user_transcript_id in line
        has_gene = user_gene_name in line
        if not (has_transcript or has_gene):
            continue
        tabsplitline = line.split("\t")
        if len(tabsplitline) < 3:
            # e.g. a header/comment line that happens to contain the example
            # text; there is no feature-type column to learn from.
            continue
        # Lower-case before the membership test so a type such as "CDS" is not
        # appended again on every matching line.
        annot_type = tabsplitline[2].lower()
        if has_transcript:
            _add_unique(delimiters["transcripts"]["annot_types"], annot_type)
            splitline = line.split(user_transcript_id)
            before_delimiter = _trim_before_delimiter(splitline[0], -1)
            after_delimiter = splitline[1][:2]
            if len(before_delimiter) >= _MIN_DELIMITER_LEN:
                _add_unique(delimiters["transcripts"]["before"], before_delimiter)
            # An empty delimiter would make str.split() raise later on.
            if after_delimiter:
                _add_unique(delimiters["transcripts"]["after"], after_delimiter)
        if has_gene:
            _add_unique(delimiters["genes"]["annot_types"], annot_type)
            splitline = line.split(user_gene_name)
            before_delimiter = _trim_before_delimiter(splitline[0], -1)
            after_delimiter = splitline[1][:1]
            if len(before_delimiter) >= _MIN_DELIMITER_LEN:
                _add_unique(delimiters["genes"]["before"], before_delimiter)
            # Only punctuation characters are accepted after a gene name.
            if after_delimiter in punctuation:
                _add_unique(delimiters["genes"]["after"], after_delimiter)
    for line in fasta_file:
        if user_transcript_id not in line:
            continue
        splitline = line.split(user_transcript_id)
        # NOTE(review): the fasta pass keeps the piece at index 1 whereas the
        # annotation pass keeps the last piece; preserved as found.
        before_delimiter = _trim_before_delimiter(splitline[0], 1)
        after_delimiter = splitline[1][:1]
        if len(before_delimiter) >= _MIN_DELIMITER_LEN:
            _add_unique(delimiters["transcripts"]["before"], before_delimiter)
        if after_delimiter:
            _add_unique(delimiters["transcripts"]["after"], after_delimiter)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
85
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
86
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
87
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
88
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
89
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Stop early when the example identifiers supplied by the user did not yield
# any "before" delimiter. Exit with a non-zero status so that the calling
# shell or workflow can tell that the run failed.
if not delimiters["transcripts"]["before"]:
    print("ERROR: No transcript_id with the name {} could be found in the annotation file".format(user_transcript_id))
    sys.exit(1)
if not delimiters["genes"]["before"]:
    print("ERROR: No gene with the name {} could be found in the annotation file".format(user_gene_name))
    sys.exit(1)

# Per-transcript records, keyed by the normalised transcript id.
master_dict = {}
coding_dict = {}
# Receives the ids of annotated transcripts that have no fasta record.
notinfasta = open("notinfasta.csv", "w")
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
100
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
101 #Given a nucleotide sequence returns the positions of all start and stop codons.
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
def get_start_stops(transcript_sequence):
    """Return the positions of all start and stop codons in a sequence.

    The sequence is upper-cased before searching. Positions are 1-based:
    a start codon (ATG) is reported at its first nucleotide and a stop codon
    (TAA, TAG or TGA) at its last nucleotide. Each codon set is scanned
    left to right for non-overlapping matches, in any reading frame.

    Returns a dict with the keys 'starts' and 'stops', each mapping to a
    list of positions in ascending order.
    """
    transcript_sequence = transcript_sequence.upper()
    # (codons, result key, offset added to the 0-based match start)
    codon_sets = (
        (['ATG'], 'starts', 1),
        (['TAA', 'TAG', 'TGA'], 'stops', 3),
    )
    seq_frames = {}
    for codons, positions, offset in codon_sets:
        # '|'.join also yields the right pattern for a single codon.
        pat = re.compile('|'.join(codons))
        seq_frames[positions] = [
            m.start() + offset for m in pat.finditer(transcript_sequence)
        ]
    return seq_frames
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
121
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
122
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Parse fasta to get the nucleotide sequence of transcripts and the positions
# of start/stop codons. The file is closed as soon as it has been read.
with open(sys.argv[2], "r") as fasta_file:
    read_fasta = fasta_file.read()
split_fasta = read_fasta.split(">")
for entry in split_fasta[1:]:
    # splitlines() also copes with "\r\n" line endings, which would otherwise
    # leave "\r" characters in the header and in the sequence.
    newline_split = entry.splitlines()
    if not newline_split:
        continue
    tran = newline_split[0]
    # Cut the header at each known "after" delimiter to isolate the id.
    for item in delimiters["transcripts"]["after"]:
        if item in tran:
            tran = tran.split(item)[0]
    tran = tran.replace("-", "_").replace("(", "").replace(")", "")
    seq = "".join(newline_split[1:])
    if "_PAR_Y" in tran:
        tran += "_chrY"
    elif "_PAR_X" in tran:
        tran += "_chrX"
    tran = tran.upper()
    # The first record seen for an id wins; later duplicates are ignored, so
    # the codon scan is only done when the record is actually stored.
    if tran not in master_dict:
        starts_stops = get_start_stops(seq)
        master_dict[tran] = {
            "utr": [],
            "cds": [],
            "exon": [],
            "start_codon": [],
            "stop_codon": [],
            "start_list": starts_stops["starts"],
            "stop_list": starts_stops["stops"],
            "transcript": [],
            "strand": "",
            "gene_name": "",
            "chrom": "",
            "seq": seq,
            "cds_start": "NULL",
            "cds_stop": "NULL",
            "length": len(seq),
            "principal": 0,
            "version": "NULL",
        }
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
145
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
146
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
147
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
148
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
def to_ranges(iterable):
    """Collapse integers into inclusive (first, last) runs of consecutive values.

    Duplicates are dropped and the values are sorted first, so the input may
    be given in any order. An empty input gives an empty list.

    Returns a list of (first, last) tuples in ascending order.
    """
    ordered = sorted(set(iterable))
    tup_list = []
    # Within a run of consecutive integers, value minus index is constant,
    # so grouping on that difference yields one group per run.
    for _, run in itertools.groupby(enumerate(ordered), lambda pair: pair[1] - pair[0]):
        members = [value for _, value in run]
        tup_list.append((members[0], members[-1]))
    return tup_list
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
156
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Parse annotation file to get chromosome, exon location and CDS info for each transcript
def parse_gtf_file(annotation_file):
    """Fill master_dict with the features listed in an annotation file.

    annotation_file is an iterable of tab-separated GTF/GFF3 lines. Blank
    lines, comment lines and lines whose feature type is not listed in the
    module-level delimiters are skipped. For every remaining line the
    transcript id (and, for gene feature types, the gene name) is cut out of
    the attribute column using the learned delimiters and normalised.

    If the transcript is in master_dict, the (start, end) pair is appended
    under its feature type when that type is a key of the record, and the
    strand, chrom, version and (if found) gene_name fields are updated.
    Otherwise the transcript id is written to notinfasta.

    Uses the module-level names delimiters, TRAN_VERSION, master_dict and
    notinfasta. Prints an error and exits if a line has fewer than three
    columns.
    """
    transcript_rules = delimiters["transcripts"]
    gene_rules = delimiters["genes"]
    for line in annotation_file:
        if line == "\n" or line.startswith("#"):
            continue
        splitline = line.replace("\n", "").split("\t")
        chrom = splitline[0]
        try:
            annot_type = splitline[2].lower()
        except IndexError:
            print("ERROR tried to index to second item in splitline: ", splitline, line)
            sys.exit(1)
        if (annot_type not in transcript_rules["annot_types"]
                and annot_type not in gene_rules["annot_types"]):
            continue
        if annot_type in ("five_prime_utr", "three_prime_utr"):
            annot_type = "utr"
        strand = splitline[6]
        if strand == "+":
            start = int(splitline[3])
            end = int(splitline[4])
        else:
            # NOTE(review): coordinates on any other strand are shifted by
            # one; the reason is not visible here - preserved as found.
            start = int(splitline[3]) + 1
            end = int(splitline[4]) + 1
        desc = splitline[8]
        tran = desc
        gene = desc

        # Isolate the transcript id from the attribute column.
        for item in transcript_rules["before"]:
            if item in tran:
                tran = tran.split(item)[1]
        for item in transcript_rules["after"]:
            if item in tran:
                tran = tran.split(item)[0]
        if TRAN_VERSION and "." in tran:
            # The id is the text before the first "."; the version is the
            # number after the last ".", ignoring any "_suffix".
            pieces = tran.split(".")
            version = int(pieces[-1].split("_")[0])
            tran = pieces[0]
        else:
            version = "NULL"
        tran = tran.replace("-", "_").replace(".", "_")
        tran = tran.replace("(", "").replace(")", "")
        tran = tran.replace(" ", "").replace("\t", "")
        tran = tran.upper()
        tran = tran.replace("GENE_", "").replace("ID_", "")

        # Isolate the gene name, but only for gene feature types.
        gene_found = False
        if annot_type in gene_rules["annot_types"]:
            for item in gene_rules["before"]:
                if item in gene:
                    gene_found = True
                    gene = gene.split(item)[1]
            for item in gene_rules["after"]:
                if item in gene:
                    gene = gene.split(item)[0]
            # Double any single quote in the name.
            gene = gene.replace("'", "''")
            gene = gene.replace("GENE_", "")
            gene = gene.replace("ID_", "")
            gene = gene.upper()

        if tran in master_dict:
            record = master_dict[tran]
            if annot_type in record:
                record[annot_type].append((start, end))
            record["strand"] = strand
            record["chrom"] = chrom
            record["version"] = version
            if gene_found:
                record["gene_name"] = gene
        else:
            notinfasta.write("{}\n".format(tran))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
234
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Read the annotation file; the handle is closed once parsing has finished.
with open(sys.argv[1], "r") as annotation_file:
    parse_gtf_file(annotation_file)


# Remove transcripts that were in fasta file but not in annotation_file.
# A record whose "chrom" is still empty was never updated by parse_gtf_file.
# The ids are collected first because a dict must not change size while it
# is being iterated.
notinannotation = [tran for tran in master_dict if master_dict[tran]["chrom"] == ""]
for tran in notinannotation:
    del master_dict[tran]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
247
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
248 #Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
249 coding_genes_dict = {}
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
250 #parse master_dict to calculate length, cds_start/stop and exon junction positions
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
251 for tran in master_dict:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
252 try:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
253 transeq = master_dict[tran]["seq"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
254 except Exception as e:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
255 print ("not in fasta", tran)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
256 notinfasta.write("{}\n".format(tran))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
257 continue
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
258 exon_junctions = []
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
259 total_length = len(transeq)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
260 three_len = 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
261 five_len = 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
262 strand = master_dict[tran]["strand"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
263 if master_dict[tran]["gene_name"] == "":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
264 master_dict[tran]["gene_name"] = tran
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
265 gene = master_dict[tran]["gene_name"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
266 if gene not in coding_genes_dict:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
267 coding_genes_dict[gene] = False
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
268
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
269 if master_dict[tran]["cds"] == []:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
270 tran_type = "noncoding"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
271 cds_start = 'NULL'
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
272 cds_stop = 'NULL'
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
273 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
274 #get utr lengths from annotation
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
275 tran_type = "coding"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
276 coding_genes_dict[gene] = True
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
277 sorted_exons = sorted(master_dict[tran]["exon"])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
278 sorted_cds = sorted(master_dict[tran]["cds"])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
279 min_cds = sorted_cds[0][0]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
280 #Some annotation files do not have utr annotation types, so fix that here if thats the case
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
281 if master_dict[tran]["utr"] == []:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
282 for exon_tup in master_dict[tran]["exon"]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
283 if exon_tup not in master_dict[tran]["cds"]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
284 # Now check if this overlaps with any of the CDS exons
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
285 overlap = False
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
286 for cds_tup in master_dict[tran]["cds"]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
287 if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
288 master_dict[tran]["utr"].append((cds_tup[1],exon_tup[1]))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
289 overlap = True
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
290 if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
291 master_dict[tran]["utr"].append((exon_tup[0],cds_tup[0]))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
292 overlap = True
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
293 if overlap == False:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
294 master_dict[tran]["utr"].append(exon_tup)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
295
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
296
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
297 '''
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
298 if tran == "NM_001258497":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
299 print ("sorted cds",sorted_cds)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
300 print ("min cds",min_cds)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
301 print ("chrom",master_dict[tran]["chrom"])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
302 print ("sorted exons", sorted_exons)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
303 print ("utr",master_dict[tran]["utr"])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
304 sys.exit()
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
305 '''
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
306 #if tran == "ENST00000381401":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
307 # print ("min cds,sorted utr",min_cds,sorted(master_dict[tran]["utr"]))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
308 for tup in sorted(master_dict[tran]["utr"]):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
309 #if tran == "ENST00000381401":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
310 # print ("tup", tup)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
311 if tup[0] < min_cds:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
312 five_len += (tup[1]-tup[0])+1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
313 #if tran == "ENST00000381401":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
314 # print ("adding to fivelen")
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
315 elif tup[0] > min_cds:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
316 three_len += (tup[1] - tup[0])+1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
317 #if tran == "ENST00000381401":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
318 # print ("adding to three len")
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
319 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
320 pass
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
321 if strand == "+":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
322 if len(sorted_exons) > 1:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
323 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
324 sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
325 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
326 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
327 master_dict[tran]["exon"] = sorted_exons
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
328 cds_start = (five_len+pseudo_utr_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
329 cds_stop = ((total_length - three_len)-pseudo_utr_len)+1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
330 elif strand == "-":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
331 if len(sorted_exons) > 1:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
332 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
333 sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
334 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
335 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
336 master_dict[tran]["exon"] = sorted_exons
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
337 cds_start = (three_len+pseudo_utr_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
338 cds_stop = ((total_length - (five_len))-pseudo_utr_len)+1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
339 #if tran == "ENST00000381401":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
340 # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
341 # #sys.exit()
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
342 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
343 print ("strand is unknown: {}".format(strand))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
344 sys.exit()
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
345
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
346 #get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
347 total_intronic = 0
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
348 try:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
349 if strand == "+":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
350 tx_start = min(sorted(master_dict[tran]["exon"]))[0]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
351 prev_end = tx_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
352 for tup in sorted(master_dict[tran]["exon"])[:-1]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
353 total_intronic += tup[0]-prev_end
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
354 exon_junctions.append(((tup[1])-tx_start)-total_intronic)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
355 prev_end = tup[1]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
356 elif strand == "-":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
357 tx_start = max(sorted(master_dict[tran]["exon"]))[-1]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
358 prev_end = tx_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
359 for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
360 total_intronic += (tup[0]+1)-prev_end
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
361 exon_junctions.append(((tup[1])-tx_start)-total_intronic)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
362 prev_end = tup[1]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
363 except:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
364 if strand == "+":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
365 tx_start = min(sorted(master_dict[tran]["cds"]))[0]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
366 prev_end = tx_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
367 for tup in sorted(master_dict[tran]["cds"])[:-1]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
368 total_intronic += tup[0]-prev_end
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
369 exon_junctions.append(((tup[1])-tx_start)-total_intronic)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
370 prev_end = tup[1]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
371 elif strand == "-":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
372 tx_start = max(sorted(master_dict[tran]["cds"]))[-1]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
373 prev_end = tx_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
374 for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
375 total_intronic += (tup[0]+1)-prev_end
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
376 exon_junctions.append(((tup[1])-tx_start)-total_intronic)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
377 prev_end = tup[1]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
378 if strand == "+" and cds_start != "NULL":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
379 master_dict[tran]["cds_start"] = cds_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
380 master_dict[tran]["cds_stop"] = cds_stop
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
381 elif strand == "-" and cds_start != "NULL":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
382 master_dict[tran]["cds_start"] = cds_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
383 master_dict[tran]["cds_stop"] = cds_stop
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
384 master_dict[tran]["strand"] = strand
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
385 master_dict[tran]["tran_type"] = tran_type
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
386 master_dict[tran]["exon_junctions"] = exon_junctions
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
387
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Choose one "principal" transcript per gene and flag it in master_dict.
#   * Gene marked True in coding_genes_dict: the transcript with the largest
#     (cds_stop - cds_start) wins; on a tie the transcript with the larger
#     "length" wins. Transcripts lacking a usable cds_start/cds_stop are ignored.
#   * Any other gene: the transcript with the largest "length" wins.
# NOTE(review): this block was recovered from a listing that had lost its
# indentation; the final `else` is assumed to pair with the coding-gene test.
# Confirm against the original script.
longest_tran_dict = {}
for tran in master_dict:
    try:
        gene = master_dict[tran]["gene_name"]
    except KeyError:
        # No gene name recorded for this transcript, so it cannot be grouped by gene
        continue
    # Kept as an equality test so only a value equal to True selects the coding branch
    if coding_genes_dict[gene] == True:
        if "cds_start" not in master_dict[tran]:
            continue
        if master_dict[tran]["cds_stop"] == "NULL" or master_dict[tran]["cds_start"] == "NULL":
            continue
        cds_length = master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"]
        if gene not in longest_tran_dict:
            longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
        elif cds_length > longest_tran_dict[gene]["length"]:
            longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
        elif cds_length == longest_tran_dict[gene]["length"]:
            # Same CDS length as the current best: prefer the longer transcript
            if master_dict[tran]["length"] > master_dict[longest_tran_dict[gene]["tran"]]["length"]:
                longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
    else:
        length = master_dict[tran]["length"]
        if gene not in longest_tran_dict:
            longest_tran_dict[gene] = {"tran": tran, "length": length}
        elif length > longest_tran_dict[gene]["length"]:
            longest_tran_dict[gene] = {"tran": tran, "length": length}


# Flag the winning transcript of every gene
for gene in longest_tran_dict:
    longest_tran = longest_tran_dict[gene]["tran"]
    master_dict[longest_tran]["principal"] = 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
420
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Show the first ten transcript ids (and whatever gene names they carry) so the
# user can check that the annotation file was parsed as expected.
sample_transcripts = list(master_dict)[:10]
gene_sample = []
for key in sample_transcripts:
    try:
        gene_sample.append(master_dict[key]["gene_name"])
    except KeyError:
        # Transcripts without a gene name are simply left out of the sample
        pass

print ("Here is a sample of the transcript ids: {}".format(sample_transcripts))
print ("Here is a sample of the gene names: {}".format(gene_sample))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
430
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
431
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Takes a transcript, an inclusive range of transcriptomic positions and a master_dict (see ribopipe scripts) and returns the chromosome plus one genomic position per transcriptomic position.
def tran_to_genome(tran, start_pos, end_pos, master_dict):
    """Translate transcript positions start_pos..end_pos (inclusive) to genomic positions.

    Args:
        tran: transcript id used as a key into master_dict.
        start_pos: first transcript position to translate.
        end_pos: last transcript position to translate (included).
        master_dict: mapping of transcript id to a dict holding at least
            "chrom", "strand" and "exon" (a sequence of (start, end) pairs).

    Returns:
        ("Null", []) when tran is not in master_dict, otherwise
        (chrom, positions). positions has one entry per requested position
        for "+" and "-" strands and is empty for any other strand value.

    Each position is walked through the exons (in stored order for "+",
    reversed for "-"): while the remaining offset is larger than
    end - start of the current exon, end - start + 1 is subtracted and the
    walk moves on. The result is exon_start + offset - 1 on "+" and
    exon_end - offset + 1 on "-". An offset that outruns every exon is
    resolved against the last exon visited; with no exons the anchor is 0.
    NOTE(review): whether callers pass 0-based or 1-based positions is not
    visible here - confirm before relying on exact exon-boundary results.
    """
    # Built before the lookup, exactly like the original position list
    offsets = range(start_pos, end_pos + 1)
    if tran not in master_dict:
        return ("Null", [])

    info = master_dict[tran]
    chrom = info["chrom"]
    strand = info["strand"]
    exons = info["exon"]

    # edge: which end of an exon anchors the walk; step: direction of travel on the genome
    if strand == "+":
        walk, edge, step = exons, 0, 1
    elif strand == "-":
        walk, edge, step = exons[::-1], 1, -1
    else:
        # Unknown strand: nothing can be translated
        return (chrom, [])

    genomic_positions = []
    for offset in offsets:
        anchor = 0
        for exon in walk:
            anchor = exon[edge]
            span = exon[1] - exon[0]
            if offset <= span:
                break
            # Position lies beyond this exon, consume it and keep walking
            offset = (offset - span) - 1
        genomic_positions.append((anchor + step * offset) - step)
    return (chrom, genomic_positions)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
477
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
478
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
479
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
480
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# One empty dictionary per ORF category; the category names match the ORF
# tables created in the sqlite file further down.
orf_dict = {
    orf_category: {}
    for orf_category in (
        "uorf",
        "ouorf",
        "cds",
        "nested",
        "odorf",
        "dorf",
        "minusone",
        "readthrough",
        "plusone",
        "noncoding",
        "extension",
        "inframe_stop",
    )
}

# Codons that open an ORF and codons that close one when scanning sequences
start_codons = ["ATG", "GTG", "CTG"]
stop_codons = ["TAG", "TAA", "TGA"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
497
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
498
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Keep track of the longest transcript for each noncoding gene, append this to transcript list later
longest_noncoding = {}


tran_count = 0
# This section gathers the CDS intervals of every transcript that has a sequence
# (all transcripts contribute to this, not just those in the transcript list),
# groups them by chromosome and builds one interval tree per chromosome so that
# positions can be checked against annotated coding regions later.
# NOTE(review): recovered from a listing without indentation; the interval loop
# is assumed to run only for transcripts whose cds_start is not "NULL" - confirm.
genomic_cds_dict = {}
tree_dict = {}
# Intervals already stored per chromosome, so the duplicate check does not rescan the list
seen_cds = {}
for transcript in master_dict:
    tran_count += 1
    if "seq" not in master_dict[transcript]:
        continue
    chrom = master_dict[transcript]["chrom"]
    if chrom not in genomic_cds_dict:
        genomic_cds_dict[chrom] = []
        seen_cds[chrom] = set()
    if "cds_start" not in master_dict[transcript]:
        continue
    cds_start = master_dict[transcript]["cds_start"]
    if cds_start == "NULL":
        continue
    for tup in master_dict[transcript]["cds"]:
        # Intervals whose two ends are equal are never stored
        # (presumably because the interval tree cannot hold empty intervals - confirm)
        if tup[0] == tup[1]:
            continue
        cds_key = tuple(tup)
        if cds_key not in seen_cds[chrom]:
            seen_cds[chrom].add(cds_key)
            genomic_cds_dict[chrom].append(tup)

print ("genomic cds dict built")
print (list(genomic_cds_dict))
for chrom in genomic_cds_dict:
    tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
533
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
534 #print (list(tree_dict))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
535
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
536
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Open the sqlite file named by the sixth command line argument and make sure
# every table written by this script exists: transcripts, coding_regions, exons
# and one table per ORF category (all ORF tables share the same columns).
connection = sqlite3.connect("{}".format(sys.argv[6]))
cursor = connection.cursor()
schema_statements = (
    "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));",
    "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));",
    "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));",
    "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
    "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));",
)
# Run the statements in the order listed, then persist the schema in one commit
for statement in schema_statements:
    cursor.execute(statement)
connection.commit()
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
555
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
556
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
557 print ("Finding ORFs")
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
558 transcript_count = 0
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
559 total_transcripts = len(list(master_dict))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
560 for transcript in master_dict:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
561 #print ("transcript",transcript)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
562 #if transcript != "ENST00000316448":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
563 # continue
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
564 transcript_count += 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
565 if transcript_count%100 == 0:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
566 print ("Transcripts complete: {}/{}".format(transcript_count,total_transcripts))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
567 if "seq" not in master_dict[transcript]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
568 print ("transcript {} has no sequence".format(transcript))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
569 continue
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
570 seq = master_dict[transcript]["seq"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
571 cds_start = "NULL"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
572 cds_stop = "NULL"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
573 transcript_len = len(seq)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
574 if "cds_start" in master_dict[transcript]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
575 cds_start = master_dict[transcript]["cds_start"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
576 cds_stop = master_dict[transcript]["cds_stop"]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
577
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
578 #Find out what regions of this transcript overlap with any other coding regions
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
579 coding_positions = []
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
580 if cds_start != "NULL":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
581 #If this is a coding transcript don't bother checking the CDS
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
582 for i in range(cds_start,cds_stop):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
583 coding_positions.append(i)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
584 #check 5' leader
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
585 chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
586 for i in range(0,cds_start):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
587 genomic_pos = pos_list[i]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
588 overlap = tree_dict[chrom][genomic_pos]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
589 if len(overlap) != 0:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
590 coding_positions.append(i)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
591 #check 3' trailer
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
592 chrom, pos_list = tran_to_genome(transcript, cds_stop, transcript_len, master_dict)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
593 for i in range(cds_stop,transcript_len+1):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
594 #print ("i",i)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
595 genomic_pos = pos_list[i-cds_stop]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
596 #print ("genomic position",genomic_pos)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
597 overlap = tree_dict[chrom][genomic_pos]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
598 if len(overlap) != 0:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
599 #print ("overlap not empty appending i",overlap)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
600 coding_positions.append(i)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
601 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
602 #check entire transcript
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
603 chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
604 for i in range(0,transcript_len):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
605 genomic_pos = pos_list[i]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
606 overlap = tree_dict[chrom][genomic_pos]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
607 if len(overlap) != 0:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
608 coding_positions.append(i)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
609 coding_positions_tuple = to_ranges(coding_positions)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
610 coding_dict[transcript] = coding_positions_tuple
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
611 coding_positions = set(coding_positions)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
612 #if this is a coding transcript find the minusone, readthrough, plusone co-ordinates
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
613 if cds_start != "NULL":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
614 if pseudo_utr_len != 0:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
615 cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
616 recoding_dict = {0:"minusone",1:"readthrough",2:"plusone"}
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
617 for addition in recoding_dict:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
618 orftype = recoding_dict[addition]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
619 for i in range(cds_stop+addition,transcript_len,3):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
620 if seq[i:i+3] in stop_codons:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
621 orf_seq = seq[cds_stop:i+3]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
622 orf_start = cds_stop
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
623 orf_stop = i+2 # +2 so it refers to the end of the stop codon
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
624 start_codon = None
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
625 if orf_seq:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
626 length = len(orf_seq)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
627 orf_pos_list = []
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
628 #determine how many nucleotides in this orf overlap with an annotated coding region
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
629 cds_cov_count = 0.0
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
630 for position in range(orf_start,orf_stop):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
631 orf_pos_list.append(position)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
632 for pos in range(orf_start, orf_stop+1):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
633 if pos in coding_positions:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
634 cds_cov_count += 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
635 cds_cov = cds_cov_count/length
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
636 cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
637 break
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
638 for frame in [0,1,2]:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
639 for i in range(frame,transcript_len,3):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
640 if seq[i:i+3] in start_codons:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
641 for x in range(i, transcript_len,3):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
642 if seq[x:x+3] in stop_codons:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
643 orf_seq = seq[i:x+3]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
644 orf_start = i
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
645 orf_stop = x+2 # +2 so it refers to the end of the stop codon
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
646 start_codon = seq[i:i+3]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
647 length = len(orf_seq)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
648 orf_pos_list = []
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
649 #determine how many nucleotides in this orf overlap with an annotated coding region
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
650 cds_cov_count = 0.0
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
651 for pos in range(orf_start, orf_stop+1):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
652 if pos in coding_positions:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
653 cds_cov_count += 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
654 cds_cov = float(cds_cov_count)/float(length)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
655 # Now determine orf type
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
656 if cds_start == "NULL":
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
657 orftype = "noncoding"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
658 else:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
659 #print ("cds start is not null :{}:".format(cds_start))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
660 if orf_start == cds_start and orf_stop == cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
661 orftype = "cds"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
662 elif orf_start < cds_start and orf_stop == cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
663 orftype = "extension"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
664 #special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
665 orf_stop = cds_start
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
666 cds_cov_count = 0.0
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
667 for pos in range(orf_start, orf_stop+1):
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
668 if pos in coding_positions:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
669 cds_cov_count += 1
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
670 cds_cov = float(cds_cov_count)/float(length)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
671 orf_seq = seq[orf_start:cds_start]
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
672 length = len(orf_seq)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
673 elif orf_start < cds_start and orf_stop <= cds_start:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
674 orftype = "uorf"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
675 elif orf_start < cds_start and orf_stop > cds_start:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
676 orftype = "ouorf"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
677 elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop <= cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
678 if orf_stop == cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
679 break
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
680 orftype = "nested"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
681 elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop > cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
682 orftype = "odorf"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
683 elif orf_start > cds_stop and orf_stop > cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
684 orftype = "dorf"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
685 if orf_stop > cds_start and orf_stop < cds_stop:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
686 if (orf_stop+1)%3 == cds_start%3:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
687 orftype = "inframe_stop"
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
688 if transcript not in orf_dict:
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
689 orf_dict[orftype][transcript] = []
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
690 cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov))
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
691 break
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Tally of the codons found at the annotated cds_start and cds_stop positions.
# If there is an issue with the cds co-ordinates, the "starts"/"stops" counts will
# be much lower than the matching "other" counts. The "starts"/"stops" counts begin
# at 1 to prevent division by 0 when the other/starts and other/stops ratios are taken.
nuc_dict = {"stops":{"stops":1,"other":0}, "starts":{"starts":1,"other":0}}
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
696
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
def calcgc(seq):
    """Return the GC content of ``seq`` as a percentage rounded to 2 decimals.

    Only upper-case ``G`` and ``C`` are counted; every character of ``seq``
    (whatever it is) contributes to the denominator.

    Args:
        seq: Nucleotide sequence string.

    Returns:
        The string "NULL" (the placeholder this script uses for a missing
        value) when ``seq`` is empty, otherwise a float between 0 and 100.
    """
    if seq == "":
        return "NULL"
    gc_count = seq.count("G") + seq.count("C")
    gc = (gc_count / float(len(seq))) * 100
    return round(gc, 2)
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
715
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
716
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
717
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
718
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
def _sql_scalar(value):
    """Turn a value that used to be interpolated unquoted into a bindable one.

    The script's "NULL" placeholder becomes None (bound as SQL NULL) and numeric
    text becomes an int/float, so a bound value keeps the type that the unquoted
    literal had when the statement was built with str.format.
    """
    if isinstance(value, str):
        if value == "NULL":
            return None
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                continue
    return value


# Write one row per transcript to the transcripts table, plus its exon and
# coding-region rows, using bound parameters so that names containing quotes
# cannot break (or alter) the SQL statements.
for transcript in master_dict:
    tran_info = master_dict[transcript]
    length = tran_info["length"]
    cds_start = tran_info["cds_start"]
    cds_stop = tran_info["cds_stop"]
    seq = tran_info["seq"]
    strand = tran_info["strand"]
    chrom = tran_info["chrom"]
    gene = tran_info["gene_name"]
    gc = calcgc(seq)
    five_gc = "NULL"
    cds_gc = "NULL"
    three_gc = "NULL"
    if cds_start != "NULL":
        # cds_start is 1 based and points at the first nucleotide of the start
        # codon, so take one to get the 0 based python index of that nucleotide.
        cds_start_idx = cds_start - 1
        # The leader is everything before the start codon; the cds runs from the
        # first nucleotide of the start codon to cds_stop (last nucleotide of the
        # stop codon, 1 based, so it is already the exclusive 0 based end).
        five_gc = calcgc(seq[:cds_start_idx])
        cds_gc = calcgc(seq[cds_start_idx:cds_stop])
        three_gc = calcgc(seq[cds_stop:])
        # check that the nucleotide cds_start points to is the first of the start codon
        start_nuc = seq[cds_start_idx:cds_start_idx + 3]
        if start_nuc == "ATG":
            nuc_dict["starts"]["starts"] += 1
        else:
            nuc_dict["starts"]["other"] += 1
        # check that cds_stop points to the last nucleotide of a stop codon
        stop_nuc = seq[cds_stop - 3:cds_stop]
        if stop_nuc in ["TAG", "TAA", "TGA"]:
            nuc_dict["stops"]["stops"] += 1
        else:
            nuc_dict["stops"]["other"] += 1

    # Both type flags are stored as 1 (coding) or 0 (anything else).
    # The explicit comparison with True is kept because the value types held in
    # coding_genes_dict are not visible from this part of the script.
    gene_type = 1 if coding_genes_dict[gene] == True else 0
    tran_type = 1 if tran_info["tran_type"] == "coding" else 0

    # Lists are stored as comma separated text without spaces or brackets.
    start_list = str(tran_info["start_list"]).replace(" ", "").strip("[]")
    stop_list = str(tran_info["stop_list"]).replace(" ", "").strip("[]")
    exon_junctions = str(tran_info["exon_junctions"]).replace(" ", "").strip("[]")
    principal = tran_info["principal"]
    version = tran_info["version"]

    # Columns that were quoted in the formatted statement are bound as text,
    # the ones that were unquoted go through _sql_scalar.
    cursor.execute(
        "INSERT INTO transcripts VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);",
        (
            str(transcript),
            str(gene),
            _sql_scalar(length),
            _sql_scalar(cds_start),
            _sql_scalar(cds_stop),
            str(seq),
            str(strand),
            stop_list,
            start_list,
            exon_junctions,
            tran_type,
            gene_type,
            _sql_scalar(principal),
            _sql_scalar(version),
            _sql_scalar(gc),
            _sql_scalar(five_gc),
            _sql_scalar(cds_gc),
            _sql_scalar(three_gc),
            str(chrom),
        ),
    )

    for exon in tran_info["exon"]:
        cursor.execute(
            "INSERT INTO exons VALUES(?,?,?);",
            (str(transcript), _sql_scalar(exon[0]), _sql_scalar(exon[1])),
        )
    if transcript in coding_dict:
        for region in coding_dict[transcript]:
            cursor.execute(
                "INSERT INTO coding_regions VALUES(?,?,?);",
                (str(transcript), _sql_scalar(region[0]), _sql_scalar(region[1])),
            )

connection.commit()
connection.close()
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
782
4af7eb738348 Uploaded
triasteran
parents:
diff changeset
# Report likely problems with the input files. nuc_dict holds, for the annotated
# cds_start/cds_stop positions, how many carried the expected codon ("starts" /
# "stops") and how many did not ("other"); warn when more than 5% did not.
# float() keeps the ratio a true division, as done for the other ratios in this script.
start_mismatch_ratio = float(nuc_dict["starts"]["other"]) / float(nuc_dict["starts"]["starts"])
if start_mismatch_ratio > 0.05:
    print ("Warning: {} transcripts do not have an AUG at the CDS start position".format(nuc_dict["starts"]["other"]))
stop_mismatch_ratio = float(nuc_dict["stops"]["other"]) / float(nuc_dict["stops"]["stops"])
if stop_mismatch_ratio > 0.05:
    print ("Warning: {} transcripts do not have a stop codon at the CDS stop position".format(nuc_dict["stops"]["other"]))
# Transcripts present in the fasta file but absent from the annotation are dropped.
if len(notinannotation) > 0:
    print ("Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(len(notinannotation)))