# Python3 script which takes in an annotation file (GTF/GFF3) and a transcriptomic fasta file
# and produces an sqlite file which can be uploaded to Trips-Viz.
# All co-ordinates produced are 1 based.
# All start codon positions (including cds_start) should be at the first nucleotide of the codon.
# All stop codon positions (including cds_stop) should be at the last nucleotide of the codon.
import itertools
import re
import sqlite3
import sys

from intervaltree import Interval, IntervalTree

# Expected command line arguments:
#   1: annotation file (GTF or GFF3)
#   2: transcriptomic fasta file
#   3: pseudo UTR length (integer)
#   4: an example transcript_id from the annotation file
#   5: an example gene name from the annotation file
#   6: path of the sqlite file to write
if len(sys.argv) < 7:
    print("Usage: {} <annotation.gtf|gff3> <transcripts.fasta> <pseudo_utr_len> "
          "<example_transcript_id> <example_gene_name> <output.sqlite>".format(sys.argv[0]))
    sys.exit(1)

# This should be a GTF or GFF3 file
annotation_file = open(sys.argv[1], "r")
# This needs to be the transcriptomic fasta file
fasta_file = open(sys.argv[2], "r")
# This value is used to create UTRs of this length, useful when looking at
# transcriptomes without annotated UTRs
pseudo_utr_len = int(sys.argv[3])
# An example of a transcript_id from the annotation file, e.g ENST000000123456
user_transcript_id = sys.argv[4]
# An example of a gene name from the annotation file
user_gene_name = sys.argv[5]
# Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
TRAN_VERSION = True


# Delimiters that surround transcript ids and gene names in the input files, together
# with the annotation types (third column of the annotation file) of the lines they
# were seen on. They are discovered below from the example ids supplied by the user.
delimiters = {"transcripts": {"before": [], "after": [], "annot_types": ["cds", "utr"]},
              "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]}}

punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"]


def _trim_before_delimiter(text, piece_index):
    """Shorten the text that precedes an id.

    For each punctuation character present in text, text is replaced by the
    piece at piece_index of the split on that character, provided that piece
    is at least 5 characters long.
    """
    for item in punctuation:
        if item in text:
            piece = text.split(item)[piece_index]
            if len(piece) >= 5:
                text = piece
    return text


def _add_unique(values, value):
    """Append value to the list values unless it is already present."""
    if value not in values:
        values.append(value)


# Find delimiters in the annotation and fasta files using the user_transcript_id
# and user_gene_name examples given by user.
for line in annotation_file:
    if user_transcript_id in line:
        # Lower-case before the membership test so each annotation type is stored once
        annot_type = line.split("\t")[2].lower()
        _add_unique(delimiters["transcripts"]["annot_types"], annot_type)
        splitline = line.split(user_transcript_id)
        before_delimiter = _trim_before_delimiter(splitline[0], -1)
        after_delimiter = splitline[1][:2]
        if len(before_delimiter) >= 5:
            _add_unique(delimiters["transcripts"]["before"], before_delimiter)
        _add_unique(delimiters["transcripts"]["after"], after_delimiter)
    if user_gene_name in line:
        annot_type = line.split("\t")[2].lower()
        _add_unique(delimiters["genes"]["annot_types"], annot_type)
        splitline = line.split(user_gene_name)
        before_delimiter = _trim_before_delimiter(splitline[0], -1)
        after_delimiter = splitline[1][0]
        if len(before_delimiter) >= 5:
            _add_unique(delimiters["genes"]["before"], before_delimiter)
        # Only punctuation characters are accepted as the delimiter after a gene name
        if after_delimiter in punctuation:
            _add_unique(delimiters["genes"]["after"], after_delimiter)
for line in fasta_file:
    if user_transcript_id in line:
        splitline = line.split(user_transcript_id)
        # Unlike the annotation file, the second piece of each split is kept here
        before_delimiter = _trim_before_delimiter(splitline[0], 1)
        after_delimiter = splitline[1][0]
        if len(before_delimiter) >= 5:
            _add_unique(delimiters["transcripts"]["before"], before_delimiter)
        _add_unique(delimiters["transcripts"]["after"], after_delimiter)
fasta_file.close()
annotation_file.close()


if delimiters["transcripts"]["before"] == []:
    print ("ERROR: No transcript_id with the name {} could be found in the annotation file".format(user_transcript_id))
    sys.exit()
if delimiters["genes"]["before"] == []:
    print ("ERROR: No gene with the name {} could be found in the annotation file".format(user_gene_name))
    sys.exit()

# Per-transcript records, keyed by transcript id
master_dict = {}
# Ranges of coding positions for each transcript, filled in while finding ORFs
coding_dict = {}
# Transcript ids found in the annotation file but not in the fasta file are written here
notinfasta = open("notinfasta.csv", "w")

def get_start_stops(transcript_sequence):
    """Return the 1-based positions of all start and stop codons in a sequence.

    The sequence is upper-cased before searching. Start codons (ATG) are
    reported at their first nucleotide, stop codons (TAA, TAG, TGA) at their
    last nucleotide.

    Returns a dict of the form {'starts': [...], 'stops': [...]}.
    """
    transcript_sequence = transcript_sequence.upper()
    codon_sets = (('starts', ['ATG']), ('stops', ['TAA', 'TAG', 'TGA']))
    seq_frames = {'starts': [], 'stops': []}
    for positions, codons in codon_sets:
        pat = re.compile('|'.join(codons))
        # m.start() is 0-based: +1 gives the 1-based first nucleotide of the codon,
        # +3 gives the 1-based last nucleotide of the codon
        offset = 1 if positions == "starts" else 3
        seq_frames[positions] = [m.start() + offset for m in pat.finditer(transcript_sequence)]
    return seq_frames


# Parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons.
with open(sys.argv[2], "r") as fasta_file:
    read_fasta = fasta_file.read()
split_fasta = read_fasta.split(">")
# The text before the first ">" is not a fasta entry, so it is skipped
for entry in split_fasta[1:]:
    newline_split = entry.split("\n")
    tran = newline_split[0]
    # Cut the header line down to the transcript id
    for item in delimiters["transcripts"]["after"]:
        if item in tran:
            tran = tran.split(item)[0]
    tran = tran.replace("-","_").replace("(","").replace(")","")
    seq = "".join(newline_split[1:])
    if "_PAR_Y" in tran:
        tran += "_chrY"
    elif "_PAR_X" in tran:
        tran += "_chrX"
    tran = tran.upper()
    # Only the first entry seen for a transcript id is kept
    if tran not in master_dict:
        starts_stops = get_start_stops(seq)
        master_dict[tran] = {"utr":[], "cds":[], "exon":[],"start_codon":[],"stop_codon":[],"start_list":starts_stops["starts"],
                             "stop_list":starts_stops["stops"],"transcript":[], "strand":"" ,"gene_name":"","chrom":"","seq":seq,"cds_start":"NULL","cds_stop":"NULL",
                             "length":len(seq),"principal":0,"version":"NULL"}


def to_ranges(iterable):
    """Collapse integers into inclusive (first, last) runs of consecutive values.

    Duplicates are ignored and the returned list of tuples is in ascending order.
    """
    tup_list = []
    values = sorted(set(iterable))
    # Within a run of consecutive integers, value minus index stays constant,
    # so grouping on that difference yields one group per run
    for _, group in itertools.groupby(enumerate(values), lambda t: t[1] - t[0]):
        group = list(group)
        tup_list.append((group[0][1], group[-1][1]))
    return tup_list

def parse_gtf_file(annotation_file):
    """Parse the annotation file to get chromosome, exon location and CDS info for each transcript.

    annotation_file is an iterable of tab-separated annotation lines. For every
    line whose annotation type is listed in the module-level delimiters dict,
    the transcript id and gene name are extracted from the ninth column and the
    matching master_dict entry is updated. Transcript ids that are not in
    master_dict are written to notinfasta. Nothing is returned.
    """
    known_types = delimiters["transcripts"]["annot_types"] + delimiters["genes"]["annot_types"]
    for line in annotation_file:
        # Skip blank lines and comment lines
        if line == "\n" or line[0] == '#':
            continue
        splitline = (line.replace("\n","")).split("\t")
        chrom = splitline[0]
        try:
            annot_type = splitline[2].lower()
        except IndexError:
            print ("ERROR tried to index to second item in splitline: ",splitline, line)
            sys.exit()
        if annot_type not in known_types:
            continue
        if annot_type == "five_prime_utr" or annot_type == "three_prime_utr":
            annot_type = "utr"
        strand = splitline[6]
        if strand == "+":
            start = int(splitline[3])
            end = int(splitline[4])
        else:
            # NOTE(review): co-ordinates on any other strand are shifted by one; confirm intent
            start = int(splitline[3])+1
            end = int(splitline[4])+1
        desc = splitline[8]
        tran = desc
        gene = desc
        for item in delimiters["transcripts"]["before"]:
            if item in tran:
                tran = tran.split(item)[1]
        for item in delimiters["transcripts"]["after"]:
            if item in tran:
                tran = tran.split(item)[0]
        if "." in tran and TRAN_VERSION == True:
            # The version is the number after the last ".", up to any "_"
            tran = tran.split(".")
            version = int(tran[-1].split("_")[0])
            tran = tran[0]
        else:
            version = "NULL"
        # Normalise the id the same way the fasta headers are normalised
        tran = tran.replace("-","_").replace(".","_")
        tran = tran.replace("(","").replace(")","")
        tran = tran.replace(" ","").replace("\t","")
        tran = tran.upper()
        tran = tran.replace("GENE_","").replace("ID_","")

        gene_found = False

        if annot_type in delimiters["genes"]["annot_types"]:
            for item in delimiters["genes"]["before"]:
                if item in gene:
                    gene_found = True
                    gene = gene.split(item)[1]
            for item in delimiters["genes"]["after"]:
                if item in gene:
                    gene = gene.split(item)[0]
            # Single quotes are doubled because gene names are later placed in SQL string literals
            gene = gene.replace("'","''")
            gene = gene.replace("GENE_","")
            gene = gene.replace("ID_","")
            gene = gene.upper()

        if tran in master_dict:
            # Only annotation types that have a list in the record are stored
            if annot_type in master_dict[tran]:
                master_dict[tran][annot_type].append((start, end))
            master_dict[tran]["strand"] = strand
            master_dict[tran]["chrom"] = chrom
            master_dict[tran]["version"] = version
            if gene_found == True:
                master_dict[tran]["gene_name"] = gene
        else:
            notinfasta.write("{}\n".format(tran))

with open(sys.argv[1], "r") as annotation_file:
    parse_gtf_file(annotation_file)


# Remove transcripts that were in the fasta file but not in the annotation file
# (parse_gtf_file never filled in their chromosome)
notinannotation = [tran for tran in master_dict if master_dict[tran]["chrom"] == ""]
for tran in notinannotation:
    del master_dict[tran]

def _exon_junctions(intervals, strand):
    """Return exon junction positions computed from (start, end) tuples.

    For the "+" strand every interval except the last contributes a junction,
    for the "-" strand every interval except the first does, walking from the
    highest co-ordinate down. Any other strand gives an empty list. Raises
    ValueError when intervals is empty and strand is "+" or "-".
    """
    junctions = []
    total_intronic = 0
    ordered = sorted(intervals)
    if strand == "+":
        tx_start = min(ordered)[0]
        prev_end = tx_start
        for tup in ordered[:-1]:
            total_intronic += tup[0]-prev_end
            junctions.append(((tup[1])-tx_start)-total_intronic)
            prev_end = tup[1]
    elif strand == "-":
        tx_start = max(ordered)[-1]
        prev_end = tx_start
        for tup in (ordered[1:])[::-1]:
            total_intronic += (tup[0]+1)-prev_end
            junctions.append(((tup[1])-tx_start)-total_intronic)
            prev_end = tup[1]
    return junctions


# Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
coding_genes_dict = {}
# Parse master_dict to calculate cds_start/stop and exon junction positions
for tran in master_dict:
    try:
        transeq = master_dict[tran]["seq"]
    except KeyError:
        print ("not in fasta", tran)
        notinfasta.write("{}\n".format(tran))
        continue
    total_length = len(transeq)
    # NOTE(review): both UTR lengths start at 1, not 0; presumably this makes
    # cds_start 1 based, confirm before changing
    three_len = 1
    five_len = 1
    strand = master_dict[tran]["strand"]
    # Fall back to the transcript id when no gene name was found
    if master_dict[tran]["gene_name"] == "":
        master_dict[tran]["gene_name"] = tran
    gene = master_dict[tran]["gene_name"]
    if gene not in coding_genes_dict:
        coding_genes_dict[gene] = False

    if master_dict[tran]["cds"] == []:
        tran_type = "noncoding"
        cds_start = 'NULL'
        cds_stop = 'NULL'
    else:
        # Get utr lengths from annotation
        tran_type = "coding"
        coding_genes_dict[gene] = True
        sorted_exons = sorted(master_dict[tran]["exon"])
        sorted_cds = sorted(master_dict[tran]["cds"])
        min_cds = sorted_cds[0][0]
        # Some annotation files do not have utr annotation types, so fix that here if thats the case
        if master_dict[tran]["utr"] == []:
            for exon_tup in master_dict[tran]["exon"]:
                if exon_tup not in master_dict[tran]["cds"]:
                    # Now check if this overlaps with any of the CDS exons
                    overlap = False
                    for cds_tup in master_dict[tran]["cds"]:
                        if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
                            master_dict[tran]["utr"].append((cds_tup[1],exon_tup[1]))
                            overlap = True
                        if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
                            master_dict[tran]["utr"].append((exon_tup[0],cds_tup[0]))
                            overlap = True
                    if overlap == False:
                        master_dict[tran]["utr"].append(exon_tup)

        # UTR pieces starting before the lowest CDS co-ordinate count towards five_len,
        # pieces starting after it towards three_len
        for tup in sorted(master_dict[tran]["utr"]):
            if tup[0] < min_cds:
                five_len += (tup[1]-tup[0])+1
            elif tup[0] > min_cds:
                three_len += (tup[1] - tup[0])+1

        if strand not in ("+", "-"):
            print ("strand is unknown: {}".format(strand))
            sys.exit()

        # Extend the outermost exons by the pseudo UTR length (same for both strands)
        if len(sorted_exons) > 1:
            sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1])
            sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len)
        else:
            sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len)
        master_dict[tran]["exon"] = sorted_exons

        if strand == "+":
            cds_start = (five_len+pseudo_utr_len)
            cds_stop = ((total_length - three_len)-pseudo_utr_len)+1
        else:
            # On the minus strand the two lengths swap roles
            cds_start = (three_len+pseudo_utr_len)
            cds_stop = ((total_length - (five_len))-pseudo_utr_len)+1

    # Get exon junctions from the exons, falling back to the CDS tuples if that fails
    try:
        exon_junctions = _exon_junctions(master_dict[tran]["exon"], strand)
    except Exception:
        exon_junctions = _exon_junctions(master_dict[tran]["cds"], strand)
    if strand in ("+", "-") and cds_start != "NULL":
        master_dict[tran]["cds_start"] = cds_start
        master_dict[tran]["cds_stop"] = cds_stop
    master_dict[tran]["strand"] = strand
    master_dict[tran]["tran_type"] = tran_type
    master_dict[tran]["exon_junctions"] = exon_junctions

# For every gene pick one principal transcript: for coding genes the transcript with
# the longest CDS (ties broken by transcript length), otherwise the longest transcript.
longest_tran_dict = {}
for tran in master_dict:
    try:
        gene = master_dict[tran]["gene_name"]
    except KeyError:
        continue
    if coding_genes_dict[gene] == True:
        # Only transcripts with a CDS are candidates for a coding gene
        if "cds_start" in master_dict[tran]:
            if master_dict[tran]["cds_stop"] != "NULL" and master_dict[tran]["cds_start"] != "NULL":
                cds_length = master_dict[tran]["cds_stop"]- master_dict[tran]["cds_start"]
                if gene not in longest_tran_dict:
                    longest_tran_dict[gene] = {"tran":tran,"length":cds_length}
                else:
                    if cds_length > longest_tran_dict[gene]["length"]:
                        longest_tran_dict[gene] = {"tran":tran,"length":cds_length}
                    if cds_length == longest_tran_dict[gene]["length"]:
                        if master_dict[tran]["length"] > master_dict[longest_tran_dict[gene]["tran"]]["length"]:
                            longest_tran_dict[gene] = {"tran":tran,"length":cds_length}
    else:
        length = master_dict[tran]["length"]
        if gene not in longest_tran_dict:
            longest_tran_dict[gene] = {"tran":tran,"length":length}
        elif length > longest_tran_dict[gene]["length"]:
            longest_tran_dict[gene] = {"tran":tran,"length":length}


for gene in longest_tran_dict:
    longest_tran = longest_tran_dict[gene]["tran"]
    master_dict[longest_tran]["principal"] = 1

# Print the first few transcript ids and gene names so the user can check that
# they were extracted sensibly
gene_sample = []
for key in list(master_dict)[:10]:
    try:
        gene_sample.append(master_dict[key]["gene_name"])
    except KeyError:
        pass

print ("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10]))
print ("Here is a sample of the gene names: {}".format(gene_sample))


def tran_to_genome(tran, start_pos, end_pos, master_dict):
    """Convert a range of transcriptomic positions to genomic positions.

    tran is a transcript id, start_pos and end_pos are the first and last
    transcriptomic positions (both included), and master_dict maps transcript
    ids to records holding "chrom", "strand" and "exon" (a list of
    (start, end) tuples).

    Returns (chrom, genomic_positions) with one genomic position per
    transcriptomic position. Returns ("Null", []) when tran is not in
    master_dict, and an empty position list when the strand is neither "+"
    nor "-".
    """
    if tran not in master_dict:
        return ("Null", [])
    transcript_info = master_dict[tran]

    chrom = transcript_info["chrom"]
    strand = transcript_info["strand"]
    exons = transcript_info["exon"]
    genomic_pos_list = []
    if strand == "+":
        ordered_exons = exons
    elif strand == "-":
        # Minus strand transcripts are walked from the last exon backwards
        ordered_exons = exons[::-1]
    else:
        return (chrom, genomic_pos_list)

    for pos in range(start_pos, end_pos+1):
        exon_start = 0
        for tup in ordered_exons:
            exon_start = tup[0] if strand == "+" else tup[1]
            # NOTE(review): this is the exon length minus one, so the last
            # nucleotide of an exon is carried into the next exon as position 0;
            # looks like an off-by-one, confirm against callers before changing
            exonlen = tup[1] - tup[0]
            if pos > exonlen:
                pos = (pos - exonlen)-1
            else:
                break
        if strand == "+":
            genomic_pos_list.append((exon_start+pos)-1)
        else:
            genomic_pos_list.append((exon_start-pos)+1)
    return (chrom, genomic_pos_list)


# One entry per ORF type; each type also has an sqlite table of the same name
orf_dict = {"uorf":{},
            "ouorf":{},
            "cds":{},
            "nested":{},
            "odorf":{},
            "dorf":{},
            "minusone":{},
            "readthrough":{},
            "plusone":{},
            "noncoding":{},
            "extension":{},
            "inframe_stop":{}
            }

# Codons that open and close an ORF when scanning transcripts for ORFs
start_codons = ["ATG","GTG","CTG"]
stop_codons = ["TAG","TAA","TGA"]


# Keep track of the longest transcript for each noncoding gene, append this to transcript list later
longest_noncoding = {}


tran_count = 0
# This section is used to gather all cds regions and store them per chromosome in a dictionary
# to check against later (all transcripts contribute to this, not just those in the transcript list)
genomic_cds_dict = {}
tree_dict = {}
for transcript in master_dict:
    tran_count += 1
    if "seq" not in master_dict[transcript]:
        continue
    chrom = master_dict[transcript]["chrom"]
    if chrom not in genomic_cds_dict:
        # A set keeps each CDS tuple once without scanning a list for every tuple
        genomic_cds_dict[chrom] = set()
    if "cds_start" in master_dict[transcript]:
        if master_dict[transcript]["cds_start"] != "NULL":
            for tup in master_dict[transcript]["cds"]:
                # Tuples with equal start and end are left out of the interval trees
                if tup[0] != tup[1]:
                    genomic_cds_dict[chrom].add(tup)

print ("genomic cds dict built")
print (list(genomic_cds_dict))
# One interval tree per chromosome, queried later with single genomic positions
for chrom in genomic_cds_dict:
    tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])


# Create the output sqlite database and its tables
connection = sqlite3.connect("{}".format(sys.argv[6]))
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));")
cursor.execute("CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));")
cursor.execute("CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));")
# Every ORF type gets its own table, all sharing the same columns
orf_table_names = ("uorf", "ouorf", "cds", "nested", "odorf", "dorf", "minusone",
                   "readthrough", "plusone", "noncoding", "extension", "inframe_stop")
for orf_table_name in orf_table_names:
    cursor.execute("CREATE TABLE IF NOT EXISTS {} (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));".format(orf_table_name))
connection.commit()


def _coding_overlap(coding_positions, first, last):
    """Count the positions from first to last (both included) that are in coding_positions."""
    return float(sum(1 for pos in range(first, last+1) if pos in coding_positions))


print ("Finding ORFs")
transcript_count = 0
total_transcripts = len(master_dict)
# Offset added to the CDS stop when scanning for the next stop codon, and the ORF type it gives
recoding_dict = {0:"minusone",1:"readthrough",2:"plusone"}
for transcript in master_dict:
    transcript_count += 1
    if transcript_count%100 == 0:
        print ("Transcripts complete: {}/{}".format(transcript_count,total_transcripts))
    if "seq" not in master_dict[transcript]:
        print ("transcript {} has no sequence".format(transcript))
        continue
    seq = master_dict[transcript]["seq"]
    cds_start = "NULL"
    cds_stop = "NULL"
    transcript_len = len(seq)
    if "cds_start" in master_dict[transcript]:
        cds_start = master_dict[transcript]["cds_start"]
        cds_stop = master_dict[transcript]["cds_stop"]

    # Find out what regions of this transcript overlap with any other coding regions
    if cds_start != "NULL":
        # If this is a coding transcript don't bother checking the CDS
        coding_positions = list(range(cds_start,cds_stop))
        # Check 5' leader
        chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
        for i in range(0,cds_start):
            if len(tree_dict[chrom][pos_list[i]]) != 0:
                coding_positions.append(i)
        # Check 3' trailer
        chrom, pos_list = tran_to_genome(transcript, cds_stop, transcript_len, master_dict)
        for i in range(cds_stop,transcript_len+1):
            if len(tree_dict[chrom][pos_list[i-cds_stop]]) != 0:
                coding_positions.append(i)
    else:
        # Check entire transcript
        coding_positions = []
        chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
        for i in range(0,transcript_len):
            if len(tree_dict[chrom][pos_list[i]]) != 0:
                coding_positions.append(i)
    coding_dict[transcript] = to_ranges(coding_positions)
    coding_positions = set(coding_positions)

    # If this is a coding transcript find the minusone, readthrough, plusone co-ordinates
    if cds_start != "NULL":
        if pseudo_utr_len != 0:
            # Take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
            # NOTE(review): only done when a pseudo UTR length is given; confirm intent
            cds_stop -= 3
        for addition in recoding_dict:
            orftype = recoding_dict[addition]
            for i in range(cds_stop+addition,transcript_len,3):
                if seq[i:i+3] in stop_codons:
                    orf_seq = seq[cds_stop:i+3]
                    orf_start = cds_stop
                    orf_stop = i+2 # +2 so it refers to the end of the stop codon
                    start_codon = None
                    if orf_seq:
                        length = len(orf_seq)
                        # Determine how many nucleotides in this orf overlap with an annotated coding region
                        cds_cov = _coding_overlap(coding_positions, orf_start, orf_stop)/length
                        cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov))
                    # Only the first stop codon found for each offset is used
                    break

    for frame in [0,1,2]:
        for i in range(frame,transcript_len,3):
            if seq[i:i+3] in start_codons:
                for x in range(i, transcript_len,3):
                    if seq[x:x+3] in stop_codons:
                        orf_seq = seq[i:x+3]
                        orf_start = i
                        orf_stop = x+2 # +2 so it refers to the end of the stop codon
                        start_codon = seq[i:i+3]
                        length = len(orf_seq)
                        # Determine how many nucleotides in this orf overlap with an annotated coding region
                        cds_cov = _coding_overlap(coding_positions, orf_start, orf_stop)/float(length)
                        # Now determine orf type
                        if cds_start == "NULL":
                            orftype = "noncoding"
                        else:
                            if orf_start == cds_start and orf_stop == cds_stop:
                                orftype = "cds"
                            elif orf_start < cds_start and orf_stop == cds_stop:
                                orftype = "extension"
                                # Special case for extensions, we only take from the orf_start to the cds_start,
                                # and re-calculate cds coverage over that shortened region. The length is
                                # updated first so the coverage is divided by the extension length
                                orf_stop = cds_start
                                orf_seq = seq[orf_start:cds_start]
                                length = len(orf_seq)
                                cds_cov = _coding_overlap(coding_positions, orf_start, orf_stop)/float(length)
                            elif orf_start < cds_start and orf_stop <= cds_start:
                                orftype = "uorf"
                            elif orf_start < cds_start and orf_stop > cds_start:
                                orftype = "ouorf"
                            elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop <= cds_stop:
                                # An ORF inside the CDS that ends at the CDS stop is not recorded
                                if orf_stop == cds_stop:
                                    break
                                orftype = "nested"
                            elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop > cds_stop:
                                orftype = "odorf"
                            elif orf_start > cds_stop and orf_stop > cds_stop:
                                orftype = "dorf"
                            # An ORF ending inside the CDS in the same frame as the CDS start is relabelled
                            if orf_stop > cds_start and orf_stop < cds_stop:
                                if (orf_stop+1)%3 == cds_start%3:
                                    orftype = "inframe_stop"
                        if transcript not in orf_dict:
                            orf_dict[orftype][transcript] = []
                        cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov))
                        # Stop at the first in-frame stop codon after this start codon
                        break
# Used to keep track of the codons at cds_start and cds_stop positions.
# If there is an issue with the cds co-ordinates the starts and stops counts will
# be much lower than the other count, start with 1 to prevent division by 0
nuc_dict = {"stops":{"stops":1,"other":0}, "starts":{"starts":1,"other":0}}

def calcgc(seq):
    """Return the GC content of seq as a percentage rounded to 2 decimal places.

    G and C are counted regardless of case. Returns the string "NULL" for an
    empty sequence, which is written to the database as NULL.
    """
    if seq == "":
        return "NULL"
    upper_seq = seq.upper()
    gc_count = upper_seq.count("G") + upper_seq.count("C")
    gc = (gc_count/float(len(seq)))*100
    return round(gc,2)



# Write every transcript, its exons and its coding regions to the database
for transcript in master_dict:
    length = master_dict[transcript]["length"]
    cds_start = master_dict[transcript]["cds_start"]
    cds_stop = master_dict[transcript]["cds_stop"]
    seq = master_dict[transcript]["seq"]
    strand = master_dict[transcript]["strand"]
    chrom = master_dict[transcript]["chrom"]
    gene = master_dict[transcript]["gene_name"]
    gc = calcgc(seq)
    five_gc = "NULL"
    cds_gc = "NULL"
    three_gc = "NULL"
    if cds_start != "NULL":
        five_gc = calcgc(seq[:cds_start])
        cds_gc = calcgc(seq[cds_start:cds_stop])
        three_gc = calcgc(seq[cds_stop:])
        # Check that the nucleotide cds_start points to is the first of the start codon,
        # take one because cds_start is 1 based but python indexing is 0 based
        start_nuc = seq[cds_start-1:cds_start+2]
        if start_nuc == "ATG":
            nuc_dict["starts"]["starts"] += 1
        else:
            nuc_dict["starts"]["other"] += 1
        # The three nucleotides ending at cds_stop should be a stop codon
        stop_nuc = seq[cds_stop-3:cds_stop]
        if stop_nuc in ["TAG","TAA","TGA"]:
            nuc_dict["stops"]["stops"] += 1
        else:
            nuc_dict["stops"]["other"] += 1
    # Coding status is stored as 1 (coding) or 0 (noncoding)
    gene_type = 1 if coding_genes_dict[gene] == True else 0
    tran_type = 1 if master_dict[transcript]["tran_type"] == "coding" else 0
    # Lists are stored as comma separated strings without brackets or spaces
    start_list = str(master_dict[transcript]["start_list"]).replace(" ","").strip("[]")
    stop_list = str(master_dict[transcript]["stop_list"]).replace(" ","").strip("[]")
    exon_junctions = str(master_dict[transcript]["exon_junctions"]).replace(" ","").strip("[]")
    principal = master_dict[transcript]["principal"]
    version = master_dict[transcript]["version"]
    cursor.execute("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version,gc,five_gc,cds_gc,three_gc,chrom))

    for tup in master_dict[transcript]["exon"]:
        cursor.execute("INSERT INTO exons VALUES('{}',{},{});".format(transcript,tup[0],tup[1]))
    if transcript in coding_dict:
        for tup in coding_dict[transcript]:
            cursor.execute("INSERT INTO coding_regions VALUES('{}',{},{});".format(transcript,tup[0],tup[1]))

connection.commit()
connection.close()

# Nothing more is written to the log of transcripts missing from the fasta file
notinfasta.close()

# Warn when more than 5% of the checked CDS start or stop positions do not hold the expected codon
if (nuc_dict["starts"]["other"]/nuc_dict["starts"]["starts"]) > 0.05:
    print ("Warning: {} transcripts do not have a an AUG at the CDS start position".format(nuc_dict["starts"]["other"]))
if (nuc_dict["stops"]["other"]/nuc_dict["stops"]["stops"]) > 0.05:
    print ("Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(nuc_dict["stops"]["other"]))
if len(notinannotation) >0:
    print ("Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(len(notinannotation)))