0
|
1 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file
|
|
2 # and produces an sqlite file which can be uploaded to Trips-Viz
|
|
3 # All co-ordinates produced are 1 based
|
|
4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon
|
|
5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
|
|
6 import sys
|
|
7 import re
|
|
8 import sqlite3
|
3
|
9 import subprocess
|
|
10
|
0
|
11 from intervaltree import Interval, IntervalTree
|
|
12 import itertools
|
|
13 from sqlitedict import SqliteDict
|
|
14 import os
|
|
15
|
|
16 organism = sys.argv[1]
|
|
17 # This should be a GTF or GFF3 file
|
|
18 annotation_file = open(sys.argv[2], "r")
|
|
19 # This needs to be the transcriptomic fasta file
|
|
20 fasta_file = open(sys.argv[3], "r")
|
|
21 # This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs
|
|
22 pseudo_utr_len = int(sys.argv[4])
|
|
23 # An example of a transcript_id from the annotation file, e.g ENST000000123456
|
|
24 user_transcript_id = sys.argv[5]
|
|
25 # An example of a gene name from the annotation file
|
|
26 user_gene_name = sys.argv[6]
|
|
27 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
|
|
28 TRAN_VERSION = True
|
3
|
29 output = sys.argv[7]
|
0
|
30
|
|
31
|
|
32 if os.path.isfile("{}.sqlite".format(organism)):
|
|
33 print("{}.sqlite already exists".format(organism))
|
|
34 sys.exit()
|
|
35
|
|
36
|
|
37 # old_exons = SqliteDict(
|
2
|
38 # "/home/data2/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite"
|
0
|
39 # )
|
|
40
|
|
41
|
|
42 delimiters = {
|
|
43 "transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]},
|
|
44 "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]},
|
|
45 }
|
|
46
|
|
47 punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"]
|
|
48 # Find delimiters in the annotation and fasta files using the user_transcript_id
|
|
49 # and user_gene_name examples given by user.
|
|
50 for line in annotation_file:
|
|
51 if user_transcript_id in line:
|
|
52 tabsplitline = line.split("\t")
|
|
53 annot_type = tabsplitline[2]
|
|
54 if annot_type not in delimiters["transcripts"]["annot_types"]:
|
|
55 delimiters["transcripts"]["annot_types"].append(annot_type.lower())
|
|
56 splitline = line.split(user_transcript_id)
|
|
57 before_delimiter = splitline[0]
|
|
58 for item in punctuation:
|
|
59 if item in before_delimiter:
|
|
60 if len(before_delimiter.split(item)[-1]) >= 5:
|
|
61 before_delimiter = before_delimiter.split(item)[-1]
|
|
62 after_delimiter = splitline[1][:2]
|
|
63 if (
|
|
64 before_delimiter not in delimiters["transcripts"]["before"]
|
|
65 and len(before_delimiter) >= 5
|
|
66 ):
|
|
67 delimiters["transcripts"]["before"].append(before_delimiter)
|
|
68 if after_delimiter not in delimiters["transcripts"]["after"]:
|
|
69 delimiters["transcripts"]["after"].append(after_delimiter)
|
|
70 if user_gene_name in line:
|
|
71 tabsplitline = line.split("\t")
|
|
72 annot_type = tabsplitline[2]
|
|
73 print("ANNOT TYPE", annot_type)
|
|
74 if annot_type not in delimiters["genes"]["annot_types"]:
|
|
75 delimiters["genes"]["annot_types"].append(annot_type.lower())
|
|
76 splitline = line.split(user_gene_name)
|
|
77 before_delimiter = splitline[0]
|
|
78 for item in punctuation:
|
|
79 if item in before_delimiter:
|
|
80 if len(before_delimiter.split(item)[-1]) >= 5:
|
|
81 before_delimiter = before_delimiter.split(item)[-1]
|
|
82 after_delimiter = splitline[1][0]
|
|
83 if (
|
|
84 before_delimiter not in delimiters["genes"]["before"]
|
|
85 and len(before_delimiter) >= 5
|
|
86 ):
|
|
87 delimiters["genes"]["before"].append(before_delimiter)
|
|
88 if after_delimiter not in delimiters["genes"]["after"]:
|
|
89 if after_delimiter in punctuation:
|
|
90 delimiters["genes"]["after"].append(after_delimiter)
|
|
91
|
|
92 print("delimeters[genes]", delimiters["transcripts"]["annot_types"])
|
|
93
|
|
94 for line in fasta_file:
|
|
95 if user_transcript_id in line:
|
|
96 splitline = line.split(user_transcript_id)
|
|
97 before_delimiter = splitline[0]
|
|
98 for item in punctuation:
|
|
99 if item in before_delimiter:
|
|
100 if len(before_delimiter.split(item)[1]) >= 5:
|
|
101 before_delimiter = before_delimiter.split(item)[1]
|
|
102 after_delimiter = splitline[1][0]
|
|
103 if (
|
|
104 before_delimiter not in delimiters["transcripts"]["before"]
|
|
105 and len(before_delimiter) >= 5
|
|
106 ):
|
|
107 delimiters["transcripts"]["before"].append(before_delimiter)
|
|
108 if after_delimiter not in delimiters["transcripts"]["after"]:
|
|
109 delimiters["transcripts"]["after"].append(after_delimiter)
|
|
110 fasta_file.close()
|
|
111 annotation_file.close()
|
|
112
|
|
113
|
|
114 if delimiters["transcripts"]["before"] == []:
|
|
115 print(
|
|
116 "ERROR: No transcript_id with the name {} could be found in the annotation file".format(
|
|
117 user_transcript_id
|
|
118 )
|
|
119 )
|
|
120 sys.exit()
|
|
121 if delimiters["genes"]["before"] == []:
|
|
122 print(
|
|
123 "ERROR: No gene with the name {} could be found in the annotation file".format(
|
|
124 user_gene_name
|
|
125 )
|
|
126 )
|
|
127 sys.exit()
|
|
128
|
|
129 master_dict = {}
|
|
130 coding_dict = {}
|
|
131 notinfasta = open("notinfasta.csv", "w")
|
|
132
|
|
133 # Given a nucleotide sequence returns the positions of all start and stop codons.
|
|
134 def get_start_stops(transcript_sequence):
|
|
135 transcript_sequence = transcript_sequence.upper()
|
|
136 start_codons = ["ATG"]
|
|
137 stop_codons = ["TAA", "TAG", "TGA"]
|
|
138 seq_frames = {"starts": [], "stops": []}
|
|
139 for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")):
|
|
140 if len(codons) > 1:
|
|
141 pat = re.compile("|".join(codons))
|
|
142 else:
|
|
143 pat = re.compile(codons[0])
|
|
144 for m in re.finditer(pat, transcript_sequence):
|
|
145 # Increment position by 1, Frame 1 starts at position 1 not 0,
|
|
146 # if it's a stop codon add another 2 so it points to the last nuc of the codon
|
|
147 if positions == "starts":
|
|
148 start = m.start() + 1
|
|
149 else:
|
|
150 start = m.start() + 3
|
|
151 seq_frames[positions].append(start)
|
|
152 return seq_frames
|
|
153
|
|
154
|
|
155 # parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons.
|
|
156 fasta_file = open(sys.argv[3], "r")
|
|
157 read_fasta = fasta_file.read()
|
|
158 split_fasta = read_fasta.split(">")
|
|
159 for entry in split_fasta[1:]:
|
|
160 newline_split = entry.split("\n")
|
|
161 tran = newline_split[0]
|
|
162 for item in delimiters["transcripts"]["after"]:
|
|
163 if item in tran:
|
|
164 tran = tran.split(item)[0]
|
|
165 tran = tran.replace("-", "_").replace("(", "").replace(")", "")
|
|
166 seq = "".join(newline_split[1:])
|
|
167 if "_PAR_Y" in tran:
|
|
168 tran += "_chrY"
|
|
169 elif "_PAR_X" in tran:
|
|
170 tran += "_chrX"
|
|
171 tran = tran.upper()
|
|
172 starts_stops = get_start_stops(seq)
|
|
173 print("tran", tran)
|
|
174 if tran not in master_dict:
|
|
175 master_dict[tran] = {
|
|
176 "utr": [],
|
|
177 "cds": [],
|
|
178 "exon": [],
|
|
179 "start_codon": [],
|
|
180 "stop_codon": [],
|
|
181 "start_list": starts_stops["starts"],
|
|
182 "stop_list": starts_stops["stops"],
|
|
183 "transcript": [],
|
|
184 "strand": "",
|
|
185 "gene_name": "",
|
|
186 "chrom": "",
|
|
187 "seq": seq,
|
|
188 "cds_start": "NULL",
|
|
189 "cds_stop": "NULL",
|
|
190 "length": len(seq),
|
|
191 "principal": 0,
|
|
192 "version": "NULL",
|
|
193 }
|
|
194
|
|
195
|
|
196 def to_ranges(iterable):
|
|
197 tup_list = []
|
|
198 iterable = sorted(set(iterable))
|
|
199 for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
|
|
200 group = list(group)
|
|
201 tup_list.append((group[0][1], group[-1][1]))
|
|
202 return tup_list
|
|
203
|
|
204
|
|
205 # parse annotation file to get chromsome, exon location and CDS info for each transcript
|
|
206 def parse_gtf_file(annotation_file):
|
|
207 for line in annotation_file:
|
|
208 if line == "\n":
|
|
209 continue
|
|
210 if line[0] != "#":
|
|
211 splitline = (line.replace("\n", "")).split("\t")
|
|
212 chrom = splitline[0]
|
|
213 try:
|
|
214 annot_type = splitline[2].lower()
|
|
215 except:
|
|
216 print(
|
|
217 "ERROR tried to index to second item in splitline: ",
|
|
218 splitline,
|
|
219 line,
|
|
220 )
|
|
221 sys.exit()
|
|
222 # if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]:
|
|
223 # continue
|
|
224 if (
|
|
225 annot_type not in delimiters["transcripts"]["annot_types"]
|
|
226 and annot_type not in delimiters["genes"]["annot_types"]
|
|
227 ):
|
|
228 continue
|
|
229 if annot_type == "five_prime_utr" or annot_type == "three_prime_utr":
|
|
230 annot_type = "utr"
|
|
231 strand = splitline[6]
|
|
232 if strand == "+":
|
|
233 start = int(splitline[3])
|
|
234 end = int(splitline[4])
|
|
235 else:
|
|
236 start = int(splitline[3]) + 1
|
|
237 end = int(splitline[4]) + 1
|
|
238 desc = splitline[8]
|
|
239 tran = desc
|
|
240 gene = desc
|
|
241 for item in delimiters["transcripts"]["before"]:
|
|
242 if item in tran:
|
|
243 tran = tran.split(item)[1]
|
|
244 for item in delimiters["transcripts"]["after"]:
|
|
245 if item in tran:
|
|
246 tran = tran.split(item)[0]
|
|
247 if "." in tran and TRAN_VERSION == True:
|
|
248 # print ("raw tran",tran)
|
|
249 tran = tran.split(".")
|
|
250 version = int(tran[-1].split("_")[0])
|
|
251 tran = tran[0]
|
|
252 else:
|
|
253 version = "NULL"
|
|
254 tran = tran.replace("-", "_").replace(".", "_")
|
|
255 tran = tran.replace("(", "").replace(")", "")
|
|
256 tran = tran.replace(" ", "").replace("\t", "")
|
|
257 tran = tran.upper()
|
|
258 tran = tran.replace("GENE_", "").replace("ID_", "")
|
|
259 if "_PAR_Y" in desc:
|
|
260 # print ("adding _PAR_Y to tran")
|
|
261 tran = tran + "_PAR_Y"
|
|
262 # print ("New tran ", tran)
|
|
263 # if "PAR_Y" in line:
|
|
264 # print (line)
|
|
265 # #sys.exit()
|
|
266 # print ("tran",tran,version)
|
|
267 # if tran == "ENST00000316448":
|
|
268 # print ("annot type",annot_type)
|
|
269 # print ("appending exon to tran", start, end,line)
|
|
270
|
|
271 gene_found = False
|
|
272
|
|
273 if annot_type in delimiters["genes"]["annot_types"]:
|
|
274 for item in delimiters["genes"]["before"]:
|
|
275 if item in gene:
|
|
276 gene_found = True
|
|
277 gene = gene.split(item)[1]
|
|
278 for item in delimiters["genes"]["after"]:
|
|
279 if item in gene:
|
|
280 gene = gene.split(item)[0]
|
|
281 gene = gene.replace("'", "''")
|
|
282 gene = gene.replace("GENE_", "")
|
|
283 gene = gene.replace("ID_", "")
|
|
284 gene = gene.upper()
|
|
285 if tran in master_dict:
|
|
286 master_dict[tran]["strand"] = strand
|
|
287 if strand == "+":
|
|
288 if annot_type in master_dict[tran]:
|
|
289 master_dict[tran][annot_type].append((start, end))
|
|
290 else:
|
|
291 if annot_type in master_dict[tran]:
|
|
292 master_dict[tran][annot_type].append((start, end))
|
|
293 master_dict[tran]["chrom"] = chrom
|
|
294 master_dict[tran]["version"] = version
|
|
295 if gene_found == True:
|
|
296 master_dict[tran]["gene_name"] = gene
|
|
297 else:
|
|
298 notinfasta.write("{}\n".format(tran))
|
|
299
|
|
300
|
|
301 annotation_file = open(sys.argv[2], "r")
|
|
302 parse_gtf_file(annotation_file)
|
|
303
|
|
304
|
|
305 # remove transcripts that were in fasta file but not in annotation_file
|
|
306 notinannotation = []
|
|
307 for tran in master_dict:
|
|
308 if master_dict[tran]["chrom"] == "":
|
|
309 # print ("tran {} has no chrom :(".format(tran))
|
|
310 notinannotation.append(tran)
|
|
311 for tran in notinannotation:
|
|
312 print(tran)
|
|
313 del master_dict[tran]
|
|
314 # Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
|
|
315 coding_genes_dict = {}
|
|
316 # parse master_dict to calculate length, cds_start/stop and exon junction positions
|
|
317 for tran in master_dict:
|
|
318 try:
|
|
319 transeq = master_dict[tran]["seq"]
|
|
320 except Exception as e:
|
|
321 print("not in fasta", tran)
|
|
322 notinfasta.write("{}\n".format(tran))
|
|
323 continue
|
|
324 exon_junctions = []
|
|
325 total_length = len(transeq)
|
|
326 three_len = 1
|
|
327 five_len = 1
|
|
328 strand = master_dict[tran]["strand"]
|
|
329 if master_dict[tran]["gene_name"] == "":
|
|
330 master_dict[tran]["gene_name"] = tran
|
|
331 gene = master_dict[tran]["gene_name"]
|
|
332 if gene not in coding_genes_dict:
|
|
333 coding_genes_dict[gene] = False
|
|
334
|
|
335 if master_dict[tran]["cds"] == []:
|
|
336 tran_type = "noncoding"
|
|
337 cds_start = "NULL"
|
|
338 cds_stop = "NULL"
|
|
339 else:
|
|
340 # get utr lengths from annotation
|
|
341 tran_type = "coding"
|
|
342 coding_genes_dict[gene] = True
|
|
343 sorted_exons = sorted(master_dict[tran]["exon"])
|
|
344 sorted_cds = sorted(master_dict[tran]["cds"])
|
|
345
|
|
346 min_cds = sorted_cds[0][0]
|
|
347 # Some annotation files do not have utr annotation types, so fix that here if thats the case
|
|
348 if master_dict[tran]["utr"] == []:
|
|
349 for exon_tup in master_dict[tran]["exon"]:
|
|
350 if exon_tup not in master_dict[tran]["cds"]:
|
|
351 # Now check if this overlaps with any of the CDS exons
|
|
352 overlap = False
|
|
353 for cds_tup in master_dict[tran]["cds"]:
|
|
354 if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
|
|
355 master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1]))
|
|
356 overlap = True
|
|
357 if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
|
|
358 master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0]))
|
|
359 overlap = True
|
|
360 if overlap == False:
|
|
361 master_dict[tran]["utr"].append(exon_tup)
|
|
362
|
|
363 for tup in sorted(master_dict[tran]["utr"]):
|
|
364 if tup[0] < min_cds:
|
|
365 five_len += (tup[1] - tup[0]) + 1
|
|
366 elif tup[0] > min_cds:
|
|
367 three_len += (tup[1] - tup[0]) + 1
|
|
368 else:
|
|
369 pass
|
|
370 if strand == "+":
|
|
371 if len(sorted_exons) > 1:
|
|
372 sorted_exons[0] = (
|
|
373 sorted_exons[0][0] - pseudo_utr_len,
|
|
374 sorted_exons[0][1],
|
|
375 )
|
|
376 sorted_exons[-1] = (
|
|
377 sorted_exons[-1][0],
|
|
378 sorted_exons[-1][1] + pseudo_utr_len,
|
|
379 )
|
|
380 else:
|
|
381 sorted_exons[0] = (
|
|
382 sorted_exons[0][0] - pseudo_utr_len,
|
|
383 sorted_exons[0][1] + pseudo_utr_len,
|
|
384 )
|
|
385 master_dict[tran]["exon"] = sorted_exons
|
|
386 cds_start = five_len + pseudo_utr_len
|
|
387 cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4
|
|
388 elif strand == "-":
|
|
389 if len(sorted_exons) > 1:
|
|
390 sorted_exons[0] = (
|
|
391 (sorted_exons[0][0] - pseudo_utr_len),
|
|
392 sorted_exons[0][1],
|
|
393 )
|
|
394 sorted_exons[-1] = (
|
|
395 sorted_exons[-1][0],
|
|
396 (sorted_exons[-1][1] + pseudo_utr_len),
|
|
397 )
|
|
398 else:
|
|
399 sorted_exons[0] = (
|
|
400 (sorted_exons[0][0] - pseudo_utr_len),
|
|
401 (sorted_exons[0][1] + pseudo_utr_len),
|
|
402 )
|
|
403 master_dict[tran]["exon"] = sorted_exons
|
|
404 cds_start = three_len + pseudo_utr_len
|
|
405 cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4
|
|
406 # if tran == "ENST00000381401":
|
|
407 # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len)
|
|
408 # #sys.exit()
|
|
409 else:
|
|
410 print("strand is unknown: {}".format(strand))
|
|
411 sys.exit()
|
|
412 # get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop
|
|
413 total_intronic = 0
|
|
414 try:
|
|
415 if strand == "+":
|
|
416 tx_start = min(sorted(master_dict[tran]["exon"]))[0]
|
|
417 prev_end = tx_start
|
|
418 for tup in sorted(master_dict[tran]["exon"])[:-1]:
|
|
419 total_intronic += tup[0] - prev_end
|
|
420 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
|
|
421 prev_end = tup[1]
|
|
422 elif strand == "-":
|
|
423 tx_start = max(sorted(master_dict[tran]["exon"]))[-1]
|
|
424 prev_end = tx_start
|
|
425 for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]:
|
|
426 total_intronic += (tup[0] + 1) - prev_end
|
|
427 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
|
|
428 prev_end = tup[1]
|
|
429 except:
|
|
430 if strand == "+":
|
|
431 tx_start = min(sorted(master_dict[tran]["cds"]))[0]
|
|
432 prev_end = tx_start
|
|
433 for tup in sorted(master_dict[tran]["cds"])[:-1]:
|
|
434 total_intronic += tup[0] - prev_end
|
|
435 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
|
|
436 prev_end = tup[1]
|
|
437 elif strand == "-":
|
|
438 tx_start = max(sorted(master_dict[tran]["cds"]))[-1]
|
|
439 prev_end = tx_start
|
|
440 for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]:
|
|
441 total_intronic += (tup[0] + 1) - prev_end
|
|
442 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
|
|
443 prev_end = tup[1]
|
|
444 # This can happen when a coding transcript doesn't have a properly annotated 3' trailer
|
|
445 if cds_stop != "NULL":
|
|
446 if cds_stop > total_length:
|
|
447 cds_stop = total_length
|
|
448 if strand == "+" and cds_start != "NULL":
|
|
449 master_dict[tran]["cds_start"] = cds_start
|
|
450 master_dict[tran]["cds_stop"] = cds_stop
|
|
451 elif strand == "-" and cds_start != "NULL":
|
|
452 master_dict[tran]["cds_start"] = cds_start
|
|
453 master_dict[tran]["cds_stop"] = cds_stop
|
|
454
|
|
455 master_dict[tran]["strand"] = strand
|
|
456 master_dict[tran]["tran_type"] = tran_type
|
|
457 master_dict[tran]["exon_junctions"] = exon_junctions
|
|
458
|
|
459 longest_tran_dict = {}
|
|
460 for tran in master_dict:
|
|
461 try:
|
|
462 gene = master_dict[tran]["gene_name"]
|
|
463 except:
|
|
464 continue
|
|
465 if coding_genes_dict[gene] == True:
|
|
466 if "cds_start" in master_dict[tran]:
|
|
467 if (
|
|
468 master_dict[tran]["cds_stop"] != "NULL"
|
|
469 and master_dict[tran]["cds_start"] != "NULL"
|
|
470 ):
|
|
471 cds_length = (
|
|
472 master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"]
|
|
473 )
|
|
474 if gene not in longest_tran_dict:
|
|
475 longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
|
|
476 else:
|
|
477 if cds_length > longest_tran_dict[gene]["length"]:
|
|
478 longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
|
|
479 if cds_length == longest_tran_dict[gene]["length"]:
|
|
480 if (
|
|
481 master_dict[tran]["length"]
|
|
482 > master_dict[longest_tran_dict[gene]["tran"]]["length"]
|
|
483 ):
|
|
484 longest_tran_dict[gene] = {
|
|
485 "tran": tran,
|
|
486 "length": cds_length,
|
|
487 }
|
|
488 else:
|
|
489 length = master_dict[tran]["length"]
|
|
490 if gene not in longest_tran_dict:
|
|
491 longest_tran_dict[gene] = {"tran": tran, "length": length}
|
|
492 elif length > longest_tran_dict[gene]["length"]:
|
|
493 longest_tran_dict[gene] = {"tran": tran, "length": length}
|
|
494
|
|
495
|
|
496 for gene in longest_tran_dict:
|
|
497 longest_tran = longest_tran_dict[gene]["tran"]
|
|
498 master_dict[longest_tran]["principal"] = 1
|
|
499
|
|
500 gene_sample = []
|
|
501 for key in list(master_dict)[:10]:
|
|
502 try:
|
|
503 gene_sample.append(master_dict[key]["gene_name"])
|
|
504 except:
|
|
505 pass
|
|
506 print(master_dict)
|
|
507 print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10]))
|
|
508 print("Here is a sample of the gene names: {}".format(gene_sample))
|
|
509
|
|
510
|
|
511 # Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time.
|
|
512 def tran_to_genome(tran, start_pos, end_pos, master_dict):
|
|
513 pos_list = []
|
|
514 for i in range(start_pos, end_pos + 1):
|
|
515 pos_list.append(i)
|
|
516 genomic_pos_list = []
|
|
517 if tran in master_dict:
|
|
518 transcript_info = master_dict[tran]
|
|
519 else:
|
|
520 return ("Null", [])
|
|
521
|
|
522 chrom = transcript_info["chrom"]
|
|
523 strand = transcript_info["strand"]
|
|
524 exons = sorted(transcript_info["exon"])
|
|
525 # print ("chrom,strand,exons",chrom,strand,exons)
|
|
526 for pos in pos_list:
|
|
527 # print ("pos",pos)
|
|
528 if strand == "+":
|
|
529 exon_start = 0
|
|
530 for tup in exons:
|
|
531 # print ("tup",tup)
|
|
532 exon_start = tup[0]
|
|
533 exonlen = tup[1] - tup[0]
|
|
534 if pos > exonlen:
|
|
535 pos = (pos - exonlen) - 1
|
|
536 else:
|
|
537 break
|
|
538 # print ("appending exon_start-pos", exon_start, pos, exon_start+pos)
|
|
539 genomic_pos_list.append((exon_start + pos) - 1)
|
|
540 elif strand == "-":
|
|
541 exon_start = 0
|
|
542 for tup in exons[::-1]:
|
|
543 # print ("tup",tup)
|
|
544 exon_start = tup[1]
|
|
545 exonlen = tup[1] - tup[0]
|
|
546 # print ("exonlen",exonlen)
|
|
547 if pos > exonlen:
|
|
548 # print ("pos is greater")
|
|
549 pos = (pos - exonlen) - 1
|
|
550 # print ("new pos",pos)
|
|
551 else:
|
|
552 break
|
|
553 # print ("appending exon_start-pos", exon_start, pos, exon_start-pos)
|
|
554 genomic_pos_list.append((exon_start - pos) + 1)
|
|
555 return (chrom, genomic_pos_list)
|
|
556
|
|
557
|
|
558 orf_dict = {
|
|
559 "uorf": {},
|
|
560 "ouorf": {},
|
|
561 "cds": {},
|
|
562 "nested": {},
|
|
563 "odorf": {},
|
|
564 "dorf": {},
|
|
565 "minusone": {},
|
|
566 "readthrough": {},
|
|
567 "plusone": {},
|
|
568 "noncoding": {},
|
|
569 "extension": {},
|
|
570 "inframe_stop": {},
|
|
571 }
|
|
572
|
|
573 start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"]
|
|
574
|
|
575 stop_codons = ["TAG", "TAA", "TGA"]
|
|
576
|
|
577
|
|
578 # Keep track of the longest transcript for each noncoding gene, append this to transcript list later
|
|
579 longest_noncoding = {}
|
|
580
|
|
581
|
|
582 tran_count = 0
|
|
583 # This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those
|
|
584 # in the transcript list)
|
|
585 genomic_cds_dict = {}
|
|
586 tree_dict = {}
|
|
587 for transcript in master_dict:
|
|
588 # print (transcript, master_dict[transcript]["tran_type"])
|
|
589 tran_count += 1
|
|
590 if "seq" not in master_dict[transcript]:
|
|
591 continue
|
|
592 chrom = master_dict[transcript]["chrom"]
|
|
593 if chrom not in genomic_cds_dict:
|
|
594 genomic_cds_dict[chrom] = []
|
|
595 if "cds_start" in master_dict[transcript]:
|
|
596 cds_start = master_dict[transcript]["cds_start"]
|
|
597 cds_stop = master_dict[transcript]["cds_stop"]
|
|
598 if cds_start != "NULL":
|
|
599 cds_pos = []
|
|
600 for i in range(cds_start, cds_stop + 1):
|
|
601 cds_pos.append(i)
|
|
602
|
|
603 for tup in master_dict[transcript]["cds"]:
|
|
604 if tup[0] != tup[1]:
|
|
605 if tup not in genomic_cds_dict[chrom]:
|
|
606 genomic_cds_dict[chrom].append(tup)
|
|
607
|
|
608 print("genomic cds dict built")
|
|
609 print(list(genomic_cds_dict))
|
|
610 for chrom in genomic_cds_dict:
|
|
611 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
|
|
612
|
|
613 # print (list(tree_dict))
|
|
614
|
|
615
|
3
|
616 connection = sqlite3.connect(output)
|
0
|
617 cursor = connection.cursor()
|
|
618 cursor.execute(
|
|
619 "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));"
|
|
620 )
|
|
621 cursor.execute(
|
|
622 "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));"
|
|
623 )
|
|
624 cursor.execute(
|
|
625 "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));"
|
|
626 )
|
|
627 cursor.execute(
|
|
628 "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
629 )
|
|
630 cursor.execute(
|
|
631 "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
632 )
|
|
633 cursor.execute(
|
|
634 "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
635 )
|
|
636 cursor.execute(
|
|
637 "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
638 )
|
|
639 cursor.execute(
|
|
640 "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
641 )
|
|
642 cursor.execute(
|
|
643 "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
644 )
|
|
645 cursor.execute(
|
|
646 "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
647 )
|
|
648 cursor.execute(
|
|
649 "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
650 )
|
|
651 cursor.execute(
|
|
652 "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
653 )
|
|
654 cursor.execute(
|
|
655 "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
656 )
|
|
657 cursor.execute(
|
|
658 "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
659 )
|
|
660 cursor.execute(
|
|
661 "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
|
|
662 )
|
|
663 connection.commit()
|
|
664
|
|
665
|
|
666 print("Finding ORFs")
|
|
667 transcript_count = 0
|
|
668 total_transcripts = len(list(master_dict))
|
|
669 for transcript in master_dict:
|
|
670 # print ("transcript",transcript)
|
|
671 # if transcript != "ENST00000316448":
|
|
672 # continue
|
|
673 transcript_count += 1
|
|
674 if transcript_count % 100 == 0:
|
|
675 print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts))
|
|
676 if "seq" not in master_dict[transcript]:
|
|
677 print("transcript {} has no sequence".format(transcript))
|
|
678 continue
|
|
679 seq = master_dict[transcript]["seq"]
|
|
680 cds_start = "NULL"
|
|
681 cds_stop = "NULL"
|
|
682 transcript_len = len(seq)
|
|
683 if "cds_start" in master_dict[transcript]:
|
|
684 cds_start = master_dict[transcript]["cds_start"]
|
|
685 cds_stop = master_dict[transcript]["cds_stop"]
|
|
686
|
|
687 # Find out what regions of this transcript overlap with any other coding regions
|
|
688 coding_positions = []
|
|
689 if cds_start != "NULL":
|
|
690 # If this is a coding transcript don't bother checking the CDS
|
|
691 for i in range(cds_start, cds_stop):
|
|
692 coding_positions.append(i)
|
|
693 # check 5' leader
|
|
694 chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
|
|
695 for i in range(0, cds_start):
|
|
696 genomic_pos = pos_list[i]
|
|
697 overlap = tree_dict[chrom][genomic_pos]
|
|
698 if len(overlap) != 0:
|
|
699 coding_positions.append(i)
|
|
700 # check 3' trailer
|
|
701 chrom, pos_list = tran_to_genome(
|
|
702 transcript, cds_stop, transcript_len, master_dict
|
|
703 )
|
|
704 for i in range(cds_stop, transcript_len + 1):
|
|
705 # print ("i",i)
|
|
706 genomic_pos = pos_list[i - cds_stop]
|
|
707 # print ("genomic position",genomic_pos)
|
|
708 overlap = tree_dict[chrom][genomic_pos]
|
|
709 if len(overlap) != 0:
|
|
710 # print ("overlap not empty appending i",overlap)
|
|
711 coding_positions.append(i)
|
|
712 else:
|
|
713 # check entire transcript
|
|
714 chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
|
|
715 for i in range(0, transcript_len):
|
|
716 genomic_pos = pos_list[i]
|
|
717 overlap = tree_dict[chrom][genomic_pos]
|
|
718 if len(overlap) != 0:
|
|
719 coding_positions.append(i)
|
|
720 coding_positions_tuple = to_ranges(coding_positions)
|
|
721 coding_dict[transcript] = coding_positions_tuple
|
|
722 coding_positions = set(coding_positions)
|
|
723 # if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates
|
|
724 if cds_start != "NULL":
|
|
725 # print ("transcript", transcript)
|
|
726 # if pseudo_utr_len != 0:
|
|
727 # cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
|
|
728 recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"}
|
|
729 for addition in recoding_dict:
|
|
730 orftype = recoding_dict[addition]
|
|
731 for i in range(cds_stop + addition, transcript_len, 3):
|
|
732 if seq[i : i + 3] in stop_codons:
|
|
733 # orf_seq = seq[cds_stop:i+3]
|
|
734 orf_start = cds_stop
|
|
735 if orftype == "readthrough":
|
|
736 orf_start -= 2
|
|
737 if orftype == "plusone":
|
|
738 orf_start -= 1
|
|
739 orf_stop = i + 3 # +2 so it refers to the end of the stop codon
|
|
740 start_codon = None
|
|
741 length = (i + 3) - cds_stop
|
|
742 orf_pos_list = []
|
|
743 # determine how many nucleotides in this orf overlap with an annotated coding region
|
|
744 cds_cov_count = 0.0
|
|
745 for position in range(orf_start, orf_stop):
|
|
746 orf_pos_list.append(position)
|
|
747 for pos in range(orf_start, orf_stop + 1):
|
|
748 if pos in coding_positions:
|
|
749 cds_cov_count += 1
|
|
750 cds_cov = cds_cov_count / length
|
|
751 # print ("orftype, start, stop", orftype, orf_start, orf_stop)
|
|
752 cursor.execute(
|
|
753 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
|
|
754 orftype,
|
|
755 transcript,
|
|
756 start_codon,
|
|
757 length,
|
|
758 orf_start,
|
|
759 orf_stop,
|
|
760 cds_cov,
|
|
761 )
|
|
762 )
|
|
763 break
|
|
764 # sys.exit()
|
|
765 for frame in [0, 1, 2]:
|
|
766 for i in range(frame, transcript_len, 3):
|
|
767 if seq[i : i + 3] in start_codons:
|
|
768 for x in range(i, transcript_len, 3):
|
|
769 if seq[x : x + 3] in stop_codons:
|
|
770 # orf_seq = seq[i:x+3]
|
|
771 orf_start = i + 1
|
|
772 orf_stop = x + 3 # +2 so it refers to the end of the stop codon
|
|
773 start_codon = seq[i : i + 3]
|
|
774 length = (x + 3) - i
|
|
775 orf_pos_list = []
|
|
776 # determine how many nucleotides in this orf overlap with an annotated coding region
|
|
777 cds_cov_count = 0.0
|
|
778 for pos in range(orf_start, orf_stop + 1):
|
|
779 if pos in coding_positions:
|
|
780 cds_cov_count += 1
|
|
781 cds_cov = float(cds_cov_count) / float(length)
|
|
782 # Now determine orf type
|
|
783 if cds_start == "NULL":
|
|
784 orftype = "noncoding"
|
|
785 else:
|
|
786 # print ("cds start is not null :{}:{}".format(cds_start,cds_stop))
|
|
787 # print ("orf start, orf stop", orf_start, orf_stop)
|
|
788 if orf_start == cds_start and orf_stop == cds_stop:
|
|
789 orftype = "cds"
|
|
790 # print ("orf type is cds")
|
|
791 elif orf_start < cds_start and orf_stop == cds_stop:
|
|
792 orftype = "extension"
|
|
793 # special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage
|
|
794 orf_stop = cds_start
|
|
795 cds_cov_count = 0.0
|
|
796 for pos in range(orf_start, orf_stop + 1):
|
|
797 if pos in coding_positions:
|
|
798 cds_cov_count += 1
|
|
799 cds_cov = float(cds_cov_count) / float(length)
|
|
800 # orf_seq = seq[orf_start:cds_start]
|
|
801 length = cds_start - orf_start
|
|
802 # print ("orf type is extension")
|
|
803 elif orf_start < cds_start and orf_stop <= cds_start:
|
|
804 orftype = "uorf"
|
|
805 # print ("orf type is uorf")
|
|
806 elif orf_start < cds_start and orf_stop > cds_start:
|
|
807 orftype = "ouorf"
|
|
808 # print ("orf type is ouorf")
|
|
809 # sys.exit()
|
|
810 elif (
|
|
811 orf_start >= cds_start
|
|
812 and orf_start <= cds_stop
|
|
813 and orf_stop <= cds_stop
|
|
814 ):
|
|
815 if orf_stop == cds_stop:
|
|
816 break
|
|
817 # print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len)
|
|
818 if (
|
|
819 orf_stop < transcript_len
|
|
820 and orf_stop % 3 == cds_stop % 3
|
|
821 ) or (
|
|
822 cds_start != 1
|
|
823 and orf_stop % 3 == (cds_start + 2) % 3
|
|
824 ):
|
|
825 # print ("Transcript {} has an inframe stop codon".format(transcript))
|
|
826 break
|
|
827 orftype = "nested"
|
|
828 # print ("orf type is nested")
|
|
829 elif (
|
|
830 orf_start >= cds_start
|
|
831 and orf_start <= cds_stop
|
|
832 and orf_stop > cds_stop
|
|
833 ):
|
|
834 orftype = "odorf"
|
|
835 # print ("orftype is odorf")
|
|
836 elif orf_start > cds_stop and orf_stop > cds_stop:
|
|
837 orftype = "dorf"
|
|
838 # print ("orftype is dorf")
|
|
839 # if orf_stop > cds_start and orf_stop < cds_stop:
|
|
840 # if (orf_stop+1)%3 == cds_start%3:
|
|
841 # orftype = "inframe_stop"
|
|
842 # print ("inframe stop, transcript, orf_stop", transcript, orf_stop)
|
|
843 # sys.exit()
|
|
844 # if transcript not in orf_dict:
|
|
845 # orf_dict[orftype][transcript] = []
|
|
846 # #print ("some weird stop or something")
|
|
847 cursor.execute(
|
|
848 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
|
|
849 orftype,
|
|
850 transcript,
|
|
851 start_codon,
|
|
852 length,
|
|
853 orf_start,
|
|
854 orf_stop,
|
|
855 cds_cov,
|
|
856 )
|
|
857 )
|
|
858 break
|
|
859 # Used to keep track of the codons at cds_start and cds_stop positions,
|
|
860 # If there is an issue with the cds co-ordinates the starts and stops counts will
|
|
861 # be much lower than the other count, start with 1 to prevent division by 0
|
|
862 nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}}
|
|
863
|
|
864
|
|
865 def calcgc(seq):
|
|
866 if seq == "":
|
|
867 return "NULL"
|
|
868 g_count = 0
|
|
869 c_count = 0
|
|
870 a_count = 0
|
|
871 t_count = 0
|
|
872 for char in seq:
|
|
873 if char == "A":
|
|
874 a_count += 1
|
|
875 if char == "T":
|
|
876 t_count += 1
|
|
877 if char == "G":
|
|
878 g_count += 1
|
|
879 if char == "C":
|
|
880 c_count += 1
|
|
881 gc = ((g_count + c_count) / float(len(seq))) * 100
|
|
882 return round(gc, 2)
|
|
883
|
|
884
|
|
885 for transcript in master_dict:
|
|
886 # print ("transcripts", transcript)
|
|
887 length = master_dict[transcript]["length"]
|
|
888 cds_start = master_dict[transcript]["cds_start"]
|
|
889 cds_stop = master_dict[transcript]["cds_stop"]
|
|
890 seq = master_dict[transcript]["seq"]
|
|
891 strand = master_dict[transcript]["strand"]
|
|
892 chrom = master_dict[transcript]["chrom"]
|
|
893 gene = master_dict[transcript]["gene_name"]
|
|
894 gc = calcgc(seq)
|
|
895 five_gc = "NULL"
|
|
896 cds_gc = "NULL"
|
|
897 three_gc = "NULL"
|
|
898 if cds_start != "NULL":
|
|
899 five_gc = calcgc(seq[:cds_start])
|
|
900 cds_gc = calcgc(seq[cds_start:cds_stop])
|
|
901 three_gc = calcgc(seq[cds_stop:])
|
|
902 # check that the nucleotide cds_start points to is the first of the start codon
|
|
903 # take one becase cds_start is 1 based but python indexing is 0 based
|
|
904 start_nuc = seq[cds_start - 1 : cds_start + 2]
|
|
905 # print ("start nuc",start_nuc)
|
|
906 if start_nuc == "ATG":
|
|
907 nuc_dict["starts"]["starts"] += 1
|
|
908 else:
|
|
909 nuc_dict["starts"]["other"] += 1
|
|
910 stop_nuc = seq[cds_stop - 3 : cds_stop]
|
|
911 # print ("stop_nuc",stop_nuc)
|
|
912 if stop_nuc in ["TAG", "TAA", "TGA"]:
|
|
913 nuc_dict["stops"]["stops"] += 1
|
|
914 else:
|
|
915 nuc_dict["stops"]["other"] += 1
|
|
916 tran_type = master_dict[transcript]["tran_type"]
|
|
917 if coding_genes_dict[gene] == True:
|
|
918 gene_type = 1
|
|
919 else:
|
|
920 gene_type = 0
|
|
921 # print ("tran type before",tran_type)
|
|
922 if tran_type == "coding":
|
|
923 tran_type = 1
|
|
924 else:
|
|
925 tran_type = 0
|
|
926 # print ("tran type after",tran_type)
|
|
927 start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]")
|
|
928 stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]")
|
|
929 exon_junctions = (
|
|
930 str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]")
|
|
931 )
|
|
932 principal = master_dict[transcript]["principal"]
|
|
933 version = master_dict[transcript]["version"]
|
|
934 # print (master_dict[transcript])
|
|
935 # print (tran_type)
|
|
936 # print (gene_type)
|
|
937 # print (principal)
|
|
938 # print (version)
|
|
939 # print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version))
|
|
940 cursor.execute(
|
|
941 "INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(
|
|
942 transcript,
|
|
943 gene,
|
|
944 length,
|
|
945 cds_start,
|
|
946 cds_stop,
|
|
947 seq,
|
|
948 strand,
|
|
949 stop_list,
|
|
950 start_list,
|
|
951 exon_junctions,
|
|
952 tran_type,
|
|
953 gene_type,
|
|
954 principal,
|
|
955 version,
|
|
956 gc,
|
|
957 five_gc,
|
|
958 cds_gc,
|
|
959 three_gc,
|
|
960 chrom,
|
|
961 )
|
|
962 )
|
|
963
|
|
964 for tup in master_dict[transcript]["exon"]:
|
|
965 cursor.execute(
|
|
966 "INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1])
|
|
967 )
|
|
968 if transcript in coding_dict:
|
|
969 for tup in coding_dict[transcript]:
|
|
970 cursor.execute(
|
|
971 "INSERT INTO coding_regions VALUES('{}',{},{});".format(
|
|
972 transcript, tup[0], tup[1]
|
|
973 )
|
|
974 )
|
3
|
975 # print(cursor.execute(
|
|
976 # ".tables"
|
|
977 # ))
|
0
|
978
|
|
979 print("delim", delimiters)
|
|
980 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05:
|
|
981 print(
|
|
982 "Warning: {} transcripts do not have a an AUG at the CDS start position".format(
|
|
983 nuc_dict["starts"]["other"]
|
|
984 )
|
|
985 )
|
|
986 if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05:
|
|
987 print(
|
|
988 "Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(
|
|
989 nuc_dict["stops"]["other"]
|
|
990 )
|
|
991 )
|
|
992 if len(notinannotation) > 0:
|
|
993 print(
|
|
994 "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(
|
|
995 len(notinannotation)
|
|
996 )
|
|
997 )
|
3
|
998
|
|
999
|
|
1000
|
|
1001 connection.commit()
|
|
1002 connection.close()
|
|
1003
|
|
1004
|