Mercurial > repos > triasteran > trips_create_new_organism
comparison final/create_annotation_sqlite.py @ 4:9482a43a3d83 draft
Uploaded
author | triasteran |
---|---|
date | Tue, 01 Mar 2022 12:40:17 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:d55c72b843d9 | 4:9482a43a3d83 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file | |
4 # and produces an sqlite file which can be uploaded to Trips-Viz | |
5 # All co-ordinates produced are 1 based | |
6 # All start codon positions (including cds_start) should be at the first nucleotide of the codon | |
7 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon | |
8 import sys | |
9 import re | |
10 import sqlite3 | |
11 import intervaltree | |
12 from intervaltree import Interval, IntervalTree | |
13 import itertools | |
14 | |
15 #This should be a GTF or GFF3 file | |
16 annotation_file = open(sys.argv[1],"r") | |
17 #This needs to be the transcriptomic fasta file | |
18 fasta_file = open(sys.argv[2],"r") | |
19 #This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs | |
20 pseudo_utr_len = int(sys.argv[3]) | |
21 #An example of a transcript_id from the annotation file, e.g ENST000000123456 | |
22 user_transcript_id = sys.argv[4] | |
23 #An example of a gene name from the annotation file | |
24 user_gene_name = sys.argv[5] | |
25 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1 | |
26 TRAN_VERSION = True | |
27 | |
28 | |
29 | |
30 delimiters = {"transcripts":{"before":[],"after":[],"annot_types": ["cds","utr"]}, | |
31 "genes":{"before":[],"after":['"'],"annot_types": ["lnc_rna"]}} | |
32 | |
33 punctuation = [";"," ","-",":","-",".","=","\t"] | |
34 #Find delimiters in the annotation and fasta files using the user_transcript_id | |
35 #and user_gene_name examples given by user. | |
36 for line in annotation_file: | |
37 if user_transcript_id in line: | |
38 tabsplitline = line.split("\t") | |
39 annot_type = tabsplitline[2] | |
40 if annot_type not in delimiters["transcripts"]["annot_types"]: | |
41 delimiters["transcripts"]["annot_types"].append(annot_type.lower()) | |
42 splitline = line.split(user_transcript_id) | |
43 before_delimiter = splitline[0] | |
44 for item in punctuation: | |
45 if item in before_delimiter: | |
46 if len(before_delimiter.split(item)[-1]) >= 5: | |
47 before_delimiter = before_delimiter.split(item)[-1] | |
48 after_delimiter = splitline[1][:2] | |
49 if before_delimiter not in delimiters["transcripts"]["before"] and len(before_delimiter) >= 5: | |
50 delimiters["transcripts"]["before"].append(before_delimiter) | |
51 if after_delimiter not in delimiters["transcripts"]["after"]: | |
52 delimiters["transcripts"]["after"].append(after_delimiter) | |
53 if user_gene_name in line: | |
54 tabsplitline = line.split("\t") | |
55 annot_type = tabsplitline[2] | |
56 if annot_type not in delimiters["genes"]["annot_types"]: | |
57 delimiters["genes"]["annot_types"].append(annot_type.lower()) | |
58 splitline = line.split(user_gene_name) | |
59 before_delimiter = splitline[0] | |
60 for item in punctuation: | |
61 if item in before_delimiter: | |
62 if len(before_delimiter.split(item)[-1]) >= 5: | |
63 before_delimiter = before_delimiter.split(item)[-1] | |
64 after_delimiter = splitline[1][0] | |
65 if before_delimiter not in delimiters["genes"]["before"] and len(before_delimiter) >= 5: | |
66 delimiters["genes"]["before"].append(before_delimiter) | |
67 if after_delimiter not in delimiters["genes"]["after"]: | |
68 if after_delimiter in punctuation: | |
69 delimiters["genes"]["after"].append(after_delimiter) | |
70 for line in fasta_file: | |
71 if user_transcript_id in line: | |
72 splitline = line.split(user_transcript_id) | |
73 before_delimiter = splitline[0] | |
74 for item in punctuation: | |
75 if item in before_delimiter: | |
76 if len(before_delimiter.split(item)[1]) >= 5: | |
77 before_delimiter = before_delimiter.split(item)[1] | |
78 after_delimiter = splitline[1][0] | |
79 if before_delimiter not in delimiters["transcripts"]["before"] and len(before_delimiter) >= 5: | |
80 delimiters["transcripts"]["before"].append(before_delimiter) | |
81 if after_delimiter not in delimiters["transcripts"]["after"]: | |
82 delimiters["transcripts"]["after"].append(after_delimiter) | |
83 fasta_file.close() | |
84 annotation_file.close() | |
85 | |
86 | |
87 | |
88 | |
89 | |
90 if delimiters["transcripts"]["before"] == []: | |
91 print ("ERROR: No transcript_id with the name {} could be found in the annotation file".format(user_transcript_id)) | |
92 sys.exit() | |
93 if delimiters["genes"]["before"] == []: | |
94 print ("ERROR: No gene with the name {} could be found in the annotation file".format(user_gene_name)) | |
95 sys.exit() | |
96 | |
97 master_dict = {} | |
98 coding_dict = {} | |
99 notinfasta = open("notinfasta.csv","w") | |
100 | |
101 #Given a nucleotide sequence returns the positions of all start and stop codons. | |
102 def get_start_stops(transcript_sequence): | |
103 transcript_sequence = transcript_sequence.upper() | |
104 start_codons = ['ATG'] | |
105 stop_codons = ['TAA', 'TAG', 'TGA'] | |
106 seq_frames = {'starts': [], 'stops': []} | |
107 for codons, positions in ((start_codons, 'starts'),(stop_codons, 'stops')): | |
108 if len(codons) > 1: | |
109 pat = re.compile('|'.join(codons)) | |
110 else: | |
111 pat = re.compile(codons[0]) | |
112 for m in re.finditer(pat, transcript_sequence): | |
113 # Increment position by 1, Frame 1 starts at position 1 not 0, | |
114 # if it's a stop codon add another 2 so it points to the last nuc of the codon | |
115 if positions == "starts": | |
116 start = m.start() + 1 | |
117 else: | |
118 start = m.start() + 3 | |
119 seq_frames[positions].append(start) | |
120 return seq_frames | |
121 | |
122 | |
123 #parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons. | |
124 fasta_file = open(sys.argv[2],"r") | |
125 read_fasta = fasta_file.read() | |
126 split_fasta = read_fasta.split(">") | |
127 for entry in split_fasta[1:]: | |
128 newline_split = entry.split("\n") | |
129 tran = newline_split[0] | |
130 for item in delimiters["transcripts"]["after"]: | |
131 if item in tran: | |
132 tran = tran.split(item)[0] | |
133 tran = tran.replace("-","_").replace("(","").replace(")","") | |
134 seq = ("".join(newline_split[1:])) | |
135 if "_PAR_Y" in tran: | |
136 tran += "_chrY" | |
137 elif "_PAR_X" in tran: | |
138 tran += "_chrX" | |
139 tran = tran.upper() | |
140 starts_stops = get_start_stops(seq) | |
141 if tran not in master_dict: | |
142 master_dict[tran] = {"utr":[], "cds":[], "exon":[],"start_codon":[],"stop_codon":[],"start_list":starts_stops["starts"], | |
143 "stop_list":starts_stops["stops"],"transcript":[], "strand":"" ,"gene_name":"","chrom":"","seq":seq,"cds_start":"NULL","cds_stop":"NULL", | |
144 "length":len(seq),"principal":0,"version":"NULL"} | |
145 | |
146 | |
147 | |
148 | |
149 def to_ranges(iterable): | |
150 tup_list = [] | |
151 iterable = sorted(set(iterable)) | |
152 for key, group in itertools.groupby(enumerate(iterable),lambda t: t[1] - t[0]): | |
153 group = list(group) | |
154 tup_list.append((group[0][1], group[-1][1])) | |
155 return tup_list | |
156 | |
157 #parse annotation file to get chromsome, exon location and CDS info for each transcript | |
158 def parse_gtf_file(annotation_file): | |
159 for line in annotation_file: | |
160 if line == "\n": | |
161 continue | |
162 if line[0] != '#': | |
163 splitline = (line.replace("\n","")).split("\t") | |
164 chrom = splitline[0] | |
165 try: | |
166 annot_type = splitline[2].lower() | |
167 except: | |
168 print ("ERROR tried to index to second item in splitline: ",splitline, line) | |
169 sys.exit() | |
170 #if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]: | |
171 # continue | |
172 if annot_type not in delimiters["transcripts"]["annot_types"] and annot_type not in delimiters["genes"]["annot_types"]: | |
173 continue | |
174 if annot_type == "five_prime_utr" or annot_type == "three_prime_utr": | |
175 annot_type = "utr" | |
176 strand = splitline[6] | |
177 if strand == "+": | |
178 start = int(splitline[3]) | |
179 end = int(splitline[4]) | |
180 else: | |
181 start = int(splitline[3])+1 | |
182 end = int(splitline[4])+1 | |
183 desc = splitline[8] | |
184 tran = desc | |
185 gene = desc | |
186 for item in delimiters["transcripts"]["before"]: | |
187 if item in tran: | |
188 tran = tran.split(item)[1] | |
189 for item in delimiters["transcripts"]["after"]: | |
190 if item in tran: | |
191 tran = tran.split(item)[0] | |
192 if "." in tran and TRAN_VERSION == True: | |
193 #print ("raw tran",tran) | |
194 tran = tran.split(".") | |
195 version = int(tran[-1].split("_")[0]) | |
196 tran = tran[0] | |
197 else: | |
198 version = "NULL" | |
199 tran = tran.replace("-","_").replace(".","_") | |
200 tran = tran.replace("(","").replace(")","") | |
201 tran = tran.replace(" ","").replace("\t","") | |
202 tran = tran.upper() | |
203 tran = tran.replace("GENE_","").replace("ID_","") | |
204 #print ("tran",tran,version) | |
205 #if tran == "ENST00000316448": | |
206 # print ("annot type",annot_type) | |
207 # print ("appending exon to tran", start, end,line) | |
208 | |
209 gene_found = False | |
210 | |
211 if annot_type in delimiters["genes"]["annot_types"]: | |
212 for item in delimiters["genes"]["before"]: | |
213 if item in gene: | |
214 gene_found = True | |
215 gene = gene.split(item)[1] | |
216 for item in delimiters["genes"]["after"]: | |
217 if item in gene: | |
218 gene = gene.split(item)[0] | |
219 gene = gene.replace("'","''") | |
220 gene = gene.replace("GENE_","") | |
221 gene = gene.replace("ID_","") | |
222 gene = gene.upper() | |
223 | |
224 if tran in master_dict: | |
225 if annot_type in master_dict[tran]: | |
226 master_dict[tran][annot_type].append((start, end)) | |
227 master_dict[tran]["strand"] = strand | |
228 master_dict[tran]["chrom"] = chrom | |
229 master_dict[tran]["version"] = version | |
230 if gene_found == True: | |
231 master_dict[tran]["gene_name"] = gene | |
232 else: | |
233 notinfasta.write("{}\n".format(tran)) | |
234 | |
235 annotation_file = open(sys.argv[1],"r") | |
236 parse_gtf_file(annotation_file) | |
237 | |
238 | |
239 #remove transcripts that were in fasta file but not in annotation_file | |
240 notinannotation = [] | |
241 for tran in master_dict: | |
242 if master_dict[tran]["chrom"] == "": | |
243 #print ("tran {} has no chrom :(".format(tran)) | |
244 notinannotation.append(tran) | |
245 for tran in notinannotation: | |
246 del master_dict[tran] | |
247 | |
248 #Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True | |
249 coding_genes_dict = {} | |
250 #parse master_dict to calculate length, cds_start/stop and exon junction positions | |
251 for tran in master_dict: | |
252 try: | |
253 transeq = master_dict[tran]["seq"] | |
254 except Exception as e: | |
255 print ("not in fasta", tran) | |
256 notinfasta.write("{}\n".format(tran)) | |
257 continue | |
258 exon_junctions = [] | |
259 total_length = len(transeq) | |
260 three_len = 1 | |
261 five_len = 1 | |
262 strand = master_dict[tran]["strand"] | |
263 if master_dict[tran]["gene_name"] == "": | |
264 master_dict[tran]["gene_name"] = tran | |
265 gene = master_dict[tran]["gene_name"] | |
266 if gene not in coding_genes_dict: | |
267 coding_genes_dict[gene] = False | |
268 | |
269 if master_dict[tran]["cds"] == []: | |
270 tran_type = "noncoding" | |
271 cds_start = 'NULL' | |
272 cds_stop = 'NULL' | |
273 else: | |
274 #get utr lengths from annotation | |
275 tran_type = "coding" | |
276 coding_genes_dict[gene] = True | |
277 sorted_exons = sorted(master_dict[tran]["exon"]) | |
278 sorted_cds = sorted(master_dict[tran]["cds"]) | |
279 min_cds = sorted_cds[0][0] | |
280 #Some annotation files do not have utr annotation types, so fix that here if thats the case | |
281 if master_dict[tran]["utr"] == []: | |
282 for exon_tup in master_dict[tran]["exon"]: | |
283 if exon_tup not in master_dict[tran]["cds"]: | |
284 # Now check if this overlaps with any of the CDS exons | |
285 overlap = False | |
286 for cds_tup in master_dict[tran]["cds"]: | |
287 if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]: | |
288 master_dict[tran]["utr"].append((cds_tup[1],exon_tup[1])) | |
289 overlap = True | |
290 if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]: | |
291 master_dict[tran]["utr"].append((exon_tup[0],cds_tup[0])) | |
292 overlap = True | |
293 if overlap == False: | |
294 master_dict[tran]["utr"].append(exon_tup) | |
295 | |
296 | |
297 ''' | |
298 if tran == "NM_001258497": | |
299 print ("sorted cds",sorted_cds) | |
300 print ("min cds",min_cds) | |
301 print ("chrom",master_dict[tran]["chrom"]) | |
302 print ("sorted exons", sorted_exons) | |
303 print ("utr",master_dict[tran]["utr"]) | |
304 sys.exit() | |
305 ''' | |
306 #if tran == "ENST00000381401": | |
307 # print ("min cds,sorted utr",min_cds,sorted(master_dict[tran]["utr"])) | |
308 for tup in sorted(master_dict[tran]["utr"]): | |
309 #if tran == "ENST00000381401": | |
310 # print ("tup", tup) | |
311 if tup[0] < min_cds: | |
312 five_len += (tup[1]-tup[0])+1 | |
313 #if tran == "ENST00000381401": | |
314 # print ("adding to fivelen") | |
315 elif tup[0] > min_cds: | |
316 three_len += (tup[1] - tup[0])+1 | |
317 #if tran == "ENST00000381401": | |
318 # print ("adding to three len") | |
319 else: | |
320 pass | |
321 if strand == "+": | |
322 if len(sorted_exons) > 1: | |
323 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]) | |
324 sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len) | |
325 else: | |
326 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len) | |
327 master_dict[tran]["exon"] = sorted_exons | |
328 cds_start = (five_len+pseudo_utr_len) | |
329 cds_stop = ((total_length - three_len)-pseudo_utr_len)+1 | |
330 elif strand == "-": | |
331 if len(sorted_exons) > 1: | |
332 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]) | |
333 sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len) | |
334 else: | |
335 sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len) | |
336 master_dict[tran]["exon"] = sorted_exons | |
337 cds_start = (three_len+pseudo_utr_len) | |
338 cds_stop = ((total_length - (five_len))-pseudo_utr_len)+1 | |
339 #if tran == "ENST00000381401": | |
340 # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len) | |
341 # #sys.exit() | |
342 else: | |
343 print ("strand is unknown: {}".format(strand)) | |
344 sys.exit() | |
345 | |
346 #get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop | |
347 total_intronic = 0 | |
348 try: | |
349 if strand == "+": | |
350 tx_start = min(sorted(master_dict[tran]["exon"]))[0] | |
351 prev_end = tx_start | |
352 for tup in sorted(master_dict[tran]["exon"])[:-1]: | |
353 total_intronic += tup[0]-prev_end | |
354 exon_junctions.append(((tup[1])-tx_start)-total_intronic) | |
355 prev_end = tup[1] | |
356 elif strand == "-": | |
357 tx_start = max(sorted(master_dict[tran]["exon"]))[-1] | |
358 prev_end = tx_start | |
359 for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]: | |
360 total_intronic += (tup[0]+1)-prev_end | |
361 exon_junctions.append(((tup[1])-tx_start)-total_intronic) | |
362 prev_end = tup[1] | |
363 except: | |
364 if strand == "+": | |
365 tx_start = min(sorted(master_dict[tran]["cds"]))[0] | |
366 prev_end = tx_start | |
367 for tup in sorted(master_dict[tran]["cds"])[:-1]: | |
368 total_intronic += tup[0]-prev_end | |
369 exon_junctions.append(((tup[1])-tx_start)-total_intronic) | |
370 prev_end = tup[1] | |
371 elif strand == "-": | |
372 tx_start = max(sorted(master_dict[tran]["cds"]))[-1] | |
373 prev_end = tx_start | |
374 for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]: | |
375 total_intronic += (tup[0]+1)-prev_end | |
376 exon_junctions.append(((tup[1])-tx_start)-total_intronic) | |
377 prev_end = tup[1] | |
378 if strand == "+" and cds_start != "NULL": | |
379 master_dict[tran]["cds_start"] = cds_start | |
380 master_dict[tran]["cds_stop"] = cds_stop | |
381 elif strand == "-" and cds_start != "NULL": | |
382 master_dict[tran]["cds_start"] = cds_start | |
383 master_dict[tran]["cds_stop"] = cds_stop | |
384 master_dict[tran]["strand"] = strand | |
385 master_dict[tran]["tran_type"] = tran_type | |
386 master_dict[tran]["exon_junctions"] = exon_junctions | |
387 | |
388 longest_tran_dict = {} | |
389 for tran in master_dict: | |
390 try: | |
391 gene = master_dict[tran]["gene_name"] | |
392 except: | |
393 continue | |
394 if coding_genes_dict[gene] == True: | |
395 if "cds_start" in master_dict[tran]: | |
396 if master_dict[tran]["cds_stop"] != "NULL" and master_dict[tran]["cds_start"] != "NULL": | |
397 cds_length = master_dict[tran]["cds_stop"]- master_dict[tran]["cds_start"] | |
398 if gene not in longest_tran_dict: | |
399 longest_tran_dict[gene] = {"tran":tran,"length":cds_length} | |
400 else: | |
401 if cds_length > longest_tran_dict[gene]["length"]: | |
402 longest_tran_dict[gene] = {"tran":tran,"length":cds_length} | |
403 if cds_length == longest_tran_dict[gene]["length"]: | |
404 if master_dict[tran]["length"] > master_dict[longest_tran_dict[gene]["tran"]]["length"]: | |
405 longest_tran_dict[gene] = {"tran":tran,"length":cds_length} | |
406 else: | |
407 length = master_dict[tran]["length"] | |
408 if gene not in longest_tran_dict: | |
409 longest_tran_dict[gene] = {"tran":tran,"length":length} | |
410 elif length > longest_tran_dict[gene]["length"]: | |
411 longest_tran_dict[gene] = {"tran":tran,"length":length} | |
412 | |
413 | |
414 | |
415 | |
416 | |
417 for gene in longest_tran_dict: | |
418 longest_tran = longest_tran_dict[gene]["tran"] | |
419 master_dict[longest_tran]["principal"] = 1 | |
420 | |
421 gene_sample = [] | |
422 for key in list(master_dict)[:10]: | |
423 try: | |
424 gene_sample.append(master_dict[key]["gene_name"]) | |
425 except: | |
426 pass | |
427 | |
428 print ("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10])) | |
429 print ("Here is a sample of the gene names: {}".format(gene_sample)) | |
430 | |
431 | |
432 # Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time. | |
433 def tran_to_genome(tran, start_pos, end_pos, master_dict): | |
434 pos_list = [] | |
435 for i in range(start_pos,end_pos+1): | |
436 pos_list.append(i) | |
437 genomic_pos_list = [] | |
438 if tran in master_dict: | |
439 transcript_info = master_dict[tran] | |
440 else: | |
441 return ("Null", []) | |
442 | |
443 chrom = transcript_info["chrom"] | |
444 strand = transcript_info["strand"] | |
445 exons = transcript_info["exon"] | |
446 #print ("chrom,strand,exons",chrom,strand,exons) | |
447 for pos in pos_list: | |
448 #print ("pos",pos) | |
449 if strand == "+": | |
450 exon_start = 0 | |
451 for tup in exons: | |
452 #print ("tup",tup) | |
453 exon_start = tup[0] | |
454 exonlen = tup[1] - tup[0] | |
455 if pos > exonlen: | |
456 pos = (pos - exonlen)-1 | |
457 else: | |
458 break | |
459 #print ("appending exon_start-pos", exon_start, pos, exon_start+pos) | |
460 genomic_pos_list.append((exon_start+pos)-1) | |
461 elif strand == "-": | |
462 exon_start = 0 | |
463 for tup in exons[::-1]: | |
464 #print ("tup",tup) | |
465 exon_start = tup[1] | |
466 exonlen = tup[1] - tup[0] | |
467 #print ("exonlen",exonlen) | |
468 if pos > exonlen: | |
469 #print ("pos is greater") | |
470 pos = (pos - exonlen)-1 | |
471 #print ("new pos",pos) | |
472 else: | |
473 break | |
474 #print ("appending exon_start-pos", exon_start, pos, exon_start-pos) | |
475 genomic_pos_list.append((exon_start-pos)+1) | |
476 return (chrom, genomic_pos_list) | |
477 | |
478 | |
479 | |
480 | |
481 orf_dict = {"uorf":{}, | |
482 "ouorf":{}, | |
483 "cds":{}, | |
484 "nested":{}, | |
485 "odorf":{}, | |
486 "dorf":{}, | |
487 "minusone":{}, | |
488 "readthrough":{}, | |
489 "plusone":{}, | |
490 "noncoding":{}, | |
491 "extension":{}, | |
492 "inframe_stop":{} | |
493 } | |
494 | |
495 start_codons = ["ATG","GTG","CTG"] | |
496 stop_codons = ["TAG","TAA","TGA"] | |
497 | |
498 | |
499 # Keep track of the longest transcript for each noncoding gene, append this to transcript list later | |
500 longest_noncoding = {} | |
501 | |
502 | |
503 tran_count = 0 | |
504 # This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those | |
505 # in the transcript list) | |
506 genomic_cds_dict = {} | |
507 tree_dict = {} | |
508 for transcript in master_dict: | |
509 #print (transcript, master_dict[transcript]["tran_type"]) | |
510 tran_count += 1 | |
511 if "seq" not in master_dict[transcript]: | |
512 continue | |
513 chrom = master_dict[transcript]["chrom"] | |
514 if chrom not in genomic_cds_dict: | |
515 genomic_cds_dict[chrom] = [] | |
516 if "cds_start" in master_dict[transcript]: | |
517 cds_start = master_dict[transcript]["cds_start"] | |
518 cds_stop = master_dict[transcript]["cds_stop"] | |
519 if cds_start != "NULL": | |
520 cds_pos = [] | |
521 for i in range(cds_start, cds_stop+1): | |
522 cds_pos.append(i) | |
523 | |
524 for tup in master_dict[transcript]["cds"]: | |
525 if tup[0] != tup[1]: | |
526 if tup not in genomic_cds_dict[chrom]: | |
527 genomic_cds_dict[chrom].append(tup) | |
528 | |
529 print ("genomic cds dict built") | |
530 print (list(genomic_cds_dict)) | |
531 for chrom in genomic_cds_dict: | |
532 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom]) | |
533 | |
534 #print (list(tree_dict)) | |
535 | |
536 | |
537 connection = sqlite3.connect("{}".format(sys.argv[6])) | |
538 cursor = connection.cursor() | |
539 cursor.execute("CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));") | |
540 cursor.execute("CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));") | |
541 cursor.execute("CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));") | |
542 cursor.execute("CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
543 cursor.execute("CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
544 cursor.execute("CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
545 cursor.execute("CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
546 cursor.execute("CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
547 cursor.execute("CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
548 cursor.execute("CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
549 cursor.execute("CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
550 cursor.execute("CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
551 cursor.execute("CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
552 cursor.execute("CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
553 cursor.execute("CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") | |
554 connection.commit(); | |
555 | |
556 | |
557 print ("Finding ORFs") | |
558 transcript_count = 0 | |
559 total_transcripts = len(list(master_dict)) | |
560 for transcript in master_dict: | |
561 #print ("transcript",transcript) | |
562 #if transcript != "ENST00000316448": | |
563 # continue | |
564 transcript_count += 1 | |
565 if transcript_count%100 == 0: | |
566 print ("Transcripts complete: {}/{}".format(transcript_count,total_transcripts)) | |
567 if "seq" not in master_dict[transcript]: | |
568 print ("transcript {} has no sequence".format(transcript)) | |
569 continue | |
570 seq = master_dict[transcript]["seq"] | |
571 cds_start = "NULL" | |
572 cds_stop = "NULL" | |
573 transcript_len = len(seq) | |
574 if "cds_start" in master_dict[transcript]: | |
575 cds_start = master_dict[transcript]["cds_start"] | |
576 cds_stop = master_dict[transcript]["cds_stop"] | |
577 | |
578 #Find out what regions of this transcript overlap with any other coding regions | |
579 coding_positions = [] | |
580 if cds_start != "NULL": | |
581 #If this is a coding transcript don't bother checking the CDS | |
582 for i in range(cds_start,cds_stop): | |
583 coding_positions.append(i) | |
584 #check 5' leader | |
585 chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict) | |
586 for i in range(0,cds_start): | |
587 genomic_pos = pos_list[i] | |
588 overlap = tree_dict[chrom][genomic_pos] | |
589 if len(overlap) != 0: | |
590 coding_positions.append(i) | |
591 #check 3' trailer | |
592 chrom, pos_list = tran_to_genome(transcript, cds_stop, transcript_len, master_dict) | |
593 for i in range(cds_stop,transcript_len+1): | |
594 #print ("i",i) | |
595 genomic_pos = pos_list[i-cds_stop] | |
596 #print ("genomic position",genomic_pos) | |
597 overlap = tree_dict[chrom][genomic_pos] | |
598 if len(overlap) != 0: | |
599 #print ("overlap not empty appending i",overlap) | |
600 coding_positions.append(i) | |
601 else: | |
602 #check entire transcript | |
603 chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict) | |
604 for i in range(0,transcript_len): | |
605 genomic_pos = pos_list[i] | |
606 overlap = tree_dict[chrom][genomic_pos] | |
607 if len(overlap) != 0: | |
608 coding_positions.append(i) | |
609 coding_positions_tuple = to_ranges(coding_positions) | |
610 coding_dict[transcript] = coding_positions_tuple | |
611 coding_positions = set(coding_positions) | |
612 #if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates | |
613 if cds_start != "NULL": | |
614 if pseudo_utr_len != 0: | |
615 cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null | |
616 recoding_dict = {0:"minusone",1:"readthrough",2:"plusone"} | |
617 for addition in recoding_dict: | |
618 orftype = recoding_dict[addition] | |
619 for i in range(cds_stop+addition,transcript_len,3): | |
620 if seq[i:i+3] in stop_codons: | |
621 orf_seq = seq[cds_stop:i+3] | |
622 orf_start = cds_stop | |
623 orf_stop = i+2 # +2 so it refers to the end of the stop codon | |
624 start_codon = None | |
625 if orf_seq: | |
626 length = len(orf_seq) | |
627 orf_pos_list = [] | |
628 #determine how many nucleotides in this orf overlap with an annotated coding region | |
629 cds_cov_count = 0.0 | |
630 for position in range(orf_start,orf_stop): | |
631 orf_pos_list.append(position) | |
632 for pos in range(orf_start, orf_stop+1): | |
633 if pos in coding_positions: | |
634 cds_cov_count += 1 | |
635 cds_cov = cds_cov_count/length | |
636 cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov)) | |
637 break | |
638 for frame in [0,1,2]: | |
639 for i in range(frame,transcript_len,3): | |
640 if seq[i:i+3] in start_codons: | |
641 for x in range(i, transcript_len,3): | |
642 if seq[x:x+3] in stop_codons: | |
643 orf_seq = seq[i:x+3] | |
644 orf_start = i | |
645 orf_stop = x+2 # +2 so it refers to the end of the stop codon | |
646 start_codon = seq[i:i+3] | |
647 length = len(orf_seq) | |
648 orf_pos_list = [] | |
649 #determine how many nucleotides in this orf overlap with an annotated coding region | |
650 cds_cov_count = 0.0 | |
651 for pos in range(orf_start, orf_stop+1): | |
652 if pos in coding_positions: | |
653 cds_cov_count += 1 | |
654 cds_cov = float(cds_cov_count)/float(length) | |
655 # Now determine orf type | |
656 if cds_start == "NULL": | |
657 orftype = "noncoding" | |
658 else: | |
659 #print ("cds start is not null :{}:".format(cds_start)) | |
660 if orf_start == cds_start and orf_stop == cds_stop: | |
661 orftype = "cds" | |
662 elif orf_start < cds_start and orf_stop == cds_stop: | |
663 orftype = "extension" | |
664 #special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage | |
665 orf_stop = cds_start | |
666 cds_cov_count = 0.0 | |
667 for pos in range(orf_start, orf_stop+1): | |
668 if pos in coding_positions: | |
669 cds_cov_count += 1 | |
670 cds_cov = float(cds_cov_count)/float(length) | |
671 orf_seq = seq[orf_start:cds_start] | |
672 length = len(orf_seq) | |
673 elif orf_start < cds_start and orf_stop <= cds_start: | |
674 orftype = "uorf" | |
675 elif orf_start < cds_start and orf_stop > cds_start: | |
676 orftype = "ouorf" | |
677 elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop <= cds_stop: | |
678 if orf_stop == cds_stop: | |
679 break | |
680 orftype = "nested" | |
681 elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop > cds_stop: | |
682 orftype = "odorf" | |
683 elif orf_start > cds_stop and orf_stop > cds_stop: | |
684 orftype = "dorf" | |
685 if orf_stop > cds_start and orf_stop < cds_stop: | |
686 if (orf_stop+1)%3 == cds_start%3: | |
687 orftype = "inframe_stop" | |
688 if transcript not in orf_dict: | |
689 orf_dict[orftype][transcript] = [] | |
690 cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov)) | |
691 break | |
692 # Used to keep track of the codons at cds_start and cds_stop positions, | |
693 # If there is an issue with the cds co-ordinates the starts and stops counts will | |
694 # be much lower than the other count, start with 1 to prevent division by 0 | |
695 nuc_dict = {"stops":{"stops":1,"other":0}, "starts":{"starts":1,"other":0}} | |
696 | |
697 def calcgc(seq): | |
698 if seq == "": | |
699 return "NULL" | |
700 g_count = 0 | |
701 c_count = 0 | |
702 a_count = 0 | |
703 t_count = 0 | |
704 for char in seq: | |
705 if char == "A": | |
706 a_count += 1 | |
707 if char == "T": | |
708 t_count += 1 | |
709 if char == "G": | |
710 g_count += 1 | |
711 if char == "C": | |
712 c_count += 1 | |
713 gc = ((g_count+c_count)/float(len(seq)))*100 | |
714 return round(gc,2) | |
715 | |
716 | |
717 | |
718 | |
719 for transcript in master_dict: | |
720 #print ("transcripts", transcript) | |
721 length = master_dict[transcript]["length"] | |
722 cds_start = master_dict[transcript]["cds_start"] | |
723 cds_stop = master_dict[transcript]["cds_stop"] | |
724 seq = master_dict[transcript]["seq"] | |
725 strand = master_dict[transcript]["strand"] | |
726 chrom = master_dict[transcript]["chrom"] | |
727 gene = master_dict[transcript]["gene_name"] | |
728 gc = calcgc(seq) | |
729 five_gc = "NULL" | |
730 cds_gc = "NULL" | |
731 three_gc = "NULL" | |
732 if cds_start != "NULL": | |
733 five_gc = calcgc(seq[:cds_start]) | |
734 cds_gc = calcgc(seq[cds_start:cds_stop]) | |
735 three_gc = calcgc(seq[cds_stop:]) | |
736 # check that the nucleotide cds_start points to is the first of the start codon | |
737 # take one becase cds_start is 1 based but python indexing is 0 based | |
738 start_nuc = seq[cds_start-1:cds_start+2] | |
739 #print ("start nuc",start_nuc) | |
740 if start_nuc == "ATG": | |
741 nuc_dict["starts"]["starts"] += 1 | |
742 else: | |
743 nuc_dict["starts"]["other"] += 1 | |
744 stop_nuc = seq[cds_stop-3:cds_stop] | |
745 #print ("stop_nuc",stop_nuc) | |
746 if stop_nuc in ["TAG","TAA","TGA"]: | |
747 nuc_dict["stops"]["stops"] += 1 | |
748 else: | |
749 nuc_dict["stops"]["other"] += 1 | |
750 tran_type = master_dict[transcript]["tran_type"] | |
751 if coding_genes_dict[gene] == True: | |
752 gene_type = 1 | |
753 else: | |
754 gene_type = 0 | |
755 #print ("tran type before",tran_type) | |
756 if tran_type == "coding": | |
757 tran_type = 1 | |
758 else: | |
759 tran_type = 0 | |
760 #print ("tran type after",tran_type) | |
761 start_list = str(master_dict[transcript]["start_list"]).replace(" ","").strip("[]") | |
762 stop_list = str(master_dict[transcript]["stop_list"]).replace(" ","").strip("[]") | |
763 exon_junctions = str(master_dict[transcript]["exon_junctions"]).replace(" ","").strip("[]") | |
764 principal = master_dict[transcript]["principal"] | |
765 version = master_dict[transcript]["version"] | |
766 #print (master_dict[transcript]) | |
767 #print (tran_type) | |
768 #print (gene_type) | |
769 #print (principal) | |
770 #print (version) | |
771 #print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version)) | |
772 cursor.execute("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version,gc,five_gc,cds_gc,three_gc,chrom)) | |
773 | |
774 for tup in master_dict[transcript]["exon"]: | |
775 cursor.execute("INSERT INTO exons VALUES('{}',{},{});".format(transcript,tup[0],tup[1])) | |
776 if transcript in coding_dict: | |
777 for tup in coding_dict[transcript]: | |
778 cursor.execute("INSERT INTO coding_regions VALUES('{}',{},{});".format(transcript,tup[0],tup[1])) | |
779 | |
780 connection.commit() | |
781 connection.close() | |
782 | |
783 if (nuc_dict["starts"]["other"]/nuc_dict["starts"]["starts"]) > 0.05: | |
784 print ("Warning: {} transcripts do not have a an AUG at the CDS start position".format(nuc_dict["starts"]["other"])) | |
785 if (nuc_dict["stops"]["other"]/nuc_dict["stops"]["stops"]) > 0.05: | |
786 print ("Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(nuc_dict["stops"]["other"])) | |
787 if len(notinannotation) >0: | |
788 print ("Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(len(notinannotation))) |