annotate trips_create_annotation/create_annotation_sqlite.py @ 5:cdecd5f9a4d3 draft

Uploaded
author jackcurragh
date Fri, 27 May 2022 10:14:09 +0000
parents f1c72ed4b32c
children 8d47b5f57579
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
1 # Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
2 # and produces an sqlite file which can be uploaded to Trips-Viz
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
3 # All co-ordinates produced are 1 based
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
6 import sys
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
7 import re
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
8 import sqlite3
3
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
9 import subprocess
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
10
0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
11 from intervaltree import Interval, IntervalTree
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
12 import itertools
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
13 from sqlitedict import SqliteDict
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
14 import os
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
15
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
16 organism = sys.argv[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
17 # This should be a GTF or GFF3 file
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
18 annotation_file = open(sys.argv[2], "r")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
19 # This needs to be the transcriptomic fasta file
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
20 fasta_file = open(sys.argv[3], "r")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
21 # This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
22 pseudo_utr_len = int(sys.argv[4])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
23 # An example of a transcript_id from the annotation file, e.g ENST000000123456
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
24 user_transcript_id = sys.argv[5]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
25 # An example of a gene name from the annotation file
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
26 user_gene_name = sys.argv[6]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
27 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
28 TRAN_VERSION = True
3
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
29 output = sys.argv[7]
0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
30
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
31
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
32 if os.path.isfile("{}.sqlite".format(organism)):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
33 print("{}.sqlite already exists".format(organism))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
34 sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
35
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
36
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
37 # old_exons = SqliteDict(
2
d70696d3341e Uploaded
jackcurragh
parents: 0
diff changeset
38 # "/home/data2/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite"
0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
39 # )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
40
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
41
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
42 delimiters = {
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
43 "transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
44 "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
45 }
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
46
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
47 punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
48 # Find delimiters in the annotation and fasta files using the user_transcript_id
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
49 # and user_gene_name examples given by user.
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
50 for line in annotation_file:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
51 if user_transcript_id in line:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
52 tabsplitline = line.split("\t")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
53 annot_type = tabsplitline[2]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
54 if annot_type not in delimiters["transcripts"]["annot_types"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
55 delimiters["transcripts"]["annot_types"].append(annot_type.lower())
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
56 splitline = line.split(user_transcript_id)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
57 before_delimiter = splitline[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
58 for item in punctuation:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
59 if item in before_delimiter:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
60 if len(before_delimiter.split(item)[-1]) >= 5:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
61 before_delimiter = before_delimiter.split(item)[-1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
62 after_delimiter = splitline[1][:2]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
63 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
64 before_delimiter not in delimiters["transcripts"]["before"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
65 and len(before_delimiter) >= 5
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
66 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
67 delimiters["transcripts"]["before"].append(before_delimiter)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
68 if after_delimiter not in delimiters["transcripts"]["after"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
69 delimiters["transcripts"]["after"].append(after_delimiter)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
70 if user_gene_name in line:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
71 tabsplitline = line.split("\t")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
72 annot_type = tabsplitline[2]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
73 print("ANNOT TYPE", annot_type)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
74 if annot_type not in delimiters["genes"]["annot_types"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
75 delimiters["genes"]["annot_types"].append(annot_type.lower())
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
76 splitline = line.split(user_gene_name)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
77 before_delimiter = splitline[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
78 for item in punctuation:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
79 if item in before_delimiter:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
80 if len(before_delimiter.split(item)[-1]) >= 5:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
81 before_delimiter = before_delimiter.split(item)[-1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
82 after_delimiter = splitline[1][0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
83 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
84 before_delimiter not in delimiters["genes"]["before"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
85 and len(before_delimiter) >= 5
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
86 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
87 delimiters["genes"]["before"].append(before_delimiter)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
88 if after_delimiter not in delimiters["genes"]["after"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
89 if after_delimiter in punctuation:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
90 delimiters["genes"]["after"].append(after_delimiter)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
91
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
92 print("delimeters[genes]", delimiters["transcripts"]["annot_types"])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
93
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
94 for line in fasta_file:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
95 if user_transcript_id in line:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
96 splitline = line.split(user_transcript_id)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
97 before_delimiter = splitline[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
98 for item in punctuation:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
99 if item in before_delimiter:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
100 if len(before_delimiter.split(item)[1]) >= 5:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
101 before_delimiter = before_delimiter.split(item)[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
102 after_delimiter = splitline[1][0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
103 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
104 before_delimiter not in delimiters["transcripts"]["before"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
105 and len(before_delimiter) >= 5
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
106 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
107 delimiters["transcripts"]["before"].append(before_delimiter)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
108 if after_delimiter not in delimiters["transcripts"]["after"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
109 delimiters["transcripts"]["after"].append(after_delimiter)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
110 fasta_file.close()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
111 annotation_file.close()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
112
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
113
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
114 if delimiters["transcripts"]["before"] == []:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
115 print(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
116 "ERROR: No transcript_id with the name {} could be found in the annotation file".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
117 user_transcript_id
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
118 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
119 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
120 sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
121 if delimiters["genes"]["before"] == []:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
122 print(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
123 "ERROR: No gene with the name {} could be found in the annotation file".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
124 user_gene_name
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
125 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
126 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
127 sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
128
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
129 master_dict = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
130 coding_dict = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
131 notinfasta = open("notinfasta.csv", "w")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
132
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
133 # Given a nucleotide sequence returns the positions of all start and stop codons.
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
134 def get_start_stops(transcript_sequence):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
135 transcript_sequence = transcript_sequence.upper()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
136 start_codons = ["ATG"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
137 stop_codons = ["TAA", "TAG", "TGA"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
138 seq_frames = {"starts": [], "stops": []}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
139 for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
140 if len(codons) > 1:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
141 pat = re.compile("|".join(codons))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
142 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
143 pat = re.compile(codons[0])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
144 for m in re.finditer(pat, transcript_sequence):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
145 # Increment position by 1, Frame 1 starts at position 1 not 0,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
146 # if it's a stop codon add another 2 so it points to the last nuc of the codon
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
147 if positions == "starts":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
148 start = m.start() + 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
149 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
150 start = m.start() + 3
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
151 seq_frames[positions].append(start)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
152 return seq_frames
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
153
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
154
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
155 # parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons.
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
156 fasta_file = open(sys.argv[3], "r")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
157 read_fasta = fasta_file.read()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
158 split_fasta = read_fasta.split(">")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
159 for entry in split_fasta[1:]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
160 newline_split = entry.split("\n")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
161 tran = newline_split[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
162 for item in delimiters["transcripts"]["after"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
163 if item in tran:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
164 tran = tran.split(item)[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
165 tran = tran.replace("-", "_").replace("(", "").replace(")", "")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
166 seq = "".join(newline_split[1:])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
167 if "_PAR_Y" in tran:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
168 tran += "_chrY"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
169 elif "_PAR_X" in tran:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
170 tran += "_chrX"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
171 tran = tran.upper()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
172 starts_stops = get_start_stops(seq)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
173 print("tran", tran)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
174 if tran not in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
175 master_dict[tran] = {
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
176 "utr": [],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
177 "cds": [],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
178 "exon": [],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
179 "start_codon": [],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
180 "stop_codon": [],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
181 "start_list": starts_stops["starts"],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
182 "stop_list": starts_stops["stops"],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
183 "transcript": [],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
184 "strand": "",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
185 "gene_name": "",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
186 "chrom": "",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
187 "seq": seq,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
188 "cds_start": "NULL",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
189 "cds_stop": "NULL",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
190 "length": len(seq),
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
191 "principal": 0,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
192 "version": "NULL",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
193 }
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
194
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
195
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
196 def to_ranges(iterable):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
197 tup_list = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
198 iterable = sorted(set(iterable))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
199 for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
200 group = list(group)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
201 tup_list.append((group[0][1], group[-1][1]))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
202 return tup_list
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
203
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
204
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
205 # parse annotation file to get chromsome, exon location and CDS info for each transcript
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
206 def parse_gtf_file(annotation_file):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
207 for line in annotation_file:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
208 if line == "\n":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
209 continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
210 if line[0] != "#":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
211 splitline = (line.replace("\n", "")).split("\t")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
212 chrom = splitline[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
213 try:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
214 annot_type = splitline[2].lower()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
215 except:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
216 print(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
217 "ERROR tried to index to second item in splitline: ",
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
218 splitline,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
219 line,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
220 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
221 sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
222 # if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
223 # continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
224 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
225 annot_type not in delimiters["transcripts"]["annot_types"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
226 and annot_type not in delimiters["genes"]["annot_types"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
227 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
228 continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
229 if annot_type == "five_prime_utr" or annot_type == "three_prime_utr":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
230 annot_type = "utr"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
231 strand = splitline[6]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
232 if strand == "+":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
233 start = int(splitline[3])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
234 end = int(splitline[4])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
235 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
236 start = int(splitline[3]) + 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
237 end = int(splitline[4]) + 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
238 desc = splitline[8]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
239 tran = desc
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
240 gene = desc
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
241 for item in delimiters["transcripts"]["before"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
242 if item in tran:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
243 tran = tran.split(item)[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
244 for item in delimiters["transcripts"]["after"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
245 if item in tran:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
246 tran = tran.split(item)[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
247 if "." in tran and TRAN_VERSION == True:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
248 # print ("raw tran",tran)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
249 tran = tran.split(".")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
250 version = int(tran[-1].split("_")[0])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
251 tran = tran[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
252 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
253 version = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
254 tran = tran.replace("-", "_").replace(".", "_")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
255 tran = tran.replace("(", "").replace(")", "")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
256 tran = tran.replace(" ", "").replace("\t", "")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
257 tran = tran.upper()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
258 tran = tran.replace("GENE_", "").replace("ID_", "")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
259 if "_PAR_Y" in desc:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
260 # print ("adding _PAR_Y to tran")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
261 tran = tran + "_PAR_Y"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
262 # print ("New tran ", tran)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
263 # if "PAR_Y" in line:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
264 # print (line)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
265 # #sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
266 # print ("tran",tran,version)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
267 # if tran == "ENST00000316448":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
268 # print ("annot type",annot_type)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
269 # print ("appending exon to tran", start, end,line)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
270
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
271 gene_found = False
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
272
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
273 if annot_type in delimiters["genes"]["annot_types"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
274 for item in delimiters["genes"]["before"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
275 if item in gene:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
276 gene_found = True
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
277 gene = gene.split(item)[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
278 for item in delimiters["genes"]["after"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
279 if item in gene:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
280 gene = gene.split(item)[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
281 gene = gene.replace("'", "''")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
282 gene = gene.replace("GENE_", "")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
283 gene = gene.replace("ID_", "")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
284 gene = gene.upper()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
285 if tran in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
286 master_dict[tran]["strand"] = strand
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
287 if strand == "+":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
288 if annot_type in master_dict[tran]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
289 master_dict[tran][annot_type].append((start, end))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
290 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
291 if annot_type in master_dict[tran]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
292 master_dict[tran][annot_type].append((start, end))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
293 master_dict[tran]["chrom"] = chrom
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
294 master_dict[tran]["version"] = version
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
295 if gene_found == True:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
296 master_dict[tran]["gene_name"] = gene
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
297 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
298 notinfasta.write("{}\n".format(tran))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
299
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
300
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
301 annotation_file = open(sys.argv[2], "r")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
302 parse_gtf_file(annotation_file)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
303
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
304
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
305 # remove transcripts that were in fasta file but not in annotation_file
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
306 notinannotation = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
307 for tran in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
308 if master_dict[tran]["chrom"] == "":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
309 # print ("tran {} has no chrom :(".format(tran))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
310 notinannotation.append(tran)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
311 for tran in notinannotation:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
312 del master_dict[tran]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
313 # Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
314 coding_genes_dict = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
315 # parse master_dict to calculate length, cds_start/stop and exon junction positions
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
316 for tran in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
317 try:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
318 transeq = master_dict[tran]["seq"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
319 except Exception as e:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
320 print("not in fasta", tran)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
321 notinfasta.write("{}\n".format(tran))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
322 continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
323 exon_junctions = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
324 total_length = len(transeq)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
325 three_len = 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
326 five_len = 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
327 strand = master_dict[tran]["strand"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
328 if master_dict[tran]["gene_name"] == "":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
329 master_dict[tran]["gene_name"] = tran
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
330 gene = master_dict[tran]["gene_name"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
331 if gene not in coding_genes_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
332 coding_genes_dict[gene] = False
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
333
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
334 if master_dict[tran]["cds"] == []:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
335 tran_type = "noncoding"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
336 cds_start = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
337 cds_stop = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
338 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
339 # get utr lengths from annotation
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
340 tran_type = "coding"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
341 coding_genes_dict[gene] = True
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
342 sorted_exons = sorted(master_dict[tran]["exon"])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
343 sorted_cds = sorted(master_dict[tran]["cds"])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
344
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
345 min_cds = sorted_cds[0][0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
346 # Some annotation files do not have utr annotation types, so fix that here if thats the case
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
347 if master_dict[tran]["utr"] == []:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
348 for exon_tup in master_dict[tran]["exon"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
349 if exon_tup not in master_dict[tran]["cds"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
350 # Now check if this overlaps with any of the CDS exons
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
351 overlap = False
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
352 for cds_tup in master_dict[tran]["cds"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
353 if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
354 master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1]))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
355 overlap = True
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
356 if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
357 master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0]))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
358 overlap = True
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
359 if overlap == False:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
360 master_dict[tran]["utr"].append(exon_tup)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
361
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
362 for tup in sorted(master_dict[tran]["utr"]):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
363 if tup[0] < min_cds:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
364 five_len += (tup[1] - tup[0]) + 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
365 elif tup[0] > min_cds:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
366 three_len += (tup[1] - tup[0]) + 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
367 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
368 pass
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
369 if strand == "+":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
370 if len(sorted_exons) > 1:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
371 sorted_exons[0] = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
372 sorted_exons[0][0] - pseudo_utr_len,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
373 sorted_exons[0][1],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
374 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
375 sorted_exons[-1] = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
376 sorted_exons[-1][0],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
377 sorted_exons[-1][1] + pseudo_utr_len,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
378 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
379 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
380 sorted_exons[0] = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
381 sorted_exons[0][0] - pseudo_utr_len,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
382 sorted_exons[0][1] + pseudo_utr_len,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
383 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
384 master_dict[tran]["exon"] = sorted_exons
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
385 cds_start = five_len + pseudo_utr_len
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
386 cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
387 elif strand == "-":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
388 if len(sorted_exons) > 1:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
389 sorted_exons[0] = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
390 (sorted_exons[0][0] - pseudo_utr_len),
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
391 sorted_exons[0][1],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
392 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
393 sorted_exons[-1] = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
394 sorted_exons[-1][0],
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
395 (sorted_exons[-1][1] + pseudo_utr_len),
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
396 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
397 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
398 sorted_exons[0] = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
399 (sorted_exons[0][0] - pseudo_utr_len),
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
400 (sorted_exons[0][1] + pseudo_utr_len),
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
401 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
402 master_dict[tran]["exon"] = sorted_exons
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
403 cds_start = three_len + pseudo_utr_len
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
404 cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
405 # if tran == "ENST00000381401":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
406 # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
407 # #sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
408 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
409 print("strand is unknown: {}".format(strand))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
410 sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
411 # get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
412 total_intronic = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
413 try:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
414 if strand == "+":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
415 tx_start = min(sorted(master_dict[tran]["exon"]))[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
416 prev_end = tx_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
417 for tup in sorted(master_dict[tran]["exon"])[:-1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
418 total_intronic += tup[0] - prev_end
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
419 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
420 prev_end = tup[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
421 elif strand == "-":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
422 tx_start = max(sorted(master_dict[tran]["exon"]))[-1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
423 prev_end = tx_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
424 for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
425 total_intronic += (tup[0] + 1) - prev_end
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
426 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
427 prev_end = tup[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
428 except:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
429 if strand == "+":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
430 tx_start = min(sorted(master_dict[tran]["cds"]))[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
431 prev_end = tx_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
432 for tup in sorted(master_dict[tran]["cds"])[:-1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
433 total_intronic += tup[0] - prev_end
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
434 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
435 prev_end = tup[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
436 elif strand == "-":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
437 tx_start = max(sorted(master_dict[tran]["cds"]))[-1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
438 prev_end = tx_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
439 for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
440 total_intronic += (tup[0] + 1) - prev_end
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
441 exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
442 prev_end = tup[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
443 # This can happen when a coding transcript doesn't have a properly annotated 3' trailer
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
444 if cds_stop != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
445 if cds_stop > total_length:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
446 cds_stop = total_length
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
447 if strand == "+" and cds_start != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
448 master_dict[tran]["cds_start"] = cds_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
449 master_dict[tran]["cds_stop"] = cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
450 elif strand == "-" and cds_start != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
451 master_dict[tran]["cds_start"] = cds_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
452 master_dict[tran]["cds_stop"] = cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
453
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
454 master_dict[tran]["strand"] = strand
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
455 master_dict[tran]["tran_type"] = tran_type
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
456 master_dict[tran]["exon_junctions"] = exon_junctions
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
457
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
458 longest_tran_dict = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
459 for tran in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
460 try:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
461 gene = master_dict[tran]["gene_name"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
462 except:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
463 continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
464 if coding_genes_dict[gene] == True:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
465 if "cds_start" in master_dict[tran]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
466 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
467 master_dict[tran]["cds_stop"] != "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
468 and master_dict[tran]["cds_start"] != "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
469 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
470 cds_length = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
471 master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
472 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
473 if gene not in longest_tran_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
474 longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
475 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
476 if cds_length > longest_tran_dict[gene]["length"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
477 longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
478 if cds_length == longest_tran_dict[gene]["length"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
479 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
480 master_dict[tran]["length"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
481 > master_dict[longest_tran_dict[gene]["tran"]]["length"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
482 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
483 longest_tran_dict[gene] = {
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
484 "tran": tran,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
485 "length": cds_length,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
486 }
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
487 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
488 length = master_dict[tran]["length"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
489 if gene not in longest_tran_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
490 longest_tran_dict[gene] = {"tran": tran, "length": length}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
491 elif length > longest_tran_dict[gene]["length"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
492 longest_tran_dict[gene] = {"tran": tran, "length": length}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
493
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
494
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
495 for gene in longest_tran_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
496 longest_tran = longest_tran_dict[gene]["tran"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
497 master_dict[longest_tran]["principal"] = 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
498
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
499 gene_sample = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
500 for key in list(master_dict)[:10]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
501 try:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
502 gene_sample.append(master_dict[key]["gene_name"])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
503 except:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
504 pass
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
505 print(master_dict)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
506 print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10]))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
507 print("Here is a sample of the gene names: {}".format(gene_sample))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
508
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
509
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
510 # Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time.
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
511 def tran_to_genome(tran, start_pos, end_pos, master_dict):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
512 pos_list = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
513 for i in range(start_pos, end_pos + 1):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
514 pos_list.append(i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
515 genomic_pos_list = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
516 if tran in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
517 transcript_info = master_dict[tran]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
518 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
519 return ("Null", [])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
520
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
521 chrom = transcript_info["chrom"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
522 strand = transcript_info["strand"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
523 exons = sorted(transcript_info["exon"])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
524 # print ("chrom,strand,exons",chrom,strand,exons)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
525 for pos in pos_list:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
526 # print ("pos",pos)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
527 if strand == "+":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
528 exon_start = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
529 for tup in exons:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
530 # print ("tup",tup)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
531 exon_start = tup[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
532 exonlen = tup[1] - tup[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
533 if pos > exonlen:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
534 pos = (pos - exonlen) - 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
535 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
536 break
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
537 # print ("appending exon_start-pos", exon_start, pos, exon_start+pos)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
538 genomic_pos_list.append((exon_start + pos) - 1)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
539 elif strand == "-":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
540 exon_start = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
541 for tup in exons[::-1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
542 # print ("tup",tup)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
543 exon_start = tup[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
544 exonlen = tup[1] - tup[0]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
545 # print ("exonlen",exonlen)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
546 if pos > exonlen:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
547 # print ("pos is greater")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
548 pos = (pos - exonlen) - 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
549 # print ("new pos",pos)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
550 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
551 break
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
552 # print ("appending exon_start-pos", exon_start, pos, exon_start-pos)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
553 genomic_pos_list.append((exon_start - pos) + 1)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
554 return (chrom, genomic_pos_list)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
555
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
556
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
557 orf_dict = {
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
558 "uorf": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
559 "ouorf": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
560 "cds": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
561 "nested": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
562 "odorf": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
563 "dorf": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
564 "minusone": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
565 "readthrough": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
566 "plusone": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
567 "noncoding": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
568 "extension": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
569 "inframe_stop": {},
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
570 }
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
571
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
572 start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
573
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
574 stop_codons = ["TAG", "TAA", "TGA"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
575
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
576
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
577 # Keep track of the longest transcript for each noncoding gene, append this to transcript list later
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
578 longest_noncoding = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
579
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
580
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
581 tran_count = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
582 # This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
583 # in the transcript list)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
584 genomic_cds_dict = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
585 tree_dict = {}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
586 for transcript in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
587 # print (transcript, master_dict[transcript]["tran_type"])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
588 tran_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
589 if "seq" not in master_dict[transcript]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
590 continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
591 chrom = master_dict[transcript]["chrom"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
592 if chrom not in genomic_cds_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
593 genomic_cds_dict[chrom] = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
594 if "cds_start" in master_dict[transcript]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
595 cds_start = master_dict[transcript]["cds_start"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
596 cds_stop = master_dict[transcript]["cds_stop"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
597 if cds_start != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
598 cds_pos = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
599 for i in range(cds_start, cds_stop + 1):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
600 cds_pos.append(i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
601
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
602 for tup in master_dict[transcript]["cds"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
603 if tup[0] != tup[1]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
604 if tup not in genomic_cds_dict[chrom]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
605 genomic_cds_dict[chrom].append(tup)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
606
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
607 print("genomic cds dict built")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
608 print(list(genomic_cds_dict))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
609 for chrom in genomic_cds_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
610 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
611
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
612 # print (list(tree_dict))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
613
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
614
3
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
615 connection = sqlite3.connect(output)
0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
616 cursor = connection.cursor()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
617 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
618 "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
619 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
620 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
621 "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
622 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
623 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
624 "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
625 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
626 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
627 "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
628 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
629 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
630 "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
631 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
632 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
633 "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
634 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
635 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
636 "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
637 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
638 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
639 "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
640 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
641 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
642 "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
643 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
644 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
645 "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
646 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
647 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
648 "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
649 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
650 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
651 "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
652 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
653 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
654 "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
655 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
656 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
657 "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
658 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
659 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
660 "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
661 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
662 connection.commit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
663
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
664
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
665 print("Finding ORFs")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
666 transcript_count = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
667 total_transcripts = len(list(master_dict))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
668 for transcript in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
669 # print ("transcript",transcript)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
670 # if transcript != "ENST00000316448":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
671 # continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
672 transcript_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
673 if transcript_count % 100 == 0:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
674 print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
675 if "seq" not in master_dict[transcript]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
676 print("transcript {} has no sequence".format(transcript))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
677 continue
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
678 seq = master_dict[transcript]["seq"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
679 cds_start = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
680 cds_stop = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
681 transcript_len = len(seq)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
682 if "cds_start" in master_dict[transcript]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
683 cds_start = master_dict[transcript]["cds_start"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
684 cds_stop = master_dict[transcript]["cds_stop"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
685
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
686 # Find out what regions of this transcript overlap with any other coding regions
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
687 coding_positions = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
688 if cds_start != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
689 # If this is a coding transcript don't bother checking the CDS
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
690 for i in range(cds_start, cds_stop):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
691 coding_positions.append(i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
692 # check 5' leader
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
693 chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
694 for i in range(0, cds_start):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
695 genomic_pos = pos_list[i]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
696 overlap = tree_dict[chrom][genomic_pos]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
697 if len(overlap) != 0:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
698 coding_positions.append(i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
699 # check 3' trailer
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
700 chrom, pos_list = tran_to_genome(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
701 transcript, cds_stop, transcript_len, master_dict
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
702 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
703 for i in range(cds_stop, transcript_len + 1):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
704 # print ("i",i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
705 genomic_pos = pos_list[i - cds_stop]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
706 # print ("genomic position",genomic_pos)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
707 overlap = tree_dict[chrom][genomic_pos]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
708 if len(overlap) != 0:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
709 # print ("overlap not empty appending i",overlap)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
710 coding_positions.append(i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
711 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
712 # check entire transcript
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
713 chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
714 for i in range(0, transcript_len):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
715 genomic_pos = pos_list[i]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
716 overlap = tree_dict[chrom][genomic_pos]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
717 if len(overlap) != 0:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
718 coding_positions.append(i)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
719 coding_positions_tuple = to_ranges(coding_positions)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
720 coding_dict[transcript] = coding_positions_tuple
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
721 coding_positions = set(coding_positions)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
722 # if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
723 if cds_start != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
724 # print ("transcript", transcript)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
725 # if pseudo_utr_len != 0:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
726 # cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
727 recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
728 for addition in recoding_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
729 orftype = recoding_dict[addition]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
730 for i in range(cds_stop + addition, transcript_len, 3):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
731 if seq[i : i + 3] in stop_codons:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
732 # orf_seq = seq[cds_stop:i+3]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
733 orf_start = cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
734 if orftype == "readthrough":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
735 orf_start -= 2
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
736 if orftype == "plusone":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
737 orf_start -= 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
738 orf_stop = i + 3 # +2 so it refers to the end of the stop codon
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
739 start_codon = None
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
740 length = (i + 3) - cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
741 orf_pos_list = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
742 # determine how many nucleotides in this orf overlap with an annotated coding region
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
743 cds_cov_count = 0.0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
744 for position in range(orf_start, orf_stop):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
745 orf_pos_list.append(position)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
746 for pos in range(orf_start, orf_stop + 1):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
747 if pos in coding_positions:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
748 cds_cov_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
749 cds_cov = cds_cov_count / length
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
750 # print ("orftype, start, stop", orftype, orf_start, orf_stop)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
751 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
752 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
753 orftype,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
754 transcript,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
755 start_codon,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
756 length,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
757 orf_start,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
758 orf_stop,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
759 cds_cov,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
760 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
761 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
762 break
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
763 # sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
764 for frame in [0, 1, 2]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
765 for i in range(frame, transcript_len, 3):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
766 if seq[i : i + 3] in start_codons:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
767 for x in range(i, transcript_len, 3):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
768 if seq[x : x + 3] in stop_codons:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
769 # orf_seq = seq[i:x+3]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
770 orf_start = i + 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
771 orf_stop = x + 3 # +2 so it refers to the end of the stop codon
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
772 start_codon = seq[i : i + 3]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
773 length = (x + 3) - i
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
774 orf_pos_list = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
775 # determine how many nucleotides in this orf overlap with an annotated coding region
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
776 cds_cov_count = 0.0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
777 for pos in range(orf_start, orf_stop + 1):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
778 if pos in coding_positions:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
779 cds_cov_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
780 cds_cov = float(cds_cov_count) / float(length)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
781 # Now determine orf type
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
782 if cds_start == "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
783 orftype = "noncoding"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
784 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
785 # print ("cds start is not null :{}:{}".format(cds_start,cds_stop))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
786 # print ("orf start, orf stop", orf_start, orf_stop)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
787 if orf_start == cds_start and orf_stop == cds_stop:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
788 orftype = "cds"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
789 # print ("orf type is cds")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
790 elif orf_start < cds_start and orf_stop == cds_stop:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
791 orftype = "extension"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
792 # special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
793 orf_stop = cds_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
794 cds_cov_count = 0.0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
795 for pos in range(orf_start, orf_stop + 1):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
796 if pos in coding_positions:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
797 cds_cov_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
798 cds_cov = float(cds_cov_count) / float(length)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
799 # orf_seq = seq[orf_start:cds_start]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
800 length = cds_start - orf_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
801 # print ("orf type is extension")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
802 elif orf_start < cds_start and orf_stop <= cds_start:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
803 orftype = "uorf"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
804 # print ("orf type is uorf")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
805 elif orf_start < cds_start and orf_stop > cds_start:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
806 orftype = "ouorf"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
807 # print ("orf type is ouorf")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
808 # sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
809 elif (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
810 orf_start >= cds_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
811 and orf_start <= cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
812 and orf_stop <= cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
813 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
814 if orf_stop == cds_stop:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
815 break
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
816 # print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
817 if (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
818 orf_stop < transcript_len
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
819 and orf_stop % 3 == cds_stop % 3
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
820 ) or (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
821 cds_start != 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
822 and orf_stop % 3 == (cds_start + 2) % 3
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
823 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
824 # print ("Transcript {} has an inframe stop codon".format(transcript))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
825 break
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
826 orftype = "nested"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
827 # print ("orf type is nested")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
828 elif (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
829 orf_start >= cds_start
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
830 and orf_start <= cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
831 and orf_stop > cds_stop
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
832 ):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
833 orftype = "odorf"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
834 # print ("orftype is odorf")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
835 elif orf_start > cds_stop and orf_stop > cds_stop:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
836 orftype = "dorf"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
837 # print ("orftype is dorf")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
838 # if orf_stop > cds_start and orf_stop < cds_stop:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
839 # if (orf_stop+1)%3 == cds_start%3:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
840 # orftype = "inframe_stop"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
841 # print ("inframe stop, transcript, orf_stop", transcript, orf_stop)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
842 # sys.exit()
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
843 # if transcript not in orf_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
844 # orf_dict[orftype][transcript] = []
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
845 # #print ("some weird stop or something")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
846 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
847 "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
848 orftype,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
849 transcript,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
850 start_codon,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
851 length,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
852 orf_start,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
853 orf_stop,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
854 cds_cov,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
855 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
856 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
857 break
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
858 # Used to keep track of the codons at cds_start and cds_stop positions,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
859 # If there is an issue with the cds co-ordinates the starts and stops counts will
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
860 # be much lower than the other count, start with 1 to prevent division by 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
861 nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}}
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
862
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
863
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
864 def calcgc(seq):
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
865 if seq == "":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
866 return "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
867 g_count = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
868 c_count = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
869 a_count = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
870 t_count = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
871 for char in seq:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
872 if char == "A":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
873 a_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
874 if char == "T":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
875 t_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
876 if char == "G":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
877 g_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
878 if char == "C":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
879 c_count += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
880 gc = ((g_count + c_count) / float(len(seq))) * 100
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
881 return round(gc, 2)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
882
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
883
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
884 for transcript in master_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
885 # print ("transcripts", transcript)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
886 length = master_dict[transcript]["length"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
887 cds_start = master_dict[transcript]["cds_start"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
888 cds_stop = master_dict[transcript]["cds_stop"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
889 seq = master_dict[transcript]["seq"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
890 strand = master_dict[transcript]["strand"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
891 chrom = master_dict[transcript]["chrom"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
892 gene = master_dict[transcript]["gene_name"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
893 gc = calcgc(seq)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
894 five_gc = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
895 cds_gc = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
896 three_gc = "NULL"
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
897 if cds_start != "NULL":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
898 five_gc = calcgc(seq[:cds_start])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
899 cds_gc = calcgc(seq[cds_start:cds_stop])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
900 three_gc = calcgc(seq[cds_stop:])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
901 # check that the nucleotide cds_start points to is the first of the start codon
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
902 # take one becase cds_start is 1 based but python indexing is 0 based
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
903 start_nuc = seq[cds_start - 1 : cds_start + 2]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
904 # print ("start nuc",start_nuc)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
905 if start_nuc == "ATG":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
906 nuc_dict["starts"]["starts"] += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
907 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
908 nuc_dict["starts"]["other"] += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
909 stop_nuc = seq[cds_stop - 3 : cds_stop]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
910 # print ("stop_nuc",stop_nuc)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
911 if stop_nuc in ["TAG", "TAA", "TGA"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
912 nuc_dict["stops"]["stops"] += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
913 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
914 nuc_dict["stops"]["other"] += 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
915 tran_type = master_dict[transcript]["tran_type"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
916 if coding_genes_dict[gene] == True:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
917 gene_type = 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
918 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
919 gene_type = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
920 # print ("tran type before",tran_type)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
921 if tran_type == "coding":
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
922 tran_type = 1
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
923 else:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
924 tran_type = 0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
925 # print ("tran type after",tran_type)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
926 start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
927 stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
928 exon_junctions = (
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
929 str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]")
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
930 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
931 principal = master_dict[transcript]["principal"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
932 version = master_dict[transcript]["version"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
933 # print (master_dict[transcript])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
934 # print (tran_type)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
935 # print (gene_type)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
936 # print (principal)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
937 # print (version)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
938 # print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version))
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
939 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
940 "INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
941 transcript,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
942 gene,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
943 length,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
944 cds_start,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
945 cds_stop,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
946 seq,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
947 strand,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
948 stop_list,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
949 start_list,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
950 exon_junctions,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
951 tran_type,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
952 gene_type,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
953 principal,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
954 version,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
955 gc,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
956 five_gc,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
957 cds_gc,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
958 three_gc,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
959 chrom,
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
960 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
961 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
962
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
963 for tup in master_dict[transcript]["exon"]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
964 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
965 "INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1])
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
966 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
967 if transcript in coding_dict:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
968 for tup in coding_dict[transcript]:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
969 cursor.execute(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
970 "INSERT INTO coding_regions VALUES('{}',{},{});".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
971 transcript, tup[0], tup[1]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
972 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
973 )
3
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
974 # print(cursor.execute(
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
975 # ".tables"
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
976 # ))
0
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
977
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
978 print("delim", delimiters)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
979 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
980 print(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
981 "Warning: {} transcripts do not have a an AUG at the CDS start position".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
982 nuc_dict["starts"]["other"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
983 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
984 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
985 if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
986 print(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
987 "Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
988 nuc_dict["stops"]["other"]
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
989 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
990 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
991 if len(notinannotation) > 0:
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
992 print(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
993 "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
994 len(notinannotation)
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
995 )
f24aed7a5cc7 Uploaded
jackcurragh
parents:
diff changeset
996 )
3
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
997
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
998
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
999
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
1000 connection.commit()
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
1001 connection.close()
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
1002
f1c72ed4b32c Uploaded
jackcurragh
parents: 2
diff changeset
1003