trips_viz_create_annotation: trips_create_annotation/create_annotation

comparison trips_create_annotation/create_annotation_sqlite.py @ 0:f24aed7a5cc7 draft

Uploaded

author	jackcurragh
date	Mon, 04 Apr 2022 09:30:51 +0000
parents
children	d70696d3341e

comparison

equal deleted inserted replaced

--1:000000000000
+:f24aed7a5cc7
+# Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file
+# and produces an sqlite file which can be uploaded to Trips-Viz
+# All co-ordinates produced are 1 based
+# All start codon positions (including cds_start) should be at the first nucleotide of the codon
+# All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
+import sys
+import re
+import sqlite3
+from intervaltree import Interval, IntervalTree
+import itertools
+from sqlitedict import SqliteDict
+import os
+organism = sys.argv[1]
+# This should be a GTF or GFF3 file
+annotation_file = open(sys.argv[2], "r")
+# This needs to be the transcriptomic fasta file
+fasta_file = open(sys.argv[3], "r")
+# This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs
+pseudo_utr_len = int(sys.argv[4])
+# An example of a transcript_id from the annotation file, e.g ENST000000123456
+user_transcript_id = sys.argv[5]
+# An example of a gene name from the annotation file
+user_gene_name = sys.argv[6]
+# Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
+TRAN_VERSION = True
+if os.path.isfile("{}.sqlite".format(organism)):
+print("{}.sqlite already exists".format(organism))
+sys.exit()
+# old_exons = SqliteDict(
+#     "/home/DATA/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite"
+# )
+delimiters = {
+"transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]},
+"genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]},
+}
+punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"]
+# Find delimiters in the annotation and fasta files using the user_transcript_id
+# and user_gene_name examples given by user.
+for line in annotation_file:
+if user_transcript_id in line:
+tabsplitline = line.split("\t")
+annot_type = tabsplitline[2]
+if annot_type not in delimiters["transcripts"]["annot_types"]:
+delimiters["transcripts"]["annot_types"].append(annot_type.lower())
+splitline = line.split(user_transcript_id)
+before_delimiter = splitline[0]
+for item in punctuation:
+if item in before_delimiter:
+if len(before_delimiter.split(item)[-1]) >= 5:
+before_delimiter = before_delimiter.split(item)[-1]
+after_delimiter = splitline[1][:2]
+if (
+before_delimiter not in delimiters["transcripts"]["before"]
+and len(before_delimiter) >= 5
+):
+delimiters["transcripts"]["before"].append(before_delimiter)
+if after_delimiter not in delimiters["transcripts"]["after"]:
+delimiters["transcripts"]["after"].append(after_delimiter)
+if user_gene_name in line:
+tabsplitline = line.split("\t")
+annot_type = tabsplitline[2]
+print("ANNOT TYPE", annot_type)
+if annot_type not in delimiters["genes"]["annot_types"]:
+delimiters["genes"]["annot_types"].append(annot_type.lower())
+splitline = line.split(user_gene_name)
+before_delimiter = splitline[0]
+for item in punctuation:
+if item in before_delimiter:
+if len(before_delimiter.split(item)[-1]) >= 5:
+before_delimiter = before_delimiter.split(item)[-1]
+after_delimiter = splitline[1][0]
+if (
+before_delimiter not in delimiters["genes"]["before"]
+and len(before_delimiter) >= 5
+):
+delimiters["genes"]["before"].append(before_delimiter)
+if after_delimiter not in delimiters["genes"]["after"]:
+if after_delimiter in punctuation:
+delimiters["genes"]["after"].append(after_delimiter)
+print("delimeters[genes]", delimiters["transcripts"]["annot_types"])
+for line in fasta_file:
+if user_transcript_id in line:
+splitline = line.split(user_transcript_id)
+before_delimiter = splitline[0]
+for item in punctuation:
+if item in before_delimiter:
+if len(before_delimiter.split(item)[1]) >= 5:
+before_delimiter = before_delimiter.split(item)[1]
+after_delimiter = splitline[1][0]
+if (
+before_delimiter not in delimiters["transcripts"]["before"]
+and len(before_delimiter) >= 5
+):
+delimiters["transcripts"]["before"].append(before_delimiter)
+if after_delimiter not in delimiters["transcripts"]["after"]:
+delimiters["transcripts"]["after"].append(after_delimiter)
+fasta_file.close()
+annotation_file.close()
+if delimiters["transcripts"]["before"] == []:
+print(
+"ERROR: No transcript_id with the name {} could be found in the annotation file".format(
+user_transcript_id
+)
+)
+sys.exit()
+if delimiters["genes"]["before"] == []:
+print(
+"ERROR: No gene with the name {} could be found in the annotation file".format(
+user_gene_name
+)
+)
+sys.exit()
+master_dict = {}
+coding_dict = {}
+notinfasta = open("notinfasta.csv", "w")
+# Given a nucleotide sequence returns the positions of all start and stop codons.
+def get_start_stops(transcript_sequence):
+transcript_sequence = transcript_sequence.upper()
+start_codons = ["ATG"]
+stop_codons = ["TAA", "TAG", "TGA"]
+seq_frames = {"starts": [], "stops": []}
+for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")):
+if len(codons) > 1:
+pat = re.compile("|".join(codons))
+else:
+pat = re.compile(codons[0])
+for m in re.finditer(pat, transcript_sequence):
+# Increment position by 1, Frame 1 starts at position 1 not 0,
+# if it's a stop codon add another 2 so it points to the last nuc of the codon
+if positions == "starts":
+start = m.start() + 1
+else:
+start = m.start() + 3
+seq_frames[positions].append(start)
+return seq_frames
+# parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons.
+fasta_file = open(sys.argv[3], "r")
+read_fasta = fasta_file.read()
+split_fasta = read_fasta.split(">")
+for entry in split_fasta[1:]:
+newline_split = entry.split("\n")
+tran = newline_split[0]
+for item in delimiters["transcripts"]["after"]:
+if item in tran:
+tran = tran.split(item)[0]
+tran = tran.replace("-", "_").replace("(", "").replace(")", "")
+seq = "".join(newline_split[1:])
+if "_PAR_Y" in tran:
+tran += "_chrY"
+elif "_PAR_X" in tran:
+tran += "_chrX"
+tran = tran.upper()
+starts_stops = get_start_stops(seq)
+print("tran", tran)
+if tran not in master_dict:
+master_dict[tran] = {
+"utr": [],
+"cds": [],
+"exon": [],
+"start_codon": [],
+"stop_codon": [],
+"start_list": starts_stops["starts"],
+"stop_list": starts_stops["stops"],
+"transcript": [],
+"strand": "",
+"gene_name": "",
+"chrom": "",
+"seq": seq,
+"cds_start": "NULL",
+"cds_stop": "NULL",
+"length": len(seq),
+"principal": 0,
+"version": "NULL",
+}
+def to_ranges(iterable):
+tup_list = []
+iterable = sorted(set(iterable))
+for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
+group = list(group)
+tup_list.append((group[0][1], group[-1][1]))
+return tup_list
+# parse annotation file to get chromsome, exon location and CDS info for each transcript
+def parse_gtf_file(annotation_file):
+for line in annotation_file:
+if line == "\n":
+continue
+if line[0] != "#":
+splitline = (line.replace("\n", "")).split("\t")
+chrom = splitline[0]
+try:
+annot_type = splitline[2].lower()
+except:
+print(
+"ERROR tried to index to second item in splitline: ",
+splitline,
+line,
+)
+sys.exit()
+# if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]:
+# 	continue
+if (
+annot_type not in delimiters["transcripts"]["annot_types"]
+and annot_type not in delimiters["genes"]["annot_types"]
+):
+continue
+if annot_type == "five_prime_utr" or annot_type == "three_prime_utr":
+annot_type = "utr"
+strand = splitline[6]
+if strand == "+":
+start = int(splitline[3])
+end = int(splitline[4])
+else:
+start = int(splitline[3]) + 1
+end = int(splitline[4]) + 1
+desc = splitline[8]
+tran = desc
+gene = desc
+for item in delimiters["transcripts"]["before"]:
+if item in tran:
+tran = tran.split(item)[1]
+for item in delimiters["transcripts"]["after"]:
+if item in tran:
+tran = tran.split(item)[0]
+if "." in tran and TRAN_VERSION == True:
+# print ("raw tran",tran)
+tran = tran.split(".")
+version = int(tran[-1].split("_")[0])
+tran = tran[0]
+else:
+version = "NULL"
+tran = tran.replace("-", "_").replace(".", "_")
+tran = tran.replace("(", "").replace(")", "")
+tran = tran.replace(" ", "").replace("\t", "")
+tran = tran.upper()
+tran = tran.replace("GENE_", "").replace("ID_", "")
+if "_PAR_Y" in desc:
+# print ("adding _PAR_Y to tran")
+tran = tran + "_PAR_Y"
+# print ("New tran ", tran)
+# if "PAR_Y" in line:
+# 	print (line)
+# 	#sys.exit()
+# print ("tran",tran,version)
+# if tran == "ENST00000316448":
+# 	print ("annot type",annot_type)
+# 	print ("appending exon to tran", start, end,line)
+gene_found = False
+if annot_type in delimiters["genes"]["annot_types"]:
+for item in delimiters["genes"]["before"]:
+if item in gene:
+gene_found = True
+gene = gene.split(item)[1]
+for item in delimiters["genes"]["after"]:
+if item in gene:
+gene = gene.split(item)[0]
+gene = gene.replace("'", "''")
+gene = gene.replace("GENE_", "")
+gene = gene.replace("ID_", "")
+gene = gene.upper()
+if tran in master_dict:
+master_dict[tran]["strand"] = strand
+if strand == "+":
+if annot_type in master_dict[tran]:
+master_dict[tran][annot_type].append((start, end))
+else:
+if annot_type in master_dict[tran]:
+master_dict[tran][annot_type].append((start, end))
+master_dict[tran]["chrom"] = chrom
+master_dict[tran]["version"] = version
+if gene_found == True:
+master_dict[tran]["gene_name"] = gene
+else:
+notinfasta.write("{}\n".format(tran))
+annotation_file = open(sys.argv[2], "r")
+parse_gtf_file(annotation_file)
+# remove transcripts that were in fasta file but not in annotation_file
+notinannotation = []
+for tran in master_dict:
+if master_dict[tran]["chrom"] == "":
+# print ("tran {} has no chrom :(".format(tran))
+notinannotation.append(tran)
+for tran in notinannotation:
+print(tran)
+del master_dict[tran]
+# Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
+coding_genes_dict = {}
+# parse master_dict to calculate length, cds_start/stop and exon junction positions
+for tran in master_dict:
+try:
+transeq = master_dict[tran]["seq"]
+except Exception as e:
+print("not in fasta", tran)
+notinfasta.write("{}\n".format(tran))
+continue
+exon_junctions = []
+total_length = len(transeq)
+three_len = 1
+five_len = 1
+strand = master_dict[tran]["strand"]
+if master_dict[tran]["gene_name"] == "":
+master_dict[tran]["gene_name"] = tran
+gene = master_dict[tran]["gene_name"]
+if gene not in coding_genes_dict:
+coding_genes_dict[gene] = False
+if master_dict[tran]["cds"] == []:
+tran_type = "noncoding"
+cds_start = "NULL"
+cds_stop = "NULL"
+else:
+# get utr lengths from annotation
+tran_type = "coding"
+coding_genes_dict[gene] = True
+sorted_exons = sorted(master_dict[tran]["exon"])
+sorted_cds = sorted(master_dict[tran]["cds"])
+min_cds = sorted_cds[0][0]
+# Some annotation files do not have utr annotation types, so fix that here if thats the case
+if master_dict[tran]["utr"] == []:
+for exon_tup in master_dict[tran]["exon"]:
+if exon_tup not in master_dict[tran]["cds"]:
+# Now check if this overlaps with any of the CDS exons
+overlap = False
+for cds_tup in master_dict[tran]["cds"]:
+if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
+master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1]))
+overlap = True
+if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
+master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0]))
+overlap = True
+if overlap == False:
+master_dict[tran]["utr"].append(exon_tup)
+for tup in sorted(master_dict[tran]["utr"]):
+if tup[0] < min_cds:
+five_len += (tup[1] - tup[0]) + 1
+elif tup[0] > min_cds:
+three_len += (tup[1] - tup[0]) + 1
+else:
+pass
+if strand == "+":
+if len(sorted_exons) > 1:
+sorted_exons[0] = (
+sorted_exons[0][0] - pseudo_utr_len,
+sorted_exons[0][1],
+)
+sorted_exons[-1] = (
+sorted_exons[-1][0],
+sorted_exons[-1][1] + pseudo_utr_len,
+)
+else:
+sorted_exons[0] = (
+sorted_exons[0][0] - pseudo_utr_len,
+sorted_exons[0][1] + pseudo_utr_len,
+)
+master_dict[tran]["exon"] = sorted_exons
+cds_start = five_len + pseudo_utr_len
+cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4
+elif strand == "-":
+if len(sorted_exons) > 1:
+sorted_exons[0] = (
+(sorted_exons[0][0] - pseudo_utr_len),
+sorted_exons[0][1],
+)
+sorted_exons[-1] = (
+sorted_exons[-1][0],
+(sorted_exons[-1][1] + pseudo_utr_len),
+)
+else:
+sorted_exons[0] = (
+(sorted_exons[0][0] - pseudo_utr_len),
+(sorted_exons[0][1] + pseudo_utr_len),
+)
+master_dict[tran]["exon"] = sorted_exons
+cds_start = three_len + pseudo_utr_len
+cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4
+# if tran == "ENST00000381401":
+# 	print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len)
+# 	#sys.exit()
+else:
+print("strand is unknown: {}".format(strand))
+sys.exit()
+# get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop
+total_intronic = 0
+try:
+if strand == "+":
+tx_start = min(sorted(master_dict[tran]["exon"]))[0]
+prev_end = tx_start
+for tup in sorted(master_dict[tran]["exon"])[:-1]:
+total_intronic += tup[0] - prev_end
+exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
+prev_end = tup[1]
+elif strand == "-":
+tx_start = max(sorted(master_dict[tran]["exon"]))[-1]
+prev_end = tx_start
+for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]:
+total_intronic += (tup[0] + 1) - prev_end
+exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
+prev_end = tup[1]
+except:
+if strand == "+":
+tx_start = min(sorted(master_dict[tran]["cds"]))[0]
+prev_end = tx_start
+for tup in sorted(master_dict[tran]["cds"])[:-1]:
+total_intronic += tup[0] - prev_end
+exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
+prev_end = tup[1]
+elif strand == "-":
+tx_start = max(sorted(master_dict[tran]["cds"]))[-1]
+prev_end = tx_start
+for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]:
+total_intronic += (tup[0] + 1) - prev_end
+exon_junctions.append(((tup[1]) - tx_start) - total_intronic)
+prev_end = tup[1]
+# This can happen when a coding transcript doesn't have a properly annotated 3' trailer
+if cds_stop != "NULL":
+if cds_stop > total_length:
+cds_stop = total_length
+if strand == "+" and cds_start != "NULL":
+master_dict[tran]["cds_start"] = cds_start
+master_dict[tran]["cds_stop"] = cds_stop
+elif strand == "-" and cds_start != "NULL":
+master_dict[tran]["cds_start"] = cds_start
+master_dict[tran]["cds_stop"] = cds_stop
+master_dict[tran]["strand"] = strand
+master_dict[tran]["tran_type"] = tran_type
+master_dict[tran]["exon_junctions"] = exon_junctions
+longest_tran_dict = {}
+for tran in master_dict:
+try:
+gene = master_dict[tran]["gene_name"]
+except:
+continue
+if coding_genes_dict[gene] == True:
+if "cds_start" in master_dict[tran]:
+if (
+master_dict[tran]["cds_stop"] != "NULL"
+and master_dict[tran]["cds_start"] != "NULL"
+):
+cds_length = (
+master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"]
+)
+if gene not in longest_tran_dict:
+longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
+else:
+if cds_length > longest_tran_dict[gene]["length"]:
+longest_tran_dict[gene] = {"tran": tran, "length": cds_length}
+if cds_length == longest_tran_dict[gene]["length"]:
+if (
+master_dict[tran]["length"]
+> master_dict[longest_tran_dict[gene]["tran"]]["length"]
+):
+longest_tran_dict[gene] = {
+"tran": tran,
+"length": cds_length,
+}
+else:
+length = master_dict[tran]["length"]
+if gene not in longest_tran_dict:
+longest_tran_dict[gene] = {"tran": tran, "length": length}
+elif length > longest_tran_dict[gene]["length"]:
+longest_tran_dict[gene] = {"tran": tran, "length": length}
+for gene in longest_tran_dict:
+longest_tran = longest_tran_dict[gene]["tran"]
+master_dict[longest_tran]["principal"] = 1
+gene_sample = []
+for key in list(master_dict)[:10]:
+try:
+gene_sample.append(master_dict[key]["gene_name"])
+except:
+pass
+print(master_dict)
+print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10]))
+print("Here is a sample of the gene names: {}".format(gene_sample))
+# Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time.
+def tran_to_genome(tran, start_pos, end_pos, master_dict):
+pos_list = []
+for i in range(start_pos, end_pos + 1):
+pos_list.append(i)
+genomic_pos_list = []
+if tran in master_dict:
+transcript_info = master_dict[tran]
+else:
+return ("Null", [])
+chrom = transcript_info["chrom"]
+strand = transcript_info["strand"]
+exons = sorted(transcript_info["exon"])
+# print ("chrom,strand,exons",chrom,strand,exons)
+for pos in pos_list:
+# print ("pos",pos)
+if strand == "+":
+exon_start = 0
+for tup in exons:
+# print ("tup",tup)
+exon_start = tup[0]
+exonlen = tup[1] - tup[0]
+if pos > exonlen:
+pos = (pos - exonlen) - 1
+else:
+break
+# print ("appending exon_start-pos", exon_start, pos, exon_start+pos)
+genomic_pos_list.append((exon_start + pos) - 1)
+elif strand == "-":
+exon_start = 0
+for tup in exons[::-1]:
+# print ("tup",tup)
+exon_start = tup[1]
+exonlen = tup[1] - tup[0]
+# print ("exonlen",exonlen)
+if pos > exonlen:
+# print ("pos is greater")
+pos = (pos - exonlen) - 1
+# print ("new pos",pos)
+else:
+break
+# print ("appending exon_start-pos", exon_start, pos, exon_start-pos)
+genomic_pos_list.append((exon_start - pos) + 1)
+return (chrom, genomic_pos_list)
+orf_dict = {
+"uorf": {},
+"ouorf": {},
+"cds": {},
+"nested": {},
+"odorf": {},
+"dorf": {},
+"minusone": {},
+"readthrough": {},
+"plusone": {},
+"noncoding": {},
+"extension": {},
+"inframe_stop": {},
+}
+start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"]
+stop_codons = ["TAG", "TAA", "TGA"]
+# Keep track of the longest transcript for each noncoding gene, append this to transcript list later
+longest_noncoding = {}
+tran_count = 0
+# This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those
+# in the transcript list)
+genomic_cds_dict = {}
+tree_dict = {}
+for transcript in master_dict:
+# print (transcript, master_dict[transcript]["tran_type"])
+tran_count += 1
+if "seq" not in master_dict[transcript]:
+continue
+chrom = master_dict[transcript]["chrom"]
+if chrom not in genomic_cds_dict:
+genomic_cds_dict[chrom] = []
+if "cds_start" in master_dict[transcript]:
+cds_start = master_dict[transcript]["cds_start"]
+cds_stop = master_dict[transcript]["cds_stop"]
+if cds_start != "NULL":
+cds_pos = []
+for i in range(cds_start, cds_stop + 1):
+cds_pos.append(i)
+for tup in master_dict[transcript]["cds"]:
+if tup[0] != tup[1]:
+if tup not in genomic_cds_dict[chrom]:
+genomic_cds_dict[chrom].append(tup)
+print("genomic cds dict built")
+print(list(genomic_cds_dict))
+for chrom in genomic_cds_dict:
+tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
+# print (list(tree_dict))
+connection = sqlite3.connect("{}.sqlite".format(organism))
+cursor = connection.cursor()
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+cursor.execute(
+"CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));"
+)
+connection.commit()
+print("Finding ORFs")
+transcript_count = 0
+total_transcripts = len(list(master_dict))
+for transcript in master_dict:
+# print ("transcript",transcript)
+# if transcript != "ENST00000316448":
+# 	continue
+transcript_count += 1
+if transcript_count % 100 == 0:
+print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts))
+if "seq" not in master_dict[transcript]:
+print("transcript {} has no sequence".format(transcript))
+continue
+seq = master_dict[transcript]["seq"]
+cds_start = "NULL"
+cds_stop = "NULL"
+transcript_len = len(seq)
+if "cds_start" in master_dict[transcript]:
+cds_start = master_dict[transcript]["cds_start"]
+cds_stop = master_dict[transcript]["cds_stop"]
+# Find out what regions of this transcript overlap with any other coding regions
+coding_positions = []
+if cds_start != "NULL":
+# If this is a coding transcript don't bother checking the CDS
+for i in range(cds_start, cds_stop):
+coding_positions.append(i)
+# check 5' leader
+chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
+for i in range(0, cds_start):
+genomic_pos = pos_list[i]
+overlap = tree_dict[chrom][genomic_pos]
+if len(overlap) != 0:
+coding_positions.append(i)
+# check 3' trailer
+chrom, pos_list = tran_to_genome(
+transcript, cds_stop, transcript_len, master_dict
+)
+for i in range(cds_stop, transcript_len + 1):
+# print ("i",i)
+genomic_pos = pos_list[i - cds_stop]
+# print ("genomic position",genomic_pos)
+overlap = tree_dict[chrom][genomic_pos]
+if len(overlap) != 0:
+# print ("overlap not empty appending i",overlap)
+coding_positions.append(i)
+else:
+# check entire transcript
+chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
+for i in range(0, transcript_len):
+genomic_pos = pos_list[i]
+overlap = tree_dict[chrom][genomic_pos]
+if len(overlap) != 0:
+coding_positions.append(i)
+coding_positions_tuple = to_ranges(coding_positions)
+coding_dict[transcript] = coding_positions_tuple
+coding_positions = set(coding_positions)
+# if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates
+if cds_start != "NULL":
+# print ("transcript", transcript)
+# if pseudo_utr_len != 0:
+# 	cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
+recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"}
+for addition in recoding_dict:
+orftype = recoding_dict[addition]
+for i in range(cds_stop + addition, transcript_len, 3):
+if seq[i : i + 3] in stop_codons:
+# orf_seq = seq[cds_stop:i+3]
+orf_start = cds_stop
+if orftype == "readthrough":
+orf_start -= 2
+if orftype == "plusone":
+orf_start -= 1
+orf_stop = i + 3  # +2 so it refers to the end of the stop codon
+start_codon = None
+length = (i + 3) - cds_stop
+orf_pos_list = []
+# determine how many nucleotides in this orf overlap with an annotated coding region
+cds_cov_count = 0.0
+for position in range(orf_start, orf_stop):
+orf_pos_list.append(position)
+for pos in range(orf_start, orf_stop + 1):
+if pos in coding_positions:
+cds_cov_count += 1
+cds_cov = cds_cov_count / length
+# print ("orftype, start, stop", orftype, orf_start, orf_stop)
+cursor.execute(
+"INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
+orftype,
+transcript,
+start_codon,
+length,
+orf_start,
+orf_stop,
+cds_cov,
+)
+)
+break
+# sys.exit()
+for frame in [0, 1, 2]:
+for i in range(frame, transcript_len, 3):
+if seq[i : i + 3] in start_codons:
+for x in range(i, transcript_len, 3):
+if seq[x : x + 3] in stop_codons:
+# orf_seq = seq[i:x+3]
+orf_start = i + 1
+orf_stop = x + 3  # +2 so it refers to the end of the stop codon
+start_codon = seq[i : i + 3]
+length = (x + 3) - i
+orf_pos_list = []
+# determine how many nucleotides in this orf overlap with an annotated coding region
+cds_cov_count = 0.0
+for pos in range(orf_start, orf_stop + 1):
+if pos in coding_positions:
+cds_cov_count += 1
+cds_cov = float(cds_cov_count) / float(length)
+# Now determine orf type
+if cds_start == "NULL":
+orftype = "noncoding"
+else:
+# print ("cds start is not null :{}:{}".format(cds_start,cds_stop))
+# print ("orf start, orf stop", orf_start, orf_stop)
+if orf_start == cds_start and orf_stop == cds_stop:
+orftype = "cds"
+# print ("orf type is cds")
+elif orf_start < cds_start and orf_stop == cds_stop:
+orftype = "extension"
+# special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage
+orf_stop = cds_start
+cds_cov_count = 0.0
+for pos in range(orf_start, orf_stop + 1):
+if pos in coding_positions:
+cds_cov_count += 1
+cds_cov = float(cds_cov_count) / float(length)
+# orf_seq = seq[orf_start:cds_start]
+length = cds_start - orf_start
+# print ("orf type is extension")
+elif orf_start < cds_start and orf_stop <= cds_start:
+orftype = "uorf"
+# print ("orf type is uorf")
+elif orf_start < cds_start and orf_stop > cds_start:
+orftype = "ouorf"
+# print ("orf type is ouorf")
+# sys.exit()
+elif (
+orf_start >= cds_start
+and orf_start <= cds_stop
+and orf_stop <= cds_stop
+):
+if orf_stop == cds_stop:
+break
+# print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len)
+if (
+orf_stop < transcript_len
+and orf_stop % 3 == cds_stop % 3
+) or (
+cds_start != 1
+and orf_stop % 3 == (cds_start + 2) % 3
+):
+# print ("Transcript {} has an inframe stop codon".format(transcript))
+break
+orftype = "nested"
+# print ("orf type is nested")
+elif (
+orf_start >= cds_start
+and orf_start <= cds_stop
+and orf_stop > cds_stop
+):
+orftype = "odorf"
+# print ("orftype is odorf")
+elif orf_start > cds_stop and orf_stop > cds_stop:
+orftype = "dorf"
+# print ("orftype is dorf")
+# if orf_stop > cds_start and orf_stop < cds_stop:
+# 	if (orf_stop+1)%3 == cds_start%3:
+# 		orftype = "inframe_stop"
+# 		print ("inframe stop, transcript, orf_stop", transcript, orf_stop)
+# 		sys.exit()
+# 		if transcript not in orf_dict:
+# 			orf_dict[orftype][transcript] = []
+# 	#print ("some weird stop or something")
+cursor.execute(
+"INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format(
+orftype,
+transcript,
+start_codon,
+length,
+orf_start,
+orf_stop,
+cds_cov,
+)
+)
+break
+# Used to keep track of the codons at cds_start and cds_stop positions,
+# If there is an issue with the cds co-ordinates the starts and stops counts will
+# be much lower than the other count, start with 1 to prevent division by 0
+nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}}
+def calcgc(seq):
+if seq == "":
+return "NULL"
+g_count = 0
+c_count = 0
+a_count = 0
+t_count = 0
+for char in seq:
+if char == "A":
+a_count += 1
+if char == "T":
+t_count += 1
+if char == "G":
+g_count += 1
+if char == "C":
+c_count += 1
+gc = ((g_count + c_count) / float(len(seq))) * 100
+return round(gc, 2)
+for transcript in master_dict:
+# print ("transcripts", transcript)
+length = master_dict[transcript]["length"]
+cds_start = master_dict[transcript]["cds_start"]
+cds_stop = master_dict[transcript]["cds_stop"]
+seq = master_dict[transcript]["seq"]
+strand = master_dict[transcript]["strand"]
+chrom = master_dict[transcript]["chrom"]
+gene = master_dict[transcript]["gene_name"]
+gc = calcgc(seq)
+five_gc = "NULL"
+cds_gc = "NULL"
+three_gc = "NULL"
+if cds_start != "NULL":
+five_gc = calcgc(seq[:cds_start])
+cds_gc = calcgc(seq[cds_start:cds_stop])
+three_gc = calcgc(seq[cds_stop:])
+# check that the nucleotide cds_start points to is the first of the start codon
+# take one becase cds_start is 1 based but python indexing is 0 based
+start_nuc = seq[cds_start - 1 : cds_start + 2]
+# print ("start nuc",start_nuc)
+if start_nuc == "ATG":
+nuc_dict["starts"]["starts"] += 1
+else:
+nuc_dict["starts"]["other"] += 1
+stop_nuc = seq[cds_stop - 3 : cds_stop]
+# print ("stop_nuc",stop_nuc)
+if stop_nuc in ["TAG", "TAA", "TGA"]:
+nuc_dict["stops"]["stops"] += 1
+else:
+nuc_dict["stops"]["other"] += 1
+tran_type = master_dict[transcript]["tran_type"]
+if coding_genes_dict[gene] == True:
+gene_type = 1
+else:
+gene_type = 0
+# print ("tran type before",tran_type)
+if tran_type == "coding":
+tran_type = 1
+else:
+tran_type = 0
+# print ("tran type after",tran_type)
+start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]")
+stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]")
+exon_junctions = (
+str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]")
+)
+principal = master_dict[transcript]["principal"]
+version = master_dict[transcript]["version"]
+# print (master_dict[transcript])
+# print (tran_type)
+# print (gene_type)
+# print (principal)
+# print (version)
+# print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version))
+cursor.execute(
+"INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(
+transcript,
+gene,
+length,
+cds_start,
+cds_stop,
+seq,
+strand,
+stop_list,
+start_list,
+exon_junctions,
+tran_type,
+gene_type,
+principal,
+version,
+gc,
+five_gc,
+cds_gc,
+three_gc,
+chrom,
+)
+)
+for tup in master_dict[transcript]["exon"]:
+cursor.execute(
+"INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1])
+)
+if transcript in coding_dict:
+for tup in coding_dict[transcript]:
+cursor.execute(
+"INSERT INTO coding_regions VALUES('{}',{},{});".format(
+transcript, tup[0], tup[1]
+)
+)
+connection.commit()
+connection.close()
+print("delim", delimiters)
+if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05:
+print(
+"Warning: {} transcripts do not have a an AUG at the CDS start position".format(
+nuc_dict["starts"]["other"]
+)
+)
+if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05:
+print(
+"Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(
+nuc_dict["stops"]["other"]
+)
+)
+if len(notinannotation) > 0:
+print(
+"Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(
+len(notinannotation)
+)
+)

Mercurial > repos > jackcurragh > trips_viz_create_annotation

comparison trips_create_annotation/create_annotation_sqlite.py @ 0:f24aed7a5cc7 draft