Mercurial > repos > jackcurragh > trips_viz_create_annotation
diff trips_create_annotation/create_annotation_sqlite.py @ 0:f24aed7a5cc7 draft
Uploaded
author | jackcurragh |
---|---|
date | Mon, 04 Apr 2022 09:30:51 +0000 |
parents | |
children | d70696d3341e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trips_create_annotation/create_annotation_sqlite.py Mon Apr 04 09:30:51 2022 +0000 @@ -0,0 +1,994 @@ +# Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file +# and produces an sqlite file which can be uploaded to Trips-Viz +# All co-ordinates produced are 1 based +# All start codon positions (including cds_start) should be at the first nucleotide of the codon +# All stop codon positions (including cds_stop) should be at the last nucleotide of the codon +import sys +import re +import sqlite3 +from intervaltree import Interval, IntervalTree +import itertools +from sqlitedict import SqliteDict +import os + +organism = sys.argv[1] +# This should be a GTF or GFF3 file +annotation_file = open(sys.argv[2], "r") +# This needs to be the transcriptomic fasta file +fasta_file = open(sys.argv[3], "r") +# This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs +pseudo_utr_len = int(sys.argv[4]) +# An example of a transcript_id from the annotation file, e.g ENST000000123456 +user_transcript_id = sys.argv[5] +# An example of a gene name from the annotation file +user_gene_name = sys.argv[6] +# Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1 +TRAN_VERSION = True + + +if os.path.isfile("{}.sqlite".format(organism)): + print("{}.sqlite already exists".format(organism)) + sys.exit() + + +# old_exons = SqliteDict( +# "/home/DATA/www/tripsviz/tripsviz/trips_annotations/mus_musculus/transcriptomic_to_genomic.sqlite" +# ) + + +delimiters = { + "transcripts": {"before": [], "after": ["."], "annot_types": ["cds", "utr"]}, + "genes": {"before": [], "after": ['"'], "annot_types": ["lnc_rna"]}, +} + +punctuation = [";", " ", "-", ":", "-", ".", "=", "\t"] +# Find delimiters in the annotation and fasta files using the user_transcript_id +# and user_gene_name examples given by user. +for line in annotation_file: + if user_transcript_id in line: + tabsplitline = line.split("\t") + annot_type = tabsplitline[2] + if annot_type not in delimiters["transcripts"]["annot_types"]: + delimiters["transcripts"]["annot_types"].append(annot_type.lower()) + splitline = line.split(user_transcript_id) + before_delimiter = splitline[0] + for item in punctuation: + if item in before_delimiter: + if len(before_delimiter.split(item)[-1]) >= 5: + before_delimiter = before_delimiter.split(item)[-1] + after_delimiter = splitline[1][:2] + if ( + before_delimiter not in delimiters["transcripts"]["before"] + and len(before_delimiter) >= 5 + ): + delimiters["transcripts"]["before"].append(before_delimiter) + if after_delimiter not in delimiters["transcripts"]["after"]: + delimiters["transcripts"]["after"].append(after_delimiter) + if user_gene_name in line: + tabsplitline = line.split("\t") + annot_type = tabsplitline[2] + print("ANNOT TYPE", annot_type) + if annot_type not in delimiters["genes"]["annot_types"]: + delimiters["genes"]["annot_types"].append(annot_type.lower()) + splitline = line.split(user_gene_name) + before_delimiter = splitline[0] + for item in punctuation: + if item in before_delimiter: + if len(before_delimiter.split(item)[-1]) >= 5: + before_delimiter = before_delimiter.split(item)[-1] + after_delimiter = splitline[1][0] + if ( + before_delimiter not in delimiters["genes"]["before"] + and len(before_delimiter) >= 5 + ): + delimiters["genes"]["before"].append(before_delimiter) + if after_delimiter not in delimiters["genes"]["after"]: + if after_delimiter in punctuation: + delimiters["genes"]["after"].append(after_delimiter) + +print("delimeters[genes]", delimiters["transcripts"]["annot_types"]) + +for line in fasta_file: + if user_transcript_id in line: + splitline = line.split(user_transcript_id) + before_delimiter = splitline[0] + for item in punctuation: + if item in before_delimiter: + if len(before_delimiter.split(item)[1]) >= 5: + before_delimiter = before_delimiter.split(item)[1] + after_delimiter = splitline[1][0] + if ( + before_delimiter not in delimiters["transcripts"]["before"] + and len(before_delimiter) >= 5 + ): + delimiters["transcripts"]["before"].append(before_delimiter) + if after_delimiter not in delimiters["transcripts"]["after"]: + delimiters["transcripts"]["after"].append(after_delimiter) +fasta_file.close() +annotation_file.close() + + +if delimiters["transcripts"]["before"] == []: + print( + "ERROR: No transcript_id with the name {} could be found in the annotation file".format( + user_transcript_id + ) + ) + sys.exit() +if delimiters["genes"]["before"] == []: + print( + "ERROR: No gene with the name {} could be found in the annotation file".format( + user_gene_name + ) + ) + sys.exit() + +master_dict = {} +coding_dict = {} +notinfasta = open("notinfasta.csv", "w") + +# Given a nucleotide sequence returns the positions of all start and stop codons. +def get_start_stops(transcript_sequence): + transcript_sequence = transcript_sequence.upper() + start_codons = ["ATG"] + stop_codons = ["TAA", "TAG", "TGA"] + seq_frames = {"starts": [], "stops": []} + for codons, positions in ((start_codons, "starts"), (stop_codons, "stops")): + if len(codons) > 1: + pat = re.compile("|".join(codons)) + else: + pat = re.compile(codons[0]) + for m in re.finditer(pat, transcript_sequence): + # Increment position by 1, Frame 1 starts at position 1 not 0, + # if it's a stop codon add another 2 so it points to the last nuc of the codon + if positions == "starts": + start = m.start() + 1 + else: + start = m.start() + 3 + seq_frames[positions].append(start) + return seq_frames + + +# parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons. +fasta_file = open(sys.argv[3], "r") +read_fasta = fasta_file.read() +split_fasta = read_fasta.split(">") +for entry in split_fasta[1:]: + newline_split = entry.split("\n") + tran = newline_split[0] + for item in delimiters["transcripts"]["after"]: + if item in tran: + tran = tran.split(item)[0] + tran = tran.replace("-", "_").replace("(", "").replace(")", "") + seq = "".join(newline_split[1:]) + if "_PAR_Y" in tran: + tran += "_chrY" + elif "_PAR_X" in tran: + tran += "_chrX" + tran = tran.upper() + starts_stops = get_start_stops(seq) + print("tran", tran) + if tran not in master_dict: + master_dict[tran] = { + "utr": [], + "cds": [], + "exon": [], + "start_codon": [], + "stop_codon": [], + "start_list": starts_stops["starts"], + "stop_list": starts_stops["stops"], + "transcript": [], + "strand": "", + "gene_name": "", + "chrom": "", + "seq": seq, + "cds_start": "NULL", + "cds_stop": "NULL", + "length": len(seq), + "principal": 0, + "version": "NULL", + } + + +def to_ranges(iterable): + tup_list = [] + iterable = sorted(set(iterable)) + for key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]): + group = list(group) + tup_list.append((group[0][1], group[-1][1])) + return tup_list + + +# parse annotation file to get chromsome, exon location and CDS info for each transcript +def parse_gtf_file(annotation_file): + for line in annotation_file: + if line == "\n": + continue + if line[0] != "#": + splitline = (line.replace("\n", "")).split("\t") + chrom = splitline[0] + try: + annot_type = splitline[2].lower() + except: + print( + "ERROR tried to index to second item in splitline: ", + splitline, + line, + ) + sys.exit() + # if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]: + # continue + if ( + annot_type not in delimiters["transcripts"]["annot_types"] + and annot_type not in delimiters["genes"]["annot_types"] + ): + continue + if annot_type == "five_prime_utr" or annot_type == "three_prime_utr": + annot_type = "utr" + strand = splitline[6] + if strand == "+": + start = int(splitline[3]) + end = int(splitline[4]) + else: + start = int(splitline[3]) + 1 + end = int(splitline[4]) + 1 + desc = splitline[8] + tran = desc + gene = desc + for item in delimiters["transcripts"]["before"]: + if item in tran: + tran = tran.split(item)[1] + for item in delimiters["transcripts"]["after"]: + if item in tran: + tran = tran.split(item)[0] + if "." in tran and TRAN_VERSION == True: + # print ("raw tran",tran) + tran = tran.split(".") + version = int(tran[-1].split("_")[0]) + tran = tran[0] + else: + version = "NULL" + tran = tran.replace("-", "_").replace(".", "_") + tran = tran.replace("(", "").replace(")", "") + tran = tran.replace(" ", "").replace("\t", "") + tran = tran.upper() + tran = tran.replace("GENE_", "").replace("ID_", "") + if "_PAR_Y" in desc: + # print ("adding _PAR_Y to tran") + tran = tran + "_PAR_Y" + # print ("New tran ", tran) + # if "PAR_Y" in line: + # print (line) + # #sys.exit() + # print ("tran",tran,version) + # if tran == "ENST00000316448": + # print ("annot type",annot_type) + # print ("appending exon to tran", start, end,line) + + gene_found = False + + if annot_type in delimiters["genes"]["annot_types"]: + for item in delimiters["genes"]["before"]: + if item in gene: + gene_found = True + gene = gene.split(item)[1] + for item in delimiters["genes"]["after"]: + if item in gene: + gene = gene.split(item)[0] + gene = gene.replace("'", "''") + gene = gene.replace("GENE_", "") + gene = gene.replace("ID_", "") + gene = gene.upper() + if tran in master_dict: + master_dict[tran]["strand"] = strand + if strand == "+": + if annot_type in master_dict[tran]: + master_dict[tran][annot_type].append((start, end)) + else: + if annot_type in master_dict[tran]: + master_dict[tran][annot_type].append((start, end)) + master_dict[tran]["chrom"] = chrom + master_dict[tran]["version"] = version + if gene_found == True: + master_dict[tran]["gene_name"] = gene + else: + notinfasta.write("{}\n".format(tran)) + + +annotation_file = open(sys.argv[2], "r") +parse_gtf_file(annotation_file) + + +# remove transcripts that were in fasta file but not in annotation_file +notinannotation = [] +for tran in master_dict: + if master_dict[tran]["chrom"] == "": + # print ("tran {} has no chrom :(".format(tran)) + notinannotation.append(tran) +for tran in notinannotation: + print(tran) + del master_dict[tran] +# Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True +coding_genes_dict = {} +# parse master_dict to calculate length, cds_start/stop and exon junction positions +for tran in master_dict: + try: + transeq = master_dict[tran]["seq"] + except Exception as e: + print("not in fasta", tran) + notinfasta.write("{}\n".format(tran)) + continue + exon_junctions = [] + total_length = len(transeq) + three_len = 1 + five_len = 1 + strand = master_dict[tran]["strand"] + if master_dict[tran]["gene_name"] == "": + master_dict[tran]["gene_name"] = tran + gene = master_dict[tran]["gene_name"] + if gene not in coding_genes_dict: + coding_genes_dict[gene] = False + + if master_dict[tran]["cds"] == []: + tran_type = "noncoding" + cds_start = "NULL" + cds_stop = "NULL" + else: + # get utr lengths from annotation + tran_type = "coding" + coding_genes_dict[gene] = True + sorted_exons = sorted(master_dict[tran]["exon"]) + sorted_cds = sorted(master_dict[tran]["cds"]) + + min_cds = sorted_cds[0][0] + # Some annotation files do not have utr annotation types, so fix that here if thats the case + if master_dict[tran]["utr"] == []: + for exon_tup in master_dict[tran]["exon"]: + if exon_tup not in master_dict[tran]["cds"]: + # Now check if this overlaps with any of the CDS exons + overlap = False + for cds_tup in master_dict[tran]["cds"]: + if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]: + master_dict[tran]["utr"].append((cds_tup[1], exon_tup[1])) + overlap = True + if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]: + master_dict[tran]["utr"].append((exon_tup[0], cds_tup[0])) + overlap = True + if overlap == False: + master_dict[tran]["utr"].append(exon_tup) + + for tup in sorted(master_dict[tran]["utr"]): + if tup[0] < min_cds: + five_len += (tup[1] - tup[0]) + 1 + elif tup[0] > min_cds: + three_len += (tup[1] - tup[0]) + 1 + else: + pass + if strand == "+": + if len(sorted_exons) > 1: + sorted_exons[0] = ( + sorted_exons[0][0] - pseudo_utr_len, + sorted_exons[0][1], + ) + sorted_exons[-1] = ( + sorted_exons[-1][0], + sorted_exons[-1][1] + pseudo_utr_len, + ) + else: + sorted_exons[0] = ( + sorted_exons[0][0] - pseudo_utr_len, + sorted_exons[0][1] + pseudo_utr_len, + ) + master_dict[tran]["exon"] = sorted_exons + cds_start = five_len + pseudo_utr_len + cds_stop = ((total_length - three_len) - pseudo_utr_len) + 4 + elif strand == "-": + if len(sorted_exons) > 1: + sorted_exons[0] = ( + (sorted_exons[0][0] - pseudo_utr_len), + sorted_exons[0][1], + ) + sorted_exons[-1] = ( + sorted_exons[-1][0], + (sorted_exons[-1][1] + pseudo_utr_len), + ) + else: + sorted_exons[0] = ( + (sorted_exons[0][0] - pseudo_utr_len), + (sorted_exons[0][1] + pseudo_utr_len), + ) + master_dict[tran]["exon"] = sorted_exons + cds_start = three_len + pseudo_utr_len + cds_stop = ((total_length - (five_len)) - pseudo_utr_len) + 4 + # if tran == "ENST00000381401": + # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len) + # #sys.exit() + else: + print("strand is unknown: {}".format(strand)) + sys.exit() + # get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop + total_intronic = 0 + try: + if strand == "+": + tx_start = min(sorted(master_dict[tran]["exon"]))[0] + prev_end = tx_start + for tup in sorted(master_dict[tran]["exon"])[:-1]: + total_intronic += tup[0] - prev_end + exon_junctions.append(((tup[1]) - tx_start) - total_intronic) + prev_end = tup[1] + elif strand == "-": + tx_start = max(sorted(master_dict[tran]["exon"]))[-1] + prev_end = tx_start + for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]: + total_intronic += (tup[0] + 1) - prev_end + exon_junctions.append(((tup[1]) - tx_start) - total_intronic) + prev_end = tup[1] + except: + if strand == "+": + tx_start = min(sorted(master_dict[tran]["cds"]))[0] + prev_end = tx_start + for tup in sorted(master_dict[tran]["cds"])[:-1]: + total_intronic += tup[0] - prev_end + exon_junctions.append(((tup[1]) - tx_start) - total_intronic) + prev_end = tup[1] + elif strand == "-": + tx_start = max(sorted(master_dict[tran]["cds"]))[-1] + prev_end = tx_start + for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]: + total_intronic += (tup[0] + 1) - prev_end + exon_junctions.append(((tup[1]) - tx_start) - total_intronic) + prev_end = tup[1] + # This can happen when a coding transcript doesn't have a properly annotated 3' trailer + if cds_stop != "NULL": + if cds_stop > total_length: + cds_stop = total_length + if strand == "+" and cds_start != "NULL": + master_dict[tran]["cds_start"] = cds_start + master_dict[tran]["cds_stop"] = cds_stop + elif strand == "-" and cds_start != "NULL": + master_dict[tran]["cds_start"] = cds_start + master_dict[tran]["cds_stop"] = cds_stop + + master_dict[tran]["strand"] = strand + master_dict[tran]["tran_type"] = tran_type + master_dict[tran]["exon_junctions"] = exon_junctions + +longest_tran_dict = {} +for tran in master_dict: + try: + gene = master_dict[tran]["gene_name"] + except: + continue + if coding_genes_dict[gene] == True: + if "cds_start" in master_dict[tran]: + if ( + master_dict[tran]["cds_stop"] != "NULL" + and master_dict[tran]["cds_start"] != "NULL" + ): + cds_length = ( + master_dict[tran]["cds_stop"] - master_dict[tran]["cds_start"] + ) + if gene not in longest_tran_dict: + longest_tran_dict[gene] = {"tran": tran, "length": cds_length} + else: + if cds_length > longest_tran_dict[gene]["length"]: + longest_tran_dict[gene] = {"tran": tran, "length": cds_length} + if cds_length == longest_tran_dict[gene]["length"]: + if ( + master_dict[tran]["length"] + > master_dict[longest_tran_dict[gene]["tran"]]["length"] + ): + longest_tran_dict[gene] = { + "tran": tran, + "length": cds_length, + } + else: + length = master_dict[tran]["length"] + if gene not in longest_tran_dict: + longest_tran_dict[gene] = {"tran": tran, "length": length} + elif length > longest_tran_dict[gene]["length"]: + longest_tran_dict[gene] = {"tran": tran, "length": length} + + +for gene in longest_tran_dict: + longest_tran = longest_tran_dict[gene]["tran"] + master_dict[longest_tran]["principal"] = 1 + +gene_sample = [] +for key in list(master_dict)[:10]: + try: + gene_sample.append(master_dict[key]["gene_name"]) + except: + pass +print(master_dict) +print("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10])) +print("Here is a sample of the gene names: {}".format(gene_sample)) + + +# Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time. +def tran_to_genome(tran, start_pos, end_pos, master_dict): + pos_list = [] + for i in range(start_pos, end_pos + 1): + pos_list.append(i) + genomic_pos_list = [] + if tran in master_dict: + transcript_info = master_dict[tran] + else: + return ("Null", []) + + chrom = transcript_info["chrom"] + strand = transcript_info["strand"] + exons = sorted(transcript_info["exon"]) + # print ("chrom,strand,exons",chrom,strand,exons) + for pos in pos_list: + # print ("pos",pos) + if strand == "+": + exon_start = 0 + for tup in exons: + # print ("tup",tup) + exon_start = tup[0] + exonlen = tup[1] - tup[0] + if pos > exonlen: + pos = (pos - exonlen) - 1 + else: + break + # print ("appending exon_start-pos", exon_start, pos, exon_start+pos) + genomic_pos_list.append((exon_start + pos) - 1) + elif strand == "-": + exon_start = 0 + for tup in exons[::-1]: + # print ("tup",tup) + exon_start = tup[1] + exonlen = tup[1] - tup[0] + # print ("exonlen",exonlen) + if pos > exonlen: + # print ("pos is greater") + pos = (pos - exonlen) - 1 + # print ("new pos",pos) + else: + break + # print ("appending exon_start-pos", exon_start, pos, exon_start-pos) + genomic_pos_list.append((exon_start - pos) + 1) + return (chrom, genomic_pos_list) + + +orf_dict = { + "uorf": {}, + "ouorf": {}, + "cds": {}, + "nested": {}, + "odorf": {}, + "dorf": {}, + "minusone": {}, + "readthrough": {}, + "plusone": {}, + "noncoding": {}, + "extension": {}, + "inframe_stop": {}, +} + +start_codons = ["ATG", "CTG", "GTG", "TTG", "ATC", "ATA", "ATT", "ACG", "AAG", "AGG"] + +stop_codons = ["TAG", "TAA", "TGA"] + + +# Keep track of the longest transcript for each noncoding gene, append this to transcript list later +longest_noncoding = {} + + +tran_count = 0 +# This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those +# in the transcript list) +genomic_cds_dict = {} +tree_dict = {} +for transcript in master_dict: + # print (transcript, master_dict[transcript]["tran_type"]) + tran_count += 1 + if "seq" not in master_dict[transcript]: + continue + chrom = master_dict[transcript]["chrom"] + if chrom not in genomic_cds_dict: + genomic_cds_dict[chrom] = [] + if "cds_start" in master_dict[transcript]: + cds_start = master_dict[transcript]["cds_start"] + cds_stop = master_dict[transcript]["cds_stop"] + if cds_start != "NULL": + cds_pos = [] + for i in range(cds_start, cds_stop + 1): + cds_pos.append(i) + + for tup in master_dict[transcript]["cds"]: + if tup[0] != tup[1]: + if tup not in genomic_cds_dict[chrom]: + genomic_cds_dict[chrom].append(tup) + +print("genomic cds dict built") +print(list(genomic_cds_dict)) +for chrom in genomic_cds_dict: + tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom]) + +# print (list(tree_dict)) + + +connection = sqlite3.connect("{}.sqlite".format(organism)) +cursor = connection.cursor() +cursor.execute( + "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +cursor.execute( + "CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), cds_coverage FLOAT(20));" +) +connection.commit() + + +print("Finding ORFs") +transcript_count = 0 +total_transcripts = len(list(master_dict)) +for transcript in master_dict: + # print ("transcript",transcript) + # if transcript != "ENST00000316448": + # continue + transcript_count += 1 + if transcript_count % 100 == 0: + print("Transcripts complete: {}/{}".format(transcript_count, total_transcripts)) + if "seq" not in master_dict[transcript]: + print("transcript {} has no sequence".format(transcript)) + continue + seq = master_dict[transcript]["seq"] + cds_start = "NULL" + cds_stop = "NULL" + transcript_len = len(seq) + if "cds_start" in master_dict[transcript]: + cds_start = master_dict[transcript]["cds_start"] + cds_stop = master_dict[transcript]["cds_stop"] + + # Find out what regions of this transcript overlap with any other coding regions + coding_positions = [] + if cds_start != "NULL": + # If this is a coding transcript don't bother checking the CDS + for i in range(cds_start, cds_stop): + coding_positions.append(i) + # check 5' leader + chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict) + for i in range(0, cds_start): + genomic_pos = pos_list[i] + overlap = tree_dict[chrom][genomic_pos] + if len(overlap) != 0: + coding_positions.append(i) + # check 3' trailer + chrom, pos_list = tran_to_genome( + transcript, cds_stop, transcript_len, master_dict + ) + for i in range(cds_stop, transcript_len + 1): + # print ("i",i) + genomic_pos = pos_list[i - cds_stop] + # print ("genomic position",genomic_pos) + overlap = tree_dict[chrom][genomic_pos] + if len(overlap) != 0: + # print ("overlap not empty appending i",overlap) + coding_positions.append(i) + else: + # check entire transcript + chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict) + for i in range(0, transcript_len): + genomic_pos = pos_list[i] + overlap = tree_dict[chrom][genomic_pos] + if len(overlap) != 0: + coding_positions.append(i) + coding_positions_tuple = to_ranges(coding_positions) + coding_dict[transcript] = coding_positions_tuple + coding_positions = set(coding_positions) + # if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates + if cds_start != "NULL": + # print ("transcript", transcript) + # if pseudo_utr_len != 0: + # cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null + recoding_dict = {2: "minusone", 0: "readthrough", 1: "plusone"} + for addition in recoding_dict: + orftype = recoding_dict[addition] + for i in range(cds_stop + addition, transcript_len, 3): + if seq[i : i + 3] in stop_codons: + # orf_seq = seq[cds_stop:i+3] + orf_start = cds_stop + if orftype == "readthrough": + orf_start -= 2 + if orftype == "plusone": + orf_start -= 1 + orf_stop = i + 3 # +2 so it refers to the end of the stop codon + start_codon = None + length = (i + 3) - cds_stop + orf_pos_list = [] + # determine how many nucleotides in this orf overlap with an annotated coding region + cds_cov_count = 0.0 + for position in range(orf_start, orf_stop): + orf_pos_list.append(position) + for pos in range(orf_start, orf_stop + 1): + if pos in coding_positions: + cds_cov_count += 1 + cds_cov = cds_cov_count / length + # print ("orftype, start, stop", orftype, orf_start, orf_stop) + cursor.execute( + "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format( + orftype, + transcript, + start_codon, + length, + orf_start, + orf_stop, + cds_cov, + ) + ) + break + # sys.exit() + for frame in [0, 1, 2]: + for i in range(frame, transcript_len, 3): + if seq[i : i + 3] in start_codons: + for x in range(i, transcript_len, 3): + if seq[x : x + 3] in stop_codons: + # orf_seq = seq[i:x+3] + orf_start = i + 1 + orf_stop = x + 3 # +2 so it refers to the end of the stop codon + start_codon = seq[i : i + 3] + length = (x + 3) - i + orf_pos_list = [] + # determine how many nucleotides in this orf overlap with an annotated coding region + cds_cov_count = 0.0 + for pos in range(orf_start, orf_stop + 1): + if pos in coding_positions: + cds_cov_count += 1 + cds_cov = float(cds_cov_count) / float(length) + # Now determine orf type + if cds_start == "NULL": + orftype = "noncoding" + else: + # print ("cds start is not null :{}:{}".format(cds_start,cds_stop)) + # print ("orf start, orf stop", orf_start, orf_stop) + if orf_start == cds_start and orf_stop == cds_stop: + orftype = "cds" + # print ("orf type is cds") + elif orf_start < cds_start and orf_stop == cds_stop: + orftype = "extension" + # special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage + orf_stop = cds_start + cds_cov_count = 0.0 + for pos in range(orf_start, orf_stop + 1): + if pos in coding_positions: + cds_cov_count += 1 + cds_cov = float(cds_cov_count) / float(length) + # orf_seq = seq[orf_start:cds_start] + length = cds_start - orf_start + # print ("orf type is extension") + elif orf_start < cds_start and orf_stop <= cds_start: + orftype = "uorf" + # print ("orf type is uorf") + elif orf_start < cds_start and orf_stop > cds_start: + orftype = "ouorf" + # print ("orf type is ouorf") + # sys.exit() + elif ( + orf_start >= cds_start + and orf_start <= cds_stop + and orf_stop <= cds_stop + ): + if orf_stop == cds_stop: + break + # print ("Tran, cds_start, cds_stop, orf_start, orf_stop, tranlen",tran, cds_start, cds_stop, orf_start, orf_stop,transcript_len) + if ( + orf_stop < transcript_len + and orf_stop % 3 == cds_stop % 3 + ) or ( + cds_start != 1 + and orf_stop % 3 == (cds_start + 2) % 3 + ): + # print ("Transcript {} has an inframe stop codon".format(transcript)) + break + orftype = "nested" + # print ("orf type is nested") + elif ( + orf_start >= cds_start + and orf_start <= cds_stop + and orf_stop > cds_stop + ): + orftype = "odorf" + # print ("orftype is odorf") + elif orf_start > cds_stop and orf_stop > cds_stop: + orftype = "dorf" + # print ("orftype is dorf") + # if orf_stop > cds_start and orf_stop < cds_stop: + # if (orf_stop+1)%3 == cds_start%3: + # orftype = "inframe_stop" + # print ("inframe stop, transcript, orf_stop", transcript, orf_stop) + # sys.exit() + # if transcript not in orf_dict: + # orf_dict[orftype][transcript] = [] + # #print ("some weird stop or something") + cursor.execute( + "INSERT INTO {} VALUES('{}','{}',{},{},{},{});".format( + orftype, + transcript, + start_codon, + length, + orf_start, + orf_stop, + cds_cov, + ) + ) + break +# Used to keep track of the codons at cds_start and cds_stop positions, +# If there is an issue with the cds co-ordinates the starts and stops counts will +# be much lower than the other count, start with 1 to prevent division by 0 +nuc_dict = {"stops": {"stops": 1, "other": 0}, "starts": {"starts": 1, "other": 0}} + + +def calcgc(seq): + if seq == "": + return "NULL" + g_count = 0 + c_count = 0 + a_count = 0 + t_count = 0 + for char in seq: + if char == "A": + a_count += 1 + if char == "T": + t_count += 1 + if char == "G": + g_count += 1 + if char == "C": + c_count += 1 + gc = ((g_count + c_count) / float(len(seq))) * 100 + return round(gc, 2) + + +for transcript in master_dict: + # print ("transcripts", transcript) + length = master_dict[transcript]["length"] + cds_start = master_dict[transcript]["cds_start"] + cds_stop = master_dict[transcript]["cds_stop"] + seq = master_dict[transcript]["seq"] + strand = master_dict[transcript]["strand"] + chrom = master_dict[transcript]["chrom"] + gene = master_dict[transcript]["gene_name"] + gc = calcgc(seq) + five_gc = "NULL" + cds_gc = "NULL" + three_gc = "NULL" + if cds_start != "NULL": + five_gc = calcgc(seq[:cds_start]) + cds_gc = calcgc(seq[cds_start:cds_stop]) + three_gc = calcgc(seq[cds_stop:]) + # check that the nucleotide cds_start points to is the first of the start codon + # take one becase cds_start is 1 based but python indexing is 0 based + start_nuc = seq[cds_start - 1 : cds_start + 2] + # print ("start nuc",start_nuc) + if start_nuc == "ATG": + nuc_dict["starts"]["starts"] += 1 + else: + nuc_dict["starts"]["other"] += 1 + stop_nuc = seq[cds_stop - 3 : cds_stop] + # print ("stop_nuc",stop_nuc) + if stop_nuc in ["TAG", "TAA", "TGA"]: + nuc_dict["stops"]["stops"] += 1 + else: + nuc_dict["stops"]["other"] += 1 + tran_type = master_dict[transcript]["tran_type"] + if coding_genes_dict[gene] == True: + gene_type = 1 + else: + gene_type = 0 + # print ("tran type before",tran_type) + if tran_type == "coding": + tran_type = 1 + else: + tran_type = 0 + # print ("tran type after",tran_type) + start_list = str(master_dict[transcript]["start_list"]).replace(" ", "").strip("[]") + stop_list = str(master_dict[transcript]["stop_list"]).replace(" ", "").strip("[]") + exon_junctions = ( + str(master_dict[transcript]["exon_junctions"]).replace(" ", "").strip("[]") + ) + principal = master_dict[transcript]["principal"] + version = master_dict[transcript]["version"] + # print (master_dict[transcript]) + # print (tran_type) + # print (gene_type) + # print (principal) + # print (version) + # print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version)) + cursor.execute( + "INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format( + transcript, + gene, + length, + cds_start, + cds_stop, + seq, + strand, + stop_list, + start_list, + exon_junctions, + tran_type, + gene_type, + principal, + version, + gc, + five_gc, + cds_gc, + three_gc, + chrom, + ) + ) + + for tup in master_dict[transcript]["exon"]: + cursor.execute( + "INSERT INTO exons VALUES('{}',{},{});".format(transcript, tup[0], tup[1]) + ) + if transcript in coding_dict: + for tup in coding_dict[transcript]: + cursor.execute( + "INSERT INTO coding_regions VALUES('{}',{},{});".format( + transcript, tup[0], tup[1] + ) + ) + +connection.commit() +connection.close() + +print("delim", delimiters) +if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05: + print( + "Warning: {} transcripts do not have a an AUG at the CDS start position".format( + nuc_dict["starts"]["other"] + ) + ) +if (nuc_dict["stops"]["other"] / nuc_dict["stops"]["stops"]) > 0.05: + print( + "Warning: {} transcripts do not have a an stop codon at the CDS stop position".format( + nuc_dict["stops"]["other"] + ) + ) +if len(notinannotation) > 0: + print( + "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format( + len(notinannotation) + ) + )