# HG changeset patch # User triasteran # Date 1645790782 0 # Node ID 68e8704d9484669a4401ba19a05f6b1ce8ad3961 # Parent c5a566609a25b4a6f78e274f929b3939d6658837 Deleted selected files diff -r c5a566609a25 -r 68e8704d9484 trips_create_new_organism/create_annotation_sqlite.py --- a/trips_create_new_organism/create_annotation_sqlite.py Fri Feb 25 11:24:50 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,787 +0,0 @@ -# Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file -# and produces an sqlite file which can be uploaded to Trips-Viz -# All co-ordinates produced are 1 based -# All start codon positions (including cds_start) should be at the first nucleotide of the codon -# All stop codon positions (including cds_stop) should be at the last nucleotide of the codon -import sys -import re -import sqlite3 -from intervaltree import Interval, IntervalTree -import itertools - - - -#This should be a GTF or GFF3 file -annotation_file = open(sys.argv[1],"r") -#This needs to be the transcriptomic fasta file -fasta_file = open(sys.argv[2],"r") -#This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs -pseudo_utr_len = int(sys.argv[3]) -#An example of a transcript_id from the annotation file, e.g ENST000000123456 -user_transcript_id = sys.argv[4] -#An example of a gene name from the annotation file -user_gene_name = sys.argv[5] -# Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1 -TRAN_VERSION = True - - - -delimiters = {"transcripts":{"before":[],"after":[],"annot_types": ["cds","utr"]}, - "genes":{"before":[],"after":['"'],"annot_types": ["lnc_rna"]}} - -punctuation = [";"," ","-",":","-",".","=","\t"] -#Find delimiters in the annotation and fasta files using the user_transcript_id -#and user_gene_name examples given by user. -for line in annotation_file: - if user_transcript_id in line: - tabsplitline = line.split("\t") - annot_type = tabsplitline[2] - if annot_type not in delimiters["transcripts"]["annot_types"]: - delimiters["transcripts"]["annot_types"].append(annot_type.lower()) - splitline = line.split(user_transcript_id) - before_delimiter = splitline[0] - for item in punctuation: - if item in before_delimiter: - if len(before_delimiter.split(item)[-1]) >= 5: - before_delimiter = before_delimiter.split(item)[-1] - after_delimiter = splitline[1][:2] - if before_delimiter not in delimiters["transcripts"]["before"] and len(before_delimiter) >= 5: - delimiters["transcripts"]["before"].append(before_delimiter) - if after_delimiter not in delimiters["transcripts"]["after"]: - delimiters["transcripts"]["after"].append(after_delimiter) - if user_gene_name in line: - tabsplitline = line.split("\t") - annot_type = tabsplitline[2] - if annot_type not in delimiters["genes"]["annot_types"]: - delimiters["genes"]["annot_types"].append(annot_type.lower()) - splitline = line.split(user_gene_name) - before_delimiter = splitline[0] - for item in punctuation: - if item in before_delimiter: - if len(before_delimiter.split(item)[-1]) >= 5: - before_delimiter = before_delimiter.split(item)[-1] - after_delimiter = splitline[1][0] - if before_delimiter not in delimiters["genes"]["before"] and len(before_delimiter) >= 5: - delimiters["genes"]["before"].append(before_delimiter) - if after_delimiter not in delimiters["genes"]["after"]: - if after_delimiter in punctuation: - delimiters["genes"]["after"].append(after_delimiter) -for line in fasta_file: - if user_transcript_id in line: - splitline = line.split(user_transcript_id) - before_delimiter = splitline[0] - for item in punctuation: - if item in before_delimiter: - if len(before_delimiter.split(item)[1]) >= 5: - before_delimiter = before_delimiter.split(item)[1] - after_delimiter = splitline[1][0] - if before_delimiter not in delimiters["transcripts"]["before"] and len(before_delimiter) >= 5: - delimiters["transcripts"]["before"].append(before_delimiter) - if after_delimiter not in delimiters["transcripts"]["after"]: - delimiters["transcripts"]["after"].append(after_delimiter) -fasta_file.close() -annotation_file.close() - - - - - -if delimiters["transcripts"]["before"] == []: - print ("ERROR: No transcript_id with the name {} could be found in the annotation file".format(user_transcript_id)) - sys.exit() -if delimiters["genes"]["before"] == []: - print ("ERROR: No gene with the name {} could be found in the annotation file".format(user_gene_name)) - sys.exit() - -master_dict = {} -coding_dict = {} -notinfasta = open("notinfasta.csv","w") - -#Given a nucleotide sequence returns the positions of all start and stop codons. -def get_start_stops(transcript_sequence): - transcript_sequence = transcript_sequence.upper() - start_codons = ['ATG'] - stop_codons = ['TAA', 'TAG', 'TGA'] - seq_frames = {'starts': [], 'stops': []} - for codons, positions in ((start_codons, 'starts'),(stop_codons, 'stops')): - if len(codons) > 1: - pat = re.compile('|'.join(codons)) - else: - pat = re.compile(codons[0]) - for m in re.finditer(pat, transcript_sequence): - # Increment position by 1, Frame 1 starts at position 1 not 0, - # if it's a stop codon add another 2 so it points to the last nuc of the codon - if positions == "starts": - start = m.start() + 1 - else: - start = m.start() + 3 - seq_frames[positions].append(start) - return seq_frames - - -#parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons. -fasta_file = open(sys.argv[2],"r") -read_fasta = fasta_file.read() -split_fasta = read_fasta.split(">") -for entry in split_fasta[1:]: - newline_split = entry.split("\n") - tran = newline_split[0] - for item in delimiters["transcripts"]["after"]: - if item in tran: - tran = tran.split(item)[0] - tran = tran.replace("-","_").replace("(","").replace(")","") - seq = ("".join(newline_split[1:])) - if "_PAR_Y" in tran: - tran += "_chrY" - elif "_PAR_X" in tran: - tran += "_chrX" - tran = tran.upper() - starts_stops = get_start_stops(seq) - if tran not in master_dict: - master_dict[tran] = {"utr":[], "cds":[], "exon":[],"start_codon":[],"stop_codon":[],"start_list":starts_stops["starts"], - "stop_list":starts_stops["stops"],"transcript":[], "strand":"" ,"gene_name":"","chrom":"","seq":seq,"cds_start":"NULL","cds_stop":"NULL", - "length":len(seq),"principal":0,"version":"NULL"} - - - - -def to_ranges(iterable): - tup_list = [] - iterable = sorted(set(iterable)) - for key, group in itertools.groupby(enumerate(iterable),lambda t: t[1] - t[0]): - group = list(group) - tup_list.append((group[0][1], group[-1][1])) - return tup_list - -#parse annotation file to get chromsome, exon location and CDS info for each transcript -def parse_gtf_file(annotation_file): - for line in annotation_file: - if line == "\n": - continue - if line[0] != '#': - splitline = (line.replace("\n","")).split("\t") - chrom = splitline[0] - try: - annot_type = splitline[2].lower() - except: - print ("ERROR tried to index to second item in splitline: ",splitline, line) - sys.exit() - #if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]: - # continue - if annot_type not in delimiters["transcripts"]["annot_types"] and annot_type not in delimiters["genes"]["annot_types"]: - continue - if annot_type == "five_prime_utr" or annot_type == "three_prime_utr": - annot_type = "utr" - strand = splitline[6] - if strand == "+": - start = int(splitline[3]) - end = int(splitline[4]) - else: - start = int(splitline[3])+1 - end = int(splitline[4])+1 - desc = splitline[8] - tran = desc - gene = desc - for item in delimiters["transcripts"]["before"]: - if item in tran: - tran = tran.split(item)[1] - for item in delimiters["transcripts"]["after"]: - if item in tran: - tran = tran.split(item)[0] - if "." in tran and TRAN_VERSION == True: - #print ("raw tran",tran) - tran = tran.split(".") - version = int(tran[-1].split("_")[0]) - tran = tran[0] - else: - version = "NULL" - tran = tran.replace("-","_").replace(".","_") - tran = tran.replace("(","").replace(")","") - tran = tran.replace(" ","").replace("\t","") - tran = tran.upper() - tran = tran.replace("GENE_","").replace("ID_","") - #print ("tran",tran,version) - #if tran == "ENST00000316448": - # print ("annot type",annot_type) - # print ("appending exon to tran", start, end,line) - - gene_found = False - - if annot_type in delimiters["genes"]["annot_types"]: - for item in delimiters["genes"]["before"]: - if item in gene: - gene_found = True - gene = gene.split(item)[1] - for item in delimiters["genes"]["after"]: - if item in gene: - gene = gene.split(item)[0] - gene = gene.replace("'","''") - gene = gene.replace("GENE_","") - gene = gene.replace("ID_","") - gene = gene.upper() - - if tran in master_dict: - if annot_type in master_dict[tran]: - master_dict[tran][annot_type].append((start, end)) - master_dict[tran]["strand"] = strand - master_dict[tran]["chrom"] = chrom - master_dict[tran]["version"] = version - if gene_found == True: - master_dict[tran]["gene_name"] = gene - else: - notinfasta.write("{}\n".format(tran)) - -annotation_file = open(sys.argv[1],"r") -parse_gtf_file(annotation_file) - - -#remove transcripts that were in fasta file but not in annotation_file -notinannotation = [] -for tran in master_dict: - if master_dict[tran]["chrom"] == "": - #print ("tran {} has no chrom :(".format(tran)) - notinannotation.append(tran) -for tran in notinannotation: - del master_dict[tran] - -#Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True -coding_genes_dict = {} -#parse master_dict to calculate length, cds_start/stop and exon junction positions -for tran in master_dict: - try: - transeq = master_dict[tran]["seq"] - except Exception as e: - print ("not in fasta", tran) - notinfasta.write("{}\n".format(tran)) - continue - exon_junctions = [] - total_length = len(transeq) - three_len = 1 - five_len = 1 - strand = master_dict[tran]["strand"] - if master_dict[tran]["gene_name"] == "": - master_dict[tran]["gene_name"] = tran - gene = master_dict[tran]["gene_name"] - if gene not in coding_genes_dict: - coding_genes_dict[gene] = False - - if master_dict[tran]["cds"] == []: - tran_type = "noncoding" - cds_start = 'NULL' - cds_stop = 'NULL' - else: - #get utr lengths from annotation - tran_type = "coding" - coding_genes_dict[gene] = True - sorted_exons = sorted(master_dict[tran]["exon"]) - sorted_cds = sorted(master_dict[tran]["cds"]) - min_cds = sorted_cds[0][0] - #Some annotation files do not have utr annotation types, so fix that here if thats the case - if master_dict[tran]["utr"] == []: - for exon_tup in master_dict[tran]["exon"]: - if exon_tup not in master_dict[tran]["cds"]: - # Now check if this overlaps with any of the CDS exons - overlap = False - for cds_tup in master_dict[tran]["cds"]: - if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]: - master_dict[tran]["utr"].append((cds_tup[1],exon_tup[1])) - overlap = True - if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]: - master_dict[tran]["utr"].append((exon_tup[0],cds_tup[0])) - overlap = True - if overlap == False: - master_dict[tran]["utr"].append(exon_tup) - - - ''' - if tran == "NM_001258497": - print ("sorted cds",sorted_cds) - print ("min cds",min_cds) - print ("chrom",master_dict[tran]["chrom"]) - print ("sorted exons", sorted_exons) - print ("utr",master_dict[tran]["utr"]) - sys.exit() - ''' - #if tran == "ENST00000381401": - # print ("min cds,sorted utr",min_cds,sorted(master_dict[tran]["utr"])) - for tup in sorted(master_dict[tran]["utr"]): - #if tran == "ENST00000381401": - # print ("tup", tup) - if tup[0] < min_cds: - five_len += (tup[1]-tup[0])+1 - #if tran == "ENST00000381401": - # print ("adding to fivelen") - elif tup[0] > min_cds: - three_len += (tup[1] - tup[0])+1 - #if tran == "ENST00000381401": - # print ("adding to three len") - else: - pass - if strand == "+": - if len(sorted_exons) > 1: - sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]) - sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len) - else: - sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len) - master_dict[tran]["exon"] = sorted_exons - cds_start = (five_len+pseudo_utr_len) - cds_stop = ((total_length - three_len)-pseudo_utr_len)+1 - elif strand == "-": - if len(sorted_exons) > 1: - sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]) - sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len) - else: - sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len) - master_dict[tran]["exon"] = sorted_exons - cds_start = (three_len+pseudo_utr_len) - cds_stop = ((total_length - (five_len))-pseudo_utr_len)+1 - #if tran == "ENST00000381401": - # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len) - # #sys.exit() - else: - print ("strand is unknown: {}".format(strand)) - sys.exit() - - #get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop - total_intronic = 0 - try: - if strand == "+": - tx_start = min(sorted(master_dict[tran]["exon"]))[0] - prev_end = tx_start - for tup in sorted(master_dict[tran]["exon"])[:-1]: - total_intronic += tup[0]-prev_end - exon_junctions.append(((tup[1])-tx_start)-total_intronic) - prev_end = tup[1] - elif strand == "-": - tx_start = max(sorted(master_dict[tran]["exon"]))[-1] - prev_end = tx_start - for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]: - total_intronic += (tup[0]+1)-prev_end - exon_junctions.append(((tup[1])-tx_start)-total_intronic) - prev_end = tup[1] - except: - if strand == "+": - tx_start = min(sorted(master_dict[tran]["cds"]))[0] - prev_end = tx_start - for tup in sorted(master_dict[tran]["cds"])[:-1]: - total_intronic += tup[0]-prev_end - exon_junctions.append(((tup[1])-tx_start)-total_intronic) - prev_end = tup[1] - elif strand == "-": - tx_start = max(sorted(master_dict[tran]["cds"]))[-1] - prev_end = tx_start - for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]: - total_intronic += (tup[0]+1)-prev_end - exon_junctions.append(((tup[1])-tx_start)-total_intronic) - prev_end = tup[1] - if strand == "+" and cds_start != "NULL": - master_dict[tran]["cds_start"] = cds_start - master_dict[tran]["cds_stop"] = cds_stop - elif strand == "-" and cds_start != "NULL": - master_dict[tran]["cds_start"] = cds_start - master_dict[tran]["cds_stop"] = cds_stop - master_dict[tran]["strand"] = strand - master_dict[tran]["tran_type"] = tran_type - master_dict[tran]["exon_junctions"] = exon_junctions - -longest_tran_dict = {} -for tran in master_dict: - try: - gene = master_dict[tran]["gene_name"] - except: - continue - if coding_genes_dict[gene] == True: - if "cds_start" in master_dict[tran]: - if master_dict[tran]["cds_stop"] != "NULL" and master_dict[tran]["cds_start"] != "NULL": - cds_length = master_dict[tran]["cds_stop"]- master_dict[tran]["cds_start"] - if gene not in longest_tran_dict: - longest_tran_dict[gene] = {"tran":tran,"length":cds_length} - else: - if cds_length > longest_tran_dict[gene]["length"]: - longest_tran_dict[gene] = {"tran":tran,"length":cds_length} - if cds_length == longest_tran_dict[gene]["length"]: - if master_dict[tran]["length"] > master_dict[longest_tran_dict[gene]["tran"]]["length"]: - longest_tran_dict[gene] = {"tran":tran,"length":cds_length} - else: - length = master_dict[tran]["length"] - if gene not in longest_tran_dict: - longest_tran_dict[gene] = {"tran":tran,"length":length} - elif length > longest_tran_dict[gene]["length"]: - longest_tran_dict[gene] = {"tran":tran,"length":length} - - - - - -for gene in longest_tran_dict: - longest_tran = longest_tran_dict[gene]["tran"] - master_dict[longest_tran]["principal"] = 1 - -gene_sample = [] -for key in list(master_dict)[:10]: - try: - gene_sample.append(master_dict[key]["gene_name"]) - except: - pass - -print ("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10])) -print ("Here is a sample of the gene names: {}".format(gene_sample)) - - -# Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time. -def tran_to_genome(tran, start_pos, end_pos, master_dict): - pos_list = [] - for i in range(start_pos,end_pos+1): - pos_list.append(i) - genomic_pos_list = [] - if tran in master_dict: - transcript_info = master_dict[tran] - else: - return ("Null", []) - - chrom = transcript_info["chrom"] - strand = transcript_info["strand"] - exons = transcript_info["exon"] - #print ("chrom,strand,exons",chrom,strand,exons) - for pos in pos_list: - #print ("pos",pos) - if strand == "+": - exon_start = 0 - for tup in exons: - #print ("tup",tup) - exon_start = tup[0] - exonlen = tup[1] - tup[0] - if pos > exonlen: - pos = (pos - exonlen)-1 - else: - break - #print ("appending exon_start-pos", exon_start, pos, exon_start+pos) - genomic_pos_list.append((exon_start+pos)-1) - elif strand == "-": - exon_start = 0 - for tup in exons[::-1]: - #print ("tup",tup) - exon_start = tup[1] - exonlen = tup[1] - tup[0] - #print ("exonlen",exonlen) - if pos > exonlen: - #print ("pos is greater") - pos = (pos - exonlen)-1 - #print ("new pos",pos) - else: - break - #print ("appending exon_start-pos", exon_start, pos, exon_start-pos) - genomic_pos_list.append((exon_start-pos)+1) - return (chrom, genomic_pos_list) - - - - -orf_dict = {"uorf":{}, - "ouorf":{}, - "cds":{}, - "nested":{}, - "odorf":{}, - "dorf":{}, - "minusone":{}, - "readthrough":{}, - "plusone":{}, - "noncoding":{}, - "extension":{}, - "inframe_stop":{} - } - -start_codons = ["ATG","GTG","CTG"] -stop_codons = ["TAG","TAA","TGA"] - - -# Keep track of the longest transcript for each noncoding gene, append this to transcript list later -longest_noncoding = {} - - -tran_count = 0 -# This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those -# in the transcript list) -genomic_cds_dict = {} -tree_dict = {} -for transcript in master_dict: - #print (transcript, master_dict[transcript]["tran_type"]) - tran_count += 1 - if "seq" not in master_dict[transcript]: - continue - chrom = master_dict[transcript]["chrom"] - if chrom not in genomic_cds_dict: - genomic_cds_dict[chrom] = [] - if "cds_start" in master_dict[transcript]: - cds_start = master_dict[transcript]["cds_start"] - cds_stop = master_dict[transcript]["cds_stop"] - if cds_start != "NULL": - cds_pos = [] - for i in range(cds_start, cds_stop+1): - cds_pos.append(i) - - for tup in master_dict[transcript]["cds"]: - if tup[0] != tup[1]: - if tup not in genomic_cds_dict[chrom]: - genomic_cds_dict[chrom].append(tup) - -print ("genomic cds dict built") -print (list(genomic_cds_dict)) -for chrom in genomic_cds_dict: - tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom]) - -#print (list(tree_dict)) - - -connection = sqlite3.connect("{}".format(sys.argv[6])) -cursor = connection.cursor() -cursor.execute("CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));") -cursor.execute("CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));") -cursor.execute("CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -cursor.execute("CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));") -connection.commit(); - - -print ("Finding ORFs") -transcript_count = 0 -total_transcripts = len(list(master_dict)) -for transcript in master_dict: - #print ("transcript",transcript) - #if transcript != "ENST00000316448": - # continue - transcript_count += 1 - if transcript_count%100 == 0: - print ("Transcripts complete: {}/{}".format(transcript_count,total_transcripts)) - if "seq" not in master_dict[transcript]: - print ("transcript {} has no sequence".format(transcript)) - continue - seq = master_dict[transcript]["seq"] - cds_start = "NULL" - cds_stop = "NULL" - transcript_len = len(seq) - if "cds_start" in master_dict[transcript]: - cds_start = master_dict[transcript]["cds_start"] - cds_stop = master_dict[transcript]["cds_stop"] - - #Find out what regions of this transcript overlap with any other coding regions - coding_positions = [] - if cds_start != "NULL": - #If this is a coding transcript don't bother checking the CDS - for i in range(cds_start,cds_stop): - coding_positions.append(i) - #check 5' leader - chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict) - for i in range(0,cds_start): - genomic_pos = pos_list[i] - overlap = tree_dict[chrom][genomic_pos] - if len(overlap) != 0: - coding_positions.append(i) - #check 3' trailer - chrom, pos_list = tran_to_genome(transcript, cds_stop, transcript_len, master_dict) - for i in range(cds_stop,transcript_len+1): - #print ("i",i) - genomic_pos = pos_list[i-cds_stop] - #print ("genomic position",genomic_pos) - overlap = tree_dict[chrom][genomic_pos] - if len(overlap) != 0: - #print ("overlap not empty appending i",overlap) - coding_positions.append(i) - else: - #check entire transcript - chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict) - for i in range(0,transcript_len): - genomic_pos = pos_list[i] - overlap = tree_dict[chrom][genomic_pos] - if len(overlap) != 0: - coding_positions.append(i) - coding_positions_tuple = to_ranges(coding_positions) - coding_dict[transcript] = coding_positions_tuple - coding_positions = set(coding_positions) - #if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates - if cds_start != "NULL": - if pseudo_utr_len != 0: - cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null - recoding_dict = {0:"minusone",1:"readthrough",2:"plusone"} - for addition in recoding_dict: - orftype = recoding_dict[addition] - for i in range(cds_stop+addition,transcript_len,3): - if seq[i:i+3] in stop_codons: - orf_seq = seq[cds_stop:i+3] - orf_start = cds_stop - orf_stop = i+2 # +2 so it refers to the end of the stop codon - start_codon = None - if orf_seq: - length = len(orf_seq) - orf_pos_list = [] - #determine how many nucleotides in this orf overlap with an annotated coding region - cds_cov_count = 0.0 - for position in range(orf_start,orf_stop): - orf_pos_list.append(position) - for pos in range(orf_start, orf_stop+1): - if pos in coding_positions: - cds_cov_count += 1 - cds_cov = cds_cov_count/length - cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov)) - break - for frame in [0,1,2]: - for i in range(frame,transcript_len,3): - if seq[i:i+3] in start_codons: - for x in range(i, transcript_len,3): - if seq[x:x+3] in stop_codons: - orf_seq = seq[i:x+3] - orf_start = i - orf_stop = x+2 # +2 so it refers to the end of the stop codon - start_codon = seq[i:i+3] - length = len(orf_seq) - orf_pos_list = [] - #determine how many nucleotides in this orf overlap with an annotated coding region - cds_cov_count = 0.0 - for pos in range(orf_start, orf_stop+1): - if pos in coding_positions: - cds_cov_count += 1 - cds_cov = float(cds_cov_count)/float(length) - # Now determine orf type - if cds_start == "NULL": - orftype = "noncoding" - else: - #print ("cds start is not null :{}:".format(cds_start)) - if orf_start == cds_start and orf_stop == cds_stop: - orftype = "cds" - elif orf_start < cds_start and orf_stop == cds_stop: - orftype = "extension" - #special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage - orf_stop = cds_start - cds_cov_count = 0.0 - for pos in range(orf_start, orf_stop+1): - if pos in coding_positions: - cds_cov_count += 1 - cds_cov = float(cds_cov_count)/float(length) - orf_seq = seq[orf_start:cds_start] - length = len(orf_seq) - elif orf_start < cds_start and orf_stop <= cds_start: - orftype = "uorf" - elif orf_start < cds_start and orf_stop > cds_start: - orftype = "ouorf" - elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop <= cds_stop: - if orf_stop == cds_stop: - break - orftype = "nested" - elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop > cds_stop: - orftype = "odorf" - elif orf_start > cds_stop and orf_stop > cds_stop: - orftype = "dorf" - if orf_stop > cds_start and orf_stop < cds_stop: - if (orf_stop+1)%3 == cds_start%3: - orftype = "inframe_stop" - if transcript not in orf_dict: - orf_dict[orftype][transcript] = [] - cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov)) - break -# Used to keep track of the codons at cds_start and cds_stop positions, -# If there is an issue with the cds co-ordinates the starts and stops counts will -# be much lower than the other count, start with 1 to prevent division by 0 -nuc_dict = {"stops":{"stops":1,"other":0}, "starts":{"starts":1,"other":0}} - -def calcgc(seq): - if seq == "": - return "NULL" - g_count = 0 - c_count = 0 - a_count = 0 - t_count = 0 - for char in seq: - if char == "A": - a_count += 1 - if char == "T": - t_count += 1 - if char == "G": - g_count += 1 - if char == "C": - c_count += 1 - gc = ((g_count+c_count)/float(len(seq)))*100 - return round(gc,2) - - - - -for transcript in master_dict: - #print ("transcripts", transcript) - length = master_dict[transcript]["length"] - cds_start = master_dict[transcript]["cds_start"] - cds_stop = master_dict[transcript]["cds_stop"] - seq = master_dict[transcript]["seq"] - strand = master_dict[transcript]["strand"] - chrom = master_dict[transcript]["chrom"] - gene = master_dict[transcript]["gene_name"] - gc = calcgc(seq) - five_gc = "NULL" - cds_gc = "NULL" - three_gc = "NULL" - if cds_start != "NULL": - five_gc = calcgc(seq[:cds_start]) - cds_gc = calcgc(seq[cds_start:cds_stop]) - three_gc = calcgc(seq[cds_stop:]) - # check that the nucleotide cds_start points to is the first of the start codon - # take one becase cds_start is 1 based but python indexing is 0 based - start_nuc = seq[cds_start-1:cds_start+2] - #print ("start nuc",start_nuc) - if start_nuc == "ATG": - nuc_dict["starts"]["starts"] += 1 - else: - nuc_dict["starts"]["other"] += 1 - stop_nuc = seq[cds_stop-3:cds_stop] - #print ("stop_nuc",stop_nuc) - if stop_nuc in ["TAG","TAA","TGA"]: - nuc_dict["stops"]["stops"] += 1 - else: - nuc_dict["stops"]["other"] += 1 - tran_type = master_dict[transcript]["tran_type"] - if coding_genes_dict[gene] == True: - gene_type = 1 - else: - gene_type = 0 - #print ("tran type before",tran_type) - if tran_type == "coding": - tran_type = 1 - else: - tran_type = 0 - #print ("tran type after",tran_type) - start_list = str(master_dict[transcript]["start_list"]).replace(" ","").strip("[]") - stop_list = str(master_dict[transcript]["stop_list"]).replace(" ","").strip("[]") - exon_junctions = str(master_dict[transcript]["exon_junctions"]).replace(" ","").strip("[]") - principal = master_dict[transcript]["principal"] - version = master_dict[transcript]["version"] - #print (master_dict[transcript]) - #print (tran_type) - #print (gene_type) - #print (principal) - #print (version) - #print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version)) - cursor.execute("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version,gc,five_gc,cds_gc,three_gc,chrom)) - - for tup in master_dict[transcript]["exon"]: - cursor.execute("INSERT INTO exons VALUES('{}',{},{});".format(transcript,tup[0],tup[1])) - if transcript in coding_dict: - for tup in coding_dict[transcript]: - cursor.execute("INSERT INTO coding_regions VALUES('{}',{},{});".format(transcript,tup[0],tup[1])) - -connection.commit() -connection.close() - -if (nuc_dict["starts"]["other"]/nuc_dict["starts"]["starts"]) > 0.05: - print ("Warning: {} transcripts do not have a an AUG at the CDS start position".format(nuc_dict["starts"]["other"])) -if (nuc_dict["stops"]["other"]/nuc_dict["stops"]["stops"]) > 0.05: - print ("Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(nuc_dict["stops"]["other"])) -if len(notinannotation) >0: - print ("Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(len(notinannotation))) diff -r c5a566609a25 -r 68e8704d9484 trips_create_new_organism/create_annotation_sqlite.xml --- a/trips_create_new_organism/create_annotation_sqlite.xml Fri Feb 25 11:24:50 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - - intervaltree - sqlite - pysqlite3 - python-intervaltree - - - - - - - - - - - - - - - - - - - - - -@misc{githubTrips-Viz, - author = {LastTODO, FirstTODO}, - year = {TODO}, - title = {Trips-Viz}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/skiniry/Trips-Viz}, -} - - \ No newline at end of file diff -r c5a566609a25 -r 68e8704d9484 trips_create_new_organism/create_annotation_sqlite_.xml --- a/trips_create_new_organism/create_annotation_sqlite_.xml Fri Feb 25 11:24:50 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ - - create new organism - create_annotation_sqlite.py $annotation $fasta $pseudo_utr_len $transcript $gene $output - - - - - - - - - - - - - - - - - - - - - -**GTF/GFF3 File** - -GFF lines have nine required fields that must be tab-separated. -The GFF3 format addresses the most common extensions to GFF, while preserving backward compatibility with previous formats. - -Both transcript ids and gene names should be listed in the file. - ------ - -**Transcriptome FASTA file** - -A FASTA file with an entry for every transcript. The headers should be the transcript id's as they appear in the GTF/GFF3 file. - ------ - -**Psuedo UTR length** -An integer representing the length (in nucleotides) to be added to the 5' end and 3' end of every transcript with an annotated -CDS. Useful for when an organism does not have any annotated UTR's, if it does use 0. If not 0, the extra nucleotides should -already be present in the FASTA file. - ------ - -**Example transcript** - -An example of a transcript id that appears in the FASTA/GTF/GFF3 file, e.g ENST00000123456 - ------ - -**Example Gene** - -An example of a gene name as it appears in the GTF/GFF3 file, e.g BRCA1 - ------ - -**Output** - -The output of the script can be downloaded and uploaded to Trips-viz_. by signing in and going to the uploads page, then selecting -"Upload new transcriptome". When uploaded the new organism will appear on the home page of Trips-viz, or under the transcriptomes -page if the organism name used is already present on Trips-viz. - - - -.. _Trips-viz: http://trips.ucc.ie - - - -