# HG changeset patch
# User triasteran
# Date 1645790782 0
# Node ID 68e8704d9484669a4401ba19a05f6b1ce8ad3961
# Parent c5a566609a25b4a6f78e274f929b3939d6658837
Deleted selected files
diff -r c5a566609a25 -r 68e8704d9484 trips_create_new_organism/create_annotation_sqlite.py
--- a/trips_create_new_organism/create_annotation_sqlite.py Fri Feb 25 11:24:50 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,787 +0,0 @@
-# Python3 script which takes in an annotation file(gtf/gff3) and a transcriptomic fasta file
-# and produces an sqlite file which can be uploaded to Trips-Viz
-# All co-ordinates produced are 1 based
-# All start codon positions (including cds_start) should be at the first nucleotide of the codon
-# All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
-import sys
-import re
-import sqlite3
-from intervaltree import Interval, IntervalTree
-import itertools
-
-
-
-#This should be a GTF or GFF3 file
-annotation_file = open(sys.argv[1],"r")
-#This needs to be the transcriptomic fasta file
-fasta_file = open(sys.argv[2],"r")
-#This value will be added used to create UTRs of this length, useful when looking at transcriptomes without annotated UTRs
-pseudo_utr_len = int(sys.argv[3])
-#An example of a transcript_id from the annotation file, e.g ENST000000123456
-user_transcript_id = sys.argv[4]
-#An example of a gene name from the annotation file
-user_gene_name = sys.argv[5]
-# Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
-TRAN_VERSION = True
-
-
-
-delimiters = {"transcripts":{"before":[],"after":[],"annot_types": ["cds","utr"]},
- "genes":{"before":[],"after":['"'],"annot_types": ["lnc_rna"]}}
-
-punctuation = [";"," ","-",":","-",".","=","\t"]
-#Find delimiters in the annotation and fasta files using the user_transcript_id
-#and user_gene_name examples given by user.
-for line in annotation_file:
- if user_transcript_id in line:
- tabsplitline = line.split("\t")
- annot_type = tabsplitline[2]
- if annot_type not in delimiters["transcripts"]["annot_types"]:
- delimiters["transcripts"]["annot_types"].append(annot_type.lower())
- splitline = line.split(user_transcript_id)
- before_delimiter = splitline[0]
- for item in punctuation:
- if item in before_delimiter:
- if len(before_delimiter.split(item)[-1]) >= 5:
- before_delimiter = before_delimiter.split(item)[-1]
- after_delimiter = splitline[1][:2]
- if before_delimiter not in delimiters["transcripts"]["before"] and len(before_delimiter) >= 5:
- delimiters["transcripts"]["before"].append(before_delimiter)
- if after_delimiter not in delimiters["transcripts"]["after"]:
- delimiters["transcripts"]["after"].append(after_delimiter)
- if user_gene_name in line:
- tabsplitline = line.split("\t")
- annot_type = tabsplitline[2]
- if annot_type not in delimiters["genes"]["annot_types"]:
- delimiters["genes"]["annot_types"].append(annot_type.lower())
- splitline = line.split(user_gene_name)
- before_delimiter = splitline[0]
- for item in punctuation:
- if item in before_delimiter:
- if len(before_delimiter.split(item)[-1]) >= 5:
- before_delimiter = before_delimiter.split(item)[-1]
- after_delimiter = splitline[1][0]
- if before_delimiter not in delimiters["genes"]["before"] and len(before_delimiter) >= 5:
- delimiters["genes"]["before"].append(before_delimiter)
- if after_delimiter not in delimiters["genes"]["after"]:
- if after_delimiter in punctuation:
- delimiters["genes"]["after"].append(after_delimiter)
-for line in fasta_file:
- if user_transcript_id in line:
- splitline = line.split(user_transcript_id)
- before_delimiter = splitline[0]
- for item in punctuation:
- if item in before_delimiter:
- if len(before_delimiter.split(item)[1]) >= 5:
- before_delimiter = before_delimiter.split(item)[1]
- after_delimiter = splitline[1][0]
- if before_delimiter not in delimiters["transcripts"]["before"] and len(before_delimiter) >= 5:
- delimiters["transcripts"]["before"].append(before_delimiter)
- if after_delimiter not in delimiters["transcripts"]["after"]:
- delimiters["transcripts"]["after"].append(after_delimiter)
-fasta_file.close()
-annotation_file.close()
-
-
-
-
-
-if delimiters["transcripts"]["before"] == []:
- print ("ERROR: No transcript_id with the name {} could be found in the annotation file".format(user_transcript_id))
- sys.exit()
-if delimiters["genes"]["before"] == []:
- print ("ERROR: No gene with the name {} could be found in the annotation file".format(user_gene_name))
- sys.exit()
-
-master_dict = {}
-coding_dict = {}
-notinfasta = open("notinfasta.csv","w")
-
-#Given a nucleotide sequence returns the positions of all start and stop codons.
-def get_start_stops(transcript_sequence):
- transcript_sequence = transcript_sequence.upper()
- start_codons = ['ATG']
- stop_codons = ['TAA', 'TAG', 'TGA']
- seq_frames = {'starts': [], 'stops': []}
- for codons, positions in ((start_codons, 'starts'),(stop_codons, 'stops')):
- if len(codons) > 1:
- pat = re.compile('|'.join(codons))
- else:
- pat = re.compile(codons[0])
- for m in re.finditer(pat, transcript_sequence):
- # Increment position by 1, Frame 1 starts at position 1 not 0,
- # if it's a stop codon add another 2 so it points to the last nuc of the codon
- if positions == "starts":
- start = m.start() + 1
- else:
- start = m.start() + 3
- seq_frames[positions].append(start)
- return seq_frames
-
-
-#parse fasta to get the nucleotide sequence of transcripts and the positions of start/stop codons.
-fasta_file = open(sys.argv[2],"r")
-read_fasta = fasta_file.read()
-split_fasta = read_fasta.split(">")
-for entry in split_fasta[1:]:
- newline_split = entry.split("\n")
- tran = newline_split[0]
- for item in delimiters["transcripts"]["after"]:
- if item in tran:
- tran = tran.split(item)[0]
- tran = tran.replace("-","_").replace("(","").replace(")","")
- seq = ("".join(newline_split[1:]))
- if "_PAR_Y" in tran:
- tran += "_chrY"
- elif "_PAR_X" in tran:
- tran += "_chrX"
- tran = tran.upper()
- starts_stops = get_start_stops(seq)
- if tran not in master_dict:
- master_dict[tran] = {"utr":[], "cds":[], "exon":[],"start_codon":[],"stop_codon":[],"start_list":starts_stops["starts"],
- "stop_list":starts_stops["stops"],"transcript":[], "strand":"" ,"gene_name":"","chrom":"","seq":seq,"cds_start":"NULL","cds_stop":"NULL",
- "length":len(seq),"principal":0,"version":"NULL"}
-
-
-
-
-def to_ranges(iterable):
- tup_list = []
- iterable = sorted(set(iterable))
- for key, group in itertools.groupby(enumerate(iterable),lambda t: t[1] - t[0]):
- group = list(group)
- tup_list.append((group[0][1], group[-1][1]))
- return tup_list
-
-#parse annotation file to get chromsome, exon location and CDS info for each transcript
-def parse_gtf_file(annotation_file):
- for line in annotation_file:
- if line == "\n":
- continue
- if line[0] != '#':
- splitline = (line.replace("\n","")).split("\t")
- chrom = splitline[0]
- try:
- annot_type = splitline[2].lower()
- except:
- print ("ERROR tried to index to second item in splitline: ",splitline, line)
- sys.exit()
- #if annot_type not in ["cds", "utr", "exon", "transcript","five_prime_utr", "three_prime_utr","stop_codon","start_codon"]:
- # continue
- if annot_type not in delimiters["transcripts"]["annot_types"] and annot_type not in delimiters["genes"]["annot_types"]:
- continue
- if annot_type == "five_prime_utr" or annot_type == "three_prime_utr":
- annot_type = "utr"
- strand = splitline[6]
- if strand == "+":
- start = int(splitline[3])
- end = int(splitline[4])
- else:
- start = int(splitline[3])+1
- end = int(splitline[4])+1
- desc = splitline[8]
- tran = desc
- gene = desc
- for item in delimiters["transcripts"]["before"]:
- if item in tran:
- tran = tran.split(item)[1]
- for item in delimiters["transcripts"]["after"]:
- if item in tran:
- tran = tran.split(item)[0]
- if "." in tran and TRAN_VERSION == True:
- #print ("raw tran",tran)
- tran = tran.split(".")
- version = int(tran[-1].split("_")[0])
- tran = tran[0]
- else:
- version = "NULL"
- tran = tran.replace("-","_").replace(".","_")
- tran = tran.replace("(","").replace(")","")
- tran = tran.replace(" ","").replace("\t","")
- tran = tran.upper()
- tran = tran.replace("GENE_","").replace("ID_","")
- #print ("tran",tran,version)
- #if tran == "ENST00000316448":
- # print ("annot type",annot_type)
- # print ("appending exon to tran", start, end,line)
-
- gene_found = False
-
- if annot_type in delimiters["genes"]["annot_types"]:
- for item in delimiters["genes"]["before"]:
- if item in gene:
- gene_found = True
- gene = gene.split(item)[1]
- for item in delimiters["genes"]["after"]:
- if item in gene:
- gene = gene.split(item)[0]
- gene = gene.replace("'","''")
- gene = gene.replace("GENE_","")
- gene = gene.replace("ID_","")
- gene = gene.upper()
-
- if tran in master_dict:
- if annot_type in master_dict[tran]:
- master_dict[tran][annot_type].append((start, end))
- master_dict[tran]["strand"] = strand
- master_dict[tran]["chrom"] = chrom
- master_dict[tran]["version"] = version
- if gene_found == True:
- master_dict[tran]["gene_name"] = gene
- else:
- notinfasta.write("{}\n".format(tran))
-
-annotation_file = open(sys.argv[1],"r")
-parse_gtf_file(annotation_file)
-
-
-#remove transcripts that were in fasta file but not in annotation_file
-notinannotation = []
-for tran in master_dict:
- if master_dict[tran]["chrom"] == "":
- #print ("tran {} has no chrom :(".format(tran))
- notinannotation.append(tran)
-for tran in notinannotation:
- del master_dict[tran]
-
-#Dictionary to store the coding status of a gene, if any transcript of this gene is coding, the value will be True
-coding_genes_dict = {}
-#parse master_dict to calculate length, cds_start/stop and exon junction positions
-for tran in master_dict:
- try:
- transeq = master_dict[tran]["seq"]
- except Exception as e:
- print ("not in fasta", tran)
- notinfasta.write("{}\n".format(tran))
- continue
- exon_junctions = []
- total_length = len(transeq)
- three_len = 1
- five_len = 1
- strand = master_dict[tran]["strand"]
- if master_dict[tran]["gene_name"] == "":
- master_dict[tran]["gene_name"] = tran
- gene = master_dict[tran]["gene_name"]
- if gene not in coding_genes_dict:
- coding_genes_dict[gene] = False
-
- if master_dict[tran]["cds"] == []:
- tran_type = "noncoding"
- cds_start = 'NULL'
- cds_stop = 'NULL'
- else:
- #get utr lengths from annotation
- tran_type = "coding"
- coding_genes_dict[gene] = True
- sorted_exons = sorted(master_dict[tran]["exon"])
- sorted_cds = sorted(master_dict[tran]["cds"])
- min_cds = sorted_cds[0][0]
- #Some annotation files do not have utr annotation types, so fix that here if thats the case
- if master_dict[tran]["utr"] == []:
- for exon_tup in master_dict[tran]["exon"]:
- if exon_tup not in master_dict[tran]["cds"]:
- # Now check if this overlaps with any of the CDS exons
- overlap = False
- for cds_tup in master_dict[tran]["cds"]:
- if exon_tup[0] == cds_tup[0] and exon_tup[1] != cds_tup[1]:
- master_dict[tran]["utr"].append((cds_tup[1],exon_tup[1]))
- overlap = True
- if exon_tup[0] != cds_tup[0] and exon_tup[1] == cds_tup[1]:
- master_dict[tran]["utr"].append((exon_tup[0],cds_tup[0]))
- overlap = True
- if overlap == False:
- master_dict[tran]["utr"].append(exon_tup)
-
-
- '''
- if tran == "NM_001258497":
- print ("sorted cds",sorted_cds)
- print ("min cds",min_cds)
- print ("chrom",master_dict[tran]["chrom"])
- print ("sorted exons", sorted_exons)
- print ("utr",master_dict[tran]["utr"])
- sys.exit()
- '''
- #if tran == "ENST00000381401":
- # print ("min cds,sorted utr",min_cds,sorted(master_dict[tran]["utr"]))
- for tup in sorted(master_dict[tran]["utr"]):
- #if tran == "ENST00000381401":
- # print ("tup", tup)
- if tup[0] < min_cds:
- five_len += (tup[1]-tup[0])+1
- #if tran == "ENST00000381401":
- # print ("adding to fivelen")
- elif tup[0] > min_cds:
- three_len += (tup[1] - tup[0])+1
- #if tran == "ENST00000381401":
- # print ("adding to three len")
- else:
- pass
- if strand == "+":
- if len(sorted_exons) > 1:
- sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1])
- sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len)
- else:
- sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len)
- master_dict[tran]["exon"] = sorted_exons
- cds_start = (five_len+pseudo_utr_len)
- cds_stop = ((total_length - three_len)-pseudo_utr_len)+1
- elif strand == "-":
- if len(sorted_exons) > 1:
- sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1])
- sorted_exons[-1] = (sorted_exons[-1][0], sorted_exons[-1][1]+pseudo_utr_len)
- else:
- sorted_exons[0] = (sorted_exons[0][0]-pseudo_utr_len, sorted_exons[0][1]+pseudo_utr_len)
- master_dict[tran]["exon"] = sorted_exons
- cds_start = (three_len+pseudo_utr_len)
- cds_stop = ((total_length - (five_len))-pseudo_utr_len)+1
- #if tran == "ENST00000381401":
- # print ("cds start, cds stop, five_len, three_len",cds_start,cds_stop,five_len,three_len)
- # #sys.exit()
- else:
- print ("strand is unknown: {}".format(strand))
- sys.exit()
-
- #get exon junctions, cds is easy just get end of each tuple except last, same for utr except for if same as cds start/stop
- total_intronic = 0
- try:
- if strand == "+":
- tx_start = min(sorted(master_dict[tran]["exon"]))[0]
- prev_end = tx_start
- for tup in sorted(master_dict[tran]["exon"])[:-1]:
- total_intronic += tup[0]-prev_end
- exon_junctions.append(((tup[1])-tx_start)-total_intronic)
- prev_end = tup[1]
- elif strand == "-":
- tx_start = max(sorted(master_dict[tran]["exon"]))[-1]
- prev_end = tx_start
- for tup in (sorted(master_dict[tran]["exon"])[1:])[::-1]:
- total_intronic += (tup[0]+1)-prev_end
- exon_junctions.append(((tup[1])-tx_start)-total_intronic)
- prev_end = tup[1]
- except:
- if strand == "+":
- tx_start = min(sorted(master_dict[tran]["cds"]))[0]
- prev_end = tx_start
- for tup in sorted(master_dict[tran]["cds"])[:-1]:
- total_intronic += tup[0]-prev_end
- exon_junctions.append(((tup[1])-tx_start)-total_intronic)
- prev_end = tup[1]
- elif strand == "-":
- tx_start = max(sorted(master_dict[tran]["cds"]))[-1]
- prev_end = tx_start
- for tup in (sorted(master_dict[tran]["cds"])[1:])[::-1]:
- total_intronic += (tup[0]+1)-prev_end
- exon_junctions.append(((tup[1])-tx_start)-total_intronic)
- prev_end = tup[1]
- if strand == "+" and cds_start != "NULL":
- master_dict[tran]["cds_start"] = cds_start
- master_dict[tran]["cds_stop"] = cds_stop
- elif strand == "-" and cds_start != "NULL":
- master_dict[tran]["cds_start"] = cds_start
- master_dict[tran]["cds_stop"] = cds_stop
- master_dict[tran]["strand"] = strand
- master_dict[tran]["tran_type"] = tran_type
- master_dict[tran]["exon_junctions"] = exon_junctions
-
-longest_tran_dict = {}
-for tran in master_dict:
- try:
- gene = master_dict[tran]["gene_name"]
- except:
- continue
- if coding_genes_dict[gene] == True:
- if "cds_start" in master_dict[tran]:
- if master_dict[tran]["cds_stop"] != "NULL" and master_dict[tran]["cds_start"] != "NULL":
- cds_length = master_dict[tran]["cds_stop"]- master_dict[tran]["cds_start"]
- if gene not in longest_tran_dict:
- longest_tran_dict[gene] = {"tran":tran,"length":cds_length}
- else:
- if cds_length > longest_tran_dict[gene]["length"]:
- longest_tran_dict[gene] = {"tran":tran,"length":cds_length}
- if cds_length == longest_tran_dict[gene]["length"]:
- if master_dict[tran]["length"] > master_dict[longest_tran_dict[gene]["tran"]]["length"]:
- longest_tran_dict[gene] = {"tran":tran,"length":cds_length}
- else:
- length = master_dict[tran]["length"]
- if gene not in longest_tran_dict:
- longest_tran_dict[gene] = {"tran":tran,"length":length}
- elif length > longest_tran_dict[gene]["length"]:
- longest_tran_dict[gene] = {"tran":tran,"length":length}
-
-
-
-
-
-for gene in longest_tran_dict:
- longest_tran = longest_tran_dict[gene]["tran"]
- master_dict[longest_tran]["principal"] = 1
-
-gene_sample = []
-for key in list(master_dict)[:10]:
- try:
- gene_sample.append(master_dict[key]["gene_name"])
- except:
- pass
-
-print ("Here is a sample of the transcript ids: {}".format(list(master_dict)[:10]))
-print ("Here is a sample of the gene names: {}".format(gene_sample))
-
-
-# Takes a transcript, transcriptomic position and a master_dict (see ribopipe scripts) and returns the genomic position, positions should be passed 1 at a time.
-def tran_to_genome(tran, start_pos, end_pos, master_dict):
- pos_list = []
- for i in range(start_pos,end_pos+1):
- pos_list.append(i)
- genomic_pos_list = []
- if tran in master_dict:
- transcript_info = master_dict[tran]
- else:
- return ("Null", [])
-
- chrom = transcript_info["chrom"]
- strand = transcript_info["strand"]
- exons = transcript_info["exon"]
- #print ("chrom,strand,exons",chrom,strand,exons)
- for pos in pos_list:
- #print ("pos",pos)
- if strand == "+":
- exon_start = 0
- for tup in exons:
- #print ("tup",tup)
- exon_start = tup[0]
- exonlen = tup[1] - tup[0]
- if pos > exonlen:
- pos = (pos - exonlen)-1
- else:
- break
- #print ("appending exon_start-pos", exon_start, pos, exon_start+pos)
- genomic_pos_list.append((exon_start+pos)-1)
- elif strand == "-":
- exon_start = 0
- for tup in exons[::-1]:
- #print ("tup",tup)
- exon_start = tup[1]
- exonlen = tup[1] - tup[0]
- #print ("exonlen",exonlen)
- if pos > exonlen:
- #print ("pos is greater")
- pos = (pos - exonlen)-1
- #print ("new pos",pos)
- else:
- break
- #print ("appending exon_start-pos", exon_start, pos, exon_start-pos)
- genomic_pos_list.append((exon_start-pos)+1)
- return (chrom, genomic_pos_list)
-
-
-
-
-orf_dict = {"uorf":{},
- "ouorf":{},
- "cds":{},
- "nested":{},
- "odorf":{},
- "dorf":{},
- "minusone":{},
- "readthrough":{},
- "plusone":{},
- "noncoding":{},
- "extension":{},
- "inframe_stop":{}
- }
-
-start_codons = ["ATG","GTG","CTG"]
-stop_codons = ["TAG","TAA","TGA"]
-
-
-# Keep track of the longest transcript for each noncoding gene, append this to transcript list later
-longest_noncoding = {}
-
-
-tran_count = 0
-# This section is used to gather all cds regions, convert them to genomic regions and store them in a dictionary to check against later (all transcript contribute to this not just those
-# in the transcript list)
-genomic_cds_dict = {}
-tree_dict = {}
-for transcript in master_dict:
- #print (transcript, master_dict[transcript]["tran_type"])
- tran_count += 1
- if "seq" not in master_dict[transcript]:
- continue
- chrom = master_dict[transcript]["chrom"]
- if chrom not in genomic_cds_dict:
- genomic_cds_dict[chrom] = []
- if "cds_start" in master_dict[transcript]:
- cds_start = master_dict[transcript]["cds_start"]
- cds_stop = master_dict[transcript]["cds_stop"]
- if cds_start != "NULL":
- cds_pos = []
- for i in range(cds_start, cds_stop+1):
- cds_pos.append(i)
-
- for tup in master_dict[transcript]["cds"]:
- if tup[0] != tup[1]:
- if tup not in genomic_cds_dict[chrom]:
- genomic_cds_dict[chrom].append(tup)
-
-print ("genomic cds dict built")
-print (list(genomic_cds_dict))
-for chrom in genomic_cds_dict:
- tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
-
-#print (list(tree_dict))
-
-
-connection = sqlite3.connect("{}".format(sys.argv[6]))
-cursor = connection.cursor()
-cursor.execute("CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS coding_regions (transcript VARCHAR(50), coding_start INT(6), coding_stop INT(6));")
-cursor.execute("CREATE TABLE IF NOT EXISTS exons (transcript VARCHAR(50), exon_start INT(6), exon_stop INT(6));")
-cursor.execute("CREATE TABLE IF NOT EXISTS uorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS ouorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS cds (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS nested (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS odorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS dorf (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS minusone(transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS readthrough (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS plusone (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS noncoding (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS extension (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-cursor.execute("CREATE TABLE IF NOT EXISTS inframe_stop (transcript VARCHAR(300), start_codon VARCHAR(10), length INT(6), start INT(6), stop INT(6), sequence VARCHAR(50000), cds_coverage FLOAT(20));")
-connection.commit();
-
-
-print ("Finding ORFs")
-transcript_count = 0
-total_transcripts = len(list(master_dict))
-for transcript in master_dict:
- #print ("transcript",transcript)
- #if transcript != "ENST00000316448":
- # continue
- transcript_count += 1
- if transcript_count%100 == 0:
- print ("Transcripts complete: {}/{}".format(transcript_count,total_transcripts))
- if "seq" not in master_dict[transcript]:
- print ("transcript {} has no sequence".format(transcript))
- continue
- seq = master_dict[transcript]["seq"]
- cds_start = "NULL"
- cds_stop = "NULL"
- transcript_len = len(seq)
- if "cds_start" in master_dict[transcript]:
- cds_start = master_dict[transcript]["cds_start"]
- cds_stop = master_dict[transcript]["cds_stop"]
-
- #Find out what regions of this transcript overlap with any other coding regions
- coding_positions = []
- if cds_start != "NULL":
- #If this is a coding transcript don't bother checking the CDS
- for i in range(cds_start,cds_stop):
- coding_positions.append(i)
- #check 5' leader
- chrom, pos_list = tran_to_genome(transcript, 0, cds_start, master_dict)
- for i in range(0,cds_start):
- genomic_pos = pos_list[i]
- overlap = tree_dict[chrom][genomic_pos]
- if len(overlap) != 0:
- coding_positions.append(i)
- #check 3' trailer
- chrom, pos_list = tran_to_genome(transcript, cds_stop, transcript_len, master_dict)
- for i in range(cds_stop,transcript_len+1):
- #print ("i",i)
- genomic_pos = pos_list[i-cds_stop]
- #print ("genomic position",genomic_pos)
- overlap = tree_dict[chrom][genomic_pos]
- if len(overlap) != 0:
- #print ("overlap not empty appending i",overlap)
- coding_positions.append(i)
- else:
- #check entire transcript
- chrom, pos_list = tran_to_genome(transcript, 0, transcript_len, master_dict)
- for i in range(0,transcript_len):
- genomic_pos = pos_list[i]
- overlap = tree_dict[chrom][genomic_pos]
- if len(overlap) != 0:
- coding_positions.append(i)
- coding_positions_tuple = to_ranges(coding_positions)
- coding_dict[transcript] = coding_positions_tuple
- coding_positions = set(coding_positions)
- #if this is a coding transcript find the minusone, readhtrough, plusone co-ordinates
- if cds_start != "NULL":
- if pseudo_utr_len != 0:
- cds_stop -= 3 # take 3 from stop so we can match it with orf_stop, do it here rather than above in case cds_stop is null
- recoding_dict = {0:"minusone",1:"readthrough",2:"plusone"}
- for addition in recoding_dict:
- orftype = recoding_dict[addition]
- for i in range(cds_stop+addition,transcript_len,3):
- if seq[i:i+3] in stop_codons:
- orf_seq = seq[cds_stop:i+3]
- orf_start = cds_stop
- orf_stop = i+2 # +2 so it refers to the end of the stop codon
- start_codon = None
- if orf_seq:
- length = len(orf_seq)
- orf_pos_list = []
- #determine how many nucleotides in this orf overlap with an annotated coding region
- cds_cov_count = 0.0
- for position in range(orf_start,orf_stop):
- orf_pos_list.append(position)
- for pos in range(orf_start, orf_stop+1):
- if pos in coding_positions:
- cds_cov_count += 1
- cds_cov = cds_cov_count/length
- cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov))
- break
- for frame in [0,1,2]:
- for i in range(frame,transcript_len,3):
- if seq[i:i+3] in start_codons:
- for x in range(i, transcript_len,3):
- if seq[x:x+3] in stop_codons:
- orf_seq = seq[i:x+3]
- orf_start = i
- orf_stop = x+2 # +2 so it refers to the end of the stop codon
- start_codon = seq[i:i+3]
- length = len(orf_seq)
- orf_pos_list = []
- #determine how many nucleotides in this orf overlap with an annotated coding region
- cds_cov_count = 0.0
- for pos in range(orf_start, orf_stop+1):
- if pos in coding_positions:
- cds_cov_count += 1
- cds_cov = float(cds_cov_count)/float(length)
- # Now determine orf type
- if cds_start == "NULL":
- orftype = "noncoding"
- else:
- #print ("cds start is not null :{}:".format(cds_start))
- if orf_start == cds_start and orf_stop == cds_stop:
- orftype = "cds"
- elif orf_start < cds_start and orf_stop == cds_stop:
- orftype = "extension"
- #special case for extensions, we only take from the orf_start to the cds_start, and re-calculate cds coverage
- orf_stop = cds_start
- cds_cov_count = 0.0
- for pos in range(orf_start, orf_stop+1):
- if pos in coding_positions:
- cds_cov_count += 1
- cds_cov = float(cds_cov_count)/float(length)
- orf_seq = seq[orf_start:cds_start]
- length = len(orf_seq)
- elif orf_start < cds_start and orf_stop <= cds_start:
- orftype = "uorf"
- elif orf_start < cds_start and orf_stop > cds_start:
- orftype = "ouorf"
- elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop <= cds_stop:
- if orf_stop == cds_stop:
- break
- orftype = "nested"
- elif orf_start >= cds_start and orf_start <= cds_stop and orf_stop > cds_stop:
- orftype = "odorf"
- elif orf_start > cds_stop and orf_stop > cds_stop:
- orftype = "dorf"
- if orf_stop > cds_start and orf_stop < cds_stop:
- if (orf_stop+1)%3 == cds_start%3:
- orftype = "inframe_stop"
- if transcript not in orf_dict:
- orf_dict[orftype][transcript] = []
- cursor.execute("INSERT INTO {} VALUES('{}','{}',{},{},{},'{}',{});".format(orftype, transcript, start_codon, length,orf_start,orf_stop,orf_seq,cds_cov))
- break
-# Used to keep track of the codons at cds_start and cds_stop positions,
-# If there is an issue with the cds co-ordinates the starts and stops counts will
-# be much lower than the other count, start with 1 to prevent division by 0
-nuc_dict = {"stops":{"stops":1,"other":0}, "starts":{"starts":1,"other":0}}
-
-def calcgc(seq):
- if seq == "":
- return "NULL"
- g_count = 0
- c_count = 0
- a_count = 0
- t_count = 0
- for char in seq:
- if char == "A":
- a_count += 1
- if char == "T":
- t_count += 1
- if char == "G":
- g_count += 1
- if char == "C":
- c_count += 1
- gc = ((g_count+c_count)/float(len(seq)))*100
- return round(gc,2)
-
-
-
-
-for transcript in master_dict:
- #print ("transcripts", transcript)
- length = master_dict[transcript]["length"]
- cds_start = master_dict[transcript]["cds_start"]
- cds_stop = master_dict[transcript]["cds_stop"]
- seq = master_dict[transcript]["seq"]
- strand = master_dict[transcript]["strand"]
- chrom = master_dict[transcript]["chrom"]
- gene = master_dict[transcript]["gene_name"]
- gc = calcgc(seq)
- five_gc = "NULL"
- cds_gc = "NULL"
- three_gc = "NULL"
- if cds_start != "NULL":
- five_gc = calcgc(seq[:cds_start])
- cds_gc = calcgc(seq[cds_start:cds_stop])
- three_gc = calcgc(seq[cds_stop:])
- # check that the nucleotide cds_start points to is the first of the start codon
- # take one becase cds_start is 1 based but python indexing is 0 based
- start_nuc = seq[cds_start-1:cds_start+2]
- #print ("start nuc",start_nuc)
- if start_nuc == "ATG":
- nuc_dict["starts"]["starts"] += 1
- else:
- nuc_dict["starts"]["other"] += 1
- stop_nuc = seq[cds_stop-3:cds_stop]
- #print ("stop_nuc",stop_nuc)
- if stop_nuc in ["TAG","TAA","TGA"]:
- nuc_dict["stops"]["stops"] += 1
- else:
- nuc_dict["stops"]["other"] += 1
- tran_type = master_dict[transcript]["tran_type"]
- if coding_genes_dict[gene] == True:
- gene_type = 1
- else:
- gene_type = 0
- #print ("tran type before",tran_type)
- if tran_type == "coding":
- tran_type = 1
- else:
- tran_type = 0
- #print ("tran type after",tran_type)
- start_list = str(master_dict[transcript]["start_list"]).replace(" ","").strip("[]")
- stop_list = str(master_dict[transcript]["stop_list"]).replace(" ","").strip("[]")
- exon_junctions = str(master_dict[transcript]["exon_junctions"]).replace(" ","").strip("[]")
- principal = master_dict[transcript]["principal"]
- version = master_dict[transcript]["version"]
- #print (master_dict[transcript])
- #print (tran_type)
- #print (gene_type)
- #print (principal)
- #print (version)
- #print ("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{});".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version))
- cursor.execute("INSERT INTO transcripts VALUES('{}','{}',{},{},{},'{}','{}','{}','{}','{}',{},{},{},{},{},{},{},{},'{}');".format(transcript, gene, length, cds_start, cds_stop, seq, strand,stop_list, start_list, exon_junctions, tran_type,gene_type,principal,version,gc,five_gc,cds_gc,three_gc,chrom))
-
- for tup in master_dict[transcript]["exon"]:
- cursor.execute("INSERT INTO exons VALUES('{}',{},{});".format(transcript,tup[0],tup[1]))
- if transcript in coding_dict:
- for tup in coding_dict[transcript]:
- cursor.execute("INSERT INTO coding_regions VALUES('{}',{},{});".format(transcript,tup[0],tup[1]))
-
-connection.commit()
-connection.close()
-
-if (nuc_dict["starts"]["other"]/nuc_dict["starts"]["starts"]) > 0.05:
- print ("Warning: {} transcripts do not have a an AUG at the CDS start position".format(nuc_dict["starts"]["other"]))
-if (nuc_dict["stops"]["other"]/nuc_dict["stops"]["stops"]) > 0.05:
- print ("Warning: {} transcripts do not have a an stop codon at the CDS stop position".format(nuc_dict["stops"]["other"]))
-if len(notinannotation) >0:
- print ("Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(len(notinannotation)))
diff -r c5a566609a25 -r 68e8704d9484 trips_create_new_organism/create_annotation_sqlite.xml
--- a/trips_create_new_organism/create_annotation_sqlite.xml Fri Feb 25 11:24:50 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,84 +0,0 @@
-
-
- intervaltree
- sqlite
- pysqlite3
- python-intervaltree
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-@misc{githubTrips-Viz,
- author = {LastTODO, FirstTODO},
- year = {TODO},
- title = {Trips-Viz},
- publisher = {GitHub},
- journal = {GitHub repository},
- url = {https://github.com/skiniry/Trips-Viz},
-}
-
-
\ No newline at end of file
diff -r c5a566609a25 -r 68e8704d9484 trips_create_new_organism/create_annotation_sqlite_.xml
--- a/trips_create_new_organism/create_annotation_sqlite_.xml Fri Feb 25 11:24:50 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,71 +0,0 @@
-
- create new organism
- create_annotation_sqlite.py $annotation $fasta $pseudo_utr_len $transcript $gene $output
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**GTF/GFF3 File**
-
-GFF lines have nine required fields that must be tab-separated.
-The GFF3 format addresses the most common extensions to GFF, while preserving backward compatibility with previous formats.
-
-Both transcript ids and gene names should be listed in the file.
-
------
-
-**Transcriptome FASTA file**
-
-A FASTA file with an entry for every transcript. The headers should be the transcript id's as they appear in the GTF/GFF3 file.
-
------
-
-**Psuedo UTR length**
-An integer representing the length (in nucleotides) to be added to the 5' end and 3' end of every transcript with an annotated
-CDS. Useful for when an organism does not have any annotated UTR's, if it does use 0. If not 0, the extra nucleotides should
-already be present in the FASTA file.
-
------
-
-**Example transcript**
-
-An example of a transcript id that appears in the FASTA/GTF/GFF3 file, e.g ENST00000123456
-
------
-
-**Example Gene**
-
-An example of a gene name as it appears in the GTF/GFF3 file, e.g BRCA1
-
------
-
-**Output**
-
-The output of the script can be downloaded and uploaded to Trips-viz_. by signing in and going to the uploads page, then selecting
-"Upload new transcriptome". When uploaded the new organism will appear on the home page of Trips-viz, or under the transcriptomes
-page if the organism name used is already present on Trips-viz.
-
-
-
-.. _Trips-viz: http://trips.ucc.ie
-
-
-
-