comparison trips_create_annotation/create_annotation_sqlite.py @ 3:f1c72ed4b32c draft

Uploaded
author jackcurragh
date Wed, 20 Apr 2022 15:18:02 +0000
parents d70696d3341e
children cdecd5f9a4d3
comparison
Legend: equal, deleted, inserted, replaced
2:d70696d3341e 3:f1c72ed4b32c
4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon 4 # All start codon positions (including cds_start) should be at the first nucleotide of the codon
5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon 5 # All stop codon positions (including cds_stop) should be at the last nucleotide of the codon
6 import sys 6 import sys
7 import re 7 import re
8 import sqlite3 8 import sqlite3
9 import subprocess
10
9 from intervaltree import Interval, IntervalTree 11 from intervaltree import Interval, IntervalTree
10 import itertools 12 import itertools
11 from sqlitedict import SqliteDict 13 from sqlitedict import SqliteDict
12 import os 14 import os
13 15
22 user_transcript_id = sys.argv[5] 24 user_transcript_id = sys.argv[5]
23 # An example of a gene name from the annotation file 25 # An example of a gene name from the annotation file
24 user_gene_name = sys.argv[6] 26 user_gene_name = sys.argv[6]
25 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1 27 # Set to true if transcript version is included in transcript_id, e.g: ENST000000123456.1
26 TRAN_VERSION = True 28 TRAN_VERSION = True
29 output = sys.argv[7]
27 30
28 31
29 if os.path.isfile("{}.sqlite".format(organism)): 32 if os.path.isfile("{}.sqlite".format(organism)):
30 print("{}.sqlite already exists".format(organism)) 33 print("{}.sqlite already exists".format(organism))
31 sys.exit() 34 sys.exit()
608 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom]) 611 tree_dict[chrom] = IntervalTree.from_tuples(genomic_cds_dict[chrom])
609 612
610 # print (list(tree_dict)) 613 # print (list(tree_dict))
611 614
612 615
613 connection = sqlite3.connect("{}.sqlite".format(organism)) 616 connection = sqlite3.connect(output)
614 cursor = connection.cursor() 617 cursor = connection.cursor()
615 cursor.execute( 618 cursor.execute(
616 "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));" 619 "CREATE TABLE IF NOT EXISTS transcripts (transcript VARCHAR(50), gene VARCHAR(50), length INT(6), cds_start INT(6), cds_stop INT(6), sequence VARCHAR(50000), strand CHAR(1), stop_list VARCHAR(10000), start_list VARCHAR(10000), exon_junctions VARCHAR(1000), tran_type INT(1), gene_type INT(1), principal INT(1), version INT(2),gc INT(3),five_gc INT(3), cds_gc INT(3), three_gc INT(3), chrom VARCHAR(20));"
617 ) 620 )
618 cursor.execute( 621 cursor.execute(
967 cursor.execute( 970 cursor.execute(
968 "INSERT INTO coding_regions VALUES('{}',{},{});".format( 971 "INSERT INTO coding_regions VALUES('{}',{},{});".format(
969 transcript, tup[0], tup[1] 972 transcript, tup[0], tup[1]
970 ) 973 )
971 ) 974 )
972 975 # print(cursor.execute(
973 connection.commit() 976 # ".tables"
974 connection.close() 977 # ))
975 978
976 print("delim", delimiters) 979 print("delim", delimiters)
977 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05: 980 if (nuc_dict["starts"]["other"] / nuc_dict["starts"]["starts"]) > 0.05:
978 print( 981 print(
979 "Warning: {} transcripts do not have a an AUG at the CDS start position".format( 982 "Warning: {} transcripts do not have a an AUG at the CDS start position".format(
990 print( 993 print(
991 "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format( 994 "Warning: {} transcripts were in the fasta file, but not the annotation file, these will be discarded".format(
992 len(notinannotation) 995 len(notinannotation)
993 ) 996 )
994 ) 997 )
998
999
1000
1001 connection.commit()
1002 connection.close()
1003
1004