Mercurial > repos > pravs > pep_pointer
view pep_pointer.py @ 2:0c67b75c7943 draft
planemo upload
author | pravs |
---|---|
date | Thu, 28 Dec 2017 17:39:08 -0500 |
parents | 032f1ee171ed |
children |
line wrap: on
line source
# # Author: Praveen Kumar # Updated: Nov 8th, 2017 # # # import re def main(): import sys if len(sys.argv) == 4: inputFile = sys.argv infh = open(inputFile[1], "r") # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") gtf = {} gtf_transcript = {} gtf_gene = {} for each in infh.readlines(): a = each.split("\t") if re.search("^[^#]", each): if re.search("gene_biotype \"protein_coding\"", a[8]) and int(a[4].strip()) != int(a[3].strip()): type = a[2].strip() if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr": chr = "chr" + a[0].strip() strand = a[6].strip() if strand == "+": start = a[3].strip() end = a[4].strip() elif strand == "-": if int(a[4].strip()) > int(a[3].strip()): start = a[3].strip() end = a[4].strip() elif int(a[4].strip()) < int(a[3].strip()): start = a[4].strip() end = a[3].strip() else: print "Something fishy in start end coordinates" else: print "Something fishy in reading" if not gtf.has_key(strand): gtf[strand] = {} if not gtf[strand].has_key(type): gtf[strand][type] = [] b = re.search("gene_id \"(.+?)\";", a[8].strip()) gene = b.group(1) if type == "gene": transcript = "" else: b = re.search("transcript_id \"(.+?)\";", a[8].strip()) transcript = b.group(1) data = (chr, start, end, gene, transcript, strand, type) gtf[strand][type].append(data) if type == "exon": if gtf_transcript.has_key(chr+"#"+strand): if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) else: gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) else: gtf_transcript[chr+"#"+strand] = {} gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) if type == "gene": if gtf_gene.has_key(chr+"#"+strand): gtf_gene[chr+"#"+strand][0].append(int(start)) gtf_gene[chr+"#"+strand][1].append(int(end)) gtf_gene[chr+"#"+strand][2].append(gene) else: gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] gtf_gene[chr+"#"+strand][0].append(int(start)) gtf_gene[chr+"#"+strand][1].append(int(end)) gtf_gene[chr+"#"+strand][2].append(gene) # "Starting Reading Intron . . ." gtf["+"]["intron"] = [] gtf["-"]["intron"] = [] for chr_strand in gtf_transcript.keys(): chr = chr_strand.split("#")[0] strand = chr_strand.split("#")[1] for transcript_gene in gtf_transcript[chr_strand].keys(): start_list = gtf_transcript[chr_strand][transcript_gene][0] end_list = gtf_transcript[chr_strand][transcript_gene][1] sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] if sorted_start_index == sorted_end_index: sorted_start = sorted(start_list) sorted_end = [end_list[i] for i in sorted_start_index] for x in range(len(sorted_start))[1:]: intron_start = sorted_end[x-1]+1 intron_end = sorted_start[x]-1 transcript = transcript_gene.split("#")[0] gene = transcript_gene.split("#")[1] data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") gtf[strand]["intron"].append(data) # "Starting Reading Intergenic . . ." gtf["+"]["intergenic"] = [] gtf["-"]["intergenic"] = [] for chr_strand in gtf_gene.keys(): chr = chr_strand.split("#")[0] strand = chr_strand.split("#")[1] start_list = gtf_gene[chr_strand][0] end_list = gtf_gene[chr_strand][1] gene_list = gtf_gene[chr_strand][2] sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] sorted_start = sorted(start_list) sorted_end = [end_list[i] for i in sorted_start_index] sorted_gene = [gene_list[i] for i in sorted_start_index] for x in range(len(sorted_start))[1:]: intergene_start = sorted_end[x-1]+1 intergene_end = sorted_start[x]-1 if intergene_start < intergene_end: intergene_1 = sorted_gene[x-1] intergene_2 = sorted_gene[x] gene = intergene_1 + "-#-" + intergene_2 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") gtf[strand]["intergenic"].append(data) import sqlite3 # conn = sqlite3.connect('gtf_database.db') conn = sqlite3.connect(":memory:") c = conn.cursor() # c.execute("DROP TABLE IF EXISTS gtf_data;") # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") for strand in gtf.keys(): if strand == "+": st = "positive" elif strand == "-": st = "negative" else: print "Something fishy in writing . . ." for type in gtf[strand].keys(): data = gtf[strand][type] c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) conn.commit() infh = open(inputFile[2], "r") # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") data = infh.readlines() # output file outfh = open(inputFile[3], 'w') # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") for each in data: a = each.split("\t") chr = a[0].strip() pep_start = a[1].strip() pep_end = a[2].strip() strand = a[5].strip() c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tCDS\n") else: c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tfive_prime_utr\n") else: c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tthree_prime_utr\n") else: c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\texon\n") else: c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tintron\n") else: c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tgene\n") else: c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tintergene\n") else: outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") conn.close() outfh.close() else: print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" return None if __name__ == "__main__": main()