Mercurial > repos > galaxyp > pep_pointer
diff pep_pointer.py @ 0:149ed6a9680f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103
author | galaxyp |
---|---|
date | Fri, 29 Dec 2017 12:37:22 -0500 |
parents | |
children | 073a2965e3b2 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pep_pointer.py Fri Dec 29 12:37:22 2017 -0500 @@ -0,0 +1,221 @@ + +# +# Author: Praveen Kumar +# Updated: Nov 8th, 2017 +# +# +# + +import re + + +def main(): + import sys + if len(sys.argv) == 4: + inputFile = sys.argv + infh = open(inputFile[1], "r") + # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") + + gtf = {} + gtf_transcript = {} + gtf_gene = {} + for each in infh.readlines(): + a = each.split("\t") + if re.search("^[^#]", each): + if re.search("gene_biotype \"protein_coding\"", a[8]) and int(a[4].strip()) != int(a[3].strip()): + type = a[2].strip() + if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr": + chr = "chr" + a[0].strip() + strand = a[6].strip() + if strand == "+": + start = a[3].strip() + end = a[4].strip() + elif strand == "-": + if int(a[4].strip()) > int(a[3].strip()): + start = a[3].strip() + end = a[4].strip() + elif int(a[4].strip()) < int(a[3].strip()): + start = a[4].strip() + end = a[3].strip() + else: + print "Something fishy in start end coordinates" + else: + print "Something fishy in reading" + if not gtf.has_key(strand): + gtf[strand] = {} + if not gtf[strand].has_key(type): + gtf[strand][type] = [] + b = re.search("gene_id \"(.+?)\";", a[8].strip()) + gene = b.group(1) + if type == "gene": + transcript = "" + else: + b = re.search("transcript_id \"(.+?)\";", a[8].strip()) + transcript = b.group(1) + data = (chr, start, end, gene, transcript, strand, type) + gtf[strand][type].append(data) + + if type == "exon": + if gtf_transcript.has_key(chr+"#"+strand): + if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): + gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) + gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) + else: + gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] + gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) + gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) + else: + gtf_transcript[chr+"#"+strand] = {} + gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] + gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) + gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) + + if type == "gene": + if gtf_gene.has_key(chr+"#"+strand): + gtf_gene[chr+"#"+strand][0].append(int(start)) + gtf_gene[chr+"#"+strand][1].append(int(end)) + gtf_gene[chr+"#"+strand][2].append(gene) + else: + gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] + gtf_gene[chr+"#"+strand][0].append(int(start)) + gtf_gene[chr+"#"+strand][1].append(int(end)) + gtf_gene[chr+"#"+strand][2].append(gene) + + + + # "Starting Reading Intron . . ." + + gtf["+"]["intron"] = [] + gtf["-"]["intron"] = [] + for chr_strand in gtf_transcript.keys(): + chr = chr_strand.split("#")[0] + strand = chr_strand.split("#")[1] + + for transcript_gene in gtf_transcript[chr_strand].keys(): + start_list = gtf_transcript[chr_strand][transcript_gene][0] + end_list = gtf_transcript[chr_strand][transcript_gene][1] + sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] + sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] + if sorted_start_index == sorted_end_index: + sorted_start = sorted(start_list) + sorted_end = [end_list[i] for i in sorted_start_index] + for x in range(len(sorted_start))[1:]: + intron_start = sorted_end[x-1]+1 + intron_end = sorted_start[x]-1 + transcript = transcript_gene.split("#")[0] + gene = transcript_gene.split("#")[1] + data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") + gtf[strand]["intron"].append(data) + + + # "Starting Reading Intergenic . . ." + + gtf["+"]["intergenic"] = [] + gtf["-"]["intergenic"] = [] + for chr_strand in gtf_gene.keys(): + chr = chr_strand.split("#")[0] + strand = chr_strand.split("#")[1] + start_list = gtf_gene[chr_strand][0] + end_list = gtf_gene[chr_strand][1] + gene_list = gtf_gene[chr_strand][2] + sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] + sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] + + sorted_start = sorted(start_list) + sorted_end = [end_list[i] for i in sorted_start_index] + sorted_gene = [gene_list[i] for i in sorted_start_index] + for x in range(len(sorted_start))[1:]: + intergene_start = sorted_end[x-1]+1 + intergene_end = sorted_start[x]-1 + if intergene_start < intergene_end: + intergene_1 = sorted_gene[x-1] + intergene_2 = sorted_gene[x] + gene = intergene_1 + "-#-" + intergene_2 + data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") + gtf[strand]["intergenic"].append(data) + + import sqlite3 + # conn = sqlite3.connect('gtf_database.db') + conn = sqlite3.connect(":memory:") + c = conn.cursor() + # c.execute("DROP TABLE IF EXISTS gtf_data;") + # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") + c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") + + for strand in gtf.keys(): + if strand == "+": + st = "positive" + elif strand == "-": + st = "negative" + else: + print "Something fishy in writing . . ." + + for type in gtf[strand].keys(): + data = gtf[strand][type] + c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) + + conn.commit() + + infh = open(inputFile[2], "r") + # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") + data = infh.readlines() + # output file + outfh = open(inputFile[3], 'w') + # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") + + for each in data: + a = each.split("\t") + chr = a[0].strip() + pep_start = a[1].strip() + pep_end = a[2].strip() + strand = a[5].strip() + c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\tCDS\n") + else: + c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\tfive_prime_utr\n") + else: + c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\tthree_prime_utr\n") + else: + c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\texon\n") + else: + c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\tintron\n") + else: + c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\tgene\n") + else: + c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + rows = c.fetchall() + if len(rows) > 0: + outfh.write(each.strip() + "\tintergene\n") + else: + outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") + + conn.close() + outfh.close() + else: + print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" + return None + +if __name__ == "__main__": + main() + + + + +