pep_pointer: pep_pointer.py comparison

comparison pep_pointer.py @ 0:149ed6a9680f draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103

author	galaxyp
date	Fri, 29 Dec 2017 12:37:22 -0500
parents
children	073a2965e3b2

comparison

equal deleted inserted replaced

--1:000000000000
+:149ed6a9680f
+#
+# Author: Praveen Kumar
+# Updated: Nov 8th, 2017
+#
+#
+#
+import re
+def main():
+import sys
+if len(sys.argv) == 4:
+inputFile = sys.argv
+infh = open(inputFile[1], "r")
+# infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")
+gtf = {}
+gtf_transcript = {}
+gtf_gene = {}
+for each in infh.readlines():
+a = each.split("\t")
+if re.search("^[^#]", each):
+if re.search("gene_biotype \"protein_coding\"", a[8]) and int(a[4].strip()) != int(a[3].strip()):
+type = a[2].strip()
+if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr":
+chr = "chr" + a[0].strip()
+strand = a[6].strip()
+if strand == "+":
+start = a[3].strip()
+end = a[4].strip()
+elif strand == "-":
+if int(a[4].strip()) > int(a[3].strip()):
+start = a[3].strip()
+end = a[4].strip()
+elif int(a[4].strip()) < int(a[3].strip()):
+start = a[4].strip()
+end = a[3].strip()
+else:
+print "Something fishy in start end coordinates"
+else:
+print "Something fishy in reading"
+if not gtf.has_key(strand):
+gtf[strand] = {}
+if not gtf[strand].has_key(type):
+gtf[strand][type] = []
+b = re.search("gene_id \"(.+?)\";", a[8].strip())
+gene = b.group(1)
+if type == "gene":
+transcript = ""
+else:
+b = re.search("transcript_id \"(.+?)\";", a[8].strip())
+transcript = b.group(1)
+data = (chr, start, end, gene, transcript, strand, type)
+gtf[strand][type].append(data)
+if type == "exon":
+if gtf_transcript.has_key(chr+"#"+strand):
+if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene):
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
+else:
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
+else:
+gtf_transcript[chr+"#"+strand] = {}
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
+gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
+if type == "gene":
+if gtf_gene.has_key(chr+"#"+strand):
+gtf_gene[chr+"#"+strand][0].append(int(start))
+gtf_gene[chr+"#"+strand][1].append(int(end))
+gtf_gene[chr+"#"+strand][2].append(gene)
+else:
+gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]]
+gtf_gene[chr+"#"+strand][0].append(int(start))
+gtf_gene[chr+"#"+strand][1].append(int(end))
+gtf_gene[chr+"#"+strand][2].append(gene)
+# "Starting Reading Intron . . ."
+gtf["+"]["intron"] = []
+gtf["-"]["intron"] = []
+for chr_strand  in gtf_transcript.keys():
+chr = chr_strand.split("#")[0]
+strand = chr_strand.split("#")[1]
+for transcript_gene in gtf_transcript[chr_strand].keys():
+start_list = gtf_transcript[chr_strand][transcript_gene][0]
+end_list = gtf_transcript[chr_strand][transcript_gene][1]
+sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
+sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
+if sorted_start_index == sorted_end_index:
+sorted_start = sorted(start_list)
+sorted_end = [end_list[i] for i in sorted_start_index]
+for x in range(len(sorted_start))[1:]:
+intron_start = sorted_end[x-1]+1
+intron_end = sorted_start[x]-1
+transcript = transcript_gene.split("#")[0]
+gene = transcript_gene.split("#")[1]
+data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron")
+gtf[strand]["intron"].append(data)
+# "Starting Reading Intergenic . . ."
+gtf["+"]["intergenic"] = []
+gtf["-"]["intergenic"] = []
+for chr_strand  in gtf_gene.keys():
+chr = chr_strand.split("#")[0]
+strand = chr_strand.split("#")[1]
+start_list = gtf_gene[chr_strand][0]
+end_list = gtf_gene[chr_strand][1]
+gene_list = gtf_gene[chr_strand][2]
+sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
+sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
+sorted_start = sorted(start_list)
+sorted_end = [end_list[i] for i in sorted_start_index]
+sorted_gene = [gene_list[i] for i in sorted_start_index]
+for x in range(len(sorted_start))[1:]:
+intergene_start = sorted_end[x-1]+1
+intergene_end = sorted_start[x]-1
+if intergene_start < intergene_end:
+intergene_1 = sorted_gene[x-1]
+intergene_2 = sorted_gene[x]
+gene = intergene_1 + "-#-" + intergene_2
+data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic")
+gtf[strand]["intergenic"].append(data)
+import sqlite3
+# conn = sqlite3.connect('gtf_database.db')
+conn = sqlite3.connect(":memory:")
+c = conn.cursor()
+# c.execute("DROP TABLE IF EXISTS gtf_data;")
+# c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
+c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
+for strand in gtf.keys():
+if strand == "+":
+st = "positive"
+elif strand == "-":
+st = "negative"
+else:
+print "Something fishy in writing . . ."
+for type in gtf[strand].keys():
+data = gtf[strand][type]
+c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data)
+conn.commit()
+infh = open(inputFile[2], "r")
+# infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")
+data = infh.readlines()
+# output file
+outfh = open(inputFile[3], 'w')
+# outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")
+for each in data:
+a = each.split("\t")
+chr = a[0].strip()
+pep_start = a[1].strip()
+pep_end = a[2].strip()
+strand = a[5].strip()
+c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\tCDS\n")
+else:
+c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\tfive_prime_utr\n")
+else:
+c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\tthree_prime_utr\n")
+else:
+c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\texon\n")
+else:
+c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\tintron\n")
+else:
+c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\tgene\n")
+else:
+c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+rows = c.fetchall()
+if len(rows) > 0:
+outfh.write(each.strip() + "\tintergene\n")
+else:
+outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n")
+conn.close()
+outfh.close()
+else:
+print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"
+return None
+if __name__ == "__main__":
+main()

Mercurial > repos > galaxyp > pep_pointer

comparison pep_pointer.py @ 0:149ed6a9680f draft