Mercurial > repos > galaxyp > pep_pointer
diff pep_pointer.py @ 3:a6282baa8c6f draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit 494bc6dd87b9a6e2af40cb32aa5d2ee6e9bfebfc
author | galaxyp |
---|---|
date | Mon, 20 Jun 2022 13:59:52 +0000 |
parents | 073a2965e3b2 |
children |
line wrap: on
line diff
--- a/pep_pointer.py Fri Apr 06 18:13:10 2018 -0400 +++ b/pep_pointer.py Mon Jun 20 13:59:52 2022 +0000 @@ -1,10 +1,10 @@ -# +# # Author: Praveen Kumar -# Updated: April 6th, 2018 -# -# -# +# Updated: April 6th, 2018 (updated to python3: May 2022) +# +# +# import re @@ -15,7 +15,7 @@ inputFile = sys.argv infh = open(inputFile[1], "r") # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") - + gtf = {} gtf_transcript = {} gtf_gene = {} @@ -38,12 +38,12 @@ start = a[4].strip() end = a[3].strip() else: - print "Please check the start end coordinates in the GTF file" + print("Please check the start end coordinates in the GTF file") else: - print "Please check the strand information in the GTF file. It should be '+' or '-'." - if not gtf.has_key(strand): + print("Please check the strand information in the GTF file. It should be '+' or '-'.") + if strand not in gtf: gtf[strand] = {} - if not gtf[strand].has_key(type): + if type not in gtf[strand]: gtf[strand][type] = [] b = re.search("gene_id \"(.+?)\";", a[8].strip()) gene = b.group(1) @@ -54,43 +54,41 @@ transcript = b.group(1) data = (chr, start, end, gene, transcript, strand, type) gtf[strand][type].append(data) - + if type == "exon": - if gtf_transcript.has_key(chr+"#"+strand): - if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): - gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) - gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) + if chr + "#" + strand in gtf_transcript: + if transcript + "#" + gene in gtf_transcript[chr + "#" + strand]: + gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start)) + gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end)) else: - gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] - gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) - gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) + gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []] + gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start)) + gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end)) else: - gtf_transcript[chr+"#"+strand] = {} - gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] - gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) - gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) - + gtf_transcript[chr + "#" + strand] = {} + gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []] + gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start)) + gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end)) + if type == "gene": - if gtf_gene.has_key(chr+"#"+strand): - gtf_gene[chr+"#"+strand][0].append(int(start)) - gtf_gene[chr+"#"+strand][1].append(int(end)) - gtf_gene[chr+"#"+strand][2].append(gene) + if chr + "#" + strand in gtf_gene: + gtf_gene[chr + "#" + strand][0].append(int(start)) + gtf_gene[chr + "#" + strand][1].append(int(end)) + gtf_gene[chr + "#" + strand][2].append(gene) else: - gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] - gtf_gene[chr+"#"+strand][0].append(int(start)) - gtf_gene[chr+"#"+strand][1].append(int(end)) - gtf_gene[chr+"#"+strand][2].append(gene) - - - + gtf_gene[chr + "#" + strand] = [[0], [0], ["no_gene"]] + gtf_gene[chr + "#" + strand][0].append(int(start)) + gtf_gene[chr + "#" + strand][1].append(int(end)) + gtf_gene[chr + "#" + strand][2].append(gene) + # "Starting Reading Intron . . ." - + gtf["+"]["intron"] = [] gtf["-"]["intron"] = [] - for chr_strand in gtf_transcript.keys(): + for chr_strand in gtf_transcript.keys(): chr = chr_strand.split("#")[0] strand = chr_strand.split("#")[1] - + for transcript_gene in gtf_transcript[chr_strand].keys(): start_list = gtf_transcript[chr_strand][transcript_gene][0] end_list = gtf_transcript[chr_strand][transcript_gene][1] @@ -100,19 +98,18 @@ sorted_start = sorted(start_list) sorted_end = [end_list[i] for i in sorted_start_index] for x in range(len(sorted_start))[1:]: - intron_start = sorted_end[x-1]+1 - intron_end = sorted_start[x]-1 + intron_start = sorted_end[x - 1] + 1 + intron_end = sorted_start[x] - 1 transcript = transcript_gene.split("#")[0] gene = transcript_gene.split("#")[1] data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") gtf[strand]["intron"].append(data) - - + # "Starting Reading Intergenic . . ." - + gtf["+"]["intergenic"] = [] gtf["-"]["intergenic"] = [] - for chr_strand in gtf_gene.keys(): + for chr_strand in gtf_gene.keys(): chr = chr_strand.split("#")[0] strand = chr_strand.split("#")[1] start_list = gtf_gene[chr_strand][0] @@ -120,20 +117,20 @@ gene_list = gtf_gene[chr_strand][2] sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] - + sorted_start = sorted(start_list) sorted_end = [end_list[i] for i in sorted_start_index] sorted_gene = [gene_list[i] for i in sorted_start_index] for x in range(len(sorted_start))[1:]: - intergene_start = sorted_end[x-1]+1 - intergene_end = sorted_start[x]-1 + intergene_start = sorted_end[x - 1] + 1 + intergene_end = sorted_start[x] - 1 if intergene_start < intergene_end: - intergene_1 = sorted_gene[x-1] + intergene_1 = sorted_gene[x - 1] intergene_2 = sorted_gene[x] gene = intergene_1 + "-#-" + intergene_2 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") gtf[strand]["intergenic"].append(data) - + import sqlite3 # conn = sqlite3.connect('gtf_database.db') conn = sqlite3.connect(":memory:") @@ -141,67 +138,63 @@ # c.execute("DROP TABLE IF EXISTS gtf_data;") # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") - + for strand in gtf.keys(): - if strand == "+": - st = "positive" - elif strand == "-": - st = "negative" - else: - print "Please check the strand information in the GTF file. It should be '+' or '-'." - + if strand not in ["+", "-"]: + print("Please check the strand information in the GTF file. It should be '+' or '-'.") + for type in gtf[strand].keys(): data = gtf[strand][type] c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) - + conn.commit() - + infh = open(inputFile[2], "r") # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") data = infh.readlines() # output file outfh = open(inputFile[3], 'w') # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") - + for each in data: a = each.strip().split("\t") chr = a[0].strip() - pep_start = str(int(a[1].strip())+1) + pep_start = str(int(a[1].strip()) + 1) pep_end = a[2].strip() strand = a[5].strip() each = "\t".join(a[:6]) if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6): - c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'CDS' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tCDS\n") else: - c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tfive_prime_utr\n") else: - c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tthree_prime_utr\n") else: - c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'exon' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\texon\n") else: - c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'intron' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tintron\n") else: - c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'gene' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tgene\n") else: - c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") + c.execute("select * from gtf_data where type = 'intergenic' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") rows = c.fetchall() if len(rows) > 0: outfh.write(each.strip() + "\tintergene\n") @@ -211,17 +204,13 @@ outfh.write(each.strip() + "\tSpliceJunction\n") else: outfh.write(each.strip() + "\tPlease check\n") - + conn.close() outfh.close() else: - print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" + print("USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>") return None + if __name__ == "__main__": main() - - - - -