diff pep_pointer.py @ 3:a6282baa8c6f draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit 494bc6dd87b9a6e2af40cb32aa5d2ee6e9bfebfc
author galaxyp
date Mon, 20 Jun 2022 13:59:52 +0000
parents 073a2965e3b2
children
line wrap: on
line diff
--- a/pep_pointer.py	Fri Apr 06 18:13:10 2018 -0400
+++ b/pep_pointer.py	Mon Jun 20 13:59:52 2022 +0000
@@ -1,10 +1,10 @@
 
-# 
+#
 # Author: Praveen Kumar
-# Updated: April 6th, 2018
-# 
-# 
-# 
+# Updated: April 6th, 2018 (updated to python3: May 2022)
+#
+#
+#
 
 import re
 
@@ -15,7 +15,7 @@
         inputFile = sys.argv
         infh = open(inputFile[1], "r")
         # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")
-        
+
         gtf = {}
         gtf_transcript = {}
         gtf_gene = {}
@@ -38,12 +38,12 @@
                                 start = a[4].strip()
                                 end = a[3].strip()
                             else:
-                                print "Please check the start end coordinates in the GTF file"
+                                print("Please check the start end coordinates in the GTF file")
                         else:
-                            print "Please check the strand information in the GTF file. It should be '+' or '-'."
-                        if not gtf.has_key(strand):
+                            print("Please check the strand information in the GTF file. It should be '+' or '-'.")
+                        if strand not in gtf:
                             gtf[strand] = {}
-                        if not gtf[strand].has_key(type):
+                        if type not in gtf[strand]:
                             gtf[strand][type] = []
                         b = re.search("gene_id \"(.+?)\";", a[8].strip())
                         gene = b.group(1)
@@ -54,43 +54,41 @@
                             transcript = b.group(1)
                         data = (chr, start, end, gene, transcript, strand, type)
                         gtf[strand][type].append(data)
-                
+
                         if type == "exon":
-                            if gtf_transcript.has_key(chr+"#"+strand):
-                                if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene):
-                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
-                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
+                            if chr + "#" + strand in gtf_transcript:
+                                if transcript + "#" + gene in gtf_transcript[chr + "#" + strand]:
+                                    gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start))
+                                    gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end))
                                 else:
-                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]
-                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
-                                    gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
+                                    gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []]
+                                    gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start))
+                                    gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end))
                             else:
-                                gtf_transcript[chr+"#"+strand] = {}
-                                gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]
-                                gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
-                                gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
-                
+                                gtf_transcript[chr + "#" + strand] = {}
+                                gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []]
+                                gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start))
+                                gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end))
+
                         if type == "gene":
-                            if gtf_gene.has_key(chr+"#"+strand):
-                                gtf_gene[chr+"#"+strand][0].append(int(start))
-                                gtf_gene[chr+"#"+strand][1].append(int(end))
-                                gtf_gene[chr+"#"+strand][2].append(gene)
+                            if chr + "#" + strand in gtf_gene:
+                                gtf_gene[chr + "#" + strand][0].append(int(start))
+                                gtf_gene[chr + "#" + strand][1].append(int(end))
+                                gtf_gene[chr + "#" + strand][2].append(gene)
                             else:
-                                gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]]
-                                gtf_gene[chr+"#"+strand][0].append(int(start))
-                                gtf_gene[chr+"#"+strand][1].append(int(end))
-                                gtf_gene[chr+"#"+strand][2].append(gene)
-                        
-                        
-    
+                                gtf_gene[chr + "#" + strand] = [[0], [0], ["no_gene"]]
+                                gtf_gene[chr + "#" + strand][0].append(int(start))
+                                gtf_gene[chr + "#" + strand][1].append(int(end))
+                                gtf_gene[chr + "#" + strand][2].append(gene)
+
         # "Starting Reading Intron . . ."
-    
+
         gtf["+"]["intron"] = []
         gtf["-"]["intron"] = []
-        for chr_strand  in gtf_transcript.keys():
+        for chr_strand in gtf_transcript.keys():
             chr = chr_strand.split("#")[0]
             strand = chr_strand.split("#")[1]
-        
+
             for transcript_gene in gtf_transcript[chr_strand].keys():
                 start_list = gtf_transcript[chr_strand][transcript_gene][0]
                 end_list = gtf_transcript[chr_strand][transcript_gene][1]
@@ -100,19 +98,18 @@
                     sorted_start = sorted(start_list)
                     sorted_end = [end_list[i] for i in sorted_start_index]
                     for x in range(len(sorted_start))[1:]:
-                        intron_start = sorted_end[x-1]+1
-                        intron_end = sorted_start[x]-1
+                        intron_start = sorted_end[x - 1] + 1
+                        intron_end = sorted_start[x] - 1
                         transcript = transcript_gene.split("#")[0]
                         gene = transcript_gene.split("#")[1]
                         data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron")
                         gtf[strand]["intron"].append(data)
-                    
-    
+
         # "Starting Reading Intergenic . . ."
-    
+
         gtf["+"]["intergenic"] = []
         gtf["-"]["intergenic"] = []
-        for chr_strand  in gtf_gene.keys():
+        for chr_strand in gtf_gene.keys():
             chr = chr_strand.split("#")[0]
             strand = chr_strand.split("#")[1]
             start_list = gtf_gene[chr_strand][0]
@@ -120,20 +117,20 @@
             gene_list = gtf_gene[chr_strand][2]
             sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
             sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
-        
+
             sorted_start = sorted(start_list)
             sorted_end = [end_list[i] for i in sorted_start_index]
             sorted_gene = [gene_list[i] for i in sorted_start_index]
             for x in range(len(sorted_start))[1:]:
-                intergene_start = sorted_end[x-1]+1
-                intergene_end = sorted_start[x]-1
+                intergene_start = sorted_end[x - 1] + 1
+                intergene_end = sorted_start[x] - 1
                 if intergene_start < intergene_end:
-                    intergene_1 = sorted_gene[x-1]
+                    intergene_1 = sorted_gene[x - 1]
                     intergene_2 = sorted_gene[x]
                     gene = intergene_1 + "-#-" + intergene_2
                     data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic")
                     gtf[strand]["intergenic"].append(data)
-    
+
         import sqlite3
         # conn = sqlite3.connect('gtf_database.db')
         conn = sqlite3.connect(":memory:")
@@ -141,67 +138,63 @@
         # c.execute("DROP TABLE IF EXISTS gtf_data;")
         # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
         c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
-    
+
         for strand in gtf.keys():
-            if strand == "+":
-                st = "positive"
-            elif strand == "-":
-                st = "negative"
-            else:
-                print "Please check the strand information in the GTF file. It should be '+' or '-'."
-        
+            if strand not in ["+", "-"]:
+                print("Please check the strand information in the GTF file. It should be '+' or '-'.")
+
             for type in gtf[strand].keys():
                 data = gtf[strand][type]
                 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data)
-            
+
         conn.commit()
-    
+
         infh = open(inputFile[2], "r")
         # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")
         data = infh.readlines()
         # output file
         outfh = open(inputFile[3], 'w')
         # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")
-        
+
         for each in data:
             a = each.strip().split("\t")
             chr = a[0].strip()
-            pep_start = str(int(a[1].strip())+1)
+            pep_start = str(int(a[1].strip()) + 1)
             pep_end = a[2].strip()
             strand = a[5].strip()
             each = "\t".join(a[:6])
             if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6):
-                c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                c.execute("select * from gtf_data where type = 'CDS' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                 rows = c.fetchall()
                 if len(rows) > 0:
                     outfh.write(each.strip() + "\tCDS\n")
                 else:
-                    c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                    c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                     rows = c.fetchall()
                     if len(rows) > 0:
                         outfh.write(each.strip() + "\tfive_prime_utr\n")
                     else:
-                        c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                        c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                         rows = c.fetchall()
                         if len(rows) > 0:
                             outfh.write(each.strip() + "\tthree_prime_utr\n")
                         else:
-                            c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                            c.execute("select * from gtf_data where type = 'exon' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                             rows = c.fetchall()
                             if len(rows) > 0:
                                 outfh.write(each.strip() + "\texon\n")
                             else:
-                                c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                                c.execute("select * from gtf_data where type = 'intron' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                                 rows = c.fetchall()
                                 if len(rows) > 0:
                                     outfh.write(each.strip() + "\tintron\n")
                                 else:
-                                    c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                                    c.execute("select * from gtf_data where type = 'gene' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                                     rows = c.fetchall()
                                     if len(rows) > 0:
                                         outfh.write(each.strip() + "\tgene\n")
                                     else:
-                                        c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
+                                        c.execute("select * from gtf_data where type = 'intergenic' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ")
                                         rows = c.fetchall()
                                         if len(rows) > 0:
                                             outfh.write(each.strip() + "\tintergene\n")
@@ -211,17 +204,13 @@
                 outfh.write(each.strip() + "\tSpliceJunction\n")
             else:
                 outfh.write(each.strip() + "\tPlease check\n")
-    
+
         conn.close()
         outfh.close()
     else:
-        print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"
+        print("USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>")
     return None
 
+
 if __name__ == "__main__":
     main()
-
-
-
-
-