comparison pep_pointer.py @ 0:149ed6a9680f draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103
author galaxyp
date Fri, 29 Dec 2017 12:37:22 -0500
parents
children 073a2965e3b2
comparison
equal deleted inserted replaced
-1:000000000000 0:149ed6a9680f
1
2 #
3 # Author: Praveen Kumar
4 # Updated: Nov 8th, 2017
5 #
6 #
7 #
8
9 import re
10
11
12 def main():
13 import sys
14 if len(sys.argv) == 4:
15 inputFile = sys.argv
16 infh = open(inputFile[1], "r")
17 # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r")
18
19 gtf = {}
20 gtf_transcript = {}
21 gtf_gene = {}
22 for each in infh.readlines():
23 a = each.split("\t")
24 if re.search("^[^#]", each):
25 if re.search("gene_biotype \"protein_coding\"", a[8]) and int(a[4].strip()) != int(a[3].strip()):
26 type = a[2].strip()
27 if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr":
28 chr = "chr" + a[0].strip()
29 strand = a[6].strip()
30 if strand == "+":
31 start = a[3].strip()
32 end = a[4].strip()
33 elif strand == "-":
34 if int(a[4].strip()) > int(a[3].strip()):
35 start = a[3].strip()
36 end = a[4].strip()
37 elif int(a[4].strip()) < int(a[3].strip()):
38 start = a[4].strip()
39 end = a[3].strip()
40 else:
41 print "Something fishy in start end coordinates"
42 else:
43 print "Something fishy in reading"
44 if not gtf.has_key(strand):
45 gtf[strand] = {}
46 if not gtf[strand].has_key(type):
47 gtf[strand][type] = []
48 b = re.search("gene_id \"(.+?)\";", a[8].strip())
49 gene = b.group(1)
50 if type == "gene":
51 transcript = ""
52 else:
53 b = re.search("transcript_id \"(.+?)\";", a[8].strip())
54 transcript = b.group(1)
55 data = (chr, start, end, gene, transcript, strand, type)
56 gtf[strand][type].append(data)
57
58 if type == "exon":
59 if gtf_transcript.has_key(chr+"#"+strand):
60 if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene):
61 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
62 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
63 else:
64 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]
65 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
66 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
67 else:
68 gtf_transcript[chr+"#"+strand] = {}
69 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]]
70 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start))
71 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end))
72
73 if type == "gene":
74 if gtf_gene.has_key(chr+"#"+strand):
75 gtf_gene[chr+"#"+strand][0].append(int(start))
76 gtf_gene[chr+"#"+strand][1].append(int(end))
77 gtf_gene[chr+"#"+strand][2].append(gene)
78 else:
79 gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]]
80 gtf_gene[chr+"#"+strand][0].append(int(start))
81 gtf_gene[chr+"#"+strand][1].append(int(end))
82 gtf_gene[chr+"#"+strand][2].append(gene)
83
84
85
86 # "Starting Reading Intron . . ."
87
88 gtf["+"]["intron"] = []
89 gtf["-"]["intron"] = []
90 for chr_strand in gtf_transcript.keys():
91 chr = chr_strand.split("#")[0]
92 strand = chr_strand.split("#")[1]
93
94 for transcript_gene in gtf_transcript[chr_strand].keys():
95 start_list = gtf_transcript[chr_strand][transcript_gene][0]
96 end_list = gtf_transcript[chr_strand][transcript_gene][1]
97 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
98 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
99 if sorted_start_index == sorted_end_index:
100 sorted_start = sorted(start_list)
101 sorted_end = [end_list[i] for i in sorted_start_index]
102 for x in range(len(sorted_start))[1:]:
103 intron_start = sorted_end[x-1]+1
104 intron_end = sorted_start[x]-1
105 transcript = transcript_gene.split("#")[0]
106 gene = transcript_gene.split("#")[1]
107 data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron")
108 gtf[strand]["intron"].append(data)
109
110
111 # "Starting Reading Intergenic . . ."
112
113 gtf["+"]["intergenic"] = []
114 gtf["-"]["intergenic"] = []
115 for chr_strand in gtf_gene.keys():
116 chr = chr_strand.split("#")[0]
117 strand = chr_strand.split("#")[1]
118 start_list = gtf_gene[chr_strand][0]
119 end_list = gtf_gene[chr_strand][1]
120 gene_list = gtf_gene[chr_strand][2]
121 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])]
122 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])]
123
124 sorted_start = sorted(start_list)
125 sorted_end = [end_list[i] for i in sorted_start_index]
126 sorted_gene = [gene_list[i] for i in sorted_start_index]
127 for x in range(len(sorted_start))[1:]:
128 intergene_start = sorted_end[x-1]+1
129 intergene_end = sorted_start[x]-1
130 if intergene_start < intergene_end:
131 intergene_1 = sorted_gene[x-1]
132 intergene_2 = sorted_gene[x]
133 gene = intergene_1 + "-#-" + intergene_2
134 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic")
135 gtf[strand]["intergenic"].append(data)
136
137 import sqlite3
138 # conn = sqlite3.connect('gtf_database.db')
139 conn = sqlite3.connect(":memory:")
140 c = conn.cursor()
141 # c.execute("DROP TABLE IF EXISTS gtf_data;")
142 # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
143 c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)")
144
145 for strand in gtf.keys():
146 if strand == "+":
147 st = "positive"
148 elif strand == "-":
149 st = "negative"
150 else:
151 print "Something fishy in writing . . ."
152
153 for type in gtf[strand].keys():
154 data = gtf[strand][type]
155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data)
156
157 conn.commit()
158
159 infh = open(inputFile[2], "r")
160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r")
161 data = infh.readlines()
162 # output file
163 outfh = open(inputFile[3], 'w')
164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w")
165
166 for each in data:
167 a = each.split("\t")
168 chr = a[0].strip()
169 pep_start = a[1].strip()
170 pep_end = a[2].strip()
171 strand = a[5].strip()
172 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
173 rows = c.fetchall()
174 if len(rows) > 0:
175 outfh.write(each.strip() + "\tCDS\n")
176 else:
177 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
178 rows = c.fetchall()
179 if len(rows) > 0:
180 outfh.write(each.strip() + "\tfive_prime_utr\n")
181 else:
182 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
183 rows = c.fetchall()
184 if len(rows) > 0:
185 outfh.write(each.strip() + "\tthree_prime_utr\n")
186 else:
187 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
188 rows = c.fetchall()
189 if len(rows) > 0:
190 outfh.write(each.strip() + "\texon\n")
191 else:
192 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
193 rows = c.fetchall()
194 if len(rows) > 0:
195 outfh.write(each.strip() + "\tintron\n")
196 else:
197 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
198 rows = c.fetchall()
199 if len(rows) > 0:
200 outfh.write(each.strip() + "\tgene\n")
201 else:
202 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ")
203 rows = c.fetchall()
204 if len(rows) > 0:
205 outfh.write(each.strip() + "\tintergene\n")
206 else:
207 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n")
208
209 conn.close()
210 outfh.close()
211 else:
212 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>"
213 return None
214
215 if __name__ == "__main__":
216 main()
217
218
219
220
221