Mercurial > repos > galaxyp > pep_pointer
comparison pep_pointer.py @ 3:a6282baa8c6f draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit 494bc6dd87b9a6e2af40cb32aa5d2ee6e9bfebfc
author | galaxyp |
---|---|
date | Mon, 20 Jun 2022 13:59:52 +0000 |
parents | 073a2965e3b2 |
children |
comparison
equal
deleted
inserted
replaced
2:073a2965e3b2 | 3:a6282baa8c6f |
---|---|
1 | 1 |
2 # | 2 # |
3 # Author: Praveen Kumar | 3 # Author: Praveen Kumar |
4 # Updated: April 6th, 2018 | 4 # Updated: April 6th, 2018 (updated to python3: May 2022) |
5 # | 5 # |
6 # | 6 # |
7 # | 7 # |
8 | 8 |
9 import re | 9 import re |
10 | 10 |
11 | 11 |
12 def main(): | 12 def main(): |
13 import sys | 13 import sys |
14 if len(sys.argv) == 4: | 14 if len(sys.argv) == 4: |
15 inputFile = sys.argv | 15 inputFile = sys.argv |
16 infh = open(inputFile[1], "r") | 16 infh = open(inputFile[1], "r") |
17 # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") | 17 # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") |
18 | 18 |
19 gtf = {} | 19 gtf = {} |
20 gtf_transcript = {} | 20 gtf_transcript = {} |
21 gtf_gene = {} | 21 gtf_gene = {} |
22 for each in infh.readlines(): | 22 for each in infh.readlines(): |
23 a = each.split("\t") | 23 a = each.split("\t") |
36 end = a[4].strip() | 36 end = a[4].strip() |
37 elif int(a[4].strip()) < int(a[3].strip()): | 37 elif int(a[4].strip()) < int(a[3].strip()): |
38 start = a[4].strip() | 38 start = a[4].strip() |
39 end = a[3].strip() | 39 end = a[3].strip() |
40 else: | 40 else: |
41 print "Please check the start end coordinates in the GTF file" | 41 print("Please check the start end coordinates in the GTF file") |
42 else: | 42 else: |
43 print "Please check the strand information in the GTF file. It should be '+' or '-'." | 43 print("Please check the strand information in the GTF file. It should be '+' or '-'.") |
44 if not gtf.has_key(strand): | 44 if strand not in gtf: |
45 gtf[strand] = {} | 45 gtf[strand] = {} |
46 if not gtf[strand].has_key(type): | 46 if type not in gtf[strand]: |
47 gtf[strand][type] = [] | 47 gtf[strand][type] = [] |
48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) | 48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) |
49 gene = b.group(1) | 49 gene = b.group(1) |
50 if type == "gene": | 50 if type == "gene": |
51 transcript = "" | 51 transcript = "" |
52 else: | 52 else: |
53 b = re.search("transcript_id \"(.+?)\";", a[8].strip()) | 53 b = re.search("transcript_id \"(.+?)\";", a[8].strip()) |
54 transcript = b.group(1) | 54 transcript = b.group(1) |
55 data = (chr, start, end, gene, transcript, strand, type) | 55 data = (chr, start, end, gene, transcript, strand, type) |
56 gtf[strand][type].append(data) | 56 gtf[strand][type].append(data) |
57 | 57 |
58 if type == "exon": | 58 if type == "exon": |
59 if gtf_transcript.has_key(chr+"#"+strand): | 59 if chr + "#" + strand in gtf_transcript: |
60 if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): | 60 if transcript + "#" + gene in gtf_transcript[chr + "#" + strand]: |
61 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | 61 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start)) |
62 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | 62 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end)) |
63 else: | 63 else: |
64 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] | 64 gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []] |
65 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | 65 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start)) |
66 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | 66 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end)) |
67 else: | 67 else: |
68 gtf_transcript[chr+"#"+strand] = {} | 68 gtf_transcript[chr + "#" + strand] = {} |
69 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] | 69 gtf_transcript[chr + "#" + strand][transcript + "#" + gene] = [[], []] |
70 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | 70 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][0].append(int(start)) |
71 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | 71 gtf_transcript[chr + "#" + strand][transcript + "#" + gene][1].append(int(end)) |
72 | 72 |
73 if type == "gene": | 73 if type == "gene": |
74 if gtf_gene.has_key(chr+"#"+strand): | 74 if chr + "#" + strand in gtf_gene: |
75 gtf_gene[chr+"#"+strand][0].append(int(start)) | 75 gtf_gene[chr + "#" + strand][0].append(int(start)) |
76 gtf_gene[chr+"#"+strand][1].append(int(end)) | 76 gtf_gene[chr + "#" + strand][1].append(int(end)) |
77 gtf_gene[chr+"#"+strand][2].append(gene) | 77 gtf_gene[chr + "#" + strand][2].append(gene) |
78 else: | 78 else: |
79 gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] | 79 gtf_gene[chr + "#" + strand] = [[0], [0], ["no_gene"]] |
80 gtf_gene[chr+"#"+strand][0].append(int(start)) | 80 gtf_gene[chr + "#" + strand][0].append(int(start)) |
81 gtf_gene[chr+"#"+strand][1].append(int(end)) | 81 gtf_gene[chr + "#" + strand][1].append(int(end)) |
82 gtf_gene[chr+"#"+strand][2].append(gene) | 82 gtf_gene[chr + "#" + strand][2].append(gene) |
83 | 83 |
84 | |
85 | |
86 # "Starting Reading Intron . . ." | 84 # "Starting Reading Intron . . ." |
87 | 85 |
88 gtf["+"]["intron"] = [] | 86 gtf["+"]["intron"] = [] |
89 gtf["-"]["intron"] = [] | 87 gtf["-"]["intron"] = [] |
90 for chr_strand in gtf_transcript.keys(): | 88 for chr_strand in gtf_transcript.keys(): |
91 chr = chr_strand.split("#")[0] | 89 chr = chr_strand.split("#")[0] |
92 strand = chr_strand.split("#")[1] | 90 strand = chr_strand.split("#")[1] |
93 | 91 |
94 for transcript_gene in gtf_transcript[chr_strand].keys(): | 92 for transcript_gene in gtf_transcript[chr_strand].keys(): |
95 start_list = gtf_transcript[chr_strand][transcript_gene][0] | 93 start_list = gtf_transcript[chr_strand][transcript_gene][0] |
96 end_list = gtf_transcript[chr_strand][transcript_gene][1] | 94 end_list = gtf_transcript[chr_strand][transcript_gene][1] |
97 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] | 95 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] |
98 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] | 96 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] |
99 if sorted_start_index == sorted_end_index: | 97 if sorted_start_index == sorted_end_index: |
100 sorted_start = sorted(start_list) | 98 sorted_start = sorted(start_list) |
101 sorted_end = [end_list[i] for i in sorted_start_index] | 99 sorted_end = [end_list[i] for i in sorted_start_index] |
102 for x in range(len(sorted_start))[1:]: | 100 for x in range(len(sorted_start))[1:]: |
103 intron_start = sorted_end[x-1]+1 | 101 intron_start = sorted_end[x - 1] + 1 |
104 intron_end = sorted_start[x]-1 | 102 intron_end = sorted_start[x] - 1 |
105 transcript = transcript_gene.split("#")[0] | 103 transcript = transcript_gene.split("#")[0] |
106 gene = transcript_gene.split("#")[1] | 104 gene = transcript_gene.split("#")[1] |
107 data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") | 105 data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") |
108 gtf[strand]["intron"].append(data) | 106 gtf[strand]["intron"].append(data) |
109 | 107 |
110 | |
111 # "Starting Reading Intergenic . . ." | 108 # "Starting Reading Intergenic . . ." |
112 | 109 |
113 gtf["+"]["intergenic"] = [] | 110 gtf["+"]["intergenic"] = [] |
114 gtf["-"]["intergenic"] = [] | 111 gtf["-"]["intergenic"] = [] |
115 for chr_strand in gtf_gene.keys(): | 112 for chr_strand in gtf_gene.keys(): |
116 chr = chr_strand.split("#")[0] | 113 chr = chr_strand.split("#")[0] |
117 strand = chr_strand.split("#")[1] | 114 strand = chr_strand.split("#")[1] |
118 start_list = gtf_gene[chr_strand][0] | 115 start_list = gtf_gene[chr_strand][0] |
119 end_list = gtf_gene[chr_strand][1] | 116 end_list = gtf_gene[chr_strand][1] |
120 gene_list = gtf_gene[chr_strand][2] | 117 gene_list = gtf_gene[chr_strand][2] |
121 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] | 118 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] |
122 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] | 119 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] |
123 | 120 |
124 sorted_start = sorted(start_list) | 121 sorted_start = sorted(start_list) |
125 sorted_end = [end_list[i] for i in sorted_start_index] | 122 sorted_end = [end_list[i] for i in sorted_start_index] |
126 sorted_gene = [gene_list[i] for i in sorted_start_index] | 123 sorted_gene = [gene_list[i] for i in sorted_start_index] |
127 for x in range(len(sorted_start))[1:]: | 124 for x in range(len(sorted_start))[1:]: |
128 intergene_start = sorted_end[x-1]+1 | 125 intergene_start = sorted_end[x - 1] + 1 |
129 intergene_end = sorted_start[x]-1 | 126 intergene_end = sorted_start[x] - 1 |
130 if intergene_start < intergene_end: | 127 if intergene_start < intergene_end: |
131 intergene_1 = sorted_gene[x-1] | 128 intergene_1 = sorted_gene[x - 1] |
132 intergene_2 = sorted_gene[x] | 129 intergene_2 = sorted_gene[x] |
133 gene = intergene_1 + "-#-" + intergene_2 | 130 gene = intergene_1 + "-#-" + intergene_2 |
134 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") | 131 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") |
135 gtf[strand]["intergenic"].append(data) | 132 gtf[strand]["intergenic"].append(data) |
136 | 133 |
137 import sqlite3 | 134 import sqlite3 |
138 # conn = sqlite3.connect('gtf_database.db') | 135 # conn = sqlite3.connect('gtf_database.db') |
139 conn = sqlite3.connect(":memory:") | 136 conn = sqlite3.connect(":memory:") |
140 c = conn.cursor() | 137 c = conn.cursor() |
141 # c.execute("DROP TABLE IF EXISTS gtf_data;") | 138 # c.execute("DROP TABLE IF EXISTS gtf_data;") |
142 # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") | 139 # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") |
143 c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") | 140 c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") |
144 | 141 |
145 for strand in gtf.keys(): | 142 for strand in gtf.keys(): |
146 if strand == "+": | 143 if strand not in ["+", "-"]: |
147 st = "positive" | 144 print("Please check the strand information in the GTF file. It should be '+' or '-'.") |
148 elif strand == "-": | 145 |
149 st = "negative" | |
150 else: | |
151 print "Please check the strand information in the GTF file. It should be '+' or '-'." | |
152 | |
153 for type in gtf[strand].keys(): | 146 for type in gtf[strand].keys(): |
154 data = gtf[strand][type] | 147 data = gtf[strand][type] |
155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) | 148 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) |
156 | 149 |
157 conn.commit() | 150 conn.commit() |
158 | 151 |
159 infh = open(inputFile[2], "r") | 152 infh = open(inputFile[2], "r") |
160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") | 153 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") |
161 data = infh.readlines() | 154 data = infh.readlines() |
162 # output file | 155 # output file |
163 outfh = open(inputFile[3], 'w') | 156 outfh = open(inputFile[3], 'w') |
164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") | 157 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") |
165 | 158 |
166 for each in data: | 159 for each in data: |
167 a = each.strip().split("\t") | 160 a = each.strip().split("\t") |
168 chr = a[0].strip() | 161 chr = a[0].strip() |
169 pep_start = str(int(a[1].strip())+1) | 162 pep_start = str(int(a[1].strip()) + 1) |
170 pep_end = a[2].strip() | 163 pep_end = a[2].strip() |
171 strand = a[5].strip() | 164 strand = a[5].strip() |
172 each = "\t".join(a[:6]) | 165 each = "\t".join(a[:6]) |
173 if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6): | 166 if (len(a) == 12 and int(a[9]) == 1) or (len(a) == 6): |
174 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 167 c.execute("select * from gtf_data where type = 'CDS' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
175 rows = c.fetchall() | 168 rows = c.fetchall() |
176 if len(rows) > 0: | 169 if len(rows) > 0: |
177 outfh.write(each.strip() + "\tCDS\n") | 170 outfh.write(each.strip() + "\tCDS\n") |
178 else: | 171 else: |
179 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 172 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
180 rows = c.fetchall() | 173 rows = c.fetchall() |
181 if len(rows) > 0: | 174 if len(rows) > 0: |
182 outfh.write(each.strip() + "\tfive_prime_utr\n") | 175 outfh.write(each.strip() + "\tfive_prime_utr\n") |
183 else: | 176 else: |
184 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 177 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
185 rows = c.fetchall() | 178 rows = c.fetchall() |
186 if len(rows) > 0: | 179 if len(rows) > 0: |
187 outfh.write(each.strip() + "\tthree_prime_utr\n") | 180 outfh.write(each.strip() + "\tthree_prime_utr\n") |
188 else: | 181 else: |
189 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 182 c.execute("select * from gtf_data where type = 'exon' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
190 rows = c.fetchall() | 183 rows = c.fetchall() |
191 if len(rows) > 0: | 184 if len(rows) > 0: |
192 outfh.write(each.strip() + "\texon\n") | 185 outfh.write(each.strip() + "\texon\n") |
193 else: | 186 else: |
194 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 187 c.execute("select * from gtf_data where type = 'intron' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
195 rows = c.fetchall() | 188 rows = c.fetchall() |
196 if len(rows) > 0: | 189 if len(rows) > 0: |
197 outfh.write(each.strip() + "\tintron\n") | 190 outfh.write(each.strip() + "\tintron\n") |
198 else: | 191 else: |
199 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 192 c.execute("select * from gtf_data where type = 'gene' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
200 rows = c.fetchall() | 193 rows = c.fetchall() |
201 if len(rows) > 0: | 194 if len(rows) > 0: |
202 outfh.write(each.strip() + "\tgene\n") | 195 outfh.write(each.strip() + "\tgene\n") |
203 else: | 196 else: |
204 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 197 c.execute("select * from gtf_data where type = 'intergenic' and chr = '" + chr + "' and start <= " + pep_start + " and end >= " + pep_end + " and strand = '" + strand + "' ") |
205 rows = c.fetchall() | 198 rows = c.fetchall() |
206 if len(rows) > 0: | 199 if len(rows) > 0: |
207 outfh.write(each.strip() + "\tintergene\n") | 200 outfh.write(each.strip() + "\tintergene\n") |
208 else: | 201 else: |
209 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") | 202 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") |
210 elif (len(a) == 12 and int(a[9]) == 2): | 203 elif (len(a) == 12 and int(a[9]) == 2): |
211 outfh.write(each.strip() + "\tSpliceJunction\n") | 204 outfh.write(each.strip() + "\tSpliceJunction\n") |
212 else: | 205 else: |
213 outfh.write(each.strip() + "\tPlease check\n") | 206 outfh.write(each.strip() + "\tPlease check\n") |
214 | 207 |
215 conn.close() | 208 conn.close() |
216 outfh.close() | 209 outfh.close() |
217 else: | 210 else: |
218 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" | 211 print("USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>") |
219 return None | 212 return None |
213 | |
220 | 214 |
221 if __name__ == "__main__": | 215 if __name__ == "__main__": |
222 main() | 216 main() |
223 | |
224 | |
225 | |
226 | |
227 |