Mercurial > repos > galaxyp > pep_pointer
comparison pep_pointer.py @ 0:149ed6a9680f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pep_pointer commit ac27a958fcb897c3cb56db313ebd282805b01103
author | galaxyp |
---|---|
date | Fri, 29 Dec 2017 12:37:22 -0500 |
parents | |
children | 073a2965e3b2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:149ed6a9680f |
---|---|
1 | |
2 # | |
3 # Author: Praveen Kumar | |
4 # Updated: Nov 8th, 2017 | |
5 # | |
6 # | |
7 # | |
8 | |
9 import re | |
10 | |
11 | |
12 def main(): | |
13 import sys | |
14 if len(sys.argv) == 4: | |
15 inputFile = sys.argv | |
16 infh = open(inputFile[1], "r") | |
17 # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") | |
18 | |
19 gtf = {} | |
20 gtf_transcript = {} | |
21 gtf_gene = {} | |
22 for each in infh.readlines(): | |
23 a = each.split("\t") | |
24 if re.search("^[^#]", each): | |
25 if re.search("gene_biotype \"protein_coding\"", a[8]) and int(a[4].strip()) != int(a[3].strip()): | |
26 type = a[2].strip() | |
27 if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr": | |
28 chr = "chr" + a[0].strip() | |
29 strand = a[6].strip() | |
30 if strand == "+": | |
31 start = a[3].strip() | |
32 end = a[4].strip() | |
33 elif strand == "-": | |
34 if int(a[4].strip()) > int(a[3].strip()): | |
35 start = a[3].strip() | |
36 end = a[4].strip() | |
37 elif int(a[4].strip()) < int(a[3].strip()): | |
38 start = a[4].strip() | |
39 end = a[3].strip() | |
40 else: | |
41 print "Something fishy in start end coordinates" | |
42 else: | |
43 print "Something fishy in reading" | |
44 if not gtf.has_key(strand): | |
45 gtf[strand] = {} | |
46 if not gtf[strand].has_key(type): | |
47 gtf[strand][type] = [] | |
48 b = re.search("gene_id \"(.+?)\";", a[8].strip()) | |
49 gene = b.group(1) | |
50 if type == "gene": | |
51 transcript = "" | |
52 else: | |
53 b = re.search("transcript_id \"(.+?)\";", a[8].strip()) | |
54 transcript = b.group(1) | |
55 data = (chr, start, end, gene, transcript, strand, type) | |
56 gtf[strand][type].append(data) | |
57 | |
58 if type == "exon": | |
59 if gtf_transcript.has_key(chr+"#"+strand): | |
60 if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): | |
61 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | |
62 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | |
63 else: | |
64 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] | |
65 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | |
66 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | |
67 else: | |
68 gtf_transcript[chr+"#"+strand] = {} | |
69 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] | |
70 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | |
71 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | |
72 | |
73 if type == "gene": | |
74 if gtf_gene.has_key(chr+"#"+strand): | |
75 gtf_gene[chr+"#"+strand][0].append(int(start)) | |
76 gtf_gene[chr+"#"+strand][1].append(int(end)) | |
77 gtf_gene[chr+"#"+strand][2].append(gene) | |
78 else: | |
79 gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] | |
80 gtf_gene[chr+"#"+strand][0].append(int(start)) | |
81 gtf_gene[chr+"#"+strand][1].append(int(end)) | |
82 gtf_gene[chr+"#"+strand][2].append(gene) | |
83 | |
84 | |
85 | |
86 # "Starting Reading Intron . . ." | |
87 | |
88 gtf["+"]["intron"] = [] | |
89 gtf["-"]["intron"] = [] | |
90 for chr_strand in gtf_transcript.keys(): | |
91 chr = chr_strand.split("#")[0] | |
92 strand = chr_strand.split("#")[1] | |
93 | |
94 for transcript_gene in gtf_transcript[chr_strand].keys(): | |
95 start_list = gtf_transcript[chr_strand][transcript_gene][0] | |
96 end_list = gtf_transcript[chr_strand][transcript_gene][1] | |
97 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] | |
98 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] | |
99 if sorted_start_index == sorted_end_index: | |
100 sorted_start = sorted(start_list) | |
101 sorted_end = [end_list[i] for i in sorted_start_index] | |
102 for x in range(len(sorted_start))[1:]: | |
103 intron_start = sorted_end[x-1]+1 | |
104 intron_end = sorted_start[x]-1 | |
105 transcript = transcript_gene.split("#")[0] | |
106 gene = transcript_gene.split("#")[1] | |
107 data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") | |
108 gtf[strand]["intron"].append(data) | |
109 | |
110 | |
111 # "Starting Reading Intergenic . . ." | |
112 | |
113 gtf["+"]["intergenic"] = [] | |
114 gtf["-"]["intergenic"] = [] | |
115 for chr_strand in gtf_gene.keys(): | |
116 chr = chr_strand.split("#")[0] | |
117 strand = chr_strand.split("#")[1] | |
118 start_list = gtf_gene[chr_strand][0] | |
119 end_list = gtf_gene[chr_strand][1] | |
120 gene_list = gtf_gene[chr_strand][2] | |
121 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] | |
122 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] | |
123 | |
124 sorted_start = sorted(start_list) | |
125 sorted_end = [end_list[i] for i in sorted_start_index] | |
126 sorted_gene = [gene_list[i] for i in sorted_start_index] | |
127 for x in range(len(sorted_start))[1:]: | |
128 intergene_start = sorted_end[x-1]+1 | |
129 intergene_end = sorted_start[x]-1 | |
130 if intergene_start < intergene_end: | |
131 intergene_1 = sorted_gene[x-1] | |
132 intergene_2 = sorted_gene[x] | |
133 gene = intergene_1 + "-#-" + intergene_2 | |
134 data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") | |
135 gtf[strand]["intergenic"].append(data) | |
136 | |
137 import sqlite3 | |
138 # conn = sqlite3.connect('gtf_database.db') | |
139 conn = sqlite3.connect(":memory:") | |
140 c = conn.cursor() | |
141 # c.execute("DROP TABLE IF EXISTS gtf_data;") | |
142 # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") | |
143 c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") | |
144 | |
145 for strand in gtf.keys(): | |
146 if strand == "+": | |
147 st = "positive" | |
148 elif strand == "-": | |
149 st = "negative" | |
150 else: | |
151 print "Something fishy in writing . . ." | |
152 | |
153 for type in gtf[strand].keys(): | |
154 data = gtf[strand][type] | |
155 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) | |
156 | |
157 conn.commit() | |
158 | |
159 infh = open(inputFile[2], "r") | |
160 # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") | |
161 data = infh.readlines() | |
162 # output file | |
163 outfh = open(inputFile[3], 'w') | |
164 # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") | |
165 | |
166 for each in data: | |
167 a = each.split("\t") | |
168 chr = a[0].strip() | |
169 pep_start = a[1].strip() | |
170 pep_end = a[2].strip() | |
171 strand = a[5].strip() | |
172 c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
173 rows = c.fetchall() | |
174 if len(rows) > 0: | |
175 outfh.write(each.strip() + "\tCDS\n") | |
176 else: | |
177 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
178 rows = c.fetchall() | |
179 if len(rows) > 0: | |
180 outfh.write(each.strip() + "\tfive_prime_utr\n") | |
181 else: | |
182 c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
183 rows = c.fetchall() | |
184 if len(rows) > 0: | |
185 outfh.write(each.strip() + "\tthree_prime_utr\n") | |
186 else: | |
187 c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
188 rows = c.fetchall() | |
189 if len(rows) > 0: | |
190 outfh.write(each.strip() + "\texon\n") | |
191 else: | |
192 c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
193 rows = c.fetchall() | |
194 if len(rows) > 0: | |
195 outfh.write(each.strip() + "\tintron\n") | |
196 else: | |
197 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
198 rows = c.fetchall() | |
199 if len(rows) > 0: | |
200 outfh.write(each.strip() + "\tgene\n") | |
201 else: | |
202 c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | |
203 rows = c.fetchall() | |
204 if len(rows) > 0: | |
205 outfh.write(each.strip() + "\tintergene\n") | |
206 else: | |
207 outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") | |
208 | |
209 conn.close() | |
210 outfh.close() | |
211 else: | |
212 print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" | |
213 return None | |
214 | |
215 if __name__ == "__main__": | |
216 main() | |
217 | |
218 | |
219 | |
220 | |
221 |