Mercurial repository: repos > cpt > cpt_prophage_relatedness
File comparison: prophage_relatedness.py @ changeset 0:7a23dda2e932 (draft)
Commit message: planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
Author: cpt
Date: Thu, 08 Aug 2024 03:09:32 +0000
Parents: (none)
Children: (none)
Comparison legend: equal / deleted / inserted / replaced.
Diff from -1:000000000000 (empty tree) to 0:7a23dda2e932.
#!/usr/bin/env python
import argparse
from math import floor
from Bio.Blast import NCBIXML  # third-party: Biopython BLAST XML parser
import logging

# Script-wide debug logging. NOTE(review): `log` is never used in this file;
# presumably kept for parity with the other CPT tools — confirm before removing.
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()
10 | |
def parseXML(blastxml, outFile):  # Modified from intron_detection
    """Parse NCBI BLAST XML records into per-alignment lists of HSP dicts.

    blastxml: open handle of a BLAST XML report.
    outFile: open writable handle; receives a per-query summary block
        (query id, query length, number of alignments).

    Returns a list with one entry per alignment; each entry is a list of
    dicts, one per HSP, in the same shape that openTSV() produces.
    """
    blast = []
    for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1):
        align_num = 0
        # One summary header per query record.
        outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n")
        outFile.write(
            "%s\t%d\t%d\n\n"
            % (
                blast_record.query_id,
                blast_record.query_length,
                len(blast_record.alignments),
            )
        )

        for alignment in blast_record.alignments:

            align_num += 1
            gi_nos = str(alignment.accession)
            blast_gene = []
            for hsp in alignment.hsps:

                # Approximate identity fraction over the query span.
                # NOTE(review): uses (identities - 1) / (end - start) rather
                # than identities / (end - start + 1); mirrors the original
                # intron_detection computation — confirm intent.
                x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start)
                # Short name: query title truncated at the first space.
                nice_name = blast_record.query
                if " " in nice_name:
                    nice_name = nice_name[0 : nice_name.index(" ")]
                blast_gene.append(
                    {
                        "gi_nos": gi_nos,
                        "sbjct_length": alignment.length,
                        "query_length": blast_record.query_length,
                        "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end),
                        "query_range": (hsp.query_start, hsp.query_end),
                        "name": nice_name,
                        "evalue": hsp.expect,
                        "identity": hsp.identities,
                        "identity_percent": x,
                        "hit_num": align_num,
                        "iter_num": iter_num,
                        "match_id": alignment.title.partition(">")[0],
                        "align_len": hsp.align_length,
                    }
                )
            blast.append(blast_gene)

    return blast
56 | |
57 | |
def openTSV(blasttsv, outFile):  # Modified from intron_detection
    """Parse an extended (25+ column) tabular BLAST report into HSP dicts.

    Expects the Galaxy/CPT extended outfmt, referenced by index here:
    qseqid(0), sseqid(1), length(3), qstart(6), qend(7), sstart(8),
    send(9), evalue(10), gi list(12), nident(14), qlen(22), slen(23),
    title(24).

    blasttsv: iterable of tab-separated lines (e.g. an open file).
    outFile: open writable handle; receives a single query summary block
        (query id, query length, number of alignments).

    Returns a list with one entry per alignment (consecutive run of the
    same subject id); each entry is a list of per-HSP dicts, in the same
    shape that parseXML() produces. Returns [] for empty input (the
    original raised NameError here).
    """
    blast = []
    blast_gene = []
    activeAlign = ""
    numAlignments = 0
    hsp_num = 0
    data = None

    for line in blasttsv:
        if not line.strip():
            # Tolerate blank/trailing lines instead of crashing on them.
            continue
        data = [field.strip() for field in line.strip("\n").split("\t")]
        if activeAlign == "":
            # Very first record starts the first alignment group.
            numAlignments += 1
            blast_gene = []
            hsp_num = 1
        elif activeAlign != data[1]:
            # Subject id changed: close out the previous group.
            numAlignments += 1
            blast.append(blast_gene)
            blast_gene = []
            hsp_num = 1
        else:
            hsp_num += 1
        gi_nos = data[12]
        activeAlign = data[1]
        # Approximate identity fraction over the query span; the -1 and
        # missing +1 offsets mirror parseXML()/intron_detection.
        x = float(float(data[14]) - 1) / (float(data[7]) - float(data[6]))
        # Short name: subject id truncated at the first space.
        nice_name = data[1]
        if " " in nice_name:
            nice_name = nice_name[0 : nice_name.index(" ")]
        blast_gene.append(
            {
                "gi_nos": gi_nos,
                "sbjct_length": int(data[23]),
                "query_length": int(data[22]),
                "sbjct_range": (int(data[8]), int(data[9])),
                "query_range": (int(data[6]), int(data[7])),
                "name": nice_name,
                "evalue": float(data[10]),
                "identity": int(data[14]),
                "identity_percent": x,
                "hit_num": numAlignments,
                "iter_num": hsp_num,
                "match_id": data[24].partition(">")[0],
                "align_len": int(data[3]),
            }
        )

    if data is None:
        # No data lines at all: nothing to summarize.
        return blast

    blast.append(blast_gene)
    outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n")
    outFile.write("%s\t%d\t%d\n\n" % (data[0], int(data[22]), numAlignments))

    return blast
109 | |
110 | |
111 def test_true(feature, **kwargs): | |
112 return True | |
113 | |
114 | |
def superSets(inSets):
    """Drop every set wholly contained in a larger set already kept.

    Sorts inSets in place by length, largest first, then keeps a set only
    if at least one of its elements (compared by equality) is missing from
    every previously kept set.

    Returns the list of kept (maximal) sets, largest first.
    """
    inSets.sort(key=len, reverse=True)
    res = []
    for cand in inSets:
        # Keep cand unless it is a subset of something already kept.
        if not any(all(elem in kept for elem in cand) for kept in res):
            res.append(cand)
    return res
133 | |
134 | |
def disjointSets(inSets):
    """Greedy selection of mutually element-disjoint sets.

    Sorts inSets in place by the subject-start coordinate of each set's
    first HSP, then keeps a set only if none of its elements (compared by
    equality) already appear in a previously kept set.

    Returns the kept sets; [] for empty input (the original raised
    IndexError on an empty list).
    """
    if not inSets:
        return []
    inSets.sort(key=lambda s: s[0]["sbjct_range"][0])
    res = [inSets[0]]
    for cand in inSets[1:]:
        # Shares any element with a kept set -> not disjoint -> skip.
        if not any(elem in kept for elem in cand for kept in res):
            res.append(cand)
    return res
150 | |
151 | |
def compPhage(inRec, outFile, padding=1.2, cutoff=0.3, numReturn=20, isTSV=False):
    """Cluster BLAST HSPs by subject location and report the best clusters.

    inRec: open handle of a BLAST XML report (default) or of the extended
        tabular format (isTSV=True); parsed via parseXML()/openTSV(),
        which also write a per-query summary to outFile.
    outFile: open writable handle for the TSV report.
    padding: cluster window size, as a multiple of the query length.
    cutoff: minimum overall query-identity fraction for a cluster to be
        reported.
    numReturn: maximum number of clusters to report.
    isTSV: input-format switch.
    """

    if isTSV:
        inRec = openTSV(inRec, outFile)
    else:
        inRec = parseXML(inRec, outFile)
    res = []
    for group in inRec:
        # Max subject span a cluster may cover: padded query length.
        window = floor(padding * float(group[0]["query_length"]))
        group = sorted(group, key=lambda x: x["sbjct_range"][0])
        hspGroups = []
        lastInd = len(res)  # res entries for THIS group start at this index

        # Anchor a candidate cluster at every HSP and pull in every later
        # HSP that falls entirely inside the window.
        for x in range(0, len(group)):
            hspGroups.append([group[x]])
            startBound = group[x]["sbjct_range"][0]
            endBound = startBound + window
            for hsp in group[x + 1 :]:
                if (
                    hsp["sbjct_range"][0] >= startBound
                    and hsp["sbjct_range"][1] <= endBound
                ):
                    hspGroups[-1].append(hsp)

        # Drop clusters contained in bigger ones, then greedily keep a
        # mutually HSP-disjoint selection.
        for x in disjointSets(superSets(hspGroups)):
            res.append(x)

        maxID = 0.0  # NOTE(review): computed but never read afterwards
        for x in res[lastInd:]:
            sumID = 0.0
            totAlign = 0
            for y in x:
                totAlign += y["align_len"]
                sumID += float(y["identity"])
            # Cluster stats are appended onto the HSP list itself:
            # x[-2] = total aligned length, x[-1] = identities/query length.
            x.append(totAlign)
            x.append(sumID / float(x[0]["query_length"]))
            maxID = max(maxID, x[-1])

    # Best clusters first; score is x[-1] (overall query identity).
    res = sorted(res, key=lambda x: x[-1], reverse=True)

    outList = []
    outNum = 0
    for x in res:
        if outNum == numReturn or x[-1] < cutoff:
            break
        outNum += 1
        outList.append(x)

    # Original request was that low scoring clusters would make it to the final results IF
    # they were part of an Accession cluster that did have at least one high scoring member.

    outFile.write(
        "Accession Number\tCluster Start Location\tEnd Location\tSubject Cluster Length\t# HSPs in Cluster\tTotal Aligned Length\t% of Query Aligned\tOverall % Query Identity\tOverall % Subject Identity\tComplete Accession Info\n"
    )
    for x in outList:
        minStart = min(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1])
        maxEnd = max(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1])
        # Prefer the GenBank accession embedded in the match title, when
        # present; otherwise fall back to the raw accession field.
        if "|gb|" in x[0]["match_id"]:
            startSlice = x[0]["match_id"].index("gb|") + 3
            endSlice = (x[0]["match_id"][startSlice:]).index("|")
            accOut = x[0]["match_id"][startSlice : startSlice + endSlice]
        else:
            accOut = x[0]["gi_nos"]
        # The last two entries of x are the appended stats, not HSPs.
        for y in x[0:-2]:
            # ("\t%.3f\t" % (x[-1]))
            minStart = min(minStart, y["sbjct_range"][0])
            maxEnd = max(maxEnd, y["sbjct_range"][1])
        # NOTE(review): len(x) - 1 includes one of the two appended stat
        # entries; the HSP count is likely len(x) - 2 — confirm.
        outFile.write(
            accOut
            + "\t"
            + str(minStart)
            + "\t"
            + str(maxEnd)
            + "\t"
            + str(maxEnd - minStart + 1)
            + "\t"
            + str(len(x) - 1)
            + "\t"
            + str(x[-2])
            + ("\t%.3f" % (float(x[-2]) / float(x[0]["query_length"]) * 100.00))
            + ("\t%.3f" % (x[-1] * 100.00))
            + ("\t%.3f" % (float(x[-2]) / float(maxEnd - minStart + 1) * 100.00))
            + "\t"
            + x[0]["match_id"]
            + "\n"
        )
238 | |
239 # accession start end number | |
240 | |
241 | |
if __name__ == "__main__":
    # Command-line entry point. Help strings below were copy-paste errors
    # from the intron_detection tool; corrected to describe what the
    # parameters actually do (defaults unchanged).
    parser = argparse.ArgumentParser(
        description="Prophage relatedness comparison from BLAST results"
    )
    parser.add_argument(
        "inRec",
        type=argparse.FileType("r"),
        help="BLAST protein results, XML by default (TSV with --isTSV)",
    )
    parser.add_argument(
        "--outFile",
        type=argparse.FileType("w"),
        help="Output report file (TSV)",
        default="./compPro.tsv",
    )
    parser.add_argument(
        "--padding",
        help="Cluster window size as a multiple of the query length (default 1.2)",
        default=1.2,
        type=float,
    )
    parser.add_argument(
        "--cutoff",
        help="Minimum overall query-identity fraction for a reported cluster (default 0.3)",
        default=0.3,
        type=float,
    )
    parser.add_argument(
        "--numReturn",
        help="Maximum number of clusters to report (default 20)",
        default=20,
        type=int,
    )
    parser.add_argument(
        "--isTSV",
        help="Input is an extended tabular BLAST result instead of XML",
        action="store_true",
    )
    args = parser.parse_args()

    compPhage(**vars(args))