comparison prophage_relatedness.py @ 0:7a23dda2e932 draft

planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
author cpt
date Thu, 08 Aug 2024 03:09:32 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7a23dda2e932
1 #!/usr/bin/env python
2 import argparse
3 from math import floor
4 from Bio.Blast import NCBIXML
5 import logging
6
7 logging.basicConfig(level=logging.DEBUG)
8 log = logging.getLogger()
9
10
11 def parseXML(blastxml, outFile): # Modified from intron_detection
12 blast = []
13 for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1):
14 align_num = 0
15 outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n")
16 outFile.write(
17 "%s\t%d\t%d\n\n"
18 % (
19 blast_record.query_id,
20 blast_record.query_length,
21 len(blast_record.alignments),
22 )
23 )
24
25 for alignment in blast_record.alignments:
26
27 align_num += 1
28 gi_nos = str(alignment.accession)
29 blast_gene = []
30 for hsp in alignment.hsps:
31
32 x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start)
33 nice_name = blast_record.query
34 if " " in nice_name:
35 nice_name = nice_name[0 : nice_name.index(" ")]
36 blast_gene.append(
37 {
38 "gi_nos": gi_nos,
39 "sbjct_length": alignment.length,
40 "query_length": blast_record.query_length,
41 "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end),
42 "query_range": (hsp.query_start, hsp.query_end),
43 "name": nice_name,
44 "evalue": hsp.expect,
45 "identity": hsp.identities,
46 "identity_percent": x,
47 "hit_num": align_num,
48 "iter_num": iter_num,
49 "match_id": alignment.title.partition(">")[0],
50 "align_len": hsp.align_length,
51 }
52 )
53 blast.append(blast_gene)
54
55 return blast
56
57
58 def openTSV(blasttsv, outFile): # Modified from intron_detection
59 blast = []
60 activeAlign = ""
61 numAlignments = 0
62 qLen = 0
63 for line in blasttsv:
64 line = line.strip("\n")
65 data = line.split("\t")
66 for x in range(0, len(data)):
67 data[x] = data[x].strip()
68 qLen = data[22]
69 if activeAlign == "":
70 numAlignments += 1
71 blast_gene = []
72 hsp_num = 1
73 elif activeAlign != data[1]:
74 numAlignments += 1
75 blast.append(blast_gene)
76 blast_gene = []
77 hsp_num = 1
78 else:
79 hsp_num += 1
80 gi_nos = data[12]
81 activeAlign = data[1]
82 x = float(float(data[14]) - 1) / (float(data[7]) - float(data[6]))
83 nice_name = data[1]
84 if " " in nice_name:
85 nice_name = nice_name[0 : nice_name.index(" ")]
86 blast_gene.append(
87 {
88 "gi_nos": gi_nos,
89 "sbjct_length": int(data[23]),
90 "query_length": int(data[22]),
91 "sbjct_range": (int(data[8]), int(data[9])),
92 "query_range": (int(data[6]), int(data[7])),
93 "name": nice_name,
94 "evalue": float(data[10]),
95 "identity": int(data[14]),
96 "identity_percent": x,
97 "hit_num": numAlignments,
98 "iter_num": hsp_num,
99 "match_id": data[24].partition(">")[0],
100 "align_len": int(data[3]),
101 }
102 )
103
104 blast.append(blast_gene)
105 outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n")
106 outFile.write("%s\t%d\t%d\n\n" % (data[0], int(data[22]), numAlignments))
107
108 return blast
109
110
111 def test_true(feature, **kwargs):
112 return True
113
114
115 def superSets(inSets):
116 inSets.sort(key=len, reverse=True)
117 nextInd = 0
118 res = []
119 for i in range(0, len(inSets)):
120 if i == 0:
121 res.append(inSets[i])
122 continue
123 for par in res:
124 complete = True
125 for x in inSets[i]:
126 if not (x in par):
127 complete = False
128 if complete:
129 break # Subset of at least one member
130 if not complete:
131 res.append(inSets[i])
132 return res
133
134
135 def disjointSets(inSets):
136 inSets.sort(key=lambda x: x[0]["sbjct_range"][0])
137 res = [inSets[0]]
138 for i in range(1, len(inSets)):
139 disjoint = True
140 for elem in inSets[i]:
141 for cand in res:
142 if elem in cand:
143 disjoint = False
144 break
145 if not disjoint:
146 break
147 if disjoint:
148 res.append(inSets[i])
149 return res
150
151
152 def compPhage(inRec, outFile, padding=1.2, cutoff=0.3, numReturn=20, isTSV=False):
153
154 if isTSV:
155 inRec = openTSV(inRec, outFile)
156 else:
157 inRec = parseXML(inRec, outFile)
158 res = []
159 for group in inRec:
160 window = floor(padding * float(group[0]["query_length"]))
161 group = sorted(group, key=lambda x: x["sbjct_range"][0])
162 hspGroups = []
163 lastInd = len(res)
164
165 for x in range(0, len(group)):
166 hspGroups.append([group[x]])
167 startBound = group[x]["sbjct_range"][0]
168 endBound = startBound + window
169 for hsp in group[x + 1 :]:
170 if (
171 hsp["sbjct_range"][0] >= startBound
172 and hsp["sbjct_range"][1] <= endBound
173 ):
174 hspGroups[-1].append(hsp)
175
176 for x in disjointSets(superSets(hspGroups)):
177 res.append(x)
178
179 maxID = 0.0
180 for x in res[lastInd:]:
181 sumID = 0.0
182 totAlign = 0
183 for y in x:
184 totAlign += y["align_len"]
185 sumID += float(y["identity"])
186 x.append(totAlign)
187 x.append(sumID / float(x[0]["query_length"]))
188 maxID = max(maxID, x[-1])
189
190 res = sorted(res, key=lambda x: x[-1], reverse=True)
191
192 outList = []
193 outNum = 0
194 for x in res:
195 if outNum == numReturn or x[-1] < cutoff:
196 break
197 outNum += 1
198 outList.append(x)
199
200 # Original request was that low scoring clusters would make it to the final results IF
201 # they were part of an Accession cluster that did have at least one high scoring member.
202
203 outFile.write(
204 "Accession Number\tCluster Start Location\tEnd Location\tSubject Cluster Length\t# HSPs in Cluster\tTotal Aligned Length\t% of Query Aligned\tOverall % Query Identity\tOverall % Subject Identity\tComplete Accession Info\n"
205 )
206 for x in outList:
207 minStart = min(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1])
208 maxEnd = max(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1])
209 if "|gb|" in x[0]["match_id"]:
210 startSlice = x[0]["match_id"].index("gb|") + 3
211 endSlice = (x[0]["match_id"][startSlice:]).index("|")
212 accOut = x[0]["match_id"][startSlice : startSlice + endSlice]
213 else:
214 accOut = x[0]["gi_nos"]
215 for y in x[0:-2]:
216 # ("\t%.3f\t" % (x[-1]))
217 minStart = min(minStart, y["sbjct_range"][0])
218 maxEnd = max(maxEnd, y["sbjct_range"][1])
219 outFile.write(
220 accOut
221 + "\t"
222 + str(minStart)
223 + "\t"
224 + str(maxEnd)
225 + "\t"
226 + str(maxEnd - minStart + 1)
227 + "\t"
228 + str(len(x) - 1)
229 + "\t"
230 + str(x[-2])
231 + ("\t%.3f" % (float(x[-2]) / float(x[0]["query_length"]) * 100.00))
232 + ("\t%.3f" % (x[-1] * 100.00))
233 + ("\t%.3f" % (float(x[-2]) / float(maxEnd - minStart + 1) * 100.00))
234 + "\t"
235 + x[0]["match_id"]
236 + "\n"
237 )
238
239 # accession start end number
240
241
242 if __name__ == "__main__":
243 parser = argparse.ArgumentParser(description="Intron detection")
244 parser.add_argument(
245 "inRec", type=argparse.FileType("r"), help="blast XML protein results"
246 )
247 parser.add_argument(
248 "--outFile",
249 type=argparse.FileType("w"),
250 help="Output Error Log",
251 default="./compPro.tsv",
252 )
253 parser.add_argument(
254 "--padding",
255 help="Gap minimum (Default -1, set to a negative number to allow overlap)",
256 default=1.2,
257 type=float,
258 )
259 parser.add_argument(
260 "--cutoff",
261 help="Gap minimum (Default -1, set to a negative number to allow overlap)",
262 default=0.3,
263 type=float,
264 )
265 parser.add_argument(
266 "--numReturn",
267 help="Gap maximum in genome (Default 10000)",
268 default=20,
269 type=int,
270 )
271 parser.add_argument(
272 "--isTSV",
273 help="Opening Blast TSV result",
274 action="store_true",
275 )
276 args = parser.parse_args()
277
278 compPhage(**vars(args))