0
|
1 #!/usr/bin/env python
|
|
2 import sys
|
|
3 import argparse
|
|
4 import json
|
|
5 import logging
|
|
6 from Bio.Blast import NCBIXML
|
|
7
|
|
8
|
|
9 logging.basicConfig(level=logging.DEBUG)
|
|
10 log = logging.getLogger()
|
|
11
|
|
12
|
|
13 def parse_blast(blast, isXML = False):
|
|
14 res = []
|
|
15 finalRes = []
|
|
16 if isXML:
|
|
17 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):
|
|
18 for alignment in blast_record.alignments:
|
|
19 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]
|
|
20 tempID = tempID[:tempID.find("|")]
|
|
21 tempDesc = alignment.title
|
|
22 while tempDesc.find("|") >= 0:
|
|
23 tempDesc = tempDesc[tempDesc.find("|") + 1:]
|
|
24 tempDesc = tempDesc.strip()
|
|
25 tempID = tempID.strip()
|
|
26 for hsp in alignment.hsps:
|
|
27 line = [str(blast_record.query)]
|
|
28 line.append(str(hsp.align_length))
|
|
29 line.append(str(hsp.identities))
|
|
30 line.append(str(blast_record.query_length))
|
|
31 line.append(str(alignment.length))
|
|
32 line.append(tempDesc)
|
|
33 line.append(tempID)
|
|
34 #line.append("0000000")
|
|
35 #print(line)
|
|
36 res.append(line)
|
|
37 blast.seek(0)
|
|
38 resInd = -1
|
|
39 taxLine = blast.readline()
|
|
40 while taxLine:
|
|
41 if "<Hit>" in taxLine:
|
|
42 resInd += 1
|
|
43 taxSlice = ""
|
|
44 elif "<taxid>" in taxLine:
|
|
45 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]
|
|
46 finalRes.append(res[resInd])
|
|
47 finalRes[-1].append(taxSlice)
|
|
48 #print(finalRes[-1])
|
|
49 taxLine = blast.readline()
|
|
50 return finalRes
|
|
51 else:
|
|
52 for line in blast:
|
|
53 taxSplit = []
|
|
54 preTaxSplit = line.strip("\n").split("\t")
|
|
55 for tax in preTaxSplit[-1].split(";"):
|
|
56 shallowCopy = []
|
|
57 for x in range(len(preTaxSplit)):
|
|
58 shallowCopy.append(preTaxSplit[x])
|
|
59 shallowCopy[-1] = tax
|
|
60 res.append(shallowCopy)
|
|
61 for line in res:
|
|
62 for access in line[6].split(";"):
|
|
63 shallowCopy = []
|
|
64 for x in range(len(line)):
|
|
65 shallowCopy.append(line[x])
|
|
66 shallowCopy[6] = access
|
|
67 finalRes.append(shallowCopy)
|
|
68 # for x in finalRes:
|
|
69 # print(x)
|
|
70 # exit()
|
|
71 return finalRes
|
|
72
|
|
73
|
|
74 def add_dice(blast):
|
|
75 res = []
|
|
76 for data in blast:
|
|
77 dice = (2 * float(data[2])) / (float(data[3]) + float(data[4]))
|
|
78 res.append(data + [dice])
|
|
79 return res
|
|
80
|
|
81
|
|
82 def make_num(blast):
|
|
83 res = []
|
|
84 for data in blast:
|
|
85 res.append(
|
|
86 [
|
|
87 data[0],
|
|
88 int(data[1]),
|
|
89 int(data[2]),
|
|
90 int(data[3]),
|
|
91 int(data[4]),
|
|
92 data[5],
|
|
93 data[6],
|
|
94 (data[7]),
|
|
95 ]
|
|
96 )
|
|
97 return res
|
|
98
|
|
99
|
|
100 def bundle_dice(blast):
|
|
101 res = []
|
|
102 ind = 0
|
|
103 seen = {}
|
|
104
|
|
105 for x in blast:
|
|
106 if (x[6] + "_" + (x[7])) in seen.keys():
|
|
107 res[seen[(x[6] + "_" + (x[7]))]][1] += x[1]
|
|
108 res[seen[(x[6] + "_" + (x[7]))]][2] += x[2]
|
|
109 res[seen[(x[6] + "_" + (x[7]))]][8] += x[8]
|
|
110 res[seen[(x[6] + "_" + (x[7]))]][9] += 1 # Num HSPs
|
|
111 else:
|
|
112 seen[(x[6] + "_" + (x[7]))] = ind
|
|
113 shallowCopy = []
|
|
114 for i in range(len(x)):
|
|
115 shallowCopy.append(x[i])
|
|
116 shallowCopy.append(1)
|
|
117 # print(shallowCopy)
|
|
118 res.append(shallowCopy)
|
|
119 ind += 1
|
|
120 # print(seen)
|
|
121 # stop = 0
|
|
122 # for i in res:
|
|
123 # print(i)
|
|
124 # stop += 1
|
|
125 # if stop > 7:
|
|
126 # exit()
|
|
127 return res
|
|
128
|
|
129
|
|
130 """def bundle_dice(blast):
|
|
131 res = []
|
|
132 ind = 0
|
|
133 seen = {}
|
|
134 for x in blast:
|
|
135 if ((x[0] + x[5])) in seen.keys():
|
|
136 res[seen[(x[0] + x[5])]][1] += x[1]
|
|
137 res[seen[(x[0] + x[5])]][2] += x[2]
|
|
138 res[seen[(x[0] + x[5])]][9] += 1 # Num HSPs
|
|
139 res[seen[(x[0] + x[5])]][8] = ((res[seen[(x[0] + x[5])]][8] * (res[seen[(x[0] + x[5])]][9] - 1)) + x[8]) / res[seen[(x[0] + x[5])]][9]
|
|
140 else:
|
|
141 seen[(x[0] + x[5])] = ind
|
|
142 res.append(x + [1])
|
|
143 ind += 1
|
|
144 return res """
|
|
145
|
|
146
|
|
147 def filter_dice(blast, threshold=0.5):
|
|
148 for data in blast:
|
|
149 if data[-1] > threshold:
|
|
150 yield data
|
|
151
|
|
152
|
|
153 def split_identifiers_nucl(_, ident):
|
|
154 if "<>" in ident:
|
|
155 idents = ident.split("<>")
|
|
156 else:
|
|
157 idents = [ident]
|
|
158 return idents
|
|
159
|
|
160
|
|
161 def split_identifiers_prot(_, ident):
|
|
162 if "<>" in ident:
|
|
163 idents = ident.split("<>")
|
|
164 else:
|
|
165 idents = [ident]
|
|
166 return [
|
|
167 x[x.index("[") + 1 : x.rindex("]")]
|
|
168 for x in idents
|
|
169 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]
|
|
170 if "[" in x and "]" in x
|
|
171 ]
|
|
172
|
|
173
|
|
174 def split_identifiers_phage(par, ident):
|
|
175 par = par.replace("lcl|", "")
|
|
176 par = par[0 : par.index("_prot_")]
|
|
177 return [par]
|
|
178
|
|
179
|
|
180 def important_only(blast, split_identifiers):
|
|
181 for data in blast:
|
|
182 yield [
|
|
183 data[0], # 01 Query Seq-id (ID of your sequence)
|
|
184 data[1], # 13 All subject Seq-id(s), separated by a ';'
|
|
185 split_identifiers(
|
|
186 data[1], data[2]
|
|
187 ), # 25 All subject title(s), separated by a '<>'
|
|
188 data[3].split(";"), # Extra: All Subject Accessions
|
|
189 data[4].split(";"), # Extra: All TaxIDs
|
|
190 ]
|
|
191
|
|
192
|
|
193 def deform_scores(blast):
|
|
194 for data in blast:
|
|
195 for org in data[2]:
|
|
196 yield [data[0], data[1], org, data[3], data[4]]
|
|
197
|
|
198
|
|
199 def expand_taxIDs(blast):
|
|
200 for data in blast:
|
|
201 # if(len(data[4]) > 0):
|
|
202 # print(data[0])
|
|
203 for ID in data[7]:
|
|
204 yield [
|
|
205 data[0],
|
|
206 data[1],
|
|
207 data[2],
|
|
208 data[3],
|
|
209 data[4],
|
|
210 data[5],
|
|
211 data[6],
|
|
212 int(ID),
|
|
213 data[8],
|
|
214 ]
|
|
215
|
|
216
|
|
217 def expand_titles(blast):
|
|
218 for data in blast:
|
|
219 for title in data[5]:
|
|
220 yield [
|
|
221 data[0],
|
|
222 data[1],
|
|
223 data[2],
|
|
224 data[3],
|
|
225 data[4],
|
|
226 title,
|
|
227 data[6],
|
|
228 data[7],
|
|
229 data[8],
|
|
230 ]
|
|
231
|
|
232
|
|
233 def filter_phage(blast, phageTaxLookup):
|
|
234 res = []
|
|
235 for data in blast:
|
|
236 if int(data[7]) in phageTaxLookup:
|
|
237 res.append(data)
|
|
238 return res
|
|
239
|
|
240
|
|
241 def remove_dupes(data):
|
|
242 has_seen = {}
|
|
243 res = []
|
|
244 for row in data:
|
|
245 # qseqid, sseqid
|
|
246 key = (row[0], row[6], row[7])
|
|
247 # If we've seen the key before, we can exit
|
|
248 if key in has_seen:
|
|
249 continue
|
|
250
|
|
251 # Otherwise, continue on
|
|
252 has_seen[key] = True
|
|
253 # Pretty simple
|
|
254 res.append(row)
|
|
255 return res
|
|
256
|
|
257
|
|
258 def scoreMap(blast):
|
|
259 c = {}
|
|
260 m = {}
|
|
261 for (qseq, subID, subTitle, access, ID) in blast:
|
|
262 if (subTitle, ID) not in c:
|
|
263 m[(subTitle, ID)] = access
|
|
264 c[(subTitle, ID)] = 0
|
|
265
|
|
266 c[(subTitle, ID)] += 1
|
|
267 return c, m
|
|
268
|
|
269
|
|
270 if __name__ == "__main__":
|
|
271 parser = argparse.ArgumentParser(description="Top related genomes")
|
|
272 parser.add_argument(
|
|
273 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"
|
|
274 )
|
|
275 parser.add_argument("phagedb", type=argparse.FileType("r"))
|
|
276 parser.add_argument("--access", action="store_true")
|
|
277 parser.add_argument("--protein", action="store_true")
|
|
278 parser.add_argument("--canonical", action="store_true")
|
|
279 parser.add_argument("--hits", type=int, default=5)
|
|
280 parser.add_argument("--noFilter", action="store_true")
|
|
281 parser.add_argument("--xmlMode", action="store_true")
|
|
282
|
|
283 args = parser.parse_args()
|
|
284
|
|
285 phageDb = args.phagedb
|
|
286 phageTaxLookup = []
|
|
287 line = phageDb.readline()
|
|
288 while line:
|
|
289 line = line.split("\t")
|
|
290 phageTaxLookup.append(int(line[0]))
|
|
291 line = phageDb.readline()
|
|
292
|
|
293 if args.protein:
|
|
294 splitId = split_identifiers_prot
|
|
295 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
|
|
296 elif args.canonical:
|
|
297 splitId = split_identifiers_phage
|
|
298 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
|
|
299 else:
|
|
300 splitId = split_identifiers_nucl
|
|
301 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb}
|
|
302
|
|
303 data = [] # Reformatting to list rather than generator
|
|
304
|
|
305 data = parse_blast(args.blast, args.xmlMode)
|
|
306 nameRec = data[0][0]
|
|
307 data = make_num(data)
|
|
308 data = add_dice(data)
|
|
309 data = bundle_dice(data)
|
|
310
|
|
311 # data = filter_dice(data, threshold=0.0)
|
|
312 # data = important_only(data, splitId)
|
|
313
|
|
314 # data = expand_taxIDs(data)
|
|
315 # data = deform_scores(data)
|
|
316 if not args.noFilter:
|
|
317 data = filter_phage(data, phageTaxLookup)
|
|
318 # data = expand_titles(data)
|
|
319
|
|
320 if args.protein or args.canonical:
|
|
321 data = remove_dupes(data) # Probably obsolete, bundle dice should do this
|
|
322 count_label = "Similar Unique Proteins"
|
|
323 else:
|
|
324 count_label = "Nucleotide Hits"
|
|
325 # data = with_dice(data)
|
|
326 data.sort(key=lambda data: -data[8])
|
|
327 # counts, accessions = scoreMap(data)
|
|
328
|
|
329 if args.access:
|
|
330 sys.stdout.write(
|
|
331 "Top %d matches for BLASTn results of %s\t\t\t\t\t\t\n"
|
|
332 % (args.hits, nameRec)
|
|
333 )
|
|
334 sys.stdout.write(
|
|
335 "TaxID\tName\tAccessions\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
|
|
336 )
|
|
337 ind = 0
|
|
338 for out in data:
|
|
339 if ind >= args.hits:
|
|
340 break
|
|
341 ind += 1
|
|
342 sys.stdout.write(
|
|
343 "%s\t%s\t%s\t%s\t%s\t%s\t%.4f\n"
|
|
344 % (out[7], out[5], out[6], out[4], out[9], out[2], out[8])
|
|
345 )
|
|
346
|
|
347 else:
|
|
348 sys.stdout.write(
|
|
349 "Top %d matches for BLASTn results of %s\t\t\t\t\t\n"
|
|
350 % (args.hits, data[0][0])
|
|
351 )
|
|
352 sys.stdout.write(
|
|
353 "TaxID\tName\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
|
|
354 )
|
|
355 ind = 0
|
|
356 for out in data:
|
|
357 if ind >= args.hits:
|
|
358 break
|
|
359 ind += 1
|
|
360 sys.stdout.write(
|
|
361 "%s\t%s\t%s\t%s\t%s\t%.4f\n"
|
|
362 % (out[7], out[5], out[4], out[9], out[1], out[8])
|
|
363 )
|
|
364
|