comparison cpt_related_genome_nuc/relatedness.py @ 0:5a5fe0a6f78d draft default tip

Uploaded
author cpt
date Fri, 10 Jun 2022 08:45:13 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5a5fe0a6f78d
1 #!/usr/bin/env python
2 import sys
3 import argparse
4 import json
5 import logging
6 from Bio.Blast import NCBIXML
7
8
9 logging.basicConfig(level=logging.DEBUG)
10 log = logging.getLogger()
11
12
13 def parse_blast(blast, isXML = False):
14 res = []
15 finalRes = []
16 if isXML:
17 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):
18 for alignment in blast_record.alignments:
19 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]
20 tempID = tempID[:tempID.find("|")]
21 tempDesc = alignment.title
22 while tempDesc.find("|") >= 0:
23 tempDesc = tempDesc[tempDesc.find("|") + 1:]
24 tempDesc = tempDesc.strip()
25 tempID = tempID.strip()
26 for hsp in alignment.hsps:
27 line = [str(blast_record.query)]
28 line.append(str(hsp.align_length))
29 line.append(str(hsp.identities))
30 line.append(str(blast_record.query_length))
31 line.append(str(alignment.length))
32 line.append(tempDesc)
33 line.append(tempID)
34 #line.append("0000000")
35 #print(line)
36 res.append(line)
37 blast.seek(0)
38 resInd = -1
39 taxLine = blast.readline()
40 while taxLine:
41 if "<Hit>" in taxLine:
42 resInd += 1
43 taxSlice = ""
44 elif "<taxid>" in taxLine:
45 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]
46 finalRes.append(res[resInd])
47 finalRes[-1].append(taxSlice)
48 #print(finalRes[-1])
49 taxLine = blast.readline()
50 return finalRes
51 else:
52 for line in blast:
53 taxSplit = []
54 preTaxSplit = line.strip("\n").split("\t")
55 for tax in preTaxSplit[-1].split(";"):
56 shallowCopy = []
57 for x in range(len(preTaxSplit)):
58 shallowCopy.append(preTaxSplit[x])
59 shallowCopy[-1] = tax
60 res.append(shallowCopy)
61 for line in res:
62 for access in line[6].split(";"):
63 shallowCopy = []
64 for x in range(len(line)):
65 shallowCopy.append(line[x])
66 shallowCopy[6] = access
67 finalRes.append(shallowCopy)
68 # for x in finalRes:
69 # print(x)
70 # exit()
71 return finalRes
72
73
74 def add_dice(blast):
75 res = []
76 for data in blast:
77 dice = (2 * float(data[2])) / (float(data[3]) + float(data[4]))
78 res.append(data + [dice])
79 return res
80
81
82 def make_num(blast):
83 res = []
84 for data in blast:
85 res.append(
86 [
87 data[0],
88 int(data[1]),
89 int(data[2]),
90 int(data[3]),
91 int(data[4]),
92 data[5],
93 data[6],
94 (data[7]),
95 ]
96 )
97 return res
98
99
100 def bundle_dice(blast):
101 res = []
102 ind = 0
103 seen = {}
104
105 for x in blast:
106 if (x[6] + "_" + (x[7])) in seen.keys():
107 res[seen[(x[6] + "_" + (x[7]))]][1] += x[1]
108 res[seen[(x[6] + "_" + (x[7]))]][2] += x[2]
109 res[seen[(x[6] + "_" + (x[7]))]][8] += x[8]
110 res[seen[(x[6] + "_" + (x[7]))]][9] += 1 # Num HSPs
111 else:
112 seen[(x[6] + "_" + (x[7]))] = ind
113 shallowCopy = []
114 for i in range(len(x)):
115 shallowCopy.append(x[i])
116 shallowCopy.append(1)
117 # print(shallowCopy)
118 res.append(shallowCopy)
119 ind += 1
120 # print(seen)
121 # stop = 0
122 # for i in res:
123 # print(i)
124 # stop += 1
125 # if stop > 7:
126 # exit()
127 return res
128
129
130 """def bundle_dice(blast):
131 res = []
132 ind = 0
133 seen = {}
134 for x in blast:
135 if ((x[0] + x[5])) in seen.keys():
136 res[seen[(x[0] + x[5])]][1] += x[1]
137 res[seen[(x[0] + x[5])]][2] += x[2]
138 res[seen[(x[0] + x[5])]][9] += 1 # Num HSPs
139 res[seen[(x[0] + x[5])]][8] = ((res[seen[(x[0] + x[5])]][8] * (res[seen[(x[0] + x[5])]][9] - 1)) + x[8]) / res[seen[(x[0] + x[5])]][9]
140 else:
141 seen[(x[0] + x[5])] = ind
142 res.append(x + [1])
143 ind += 1
144 return res """
145
146
147 def filter_dice(blast, threshold=0.5):
148 for data in blast:
149 if data[-1] > threshold:
150 yield data
151
152
153 def split_identifiers_nucl(_, ident):
154 if "<>" in ident:
155 idents = ident.split("<>")
156 else:
157 idents = [ident]
158 return idents
159
160
161 def split_identifiers_prot(_, ident):
162 if "<>" in ident:
163 idents = ident.split("<>")
164 else:
165 idents = [ident]
166 return [
167 x[x.index("[") + 1 : x.rindex("]")]
168 for x in idents
169 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]
170 if "[" in x and "]" in x
171 ]
172
173
174 def split_identifiers_phage(par, ident):
175 par = par.replace("lcl|", "")
176 par = par[0 : par.index("_prot_")]
177 return [par]
178
179
180 def important_only(blast, split_identifiers):
181 for data in blast:
182 yield [
183 data[0], # 01 Query Seq-id (ID of your sequence)
184 data[1], # 13 All subject Seq-id(s), separated by a ';'
185 split_identifiers(
186 data[1], data[2]
187 ), # 25 All subject title(s), separated by a '<>'
188 data[3].split(";"), # Extra: All Subject Accessions
189 data[4].split(";"), # Extra: All TaxIDs
190 ]
191
192
193 def deform_scores(blast):
194 for data in blast:
195 for org in data[2]:
196 yield [data[0], data[1], org, data[3], data[4]]
197
198
199 def expand_taxIDs(blast):
200 for data in blast:
201 # if(len(data[4]) > 0):
202 # print(data[0])
203 for ID in data[7]:
204 yield [
205 data[0],
206 data[1],
207 data[2],
208 data[3],
209 data[4],
210 data[5],
211 data[6],
212 int(ID),
213 data[8],
214 ]
215
216
217 def expand_titles(blast):
218 for data in blast:
219 for title in data[5]:
220 yield [
221 data[0],
222 data[1],
223 data[2],
224 data[3],
225 data[4],
226 title,
227 data[6],
228 data[7],
229 data[8],
230 ]
231
232
233 def filter_phage(blast, phageTaxLookup):
234 res = []
235 for data in blast:
236 if int(data[7]) in phageTaxLookup:
237 res.append(data)
238 return res
239
240
241 def remove_dupes(data):
242 has_seen = {}
243 res = []
244 for row in data:
245 # qseqid, sseqid
246 key = (row[0], row[6], row[7])
247 # If we've seen the key before, we can exit
248 if key in has_seen:
249 continue
250
251 # Otherwise, continue on
252 has_seen[key] = True
253 # Pretty simple
254 res.append(row)
255 return res
256
257
258 def scoreMap(blast):
259 c = {}
260 m = {}
261 for (qseq, subID, subTitle, access, ID) in blast:
262 if (subTitle, ID) not in c:
263 m[(subTitle, ID)] = access
264 c[(subTitle, ID)] = 0
265
266 c[(subTitle, ID)] += 1
267 return c, m
268
269
270 if __name__ == "__main__":
271 parser = argparse.ArgumentParser(description="Top related genomes")
272 parser.add_argument(
273 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"
274 )
275 parser.add_argument("phagedb", type=argparse.FileType("r"))
276 parser.add_argument("--access", action="store_true")
277 parser.add_argument("--protein", action="store_true")
278 parser.add_argument("--canonical", action="store_true")
279 parser.add_argument("--hits", type=int, default=5)
280 parser.add_argument("--noFilter", action="store_true")
281 parser.add_argument("--xmlMode", action="store_true")
282
283 args = parser.parse_args()
284
285 phageDb = args.phagedb
286 phageTaxLookup = []
287 line = phageDb.readline()
288 while line:
289 line = line.split("\t")
290 phageTaxLookup.append(int(line[0]))
291 line = phageDb.readline()
292
293 if args.protein:
294 splitId = split_identifiers_prot
295 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
296 elif args.canonical:
297 splitId = split_identifiers_phage
298 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
299 else:
300 splitId = split_identifiers_nucl
301 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb}
302
303 data = [] # Reformatting to list rather than generator
304
305 data = parse_blast(args.blast, args.xmlMode)
306 nameRec = data[0][0]
307 data = make_num(data)
308 data = add_dice(data)
309 data = bundle_dice(data)
310
311 # data = filter_dice(data, threshold=0.0)
312 # data = important_only(data, splitId)
313
314 # data = expand_taxIDs(data)
315 # data = deform_scores(data)
316 if not args.noFilter:
317 data = filter_phage(data, phageTaxLookup)
318 # data = expand_titles(data)
319
320 if args.protein or args.canonical:
321 data = remove_dupes(data) # Probably obsolete, bundle dice should do this
322 count_label = "Similar Unique Proteins"
323 else:
324 count_label = "Nucleotide Hits"
325 # data = with_dice(data)
326 data.sort(key=lambda data: -data[8])
327 # counts, accessions = scoreMap(data)
328
329 if args.access:
330 sys.stdout.write(
331 "Top %d matches for BLASTn results of %s\t\t\t\t\t\t\n"
332 % (args.hits, nameRec)
333 )
334 sys.stdout.write(
335 "TaxID\tName\tAccessions\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
336 )
337 ind = 0
338 for out in data:
339 if ind >= args.hits:
340 break
341 ind += 1
342 sys.stdout.write(
343 "%s\t%s\t%s\t%s\t%s\t%s\t%.4f\n"
344 % (out[7], out[5], out[6], out[4], out[9], out[2], out[8])
345 )
346
347 else:
348 sys.stdout.write(
349 "Top %d matches for BLASTn results of %s\t\t\t\t\t\n"
350 % (args.hits, data[0][0])
351 )
352 sys.stdout.write(
353 "TaxID\tName\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
354 )
355 ind = 0
356 for out in data:
357 if ind >= args.hits:
358 break
359 ind += 1
360 sys.stdout.write(
361 "%s\t%s\t%s\t%s\t%s\t%.4f\n"
362 % (out[7], out[5], out[4], out[9], out[1], out[8])
363 )
364