Mercurial > repos > cpt > cpt_related_genomes_nuc
comparison cpt_related_genome_nuc/relatedness.py @ 0:5a5fe0a6f78d draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 10 Jun 2022 08:45:13 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5a5fe0a6f78d |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import argparse | |
4 import json | |
5 import logging | |
6 from Bio.Blast import NCBIXML | |
7 | |
8 | |
9 logging.basicConfig(level=logging.DEBUG) | |
10 log = logging.getLogger() | |
11 | |
12 | |
13 def parse_blast(blast, isXML = False): | |
14 res = [] | |
15 finalRes = [] | |
16 if isXML: | |
17 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1): | |
18 for alignment in blast_record.alignments: | |
19 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:] | |
20 tempID = tempID[:tempID.find("|")] | |
21 tempDesc = alignment.title | |
22 while tempDesc.find("|") >= 0: | |
23 tempDesc = tempDesc[tempDesc.find("|") + 1:] | |
24 tempDesc = tempDesc.strip() | |
25 tempID = tempID.strip() | |
26 for hsp in alignment.hsps: | |
27 line = [str(blast_record.query)] | |
28 line.append(str(hsp.align_length)) | |
29 line.append(str(hsp.identities)) | |
30 line.append(str(blast_record.query_length)) | |
31 line.append(str(alignment.length)) | |
32 line.append(tempDesc) | |
33 line.append(tempID) | |
34 #line.append("0000000") | |
35 #print(line) | |
36 res.append(line) | |
37 blast.seek(0) | |
38 resInd = -1 | |
39 taxLine = blast.readline() | |
40 while taxLine: | |
41 if "<Hit>" in taxLine: | |
42 resInd += 1 | |
43 taxSlice = "" | |
44 elif "<taxid>" in taxLine: | |
45 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")] | |
46 finalRes.append(res[resInd]) | |
47 finalRes[-1].append(taxSlice) | |
48 #print(finalRes[-1]) | |
49 taxLine = blast.readline() | |
50 return finalRes | |
51 else: | |
52 for line in blast: | |
53 taxSplit = [] | |
54 preTaxSplit = line.strip("\n").split("\t") | |
55 for tax in preTaxSplit[-1].split(";"): | |
56 shallowCopy = [] | |
57 for x in range(len(preTaxSplit)): | |
58 shallowCopy.append(preTaxSplit[x]) | |
59 shallowCopy[-1] = tax | |
60 res.append(shallowCopy) | |
61 for line in res: | |
62 for access in line[6].split(";"): | |
63 shallowCopy = [] | |
64 for x in range(len(line)): | |
65 shallowCopy.append(line[x]) | |
66 shallowCopy[6] = access | |
67 finalRes.append(shallowCopy) | |
68 # for x in finalRes: | |
69 # print(x) | |
70 # exit() | |
71 return finalRes | |
72 | |
73 | |
74 def add_dice(blast): | |
75 res = [] | |
76 for data in blast: | |
77 dice = (2 * float(data[2])) / (float(data[3]) + float(data[4])) | |
78 res.append(data + [dice]) | |
79 return res | |
80 | |
81 | |
82 def make_num(blast): | |
83 res = [] | |
84 for data in blast: | |
85 res.append( | |
86 [ | |
87 data[0], | |
88 int(data[1]), | |
89 int(data[2]), | |
90 int(data[3]), | |
91 int(data[4]), | |
92 data[5], | |
93 data[6], | |
94 (data[7]), | |
95 ] | |
96 ) | |
97 return res | |
98 | |
99 | |
100 def bundle_dice(blast): | |
101 res = [] | |
102 ind = 0 | |
103 seen = {} | |
104 | |
105 for x in blast: | |
106 if (x[6] + "_" + (x[7])) in seen.keys(): | |
107 res[seen[(x[6] + "_" + (x[7]))]][1] += x[1] | |
108 res[seen[(x[6] + "_" + (x[7]))]][2] += x[2] | |
109 res[seen[(x[6] + "_" + (x[7]))]][8] += x[8] | |
110 res[seen[(x[6] + "_" + (x[7]))]][9] += 1 # Num HSPs | |
111 else: | |
112 seen[(x[6] + "_" + (x[7]))] = ind | |
113 shallowCopy = [] | |
114 for i in range(len(x)): | |
115 shallowCopy.append(x[i]) | |
116 shallowCopy.append(1) | |
117 # print(shallowCopy) | |
118 res.append(shallowCopy) | |
119 ind += 1 | |
120 # print(seen) | |
121 # stop = 0 | |
122 # for i in res: | |
123 # print(i) | |
124 # stop += 1 | |
125 # if stop > 7: | |
126 # exit() | |
127 return res | |
128 | |
129 | |
130 """def bundle_dice(blast): | |
131 res = [] | |
132 ind = 0 | |
133 seen = {} | |
134 for x in blast: | |
135 if ((x[0] + x[5])) in seen.keys(): | |
136 res[seen[(x[0] + x[5])]][1] += x[1] | |
137 res[seen[(x[0] + x[5])]][2] += x[2] | |
138 res[seen[(x[0] + x[5])]][9] += 1 # Num HSPs | |
139 res[seen[(x[0] + x[5])]][8] = ((res[seen[(x[0] + x[5])]][8] * (res[seen[(x[0] + x[5])]][9] - 1)) + x[8]) / res[seen[(x[0] + x[5])]][9] | |
140 else: | |
141 seen[(x[0] + x[5])] = ind | |
142 res.append(x + [1]) | |
143 ind += 1 | |
144 return res """ | |
145 | |
146 | |
147 def filter_dice(blast, threshold=0.5): | |
148 for data in blast: | |
149 if data[-1] > threshold: | |
150 yield data | |
151 | |
152 | |
153 def split_identifiers_nucl(_, ident): | |
154 if "<>" in ident: | |
155 idents = ident.split("<>") | |
156 else: | |
157 idents = [ident] | |
158 return idents | |
159 | |
160 | |
161 def split_identifiers_prot(_, ident): | |
162 if "<>" in ident: | |
163 idents = ident.split("<>") | |
164 else: | |
165 idents = [ident] | |
166 return [ | |
167 x[x.index("[") + 1 : x.rindex("]")] | |
168 for x in idents | |
169 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638] | |
170 if "[" in x and "]" in x | |
171 ] | |
172 | |
173 | |
174 def split_identifiers_phage(par, ident): | |
175 par = par.replace("lcl|", "") | |
176 par = par[0 : par.index("_prot_")] | |
177 return [par] | |
178 | |
179 | |
180 def important_only(blast, split_identifiers): | |
181 for data in blast: | |
182 yield [ | |
183 data[0], # 01 Query Seq-id (ID of your sequence) | |
184 data[1], # 13 All subject Seq-id(s), separated by a ';' | |
185 split_identifiers( | |
186 data[1], data[2] | |
187 ), # 25 All subject title(s), separated by a '<>' | |
188 data[3].split(";"), # Extra: All Subject Accessions | |
189 data[4].split(";"), # Extra: All TaxIDs | |
190 ] | |
191 | |
192 | |
193 def deform_scores(blast): | |
194 for data in blast: | |
195 for org in data[2]: | |
196 yield [data[0], data[1], org, data[3], data[4]] | |
197 | |
198 | |
199 def expand_taxIDs(blast): | |
200 for data in blast: | |
201 # if(len(data[4]) > 0): | |
202 # print(data[0]) | |
203 for ID in data[7]: | |
204 yield [ | |
205 data[0], | |
206 data[1], | |
207 data[2], | |
208 data[3], | |
209 data[4], | |
210 data[5], | |
211 data[6], | |
212 int(ID), | |
213 data[8], | |
214 ] | |
215 | |
216 | |
217 def expand_titles(blast): | |
218 for data in blast: | |
219 for title in data[5]: | |
220 yield [ | |
221 data[0], | |
222 data[1], | |
223 data[2], | |
224 data[3], | |
225 data[4], | |
226 title, | |
227 data[6], | |
228 data[7], | |
229 data[8], | |
230 ] | |
231 | |
232 | |
233 def filter_phage(blast, phageTaxLookup): | |
234 res = [] | |
235 for data in blast: | |
236 if int(data[7]) in phageTaxLookup: | |
237 res.append(data) | |
238 return res | |
239 | |
240 | |
241 def remove_dupes(data): | |
242 has_seen = {} | |
243 res = [] | |
244 for row in data: | |
245 # qseqid, sseqid | |
246 key = (row[0], row[6], row[7]) | |
247 # If we've seen the key before, we can exit | |
248 if key in has_seen: | |
249 continue | |
250 | |
251 # Otherwise, continue on | |
252 has_seen[key] = True | |
253 # Pretty simple | |
254 res.append(row) | |
255 return res | |
256 | |
257 | |
258 def scoreMap(blast): | |
259 c = {} | |
260 m = {} | |
261 for (qseq, subID, subTitle, access, ID) in blast: | |
262 if (subTitle, ID) not in c: | |
263 m[(subTitle, ID)] = access | |
264 c[(subTitle, ID)] = 0 | |
265 | |
266 c[(subTitle, ID)] += 1 | |
267 return c, m | |
268 | |
269 | |
270 if __name__ == "__main__": | |
271 parser = argparse.ArgumentParser(description="Top related genomes") | |
272 parser.add_argument( | |
273 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results" | |
274 ) | |
275 parser.add_argument("phagedb", type=argparse.FileType("r")) | |
276 parser.add_argument("--access", action="store_true") | |
277 parser.add_argument("--protein", action="store_true") | |
278 parser.add_argument("--canonical", action="store_true") | |
279 parser.add_argument("--hits", type=int, default=5) | |
280 parser.add_argument("--noFilter", action="store_true") | |
281 parser.add_argument("--xmlMode", action="store_true") | |
282 | |
283 args = parser.parse_args() | |
284 | |
285 phageDb = args.phagedb | |
286 phageTaxLookup = [] | |
287 line = phageDb.readline() | |
288 while line: | |
289 line = line.split("\t") | |
290 phageTaxLookup.append(int(line[0])) | |
291 line = phageDb.readline() | |
292 | |
293 if args.protein: | |
294 splitId = split_identifiers_prot | |
295 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb} | |
296 elif args.canonical: | |
297 splitId = split_identifiers_phage | |
298 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb} | |
299 else: | |
300 splitId = split_identifiers_nucl | |
301 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb} | |
302 | |
303 data = [] # Reformatting to list rather than generator | |
304 | |
305 data = parse_blast(args.blast, args.xmlMode) | |
306 nameRec = data[0][0] | |
307 data = make_num(data) | |
308 data = add_dice(data) | |
309 data = bundle_dice(data) | |
310 | |
311 # data = filter_dice(data, threshold=0.0) | |
312 # data = important_only(data, splitId) | |
313 | |
314 # data = expand_taxIDs(data) | |
315 # data = deform_scores(data) | |
316 if not args.noFilter: | |
317 data = filter_phage(data, phageTaxLookup) | |
318 # data = expand_titles(data) | |
319 | |
320 if args.protein or args.canonical: | |
321 data = remove_dupes(data) # Probably obsolete, bundle dice should do this | |
322 count_label = "Similar Unique Proteins" | |
323 else: | |
324 count_label = "Nucleotide Hits" | |
325 # data = with_dice(data) | |
326 data.sort(key=lambda data: -data[8]) | |
327 # counts, accessions = scoreMap(data) | |
328 | |
329 if args.access: | |
330 sys.stdout.write( | |
331 "Top %d matches for BLASTn results of %s\t\t\t\t\t\t\n" | |
332 % (args.hits, nameRec) | |
333 ) | |
334 sys.stdout.write( | |
335 "TaxID\tName\tAccessions\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n" | |
336 ) | |
337 ind = 0 | |
338 for out in data: | |
339 if ind >= args.hits: | |
340 break | |
341 ind += 1 | |
342 sys.stdout.write( | |
343 "%s\t%s\t%s\t%s\t%s\t%s\t%.4f\n" | |
344 % (out[7], out[5], out[6], out[4], out[9], out[2], out[8]) | |
345 ) | |
346 | |
347 else: | |
348 sys.stdout.write( | |
349 "Top %d matches for BLASTn results of %s\t\t\t\t\t\n" | |
350 % (args.hits, data[0][0]) | |
351 ) | |
352 sys.stdout.write( | |
353 "TaxID\tName\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n" | |
354 ) | |
355 ind = 0 | |
356 for out in data: | |
357 if ind >= args.hits: | |
358 break | |
359 ind += 1 | |
360 sys.stdout.write( | |
361 "%s\t%s\t%s\t%s\t%s\t%.4f\n" | |
362 % (out[7], out[5], out[4], out[9], out[1], out[8]) | |
363 ) | |
364 |