Mercurial > repos > cpt > cpt_related_genomes_prot
comparison cpt_related_genome_prot/relatedness_prot.py @ 0:ebcc87a27f9c draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 10 Jun 2022 08:46:28 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ebcc87a27f9c |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import argparse | |
4 import json | |
5 import logging | |
6 from Bio.Blast import NCBIXML | |
7 | |
8 logging.basicConfig(level=logging.DEBUG) | |
9 log = logging.getLogger() | |
10 | |
11 def parse_blast(blast, isXML = False): | |
12 res = [] | |
13 finalRes = [] | |
14 if isXML: | |
15 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1): | |
16 for alignment in blast_record.alignments: | |
17 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:] | |
18 tempID = tempID[:tempID.find("|")] | |
19 tempDesc = alignment.title | |
20 while tempDesc.find("|") >= 0: | |
21 tempDesc = tempDesc[tempDesc.find("|") + 1:] | |
22 tempDesc = tempDesc.strip() | |
23 tempID = tempID.strip() | |
24 #for hsp in alignment.hsps: | |
25 line = [str(blast_record.query)[:str(blast_record.query).find("[")].strip()] | |
26 line.append(alignment.hit_id) | |
27 line.append(tempDesc) | |
28 line.append(alignment.accession) | |
29 res.append(line) | |
30 blast.seek(0) | |
31 resInd = -1 | |
32 taxLine = blast.readline() | |
33 while taxLine: | |
34 if "<Hit>" in taxLine: | |
35 resInd += 1 | |
36 taxSlice = "" | |
37 elif "<taxid>" in taxLine: | |
38 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")] | |
39 finalRes.append(res[resInd]) | |
40 finalRes[-1].append(taxSlice) | |
41 taxLine = blast.readline() | |
42 return finalRes | |
43 else: | |
44 for line in blast: | |
45 finalRes.append(line.strip("\n").split("\t")) | |
46 return finalRes | |
47 | |
48 def with_dice(blast): | |
49 for data in blast: | |
50 dice = 2 * int(data[14]) / (float(data[22]) + float(data[23])) | |
51 yield data + [dice] | |
52 | |
53 | |
54 def filter_dice(blast, threshold=0.5): | |
55 for data in blast: | |
56 if data[-1] > threshold: | |
57 yield data | |
58 | |
59 | |
60 def split_identifiers_nucl(_, ident): | |
61 if "<>" in ident: | |
62 idents = ident.split("<>") | |
63 else: | |
64 idents = [ident] | |
65 return idents | |
66 | |
67 | |
68 def split_identifiers_prot(_, ident): | |
69 if "<>" in ident: | |
70 idents = ident.split("<>") | |
71 else: | |
72 idents = [ident] | |
73 return [ | |
74 x[x.index("[") + 1 : x.rindex("]")] | |
75 for x in idents | |
76 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638] | |
77 if "[" in x and "]" in x | |
78 ] | |
79 | |
80 | |
81 def split_identifiers_phage(par, ident): | |
82 par = par.replace("lcl|", "") | |
83 par = par[0 : par.index("_prot_")] | |
84 return [par] | |
85 | |
86 | |
87 def important_only(blast, split_identifiers): | |
88 for data in blast: | |
89 yield [ | |
90 data[0], # 01 Query Seq-id (ID of your sequence) | |
91 data[1], # 13 All subject Seq-id(s), separated by a ';' | |
92 split_identifiers( | |
93 data[1], data[2] | |
94 ), # 25 All subject title(s), separated by a '<>' | |
95 data[3].split(";"), # Extra: All Subject Accessions | |
96 data[4].split(";"), # Extra: All TaxIDs | |
97 ] | |
98 | |
99 | |
100 def deform_scores(blast): | |
101 for data in blast: | |
102 for org in data[2]: | |
103 yield [data[0], data[1], org, data[3], data[4]] | |
104 | |
105 | |
106 def expand_fields(blast): | |
107 for data in blast: | |
108 for x in range(0, len(data[4])): | |
109 yield [data[0], data[1], data[2][x], data[3], int(data[4][x])] | |
110 | |
111 def expand_taxIDs(blast, taxFilter): | |
112 for data in blast: | |
113 # if(len(data[4]) > 0): | |
114 # print(data[0]) | |
115 for ID in data[4]: | |
116 if ID != "N/A": | |
117 filterOut = False | |
118 for tax in taxFilter: | |
119 if str(ID).strip() == tax: | |
120 filterOut = True | |
121 if not filterOut: | |
122 yield [data[0], data[1], data[2], data[3], int(ID)] | |
123 | |
124 | |
125 def expand_titles(blast): | |
126 for data in blast: | |
127 for title in data[2]: | |
128 yield [data[0], data[1], title, data[3], data[4]] | |
129 | |
130 | |
131 def filter_phage(blast, phageTaxLookup, phageSciNames): | |
132 for data in blast: | |
133 for x in range(0, len(phageTaxLookup)): | |
134 if (data[4]) == phageTaxLookup[x]: | |
135 yield [data[0], data[1], phageSciNames[x], data[3], data[4]] | |
136 break | |
137 | |
138 | |
139 def remove_dupes(data): | |
140 has_seen = {} | |
141 for row in data: | |
142 # qseqid, sseqid | |
143 key = (row[0], row[4]) | |
144 # If we've seen the key before, we can exit | |
145 if key in has_seen: | |
146 continue | |
147 | |
148 # Otherwise, continue on | |
149 has_seen[key] = True | |
150 # Pretty simple | |
151 yield row | |
152 | |
153 def scoreMap(blast): | |
154 c = {} | |
155 m = {} | |
156 for (qseq, subID, subTitle, access, ID) in blast: | |
157 if (str(subTitle), ID) not in c: | |
158 m[(str(subTitle), ID)] = access | |
159 c[(str(subTitle), ID)] = 0 | |
160 | |
161 c[(str(subTitle), ID)] += 1 | |
162 return c, m | |
163 | |
164 | |
165 if __name__ == "__main__": | |
166 parser = argparse.ArgumentParser(description="Top related genomes") | |
167 parser.add_argument( | |
168 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results" | |
169 ) | |
170 parser.add_argument("phagedb", type=argparse.FileType("r")) | |
171 parser.add_argument("--access", action="store_true") | |
172 parser.add_argument("--protein", action="store_true") | |
173 parser.add_argument("--canonical", action="store_true") | |
174 parser.add_argument("--noFilter", action="store_true") | |
175 #parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester | |
176 parser.add_argument("--hits", type=int, default=5) | |
177 parser.add_argument("--xmlMode", action="store_true") | |
178 parser.add_argument("--taxFilter", type=str) | |
179 | |
180 args = parser.parse_args() | |
181 | |
182 phageDb = args.phagedb | |
183 phageTaxLookup = [] | |
184 sciName = [] | |
185 line = phageDb.readline() | |
186 | |
187 taxList = [] | |
188 if args.taxFilter and args.taxFilter != "" : | |
189 args.taxFilter = args.taxFilter.split(" ") | |
190 for ind in args.taxFilter: | |
191 taxList.append(ind.strip()) | |
192 | |
193 while line: | |
194 line = line.split("\t") | |
195 phageTaxLookup.append(int(line[0])) | |
196 line[1] = line[1].strip() | |
197 if (line[1] == ""): | |
198 line[1] = "Novel Genome" | |
199 sciName.append(line[1]) | |
200 line = phageDb.readline() | |
201 | |
202 if args.protein: | |
203 splitId = split_identifiers_prot | |
204 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb} | |
205 elif args.canonical: | |
206 splitId = split_identifiers_phage | |
207 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb} | |
208 else: | |
209 splitId = split_identifiers_nucl | |
210 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb} | |
211 | |
212 data = parse_blast(args.blast, args.xmlMode) | |
213 # data = with_dice(data) | |
214 # data = filter_dice(data, threshold=0.0) | |
215 data = important_only(data, splitId) | |
216 | |
217 data = expand_taxIDs(data, taxList) | |
218 data = remove_dupes(data) | |
219 if not args.noFilter: | |
220 data = filter_phage(data, phageTaxLookup, sciName) | |
221 listify = [] | |
222 for x in data: | |
223 listify.append(x) | |
224 #listify = greatest_taxID(listify) | |
225 | |
226 count_label = "Similar Unique Proteins" | |
227 | |
228 counts, accessions = scoreMap(listify) | |
229 | |
230 nameRec = listify[0][0] | |
231 sys.stdout.write( | |
232 "Top %d matches for BLASTp results of %s\n" | |
233 % (args.hits, nameRec) | |
234 ) | |
235 header = "# TaxID\t" | |
236 #if args.title: | |
237 header += "Name\t" | |
238 if args.access: | |
239 header += "Accessions\t" | |
240 header += "Similar Unique Proteins\n" | |
241 sys.stdout.write(header) | |
242 | |
243 for idx, ((name, ID), num) in enumerate( | |
244 sorted(counts.items(), key=lambda item: -item[1]) | |
245 ): | |
246 if idx > args.hits - 1: | |
247 break | |
248 line = str(ID) + "\t" | |
249 #if args.title: | |
250 line += str(name) + "\t" | |
251 if args.access: | |
252 line += str(accessions[(name, ID)][0]) + "\t" | |
253 line += str(num) + "\n" | |
254 sys.stdout.write(line) |