annotate cpt_related_genome_nuc/relatedness.py @ 0:5a5fe0a6f78d draft default tip

Uploaded
author cpt
date Fri, 10 Jun 2022 08:45:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
2 import sys
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
3 import argparse
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
4 import json
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
5 import logging
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
6 from Bio.Blast import NCBIXML
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
7
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
8
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
9 logging.basicConfig(level=logging.DEBUG)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
10 log = logging.getLogger()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
11
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
12
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
13 def parse_blast(blast, isXML = False):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
14 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
15 finalRes = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
16 if isXML:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
17 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
18 for alignment in blast_record.alignments:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
19 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
20 tempID = tempID[:tempID.find("|")]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
21 tempDesc = alignment.title
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
22 while tempDesc.find("|") >= 0:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
23 tempDesc = tempDesc[tempDesc.find("|") + 1:]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
24 tempDesc = tempDesc.strip()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
25 tempID = tempID.strip()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
26 for hsp in alignment.hsps:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
27 line = [str(blast_record.query)]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
28 line.append(str(hsp.align_length))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
29 line.append(str(hsp.identities))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
30 line.append(str(blast_record.query_length))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
31 line.append(str(alignment.length))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
32 line.append(tempDesc)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
33 line.append(tempID)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
34 #line.append("0000000")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
35 #print(line)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
36 res.append(line)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
37 blast.seek(0)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
38 resInd = -1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
39 taxLine = blast.readline()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
40 while taxLine:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
41 if "<Hit>" in taxLine:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
42 resInd += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
43 taxSlice = ""
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
44 elif "<taxid>" in taxLine:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
45 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
46 finalRes.append(res[resInd])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
47 finalRes[-1].append(taxSlice)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
48 #print(finalRes[-1])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
49 taxLine = blast.readline()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
50 return finalRes
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
51 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
52 for line in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
53 taxSplit = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
54 preTaxSplit = line.strip("\n").split("\t")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
55 for tax in preTaxSplit[-1].split(";"):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
56 shallowCopy = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
57 for x in range(len(preTaxSplit)):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
58 shallowCopy.append(preTaxSplit[x])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
59 shallowCopy[-1] = tax
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
60 res.append(shallowCopy)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
61 for line in res:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
62 for access in line[6].split(";"):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
63 shallowCopy = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
64 for x in range(len(line)):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
65 shallowCopy.append(line[x])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
66 shallowCopy[6] = access
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
67 finalRes.append(shallowCopy)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
68 # for x in finalRes:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
69 # print(x)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
70 # exit()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
71 return finalRes
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
72
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
73
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
74 def add_dice(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
75 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
76 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
77 dice = (2 * float(data[2])) / (float(data[3]) + float(data[4]))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
78 res.append(data + [dice])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
79 return res
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
80
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
81
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
82 def make_num(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
83 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
84 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
85 res.append(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
86 [
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
87 data[0],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
88 int(data[1]),
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
89 int(data[2]),
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
90 int(data[3]),
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
91 int(data[4]),
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
92 data[5],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
93 data[6],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
94 (data[7]),
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
95 ]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
96 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
97 return res
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
98
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
99
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
100 def bundle_dice(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
101 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
102 ind = 0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
103 seen = {}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
104
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
105 for x in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
106 if (x[6] + "_" + (x[7])) in seen.keys():
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
107 res[seen[(x[6] + "_" + (x[7]))]][1] += x[1]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
108 res[seen[(x[6] + "_" + (x[7]))]][2] += x[2]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
109 res[seen[(x[6] + "_" + (x[7]))]][8] += x[8]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
110 res[seen[(x[6] + "_" + (x[7]))]][9] += 1 # Num HSPs
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
111 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
112 seen[(x[6] + "_" + (x[7]))] = ind
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
113 shallowCopy = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
114 for i in range(len(x)):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
115 shallowCopy.append(x[i])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
116 shallowCopy.append(1)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
117 # print(shallowCopy)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
118 res.append(shallowCopy)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
119 ind += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
120 # print(seen)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
121 # stop = 0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
122 # for i in res:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
123 # print(i)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
124 # stop += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
125 # if stop > 7:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
126 # exit()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
127 return res
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
128
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
129
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
130 """def bundle_dice(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
131 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
132 ind = 0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
133 seen = {}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
134 for x in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
135 if ((x[0] + x[5])) in seen.keys():
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
136 res[seen[(x[0] + x[5])]][1] += x[1]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
137 res[seen[(x[0] + x[5])]][2] += x[2]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
138 res[seen[(x[0] + x[5])]][9] += 1 # Num HSPs
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
139 res[seen[(x[0] + x[5])]][8] = ((res[seen[(x[0] + x[5])]][8] * (res[seen[(x[0] + x[5])]][9] - 1)) + x[8]) / res[seen[(x[0] + x[5])]][9]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
140 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
141 seen[(x[0] + x[5])] = ind
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
142 res.append(x + [1])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
143 ind += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
144 return res """
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
145
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
146
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
147 def filter_dice(blast, threshold=0.5):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
148 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
149 if data[-1] > threshold:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
150 yield data
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
151
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
152
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
153 def split_identifiers_nucl(_, ident):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
154 if "<>" in ident:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
155 idents = ident.split("<>")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
156 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
157 idents = [ident]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
158 return idents
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
159
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
160
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
161 def split_identifiers_prot(_, ident):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
162 if "<>" in ident:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
163 idents = ident.split("<>")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
164 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
165 idents = [ident]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
166 return [
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
167 x[x.index("[") + 1 : x.rindex("]")]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
168 for x in idents
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
169 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
170 if "[" in x and "]" in x
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
171 ]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
172
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
173
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
174 def split_identifiers_phage(par, ident):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
175 par = par.replace("lcl|", "")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
176 par = par[0 : par.index("_prot_")]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
177 return [par]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
178
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
179
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
180 def important_only(blast, split_identifiers):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
181 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
182 yield [
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
183 data[0], # 01 Query Seq-id (ID of your sequence)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
184 data[1], # 13 All subject Seq-id(s), separated by a ';'
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
185 split_identifiers(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
186 data[1], data[2]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
187 ), # 25 All subject title(s), separated by a '<>'
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
188 data[3].split(";"), # Extra: All Subject Accessions
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
189 data[4].split(";"), # Extra: All TaxIDs
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
190 ]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
191
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
192
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
193 def deform_scores(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
194 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
195 for org in data[2]:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
196 yield [data[0], data[1], org, data[3], data[4]]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
197
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
198
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
199 def expand_taxIDs(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
200 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
201 # if(len(data[4]) > 0):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
202 # print(data[0])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
203 for ID in data[7]:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
204 yield [
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
205 data[0],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
206 data[1],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
207 data[2],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
208 data[3],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
209 data[4],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
210 data[5],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
211 data[6],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
212 int(ID),
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
213 data[8],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
214 ]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
215
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
216
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
217 def expand_titles(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
218 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
219 for title in data[5]:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
220 yield [
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
221 data[0],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
222 data[1],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
223 data[2],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
224 data[3],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
225 data[4],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
226 title,
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
227 data[6],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
228 data[7],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
229 data[8],
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
230 ]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
231
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
232
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
233 def filter_phage(blast, phageTaxLookup):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
234 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
235 for data in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
236 if int(data[7]) in phageTaxLookup:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
237 res.append(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
238 return res
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
239
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
240
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
241 def remove_dupes(data):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
242 has_seen = {}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
243 res = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
244 for row in data:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
245 # qseqid, sseqid
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
246 key = (row[0], row[6], row[7])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
247 # If we've seen the key before, we can exit
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
248 if key in has_seen:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
249 continue
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
250
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
251 # Otherwise, continue on
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
252 has_seen[key] = True
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
253 # Pretty simple
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
254 res.append(row)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
255 return res
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
256
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
257
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
258 def scoreMap(blast):
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
259 c = {}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
260 m = {}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
261 for (qseq, subID, subTitle, access, ID) in blast:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
262 if (subTitle, ID) not in c:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
263 m[(subTitle, ID)] = access
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
264 c[(subTitle, ID)] = 0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
265
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
266 c[(subTitle, ID)] += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
267 return c, m
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
268
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
269
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
270 if __name__ == "__main__":
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
271 parser = argparse.ArgumentParser(description="Top related genomes")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
272 parser.add_argument(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
273 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
274 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
275 parser.add_argument("phagedb", type=argparse.FileType("r"))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
276 parser.add_argument("--access", action="store_true")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
277 parser.add_argument("--protein", action="store_true")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
278 parser.add_argument("--canonical", action="store_true")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
279 parser.add_argument("--hits", type=int, default=5)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
280 parser.add_argument("--noFilter", action="store_true")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
281 parser.add_argument("--xmlMode", action="store_true")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
282
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
283 args = parser.parse_args()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
284
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
285 phageDb = args.phagedb
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
286 phageTaxLookup = []
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
287 line = phageDb.readline()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
288 while line:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
289 line = line.split("\t")
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
290 phageTaxLookup.append(int(line[0]))
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
291 line = phageDb.readline()
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
292
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
293 if args.protein:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
294 splitId = split_identifiers_prot
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
295 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
296 elif args.canonical:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
297 splitId = split_identifiers_phage
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
298 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
299 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
300 splitId = split_identifiers_nucl
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
301 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb}
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
302
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
303 data = [] # Reformatting to list rather than generator
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
304
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
305 data = parse_blast(args.blast, args.xmlMode)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
306 nameRec = data[0][0]
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
307 data = make_num(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
308 data = add_dice(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
309 data = bundle_dice(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
310
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
311 # data = filter_dice(data, threshold=0.0)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
312 # data = important_only(data, splitId)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
313
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
314 # data = expand_taxIDs(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
315 # data = deform_scores(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
316 if not args.noFilter:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
317 data = filter_phage(data, phageTaxLookup)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
318 # data = expand_titles(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
319
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
320 if args.protein or args.canonical:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
321 data = remove_dupes(data) # Probably obsolete, bundle dice should do this
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
322 count_label = "Similar Unique Proteins"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
323 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
324 count_label = "Nucleotide Hits"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
325 # data = with_dice(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
326 data.sort(key=lambda data: -data[8])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
327 # counts, accessions = scoreMap(data)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
328
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
329 if args.access:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
330 sys.stdout.write(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
331 "Top %d matches for BLASTn results of %s\t\t\t\t\t\t\n"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
332 % (args.hits, nameRec)
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
333 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
334 sys.stdout.write(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
335 "TaxID\tName\tAccessions\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
336 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
337 ind = 0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
338 for out in data:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
339 if ind >= args.hits:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
340 break
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
341 ind += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
342 sys.stdout.write(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
343 "%s\t%s\t%s\t%s\t%s\t%s\t%.4f\n"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
344 % (out[7], out[5], out[6], out[4], out[9], out[2], out[8])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
345 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
346
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
347 else:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
348 sys.stdout.write(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
349 "Top %d matches for BLASTn results of %s\t\t\t\t\t\n"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
350 % (args.hits, data[0][0])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
351 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
352 sys.stdout.write(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
353 "TaxID\tName\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
354 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
355 ind = 0
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
356 for out in data:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
357 if ind >= args.hits:
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
358 break
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
359 ind += 1
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
360 sys.stdout.write(
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
361 "%s\t%s\t%s\t%s\t%s\t%.4f\n"
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
362 % (out[7], out[5], out[4], out[9], out[1], out[8])
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
363 )
5a5fe0a6f78d Uploaded
cpt
parents:
diff changeset
364