comparison blast_report.py @ 0:a61ee7b075c0 draft default tip

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/blast_report_basic commit b9c92df78fd78bf5881ab0cc5f5692d2bc71f5f6"
author public-health-bioinformatics
date Tue, 03 Mar 2020 06:11:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a61ee7b075c0
1 #!/usr/bin/env python
2
3 from __future__ import print_function
4
5 import argparse
6 import re
7 import sys
8
9 from Cheetah.Template import Template
10
11
12 def stop_err(msg):
13 sys.stderr.write("%s\n" % msg)
14 sys.exit(1)
15
16
17 class BLASTBin:
18 def __init__(self, label, file):
19 self.label = label
20 self.dict = {}
21
22 file_in = open(file)
23 for line in file_in:
24 self.dict[line.rstrip().split('.')[0]] = ''
25 file_in.close()
26
27 def __str__(self):
28 return "label: %s dict: %s" % (self.label, str(self.dict))
29
30
31 class BLASTQuery:
32 def __init__(self, query_id):
33 self.query_id = query_id
34 self.matches = []
35 self.match_accessions = {}
36 self.bins = {} # {bin(label):[match indexes]}
37 self.pident_filtered = 0
38 self.kw_filtered = 0
39 self.kw_filtered_breakdown = {} # {kw:count}
40
41 def __str__(self):
42 format_string = "\t".join([
43 "query_id: %s",
44 "len(matches): %s",
45 "bins (labels only): %s",
46 "pident_filtered: %s",
47 "kw_filtered: %s",
48 "kw_filtered_breakdown: %s"
49 ])
50 return format_string \
51 % (self.query_id,
52 str(len(self.matches)),
53 str([bin.label for bin in bins]),
54 str(self.pident_filtered),
55 str(self.kw_filtered),
56 str(self.kw_filtered_breakdown))
57
58
59 class BLASTMatch:
60 def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins):
61 self.subject_acc = subject_acc
62 self.subject_descr = subject_descr
63 self.score = score
64 self.p_cov = p_cov
65 self.p_ident = p_ident
66 self.bins = subject_bins
67
68 def __str__(self):
69 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \
70 % (self.subject_acc,
71 self.subject_descr,
72 str(self.score),
73 str(round(self.p_cov, 2)),
74 str(round(self.p_ident, 2)))
75
76
77 # PARSE OPTIONS AND ARGUMENTS
78 parser = argparse.ArgumentParser()
79
80 parser.add_argument('-f', '--filter-keywords',
81 dest='filter_keywords',
82 )
83 parser.add_argument('-i', '--min-identity',
84 dest='min_identity',
85 )
86 parser.add_argument('-b', '--bins',
87 dest='bins',
88 action='append',
89 nargs='+'
90 )
91 parser.add_argument('-r', '--discard-redundant',
92 dest='discard_redundant',
93 default=False,
94 action='store_true'
95 )
96 parser.add_argument('input_tab')
97 parser.add_argument('cheetah_tmpl')
98 parser.add_argument('output_html')
99 parser.add_argument('output_tab')
100
101 args = parser.parse_args()
102
103 # BINS
104 bins = []
105 if args.bins is not None:
106 for bin in args.bins:
107 bins.append(BLASTBin(bin[0], bin[1]))
108
109 print('database bins: %s' % str([bin.label for bin in bins]))
110
111 # FILTERS
112 filter_pident = 0
113 filter_kws = []
114 if args.filter_keywords:
115 filter_kws = args.filter_keywords.split(',')
116 print('minimum percent identity: %s filter_kws: %s' % (str(args.min_identity), str(filter_kws)))
117
118 if args.discard_redundant:
119 print('Throwing out redundant hits...')
120
121
122 PIDENT_COL = 2
123 DESCR_COL = 24
124 SUBJ_ID_COL = 12
125 SCORE_COL = 11
126 PCOV_COL = 25
127 queries = []
128 current_query = ''
129 output_tab = open(args.output_tab, 'w')
130
131 with open(args.input_tab) as input_tab:
132 for line in input_tab:
133 cols = line.split('\t')
134 if cols[0] != current_query:
135 current_query = cols[0]
136 queries.append(BLASTQuery(current_query))
137
138 try:
139 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2]
140 except IndexError as e:
141 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])
142
143 # keep best (first) hit only for each query and accession id.
144 if args.discard_redundant:
145 if accs[0] in queries[-1].match_accessions:
146 continue # don't save the result and skip to the next
147 else:
148 queries[-1].match_accessions[accs[0]] = ''
149
150 p_ident = float(cols[PIDENT_COL])
151 # FILTER BY PIDENT
152 if p_ident < filter_pident: # if we are not filtering, filter_pident == 0 and this will never evaluate to True
153 queries[-1].pident_filtered += 1
154 continue
155
156 descrs = cols[DESCR_COL]
157 # FILTER BY KEY WORDS
158 filter_by_kw = False
159 for kw in filter_kws:
160 kw = kw.strip()
161 if kw != '' and re.search(kw, descrs, re.IGNORECASE):
162 filter_by_kw = True
163 try:
164 queries[-1].kw_filtered_breakdown[kw] += 1
165 except Exception as e:
166 queries[-1].kw_filtered_breakdown[kw] = 1
167 if filter_by_kw: # if we are not filtering, for loop will not be entered and this will never be True
168 queries[-1].kw_filtered += 1
169 continue
170 descr = descrs.split(';')[0]
171
172 # ATTEMPT BIN
173 subj_bins = []
174 for bin in bins: # if we are not binning, bins = [] so for loop not entered
175 for acc in accs:
176 if acc.split('.')[0] in bin.dict:
177 try:
178 queries[-1].bins[bin.label].append(len(queries[-1].matches))
179 except Exception as e:
180 queries[-1].bins[bin.label] = [len(queries[-1].matches)]
181 subj_bins.append(bin.label)
182 break # this result has been binned to this bin so break
183 acc = accs[0]
184
185 score = int(float(cols[SCORE_COL]))
186 p_cov = float(cols[PCOV_COL])
187
188 # SAVE RESULT
189 queries[-1].matches.append(
190 BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins)
191 )
192 output_tab.write(line)
193 input_tab.close()
194 output_tab.close()
195
196 '''
197 for query in queries:
198 print(query)
199 for match in query.matches:
200 print(' %s' % str(match))
201 for bin in query.bins:
202 print(' bin: %s' % bin)
203 for x in query.bins[bin]:
204 print(' %s' % str(query.matches[x]))
205 '''
206
207 namespace = {'queries': queries}
208 html = Template(file=args.cheetah_tmpl, searchList=[namespace])
209 out_html = open(args.output_html, 'w')
210 out_html.write(str(html))
211 out_html.close()