blast_report_basic: blast_report.py @ 0:a61ee7b075c0 (draft, default, tip)

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/blast_report_basic commit b9c92df78fd78bf5881ab0cc5f5692d2bc71f5f6"

author:    public-health-bioinformatics
date:      Tue, 03 Mar 2020 06:11:45 -0500
parents:   (none)
children:  (none)
#!/usr/bin/env python

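# Summarise tabular BLAST output as an HTML report.  Hits can be filtered by
# percent identity and by keywords in the subject description, optionally
# de-duplicated per accession, and assigned to user-supplied "bins" of
# accessions; retained hits are written back out as a filtered tabular file
# and rendered through a Cheetah template.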
from __future__ import print_function

import argparse
import re
import sys

from Cheetah.Template import Template


def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


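# A named set of subject accessions loaded from a file (one accession per
# line); version suffixes are stripped so lookups match with or without them.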
class BLASTBin:
    def __init__(self, label, file):
        self.label = label
        self.dict = {}

        with open(file) as file_in:
            for line in file_in:
                self.dict[line.rstrip().split('.')[0]] = ''

    def __str__(self):
        return "label: %s dict: %s" % (self.label, str(self.dict))


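# All retained matches for a single query, plus counters recording how many
# hits the identity and keyword filters removed and which matches fell into
# which bin.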
class BLASTQuery:
    def __init__(self, query_id):
        self.query_id = query_id
        self.matches = []
        self.match_accessions = {}
        self.bins = {}  # {bin label: [match indexes]}
        self.pident_filtered = 0
        self.kw_filtered = 0
        self.kw_filtered_breakdown = {}  # {kw: count}

    def __str__(self):
        format_string = "\t".join([
            "query_id: %s",
            "len(matches): %s",
            "bins (labels only): %s",
            "pident_filtered: %s",
            "kw_filtered: %s",
            "kw_filtered_breakdown: %s"
        ])
        return format_string \
            % (self.query_id,
               str(len(self.matches)),
               str(list(self.bins.keys())),
               str(self.pident_filtered),
               str(self.kw_filtered),
               str(self.kw_filtered_breakdown))


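# A single retained BLAST hit: accession, description, score, percent
# coverage, percent identity and the labels of any bins it matched.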
class BLASTMatch:
    def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins):
        self.subject_acc = subject_acc
        self.subject_descr = subject_descr
        self.score = score
        self.p_cov = p_cov
        self.p_ident = p_ident
        self.bins = subject_bins

    def __str__(self):
        return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \
            % (self.subject_acc,
               self.subject_descr,
               str(self.score),
               str(round(self.p_cov, 2)),
               str(round(self.p_ident, 2)))


# PARSE OPTIONS AND ARGUMENTS
parser = argparse.ArgumentParser()

parser.add_argument('-f', '--filter-keywords',
                    dest='filter_keywords',
                    )
parser.add_argument('-i', '--min-identity',
                    dest='min_identity',
                    )
parser.add_argument('-b', '--bins',
                    dest='bins',
                    action='append',
                    nargs='+'
                    )
parser.add_argument('-r', '--discard-redundant',
                    dest='discard_redundant',
                    default=False,
                    action='store_true'
                    )
parser.add_argument('input_tab')
parser.add_argument('cheetah_tmpl')
parser.add_argument('output_html')
parser.add_argument('output_tab')

args = parser.parse_args()

# BINS
bins = []
if args.bins is not None:
    for bin in args.bins:
        bins.append(BLASTBin(bin[0], bin[1]))

print('database bins: %s' % str([bin.label for bin in bins]))

# FILTERS
filter_pident = 0
filter_kws = []
if args.min_identity:
    # --min-identity is applied as a lower bound on percent identity
    filter_pident = float(args.min_identity)
if args.filter_keywords:
    filter_kws = args.filter_keywords.split(',')
print('minimum percent identity: %s filter_kws: %s' % (str(args.min_identity), str(filter_kws)))

if args.discard_redundant:
    print('Throwing out redundant hits...')


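# 0-based indexes of the columns used below: percent identity, score,
# subject id(s), subject description(s) and percent coverage.  They must line
# up with the column order of the tabular BLAST input produced upstream.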
PIDENT_COL = 2
DESCR_COL = 24
SUBJ_ID_COL = 12
SCORE_COL = 11
PCOV_COL = 25
queries = []
current_query = ''
output_tab = open(args.output_tab, 'w')

with open(args.input_tab) as input_tab:
    for line in input_tab:
        cols = line.split('\t')
        # rows for the same query are expected to be consecutive, as in
        # standard tabular BLAST output
        if cols[0] != current_query:
            current_query = cols[0]
            queries.append(BLASTQuery(current_query))

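        # The subject id field holds old-style pipe-delimited NCBI ids such as
        # 'gi|123|ref|NC_000913.3|'; slicing [1::2] twice picks out just the
        # accessions (dropping the 'gi'/'ref' tags and the GI numbers).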
        try:
            accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2]
        except IndexError:
            stop_err("Problem with splitting: " + cols[SUBJ_ID_COL])

        # keep best (first) hit only for each query and accession id.
        if args.discard_redundant:
            if accs[0] in queries[-1].match_accessions:
                continue  # don't save the result and skip to the next
            else:
                queries[-1].match_accessions[accs[0]] = ''

        p_ident = float(cols[PIDENT_COL])
        # FILTER BY PIDENT
        if p_ident < filter_pident:  # if we are not filtering, filter_pident == 0 and this is never True
            queries[-1].pident_filtered += 1
            continue

        descrs = cols[DESCR_COL]
        # FILTER BY KEYWORDS
        filter_by_kw = False
        for kw in filter_kws:
            kw = kw.strip()
            if kw != '' and re.search(kw, descrs, re.IGNORECASE):
                filter_by_kw = True
                try:
                    queries[-1].kw_filtered_breakdown[kw] += 1
                except KeyError:
                    queries[-1].kw_filtered_breakdown[kw] = 1
        if filter_by_kw:  # if we are not filtering, the for loop is not entered and this is never True
            queries[-1].kw_filtered += 1
            continue
        descr = descrs.split(';')[0]

        # ATTEMPT BIN: record which bins (if any) this hit's accessions belong to
        subj_bins = []
        for bin in bins:  # if we are not binning, bins == [] so the loop is not entered
            for acc in accs:
                if acc.split('.')[0] in bin.dict:
                    try:
                        queries[-1].bins[bin.label].append(len(queries[-1].matches))
                    except KeyError:
                        queries[-1].bins[bin.label] = [len(queries[-1].matches)]
                    subj_bins.append(bin.label)
                    break  # this result has been binned to this bin, so move on to the next bin
        acc = accs[0]  # report the first accession for this hit

        score = int(float(cols[SCORE_COL]))
        p_cov = float(cols[PCOV_COL])

        # SAVE RESULT
        queries[-1].matches.append(
            BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins)
        )
        output_tab.write(line)
output_tab.close()

# Debug dump of the parsed queries (disabled):
'''
for query in queries:
    print(query)
    for match in query.matches:
        print(' %s' % str(match))
    for bin in query.bins:
        print(' bin: %s' % bin)
        for x in query.bins[bin]:
            print(' %s' % str(query.matches[x]))
'''

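# Render the HTML report: the Cheetah template receives the parsed queries
# (with their matches, bins and filter counts) as the 'queries' variable.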
namespace = {'queries': queries}
html = Template(file=args.cheetah_tmpl, searchList=[namespace])
with open(args.output_html, 'w') as out_html:
    out_html.write(str(html))
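
# Example invocation (file names are illustrative; in Galaxy the wrapper
# normally supplies these paths):
#   python blast_report.py -i 90 -f 'phage,synthetic' \
#       -b plasmids plasmid_accessions.txt -r \
#       blast_hits.tab report.tmpl report.html filtered_hits.tab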