Mercurial > repos > iuc > virannot_blast2tsv
comparison blast2tsv.py @ 0:e889010415a1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
author | iuc |
---|---|
date | Mon, 04 Mar 2024 19:55:52 +0000 |
parents | |
children | 77c3ef9b0ed7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e889010415a1 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 # Name: blast2tsv | |
5 # Author(s): Sebastien Theil, Marie Lefebvre - INRAE | |
6 # Aims: Convert blast xml output to tsv and add taxonomy | |
7 | |
8 | |
9 import argparse | |
10 import csv | |
11 import logging as log | |
12 import os | |
13 | |
14 from Bio import Entrez | |
15 from Bio import SeqIO | |
16 from Bio.Blast import NCBIXML | |
17 from ete3 import NCBITaxa | |
18 | |
19 ncbi = NCBITaxa() | |
20 | |
21 | |
def main():
    """Entry point: parse CLI options, configure logging, then convert the
    BLAST XML file into a TSV (plus abundance files, written by _write_tsv)."""
    options = _set_options()
    _set_log_level(options.verbosity)
    hits = _read_xml(options)
    _write_tsv(options, hits)
28 | |
29 def _guess_database(accession): | |
30 """Guess the correct database for querying based off the format of the accession""" | |
31 database_mappings_refseq = {'AC_': 'nuccore', 'NC_': 'nuccore', 'NG_': 'nuccore', | |
32 'NT_': 'nuccore', 'NW_': 'nuccore', 'NZ_': 'nuccore', | |
33 'AP_': 'protein', 'NP_': 'protein', 'YP_': 'protein', | |
34 'XP_': 'protein', 'WP_': 'protein'} | |
35 return database_mappings_refseq[accession[0:3]] | |
36 | |
37 | |
def _read_xml(options):
    """
    Parse the BLAST XML results file and build one summary record per query.

    For each alignment, HSPs on the same strand as the first HSP are
    accumulated (identities, aligned length, e-value, bit score); HSPs on the
    opposite strand are discarded.  The accumulated values are then annotated
    with taxonomy fetched via Entrez/ete3.  Queries passing the e-value /
    overlap / score thresholds map to a full hit dict; failing queries map to
    a one-element list ``[query_length]`` (this length difference is what
    _write_tsv uses to tell the two cases apart).

    NOTE(review): each alignment of the same query overwrites the previous
    entry in xml_results, so the LAST alignment wins — the docstring of the
    original said "keep only the first hit"; confirm intended behavior.

    :param options: parsed argparse namespace (xml_file, algo, thresholds)
    :return: dict mapping query_id -> hit dict or [query_length]
    """
    log.info("Read XML file.")
    xml_results = {}
    # 'with' guarantees the XML handle is closed (the original leaked it).
    # NCBIXML.parse is lazy, so iteration must happen inside the context.
    with open(options.xml_file, 'r') as results:
        records = NCBIXML.parse(results)
        for blast_record in records:
            for aln in blast_record.alignments:
                hit_count = 1
                for hit in aln.hsps:
                    hsp = {}
                    if hit_count == 1:
                        first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0  # strand
                        cumul_hit_identity = hit.identities if hit.identities else 0
                        cumul_hit_score = hit.bits  # hit score
                        cumul_hit_evalue = hit.expect  # evalue
                        cumul_hit_length = hit.align_length if hit.align_length is not None else 0
                        hit_count = hit_count + 1
                    else:
                        # all HSPs in different strand than 1st HSPs will be discarded.
                        if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):
                            cumul_hit_identity = cumul_hit_identity + hit.identities
                            cumul_hit_length = cumul_hit_length + hit.align_length
                            cumul_hit_evalue = cumul_hit_evalue + hit.expect
                            cumul_hit_score = cumul_hit_score + hit.bits
                            hit_count = hit_count + 1
                # hit_count was pre-incremented, so subtract 1 to get the
                # number of HSPs actually accumulated.
                if hit_count == 1:
                    final_hit_count = hit_count
                elif hit_count > 1:
                    final_hit_count = hit_count - 1
                hsp["evalue"] = cumul_hit_evalue / final_hit_count  # The smaller the E-value, the better the match
                hsp["query_id"] = blast_record.query_id
                hsp["query_length"] = blast_record.query_length  # length of the query
                hsp["accession"] = aln.accession.replace("ref|", "")
                hsp["description"] = aln.hit_def
                hsp["hit_length"] = aln.length  # length of the hit
                hsp["hsp_length"] = hit.align_length  # length of the last hsp alignment
                hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0]
                if cumul_hit_length == 0:
                    hsp["percentIdentity"] = round(cumul_hit_identity, 1)  # identity percentage
                else:
                    hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1)  # identity percentage
                hsp["score"] = cumul_hit_score  # The higher the bit-score, the better the sequence similarity
                hsp["num_hsps"] = final_hit_count
                hsp["hit_cumul_length"] = cumul_hit_length
                hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1]
                db = _guess_database(hsp["accession"])
                try:
                    # Entrez esummary gives us the TaxId; ete3 resolves the lineage names.
                    handle = Entrez.esummary(db=db, id=hsp["accession"])
                    taxid = str(int(Entrez.read(handle)[0]['TaxId']))
                    handle.close()
                    log.info("Taxid found for " + hsp["accession"])
                    lineage = ncbi.get_lineage(taxid)
                    names = ncbi.get_taxid_translator(lineage)
                    ordered = [names[tid] for tid in lineage]
                    taxonomy = ordered[1:]  # drop the 'root' node
                    hsp["tax_id"] = taxid
                    hsp["taxonomy"] = ';'.join(taxonomy)
                    hsp["organism"] = taxonomy[-1]
                except RuntimeError:
                    hsp["tax_id"] = ""
                    hsp["taxonomy"] = ""
                    hsp["organism"] = ""
                    log.warning("RuntimeError - Taxid not found for " + hsp["accession"])
                if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \
                        hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score:
                    xml_results[hsp["query_id"]] = hsp
                else:
                    xml_results[hsp["query_id"]] = [hsp["query_length"]]

    return xml_results
112 | |
113 | |
114 def _get_overlap_value(algo, hsp, type, qlength): | |
115 """ | |
116 Set hsp or hit overlap values for hit and query | |
117 Return array [query_overlap, hit_overlap] | |
118 """ | |
119 if type == 'hsp': | |
120 q_align_len = qlength | |
121 h_align_len = hsp["hsp_length"] | |
122 else: | |
123 q_align_len = qlength | |
124 h_align_len = hsp["hit_cumul_length"] | |
125 | |
126 if algo == 'BLASTX': | |
127 if q_align_len: | |
128 query_overlap = (q_align_len * 3 / q_align_len) * 100 | |
129 if hsp["hit_length"]: | |
130 hit_overlap = (h_align_len / hsp["hit_length"]) * 100 | |
131 elif algo == 'TBLASTN': | |
132 if q_align_len: | |
133 query_overlap = (q_align_len / q_align_len) * 100 | |
134 if hsp["hit_length"]: | |
135 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 | |
136 elif algo == 'TBLASTX': | |
137 if q_align_len: | |
138 query_overlap = (q_align_len * 3 / hsp["hsp_length"]) * 100 | |
139 if hsp["hit_length"]: | |
140 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 | |
141 else: | |
142 if q_align_len: | |
143 query_overlap = (q_align_len / q_align_len) * 100 | |
144 if hsp["hit_length"]: | |
145 hit_overlap = (h_align_len / hsp["hit_length"]) * 100 | |
146 if query_overlap is None: | |
147 query_overlap = 0 | |
148 if query_overlap > 100: | |
149 query_overlap = 100 | |
150 if 'hit_overlap' not in locals(): | |
151 hit_overlap = 0 | |
152 if hit_overlap > 100: | |
153 hit_overlap = 100 | |
154 | |
155 return [round(query_overlap, 0), round(hit_overlap, 0)] | |
156 | |
157 | |
def _write_tsv(options, hits):
    """
    Write the hits to <output>/blast2tsv_output.tab, then derive abundance
    files from it.

    Full hit dicts (len > 1) get a complete annotated row; rejected hits
    (stored as [query_length] by _read_xml) get a truncated row with only the
    query length.

    Fixed: the fasta handle and the output handle were never closed on error;
    all file handles now use context managers (behavior otherwise unchanged).

    :param options: parsed argparse namespace
    :param hits: dict produced by _read_xml
    """
    # get a list of contig without corresponding number of mapped reads
    if options.rn_file is not None:
        with open(options.rn_file) as rn:
            rows = (line.split('\t') for line in rn)
            rn_list = {row[0]: row[1:] for row in rows}
    with open(options.fasta_file) as fasta_handle:
        fasta = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))
    headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n"
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    tsv_file = options.output + "/blast2tsv_output.tab"
    log.info("Write output file: " + tsv_file)
    with open(tsv_file, "w+") as f:
        f.write(headers)
        for h in hits:
            if options.rn_file is not None:
                # NOTE(review): raises KeyError if a hit contig is absent from
                # the read-number file — presumably the file is exhaustive;
                # confirm against the producing tool.
                read_nb = ''.join(rn_list[h]).replace("\n", "")
            else:
                read_nb = ''
            if len(hits[h]) > 1:
                # fully annotated hit
                f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t")
                f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t")
                f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t")
                f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t")
                f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t")
                f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t")
                if h in fasta:
                    f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq))
                else:
                    f.write(hits[h]["taxonomy"] + "\t\"\"")
                f.write("\n")
            else:
                # rejected hit: only [query_length] was stored
                f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t")
                f.write("\n")
    _create_abundance(options, tsv_file)
197 | |
198 | |
199 def _create_abundance(options, tsv_file): | |
200 """ | |
201 extract values from tsv files | |
202 and create abundance files | |
203 """ | |
204 log.info("Calculating abundance.") | |
205 file_path = tsv_file | |
206 abundance = dict() | |
207 with open(tsv_file, 'r') as current_file: | |
208 log.debug("Reading " + file_path) | |
209 csv_reader = csv.reader(current_file, delimiter='\t') | |
210 line_count = 0 | |
211 for row in csv_reader: | |
212 if line_count == 0: | |
213 # headers | |
214 line_count += 1 | |
215 else: | |
216 # no annotation | |
217 if len(row) == 16: | |
218 if row[14] != "": | |
219 nb_reads = row[2] | |
220 if nb_reads == "": | |
221 current_reads_nb = 0 | |
222 log.debug("No reads number for " + row[1]) | |
223 else: | |
224 current_reads_nb = int(nb_reads) | |
225 contig_id = row[14] | |
226 if contig_id in abundance: | |
227 # add reads | |
228 abundance[contig_id]["reads_nb"] = abundance[row[14]]["reads_nb"] + current_reads_nb | |
229 abundance[contig_id]["contigs_nb"] = abundance[row[14]]["contigs_nb"] + 1 | |
230 else: | |
231 # init reads for this taxo | |
232 abundance[contig_id] = {} | |
233 abundance[contig_id]["reads_nb"] = current_reads_nb | |
234 abundance[contig_id]["contigs_nb"] = 1 | |
235 else: | |
236 log.debug("No annotations for contig " + row[1]) | |
237 else: | |
238 log.debug("No annotations for contig " + row[1]) | |
239 log.debug(abundance) | |
240 reads_file = open(options.output + "/blast2tsv_reads.txt", "w+") | |
241 for taxo in abundance: | |
242 reads_file.write(str(abundance[taxo]["reads_nb"])) | |
243 reads_file.write("\t") | |
244 reads_file.write("\t".join(taxo.split(";"))) | |
245 reads_file.write("\n") | |
246 reads_file.close() | |
247 log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt") | |
248 contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+") | |
249 for taxo in abundance: | |
250 contigs_file.write(str(abundance[taxo]["contigs_nb"])) | |
251 contigs_file.write("\t") | |
252 contigs_file.write("\t".join(taxo.split(";"))) | |
253 contigs_file.write("\n") | |
254 contigs_file.close() | |
255 log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt") | |
256 | |
257 | |
258 def _set_options(): | |
259 parser = argparse.ArgumentParser() | |
260 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') | |
261 parser.add_argument('-rn', '--read-count', help='Tab-delimited file associating seqID with read number.', action='store', dest='rn_file') | |
262 parser.add_argument('-c', '--contigs', help='FASTA file with contigs sequence.', action='store', required=True, dest='fasta_file') | |
263 parser.add_argument('-me', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') | |
264 parser.add_argument('-qov', '--min_query_overlap', help='Minimum query overlap', action='store', type=int, default=5, dest='min_qov') | |
265 parser.add_argument('-mhov', '--min_hit_overlap', help='Minimum hit overlap', action='store', type=int, default=5, dest='min_hov') | |
266 parser.add_argument('-s', '--min_score', help='Minimum score', action='store', type=int, default=30, dest='min_score') | |
267 parser.add_argument('-a', '--algo', help='Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).', action='store', type=str, default='BLASTX', dest='algo') | |
268 parser.add_argument('-o', '--out', help='The output file (.csv).', action='store', type=str, default='./blast2tsv', dest='output') | |
269 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | |
270 args = parser.parse_args() | |
271 return args | |
272 | |
273 | |
274 def _set_log_level(verbosity): | |
275 if verbosity == 1: | |
276 log_format = '%(asctime)s %(levelname)-8s %(message)s' | |
277 log.basicConfig(level=log.INFO, format=log_format) | |
278 elif verbosity == 3: | |
279 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | |
280 log.basicConfig(level=log.DEBUG, format=log_format) | |
281 | |
282 | |
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()