comparison blast2tsv.py @ 0:c9dac9b2e01c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
author iuc
date Mon, 04 Mar 2024 19:56:40 +0000
parents
children 735a21808348
comparison
equal deleted inserted replaced
-1:000000000000 0:c9dac9b2e01c
1 #!/usr/bin/env python3
2
3
4 # Name: blast2tsv
5 # Author(s): Sebastien Theil, Marie Lefebvre - INRAE
6 # Aims: Convert blast xml output to tsv and add taxonomy
7
8
9 import argparse
10 import csv
11 import logging as log
12 import os
13
14 from Bio import Entrez
15 from Bio import SeqIO
16 from Bio.Blast import NCBIXML
17 from ete3 import NCBITaxa
18
19 ncbi = NCBITaxa()
20
21
22 def main():
23 options = _set_options()
24 _set_log_level(options.verbosity)
25 hits = _read_xml(options)
26 _write_tsv(options, hits)
27
28
29 def _guess_database(accession):
30 """Guess the correct database for querying based off the format of the accession"""
31 database_mappings_refseq = {'AC_': 'nuccore', 'NC_': 'nuccore', 'NG_': 'nuccore',
32 'NT_': 'nuccore', 'NW_': 'nuccore', 'NZ_': 'nuccore',
33 'AP_': 'protein', 'NP_': 'protein', 'YP_': 'protein',
34 'XP_': 'protein', 'WP_': 'protein'}
35 return database_mappings_refseq[accession[0:3]]
36
37
38 def _read_xml(options):
39 """
40 Parse XML blast results file
41 Keep only the first hit
42 """
43 log.info("Read XML file.")
44 results = open(options.xml_file, 'r')
45 records = NCBIXML.parse(results)
46 xml_results = {}
47 for blast_record in records:
48 for aln in blast_record.alignments:
49 hit_count = 1
50 for hit in aln.hsps:
51 hsp = {}
52 if hit_count == 1:
53 first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0 # strand
54 cumul_hit_identity = hit.identities if hit.identities else 0
55 cumul_hit_score = hit.bits # hit score
56 cumul_hit_evalue = hit.expect # evalue
57 cumul_hit_length = hit.align_length if hit.align_length is not None else 0
58 hit_count = hit_count + 1
59 else:
60 # all HSPs in different strand than 1st HSPs will be discarded.
61 if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):
62 cumul_hit_identity = cumul_hit_identity + hit.identities
63 cumul_hit_length = cumul_hit_length + hit.align_length
64 cumul_hit_evalue = cumul_hit_evalue + hit.expect
65 cumul_hit_score = cumul_hit_score + hit.bits
66 hit_count = hit_count + 1
67 if hit_count == 1:
68 final_hit_count = hit_count
69 elif hit_count > 1:
70 final_hit_count = hit_count - 1
71 hsp["evalue"] = cumul_hit_evalue / final_hit_count # The smaller the E-value, the better the match
72 hsp["query_id"] = blast_record.query_id
73 hsp["query_length"] = blast_record.query_length # length of the query
74 hsp["accession"] = aln.accession.replace("ref|", "")
75 hsp["description"] = aln.hit_def
76 hsp["hit_length"] = aln.length # length of the hit
77 hsp["hsp_length"] = hit.align_length # length of the hsp alignment
78 hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0]
79 if cumul_hit_length == 0:
80 hsp["percentIdentity"] = round(cumul_hit_identity, 1) # identity percentage
81 else:
82 hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1) # identity percentage
83 hsp["score"] = cumul_hit_score # The higher the bit-score, the better the sequence similarity
84 hsp["num_hsps"] = final_hit_count
85 hsp["hit_cumul_length"] = cumul_hit_length
86 hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1]
87 db = _guess_database(hsp["accession"])
88 try:
89 handle = Entrez.esummary(db=db, id=hsp["accession"])
90 taxid = str(int(Entrez.read(handle)[0]['TaxId']))
91 handle.close()
92 log.info("Taxid found for " + hsp["accession"])
93 lineage = ncbi.get_lineage(taxid)
94 names = ncbi.get_taxid_translator(lineage)
95 ordered = [names[tid] for tid in lineage]
96 taxonomy = ordered[1:]
97 hsp["tax_id"] = taxid
98 hsp["taxonomy"] = ';'.join(taxonomy)
99 hsp["organism"] = taxonomy[-1]
100 except RuntimeError:
101 hsp["tax_id"] = ""
102 hsp["taxonomy"] = ""
103 hsp["organism"] = ""
104 log.warning("RuntimeError - Taxid not found for " + hsp["accession"])
105 if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \
106 hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score:
107 xml_results[hsp["query_id"]] = hsp
108 else:
109 xml_results[hsp["query_id"]] = [hsp["query_length"]]
110
111 return xml_results
112
113
114 def _get_overlap_value(algo, hsp, type, qlength):
115 """
116 Set hsp or hit overlap values for hit and query
117 Return array [query_overlap, hit_overlap]
118 """
119 if type == 'hsp':
120 q_align_len = qlength
121 h_align_len = hsp["hsp_length"]
122 else:
123 q_align_len = qlength
124 h_align_len = hsp["hit_cumul_length"]
125
126 if algo == 'BLASTX':
127 if q_align_len:
128 query_overlap = (q_align_len * 3 / q_align_len) * 100
129 if hsp["hit_length"]:
130 hit_overlap = (h_align_len / hsp["hit_length"]) * 100
131 elif algo == 'TBLASTN':
132 if q_align_len:
133 query_overlap = (q_align_len / q_align_len) * 100
134 if hsp["hit_length"]:
135 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100
136 elif algo == 'TBLASTX':
137 if q_align_len:
138 query_overlap = (q_align_len * 3 / hsp["hsp_length"]) * 100
139 if hsp["hit_length"]:
140 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100
141 else:
142 if q_align_len:
143 query_overlap = (q_align_len / q_align_len) * 100
144 if hsp["hit_length"]:
145 hit_overlap = (h_align_len / hsp["hit_length"]) * 100
146 if query_overlap is None:
147 query_overlap = 0
148 if query_overlap > 100:
149 query_overlap = 100
150 if 'hit_overlap' not in locals():
151 hit_overlap = 0
152 if hit_overlap > 100:
153 hit_overlap = 100
154
155 return [round(query_overlap, 0), round(hit_overlap, 0)]
156
157
158 def _write_tsv(options, hits):
159 """
160 Write output
161 """
162 # get a list of contig without corresponding number of mapped reads
163 if options.rn_file is not None:
164 with open(options.rn_file) as rn:
165 rows = (line.split('\t') for line in rn)
166 rn_list = {row[0]: row[1:] for row in rows}
167 fasta = SeqIO.to_dict(SeqIO.parse(open(options.fasta_file), 'fasta'))
168 headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n"
169 if not os.path.exists(options.output):
170 os.mkdir(options.output)
171 tsv_file = options.output + "/blast2tsv_output.tab"
172 log.info("Write output file: " + tsv_file)
173 f = open(tsv_file, "w+")
174 f.write(headers)
175 for h in hits:
176 if options.rn_file is not None:
177 read_nb = ''.join(rn_list[h]).replace("\n", "")
178 else:
179 read_nb = ''
180 if len(hits[h]) > 1:
181 f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t")
182 f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t")
183 f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t")
184 f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t")
185 f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t")
186 f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t")
187 if h in fasta:
188 f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq))
189 else:
190 f.write(hits[h]["taxonomy"] + "\t\"\"")
191 f.write("\n")
192 else:
193 f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t")
194 f.write("\n")
195 f.close()
196 _create_abundance(options, tsv_file)
197
198
199 def _create_abundance(options, tsv_file):
200 """
201 extract values from tsv files
202 and create abundance files
203 """
204 log.info("Calculating abundance.")
205 file_path = tsv_file
206 abundance = dict()
207 with open(tsv_file, 'r') as current_file:
208 log.debug("Reading " + file_path)
209 csv_reader = csv.reader(current_file, delimiter='\t')
210 line_count = 0
211 for row in csv_reader:
212 if line_count == 0:
213 # headers
214 line_count += 1
215 else:
216 # no annotation
217 if len(row) == 16:
218 if row[14] != "":
219 nb_reads = row[2]
220 if nb_reads == "":
221 current_reads_nb = 0
222 log.debug("No reads number for " + row[1])
223 else:
224 current_reads_nb = int(nb_reads)
225 contig_id = row[14]
226 if contig_id in abundance:
227 # add reads
228 abundance[contig_id]["reads_nb"] = abundance[row[14]]["reads_nb"] + current_reads_nb
229 abundance[contig_id]["contigs_nb"] = abundance[row[14]]["contigs_nb"] + 1
230 else:
231 # init reads for this taxo
232 abundance[contig_id] = {}
233 abundance[contig_id]["reads_nb"] = current_reads_nb
234 abundance[contig_id]["contigs_nb"] = 1
235 else:
236 log.debug("No annotations for contig " + row[1])
237 else:
238 log.debug("No annotations for contig " + row[1])
239 log.debug(abundance)
240 reads_file = open(options.output + "/blast2tsv_reads.txt", "w+")
241 for taxo in abundance:
242 reads_file.write(str(abundance[taxo]["reads_nb"]))
243 reads_file.write("\t")
244 reads_file.write("\t".join(taxo.split(";")))
245 reads_file.write("\n")
246 reads_file.close()
247 log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt")
248 contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+")
249 for taxo in abundance:
250 contigs_file.write(str(abundance[taxo]["contigs_nb"]))
251 contigs_file.write("\t")
252 contigs_file.write("\t".join(taxo.split(";")))
253 contigs_file.write("\n")
254 contigs_file.close()
255 log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt")
256
257
258 def _set_options():
259 parser = argparse.ArgumentParser()
260 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file')
261 parser.add_argument('-rn', '--read-count', help='Tab-delimited file associating seqID with read number.', action='store', dest='rn_file')
262 parser.add_argument('-c', '--contigs', help='FASTA file with contigs sequence.', action='store', required=True, dest='fasta_file')
263 parser.add_argument('-me', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue')
264 parser.add_argument('-qov', '--min_query_overlap', help='Minimum query overlap', action='store', type=int, default=5, dest='min_qov')
265 parser.add_argument('-mhov', '--min_hit_overlap', help='Minimum hit overlap', action='store', type=int, default=5, dest='min_hov')
266 parser.add_argument('-s', '--min_score', help='Minimum score', action='store', type=int, default=30, dest='min_score')
267 parser.add_argument('-a', '--algo', help='Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).', action='store', type=str, default='BLASTX', dest='algo')
268 parser.add_argument('-o', '--out', help='The output file (.csv).', action='store', type=str, default='./blast2tsv', dest='output')
269 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
270 args = parser.parse_args()
271 return args
272
273
274 def _set_log_level(verbosity):
275 if verbosity == 1:
276 log_format = '%(asctime)s %(levelname)-8s %(message)s'
277 log.basicConfig(level=log.INFO, format=log_format)
278 elif verbosity == 3:
279 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
280 log.basicConfig(level=log.DEBUG, format=log_format)
281
282
283 if __name__ == "__main__":
284 main()