Mercurial > repos > iuc > virannot_blast2tsv
diff blast2tsv.py @ 0:e889010415a1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd
author | iuc |
---|---|
date | Mon, 04 Mar 2024 19:55:52 +0000 |
parents | |
children | 77c3ef9b0ed7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast2tsv.py Mon Mar 04 19:55:52 2024 +0000 @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 + + +# Name: blast2tsv +# Author(s): Sebastien Theil, Marie Lefebvre - INRAE +# Aims: Convert blast xml output to tsv and add taxonomy + + +import argparse +import csv +import logging as log +import os + +from Bio import Entrez +from Bio import SeqIO +from Bio.Blast import NCBIXML +from ete3 import NCBITaxa + +ncbi = NCBITaxa() + + +def main(): + options = _set_options() + _set_log_level(options.verbosity) + hits = _read_xml(options) + _write_tsv(options, hits) + + +def _guess_database(accession): + """Guess the correct database for querying based off the format of the accession""" + database_mappings_refseq = {'AC_': 'nuccore', 'NC_': 'nuccore', 'NG_': 'nuccore', + 'NT_': 'nuccore', 'NW_': 'nuccore', 'NZ_': 'nuccore', + 'AP_': 'protein', 'NP_': 'protein', 'YP_': 'protein', + 'XP_': 'protein', 'WP_': 'protein'} + return database_mappings_refseq[accession[0:3]] + + +def _read_xml(options): + """ + Parse XML blast results file + Keep only the first hit + """ + log.info("Read XML file.") + results = open(options.xml_file, 'r') + records = NCBIXML.parse(results) + xml_results = {} + for blast_record in records: + for aln in blast_record.alignments: + hit_count = 1 + for hit in aln.hsps: + hsp = {} + if hit_count == 1: + first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0 # strand + cumul_hit_identity = hit.identities if hit.identities else 0 + cumul_hit_score = hit.bits # hit score + cumul_hit_evalue = hit.expect # evalue + cumul_hit_length = hit.align_length if hit.align_length is not None else 0 + hit_count = hit_count + 1 + else: + # all HSPs in different strand than 1st HSPs will be discarded. + if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0): + cumul_hit_identity = cumul_hit_identity + hit.identities + cumul_hit_length = cumul_hit_length + hit.align_length + cumul_hit_evalue = cumul_hit_evalue + hit.expect + cumul_hit_score = cumul_hit_score + hit.bits + hit_count = hit_count + 1 + if hit_count == 1: + final_hit_count = hit_count + elif hit_count > 1: + final_hit_count = hit_count - 1 + hsp["evalue"] = cumul_hit_evalue / final_hit_count # The smaller the E-value, the better the match + hsp["query_id"] = blast_record.query_id + hsp["query_length"] = blast_record.query_length # length of the query + hsp["accession"] = aln.accession.replace("ref|", "") + hsp["description"] = aln.hit_def + hsp["hit_length"] = aln.length # length of the hit + hsp["hsp_length"] = hit.align_length # length of the hsp alignment + hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0] + if cumul_hit_length == 0: + hsp["percentIdentity"] = round(cumul_hit_identity, 1) # identity percentage + else: + hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1) # identity percentage + hsp["score"] = cumul_hit_score # The higher the bit-score, the better the sequence similarity + hsp["num_hsps"] = final_hit_count + hsp["hit_cumul_length"] = cumul_hit_length + hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1] + db = _guess_database(hsp["accession"]) + try: + handle = Entrez.esummary(db=db, id=hsp["accession"]) + taxid = str(int(Entrez.read(handle)[0]['TaxId'])) + handle.close() + log.info("Taxid found for " + hsp["accession"]) + lineage = ncbi.get_lineage(taxid) + names = ncbi.get_taxid_translator(lineage) + ordered = [names[tid] for tid in lineage] + taxonomy = ordered[1:] + hsp["tax_id"] = taxid + hsp["taxonomy"] = ';'.join(taxonomy) + hsp["organism"] = taxonomy[-1] + except RuntimeError: + hsp["tax_id"] = "" + hsp["taxonomy"] = "" + hsp["organism"] = "" + log.warning("RuntimeError - Taxid not found for " + hsp["accession"]) + if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \ + hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score: + xml_results[hsp["query_id"]] = hsp + else: + xml_results[hsp["query_id"]] = [hsp["query_length"]] + + return xml_results + + +def _get_overlap_value(algo, hsp, type, qlength): + """ + Set hsp or hit overlap values for hit and query + Return array [query_overlap, hit_overlap] + """ + if type == 'hsp': + q_align_len = qlength + h_align_len = hsp["hsp_length"] + else: + q_align_len = qlength + h_align_len = hsp["hit_cumul_length"] + + if algo == 'BLASTX': + if q_align_len: + query_overlap = (q_align_len * 3 / q_align_len) * 100 + if hsp["hit_length"]: + hit_overlap = (h_align_len / hsp["hit_length"]) * 100 + elif algo == 'TBLASTN': + if q_align_len: + query_overlap = (q_align_len / q_align_len) * 100 + if hsp["hit_length"]: + hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 + elif algo == 'TBLASTX': + if q_align_len: + query_overlap = (q_align_len * 3 / hsp["hsp_length"]) * 100 + if hsp["hit_length"]: + hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 + else: + if q_align_len: + query_overlap = (q_align_len / q_align_len) * 100 + if hsp["hit_length"]: + hit_overlap = (h_align_len / hsp["hit_length"]) * 100 + if query_overlap is None: + query_overlap = 0 + if query_overlap > 100: + query_overlap = 100 + if 'hit_overlap' not in locals(): + hit_overlap = 0 + if hit_overlap > 100: + hit_overlap = 100 + + return [round(query_overlap, 0), round(hit_overlap, 0)] + + +def _write_tsv(options, hits): + """ + Write output + """ + # get a list of contig without corresponding number of mapped reads + if options.rn_file is not None: + with open(options.rn_file) as rn: + rows = (line.split('\t') for line in rn) + rn_list = {row[0]: row[1:] for row in rows} + fasta = SeqIO.to_dict(SeqIO.parse(open(options.fasta_file), 'fasta')) + headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n" + if not os.path.exists(options.output): + os.mkdir(options.output) + tsv_file = options.output + "/blast2tsv_output.tab" + log.info("Write output file: " + tsv_file) + f = open(tsv_file, "w+") + f.write(headers) + for h in hits: + if options.rn_file is not None: + read_nb = ''.join(rn_list[h]).replace("\n", "") + else: + read_nb = '' + if len(hits[h]) > 1: + f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t") + f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t") + f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t") + f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t") + f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t") + f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t") + if h in fasta: + f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq)) + else: + f.write(hits[h]["taxonomy"] + "\t\"\"") + f.write("\n") + else: + f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t") + f.write("\n") + f.close() + _create_abundance(options, tsv_file) + + +def _create_abundance(options, tsv_file): + """ + extract values from tsv files + and create abundance files + """ + log.info("Calculating abundance.") + file_path = tsv_file + abundance = dict() + with open(tsv_file, 'r') as current_file: + log.debug("Reading " + file_path) + csv_reader = csv.reader(current_file, delimiter='\t') + line_count = 0 + for row in csv_reader: + if line_count == 0: + # headers + line_count += 1 + else: + # no annotation + if len(row) == 16: + if row[14] != "": + nb_reads = row[2] + if nb_reads == "": + current_reads_nb = 0 + log.debug("No reads number for " + row[1]) + else: + current_reads_nb = int(nb_reads) + contig_id = row[14] + if contig_id in abundance: + # add reads + abundance[contig_id]["reads_nb"] = abundance[row[14]]["reads_nb"] + current_reads_nb + abundance[contig_id]["contigs_nb"] = abundance[row[14]]["contigs_nb"] + 1 + else: + # init reads for this taxo + abundance[contig_id] = {} + abundance[contig_id]["reads_nb"] = current_reads_nb + abundance[contig_id]["contigs_nb"] = 1 + else: + log.debug("No annotations for contig " + row[1]) + else: + log.debug("No annotations for contig " + row[1]) + log.debug(abundance) + reads_file = open(options.output + "/blast2tsv_reads.txt", "w+") + for taxo in abundance: + reads_file.write(str(abundance[taxo]["reads_nb"])) + reads_file.write("\t") + reads_file.write("\t".join(taxo.split(";"))) + reads_file.write("\n") + reads_file.close() + log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt") + contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+") + for taxo in abundance: + contigs_file.write(str(abundance[taxo]["contigs_nb"])) + contigs_file.write("\t") + contigs_file.write("\t".join(taxo.split(";"))) + contigs_file.write("\n") + contigs_file.close() + log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt") + + +def _set_options(): + parser = argparse.ArgumentParser() + parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') + parser.add_argument('-rn', '--read-count', help='Tab-delimited file associating seqID with read number.', action='store', dest='rn_file') + parser.add_argument('-c', '--contigs', help='FASTA file with contigs sequence.', action='store', required=True, dest='fasta_file') + parser.add_argument('-me', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') + parser.add_argument('-qov', '--min_query_overlap', help='Minimum query overlap', action='store', type=int, default=5, dest='min_qov') + parser.add_argument('-mhov', '--min_hit_overlap', help='Minimum hit overlap', action='store', type=int, default=5, dest='min_hov') + parser.add_argument('-s', '--min_score', help='Minimum score', action='store', type=int, default=30, dest='min_score') + parser.add_argument('-a', '--algo', help='Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).', action='store', type=str, default='BLASTX', dest='algo') + parser.add_argument('-o', '--out', help='The output file (.csv).', action='store', type=str, default='./blast2tsv', dest='output') + parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) + args = parser.parse_args() + return args + + +def _set_log_level(verbosity): + if verbosity == 1: + log_format = '%(asctime)s %(levelname)-8s %(message)s' + log.basicConfig(level=log.INFO, format=log_format) + elif verbosity == 3: + log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' + log.basicConfig(level=log.DEBUG, format=log_format) + + +if __name__ == "__main__": + main()