Mercurial > repos > estrain > btyper
changeset 7:d408ad12401a draft
Uploading GitHub version of btyper
author | estrain |
---|---|
date | Fri, 08 Dec 2017 11:39:12 -0500 |
parents | c43392559c47 |
children | f969be906cb5 |
files | btyper-2.0.3/btyper |
diffstat | 1 files changed, 905 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/btyper-2.0.3/btyper Fri Dec 08 11:39:12 2017 -0500 @@ -0,0 +1,905 @@ +#!/usr/bin/env python + +# BTyper version 2.0.3 +# Created by Laura Carroll +# lmc297@cornell.edu + +# import required packages +import argparse, sys, os, re, glob, collections + + +# parse arguments +parser = argparse.ArgumentParser(usage='btyper -i </path/to/input/file.extension> -o </path/to/desired/output_directory/> -t [input data format (seq, pe, se, sra, or sra-get)] [-other options]') +parser.add_argument('-i','--input', nargs='+', help='Enter the path to the Bacillus cereus group sequence data you would like to input, or enter an SRA accession number',required=True) +parser.add_argument('-o', '--output', nargs='+', help='Specify a path to your desired output directory',required=True) +parser.add_argument('-t', '--type', nargs='+', help='Specify your type of data: seq for genomes or contigs in fasta format, pe for paired-end Illumina reads, se for single-end Illumina reads, sra for a sra file, or sra-get with an SRA accession number',required=True) +parser.add_argument('--draft_genome', action='store_true', default=False, help='Optional argument for use with contigs in fasta format; concatenates draft genome contigs into pseudochromosome') +parser.add_argument('-v', '--virulence', nargs='?', default=True, help='Optional argument, True or False; perform virulence gene typing; default=True') +parser.add_argument('-m', '--mlst', nargs='?', default=True, help='Optional argument, True or False; perform MLST using Bacillus cereus MLST database; default=True') +parser.add_argument('-r', '--rpoB', nargs='?', default=True, help='Optional argument, True or False; perform rpoB typing; default=True') +parser.add_argument('-p', '--panC', nargs='?', default=True, help='Optional argument, True or False; perform panC typing; default=True') +parser.add_argument('-s', nargs='?', default=False, help='Optional argument, True or False; BLAST 16s DNA sequence; not recommended for inferring species or pathogenicity; default=False') +parser.add_argument('--spades_m', nargs='?', default=250, help='Optional argument for use with Illumina reads; integer; set SPAdes memory limit -m/--memory option in Gb; default is 250 Gb, the default for SPAdes') +parser.add_argument('--spades_t', nargs='?', default=16, help='Optional argument for use with Illumina reads; integer; set number of threads to use for SPAdes -t/--threads option; default is 16, the default for SPAdes') +parser.add_argument('--spades_k', nargs='?', default=77, help='Optional argument for use with Illumina reads; comma-separated list of integers; set k-mer sizes to use with SPAdes -k option; default is 77') +parser.add_argument('-v_db', '--virulence_database', nargs='?', default="aa", help='Optional argument for use with -v/--virulence option; specify virulence database to be used: nuc for nucleotide database or aa for amino acid database; default=aa') +parser.add_argument('-nuc_p', '--nucleotide_p', nargs='?',default=75, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent nucleotide identity for virulence gene detection; default=75') +parser.add_argument('-nuc_q','--nucleotide_q', nargs='?',default=90, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=90') +parser.add_argument('-aa_p','--amino_acid_p',nargs='?',default=50, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent amino acid identity for virulence gene detection; default=50') +parser.add_argument('-aa_q','--amino_acid_q', nargs='?',default=70, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=70') +parser.add_argument('-e', '--evalue', nargs='?', default=1e-5, help='Optional argument; float >= to 0; maximum blast e-value for a hit to be saved; default=1e-5') +parser.add_argument('--version', action="version", version='%(prog)s 2.0.3', help="Print version") +parser.add_argument('-a', '--amr', nargs='?', default=True, help='Optional argument, True or False; perform antimicrobial resistance gene detection; defaul=True') +parser.add_argument('-amr_db', '--amr_database', nargs='?', default="argannot", help='Optional argument for use with -a/--amr option; specify antimicrobial resistance gene database to use: argannot for ARG-ANNOT database or megares for MEGARes database; default=argannot') +parser.add_argument('--amr_blast', nargs='?', default="blastn", help='Optional argument for use with -a/--amr option; specify blast algorithm to use: blastn to blast against nucleotide database, or tblastx to blast translated sequence against translated nucleotide database; default=blastn') +parser.add_argument('--amr_p',nargs='?',default=75, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent nucleotide identity for AMR gene detection; default=50') +parser.add_argument('--amr_q',nargs='?',default=50, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent coverage for AMR gene detection; default=50') + +args=parser.parse_args() + +# check for biopython installation +try: + from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline, NcbitblastxCommandline + from Bio import SeqIO + from Bio.Blast import NCBIXML +except ImportError: + print "Scanning for BioPython installation..." + try: + cellar=os.popen('brew ls btyper| grep "seq_virulence_db"').read() + btyper_path = cellar.split("seq_virulence_db")[0].strip() + pysearch=btyper_path.split("/bin/")[0]+"/libexec/lib/python2.7/site-packages/" + print pysearch + sys.path.append(pysearch.strip()) + from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline + from Bio import SeqIO + from Bio.Blast import NCBIXML + except (ImportError,OSError) as e: + print "Could not find BioPython installed. Exiting BTyper..." + sys.exit() + +# path to program btyper.py, as well as databases +# check if homebrew is installed +if os.path.isfile("/usr/local/Cellar/btyper*/btyper"): + cellar=os.popen('brew ls btyper|grep "seq_virulence_db"').read() + btyper_path=cellar.split("seq_virulence_db")[0].strip() +else: + btyper_path = os.path.realpath(__file__) + btyper_path = btyper_path.rpartition("/")[0].strip()+"/" + print btyper_path + +# split arguments provided by user +def arg_splitter(argin): + return str(argin).strip("[']") + +iarg=arg_splitter(args.input) +oarg=arg_splitter(args.output) +targ=arg_splitter(args.type) +varg=arg_splitter(args.virulence) +marg=arg_splitter(args.mlst) +rarg=arg_splitter(args.rpoB) +parg=arg_splitter(args.panC) +sarg=arg_splitter(args.s) +spadesm=arg_splitter(args.spades_m) +spadest=arg_splitter(args.spades_t) +spadesk=arg_splitter(args.spades_k) +dgarg=arg_splitter(args.draft_genome) +vdbarg=arg_splitter(args.virulence_database) +nucparg=arg_splitter(args.nucleotide_p) +nucqarg=arg_splitter(args.nucleotide_q) +aaparg=arg_splitter(args.amino_acid_p) +aaqarg=arg_splitter(args.amino_acid_q) +earg=arg_splitter(args.evalue) +amrarg=arg_splitter(args.amr) +amrparg=arg_splitter(args.amr_p) +amrqarg=arg_splitter(args.amr_q) +amrdb=arg_splitter(args.amr_database) +amralg=arg_splitter(args.amr_blast) + +# append to output directory argument, if necessary +if oarg[-1]!="/": + oarg=oarg+"/" + +# between_sections: prints a blank line between sections in final output file, if desried +def between_sections(finalfile_string): + finalfile=open(finalfile_string,"a") + print >> finalfile, "" + finalfile.close() + +# check if file exists and isn't empty +def file_check(input_file): + try: + if os.stat(input_file).st_size > 0: + print input_file+" exits. Continuing..." + return iarg + else: + print "Your input file {0} is empty! Please use a different input file.".format(input_file) + sys.exit() + except OSError: + print "Your input file {0} does not exist. Please make sure your path and file name are correct, or specify a different input file.".format(input_file) + sys.exit() + +# dbparse: creates dictionary mapping virulence gene/mlst/rpoB/panC/16s id names from a given database to their respective sequences +def dbparse(db_path): + argseq={} + infile=open(db_path,"r") + for record in SeqIO.parse(infile, "fasta"): + seqid=str(record.description).strip() + newid=re.sub('[^0-9a-zA-Z]+', '_', seqid) + seqseq=str(record.seq).strip() + seqlen=len(seqseq) + if newid not in argseq.keys(): + argseq[newid]=seqseq + infile.close() + return argseq + +# seqparse: creates dictionary that maps query sequence IDs to sequences +def seqparse(seq_path): + newseq={} + newinfile=open(seq_path,"r") + for record in SeqIO.parse(newinfile, "fasta"): + seqid=str(record.id).strip() + newid=re.sub('[^0-9a-zA-Z]+', '_', seqid) + seqseq=str(record.seq).strip() + reduced_seq=seqseq.replace("N","") + if newid not in newseq.keys(): + newseq[newid]=seqseq + newinfile.close() + return newseq + +# make_blast_xml: takes sequences, makes blast xml file for each +def make_blast_xml(newseq,argdict,query_path,task,shorttask,evalue_thresh,pident_thresh,qcov_thresh): + # for each sequence in the newseq dictionary + for key in newseq.keys(): + # count the sequences + counter=1 + # call query sequence queryseq + queryseq=newseq[key] + # get the length of the queryseq (genelen) + genelen=len(queryseq) + # create a temporary query file with the gene sequence and id + queryfile=open(oarg+key.strip()+"_querytemp.fasta","a") + print >> queryfile, ">"+key.strip() + print >> queryfile, queryseq + queryfile.close() + # check if final results directory exists + if os.path.isdir(oarg+"btyper_final_results"): + print "Final results directory exists." + # if not, make one + else: + os.system("mkdir "+oarg+"btyper_final_results") + # check if isolatefiles directory exists within final results directory + if not os.path.isdir(oarg+"btyper_final_results/isolatefiles"): + os.system("mkdir "+oarg+"btyper_final_results/isolatefiles") + rdir_root=oarg+"btyper_final_results/isolatefiles/"+key#.split(".fasta")[0] + #print rdir_root + # make a results directory for each sequence + if not os.path.isdir(rdir_root+"_results"): + os.system("mkdir "+rdir_root+"_results") + rdir=rdir_root+"_results/" + # create a database out of the temporary query file with the individual sequence + if os.path.isfile(oarg+key.strip()+"_querytemp.fasta.nsq"): + print "Database already exists." + else: + os.system("makeblastdb -in "+oarg+key.strip()+"_querytemp.fasta -dbtype nucl") + out=rdir_root+"_blast_results.xml" + # create final file named for each isolate if necessary + finalfile_string=key.strip()+"_final_results.txt" + if not os.path.isfile(oarg+"btyper_final_results/"+finalfile_string): + finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a") + print >> finalfile, "BTyper Results for "+key + print >> finalfile, "" + finalfile.close() + if not os.path.isdir(oarg+"btyper_final_results/genefiles"): + os.system("mkdir "+oarg+"btyper_final_results/genefiles") + # open the final file for the isolate + finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a") + # print task-specific header to final output file + if shorttask=="virulence": + # print the task + print >> finalfile, task.strip() + print >> finalfile, "" + table_header=["Hit #", "Virulence Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"] + print >>finalfile, "\t".join([t for t in table_header]) + elif shorttask=="amr": + # print the task + print >> finalfile, task.strip() + print >> finalfile, "" + table_header=["Hit #", "AMR Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"] + print >> finalfile, "\t".join([t for t in table_header]) + elif shorttask=="rpoB": + # print the task + print >> finalfile, task.strip() + print >> finalfile, "" + print >> finalfile, "Predicted rpoB Allelic Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage" + elif shorttask=="panC": + print >> finalfile, task.strip() + print >> finalfile, "" + table_header=["panC Clade Name","Closest Strain","Percent (%) Identity","Percent (%) Coverage"] + print >>finalfile, "\t".join([t for t in table_header]) + elif shorttask=="16s": + print >> finalfile, task.strip() + print >> finalfile, "" + print >> finalfile, "Predicted 16s Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage" + # close the final file for the isolate + finalfile.close() + # run blast with query=database of sequences, db = querytemp.fasta with single isolate sequence, out = outdir, outfmt = xml + if query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta": + cline=NcbitblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"') + elif amrarg=="True" and amralg=="tblastx" and shorttask=="amr": + cline=NcbitblastxCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"') + else: + cline=NcbiblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"') + stdout, stderr = cline() + #remove temporary temporary fasta files + os.system("rm "+oarg+key.strip()+"_querytemp.fasta*") + # parse blast xml files according to task + if shorttask=="virulence" or shorttask=="amr": + # parse_blast_xml_seq: parses blast xml file for virulence task (getting a list of all genes present in a sequence) + parse_blast_xml_seq(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + else: + # parse_panC: parses blast xml for non-virulence tasks (getting the "top hit", rather than a list of all genes) + parse_panC(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + +# dictionary_add: create a new entry for a dictionary, or append if it already exists as a key +def dictionary_add(dict_name,key,blast): + if key not in dict_name.keys(): + dict_name[key]=[] + dict_name[key].append(blast) + else: + dict_name[key].append(blast) + +# print_virulence_gene_report: prints isolatefiles and genefiles +def print_virulence_gene_report(filtres,maxgene, genequery,key,rdir,shorttask): + print >> filtres, "\t".join([str(m).strip() for m in maxgene]) + genequery=str(genequery).strip() + if shorttask=="virulence" or shorttask=="amr": + genequery=genequery.split("|")[0] + else: + genequery=re.sub('[^0-9a-zA-Z]+', '_', genequery) + genequery=genequery.strip() + if shorttask=="virulence" or shorttask=="amr": + genequery_short=genequery + elif shorttask=="panC": + genequery_short="panC" + elif shorttask=="mlst": + genequery_short=genequery.split("_")[0] + elif shorttask=="rpoB": + genequery_short="rpoB" + elif shorttask=="16s": + genequery_short="16s" + # print to genefile (genefile=1 per gene, with all isolates' sequences) + if shorttask!="virulence" and shorttask!="amr": + genefile=open(oarg+"btyper_final_results/genefiles/"+genequery_short+"_genefile.fasta","a") + print >> genefile, ">"+key+"___"+genequery + print >> genefile, str(maxgene[14]).strip() + genefile.close() + # print to isolatefile (isolatefile=1 per isolate, with all genes detected per isolate) + isolatefile=open(rdir+key+"_"+shorttask+"_sequences.fasta","a") + print >> isolatefile, ">"+genequery+"___"+key + print >> isolatefile, str(maxgene[14]).strip() + isolatefile.close() + +# print_final_report: print to the final report file i.e. BTyper's final output file +def print_final_report(finalfile_string, shorttask, maxgene): + finalout=open(oarg+"btyper_final_results/"+finalfile_string,"a") + if shorttask=="virulence" or shorttask=="amr": + gene_row=[maxgene[0],maxgene[3],maxgene[5],maxgene[8],maxgene[7],maxgene[6],maxgene[14]] + print >> finalout, "\t".join([str(gr).strip() for gr in gene_row]) + elif shorttask=="rpoB": + rpotype=maxgene[3] + rpotype=str(rpotype).replace("_","|") + if float(maxgene[8])<100: + pid=str(maxgene[8]).strip()+"*" + else: + pid=str(maxgene[8]).strip() + if float(maxgene[7])<100: + qid=str(maxgene[7]).strip()+"*" + else: + qid=str(maxgene[7]).strip() + if "*" in pid or "*" in qid: + rpotype=rpotype.strip()+"*" + gene_row=[rpotype,pid,qid] + print >> finalout, "\t".join([str(gr).strip() for gr in gene_row]) + if "*" in rpotype: + print >> finalout, "*No rpoB allele in the current database matches with 100% identity and coverage." + elif shorttask=="panC": + pan=str(maxgene[3]).strip() + panpid=maxgene[8] + panqid=maxgene[7] + if float(panpid.strip())<75: + pan="None*___None" + if float(panpid.strip())<90 and float(panpid.strip())>75: + pan="?*___?" + pan1=pan.split("___")[0] + pan2=pan.split("___")[1] + print >> finalout, pan1.strip()+"\t"+pan2.strip()+"\t"+panpid.strip()+"\t"+panqid.strip() + if pan.strip()=="None*___None": + print >> finalout, "*No panC gene could be detected in your sequence at > 75% identity. Your panC allele may not be significantly associated with the Bacillus cereus group." + if pan.strip()=="?*___?": + print >> finalout, "*A panC clade could not be determined for your isolate." + elif shorttask=="16s": + stype=maxgene[3] + if float(maxgene[8])<97: + pid=str(maxgene[8]).strip()+"*" + else: + pid=str(maxgene[8]).strip() + if float(maxgene[7])<100: + qid=str(maxgene[7]).strip()+"*" + else: + qid=str(maxgene[7]).strip() + if "*" in pid or "*" in qid: + stype=stype.strip()+"*" + gene_row=[stype,pid,qid] + print >> finalout, "\t".join([str(gr).strip() for gr in gene_row]) + if "*" in stype: + print >> finalout, "*No 16s gene in the current database matches with >97% identity and/or 100% coverage. Your isolate may not be a member of the Bacillus cereus group." + finalout.close() + +# get_st: parse mlst output for final report file; gives ST from database of STs and ATs +def get_st(mlst_infile, st_file, finalfile_string,mlst_genes): + mlstdict={} + lowconf=[] + for line in mlst_infile: + splits=line.split("\t") + if targ: + if "alignment_title" not in line: + alleleid=splits[3] + geneid=str(alleleid).split("_")[0] + at=str(alleleid).split("_")[1] + mlstdict[str(geneid).strip()]=str(at).strip() + pid=splits[8] + qid=splits[7] + if float(pid.strip())<100 or float(qid.strip())<100: + lowconf.append(geneid.strip()) + + mlst_infile.close() + stdict={} + stfile=open(st_file,"r") + for line in stfile: + splits=line.split("\t") + st=splits[0] + all_at=splits[1:8] + dictionary_add(stdict,str(st).strip(),[a.strip() for a in all_at]) + stfile.close() + st_order=[s for i in stdict["ST"] for s in i] + myst=[] + for s in st_order: + if s in mlstdict.keys(): + myst.append(mlstdict[s]) + else: + myst.append("?") + + + finalfile=open(finalfile_string,"a") + print >> finalfile, "Predicted MLST Profile:" + print >> finalfile, "" + print >> finalfile, "ST"+"\t"+"\t".join([str(mg).split(".fas")[0] for mg in mlst_genes]) + if "?" in myst: + finalmm=[] + for mm,tt in zip(myst,mlst_genes): + testmlst=tt.split(".fas")[0] + if testmlst.strip() in lowconf: + finalmm.append(mm+"*") + else: + finalmm.append(mm) + finalfile=open(finalfile_string,"a") + if "*" in str(finalmm): + print >> finalfile, "?*"+"\t"+"\t".join([str(m).strip() for m in finalmm]) + print >> finalfile, "*No allele in the current database matches with 100% identity and coverage." + else: + print >> finalfile, "?"+"\t"+"\t".join([str(m).strip() for m in finalmm]) + print >> finalfile, "" + finalfile.close() + else: + confirmst=0 + for stkey in stdict.keys(): + st_match2=[s for i in stdict[stkey] for s in i] + if myst==st_match2: + confirmst=1 + finalmm=[] + print "Found matching ST." + for mm,tt in zip(myst,mlst_genes): + testmlst=tt.split(".fas")[0] + if testmlst.strip() in lowconf: + finalmm.append(mm+"*") + else: + finalmm.append(mm) + + if "*" in str(finalmm): + stkey=stkey.strip()+"*" + finalfile=open(finalfile_string,"a") + print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm]) + if "*" in stkey: + print >> finalfile, "*No allele in the current database matches with 100% identity and coverage." + print >> finalfile, "" + finalfile.close() + if confirmst==0: + finalmm=[] + for mm,tt in zip(myst,mlst_genes): + testmlst=tt.split(".fas")[0] + if testmlst.strip() in lowconf: + finalmm.append(mm+"*") + else: + finalmm.append(mm) + if "*" in str(finalmm): + stkey="?*" + else: + stkey="?" + finalfile=open(finalfile_string,"a") + print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm]) + if "*" in stkey: + print >> finalfile, "*No allele in the current database matches with 100% identity and coverage." + print >> finalfile, "" + finalfile.close() + + + + +# parse_blast_xml_seq: parse the blast xml output into results file +def parse_blast_xml_seq(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh): + result_handle=open(xml) + blast_records=NCBIXML.parse(result_handle) + # open file in individual isolate results directory + filtres=open(rdir+key+"_"+shorttask+"_results.txt","a") + # print the header to the results file + headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"] + print >> filtres, "\t".join([h for h in headerlist]) + # loop through each record in the blast output + for record in blast_records: + for alignment in record.alignments: + # create a dictionary for HSPs + hspdict={} + # for each hsp in the alignment + for hsp in alignment.hsps: + #query coverage (qcov) can be calculated as one of the following: + #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query))) + qcov=float(hsp.align_length)/float(record.query_length)*100 + pident=float(hsp.identities)/float(hsp.align_length)*100 + # add information for each HSP if the e-value is lower than the threshold + if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and pident>=pident_thresh: + genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match] + dictionary_add(hspdict,str(record.query).strip(),"\t".join([str(g).strip() for g in genefacts])) + # add the start and end positions of the hsp to hspdict + # loop through each hsp in the alignment + for hkey in hspdict.keys(): + # get values (start and end locations of each hsp, bit score) + hsps=hspdict[hkey] + # if more than 1 hsp exists (i.e. there is a chance that overlapping hsps might exist) + if len(hsps)>1: + # create list to check for overlapping HSPs + allnums=[] + # loop through each HSP + for h in hsps: + hsplits=h.split("\t") + # get genome start position + gstart=hsplits[11] + gstart=int(gstart.strip()) + # get genome end position + gend=hsplits[12] + gend=int(gend.strip()) + if gstart<gend: + # append sequence of base numbers covered by HSP to list + allnums.append(range(gstart,gend+1,1)) + else: + allnums.append(range(gend, gstart+1,1)) + # get the intersection of genome positions when more than one HSP exists in the genome + inset=set.intersection(*map(set,allnums)) + # if an intersection exists (i.e. there are overlapping HSPs) + if inset: + # loop through HSPs, keeping track of HSP with highest bit score + maxbits=0 + for h in hsps: + hsplits=h.split("\t") + # get bit score of current HSP + testbits=hsplits[6] + # if bit score of current HSP is larger than current maximum bit score: + if float(testbits.strip())>maxbits: + # the current bit score becomes the max + maxbits=float(testbits.strip()) + # the current dictionary information becomes the best gene information + maxgene=hsplits + # the current gene query becomes the best gene query + genequery=maxgene[3] + # print the best-scoring HSP to the isolatefiles and genefiles + counter=counter+1 + print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask) + print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene) + # if no intersection exists, i.e. there are multiple HSPs, but they are found in different regions of the genome: + else: + # loop through each nonoverlapping HSP + for h in hsps: + maxgene=h.split("\t") + genequery=maxgene[3] + # print each HSP to the appropriate isolatefile/genefile + counter=counter+1 + print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask) + print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene) + + # if there is only 1 HSP + else: + for h in hsps: + maxgene=h.split("\t") + genequery=maxgene[3] + #print it to the appropriate isolatefile/genefile + counter=counter+1 + print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask) + print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene) + filtres.close() + if shorttask=="virulence" and amrarg=="False": + prune_alleles(finalpath=finalfile_string) + elif shorttask=="amr": + prune_alleles(finalpath=finalfile_string) + +# parse_panC: parse blast output for panC task +def parse_panC(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh): + counter=0 + result_handle=open(xml) + blast_records=NCBIXML.parse(result_handle) + # open file in individual isolate results directory + filtres=open(rdir+key+"_"+shorttask+"_results.txt","a") + # print the header to the results file + headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"] + print >> filtres, "\t".join([h for h in headerlist]) + # loop through each record in the blast output + maxp=0 + for record in blast_records: + for alignment in record.alignments: + # for each hsp in the alignment + for hsp in alignment.hsps: + #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query))) + qcov=float(hsp.align_length)/float(record.query_length)*100 + pident=float(hsp.identities)/float(hsp.align_length)*100 + testbits=float(hsp.bits) + # add information for each HSP if the e-value is lower than the threshold + if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and testbits>=pident_thresh and testbits>maxp: + maxp=testbits + genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match] + genefacts="\t".join([str(g).strip() for g in genefacts]) + maxgene=genefacts.split("\t") + genequery=maxgene[3] + #print it to the appropriate isolatefile/genefile + counter=counter+1 + print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask) + print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene) + filtres.close() + +# assemble_reads: assemble genome from reads +def assemble_reads_pe(forward,reverse): + os.system("spades.py -k "+spadesk+" --careful -1 "+forward+" -2 "+reverse+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm) +def assemble_reads_se(reads): + os.system("spades.py -k "+spadesk+" --careful -s "+reads+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm) + +def fastq_check(reads): + if reads.endswith(".fastq"): + os.system("gzip reads") + reads=reads.strip()+".gz" + if reads.endswith(".fastq.gz"): + return() + else: + print "It looks like your file "+reads.strip()+" does not end with either '.fastq.gz' or '.fastq'. Please make sure that you are supplying reads in fastq or fastq.gz format, and, if necessary, please rename your file with the appropriate extension." + sys.exit() + +def sra_check(sra): + if sra.endswith(".sra"): + return() + else: + print "It looks like your file "+reads.strip()+" does not end with '.sra'. Please make sure that you are supplying sequencing data in sra format, and, if necessary, please rename your file with the extension '.sra'." + +def prune_alleles(finalpath): + f=oarg+"btyper_final_results/"+finalpath + print "Pruning alleles for "+f+"..." + os.system("mv "+f+" "+f+"_temporary.txt") + old=open(f+"_temporary.txt","r") + new=open(f,"a") + bitdicta={} + bitdictv={} + for o in old: + if "BTyper Results for " in o: + print >> new, o.strip() + print >> new, "" + if "|" in o and "___" not in o and "rpoB|AT" not in o: + print >> new, o.strip() + elif "|" in o and "___" in o and "rpoB|AT" not in o: + splits=o.split("\t") + gname=splits[1] + gpref=gname.split("___")[0] + gsuff=gname.split("___")[1] + testbits=splits[5] + if "vir" in gpref: + bitdict=bitdictv + elif "amr" in gpref: + bitdict=bitdicta + if gpref.strip() not in bitdict: + bitdict[gpref.strip()]=[] + bitdict[gpref.strip()].append(gsuff.strip()) + for s in splits[2:]: + bitdict[gpref.strip()].append(str(s).strip()) + else: + maxlist=bitdict[gpref.strip()] + maxbits=maxlist[4] + if float(testbits.strip())>float(maxbits.strip()): + bitdict[gpref.strip()]=[] + bitdict[gpref.strip()].append(gsuff.strip()) + for s in splits[2:]: + bitdict[gpref.strip()].append(str(s).strip()) + + old.close() + dicts=[bitdictv,bitdicta] + if bool(dicts[0])==False and bool(dicts[1])==False: + if varg=="True": + print >> new, "Predicted Virulence Genes:" + print >> new, "" + header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"] + print >> new, "\t".join([h for h in header]) + if amrarg=="True": + print >> new, "Predicted AMR Genes:" + print >> new, "" + header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"] + print >> new, "\t".join([h for h in header]) + else: + for bitdict in dicts: + counter=0 + if varg=="True" and bitdict==bitdictv: + print >> new, "Predicted Virulence Genes:" + print >> new, "" + header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"] + print >> new, "\t".join([h for h in header]) + elif amrarg=="True" and bitdict==bitdicta: + print >> new, "Predicted AMR Genes:" + print >> new, "" + header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"] + print >> new, "\t".join([h for h in header]) + for key in bitdict.keys(): + counter=counter+1 + finalg=bitdict[key] + print >> new, str(counter)+"\t"+"\t".join([str(fg).strip() for fg in finalg[:-2]]) + shortgene=finalg[0].split("|")[0] + genefile=open(oarg+"btyper_final_results/genefiles/"+shortgene.strip()+"_genefile.fasta","a") + isoname=f.split("/")[-1] + print >> genefile, ">"+isoname.strip()+"___"+shortgene + print >> genefile, finalg[-1].strip() + genefile.close() + print >> new, "" + new.close() + os.system("rm "+f+"_temporary.txt") + + +# run program based on user input +if targ!="seq": + os.system("mkdir "+oarg+"spades_assembly") + if targ=="pe": + spacecount=iarg.count(" ") + if spacecount>1: + print "It looks like there may be whitespace in one or more of your file names. Please rename your file(s), and try again." + sys.exit() + elif spacecount==0: + print "It looks like you've chosen the paired-end reads (-t pe) option but have only supplied one file of reads. Please supply one forward reads file and one reverse reads file separated by a space, or select a different option for -t (se for single-end reads, sra for files in sra format, or sra-get to download sequencing data in SRA format)." + sys.exit() + else: + forward_reads=iarg.split(" ")[0] + reverse_reads=iarg.split(" ")[1] + forward_reads=re.sub("[,']","",forward_reads) + reverse_reads=re.sub("[,']","",reverse_reads) + fastq_check(forward_reads) + fastq_check(reverse_reads) + prefix=forward_reads.split(".fastq.gz")[0] + if "/" in prefix: + prefix=prefix.split("/")[-1] + assemble_reads_pe(forward=forward_reads,reverse=reverse_reads) + elif targ=="se": + sreads=iarg.strip() + fastq_check(sreads) + prefix=sreads.split(".fastq.gz")[0] + if "/" in prefix: + prefix=prefix.split("/")[-1] + assemble_reads_se(reads=sreads) + elif targ=="sra-get" or targ=="sra": + sra=iarg.strip() + if targ=="sra-get": + os.system("prefetch -v "+sra) + os.system("fastq-dump --outdir "+oarg+" --split-files --gzip ~/ncbi/public/sra/"+sra+".sra") + elif targ=="sra": + sra=iarg.split(".sra")[0] + os.system("fastq-dump --outdir "+oarg+" --split-files --gzip "+iarg.strip()) + forward_reads=oarg+sra.strip()+"_1.fastq.gz" + fastq_check(forward_reads) + prefix=forward_reads.split(".fastq.gz")[0] + if "/" in prefix: + prefix=prefix.split("/")[-1] + if os.path.exists(oarg+sra+"_2.fastq.gz"): + reverse_reads=oarg+sra+"_2.fastq.gz" + fastq_check(reverse_reads) + assemble_reads_pe(forward=forward_reads,reverse=reverse_reads) + else: + assemble_reads_se(reads=forward_reads) + os.system("cp "+oarg+"spades_assembly/contigs.fasta "+oarg+prefix.strip()+"_spades_assembly.fasta") + contigs=open(oarg+prefix.strip()+"_spades_assembly.fasta","r") + pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a") + print >> pseudo, ">"+prefix.strip() + print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line) + contigs.close() + pseudo.close() + targ="seq" + iarg=oarg+prefix.strip()+"_pseudochrom.fasta" + + + + +# if the user inputs a fasta file +if targ=="seq": + print "-t seq has been selected. Expecting a fasta file." + # check if the file is a fasta + file_check(iarg) + # if genome is a draft genome + if dgarg=="True": + prefix=iarg.rpartition(".")[0] + if "/" in prefix: + prefix=prefix.split("/")[-1] + contigs=open(iarg,"r") + pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a") + print >> pseudo, ">"+prefix.strip() + print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line) + contigs.close() + pseudo.close() + iarg=oarg+prefix.strip()+"_pseudochrom.fasta" + # create a dictionary from the user-supplied fasta, mapping ids to sequences + dictionaries=seqparse(iarg) + + # if performing virulence typing + if varg=="True": + # define database + query_paths=[] + if vdbarg=="nuc": + query_paths.append(btyper_path+"seq_virulence_db/bc_virul_nt.fasta") + if vdbarg=="aa": + query_paths.append(btyper_path+"seq_virulence_db/virul_aa_db.fasta") + for query_path in query_paths: + # map db ids to sequences + mydb=dbparse(query_path) + #task="Predicted Virulence Genes:" + shorttask="virulence" + # define minimum e-value for virulence genes + evalue_thresh=float(earg) + if query_path==btyper_path+"seq_virulence_db/bc_virul_nt.fasta": + task="Predicted Virulence Genes:" + # define minimum percent identity for virulence genes + pident_thresh=int(nucparg) + # define minimum query coverage for virulence genes + qcov_thresh=int(nucqarg) + elif query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta": + task="Predicted Virulence Genes:" + # define minimum percent identity for virulence genes + pident_thresh=int(aaparg) + # define minimum query coverage for virulence genes + qcov_thresh=int(aaqarg) + # run virulence typing + make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + # print spacing between under final results + for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"): + between_sections(finalfile_string=f) + + + + # if performing amr gene detection + if amrarg=="True": + # define database + query_paths=[] + if amrdb=="megares": + query_paths.append(btyper_path+"seq_amr_db/megares_grouped.fasta") + elif amrdb=="argannot": + query_paths.append(btyper_path+"seq_amr_db/argannot_clus80.fasta") + else: + print "Hmmmm...it looks like your selected database, "+amrdb+", doesn't exist. Please select either argannot to use the ARG-ANNOT database, or megares to use the MEGARes database." + sys.exit() + for query_path in query_paths: + # map db ids to sequences + mydb=dbparse(query_path) + task="Predicted AMR Genes:" + shorttask="amr" + # define minimum e-value for amr genes + evalue_thresh=float(earg) + # define minimum percent identity for amr genes + pident_thresh=int(amrparg) + # define minimum query coverage for amr genes + qcov_thresh=int(amrqarg) + # run amr gene detection + make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"): + between_sections(finalfile_string=f) + + + + # if performing panC typing: + if parg=="True": + # define database + query_path=btyper_path+"seq_panC_db/panC.fasta" + mydb=dbparse(query_path) + task="Predicted panC Clade Designation:" + shorttask="panC" + evalue_thresh=float(earg) + pident_thresh=0 + qcov_thresh=0 + # run panC typing, if panC gene is detected + try: + make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"): + between_sections(finalfile_string=f) + except UnboundLocalError: + print "No sequences found for "+shorttask + + # if performing mlst: + if marg=="True": + mlst_genes=["glp.fas","gmk.fas","ilv.fas","pta.fas","pur.fas","pyc.fas","tpi.fas"] + # get best-matching AT for each MLST gene + for mlst in mlst_genes: + query_path=btyper_path+"seq_mlst_db/"+mlst + mydb=dbparse(query_path) + task="Predicted MLST Profile:" + shorttask="mlst" + evalue_thresh=float(earg) + pident_thresh=0 + qcov_thresh=0 + # run AT for each gene, if sequence deteced + try: + make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + except UnboundLocalError: + print "No sequences found for "+mlst + # loop through isolatefiles + for root, dirs, files in os.walk(oarg+"btyper_final_results/isolatefiles/"): + for d in dirs: + dirroot=d.split("_results")[0] + # open mlst results file, and get ST from ATs + try: + newf=open(oarg+"btyper_final_results/isolatefiles/"+d+"/"+dirroot.strip()+"_mlst_results.txt","r") + finalfile_string=oarg+"btyper_final_results/"+dirroot.strip()+"_final_results.txt" + ff=open(finalfile_string,"r") + flines=ff.readlines() + if not any("Predicted MLST Profile" in fl.strip() for fl in flines): + get_st(mlst_infile=newf,st_file=btyper_path+"seq_mlst_db/b_cereus_mlst_db.txt",finalfile_string=finalfile_string,mlst_genes=mlst_genes) + except IOError: + print "No sequences found for "+shorttask + + # if performing rpoB typing: + if rarg=="True": + # define database + query_path=btyper_path+"seq_rpoB_db/rpobdatabase08122015.fa" + mydb=dbparse(query_path) + task="Predicted rpoB Allelic Type:" + shorttask="rpoB" + evalue_thresh=float(earg) + pident_thresh=0 + qcov_thresh=0 + # perform rpoB typing, if gene is present + try: + make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"): + between_sections(finalfile_string=f) + except UnboundLocalError: + print "No sequences found for "+shorttask + + # if performing 16s typing: + if sarg=="True": + # define database + query_path=btyper_path+"seq_16s_db/b_cereus_group_16s_db.fasta" + mydb=dbparse(query_path) + task="Predicted 16s Type" + shorttask="16s" + evalue_thresh=float(earg) + pident_thresh=0 + qcov_thresh=0 + # perform 16s typing, if gene is present + try: + make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh) + for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"): + between_sections(finalfile_string=f) + except UnboundLocalError: + print "No sequences found for "+shorttask + +print "Typing complete...how neat is that?" +print "" +print "Thank you for using BTyper! For more fun, take your output files to BMiner, BTyper's companion application for data aggregation and visualization." +print "" +print "To cite BTyper and/or BMiner, please use the following:" +print "Carroll, Laura M., Jasna Kovac, Rachel A. Miller, Martin Wiedmann. 2017. Rapid, high-throughput identification of anthrax-causing and emetic Bacillus cereus group genome assemblies using BTyper, a computational tool for virulence-based classification of Bacillus cereus group isolates using nucleotide sequencing data. Applied and Environmental Microbiology 2017 Jun 16. pii: AEM.01096-17. doi: 10.1128/AEM.01096-17." + +