0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # BTyper version 2.0.3
|
|
4 # Created by Laura Carroll
|
|
5 # lmc297@cornell.edu
|
|
6
|
|
7 # import required packages
|
|
8 import argparse, sys, os, re, glob, collections
|
|
9
|
|
10
|
|
11 # parse arguments
|
|
12 parser = argparse.ArgumentParser(usage='btyper -i </path/to/input/file.extension> -o </path/to/desired/output_directory/> -t [input data format (seq, pe, se, sra, or sra-get)] [-other options]')
|
|
13 parser.add_argument('-i','--input', nargs='+', help='Enter the path to the Bacillus cereus group sequence data you would like to input, or enter an SRA accession number',required=True)
|
|
14 parser.add_argument('-o', '--output', nargs='+', help='Specify a path to your desired output directory',required=True)
|
|
15 parser.add_argument('-t', '--type', nargs='+', help='Specify your type of data: seq for genomes or contigs in fasta format, pe for paired-end Illumina reads, se for single-end Illumina reads, sra for a sra file, or sra-get with an SRA accession number',required=True)
|
|
16 parser.add_argument('--draft_genome', action='store_true', default=False, help='Optional argument for use with contigs in fasta format; concatenates draft genome contigs into pseudochromosome')
|
|
17 parser.add_argument('-v', '--virulence', nargs='?', default=True, help='Optional argument, True or False; perform virulence gene typing; default=True')
|
|
18 parser.add_argument('-m', '--mlst', nargs='?', default=True, help='Optional argument, True or False; perform MLST using Bacillus cereus MLST database; default=True')
|
|
19 parser.add_argument('-r', '--rpoB', nargs='?', default=True, help='Optional argument, True or False; perform rpoB typing; default=True')
|
|
20 parser.add_argument('-p', '--panC', nargs='?', default=True, help='Optional argument, True or False; perform panC typing; default=True')
|
|
21 parser.add_argument('-s', nargs='?', default=False, help='Optional argument, True or False; BLAST 16s DNA sequence; not recommended for inferring species or pathogenicity; default=False')
|
|
22 parser.add_argument('--spades_m', nargs='?', default=250, help='Optional argument for use with Illumina reads; integer; set SPAdes memory limit -m/--memory option in Gb; default is 250 Gb, the default for SPAdes')
|
|
23 parser.add_argument('--spades_t', nargs='?', default=16, help='Optional argument for use with Illumina reads; integer; set number of threads to use for SPAdes -t/--threads option; default is 16, the default for SPAdes')
|
|
24 parser.add_argument('--spades_k', nargs='?', default=77, help='Optional argument for use with Illumina reads; comma-separated list of integers; set k-mer sizes to use with SPAdes -k option; default is 77')
|
|
25 parser.add_argument('-v_db', '--virulence_database', nargs='?', default="aa", help='Optional argument for use with -v/--virulence option; specify virulence database to be used: nuc for nucleotide database or aa for amino acid database; default=aa')
|
|
26 parser.add_argument('-nuc_p', '--nucleotide_p', nargs='?',default=75, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent nucleotide identity for virulence gene detection; default=75')
|
|
27 parser.add_argument('-nuc_q','--nucleotide_q', nargs='?',default=90, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=90')
|
|
28 parser.add_argument('-aa_p','--amino_acid_p',nargs='?',default=50, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent amino acid identity for virulence gene detection; default=50')
|
|
29 parser.add_argument('-aa_q','--amino_acid_q', nargs='?',default=70, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=70')
|
|
30 parser.add_argument('-e', '--evalue', nargs='?', default=1e-5, help='Optional argument; float >= to 0; maximum blast e-value for a hit to be saved; default=1e-5')
|
|
31 parser.add_argument('--version', action="version", version='%(prog)s 2.0.3', help="Print version")
|
|
32 parser.add_argument('-a', '--amr', nargs='?', default=True, help='Optional argument, True or False; perform antimicrobial resistance gene detection; defaul=True')
|
|
33 parser.add_argument('-amr_db', '--amr_database', nargs='?', default="argannot", help='Optional argument for use with -a/--amr option; specify antimicrobial resistance gene database to use: argannot for ARG-ANNOT database or megares for MEGARes database; default=argannot')
|
|
34 parser.add_argument('--amr_blast', nargs='?', default="blastn", help='Optional argument for use with -a/--amr option; specify blast algorithm to use: blastn to blast against nucleotide database, or tblastx to blast translated sequence against translated nucleotide database; default=blastn')
|
|
35 parser.add_argument('--amr_p',nargs='?',default=75, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent nucleotide identity for AMR gene detection; default=50')
|
|
36 parser.add_argument('--amr_q',nargs='?',default=50, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent coverage for AMR gene detection; default=50')
|
|
37
|
|
38 args=parser.parse_args()
|
|
39
|
|
40 # check for biopython installation
|
|
41 try:
|
|
42 from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline, NcbitblastxCommandline
|
|
43 from Bio import SeqIO
|
|
44 from Bio.Blast import NCBIXML
|
|
45 except ImportError:
|
|
46 print "Scanning for BioPython installation..."
|
|
47 try:
|
|
48 cellar=os.popen('brew ls btyper| grep "seq_virulence_db"').read()
|
|
49 btyper_path = cellar.split("seq_virulence_db")[0].strip()
|
|
50 pysearch=btyper_path.split("/bin/")[0]+"/libexec/lib/python2.7/site-packages/"
|
|
51 print pysearch
|
|
52 sys.path.append(pysearch.strip())
|
|
53 from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline
|
|
54 from Bio import SeqIO
|
|
55 from Bio.Blast import NCBIXML
|
|
56 except (ImportError,OSError) as e:
|
|
57 print "Could not find BioPython installed. Exiting BTyper..."
|
|
58 sys.exit()
|
|
59
|
|
60 # path to program btyper.py, as well as databases
|
|
61 # check if homebrew is installed
|
|
62 if os.path.isfile("/usr/local/Cellar/btyper*/btyper"):
|
|
63 cellar=os.popen('brew ls btyper|grep "seq_virulence_db"').read()
|
|
64 btyper_path=cellar.split("seq_virulence_db")[0].strip()
|
|
65 else:
|
|
66 btyper_path = os.path.realpath(__file__)
|
|
67 btyper_path = btyper_path.rpartition("/")[0].strip()+"/"
|
|
68 print btyper_path
|
|
69
|
|
70 # split arguments provided by user
|
|
71 def arg_splitter(argin):
|
|
72 return str(argin).strip("[']")
|
|
73
|
|
74 iarg=arg_splitter(args.input)
|
|
75 oarg=arg_splitter(args.output)
|
|
76 targ=arg_splitter(args.type)
|
|
77 varg=arg_splitter(args.virulence)
|
|
78 marg=arg_splitter(args.mlst)
|
|
79 rarg=arg_splitter(args.rpoB)
|
|
80 parg=arg_splitter(args.panC)
|
|
81 sarg=arg_splitter(args.s)
|
|
82 spadesm=arg_splitter(args.spades_m)
|
|
83 spadest=arg_splitter(args.spades_t)
|
|
84 spadesk=arg_splitter(args.spades_k)
|
|
85 dgarg=arg_splitter(args.draft_genome)
|
|
86 vdbarg=arg_splitter(args.virulence_database)
|
|
87 nucparg=arg_splitter(args.nucleotide_p)
|
|
88 nucqarg=arg_splitter(args.nucleotide_q)
|
|
89 aaparg=arg_splitter(args.amino_acid_p)
|
|
90 aaqarg=arg_splitter(args.amino_acid_q)
|
|
91 earg=arg_splitter(args.evalue)
|
|
92 amrarg=arg_splitter(args.amr)
|
|
93 amrparg=arg_splitter(args.amr_p)
|
|
94 amrqarg=arg_splitter(args.amr_q)
|
|
95 amrdb=arg_splitter(args.amr_database)
|
|
96 amralg=arg_splitter(args.amr_blast)
|
|
97
|
|
98 # append to output directory argument, if necessary
|
|
99 if oarg[-1]!="/":
|
|
100 oarg=oarg+"/"
|
|
101
|
|
102 # between_sections: prints a blank line between sections in final output file, if desried
|
|
103 def between_sections(finalfile_string):
|
|
104 finalfile=open(finalfile_string,"a")
|
|
105 print >> finalfile, ""
|
|
106 finalfile.close()
|
|
107
|
|
108 # check if file exists and isn't empty
|
|
109 def file_check(input_file):
|
|
110 try:
|
|
111 if os.stat(input_file).st_size > 0:
|
|
112 print input_file+" exits. Continuing..."
|
|
113 return iarg
|
|
114 else:
|
|
115 print "Your input file {0} is empty! Please use a different input file.".format(input_file)
|
|
116 sys.exit()
|
|
117 except OSError:
|
|
118 print "Your input file {0} does not exist. Please make sure your path and file name are correct, or specify a different input file.".format(input_file)
|
|
119 sys.exit()
|
|
120
|
|
121 # dbparse: creates dictionary mapping virulence gene/mlst/rpoB/panC/16s id names from a given database to their respective sequences
|
|
122 def dbparse(db_path):
|
|
123 argseq={}
|
|
124 infile=open(db_path,"r")
|
|
125 for record in SeqIO.parse(infile, "fasta"):
|
|
126 seqid=str(record.description).strip()
|
|
127 newid=re.sub('[^0-9a-zA-Z]+', '_', seqid)
|
|
128 seqseq=str(record.seq).strip()
|
|
129 seqlen=len(seqseq)
|
|
130 if newid not in argseq.keys():
|
|
131 argseq[newid]=seqseq
|
|
132 infile.close()
|
|
133 return argseq
|
|
134
|
|
135 # seqparse: creates dictionary that maps query sequence IDs to sequences
|
|
136 def seqparse(seq_path):
|
|
137 newseq={}
|
|
138 newinfile=open(seq_path,"r")
|
|
139 for record in SeqIO.parse(newinfile, "fasta"):
|
|
140 seqid=str(record.id).strip()
|
|
141 newid=re.sub('[^0-9a-zA-Z]+', '_', seqid)
|
|
142 seqseq=str(record.seq).strip()
|
|
143 reduced_seq=seqseq.replace("N","")
|
|
144 if newid not in newseq.keys():
|
|
145 newseq[newid]=seqseq
|
|
146 newinfile.close()
|
|
147 return newseq
|
|
148
|
|
149 # make_blast_xml: takes sequences, makes blast xml file for each
|
|
150 def make_blast_xml(newseq,argdict,query_path,task,shorttask,evalue_thresh,pident_thresh,qcov_thresh):
|
|
151 # for each sequence in the newseq dictionary
|
|
152 for key in newseq.keys():
|
|
153 # count the sequences
|
|
154 counter=1
|
|
155 # call query sequence queryseq
|
|
156 queryseq=newseq[key]
|
|
157 # get the length of the queryseq (genelen)
|
|
158 genelen=len(queryseq)
|
|
159 # create a temporary query file with the gene sequence and id
|
|
160 queryfile=open(oarg+key.strip()+"_querytemp.fasta","a")
|
|
161 print >> queryfile, ">"+key.strip()
|
|
162 print >> queryfile, queryseq
|
|
163 queryfile.close()
|
|
164 # check if final results directory exists
|
|
165 if os.path.isdir(oarg+"btyper_final_results"):
|
|
166 print "Final results directory exists."
|
|
167 # if not, make one
|
|
168 else:
|
|
169 os.system("mkdir "+oarg+"btyper_final_results")
|
|
170 # check if isolatefiles directory exists within final results directory
|
|
171 if not os.path.isdir(oarg+"btyper_final_results/isolatefiles"):
|
|
172 os.system("mkdir "+oarg+"btyper_final_results/isolatefiles")
|
|
173 rdir_root=oarg+"btyper_final_results/isolatefiles/"+key#.split(".fasta")[0]
|
|
174 #print rdir_root
|
|
175 # make a results directory for each sequence
|
|
176 if not os.path.isdir(rdir_root+"_results"):
|
|
177 os.system("mkdir "+rdir_root+"_results")
|
|
178 rdir=rdir_root+"_results/"
|
|
179 # create a database out of the temporary query file with the individual sequence
|
|
180 if os.path.isfile(oarg+key.strip()+"_querytemp.fasta.nsq"):
|
|
181 print "Database already exists."
|
|
182 else:
|
|
183 os.system("makeblastdb -in "+oarg+key.strip()+"_querytemp.fasta -dbtype nucl")
|
|
184 out=rdir_root+"_blast_results.xml"
|
|
185 # create final file named for each isolate if necessary
|
|
186 finalfile_string=key.strip()+"_final_results.txt"
|
|
187 if not os.path.isfile(oarg+"btyper_final_results/"+finalfile_string):
|
|
188 finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a")
|
|
189 print >> finalfile, "BTyper Results for "+key
|
|
190 print >> finalfile, ""
|
|
191 finalfile.close()
|
|
192 if not os.path.isdir(oarg+"btyper_final_results/genefiles"):
|
|
193 os.system("mkdir "+oarg+"btyper_final_results/genefiles")
|
|
194 # open the final file for the isolate
|
|
195 finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a")
|
|
196 # print task-specific header to final output file
|
|
197 if shorttask=="virulence":
|
|
198 # print the task
|
|
199 print >> finalfile, task.strip()
|
|
200 print >> finalfile, ""
|
|
201 table_header=["Hit #", "Virulence Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"]
|
|
202 print >>finalfile, "\t".join([t for t in table_header])
|
|
203 elif shorttask=="amr":
|
|
204 # print the task
|
|
205 print >> finalfile, task.strip()
|
|
206 print >> finalfile, ""
|
|
207 table_header=["Hit #", "AMR Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"]
|
|
208 print >> finalfile, "\t".join([t for t in table_header])
|
|
209 elif shorttask=="rpoB":
|
|
210 # print the task
|
|
211 print >> finalfile, task.strip()
|
|
212 print >> finalfile, ""
|
|
213 print >> finalfile, "Predicted rpoB Allelic Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage"
|
|
214 elif shorttask=="panC":
|
|
215 print >> finalfile, task.strip()
|
|
216 print >> finalfile, ""
|
|
217 table_header=["panC Clade Name","Closest Strain","Percent (%) Identity","Percent (%) Coverage"]
|
|
218 print >>finalfile, "\t".join([t for t in table_header])
|
|
219 elif shorttask=="16s":
|
|
220 print >> finalfile, task.strip()
|
|
221 print >> finalfile, ""
|
|
222 print >> finalfile, "Predicted 16s Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage"
|
|
223 # close the final file for the isolate
|
|
224 finalfile.close()
|
|
225 # run blast with query=database of sequences, db = querytemp.fasta with single isolate sequence, out = outdir, outfmt = xml
|
|
226 if query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta":
|
|
227 cline=NcbitblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
|
|
228 elif amrarg=="True" and amralg=="tblastx" and shorttask=="amr":
|
|
229 cline=NcbitblastxCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
|
|
230 else:
|
|
231 cline=NcbiblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
|
|
232 stdout, stderr = cline()
|
|
233 #remove temporary temporary fasta files
|
|
234 os.system("rm "+oarg+key.strip()+"_querytemp.fasta*")
|
|
235 # parse blast xml files according to task
|
|
236 if shorttask=="virulence" or shorttask=="amr":
|
|
237 # parse_blast_xml_seq: parses blast xml file for virulence task (getting a list of all genes present in a sequence)
|
|
238 parse_blast_xml_seq(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
239 else:
|
|
240 # parse_panC: parses blast xml for non-virulence tasks (getting the "top hit", rather than a list of all genes)
|
|
241 parse_panC(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
242
|
|
243 # dictionary_add: create a new entry for a dictionary, or append if it already exists as a key
|
|
244 def dictionary_add(dict_name,key,blast):
|
|
245 if key not in dict_name.keys():
|
|
246 dict_name[key]=[]
|
|
247 dict_name[key].append(blast)
|
|
248 else:
|
|
249 dict_name[key].append(blast)
|
|
250
|
|
251 # print_virulence_gene_report: prints isolatefiles and genefiles
|
|
252 def print_virulence_gene_report(filtres,maxgene, genequery,key,rdir,shorttask):
|
|
253 print >> filtres, "\t".join([str(m).strip() for m in maxgene])
|
|
254 genequery=str(genequery).strip()
|
|
255 if shorttask=="virulence" or shorttask=="amr":
|
|
256 genequery=genequery.split("|")[0]
|
|
257 else:
|
|
258 genequery=re.sub('[^0-9a-zA-Z]+', '_', genequery)
|
|
259 genequery=genequery.strip()
|
|
260 if shorttask=="virulence" or shorttask=="amr":
|
|
261 genequery_short=genequery
|
|
262 elif shorttask=="panC":
|
|
263 genequery_short="panC"
|
|
264 elif shorttask=="mlst":
|
|
265 genequery_short=genequery.split("_")[0]
|
|
266 elif shorttask=="rpoB":
|
|
267 genequery_short="rpoB"
|
|
268 elif shorttask=="16s":
|
|
269 genequery_short="16s"
|
|
270 # print to genefile (genefile=1 per gene, with all isolates' sequences)
|
|
271 if shorttask!="virulence" and shorttask!="amr":
|
|
272 genefile=open(oarg+"btyper_final_results/genefiles/"+genequery_short+"_genefile.fasta","a")
|
|
273 print >> genefile, ">"+key+"___"+genequery
|
|
274 print >> genefile, str(maxgene[14]).strip()
|
|
275 genefile.close()
|
|
276 # print to isolatefile (isolatefile=1 per isolate, with all genes detected per isolate)
|
|
277 isolatefile=open(rdir+key+"_"+shorttask+"_sequences.fasta","a")
|
|
278 print >> isolatefile, ">"+genequery+"___"+key
|
|
279 print >> isolatefile, str(maxgene[14]).strip()
|
|
280 isolatefile.close()
|
|
281
|
|
282 # print_final_report: print to the final report file i.e. BTyper's final output file
|
|
283 def print_final_report(finalfile_string, shorttask, maxgene):
|
|
284 finalout=open(oarg+"btyper_final_results/"+finalfile_string,"a")
|
|
285 if shorttask=="virulence" or shorttask=="amr":
|
|
286 gene_row=[maxgene[0],maxgene[3],maxgene[5],maxgene[8],maxgene[7],maxgene[6],maxgene[14]]
|
|
287 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
|
|
288 elif shorttask=="rpoB":
|
|
289 rpotype=maxgene[3]
|
|
290 rpotype=str(rpotype).replace("_","|")
|
|
291 if float(maxgene[8])<100:
|
|
292 pid=str(maxgene[8]).strip()+"*"
|
|
293 else:
|
|
294 pid=str(maxgene[8]).strip()
|
|
295 if float(maxgene[7])<100:
|
|
296 qid=str(maxgene[7]).strip()+"*"
|
|
297 else:
|
|
298 qid=str(maxgene[7]).strip()
|
|
299 if "*" in pid or "*" in qid:
|
|
300 rpotype=rpotype.strip()+"*"
|
|
301 gene_row=[rpotype,pid,qid]
|
|
302 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
|
|
303 if "*" in rpotype:
|
|
304 print >> finalout, "*No rpoB allele in the current database matches with 100% identity and coverage."
|
|
305 elif shorttask=="panC":
|
|
306 pan=str(maxgene[3]).strip()
|
|
307 panpid=maxgene[8]
|
|
308 panqid=maxgene[7]
|
|
309 if float(panpid.strip())<75:
|
|
310 pan="None*___None"
|
|
311 if float(panpid.strip())<90 and float(panpid.strip())>75:
|
|
312 pan="?*___?"
|
|
313 pan1=pan.split("___")[0]
|
|
314 pan2=pan.split("___")[1]
|
|
315 print >> finalout, pan1.strip()+"\t"+pan2.strip()+"\t"+panpid.strip()+"\t"+panqid.strip()
|
|
316 if pan.strip()=="None*___None":
|
|
317 print >> finalout, "*No panC gene could be detected in your sequence at > 75% identity. Your panC allele may not be significantly associated with the Bacillus cereus group."
|
|
318 if pan.strip()=="?*___?":
|
|
319 print >> finalout, "*A panC clade could not be determined for your isolate."
|
|
320 elif shorttask=="16s":
|
|
321 stype=maxgene[3]
|
|
322 if float(maxgene[8])<97:
|
|
323 pid=str(maxgene[8]).strip()+"*"
|
|
324 else:
|
|
325 pid=str(maxgene[8]).strip()
|
|
326 if float(maxgene[7])<100:
|
|
327 qid=str(maxgene[7]).strip()+"*"
|
|
328 else:
|
|
329 qid=str(maxgene[7]).strip()
|
|
330 if "*" in pid or "*" in qid:
|
|
331 stype=stype.strip()+"*"
|
|
332 gene_row=[stype,pid,qid]
|
|
333 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
|
|
334 if "*" in stype:
|
|
335 print >> finalout, "*No 16s gene in the current database matches with >97% identity and/or 100% coverage. Your isolate may not be a member of the Bacillus cereus group."
|
|
336 finalout.close()
|
|
337
|
|
338 # get_st: parse mlst output for final report file; gives ST from database of STs and ATs
|
|
339 def get_st(mlst_infile, st_file, finalfile_string,mlst_genes):
|
|
340 mlstdict={}
|
|
341 lowconf=[]
|
|
342 for line in mlst_infile:
|
|
343 splits=line.split("\t")
|
|
344 if targ:
|
|
345 if "alignment_title" not in line:
|
|
346 alleleid=splits[3]
|
|
347 geneid=str(alleleid).split("_")[0]
|
|
348 at=str(alleleid).split("_")[1]
|
|
349 mlstdict[str(geneid).strip()]=str(at).strip()
|
|
350 pid=splits[8]
|
|
351 qid=splits[7]
|
|
352 if float(pid.strip())<100 or float(qid.strip())<100:
|
|
353 lowconf.append(geneid.strip())
|
|
354
|
|
355 mlst_infile.close()
|
|
356 stdict={}
|
|
357 stfile=open(st_file,"r")
|
|
358 for line in stfile:
|
|
359 splits=line.split("\t")
|
|
360 st=splits[0]
|
|
361 all_at=splits[1:8]
|
|
362 dictionary_add(stdict,str(st).strip(),[a.strip() for a in all_at])
|
|
363 stfile.close()
|
|
364 st_order=[s for i in stdict["ST"] for s in i]
|
|
365 myst=[]
|
|
366 for s in st_order:
|
|
367 if s in mlstdict.keys():
|
|
368 myst.append(mlstdict[s])
|
|
369 else:
|
|
370 myst.append("?")
|
|
371
|
|
372
|
|
373 finalfile=open(finalfile_string,"a")
|
|
374 print >> finalfile, "Predicted MLST Profile:"
|
|
375 print >> finalfile, ""
|
|
376 print >> finalfile, "ST"+"\t"+"\t".join([str(mg).split(".fas")[0] for mg in mlst_genes])
|
|
377 if "?" in myst:
|
|
378 finalmm=[]
|
|
379 for mm,tt in zip(myst,mlst_genes):
|
|
380 testmlst=tt.split(".fas")[0]
|
|
381 if testmlst.strip() in lowconf:
|
|
382 finalmm.append(mm+"*")
|
|
383 else:
|
|
384 finalmm.append(mm)
|
|
385 finalfile=open(finalfile_string,"a")
|
|
386 if "*" in str(finalmm):
|
|
387 print >> finalfile, "?*"+"\t"+"\t".join([str(m).strip() for m in finalmm])
|
|
388 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
|
|
389 else:
|
|
390 print >> finalfile, "?"+"\t"+"\t".join([str(m).strip() for m in finalmm])
|
|
391 print >> finalfile, ""
|
|
392 finalfile.close()
|
|
393 else:
|
|
394 confirmst=0
|
|
395 for stkey in stdict.keys():
|
|
396 st_match2=[s for i in stdict[stkey] for s in i]
|
|
397 if myst==st_match2:
|
|
398 confirmst=1
|
|
399 finalmm=[]
|
|
400 print "Found matching ST."
|
|
401 for mm,tt in zip(myst,mlst_genes):
|
|
402 testmlst=tt.split(".fas")[0]
|
|
403 if testmlst.strip() in lowconf:
|
|
404 finalmm.append(mm+"*")
|
|
405 else:
|
|
406 finalmm.append(mm)
|
|
407
|
|
408 if "*" in str(finalmm):
|
|
409 stkey=stkey.strip()+"*"
|
|
410 finalfile=open(finalfile_string,"a")
|
|
411 print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm])
|
|
412 if "*" in stkey:
|
|
413 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
|
|
414 print >> finalfile, ""
|
|
415 finalfile.close()
|
|
416 if confirmst==0:
|
|
417 finalmm=[]
|
|
418 for mm,tt in zip(myst,mlst_genes):
|
|
419 testmlst=tt.split(".fas")[0]
|
|
420 if testmlst.strip() in lowconf:
|
|
421 finalmm.append(mm+"*")
|
|
422 else:
|
|
423 finalmm.append(mm)
|
|
424 if "*" in str(finalmm):
|
|
425 stkey="?*"
|
|
426 else:
|
|
427 stkey="?"
|
|
428 finalfile=open(finalfile_string,"a")
|
|
429 print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm])
|
|
430 if "*" in stkey:
|
|
431 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
|
|
432 print >> finalfile, ""
|
|
433 finalfile.close()
|
|
434
|
|
435
|
|
436
|
|
437
|
|
438 # parse_blast_xml_seq: parse the blast xml output into results file
|
|
439 def parse_blast_xml_seq(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh):
|
|
440 result_handle=open(xml)
|
|
441 blast_records=NCBIXML.parse(result_handle)
|
|
442 # open file in individual isolate results directory
|
|
443 filtres=open(rdir+key+"_"+shorttask+"_results.txt","a")
|
|
444 # print the header to the results file
|
|
445 headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"]
|
|
446 print >> filtres, "\t".join([h for h in headerlist])
|
|
447 # loop through each record in the blast output
|
|
448 for record in blast_records:
|
|
449 for alignment in record.alignments:
|
|
450 # create a dictionary for HSPs
|
|
451 hspdict={}
|
|
452 # for each hsp in the alignment
|
|
453 for hsp in alignment.hsps:
|
|
454 #query coverage (qcov) can be calculated as one of the following:
|
|
455 #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query)))
|
|
456 qcov=float(hsp.align_length)/float(record.query_length)*100
|
|
457 pident=float(hsp.identities)/float(hsp.align_length)*100
|
|
458 # add information for each HSP if the e-value is lower than the threshold
|
|
459 if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and pident>=pident_thresh:
|
|
460 genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match]
|
|
461 dictionary_add(hspdict,str(record.query).strip(),"\t".join([str(g).strip() for g in genefacts]))
|
|
462 # add the start and end positions of the hsp to hspdict
|
|
463 # loop through each hsp in the alignment
|
|
464 for hkey in hspdict.keys():
|
|
465 # get values (start and end locations of each hsp, bit score)
|
|
466 hsps=hspdict[hkey]
|
|
467 # if more than 1 hsp exists (i.e. there is a chance that overlapping hsps might exist)
|
|
468 if len(hsps)>1:
|
|
469 # create list to check for overlapping HSPs
|
|
470 allnums=[]
|
|
471 # loop through each HSP
|
|
472 for h in hsps:
|
|
473 hsplits=h.split("\t")
|
|
474 # get genome start position
|
|
475 gstart=hsplits[11]
|
|
476 gstart=int(gstart.strip())
|
|
477 # get genome end position
|
|
478 gend=hsplits[12]
|
|
479 gend=int(gend.strip())
|
|
480 if gstart<gend:
|
|
481 # append sequence of base numbers covered by HSP to list
|
|
482 allnums.append(range(gstart,gend+1,1))
|
|
483 else:
|
|
484 allnums.append(range(gend, gstart+1,1))
|
|
485 # get the intersection of genome positions when more than one HSP exists in the genome
|
|
486 inset=set.intersection(*map(set,allnums))
|
|
487 # if an intersection exists (i.e. there are overlapping HSPs)
|
|
488 if inset:
|
|
489 # loop through HSPs, keeping track of HSP with highest bit score
|
|
490 maxbits=0
|
|
491 for h in hsps:
|
|
492 hsplits=h.split("\t")
|
|
493 # get bit score of current HSP
|
|
494 testbits=hsplits[6]
|
|
495 # if bit score of current HSP is larger than current maximum bit score:
|
|
496 if float(testbits.strip())>maxbits:
|
|
497 # the current bit score becomes the max
|
|
498 maxbits=float(testbits.strip())
|
|
499 # the current dictionary information becomes the best gene information
|
|
500 maxgene=hsplits
|
|
501 # the current gene query becomes the best gene query
|
|
502 genequery=maxgene[3]
|
|
503 # print the best-scoring HSP to the isolatefiles and genefiles
|
|
504 counter=counter+1
|
|
505 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
|
|
506 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
|
|
507 # if no intersection exists, i.e. there are multiple HSPs, but they are found in different regions of the genome:
|
|
508 else:
|
|
509 # loop through each nonoverlapping HSP
|
|
510 for h in hsps:
|
|
511 maxgene=h.split("\t")
|
|
512 genequery=maxgene[3]
|
|
513 # print each HSP to the appropriate isolatefile/genefile
|
|
514 counter=counter+1
|
|
515 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
|
|
516 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
|
|
517
|
|
518 # if there is only 1 HSP
|
|
519 else:
|
|
520 for h in hsps:
|
|
521 maxgene=h.split("\t")
|
|
522 genequery=maxgene[3]
|
|
523 #print it to the appropriate isolatefile/genefile
|
|
524 counter=counter+1
|
|
525 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
|
|
526 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
|
|
527 filtres.close()
|
|
528 if shorttask=="virulence" and amrarg=="False":
|
|
529 prune_alleles(finalpath=finalfile_string)
|
|
530 elif shorttask=="amr":
|
|
531 prune_alleles(finalpath=finalfile_string)
|
|
532
|
|
533 # parse_panC: parse blast output for panC task
|
|
534 def parse_panC(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh):
|
|
535 counter=0
|
|
536 result_handle=open(xml)
|
|
537 blast_records=NCBIXML.parse(result_handle)
|
|
538 # open file in individual isolate results directory
|
|
539 filtres=open(rdir+key+"_"+shorttask+"_results.txt","a")
|
|
540 # print the header to the results file
|
|
541 headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"]
|
|
542 print >> filtres, "\t".join([h for h in headerlist])
|
|
543 # loop through each record in the blast output
|
|
544 maxp=0
|
|
545 for record in blast_records:
|
|
546 for alignment in record.alignments:
|
|
547 # for each hsp in the alignment
|
|
548 for hsp in alignment.hsps:
|
|
549 #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query)))
|
|
550 qcov=float(hsp.align_length)/float(record.query_length)*100
|
|
551 pident=float(hsp.identities)/float(hsp.align_length)*100
|
|
552 testbits=float(hsp.bits)
|
|
553 # add information for each HSP if the e-value is lower than the threshold
|
|
554 if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and testbits>=pident_thresh and testbits>maxp:
|
|
555 maxp=testbits
|
|
556 genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match]
|
|
557 genefacts="\t".join([str(g).strip() for g in genefacts])
|
|
558 maxgene=genefacts.split("\t")
|
|
559 genequery=maxgene[3]
|
|
560 #print it to the appropriate isolatefile/genefile
|
|
561 counter=counter+1
|
|
562 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
|
|
563 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
|
|
564 filtres.close()
|
|
565
|
|
566 # assemble_reads: assemble genome from reads
|
|
567 def assemble_reads_pe(forward,reverse):
|
|
568 os.system("spades.py -k "+spadesk+" --careful -1 "+forward+" -2 "+reverse+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm)
|
|
569 def assemble_reads_se(reads):
|
|
570 os.system("spades.py -k "+spadesk+" --careful -s "+reads+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm)
|
|
571
|
|
572 def fastq_check(reads):
|
|
573 if reads.endswith(".fastq"):
|
|
574 os.system("gzip reads")
|
|
575 reads=reads.strip()+".gz"
|
|
576 if reads.endswith(".fastq.gz"):
|
|
577 return()
|
|
578 else:
|
|
579 print "It looks like your file "+reads.strip()+" does not end with either '.fastq.gz' or '.fastq'. Please make sure that you are supplying reads in fastq or fastq.gz format, and, if necessary, please rename your file with the appropriate extension."
|
|
580 sys.exit()
|
|
581
|
|
582 def sra_check(sra):
|
|
583 if sra.endswith(".sra"):
|
|
584 return()
|
|
585 else:
|
|
586 print "It looks like your file "+reads.strip()+" does not end with '.sra'. Please make sure that you are supplying sequencing data in sra format, and, if necessary, please rename your file with the extension '.sra'."
|
|
587
|
|
588 def prune_alleles(finalpath):
|
|
589 f=oarg+"btyper_final_results/"+finalpath
|
|
590 print "Pruning alleles for "+f+"..."
|
|
591 os.system("mv "+f+" "+f+"_temporary.txt")
|
|
592 old=open(f+"_temporary.txt","r")
|
|
593 new=open(f,"a")
|
|
594 bitdicta={}
|
|
595 bitdictv={}
|
|
596 for o in old:
|
|
597 if "BTyper Results for " in o:
|
|
598 print >> new, o.strip()
|
|
599 print >> new, ""
|
|
600 if "|" in o and "___" not in o and "rpoB|AT" not in o:
|
|
601 print >> new, o.strip()
|
|
602 elif "|" in o and "___" in o and "rpoB|AT" not in o:
|
|
603 splits=o.split("\t")
|
|
604 gname=splits[1]
|
|
605 gpref=gname.split("___")[0]
|
|
606 gsuff=gname.split("___")[1]
|
|
607 testbits=splits[5]
|
|
608 if "vir" in gpref:
|
|
609 bitdict=bitdictv
|
|
610 elif "amr" in gpref:
|
|
611 bitdict=bitdicta
|
|
612 if gpref.strip() not in bitdict:
|
|
613 bitdict[gpref.strip()]=[]
|
|
614 bitdict[gpref.strip()].append(gsuff.strip())
|
|
615 for s in splits[2:]:
|
|
616 bitdict[gpref.strip()].append(str(s).strip())
|
|
617 else:
|
|
618 maxlist=bitdict[gpref.strip()]
|
|
619 maxbits=maxlist[4]
|
|
620 if float(testbits.strip())>float(maxbits.strip()):
|
|
621 bitdict[gpref.strip()]=[]
|
|
622 bitdict[gpref.strip()].append(gsuff.strip())
|
|
623 for s in splits[2:]:
|
|
624 bitdict[gpref.strip()].append(str(s).strip())
|
|
625
|
|
626 old.close()
|
|
627 dicts=[bitdictv,bitdicta]
|
|
628 if bool(dicts[0])==False and bool(dicts[1])==False:
|
|
629 if varg=="True":
|
|
630 print >> new, "Predicted Virulence Genes:"
|
|
631 print >> new, ""
|
|
632 header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
|
|
633 print >> new, "\t".join([h for h in header])
|
|
634 if amrarg=="True":
|
|
635 print >> new, "Predicted AMR Genes:"
|
|
636 print >> new, ""
|
|
637 header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
|
|
638 print >> new, "\t".join([h for h in header])
|
|
639 else:
|
|
640 for bitdict in dicts:
|
|
641 counter=0
|
|
642 if varg=="True" and bitdict==bitdictv:
|
|
643 print >> new, "Predicted Virulence Genes:"
|
|
644 print >> new, ""
|
|
645 header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
|
|
646 print >> new, "\t".join([h for h in header])
|
|
647 elif amrarg=="True" and bitdict==bitdicta:
|
|
648 print >> new, "Predicted AMR Genes:"
|
|
649 print >> new, ""
|
|
650 header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
|
|
651 print >> new, "\t".join([h for h in header])
|
|
652 for key in bitdict.keys():
|
|
653 counter=counter+1
|
|
654 finalg=bitdict[key]
|
|
655 print >> new, str(counter)+"\t"+"\t".join([str(fg).strip() for fg in finalg[:-2]])
|
|
656 shortgene=finalg[0].split("|")[0]
|
|
657 genefile=open(oarg+"btyper_final_results/genefiles/"+shortgene.strip()+"_genefile.fasta","a")
|
|
658 isoname=f.split("/")[-1]
|
|
659 print >> genefile, ">"+isoname.strip()+"___"+shortgene
|
|
660 print >> genefile, finalg[-1].strip()
|
|
661 genefile.close()
|
|
662 print >> new, ""
|
|
663 new.close()
|
|
664 os.system("rm "+f+"_temporary.txt")
|
|
665
|
|
666
|
|
667 # run program based on user input
|
|
668 if targ!="seq":
|
|
669 os.system("mkdir "+oarg+"spades_assembly")
|
|
670 if targ=="pe":
|
|
671 spacecount=iarg.count(" ")
|
|
672 if spacecount>1:
|
|
673 print "It looks like there may be whitespace in one or more of your file names. Please rename your file(s), and try again."
|
|
674 sys.exit()
|
|
675 elif spacecount==0:
|
|
676 print "It looks like you've chosen the paired-end reads (-t pe) option but have only supplied one file of reads. Please supply one forward reads file and one reverse reads file separated by a space, or select a different option for -t (se for single-end reads, sra for files in sra format, or sra-get to download sequencing data in SRA format)."
|
|
677 sys.exit()
|
|
678 else:
|
|
679 forward_reads=iarg.split(" ")[0]
|
|
680 reverse_reads=iarg.split(" ")[1]
|
|
681 forward_reads=re.sub("[,']","",forward_reads)
|
|
682 reverse_reads=re.sub("[,']","",reverse_reads)
|
|
683 fastq_check(forward_reads)
|
|
684 fastq_check(reverse_reads)
|
|
685 prefix=forward_reads.split(".fastq.gz")[0]
|
|
686 if "/" in prefix:
|
|
687 prefix=prefix.split("/")[-1]
|
|
688 assemble_reads_pe(forward=forward_reads,reverse=reverse_reads)
|
|
689 elif targ=="se":
|
|
690 sreads=iarg.strip()
|
|
691 fastq_check(sreads)
|
|
692 prefix=sreads.split(".fastq.gz")[0]
|
|
693 if "/" in prefix:
|
|
694 prefix=prefix.split("/")[-1]
|
|
695 assemble_reads_se(reads=sreads)
|
|
696 elif targ=="sra-get" or targ=="sra":
|
|
697 sra=iarg.strip()
|
|
698 if targ=="sra-get":
|
|
699 os.system("prefetch -v "+sra)
|
|
700 os.system("fastq-dump --outdir "+oarg+" --split-files --gzip ~/ncbi/public/sra/"+sra+".sra")
|
|
701 elif targ=="sra":
|
|
702 sra=iarg.split(".sra")[0]
|
|
703 os.system("fastq-dump --outdir "+oarg+" --split-files --gzip "+iarg.strip())
|
|
704 forward_reads=oarg+sra.strip()+"_1.fastq.gz"
|
|
705 fastq_check(forward_reads)
|
|
706 prefix=forward_reads.split(".fastq.gz")[0]
|
|
707 if "/" in prefix:
|
|
708 prefix=prefix.split("/")[-1]
|
|
709 if os.path.exists(oarg+sra+"_2.fastq.gz"):
|
|
710 reverse_reads=oarg+sra+"_2.fastq.gz"
|
|
711 fastq_check(reverse_reads)
|
|
712 assemble_reads_pe(forward=forward_reads,reverse=reverse_reads)
|
|
713 else:
|
|
714 assemble_reads_se(reads=forward_reads)
|
|
715 os.system("cp "+oarg+"spades_assembly/contigs.fasta "+oarg+prefix.strip()+"_spades_assembly.fasta")
|
|
716 contigs=open(oarg+prefix.strip()+"_spades_assembly.fasta","r")
|
|
717 pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a")
|
|
718 print >> pseudo, ">"+prefix.strip()
|
|
719 print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line)
|
|
720 contigs.close()
|
|
721 pseudo.close()
|
|
722 targ="seq"
|
|
723 iarg=oarg+prefix.strip()+"_pseudochrom.fasta"
|
|
724
|
|
725
|
|
726
|
|
727
|
|
728 # if the user inputs a fasta file
|
|
729 if targ=="seq":
|
|
730 print "-t seq has been selected. Expecting a fasta file."
|
|
731 # check if the file is a fasta
|
|
732 file_check(iarg)
|
|
733 # if genome is a draft genome
|
|
734 if dgarg=="True":
|
|
735 prefix=iarg.rpartition(".")[0]
|
|
736 if "/" in prefix:
|
|
737 prefix=prefix.split("/")[-1]
|
|
738 contigs=open(iarg,"r")
|
|
739 pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a")
|
|
740 print >> pseudo, ">"+prefix.strip()
|
|
741 print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line)
|
|
742 contigs.close()
|
|
743 pseudo.close()
|
|
744 iarg=oarg+prefix.strip()+"_pseudochrom.fasta"
|
|
745 # create a dictionary from the user-supplied fasta, mapping ids to sequences
|
|
746 dictionaries=seqparse(iarg)
|
|
747
|
|
748 # if performing virulence typing
|
|
749 if varg=="True":
|
|
750 # define database
|
|
751 query_paths=[]
|
|
752 if vdbarg=="nuc":
|
|
753 query_paths.append(btyper_path+"seq_virulence_db/bc_virul_nt.fasta")
|
|
754 if vdbarg=="aa":
|
|
755 query_paths.append(btyper_path+"seq_virulence_db/virul_aa_db.fasta")
|
|
756 for query_path in query_paths:
|
|
757 # map db ids to sequences
|
|
758 mydb=dbparse(query_path)
|
|
759 #task="Predicted Virulence Genes:"
|
|
760 shorttask="virulence"
|
|
761 # define minimum e-value for virulence genes
|
|
762 evalue_thresh=float(earg)
|
|
763 if query_path==btyper_path+"seq_virulence_db/bc_virul_nt.fasta":
|
|
764 task="Predicted Virulence Genes:"
|
|
765 # define minimum percent identity for virulence genes
|
|
766 pident_thresh=int(nucparg)
|
|
767 # define minimum query coverage for virulence genes
|
|
768 qcov_thresh=int(nucqarg)
|
|
769 elif query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta":
|
|
770 task="Predicted Virulence Genes:"
|
|
771 # define minimum percent identity for virulence genes
|
|
772 pident_thresh=int(aaparg)
|
|
773 # define minimum query coverage for virulence genes
|
|
774 qcov_thresh=int(aaqarg)
|
|
775 # run virulence typing
|
|
776 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
777 # print spacing between under final results
|
|
778 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
|
|
779 between_sections(finalfile_string=f)
|
|
780
|
|
781
|
|
782
|
|
783 # if performing amr gene detection
|
|
784 if amrarg=="True":
|
|
785 # define database
|
|
786 query_paths=[]
|
|
787 if amrdb=="megares":
|
|
788 query_paths.append(btyper_path+"seq_amr_db/megares_grouped.fasta")
|
|
789 elif amrdb=="argannot":
|
|
790 query_paths.append(btyper_path+"seq_amr_db/argannot_clus80.fasta")
|
|
791 else:
|
|
792 print "Hmmmm...it looks like your selected database, "+amrdb+", doesn't exist. Please select either argannot to use the ARG-ANNOT database, or megares to use the MEGARes database."
|
|
793 sys.exit()
|
|
794 for query_path in query_paths:
|
|
795 # map db ids to sequences
|
|
796 mydb=dbparse(query_path)
|
|
797 task="Predicted AMR Genes:"
|
|
798 shorttask="amr"
|
|
799 # define minimum e-value for amr genes
|
|
800 evalue_thresh=float(earg)
|
|
801 # define minimum percent identity for amr genes
|
|
802 pident_thresh=int(amrparg)
|
|
803 # define minimum query coverage for amr genes
|
|
804 qcov_thresh=int(amrqarg)
|
|
805 # run amr gene detection
|
|
806 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
807 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
|
|
808 between_sections(finalfile_string=f)
|
|
809
|
|
810
|
|
811
|
|
812 # if performing panC typing:
|
|
813 if parg=="True":
|
|
814 # define database
|
|
815 query_path=btyper_path+"seq_panC_db/panC.fasta"
|
|
816 mydb=dbparse(query_path)
|
|
817 task="Predicted panC Clade Designation:"
|
|
818 shorttask="panC"
|
|
819 evalue_thresh=float(earg)
|
|
820 pident_thresh=0
|
|
821 qcov_thresh=0
|
|
822 # run panC typing, if panC gene is detected
|
|
823 try:
|
|
824 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
825 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
|
|
826 between_sections(finalfile_string=f)
|
|
827 except UnboundLocalError:
|
|
828 print "No sequences found for "+shorttask
|
|
829
|
|
830 # if performing mlst:
|
|
831 if marg=="True":
|
|
832 mlst_genes=["glp.fas","gmk.fas","ilv.fas","pta.fas","pur.fas","pyc.fas","tpi.fas"]
|
|
833 # get best-matching AT for each MLST gene
|
|
834 for mlst in mlst_genes:
|
|
835 query_path=btyper_path+"seq_mlst_db/"+mlst
|
|
836 mydb=dbparse(query_path)
|
|
837 task="Predicted MLST Profile:"
|
|
838 shorttask="mlst"
|
|
839 evalue_thresh=float(earg)
|
|
840 pident_thresh=0
|
|
841 qcov_thresh=0
|
|
842 # run AT for each gene, if sequence deteced
|
|
843 try:
|
|
844 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
845 except UnboundLocalError:
|
|
846 print "No sequences found for "+mlst
|
|
847 # loop through isolatefiles
|
|
848 for root, dirs, files in os.walk(oarg+"btyper_final_results/isolatefiles/"):
|
|
849 for d in dirs:
|
|
850 dirroot=d.split("_results")[0]
|
|
851 # open mlst results file, and get ST from ATs
|
|
852 try:
|
|
853 newf=open(oarg+"btyper_final_results/isolatefiles/"+d+"/"+dirroot.strip()+"_mlst_results.txt","r")
|
|
854 finalfile_string=oarg+"btyper_final_results/"+dirroot.strip()+"_final_results.txt"
|
|
855 ff=open(finalfile_string,"r")
|
|
856 flines=ff.readlines()
|
|
857 if not any("Predicted MLST Profile" in fl.strip() for fl in flines):
|
|
858 get_st(mlst_infile=newf,st_file=btyper_path+"seq_mlst_db/b_cereus_mlst_db.txt",finalfile_string=finalfile_string,mlst_genes=mlst_genes)
|
|
859 except IOError:
|
|
860 print "No sequences found for "+shorttask
|
|
861
|
|
862 # if performing rpoB typing:
|
|
863 if rarg=="True":
|
|
864 # define database
|
|
865 query_path=btyper_path+"seq_rpoB_db/rpobdatabase08122015.fa"
|
|
866 mydb=dbparse(query_path)
|
|
867 task="Predicted rpoB Allelic Type:"
|
|
868 shorttask="rpoB"
|
|
869 evalue_thresh=float(earg)
|
|
870 pident_thresh=0
|
|
871 qcov_thresh=0
|
|
872 # perform rpoB typing, if gene is present
|
|
873 try:
|
|
874 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
875 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
|
|
876 between_sections(finalfile_string=f)
|
|
877 except UnboundLocalError:
|
|
878 print "No sequences found for "+shorttask
|
|
879
|
|
880 # if performing 16s typing:
|
|
881 if sarg=="True":
|
|
882 # define database
|
|
883 query_path=btyper_path+"seq_16s_db/b_cereus_group_16s_db.fasta"
|
|
884 mydb=dbparse(query_path)
|
|
885 task="Predicted 16s Type"
|
|
886 shorttask="16s"
|
|
887 evalue_thresh=float(earg)
|
|
888 pident_thresh=0
|
|
889 qcov_thresh=0
|
|
890 # perform 16s typing, if gene is present
|
|
891 try:
|
|
892 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
|
|
893 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
|
|
894 between_sections(finalfile_string=f)
|
|
895 except UnboundLocalError:
|
|
896 print "No sequences found for "+shorttask
|
|
897
|
|
898 print "Typing complete...how neat is that?"
|
|
899 print ""
|
|
900 print "Thank you for using BTyper! For more fun, take your output files to BMiner, BTyper's companion application for data aggregation and visualization."
|
|
901 print ""
|
|
902 print "To cite BTyper and/or BMiner, please use the following:"
|
|
903 print "Carroll, Laura M., Jasna Kovac, Rachel A. Miller, Martin Wiedmann. 2017. Rapid, high-throughput identification of anthrax-causing and emetic Bacillus cereus group genome assemblies using BTyper, a computational tool for virulence-based classification of Bacillus cereus group isolates using nucleotide sequencing data. Applied and Environmental Microbiology 2017 Jun 16. pii: AEM.01096-17. doi: 10.1128/AEM.01096-17."
|
|
904
|
|
905
|