comparison btyper-2.0.3/btyper @ 5:c3afcb547fee draft

Uploading bytper with chmod 775 permissions
author jpetteng
date Fri, 08 Dec 2017 10:37:52 -0500
parents
children
comparison
equal deleted inserted replaced
4:a9ce62479e43 5:c3afcb547fee
1 #!/usr/bin/env python
2
3 # BTyper version 2.0.3
4 # Created by Laura Carroll
5 # lmc297@cornell.edu
6
7 # import required packages
8 import argparse, sys, os, re, glob, collections
9
10
11 # parse arguments
12 parser = argparse.ArgumentParser(usage='btyper -i </path/to/input/file.extension> -o </path/to/desired/output_directory/> -t [input data format (seq, pe, se, sra, or sra-get)] [-other options]')
13 parser.add_argument('-i','--input', nargs='+', help='Enter the path to the Bacillus cereus group sequence data you would like to input, or enter an SRA accession number',required=True)
14 parser.add_argument('-o', '--output', nargs='+', help='Specify a path to your desired output directory',required=True)
15 parser.add_argument('-t', '--type', nargs='+', help='Specify your type of data: seq for genomes or contigs in fasta format, pe for paired-end Illumina reads, se for single-end Illumina reads, sra for a sra file, or sra-get with an SRA accession number',required=True)
16 parser.add_argument('--draft_genome', action='store_true', default=False, help='Optional argument for use with contigs in fasta format; concatenates draft genome contigs into pseudochromosome')
17 parser.add_argument('-v', '--virulence', nargs='?', default=True, help='Optional argument, True or False; perform virulence gene typing; default=True')
18 parser.add_argument('-m', '--mlst', nargs='?', default=True, help='Optional argument, True or False; perform MLST using Bacillus cereus MLST database; default=True')
19 parser.add_argument('-r', '--rpoB', nargs='?', default=True, help='Optional argument, True or False; perform rpoB typing; default=True')
20 parser.add_argument('-p', '--panC', nargs='?', default=True, help='Optional argument, True or False; perform panC typing; default=True')
21 parser.add_argument('-s', nargs='?', default=False, help='Optional argument, True or False; BLAST 16s DNA sequence; not recommended for inferring species or pathogenicity; default=False')
22 parser.add_argument('--spades_m', nargs='?', default=250, help='Optional argument for use with Illumina reads; integer; set SPAdes memory limit -m/--memory option in Gb; default is 250 Gb, the default for SPAdes')
23 parser.add_argument('--spades_t', nargs='?', default=16, help='Optional argument for use with Illumina reads; integer; set number of threads to use for SPAdes -t/--threads option; default is 16, the default for SPAdes')
24 parser.add_argument('--spades_k', nargs='?', default=77, help='Optional argument for use with Illumina reads; comma-separated list of integers; set k-mer sizes to use with SPAdes -k option; default is 77')
25 parser.add_argument('-v_db', '--virulence_database', nargs='?', default="aa", help='Optional argument for use with -v/--virulence option; specify virulence database to be used: nuc for nucleotide database or aa for amino acid database; default=aa')
26 parser.add_argument('-nuc_p', '--nucleotide_p', nargs='?',default=75, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent nucleotide identity for virulence gene detection; default=75')
27 parser.add_argument('-nuc_q','--nucleotide_q', nargs='?',default=90, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=90')
28 parser.add_argument('-aa_p','--amino_acid_p',nargs='?',default=50, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent amino acid identity for virulence gene detection; default=50')
29 parser.add_argument('-aa_q','--amino_acid_q', nargs='?',default=70, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=70')
30 parser.add_argument('-e', '--evalue', nargs='?', default=1e-5, help='Optional argument; float >= to 0; maximum blast e-value for a hit to be saved; default=1e-5')
31 parser.add_argument('--version', action="version", version='%(prog)s 2.0.3', help="Print version")
32 parser.add_argument('-a', '--amr', nargs='?', default=True, help='Optional argument, True or False; perform antimicrobial resistance gene detection; defaul=True')
33 parser.add_argument('-amr_db', '--amr_database', nargs='?', default="argannot", help='Optional argument for use with -a/--amr option; specify antimicrobial resistance gene database to use: argannot for ARG-ANNOT database or megares for MEGARes database; default=argannot')
34 parser.add_argument('--amr_blast', nargs='?', default="blastn", help='Optional argument for use with -a/--amr option; specify blast algorithm to use: blastn to blast against nucleotide database, or tblastx to blast translated sequence against translated nucleotide database; default=blastn')
35 parser.add_argument('--amr_p',nargs='?',default=75, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent nucleotide identity for AMR gene detection; default=50')
36 parser.add_argument('--amr_q',nargs='?',default=50, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent coverage for AMR gene detection; default=50')
37
38 args=parser.parse_args()
39
40 # check for biopython installation
41 try:
42 from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline, NcbitblastxCommandline
43 from Bio import SeqIO
44 from Bio.Blast import NCBIXML
45 except ImportError:
46 print "Scanning for BioPython installation..."
47 try:
48 cellar=os.popen('brew ls btyper| grep "seq_virulence_db"').read()
49 btyper_path = cellar.split("seq_virulence_db")[0].strip()
50 pysearch=btyper_path.split("/bin/")[0]+"/libexec/lib/python2.7/site-packages/"
51 print pysearch
52 sys.path.append(pysearch.strip())
53 from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline
54 from Bio import SeqIO
55 from Bio.Blast import NCBIXML
56 except (ImportError,OSError) as e:
57 print "Could not find BioPython installed. Exiting BTyper..."
58 sys.exit()
59
60 # path to program btyper.py, as well as databases
61 # check if homebrew is installed
62 if os.path.isfile("/usr/local/Cellar/btyper*/btyper"):
63 cellar=os.popen('brew ls btyper|grep "seq_virulence_db"').read()
64 btyper_path=cellar.split("seq_virulence_db")[0].strip()
65 else:
66 btyper_path = os.path.realpath(__file__)
67 btyper_path = btyper_path.rpartition("/")[0].strip()+"/"
68 print btyper_path
69
70 # split arguments provided by user
71 def arg_splitter(argin):
72 return str(argin).strip("[']")
73
74 iarg=arg_splitter(args.input)
75 oarg=arg_splitter(args.output)
76 targ=arg_splitter(args.type)
77 varg=arg_splitter(args.virulence)
78 marg=arg_splitter(args.mlst)
79 rarg=arg_splitter(args.rpoB)
80 parg=arg_splitter(args.panC)
81 sarg=arg_splitter(args.s)
82 spadesm=arg_splitter(args.spades_m)
83 spadest=arg_splitter(args.spades_t)
84 spadesk=arg_splitter(args.spades_k)
85 dgarg=arg_splitter(args.draft_genome)
86 vdbarg=arg_splitter(args.virulence_database)
87 nucparg=arg_splitter(args.nucleotide_p)
88 nucqarg=arg_splitter(args.nucleotide_q)
89 aaparg=arg_splitter(args.amino_acid_p)
90 aaqarg=arg_splitter(args.amino_acid_q)
91 earg=arg_splitter(args.evalue)
92 amrarg=arg_splitter(args.amr)
93 amrparg=arg_splitter(args.amr_p)
94 amrqarg=arg_splitter(args.amr_q)
95 amrdb=arg_splitter(args.amr_database)
96 amralg=arg_splitter(args.amr_blast)
97
98 # append to output directory argument, if necessary
99 if oarg[-1]!="/":
100 oarg=oarg+"/"
101
102 # between_sections: prints a blank line between sections in final output file, if desried
103 def between_sections(finalfile_string):
104 finalfile=open(finalfile_string,"a")
105 print >> finalfile, ""
106 finalfile.close()
107
108 # check if file exists and isn't empty
109 def file_check(input_file):
110 try:
111 if os.stat(input_file).st_size > 0:
112 print input_file+" exits. Continuing..."
113 return iarg
114 else:
115 print "Your input file {0} is empty! Please use a different input file.".format(input_file)
116 sys.exit()
117 except OSError:
118 print "Your input file {0} does not exist. Please make sure your path and file name are correct, or specify a different input file.".format(input_file)
119 sys.exit()
120
121 # dbparse: creates dictionary mapping virulence gene/mlst/rpoB/panC/16s id names from a given database to their respective sequences
122 def dbparse(db_path):
123 argseq={}
124 infile=open(db_path,"r")
125 for record in SeqIO.parse(infile, "fasta"):
126 seqid=str(record.description).strip()
127 newid=re.sub('[^0-9a-zA-Z]+', '_', seqid)
128 seqseq=str(record.seq).strip()
129 seqlen=len(seqseq)
130 if newid not in argseq.keys():
131 argseq[newid]=seqseq
132 infile.close()
133 return argseq
134
135 # seqparse: creates dictionary that maps query sequence IDs to sequences
136 def seqparse(seq_path):
137 newseq={}
138 newinfile=open(seq_path,"r")
139 for record in SeqIO.parse(newinfile, "fasta"):
140 seqid=str(record.id).strip()
141 newid=re.sub('[^0-9a-zA-Z]+', '_', seqid)
142 seqseq=str(record.seq).strip()
143 reduced_seq=seqseq.replace("N","")
144 if newid not in newseq.keys():
145 newseq[newid]=seqseq
146 newinfile.close()
147 return newseq
148
149 # make_blast_xml: takes sequences, makes blast xml file for each
150 def make_blast_xml(newseq,argdict,query_path,task,shorttask,evalue_thresh,pident_thresh,qcov_thresh):
151 # for each sequence in the newseq dictionary
152 for key in newseq.keys():
153 # count the sequences
154 counter=1
155 # call query sequence queryseq
156 queryseq=newseq[key]
157 # get the length of the queryseq (genelen)
158 genelen=len(queryseq)
159 # create a temporary query file with the gene sequence and id
160 queryfile=open(oarg+key.strip()+"_querytemp.fasta","a")
161 print >> queryfile, ">"+key.strip()
162 print >> queryfile, queryseq
163 queryfile.close()
164 # check if final results directory exists
165 if os.path.isdir(oarg+"btyper_final_results"):
166 print "Final results directory exists."
167 # if not, make one
168 else:
169 os.system("mkdir "+oarg+"btyper_final_results")
170 # check if isolatefiles directory exists within final results directory
171 if not os.path.isdir(oarg+"btyper_final_results/isolatefiles"):
172 os.system("mkdir "+oarg+"btyper_final_results/isolatefiles")
173 rdir_root=oarg+"btyper_final_results/isolatefiles/"+key#.split(".fasta")[0]
174 #print rdir_root
175 # make a results directory for each sequence
176 if not os.path.isdir(rdir_root+"_results"):
177 os.system("mkdir "+rdir_root+"_results")
178 rdir=rdir_root+"_results/"
179 # create a database out of the temporary query file with the individual sequence
180 if os.path.isfile(oarg+key.strip()+"_querytemp.fasta.nsq"):
181 print "Database already exists."
182 else:
183 os.system("makeblastdb -in "+oarg+key.strip()+"_querytemp.fasta -dbtype nucl")
184 out=rdir_root+"_blast_results.xml"
185 # create final file named for each isolate if necessary
186 finalfile_string=key.strip()+"_final_results.txt"
187 if not os.path.isfile(oarg+"btyper_final_results/"+finalfile_string):
188 finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a")
189 print >> finalfile, "BTyper Results for "+os.path.basename(iarg)
190 print >> finalfile, ""
191 finalfile.close()
192 if not os.path.isdir(oarg+"btyper_final_results/genefiles"):
193 os.system("mkdir "+oarg+"btyper_final_results/genefiles")
194 # open the final file for the isolate
195 finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a")
196 # print task-specific header to final output file
197 if shorttask=="virulence":
198 # print the task
199 print >> finalfile, task.strip()
200 print >> finalfile, ""
201 table_header=["Hit #", "Virulence Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"]
202 print >>finalfile, "\t".join([t for t in table_header])
203 elif shorttask=="amr":
204 # print the task
205 print >> finalfile, task.strip()
206 print >> finalfile, ""
207 table_header=["Hit #", "AMR Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"]
208 print >> finalfile, "\t".join([t for t in table_header])
209 elif shorttask=="rpoB":
210 # print the task
211 print >> finalfile, task.strip()
212 print >> finalfile, ""
213 print >> finalfile, "Predicted rpoB Allelic Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage"
214 elif shorttask=="panC":
215 print >> finalfile, task.strip()
216 print >> finalfile, ""
217 table_header=["panC Clade Name","Closest Strain","Percent (%) Identity","Percent (%) Coverage"]
218 print >>finalfile, "\t".join([t for t in table_header])
219 elif shorttask=="16s":
220 print >> finalfile, task.strip()
221 print >> finalfile, ""
222 print >> finalfile, "Predicted 16s Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage"
223 # close the final file for the isolate
224 finalfile.close()
225 # run blast with query=database of sequences, db = querytemp.fasta with single isolate sequence, out = outdir, outfmt = xml
226 if query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta":
227 cline=NcbitblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
228 elif amrarg=="True" and amralg=="tblastx" and shorttask=="amr":
229 cline=NcbitblastxCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
230 else:
231 cline=NcbiblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
232 stdout, stderr = cline()
233 #remove temporary temporary fasta files
234 os.system("rm "+oarg+key.strip()+"_querytemp.fasta*")
235 # parse blast xml files according to task
236 if shorttask=="virulence" or shorttask=="amr":
237 # parse_blast_xml_seq: parses blast xml file for virulence task (getting a list of all genes present in a sequence)
238 parse_blast_xml_seq(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
239 else:
240 # parse_panC: parses blast xml for non-virulence tasks (getting the "top hit", rather than a list of all genes)
241 parse_panC(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
242
243 # dictionary_add: create a new entry for a dictionary, or append if it already exists as a key
244 def dictionary_add(dict_name,key,blast):
245 if key not in dict_name.keys():
246 dict_name[key]=[]
247 dict_name[key].append(blast)
248 else:
249 dict_name[key].append(blast)
250
251 # print_virulence_gene_report: prints isolatefiles and genefiles
252 def print_virulence_gene_report(filtres,maxgene, genequery,key,rdir,shorttask):
253 print >> filtres, "\t".join([str(m).strip() for m in maxgene])
254 genequery=str(genequery).strip()
255 if shorttask=="virulence" or shorttask=="amr":
256 genequery=genequery.split("|")[0]
257 else:
258 genequery=re.sub('[^0-9a-zA-Z]+', '_', genequery)
259 genequery=genequery.strip()
260 if shorttask=="virulence" or shorttask=="amr":
261 genequery_short=genequery
262 elif shorttask=="panC":
263 genequery_short="panC"
264 elif shorttask=="mlst":
265 genequery_short=genequery.split("_")[0]
266 elif shorttask=="rpoB":
267 genequery_short="rpoB"
268 elif shorttask=="16s":
269 genequery_short="16s"
270 # print to genefile (genefile=1 per gene, with all isolates' sequences)
271 if shorttask!="virulence" and shorttask!="amr":
272 genefile=open(oarg+"btyper_final_results/genefiles/"+genequery_short+"_genefile.fasta","a")
273 print >> genefile, ">"+key+"___"+genequery
274 print >> genefile, str(maxgene[14]).strip()
275 genefile.close()
276 # print to isolatefile (isolatefile=1 per isolate, with all genes detected per isolate)
277 isolatefile=open(rdir+key+"_"+shorttask+"_sequences.fasta","a")
278 print >> isolatefile, ">"+genequery+"___"+key
279 print >> isolatefile, str(maxgene[14]).strip()
280 isolatefile.close()
281
282 # print_final_report: print to the final report file i.e. BTyper's final output file
283 def print_final_report(finalfile_string, shorttask, maxgene):
284 finalout=open(oarg+"btyper_final_results/"+finalfile_string,"a")
285 if shorttask=="virulence" or shorttask=="amr":
286 gene_row=[maxgene[0],maxgene[3],maxgene[5],maxgene[8],maxgene[7],maxgene[6],maxgene[14]]
287 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
288 elif shorttask=="rpoB":
289 rpotype=maxgene[3]
290 rpotype=str(rpotype).replace("_","|")
291 if float(maxgene[8])<100:
292 pid=str(maxgene[8]).strip()+"*"
293 else:
294 pid=str(maxgene[8]).strip()
295 if float(maxgene[7])<100:
296 qid=str(maxgene[7]).strip()+"*"
297 else:
298 qid=str(maxgene[7]).strip()
299 if "*" in pid or "*" in qid:
300 rpotype=rpotype.strip()+"*"
301 gene_row=[rpotype,pid,qid]
302 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
303 if "*" in rpotype:
304 print >> finalout, "*No rpoB allele in the current database matches with 100% identity and coverage."
305 elif shorttask=="panC":
306 pan=str(maxgene[3]).strip()
307 panpid=maxgene[8]
308 panqid=maxgene[7]
309 if float(panpid.strip())<75:
310 pan="None*___None"
311 if float(panpid.strip())<90 and float(panpid.strip())>75:
312 pan="?*___?"
313 pan1=pan.split("___")[0]
314 pan2=pan.split("___")[1]
315 print >> finalout, pan1.strip()+"\t"+pan2.strip()+"\t"+panpid.strip()+"\t"+panqid.strip()
316 if pan.strip()=="None*___None":
317 print >> finalout, "*No panC gene could be detected in your sequence at > 75% identity. Your panC allele may not be significantly associated with the Bacillus cereus group."
318 if pan.strip()=="?*___?":
319 print >> finalout, "*A panC clade could not be determined for your isolate."
320 elif shorttask=="16s":
321 stype=maxgene[3]
322 if float(maxgene[8])<97:
323 pid=str(maxgene[8]).strip()+"*"
324 else:
325 pid=str(maxgene[8]).strip()
326 if float(maxgene[7])<100:
327 qid=str(maxgene[7]).strip()+"*"
328 else:
329 qid=str(maxgene[7]).strip()
330 if "*" in pid or "*" in qid:
331 stype=stype.strip()+"*"
332 gene_row=[stype,pid,qid]
333 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
334 if "*" in stype:
335 print >> finalout, "*No 16s gene in the current database matches with >97% identity and/or 100% coverage. Your isolate may not be a member of the Bacillus cereus group."
336 finalout.close()
337
338 # get_st: parse mlst output for final report file; gives ST from database of STs and ATs
339 def get_st(mlst_infile, st_file, finalfile_string,mlst_genes):
340 mlstdict={}
341 lowconf=[]
342 for line in mlst_infile:
343 splits=line.split("\t")
344 if targ:
345 if "alignment_title" not in line:
346 alleleid=splits[3]
347 geneid=str(alleleid).split("_")[0]
348 at=str(alleleid).split("_")[1]
349 mlstdict[str(geneid).strip()]=str(at).strip()
350 pid=splits[8]
351 qid=splits[7]
352 if float(pid.strip())<100 or float(qid.strip())<100:
353 lowconf.append(geneid.strip())
354
355 mlst_infile.close()
356 stdict={}
357 stfile=open(st_file,"r")
358 for line in stfile:
359 splits=line.split("\t")
360 st=splits[0]
361 all_at=splits[1:8]
362 dictionary_add(stdict,str(st).strip(),[a.strip() for a in all_at])
363 stfile.close()
364 st_order=[s for i in stdict["ST"] for s in i]
365 myst=[]
366 for s in st_order:
367 if s in mlstdict.keys():
368 myst.append(mlstdict[s])
369 else:
370 myst.append("?")
371
372
373 finalfile=open(finalfile_string,"a")
374 print >> finalfile, "Predicted MLST Profile:"
375 print >> finalfile, ""
376 print >> finalfile, "ST"+"\t"+"\t".join([str(mg).split(".fas")[0] for mg in mlst_genes])
377 if "?" in myst:
378 finalmm=[]
379 for mm,tt in zip(myst,mlst_genes):
380 testmlst=tt.split(".fas")[0]
381 if testmlst.strip() in lowconf:
382 finalmm.append(mm+"*")
383 else:
384 finalmm.append(mm)
385 finalfile=open(finalfile_string,"a")
386 if "*" in str(finalmm):
387 print >> finalfile, "?*"+"\t"+"\t".join([str(m).strip() for m in finalmm])
388 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
389 else:
390 print >> finalfile, "?"+"\t"+"\t".join([str(m).strip() for m in finalmm])
391 print >> finalfile, ""
392 finalfile.close()
393 else:
394 confirmst=0
395 for stkey in stdict.keys():
396 st_match2=[s for i in stdict[stkey] for s in i]
397 if myst==st_match2:
398 confirmst=1
399 finalmm=[]
400 print "Found matching ST."
401 for mm,tt in zip(myst,mlst_genes):
402 testmlst=tt.split(".fas")[0]
403 if testmlst.strip() in lowconf:
404 finalmm.append(mm+"*")
405 else:
406 finalmm.append(mm)
407
408 if "*" in str(finalmm):
409 stkey=stkey.strip()+"*"
410 finalfile=open(finalfile_string,"a")
411 print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm])
412 if "*" in stkey:
413 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
414 print >> finalfile, ""
415 finalfile.close()
416 if confirmst==0:
417 finalmm=[]
418 for mm,tt in zip(myst,mlst_genes):
419 testmlst=tt.split(".fas")[0]
420 if testmlst.strip() in lowconf:
421 finalmm.append(mm+"*")
422 else:
423 finalmm.append(mm)
424 if "*" in str(finalmm):
425 stkey="?*"
426 else:
427 stkey="?"
428 finalfile=open(finalfile_string,"a")
429 print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm])
430 if "*" in stkey:
431 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
432 print >> finalfile, ""
433 finalfile.close()
434
435
436
437
438 # parse_blast_xml_seq: parse the blast xml output into results file
439 def parse_blast_xml_seq(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh):
440 result_handle=open(xml)
441 blast_records=NCBIXML.parse(result_handle)
442 # open file in individual isolate results directory
443 filtres=open(rdir+key+"_"+shorttask+"_results.txt","a")
444 # print the header to the results file
445 headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"]
446 print >> filtres, "\t".join([h for h in headerlist])
447 # loop through each record in the blast output
448 for record in blast_records:
449 for alignment in record.alignments:
450 # create a dictionary for HSPs
451 hspdict={}
452 # for each hsp in the alignment
453 for hsp in alignment.hsps:
454 #query coverage (qcov) can be calculated as one of the following:
455 #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query)))
456 qcov=float(hsp.align_length)/float(record.query_length)*100
457 pident=float(hsp.identities)/float(hsp.align_length)*100
458 # add information for each HSP if the e-value is lower than the threshold
459 if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and pident>=pident_thresh:
460 genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match]
461 dictionary_add(hspdict,str(record.query).strip(),"\t".join([str(g).strip() for g in genefacts]))
462 # add the start and end positions of the hsp to hspdict
463 # loop through each hsp in the alignment
464 for hkey in hspdict.keys():
465 # get values (start and end locations of each hsp, bit score)
466 hsps=hspdict[hkey]
467 # if more than 1 hsp exists (i.e. there is a chance that overlapping hsps might exist)
468 if len(hsps)>1:
469 # create list to check for overlapping HSPs
470 allnums=[]
471 # loop through each HSP
472 for h in hsps:
473 hsplits=h.split("\t")
474 # get genome start position
475 gstart=hsplits[11]
476 gstart=int(gstart.strip())
477 # get genome end position
478 gend=hsplits[12]
479 gend=int(gend.strip())
480 if gstart<gend:
481 # append sequence of base numbers covered by HSP to list
482 allnums.append(range(gstart,gend+1,1))
483 else:
484 allnums.append(range(gend, gstart+1,1))
485 # get the intersection of genome positions when more than one HSP exists in the genome
486 inset=set.intersection(*map(set,allnums))
487 # if an intersection exists (i.e. there are overlapping HSPs)
488 if inset:
489 # loop through HSPs, keeping track of HSP with highest bit score
490 maxbits=0
491 for h in hsps:
492 hsplits=h.split("\t")
493 # get bit score of current HSP
494 testbits=hsplits[6]
495 # if bit score of current HSP is larger than current maximum bit score:
496 if float(testbits.strip())>maxbits:
497 # the current bit score becomes the max
498 maxbits=float(testbits.strip())
499 # the current dictionary information becomes the best gene information
500 maxgene=hsplits
501 # the current gene query becomes the best gene query
502 genequery=maxgene[3]
503 # print the best-scoring HSP to the isolatefiles and genefiles
504 counter=counter+1
505 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
506 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
507 # if no intersection exists, i.e. there are multiple HSPs, but they are found in different regions of the genome:
508 else:
509 # loop through each nonoverlapping HSP
510 for h in hsps:
511 maxgene=h.split("\t")
512 genequery=maxgene[3]
513 # print each HSP to the appropriate isolatefile/genefile
514 counter=counter+1
515 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
516 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
517
518 # if there is only 1 HSP
519 else:
520 for h in hsps:
521 maxgene=h.split("\t")
522 genequery=maxgene[3]
523 #print it to the appropriate isolatefile/genefile
524 counter=counter+1
525 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
526 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
527 filtres.close()
528 if shorttask=="virulence" and amrarg=="False":
529 prune_alleles(finalpath=finalfile_string)
530 elif shorttask=="amr":
531 prune_alleles(finalpath=finalfile_string)
532
533 # parse_panC: parse blast output for panC task
534 def parse_panC(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh):
535 counter=0
536 result_handle=open(xml)
537 blast_records=NCBIXML.parse(result_handle)
538 # open file in individual isolate results directory
539 filtres=open(rdir+key+"_"+shorttask+"_results.txt","a")
540 # print the header to the results file
541 headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"]
542 print >> filtres, "\t".join([h for h in headerlist])
543 # loop through each record in the blast output
544 maxp=0
545 for record in blast_records:
546 for alignment in record.alignments:
547 # for each hsp in the alignment
548 for hsp in alignment.hsps:
549 #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query)))
550 qcov=float(hsp.align_length)/float(record.query_length)*100
551 pident=float(hsp.identities)/float(hsp.align_length)*100
552 testbits=float(hsp.bits)
553 # add information for each HSP if the e-value is lower than the threshold
554 if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and testbits>=pident_thresh and testbits>maxp:
555 maxp=testbits
556 genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match]
557 genefacts="\t".join([str(g).strip() for g in genefacts])
558 maxgene=genefacts.split("\t")
559 genequery=maxgene[3]
560 #print it to the appropriate isolatefile/genefile
561 counter=counter+1
562 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
563 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
564 filtres.close()
565
566 # assemble_reads: assemble genome from reads
567 def assemble_reads_pe(forward,reverse):
568 os.system("spades.py -k "+spadesk+" --careful -1 "+forward+" -2 "+reverse+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm)
569 def assemble_reads_se(reads):
570 os.system("spades.py -k "+spadesk+" --careful -s "+reads+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm)
571
572 def fastq_check(reads):
573 if reads.endswith(".fastq"):
574 os.system("gzip reads")
575 reads=reads.strip()+".gz"
576 if reads.endswith(".fastq.gz"):
577 return()
578 else:
579 print "It looks like your file "+reads.strip()+" does not end with either '.fastq.gz' or '.fastq'. Please make sure that you are supplying reads in fastq or fastq.gz format, and, if necessary, please rename your file with the appropriate extension."
580 sys.exit()
581
582 def sra_check(sra):
583 if sra.endswith(".sra"):
584 return()
585 else:
586 print "It looks like your file "+reads.strip()+" does not end with '.sra'. Please make sure that you are supplying sequencing data in sra format, and, if necessary, please rename your file with the extension '.sra'."
587
588 def prune_alleles(finalpath):
589 f=oarg+"btyper_final_results/"+finalpath
590 print "Pruning alleles for "+f+"..."
591 os.system("mv "+f+" "+f+"_temporary.txt")
592 old=open(f+"_temporary.txt","r")
593 new=open(f,"a")
594 bitdicta={}
595 bitdictv={}
596 for o in old:
597 if "BTyper Results for " in o:
598 print >> new, o.strip()
599 print >> new, ""
600 if "|" in o and "___" not in o and "rpoB|AT" not in o:
601 print >> new, o.strip()
602 elif "|" in o and "___" in o and "rpoB|AT" not in o:
603 splits=o.split("\t")
604 gname=splits[1]
605 gpref=gname.split("___")[0]
606 gsuff=gname.split("___")[1]
607 testbits=splits[5]
608 if "vir" in gpref:
609 bitdict=bitdictv
610 elif "amr" in gpref:
611 bitdict=bitdicta
612 if gpref.strip() not in bitdict:
613 bitdict[gpref.strip()]=[]
614 bitdict[gpref.strip()].append(gsuff.strip())
615 for s in splits[2:]:
616 bitdict[gpref.strip()].append(str(s).strip())
617 else:
618 maxlist=bitdict[gpref.strip()]
619 maxbits=maxlist[4]
620 if float(testbits.strip())>float(maxbits.strip()):
621 bitdict[gpref.strip()]=[]
622 bitdict[gpref.strip()].append(gsuff.strip())
623 for s in splits[2:]:
624 bitdict[gpref.strip()].append(str(s).strip())
625
626 old.close()
627 dicts=[bitdictv,bitdicta]
628 if bool(dicts[0])==False and bool(dicts[1])==False:
629 if varg=="True":
630 print >> new, "Predicted Virulence Genes:"
631 print >> new, ""
632 header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
633 print >> new, "\t".join([h for h in header])
634 if amrarg=="True":
635 print >> new, "Predicted AMR Genes:"
636 print >> new, ""
637 header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
638 print >> new, "\t".join([h for h in header])
639 else:
640 for bitdict in dicts:
641 counter=0
642 if varg=="True" and bitdict==bitdictv:
643 print >> new, "Predicted Virulence Genes:"
644 print >> new, ""
645 header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
646 print >> new, "\t".join([h for h in header])
647 elif amrarg=="True" and bitdict==bitdicta:
648 print >> new, "Predicted AMR Genes:"
649 print >> new, ""
650 header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
651 print >> new, "\t".join([h for h in header])
652 for key in bitdict.keys():
653 counter=counter+1
654 finalg=bitdict[key]
655 print >> new, str(counter)+"\t"+"\t".join([str(fg).strip() for fg in finalg[:-2]])
656 shortgene=finalg[0].split("|")[0]
657 genefile=open(oarg+"btyper_final_results/genefiles/"+shortgene.strip()+"_genefile.fasta","a")
658 isoname=f.split("/")[-1]
659 print >> genefile, ">"+isoname.strip()+"___"+shortgene
660 print >> genefile, finalg[-1].strip()
661 genefile.close()
662 print >> new, ""
663 new.close()
664 os.system("rm "+f+"_temporary.txt")
665
666
667 # run program based on user input
668 if targ!="seq":
669 os.system("mkdir "+oarg+"spades_assembly")
670 if targ=="pe":
671 spacecount=iarg.count(" ")
672 if spacecount>1:
673 print "It looks like there may be whitespace in one or more of your file names. Please rename your file(s), and try again."
674 sys.exit()
675 elif spacecount==0:
676 print "It looks like you've chosen the paired-end reads (-t pe) option but have only supplied one file of reads. Please supply one forward reads file and one reverse reads file separated by a space, or select a different option for -t (se for single-end reads, sra for files in sra format, or sra-get to download sequencing data in SRA format)."
677 sys.exit()
678 else:
679 forward_reads=iarg.split(" ")[0]
680 reverse_reads=iarg.split(" ")[1]
681 forward_reads=re.sub("[,']","",forward_reads)
682 reverse_reads=re.sub("[,']","",reverse_reads)
683 fastq_check(forward_reads)
684 fastq_check(reverse_reads)
685 prefix=forward_reads.split(".fastq.gz")[0]
686 if "/" in prefix:
687 prefix=prefix.split("/")[-1]
688 assemble_reads_pe(forward=forward_reads,reverse=reverse_reads)
689 elif targ=="se":
690 sreads=iarg.strip()
691 fastq_check(sreads)
692 prefix=sreads.split(".fastq.gz")[0]
693 if "/" in prefix:
694 prefix=prefix.split("/")[-1]
695 assemble_reads_se(reads=sreads)
696 elif targ=="sra-get" or targ=="sra":
697 sra=iarg.strip()
698 if targ=="sra-get":
699 os.system("prefetch -v "+sra)
700 os.system("fastq-dump --outdir "+oarg+" --split-files --gzip ~/ncbi/public/sra/"+sra+".sra")
701 elif targ=="sra":
702 sra=iarg.split(".sra")[0]
703 os.system("fastq-dump --outdir "+oarg+" --split-files --gzip "+iarg.strip())
704 forward_reads=oarg+sra.strip()+"_1.fastq.gz"
705 fastq_check(forward_reads)
706 prefix=forward_reads.split(".fastq.gz")[0]
707 if "/" in prefix:
708 prefix=prefix.split("/")[-1]
709 if os.path.exists(oarg+sra+"_2.fastq.gz"):
710 reverse_reads=oarg+sra+"_2.fastq.gz"
711 fastq_check(reverse_reads)
712 assemble_reads_pe(forward=forward_reads,reverse=reverse_reads)
713 else:
714 assemble_reads_se(reads=forward_reads)
715 os.system("cp "+oarg+"spades_assembly/contigs.fasta "+oarg+prefix.strip()+"_spades_assembly.fasta")
716 contigs=open(oarg+prefix.strip()+"_spades_assembly.fasta","r")
717 pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a")
718 print >> pseudo, ">"+prefix.strip()
719 print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line)
720 contigs.close()
721 pseudo.close()
722 targ="seq"
723 iarg=oarg+prefix.strip()+"_pseudochrom.fasta"
724
725
726
727
728 # if the user inputs a fasta file
729 if targ=="seq":
730 print "-t seq has been selected. Expecting a fasta file."
731 # check if the file is a fasta
732 file_check(iarg)
733 # if genome is a draft genome
734 if dgarg=="True":
735 prefix=iarg.rpartition(".")[0]
736 if "/" in prefix:
737 prefix=prefix.split("/")[-1]
738 contigs=open(iarg,"r")
739 pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a")
740 print >> pseudo, ">"+prefix.strip()
741 print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line)
742 contigs.close()
743 pseudo.close()
744 iarg=oarg+prefix.strip()+"_pseudochrom.fasta"
745 # create a dictionary from the user-supplied fasta, mapping ids to sequences
746 dictionaries=seqparse(iarg)
747
748 # if performing virulence typing
749 if varg=="True":
750 # define database
751 query_paths=[]
752 if vdbarg=="nuc":
753 query_paths.append(btyper_path+"seq_virulence_db/bc_virul_nt.fasta")
754 if vdbarg=="aa":
755 query_paths.append(btyper_path+"seq_virulence_db/virul_aa_db.fasta")
756 for query_path in query_paths:
757 # map db ids to sequences
758 mydb=dbparse(query_path)
759 #task="Predicted Virulence Genes:"
760 shorttask="virulence"
761 # define minimum e-value for virulence genes
762 evalue_thresh=float(earg)
763 if query_path==btyper_path+"seq_virulence_db/bc_virul_nt.fasta":
764 task="Predicted Virulence Genes:"
765 # define minimum percent identity for virulence genes
766 pident_thresh=int(nucparg)
767 # define minimum query coverage for virulence genes
768 qcov_thresh=int(nucqarg)
769 elif query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta":
770 task="Predicted Virulence Genes:"
771 # define minimum percent identity for virulence genes
772 pident_thresh=int(aaparg)
773 # define minimum query coverage for virulence genes
774 qcov_thresh=int(aaqarg)
775 # run virulence typing
776 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
777 # print spacing between under final results
778 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
779 between_sections(finalfile_string=f)
780
781
782
783 # if performing amr gene detection
784 if amrarg=="True":
785 # define database
786 query_paths=[]
787 if amrdb=="megares":
788 query_paths.append(btyper_path+"seq_amr_db/megares_grouped.fasta")
789 elif amrdb=="argannot":
790 query_paths.append(btyper_path+"seq_amr_db/argannot_clus80.fasta")
791 else:
792 print "Hmmmm...it looks like your selected database, "+amrdb+", doesn't exist. Please select either argannot to use the ARG-ANNOT database, or megares to use the MEGARes database."
793 sys.exit()
794 for query_path in query_paths:
795 # map db ids to sequences
796 mydb=dbparse(query_path)
797 task="Predicted AMR Genes:"
798 shorttask="amr"
799 # define minimum e-value for amr genes
800 evalue_thresh=float(earg)
801 # define minimum percent identity for amr genes
802 pident_thresh=int(amrparg)
803 # define minimum query coverage for amr genes
804 qcov_thresh=int(amrqarg)
805 # run amr gene detection
806 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
807 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
808 between_sections(finalfile_string=f)
809
810
811
812 # if performing panC typing:
813 if parg=="True":
814 # define database
815 query_path=btyper_path+"seq_panC_db/panC.fasta"
816 mydb=dbparse(query_path)
817 task="Predicted panC Clade Designation:"
818 shorttask="panC"
819 evalue_thresh=float(earg)
820 pident_thresh=0
821 qcov_thresh=0
822 # run panC typing, if panC gene is detected
823 try:
824 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
825 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
826 between_sections(finalfile_string=f)
827 except UnboundLocalError:
828 print "No sequences found for "+shorttask
829
830 # if performing mlst:
831 if marg=="True":
832 mlst_genes=["glp.fas","gmk.fas","ilv.fas","pta.fas","pur.fas","pyc.fas","tpi.fas"]
833 # get best-matching AT for each MLST gene
834 for mlst in mlst_genes:
835 query_path=btyper_path+"seq_mlst_db/"+mlst
836 mydb=dbparse(query_path)
837 task="Predicted MLST Profile:"
838 shorttask="mlst"
839 evalue_thresh=float(earg)
840 pident_thresh=0
841 qcov_thresh=0
842 # run AT for each gene, if sequence deteced
843 try:
844 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
845 except UnboundLocalError:
846 print "No sequences found for "+mlst
847 # loop through isolatefiles
848 for root, dirs, files in os.walk(oarg+"btyper_final_results/isolatefiles/"):
849 for d in dirs:
850 dirroot=d.split("_results")[0]
851 # open mlst results file, and get ST from ATs
852 try:
853 newf=open(oarg+"btyper_final_results/isolatefiles/"+d+"/"+dirroot.strip()+"_mlst_results.txt","r")
854 finalfile_string=oarg+"btyper_final_results/"+dirroot.strip()+"_final_results.txt"
855 ff=open(finalfile_string,"r")
856 flines=ff.readlines()
857 if not any("Predicted MLST Profile" in fl.strip() for fl in flines):
858 get_st(mlst_infile=newf,st_file=btyper_path+"seq_mlst_db/b_cereus_mlst_db.txt",finalfile_string=finalfile_string,mlst_genes=mlst_genes)
859 except IOError:
860 print "No sequences found for "+shorttask
861
862 # if performing rpoB typing:
863 if rarg=="True":
864 # define database
865 query_path=btyper_path+"seq_rpoB_db/rpobdatabase08122015.fa"
866 mydb=dbparse(query_path)
867 task="Predicted rpoB Allelic Type:"
868 shorttask="rpoB"
869 evalue_thresh=float(earg)
870 pident_thresh=0
871 qcov_thresh=0
872 # perform rpoB typing, if gene is present
873 try:
874 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
875 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
876 between_sections(finalfile_string=f)
877 except UnboundLocalError:
878 print "No sequences found for "+shorttask
879
880 # if performing 16s typing:
881 if sarg=="True":
882 # define database
883 query_path=btyper_path+"seq_16s_db/b_cereus_group_16s_db.fasta"
884 mydb=dbparse(query_path)
885 task="Predicted 16s Type"
886 shorttask="16s"
887 evalue_thresh=float(earg)
888 pident_thresh=0
889 qcov_thresh=0
890 # perform 16s typing, if gene is present
891 try:
892 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
893 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
894 between_sections(finalfile_string=f)
895 except UnboundLocalError:
896 print "No sequences found for "+shorttask
897
898 print "Typing complete...how neat is that?"
899 print ""
900 print "Thank you for using BTyper! For more fun, take your output files to BMiner, BTyper's companion application for data aggregation and visualization."
901 print ""
902 print "To cite BTyper and/or BMiner, please use the following:"
903 print "Carroll, Laura M., Jasna Kovac, Rachel A. Miller, Martin Wiedmann. 2017. Rapid, high-throughput identification of anthrax-causing and emetic Bacillus cereus group genome assemblies using BTyper, a computational tool for virulence-based classification of Bacillus cereus group isolates using nucleotide sequencing data. Applied and Environmental Microbiology 2017 Jun 16. pii: AEM.01096-17. doi: 10.1128/AEM.01096-17."
904
905