annotate btyper-2.0.3/btyper @ 7:d408ad12401a draft

Uploading GitHub version of btyper
author estrain
date Fri, 08 Dec 2017 11:39:12 -0500
parents 46ebe89f01ea
children ccf77a06a46e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/env python
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
2
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
3 # BTyper version 2.0.3
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
4 # Created by Laura Carroll
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
5 # lmc297@cornell.edu
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
6
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
7 # import required packages
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
8 import argparse, sys, os, re, glob, collections
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
9
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
10
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
11 # parse arguments
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
12 parser = argparse.ArgumentParser(usage='btyper -i </path/to/input/file.extension> -o </path/to/desired/output_directory/> -t [input data format (seq, pe, se, sra, or sra-get)] [-other options]')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
13 parser.add_argument('-i','--input', nargs='+', help='Enter the path to the Bacillus cereus group sequence data you would like to input, or enter an SRA accession number',required=True)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
14 parser.add_argument('-o', '--output', nargs='+', help='Specify a path to your desired output directory',required=True)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
15 parser.add_argument('-t', '--type', nargs='+', help='Specify your type of data: seq for genomes or contigs in fasta format, pe for paired-end Illumina reads, se for single-end Illumina reads, sra for a sra file, or sra-get with an SRA accession number',required=True)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
16 parser.add_argument('--draft_genome', action='store_true', default=False, help='Optional argument for use with contigs in fasta format; concatenates draft genome contigs into pseudochromosome')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
17 parser.add_argument('-v', '--virulence', nargs='?', default=True, help='Optional argument, True or False; perform virulence gene typing; default=True')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
18 parser.add_argument('-m', '--mlst', nargs='?', default=True, help='Optional argument, True or False; perform MLST using Bacillus cereus MLST database; default=True')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
19 parser.add_argument('-r', '--rpoB', nargs='?', default=True, help='Optional argument, True or False; perform rpoB typing; default=True')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
20 parser.add_argument('-p', '--panC', nargs='?', default=True, help='Optional argument, True or False; perform panC typing; default=True')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
21 parser.add_argument('-s', nargs='?', default=False, help='Optional argument, True or False; BLAST 16s DNA sequence; not recommended for inferring species or pathogenicity; default=False')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
22 parser.add_argument('--spades_m', nargs='?', default=250, help='Optional argument for use with Illumina reads; integer; set SPAdes memory limit -m/--memory option in Gb; default is 250 Gb, the default for SPAdes')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
23 parser.add_argument('--spades_t', nargs='?', default=16, help='Optional argument for use with Illumina reads; integer; set number of threads to use for SPAdes -t/--threads option; default is 16, the default for SPAdes')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
24 parser.add_argument('--spades_k', nargs='?', default=77, help='Optional argument for use with Illumina reads; comma-separated list of integers; set k-mer sizes to use with SPAdes -k option; default is 77')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
25 parser.add_argument('-v_db', '--virulence_database', nargs='?', default="aa", help='Optional argument for use with -v/--virulence option; specify virulence database to be used: nuc for nucleotide database or aa for amino acid database; default=aa')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
26 parser.add_argument('-nuc_p', '--nucleotide_p', nargs='?',default=75, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent nucleotide identity for virulence gene detection; default=75')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
27 parser.add_argument('-nuc_q','--nucleotide_q', nargs='?',default=90, help='Optional argument for use with -v/--virulence option and nucleotide database -v_db nuc option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=90')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
28 parser.add_argument('-aa_p','--amino_acid_p',nargs='?',default=50, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent amino acid identity for virulence gene detection; default=50')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
29 parser.add_argument('-aa_q','--amino_acid_q', nargs='?',default=70, help='Optional argument for use with -v/--virulence option and amino acid database -v_db aa option; integer between 0 and 100; minimum percent coverage for virulence gene detection; default=70')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
30 parser.add_argument('-e', '--evalue', nargs='?', default=1e-5, help='Optional argument; float >= to 0; maximum blast e-value for a hit to be saved; default=1e-5')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
31 parser.add_argument('--version', action="version", version='%(prog)s 2.0.3', help="Print version")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
32 parser.add_argument('-a', '--amr', nargs='?', default=True, help='Optional argument, True or False; perform antimicrobial resistance gene detection; defaul=True')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
33 parser.add_argument('-amr_db', '--amr_database', nargs='?', default="argannot", help='Optional argument for use with -a/--amr option; specify antimicrobial resistance gene database to use: argannot for ARG-ANNOT database or megares for MEGARes database; default=argannot')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
34 parser.add_argument('--amr_blast', nargs='?', default="blastn", help='Optional argument for use with -a/--amr option; specify blast algorithm to use: blastn to blast against nucleotide database, or tblastx to blast translated sequence against translated nucleotide database; default=blastn')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
35 parser.add_argument('--amr_p',nargs='?',default=75, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent nucleotide identity for AMR gene detection; default=50')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
36 parser.add_argument('--amr_q',nargs='?',default=50, help='Optional argument for use with -a/--amr option; integer between 0 and 100; minimum percent coverage for AMR gene detection; default=50')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
37
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
38 args=parser.parse_args()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
39
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
40 # check for biopython installation
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
41 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
42 from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline, NcbitblastxCommandline
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
43 from Bio import SeqIO
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
44 from Bio.Blast import NCBIXML
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
45 except ImportError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
46 print "Scanning for BioPython installation..."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
47 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
48 cellar=os.popen('brew ls btyper| grep "seq_virulence_db"').read()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
49 btyper_path = cellar.split("seq_virulence_db")[0].strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
50 pysearch=btyper_path.split("/bin/")[0]+"/libexec/lib/python2.7/site-packages/"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
51 print pysearch
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
52 sys.path.append(pysearch.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
53 from Bio.Blast.Applications import NcbiblastnCommandline, NcbitblastnCommandline
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
54 from Bio import SeqIO
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
55 from Bio.Blast import NCBIXML
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
56 except (ImportError,OSError) as e:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
57 print "Could not find BioPython installed. Exiting BTyper..."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
58 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
59
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
60 # path to program btyper.py, as well as databases
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
61 # check if homebrew is installed
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
62 if os.path.isfile("/usr/local/Cellar/btyper*/btyper"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
63 cellar=os.popen('brew ls btyper|grep "seq_virulence_db"').read()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
64 btyper_path=cellar.split("seq_virulence_db")[0].strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
65 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
66 btyper_path = os.path.realpath(__file__)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
67 btyper_path = btyper_path.rpartition("/")[0].strip()+"/"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
68 print btyper_path
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
69
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
70 # split arguments provided by user
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
71 def arg_splitter(argin):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
72 return str(argin).strip("[']")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
73
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
74 iarg=arg_splitter(args.input)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
75 oarg=arg_splitter(args.output)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
76 targ=arg_splitter(args.type)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
77 varg=arg_splitter(args.virulence)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
78 marg=arg_splitter(args.mlst)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
79 rarg=arg_splitter(args.rpoB)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
80 parg=arg_splitter(args.panC)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
81 sarg=arg_splitter(args.s)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
82 spadesm=arg_splitter(args.spades_m)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
83 spadest=arg_splitter(args.spades_t)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
84 spadesk=arg_splitter(args.spades_k)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
85 dgarg=arg_splitter(args.draft_genome)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
86 vdbarg=arg_splitter(args.virulence_database)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
87 nucparg=arg_splitter(args.nucleotide_p)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
88 nucqarg=arg_splitter(args.nucleotide_q)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
89 aaparg=arg_splitter(args.amino_acid_p)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
90 aaqarg=arg_splitter(args.amino_acid_q)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
91 earg=arg_splitter(args.evalue)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
92 amrarg=arg_splitter(args.amr)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
93 amrparg=arg_splitter(args.amr_p)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
94 amrqarg=arg_splitter(args.amr_q)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
95 amrdb=arg_splitter(args.amr_database)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
96 amralg=arg_splitter(args.amr_blast)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
97
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
98 # append to output directory argument, if necessary
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
99 if oarg[-1]!="/":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
100 oarg=oarg+"/"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
101
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
102 # between_sections: prints a blank line between sections in final output file, if desried
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
103 def between_sections(finalfile_string):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
104 finalfile=open(finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
105 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
106 finalfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
107
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
108 # check if file exists and isn't empty
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
109 def file_check(input_file):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
110 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
111 if os.stat(input_file).st_size > 0:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
112 print input_file+" exits. Continuing..."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
113 return iarg
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
114 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
115 print "Your input file {0} is empty! Please use a different input file.".format(input_file)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
116 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
117 except OSError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
118 print "Your input file {0} does not exist. Please make sure your path and file name are correct, or specify a different input file.".format(input_file)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
119 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
120
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
121 # dbparse: creates dictionary mapping virulence gene/mlst/rpoB/panC/16s id names from a given database to their respective sequences
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
122 def dbparse(db_path):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
123 argseq={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
124 infile=open(db_path,"r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
125 for record in SeqIO.parse(infile, "fasta"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
126 seqid=str(record.description).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
127 newid=re.sub('[^0-9a-zA-Z]+', '_', seqid)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
128 seqseq=str(record.seq).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
129 seqlen=len(seqseq)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
130 if newid not in argseq.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
131 argseq[newid]=seqseq
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
132 infile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
133 return argseq
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
134
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
135 # seqparse: creates dictionary that maps query sequence IDs to sequences
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
136 def seqparse(seq_path):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
137 newseq={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
138 newinfile=open(seq_path,"r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
139 for record in SeqIO.parse(newinfile, "fasta"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
140 seqid=str(record.id).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
141 newid=re.sub('[^0-9a-zA-Z]+', '_', seqid)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
142 seqseq=str(record.seq).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
143 reduced_seq=seqseq.replace("N","")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
144 if newid not in newseq.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
145 newseq[newid]=seqseq
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
146 newinfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
147 return newseq
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
148
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
149 # make_blast_xml: takes sequences, makes blast xml file for each
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
150 def make_blast_xml(newseq,argdict,query_path,task,shorttask,evalue_thresh,pident_thresh,qcov_thresh):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
151 # for each sequence in the newseq dictionary
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
152 for key in newseq.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
153 # count the sequences
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
154 counter=1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
155 # call query sequence queryseq
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
156 queryseq=newseq[key]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
157 # get the length of the queryseq (genelen)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
158 genelen=len(queryseq)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
159 # create a temporary query file with the gene sequence and id
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
160 queryfile=open(oarg+key.strip()+"_querytemp.fasta","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
161 print >> queryfile, ">"+key.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
162 print >> queryfile, queryseq
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
163 queryfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
164 # check if final results directory exists
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
165 if os.path.isdir(oarg+"btyper_final_results"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
166 print "Final results directory exists."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
167 # if not, make one
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
168 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
169 os.system("mkdir "+oarg+"btyper_final_results")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
170 # check if isolatefiles directory exists within final results directory
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
171 if not os.path.isdir(oarg+"btyper_final_results/isolatefiles"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
172 os.system("mkdir "+oarg+"btyper_final_results/isolatefiles")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
173 rdir_root=oarg+"btyper_final_results/isolatefiles/"+key#.split(".fasta")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
174 #print rdir_root
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
175 # make a results directory for each sequence
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
176 if not os.path.isdir(rdir_root+"_results"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
177 os.system("mkdir "+rdir_root+"_results")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
178 rdir=rdir_root+"_results/"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
179 # create a database out of the temporary query file with the individual sequence
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
180 if os.path.isfile(oarg+key.strip()+"_querytemp.fasta.nsq"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
181 print "Database already exists."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
182 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
183 os.system("makeblastdb -in "+oarg+key.strip()+"_querytemp.fasta -dbtype nucl")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
184 out=rdir_root+"_blast_results.xml"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
185 # create final file named for each isolate if necessary
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
186 finalfile_string=key.strip()+"_final_results.txt"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
187 if not os.path.isfile(oarg+"btyper_final_results/"+finalfile_string):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
188 finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
189 print >> finalfile, "BTyper Results for "+key
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
190 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
191 finalfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
192 if not os.path.isdir(oarg+"btyper_final_results/genefiles"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
193 os.system("mkdir "+oarg+"btyper_final_results/genefiles")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
194 # open the final file for the isolate
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
195 finalfile=open(oarg+"btyper_final_results/"+finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
196 # print task-specific header to final output file
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
197 if shorttask=="virulence":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
198 # print the task
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
199 print >> finalfile, task.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
200 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
201 table_header=["Hit #", "Virulence Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
202 print >>finalfile, "\t".join([t for t in table_header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
203 elif shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
204 # print the task
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
205 print >> finalfile, task.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
206 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
207 table_header=["Hit #", "AMR Gene Name","E-Value","Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
208 print >> finalfile, "\t".join([t for t in table_header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
209 elif shorttask=="rpoB":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
210 # print the task
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
211 print >> finalfile, task.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
212 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
213 print >> finalfile, "Predicted rpoB Allelic Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
214 elif shorttask=="panC":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
215 print >> finalfile, task.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
216 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
217 table_header=["panC Clade Name","Closest Strain","Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
218 print >>finalfile, "\t".join([t for t in table_header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
219 elif shorttask=="16s":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
220 print >> finalfile, task.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
221 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
222 print >> finalfile, "Predicted 16s Type"+"\t"+"Percent (%) Identity"+"\t"+"Percent (%) Coverage"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
223 # close the final file for the isolate
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
224 finalfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
225 # run blast with query=database of sequences, db = querytemp.fasta with single isolate sequence, out = outdir, outfmt = xml
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
226 if query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
227 cline=NcbitblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
228 elif amrarg=="True" and amralg=="tblastx" and shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
229 cline=NcbitblastxCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
230 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
231 cline=NcbiblastnCommandline(query=str(query_path),db=oarg+key.strip()+"_querytemp.fasta",out=str(out),outfmt='"5"')
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
232 stdout, stderr = cline()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
233 #remove temporary temporary fasta files
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
234 os.system("rm "+oarg+key.strip()+"_querytemp.fasta*")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
235 # parse blast xml files according to task
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
236 if shorttask=="virulence" or shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
237 # parse_blast_xml_seq: parses blast xml file for virulence task (getting a list of all genes present in a sequence)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
238 parse_blast_xml_seq(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
239 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
240 # parse_panC: parses blast xml for non-virulence tasks (getting the "top hit", rather than a list of all genes)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
241 parse_panC(xml=out,rdir=rdir,rdir_root=rdir_root,key=key,counter=counter,shorttask=shorttask,finalfile_string=finalfile_string,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
242
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
243 # dictionary_add: create a new entry for a dictionary, or append if it already exists as a key
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
244 def dictionary_add(dict_name,key,blast):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
245 if key not in dict_name.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
246 dict_name[key]=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
247 dict_name[key].append(blast)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
248 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
249 dict_name[key].append(blast)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
250
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
251 # print_virulence_gene_report: prints isolatefiles and genefiles
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
252 def print_virulence_gene_report(filtres,maxgene, genequery,key,rdir,shorttask):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
253 print >> filtres, "\t".join([str(m).strip() for m in maxgene])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
254 genequery=str(genequery).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
255 if shorttask=="virulence" or shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
256 genequery=genequery.split("|")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
257 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
258 genequery=re.sub('[^0-9a-zA-Z]+', '_', genequery)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
259 genequery=genequery.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
260 if shorttask=="virulence" or shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
261 genequery_short=genequery
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
262 elif shorttask=="panC":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
263 genequery_short="panC"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
264 elif shorttask=="mlst":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
265 genequery_short=genequery.split("_")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
266 elif shorttask=="rpoB":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
267 genequery_short="rpoB"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
268 elif shorttask=="16s":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
269 genequery_short="16s"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
270 # print to genefile (genefile=1 per gene, with all isolates' sequences)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
271 if shorttask!="virulence" and shorttask!="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
272 genefile=open(oarg+"btyper_final_results/genefiles/"+genequery_short+"_genefile.fasta","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
273 print >> genefile, ">"+key+"___"+genequery
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
274 print >> genefile, str(maxgene[14]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
275 genefile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
276 # print to isolatefile (isolatefile=1 per isolate, with all genes detected per isolate)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
277 isolatefile=open(rdir+key+"_"+shorttask+"_sequences.fasta","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
278 print >> isolatefile, ">"+genequery+"___"+key
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
279 print >> isolatefile, str(maxgene[14]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
280 isolatefile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
281
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
282 # print_final_report: print to the final report file i.e. BTyper's final output file
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
283 def print_final_report(finalfile_string, shorttask, maxgene):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
284 finalout=open(oarg+"btyper_final_results/"+finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
285 if shorttask=="virulence" or shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
286 gene_row=[maxgene[0],maxgene[3],maxgene[5],maxgene[8],maxgene[7],maxgene[6],maxgene[14]]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
287 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
288 elif shorttask=="rpoB":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
289 rpotype=maxgene[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
290 rpotype=str(rpotype).replace("_","|")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
291 if float(maxgene[8])<100:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
292 pid=str(maxgene[8]).strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
293 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
294 pid=str(maxgene[8]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
295 if float(maxgene[7])<100:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
296 qid=str(maxgene[7]).strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
297 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
298 qid=str(maxgene[7]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
299 if "*" in pid or "*" in qid:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
300 rpotype=rpotype.strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
301 gene_row=[rpotype,pid,qid]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
302 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
303 if "*" in rpotype:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
304 print >> finalout, "*No rpoB allele in the current database matches with 100% identity and coverage."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
305 elif shorttask=="panC":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
306 pan=str(maxgene[3]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
307 panpid=maxgene[8]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
308 panqid=maxgene[7]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
309 if float(panpid.strip())<75:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
310 pan="None*___None"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
311 if float(panpid.strip())<90 and float(panpid.strip())>75:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
312 pan="?*___?"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
313 pan1=pan.split("___")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
314 pan2=pan.split("___")[1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
315 print >> finalout, pan1.strip()+"\t"+pan2.strip()+"\t"+panpid.strip()+"\t"+panqid.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
316 if pan.strip()=="None*___None":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
317 print >> finalout, "*No panC gene could be detected in your sequence at > 75% identity. Your panC allele may not be significantly associated with the Bacillus cereus group."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
318 if pan.strip()=="?*___?":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
319 print >> finalout, "*A panC clade could not be determined for your isolate."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
320 elif shorttask=="16s":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
321 stype=maxgene[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
322 if float(maxgene[8])<97:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
323 pid=str(maxgene[8]).strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
324 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
325 pid=str(maxgene[8]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
326 if float(maxgene[7])<100:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
327 qid=str(maxgene[7]).strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
328 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
329 qid=str(maxgene[7]).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
330 if "*" in pid or "*" in qid:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
331 stype=stype.strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
332 gene_row=[stype,pid,qid]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
333 print >> finalout, "\t".join([str(gr).strip() for gr in gene_row])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
334 if "*" in stype:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
335 print >> finalout, "*No 16s gene in the current database matches with >97% identity and/or 100% coverage. Your isolate may not be a member of the Bacillus cereus group."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
336 finalout.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
337
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
338 # get_st: parse mlst output for final report file; gives ST from database of STs and ATs
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
339 def get_st(mlst_infile, st_file, finalfile_string,mlst_genes):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
340 mlstdict={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
341 lowconf=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
342 for line in mlst_infile:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
343 splits=line.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
344 if targ:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
345 if "alignment_title" not in line:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
346 alleleid=splits[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
347 geneid=str(alleleid).split("_")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
348 at=str(alleleid).split("_")[1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
349 mlstdict[str(geneid).strip()]=str(at).strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
350 pid=splits[8]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
351 qid=splits[7]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
352 if float(pid.strip())<100 or float(qid.strip())<100:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
353 lowconf.append(geneid.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
354
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
355 mlst_infile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
356 stdict={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
357 stfile=open(st_file,"r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
358 for line in stfile:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
359 splits=line.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
360 st=splits[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
361 all_at=splits[1:8]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
362 dictionary_add(stdict,str(st).strip(),[a.strip() for a in all_at])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
363 stfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
364 st_order=[s for i in stdict["ST"] for s in i]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
365 myst=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
366 for s in st_order:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
367 if s in mlstdict.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
368 myst.append(mlstdict[s])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
369 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
370 myst.append("?")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
371
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
372
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
373 finalfile=open(finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
374 print >> finalfile, "Predicted MLST Profile:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
375 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
376 print >> finalfile, "ST"+"\t"+"\t".join([str(mg).split(".fas")[0] for mg in mlst_genes])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
377 if "?" in myst:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
378 finalmm=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
379 for mm,tt in zip(myst,mlst_genes):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
380 testmlst=tt.split(".fas")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
381 if testmlst.strip() in lowconf:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
382 finalmm.append(mm+"*")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
383 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
384 finalmm.append(mm)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
385 finalfile=open(finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
386 if "*" in str(finalmm):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
387 print >> finalfile, "?*"+"\t"+"\t".join([str(m).strip() for m in finalmm])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
388 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
389 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
390 print >> finalfile, "?"+"\t"+"\t".join([str(m).strip() for m in finalmm])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
391 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
392 finalfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
393 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
394 confirmst=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
395 for stkey in stdict.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
396 st_match2=[s for i in stdict[stkey] for s in i]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
397 if myst==st_match2:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
398 confirmst=1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
399 finalmm=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
400 print "Found matching ST."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
401 for mm,tt in zip(myst,mlst_genes):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
402 testmlst=tt.split(".fas")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
403 if testmlst.strip() in lowconf:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
404 finalmm.append(mm+"*")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
405 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
406 finalmm.append(mm)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
407
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
408 if "*" in str(finalmm):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
409 stkey=stkey.strip()+"*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
410 finalfile=open(finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
411 print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
412 if "*" in stkey:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
413 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
414 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
415 finalfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
416 if confirmst==0:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
417 finalmm=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
418 for mm,tt in zip(myst,mlst_genes):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
419 testmlst=tt.split(".fas")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
420 if testmlst.strip() in lowconf:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
421 finalmm.append(mm+"*")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
422 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
423 finalmm.append(mm)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
424 if "*" in str(finalmm):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
425 stkey="?*"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
426 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
427 stkey="?"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
428 finalfile=open(finalfile_string,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
429 print >> finalfile, stkey.strip()+"\t"+"\t".join([str(m).strip() for m in finalmm])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
430 if "*" in stkey:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
431 print >> finalfile, "*No allele in the current database matches with 100% identity and coverage."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
432 print >> finalfile, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
433 finalfile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
434
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
435
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
436
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
437
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
438 # parse_blast_xml_seq: parse the blast xml output into results file
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
439 def parse_blast_xml_seq(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
440 result_handle=open(xml)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
441 blast_records=NCBIXML.parse(result_handle)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
442 # open file in individual isolate results directory
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
443 filtres=open(rdir+key+"_"+shorttask+"_results.txt","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
444 # print the header to the results file
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
445 headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
446 print >> filtres, "\t".join([h for h in headerlist])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
447 # loop through each record in the blast output
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
448 for record in blast_records:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
449 for alignment in record.alignments:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
450 # create a dictionary for HSPs
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
451 hspdict={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
452 # for each hsp in the alignment
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
453 for hsp in alignment.hsps:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
454 #query coverage (qcov) can be calculated as one of the following:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
455 #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query)))
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
456 qcov=float(hsp.align_length)/float(record.query_length)*100
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
457 pident=float(hsp.identities)/float(hsp.align_length)*100
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
458 # add information for each HSP if the e-value is lower than the threshold
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
459 if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and pident>=pident_thresh:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
460 genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
461 dictionary_add(hspdict,str(record.query).strip(),"\t".join([str(g).strip() for g in genefacts]))
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
462 # add the start and end positions of the hsp to hspdict
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
463 # loop through each hsp in the alignment
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
464 for hkey in hspdict.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
465 # get values (start and end locations of each hsp, bit score)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
466 hsps=hspdict[hkey]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
467 # if more than 1 hsp exists (i.e. there is a chance that overlapping hsps might exist)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
468 if len(hsps)>1:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
469 # create list to check for overlapping HSPs
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
470 allnums=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
471 # loop through each HSP
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
472 for h in hsps:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
473 hsplits=h.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
474 # get genome start position
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
475 gstart=hsplits[11]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
476 gstart=int(gstart.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
477 # get genome end position
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
478 gend=hsplits[12]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
479 gend=int(gend.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
480 if gstart<gend:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
481 # append sequence of base numbers covered by HSP to list
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
482 allnums.append(range(gstart,gend+1,1))
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
483 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
484 allnums.append(range(gend, gstart+1,1))
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
485 # get the intersection of genome positions when more than one HSP exists in the genome
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
486 inset=set.intersection(*map(set,allnums))
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
487 # if an intersection exists (i.e. there are overlapping HSPs)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
488 if inset:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
489 # loop through HSPs, keeping track of HSP with highest bit score
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
490 maxbits=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
491 for h in hsps:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
492 hsplits=h.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
493 # get bit score of current HSP
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
494 testbits=hsplits[6]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
495 # if bit score of current HSP is larger than current maximum bit score:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
496 if float(testbits.strip())>maxbits:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
497 # the current bit score becomes the max
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
498 maxbits=float(testbits.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
499 # the current dictionary information becomes the best gene information
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
500 maxgene=hsplits
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
501 # the current gene query becomes the best gene query
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
502 genequery=maxgene[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
503 # print the best-scoring HSP to the isolatefiles and genefiles
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
504 counter=counter+1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
505 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
506 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
507 # if no intersection exists, i.e. there are multiple HSPs, but they are found in different regions of the genome:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
508 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
509 # loop through each nonoverlapping HSP
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
510 for h in hsps:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
511 maxgene=h.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
512 genequery=maxgene[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
513 # print each HSP to the appropriate isolatefile/genefile
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
514 counter=counter+1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
515 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
516 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
517
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
518 # if there is only 1 HSP
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
519 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
520 for h in hsps:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
521 maxgene=h.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
522 genequery=maxgene[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
523 #print it to the appropriate isolatefile/genefile
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
524 counter=counter+1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
525 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
526 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
527 filtres.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
528 if shorttask=="virulence" and amrarg=="False":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
529 prune_alleles(finalpath=finalfile_string)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
530 elif shorttask=="amr":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
531 prune_alleles(finalpath=finalfile_string)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
532
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
533 # parse_panC: parse blast output for panC task
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
534 def parse_panC(xml,rdir,rdir_root,key,counter,shorttask,finalfile_string,evalue_thresh,qcov_thresh,pident_thresh):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
535 counter=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
536 result_handle=open(xml)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
537 blast_records=NCBIXML.parse(result_handle)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
538 # open file in individual isolate results directory
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
539 filtres=open(rdir+key+"_"+shorttask+"_results.txt","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
540 # print the header to the results file
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
541 headerlist=["hit_number","alignment_title","query_id",shorttask+"_gene_query","alignment_length","evalue","blast_bitscore","query_coverage","percent_idenity",shorttask+"_gene_start",shorttask+"_gene_end","genome_sequence_start","genome_sequence_end",shorttask+"_gene_sequence","genome_sequence","match_sequence"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
542 print >> filtres, "\t".join([h for h in headerlist])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
543 # loop through each record in the blast output
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
544 maxp=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
545 for record in blast_records:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
546 for alignment in record.alignments:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
547 # for each hsp in the alignment
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
548 for hsp in alignment.hsps:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
549 #qcov=float(float(len(hsp.sbjct))/float(len(hsp.query)))
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
550 qcov=float(hsp.align_length)/float(record.query_length)*100
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
551 pident=float(hsp.identities)/float(hsp.align_length)*100
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
552 testbits=float(hsp.bits)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
553 # add information for each HSP if the e-value is lower than the threshold
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
554 if hsp.expect<=evalue_thresh and qcov>=qcov_thresh and testbits>=pident_thresh and testbits>maxp:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
555 maxp=testbits
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
556 genefacts=[counter,alignment.title,record.query_id,record.query,hsp.align_length,hsp.expect,hsp.bits,qcov,pident,hsp.query_start,hsp.query_end,hsp.sbjct_start,hsp.sbjct_end,hsp.query,hsp.sbjct,hsp.match]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
557 genefacts="\t".join([str(g).strip() for g in genefacts])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
558 maxgene=genefacts.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
559 genequery=maxgene[3]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
560 #print it to the appropriate isolatefile/genefile
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
561 counter=counter+1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
562 print_virulence_gene_report(filtres=filtres, maxgene=maxgene, genequery=genequery, key=key, rdir=rdir, shorttask=shorttask)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
563 print_final_report(finalfile_string=finalfile_string, shorttask=shorttask, maxgene=maxgene)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
564 filtres.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
565
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
566 # assemble_reads: assemble genome from reads
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
567 def assemble_reads_pe(forward,reverse):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
568 os.system("spades.py -k "+spadesk+" --careful -1 "+forward+" -2 "+reverse+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
569 def assemble_reads_se(reads):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
570 os.system("spades.py -k "+spadesk+" --careful -s "+reads+" -o "+oarg+"spades_assembly -t "+spadest+" -m "+spadesm)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
571
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
572 def fastq_check(reads):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
573 if reads.endswith(".fastq"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
574 os.system("gzip reads")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
575 reads=reads.strip()+".gz"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
576 if reads.endswith(".fastq.gz"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
577 return()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
578 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
579 print "It looks like your file "+reads.strip()+" does not end with either '.fastq.gz' or '.fastq'. Please make sure that you are supplying reads in fastq or fastq.gz format, and, if necessary, please rename your file with the appropriate extension."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
580 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
581
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
582 def sra_check(sra):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
583 if sra.endswith(".sra"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
584 return()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
585 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
586 print "It looks like your file "+reads.strip()+" does not end with '.sra'. Please make sure that you are supplying sequencing data in sra format, and, if necessary, please rename your file with the extension '.sra'."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
587
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
588 def prune_alleles(finalpath):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
589 f=oarg+"btyper_final_results/"+finalpath
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
590 print "Pruning alleles for "+f+"..."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
591 os.system("mv "+f+" "+f+"_temporary.txt")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
592 old=open(f+"_temporary.txt","r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
593 new=open(f,"a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
594 bitdicta={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
595 bitdictv={}
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
596 for o in old:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
597 if "BTyper Results for " in o:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
598 print >> new, o.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
599 print >> new, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
600 if "|" in o and "___" not in o and "rpoB|AT" not in o:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
601 print >> new, o.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
602 elif "|" in o and "___" in o and "rpoB|AT" not in o:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
603 splits=o.split("\t")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
604 gname=splits[1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
605 gpref=gname.split("___")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
606 gsuff=gname.split("___")[1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
607 testbits=splits[5]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
608 if "vir" in gpref:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
609 bitdict=bitdictv
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
610 elif "amr" in gpref:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
611 bitdict=bitdicta
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
612 if gpref.strip() not in bitdict:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
613 bitdict[gpref.strip()]=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
614 bitdict[gpref.strip()].append(gsuff.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
615 for s in splits[2:]:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
616 bitdict[gpref.strip()].append(str(s).strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
617 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
618 maxlist=bitdict[gpref.strip()]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
619 maxbits=maxlist[4]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
620 if float(testbits.strip())>float(maxbits.strip()):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
621 bitdict[gpref.strip()]=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
622 bitdict[gpref.strip()].append(gsuff.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
623 for s in splits[2:]:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
624 bitdict[gpref.strip()].append(str(s).strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
625
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
626 old.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
627 dicts=[bitdictv,bitdicta]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
628 if bool(dicts[0])==False and bool(dicts[1])==False:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
629 if varg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
630 print >> new, "Predicted Virulence Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
631 print >> new, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
632 header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
633 print >> new, "\t".join([h for h in header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
634 if amrarg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
635 print >> new, "Predicted AMR Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
636 print >> new, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
637 header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
638 print >> new, "\t".join([h for h in header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
639 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
640 for bitdict in dicts:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
641 counter=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
642 if varg=="True" and bitdict==bitdictv:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
643 print >> new, "Predicted Virulence Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
644 print >> new, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
645 header=["Hit #","Virulence Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
646 print >> new, "\t".join([h for h in header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
647 elif amrarg=="True" and bitdict==bitdicta:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
648 print >> new, "Predicted AMR Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
649 print >> new, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
650 header=["Hit #","AMR Gene Name","E-Value Percent (%) Identity","Percent (%) Coverage"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
651 print >> new, "\t".join([h for h in header])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
652 for key in bitdict.keys():
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
653 counter=counter+1
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
654 finalg=bitdict[key]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
655 print >> new, str(counter)+"\t"+"\t".join([str(fg).strip() for fg in finalg[:-2]])
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
656 shortgene=finalg[0].split("|")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
657 genefile=open(oarg+"btyper_final_results/genefiles/"+shortgene.strip()+"_genefile.fasta","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
658 isoname=f.split("/")[-1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
659 print >> genefile, ">"+isoname.strip()+"___"+shortgene
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
660 print >> genefile, finalg[-1].strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
661 genefile.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
662 print >> new, ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
663 new.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
664 os.system("rm "+f+"_temporary.txt")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
665
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
666
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
667 # run program based on user input
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
668 if targ!="seq":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
669 os.system("mkdir "+oarg+"spades_assembly")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
670 if targ=="pe":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
671 spacecount=iarg.count(" ")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
672 if spacecount>1:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
673 print "It looks like there may be whitespace in one or more of your file names. Please rename your file(s), and try again."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
674 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
675 elif spacecount==0:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
676 print "It looks like you've chosen the paired-end reads (-t pe) option but have only supplied one file of reads. Please supply one forward reads file and one reverse reads file separated by a space, or select a different option for -t (se for single-end reads, sra for files in sra format, or sra-get to download sequencing data in SRA format)."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
677 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
678 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
679 forward_reads=iarg.split(" ")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
680 reverse_reads=iarg.split(" ")[1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
681 forward_reads=re.sub("[,']","",forward_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
682 reverse_reads=re.sub("[,']","",reverse_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
683 fastq_check(forward_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
684 fastq_check(reverse_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
685 prefix=forward_reads.split(".fastq.gz")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
686 if "/" in prefix:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
687 prefix=prefix.split("/")[-1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
688 assemble_reads_pe(forward=forward_reads,reverse=reverse_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
689 elif targ=="se":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
690 sreads=iarg.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
691 fastq_check(sreads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
692 prefix=sreads.split(".fastq.gz")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
693 if "/" in prefix:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
694 prefix=prefix.split("/")[-1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
695 assemble_reads_se(reads=sreads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
696 elif targ=="sra-get" or targ=="sra":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
697 sra=iarg.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
698 if targ=="sra-get":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
699 os.system("prefetch -v "+sra)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
700 os.system("fastq-dump --outdir "+oarg+" --split-files --gzip ~/ncbi/public/sra/"+sra+".sra")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
701 elif targ=="sra":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
702 sra=iarg.split(".sra")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
703 os.system("fastq-dump --outdir "+oarg+" --split-files --gzip "+iarg.strip())
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
704 forward_reads=oarg+sra.strip()+"_1.fastq.gz"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
705 fastq_check(forward_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
706 prefix=forward_reads.split(".fastq.gz")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
707 if "/" in prefix:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
708 prefix=prefix.split("/")[-1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
709 if os.path.exists(oarg+sra+"_2.fastq.gz"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
710 reverse_reads=oarg+sra+"_2.fastq.gz"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
711 fastq_check(reverse_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
712 assemble_reads_pe(forward=forward_reads,reverse=reverse_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
713 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
714 assemble_reads_se(reads=forward_reads)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
715 os.system("cp "+oarg+"spades_assembly/contigs.fasta "+oarg+prefix.strip()+"_spades_assembly.fasta")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
716 contigs=open(oarg+prefix.strip()+"_spades_assembly.fasta","r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
717 pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
718 print >> pseudo, ">"+prefix.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
719 print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
720 contigs.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
721 pseudo.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
722 targ="seq"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
723 iarg=oarg+prefix.strip()+"_pseudochrom.fasta"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
724
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
725
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
726
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
727
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
728 # if the user inputs a fasta file
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
729 if targ=="seq":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
730 print "-t seq has been selected. Expecting a fasta file."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
731 # check if the file is a fasta
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
732 file_check(iarg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
733 # if genome is a draft genome
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
734 if dgarg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
735 prefix=iarg.rpartition(".")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
736 if "/" in prefix:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
737 prefix=prefix.split("/")[-1]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
738 contigs=open(iarg,"r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
739 pseudo=open(oarg+prefix.strip()+"_pseudochrom.fasta","a")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
740 print >> pseudo, ">"+prefix.strip()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
741 print >> pseudo, "".join(line.strip() for line in contigs if ">" not in line)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
742 contigs.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
743 pseudo.close()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
744 iarg=oarg+prefix.strip()+"_pseudochrom.fasta"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
745 # create a dictionary from the user-supplied fasta, mapping ids to sequences
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
746 dictionaries=seqparse(iarg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
747
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
748 # if performing virulence typing
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
749 if varg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
750 # define database
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
751 query_paths=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
752 if vdbarg=="nuc":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
753 query_paths.append(btyper_path+"seq_virulence_db/bc_virul_nt.fasta")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
754 if vdbarg=="aa":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
755 query_paths.append(btyper_path+"seq_virulence_db/virul_aa_db.fasta")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
756 for query_path in query_paths:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
757 # map db ids to sequences
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
758 mydb=dbparse(query_path)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
759 #task="Predicted Virulence Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
760 shorttask="virulence"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
761 # define minimum e-value for virulence genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
762 evalue_thresh=float(earg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
763 if query_path==btyper_path+"seq_virulence_db/bc_virul_nt.fasta":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
764 task="Predicted Virulence Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
765 # define minimum percent identity for virulence genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
766 pident_thresh=int(nucparg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
767 # define minimum query coverage for virulence genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
768 qcov_thresh=int(nucqarg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
769 elif query_path==btyper_path+"seq_virulence_db/virul_aa_db.fasta":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
770 task="Predicted Virulence Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
771 # define minimum percent identity for virulence genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
772 pident_thresh=int(aaparg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
773 # define minimum query coverage for virulence genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
774 qcov_thresh=int(aaqarg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
775 # run virulence typing
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
776 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
777 # print spacing between under final results
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
778 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
779 between_sections(finalfile_string=f)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
780
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
781
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
782
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
783 # if performing amr gene detection
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
784 if amrarg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
785 # define database
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
786 query_paths=[]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
787 if amrdb=="megares":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
788 query_paths.append(btyper_path+"seq_amr_db/megares_grouped.fasta")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
789 elif amrdb=="argannot":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
790 query_paths.append(btyper_path+"seq_amr_db/argannot_clus80.fasta")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
791 else:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
792 print "Hmmmm...it looks like your selected database, "+amrdb+", doesn't exist. Please select either argannot to use the ARG-ANNOT database, or megares to use the MEGARes database."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
793 sys.exit()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
794 for query_path in query_paths:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
795 # map db ids to sequences
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
796 mydb=dbparse(query_path)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
797 task="Predicted AMR Genes:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
798 shorttask="amr"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
799 # define minimum e-value for amr genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
800 evalue_thresh=float(earg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
801 # define minimum percent identity for amr genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
802 pident_thresh=int(amrparg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
803 # define minimum query coverage for amr genes
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
804 qcov_thresh=int(amrqarg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
805 # run amr gene detection
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
806 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
807 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
808 between_sections(finalfile_string=f)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
809
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
810
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
811
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
812 # if performing panC typing:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
813 if parg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
814 # define database
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
815 query_path=btyper_path+"seq_panC_db/panC.fasta"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
816 mydb=dbparse(query_path)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
817 task="Predicted panC Clade Designation:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
818 shorttask="panC"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
819 evalue_thresh=float(earg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
820 pident_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
821 qcov_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
822 # run panC typing, if panC gene is detected
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
823 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
824 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
825 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
826 between_sections(finalfile_string=f)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
827 except UnboundLocalError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
828 print "No sequences found for "+shorttask
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
829
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
830 # if performing mlst:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
831 if marg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
832 mlst_genes=["glp.fas","gmk.fas","ilv.fas","pta.fas","pur.fas","pyc.fas","tpi.fas"]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
833 # get best-matching AT for each MLST gene
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
834 for mlst in mlst_genes:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
835 query_path=btyper_path+"seq_mlst_db/"+mlst
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
836 mydb=dbparse(query_path)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
837 task="Predicted MLST Profile:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
838 shorttask="mlst"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
839 evalue_thresh=float(earg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
840 pident_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
841 qcov_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
842 # run AT for each gene, if sequence deteced
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
843 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
844 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
845 except UnboundLocalError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
846 print "No sequences found for "+mlst
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
847 # loop through isolatefiles
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
848 for root, dirs, files in os.walk(oarg+"btyper_final_results/isolatefiles/"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
849 for d in dirs:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
850 dirroot=d.split("_results")[0]
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
851 # open mlst results file, and get ST from ATs
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
852 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
853 newf=open(oarg+"btyper_final_results/isolatefiles/"+d+"/"+dirroot.strip()+"_mlst_results.txt","r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
854 finalfile_string=oarg+"btyper_final_results/"+dirroot.strip()+"_final_results.txt"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
855 ff=open(finalfile_string,"r")
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
856 flines=ff.readlines()
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
857 if not any("Predicted MLST Profile" in fl.strip() for fl in flines):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
858 get_st(mlst_infile=newf,st_file=btyper_path+"seq_mlst_db/b_cereus_mlst_db.txt",finalfile_string=finalfile_string,mlst_genes=mlst_genes)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
859 except IOError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
860 print "No sequences found for "+shorttask
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
861
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
862 # if performing rpoB typing:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
863 if rarg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
864 # define database
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
865 query_path=btyper_path+"seq_rpoB_db/rpobdatabase08122015.fa"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
866 mydb=dbparse(query_path)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
867 task="Predicted rpoB Allelic Type:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
868 shorttask="rpoB"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
869 evalue_thresh=float(earg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
870 pident_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
871 qcov_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
872 # perform rpoB typing, if gene is present
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
873 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
874 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
875 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
876 between_sections(finalfile_string=f)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
877 except UnboundLocalError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
878 print "No sequences found for "+shorttask
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
879
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
880 # if performing 16s typing:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
881 if sarg=="True":
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
882 # define database
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
883 query_path=btyper_path+"seq_16s_db/b_cereus_group_16s_db.fasta"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
884 mydb=dbparse(query_path)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
885 task="Predicted 16s Type"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
886 shorttask="16s"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
887 evalue_thresh=float(earg)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
888 pident_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
889 qcov_thresh=0
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
890 # perform 16s typing, if gene is present
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
891 try:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
892 make_blast_xml(newseq=dictionaries,argdict=mydb,query_path=query_path,task=task,shorttask=shorttask,evalue_thresh=evalue_thresh,pident_thresh=pident_thresh,qcov_thresh=qcov_thresh)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
893 for f in glob.glob(oarg+"btyper_final_results/*_final_results.txt"):
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
894 between_sections(finalfile_string=f)
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
895 except UnboundLocalError:
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
896 print "No sequences found for "+shorttask
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
897
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
898 print "Typing complete...how neat is that?"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
899 print ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
900 print "Thank you for using BTyper! For more fun, take your output files to BMiner, BTyper's companion application for data aggregation and visualization."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
901 print ""
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
902 print "To cite BTyper and/or BMiner, please use the following:"
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
903 print "Carroll, Laura M., Jasna Kovac, Rachel A. Miller, Martin Wiedmann. 2017. Rapid, high-throughput identification of anthrax-causing and emetic Bacillus cereus group genome assemblies using BTyper, a computational tool for virulence-based classification of Bacillus cereus group isolates using nucleotide sequencing data. Applied and Environmental Microbiology 2017 Jun 16. pii: AEM.01096-17. doi: 10.1128/AEM.01096-17."
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
904
46ebe89f01ea Uploaded
estrain
parents:
diff changeset
905