annotate ecoli_serotyping/genomeFunctions.py @ 10:5b52b3842765 draft

Uploaded
author jpetteng
date Sat, 06 Jan 2018 13:32:23 -0500
parents fe3ceb5c4214
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
1 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
2 Genome Utilities
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
3 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
4 #!/usr/bin/env python
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
5
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
6 import logging
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
7 import os
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
8 import re
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
9 import tempfile
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
10 from tarfile import is_tarfile
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
11
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
12 from Bio import SeqIO
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
13 from ectyper import definitions, subprocess_util
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
14
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
15 LOG = logging.getLogger(__name__)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
16
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
17
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
18 def get_files_as_list(file_or_directory):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
19 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
20 Creates a list of files from either the given file, or all files within the
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
21 directory specified (where each file name is its absolute path).
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
22
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
23 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
24 file_or_directory (str): file or directory name given on commandline
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
25
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
26 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
27 files_list (list(str)): List of all the files found.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
28
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
29 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
30
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
31 files_list = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
32 if file_or_directory == '':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
33 return files_list
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
34
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
35 if os.path.isdir(file_or_directory):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
36 LOG.info("Gathering genomes from directory " + file_or_directory)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
37
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
38 # Create a list containing the file names
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
39 for root, dirs, files in os.walk(file_or_directory):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
40 for filename in files:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
41 files_list.append(os.path.join(root, filename))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
42 # check if input is concatenated file locations
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
43 elif ',' in file_or_directory:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
44 LOG.info("Using genomes in the input list")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
45 for filename in file_or_directory.split(','):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
46 files_list.append(os.path.abspath(filename))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
47 else:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
48 LOG.info("Using genomes in file " + file_or_directory)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
49 files_list.append(os.path.abspath(file_or_directory))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
50
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
51 return sorted(files_list)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
52
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
53
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
54 def get_valid_format(file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
55 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
56 Check using SeqIO if files are valid fasta/fastq format, returns the format.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
57
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
58 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
59 file (str): path of file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
60
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
61 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
62 fm (str or None): the file format if 'fasta' or 'fastq', otherwise None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
63 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
64 for fm in ['fastq', 'fasta']:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
65 try:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
66 with open(file, "r") as handle:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
67 data = SeqIO.parse(handle, fm)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
68 if any(data):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
69 if is_tarfile(file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
70 LOG.warning("Compressed file is not supported: {}".format(file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
71 return None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
72 return fm
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
73 except FileNotFoundError as err:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
74 LOG.warning("{0} is not found".format(file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
75 return None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
76 except UnicodeDecodeError as err:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
77 LOG.warning("{0} is not a valid file".format(file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
78 return None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
79 except:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
80 LOG.warning("{0} is an unexpected file".format(file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
81 return None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
82 LOG.warning("{0} is not a fasta/fastq file".format(file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
83 return None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
84
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
85
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
86 def get_genome_names_from_files(files, temp_dir):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
87 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
88 For each file:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
89 Takes the first header from a fasta file and sends it to the get_genome_name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
90 function. Returns the name of the genome. If the name of the file is to be
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
91 used as the genome name, creates a temporary file using >lcl|filename as the
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
92 first part of the header.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
93
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
94 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
95 files (list): The list of files to get the genome names for
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
96 tempdir (str): A tempdir where the copied files will be stored
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
97
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
98 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
99 tuple(list(str), list(str)): first list is genome names, second list is file names
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
100 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
101
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
102 list_of_genomes = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
103 list_of_files = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
104 for file in files:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
105 # Always to use the filename as the genome name.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
106 # This means we also need to create a new file adding
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
107 # the filename to each of the headers, so that downstream applications
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
108 # (eg. BLAST) can be used with the filename as genome name.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
109
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
110 # get only the name of the file for use in the fasta header
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
111 file_base_name = os.path.basename(file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
112 file_path_name = os.path.splitext(file_base_name)[0]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
113 n_name = file_path_name.replace(' ', '_')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
114
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
115 # create a new file for the updated fasta headers
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
116 new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
117
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
118 # add the new name to the list of files and genomes
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
119 list_of_files.append(new_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
120 list_of_genomes.append(n_name)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
121
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
122 with open(new_file, "w") as outfile:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
123 with open(file) as infile:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
124 for record in SeqIO.parse(infile, "fasta"):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
125 outfile.write(">lcl|" + n_name + "|" + record.description + "\n")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
126 outfile.write(str(record.seq) + "\n")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
127
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
128 return list_of_genomes, list_of_files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
129
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
130
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
131 def get_genome_name(header):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
132 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
133 Getting the name of the genome by hierarchy. This requires reading the first
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
134 fasta header from the file. It also assumes a single genome per file.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
135
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
136 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
137 header (str): The header containing the record.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
138
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
139 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
140 genomeName (str): Name of the genome contained in the header.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
141 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
142
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
143 re_patterns = (
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
144 # Look for lcl followed by the possible genome name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
145 re.compile('(lcl\|[\w\-\.]+)'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
146
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
147 # Look for contigs in the wwwwdddddd format
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
148 re.compile('([A-Za-z]{4}\d{2})\d{6}'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
149
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
150 # Look for a possible genome name at the beginning of the record ID
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
151 re.compile('^(\w{8}\.\d)'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
152
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
153 # Look for ref, gb, emb or dbj followed by the possible genome name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
154 re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
155
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
156 # Look for gi followed by the possible genome name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
157 re.compile('(gi\|\d{8})'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
158
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
159
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
160 # Look for name followed by space, then description
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
161 re.compile('^([\w\-\.]+)\s+[\w\-\.]+')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
162 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
163
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
164 # if nothing matches, use the full header as genome_name
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
165 genome_name = header
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
166 for rep in re_patterns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
167 m = rep.search(header)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
168
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
169 if m:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
170 genome_name = m.group(1)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
171 break
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
172
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
173 return str(genome_name)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
174
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
175 def assemble_reads(reads, reference, temp_dir):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
176 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
177 Return path to assembled reads.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
178 Assemble short reads by mapping to a reference genome.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
179 Default output is the same as reads file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
180 (basename+'iden.fasta' and basename+'pred.fasta').
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
181
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
182 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
183 reads (str): FASTQ/FQ format reads file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
184 reference (str): FASTA format reference file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
185 temp_dir (str): temp_dir for storing assembled files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
186
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
187 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
188 tuple(str, str): identifcation and prediction fasta file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
189 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
190 output = os.path.join(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
191 temp_dir,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
192 os.path.splitext(os.path.basename(reads))[0]+'.fasta'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
193 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
194
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
195 # run once if index reference does not exist
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
196 index_path = \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
197 os.path.join(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
198 definitions.DATA_DIR,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
199 'bowtie_index',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
200 os.path.splitext(os.path.basename(reference))[0]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
201 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
202 index_dir = os.path.split(index_path)[0]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
203 if not os.path.isdir(index_dir):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
204 LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
205 if not os.path.exists(index_dir):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
206 os.makedirs(index_dir)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
207 cmd1 = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
208 'bowtie2-build',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
209 reference,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
210 index_path
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
211 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
212 subprocess_util.run_subprocess(cmd1)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
213
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
214 cmd2 = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
215 'bowtie2',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
216 '--score-min L,1,-0.5',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
217 '--np 5',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
218 '-x', index_path,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
219 '-U', reads,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
220 '-S', os.path.join(temp_dir, 'reads.sam')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
221 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
222 subprocess_util.run_subprocess(cmd2)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
223
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
224 cmd3 = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
225 definitions.SAMTOOLS, 'view',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
226 '-F 4',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
227 '-q 1',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
228 '-bS', os.path.join(temp_dir, 'reads.sam'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
229 '-o', os.path.join(temp_dir, 'reads.bam')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
230 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
231 subprocess_util.run_subprocess(cmd3)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
232 cmd4 = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
233 definitions.SAMTOOLS, 'sort',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
234 os.path.join(temp_dir, 'reads.bam'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
235 '-o', os.path.join(temp_dir, 'reads.sorted.bam'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
236 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
237 subprocess_util.run_subprocess(cmd4)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
238
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
239 shell_cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
240 definitions.SAMTOOLS+' mpileup -uf', # mpileup
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
241 reference,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
242 os.path.join(temp_dir, 'reads.sorted.bam'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
243 '|',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
244 'bcftools call -c', # variant calling
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
245 '|',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
246 'vcfutils.pl vcf2fq', # vcf to fq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
247 '|',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
248 'seqtk seq -A -', # fq to fasta
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
249 '>',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
250 output
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
251 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
252 subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
253 return split_mapped_output(output)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
254
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
255 def split_mapped_output(file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
256 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
257 Splits given fasta files into two file based on 'lcl' tags
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
258 in the seq header
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
259
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
260 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
261 file (str): path to input fasta file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
262
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
263 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
264 (str): path to ecoli identification fasta seq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
265 (str): path to serotype prediction fasta seq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
266 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
267 identif_file = os.path.splitext(file)[0]+'.iden.fasta'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
268 predict_file = os.path.splitext(file)[0]+'.pred.fasta'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
269 identif_seqs = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
270 predict_seqs = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
271 for record in SeqIO.parse(file, 'fasta'):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
272 if 'lcl' in record.description:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
273 identif_seqs.append(record)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
274 else:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
275 predict_seqs.append(record)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
276 with open(identif_file, "w") as output_handle:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
277 SeqIO.write(identif_seqs, output_handle, "fasta")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
278 with open(predict_file, "w") as output_handle:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
279 SeqIO.write(predict_seqs, output_handle, "fasta")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
280 return identif_file, predict_file