6
|
1 '''
|
|
2 Genome Utilities
|
|
3 '''
|
|
4 #!/usr/bin/env python
|
|
5
|
|
6 import logging
|
|
7 import os
|
|
8 import re
|
|
9 import tempfile
|
|
10 from tarfile import is_tarfile
|
|
11
|
|
12 from Bio import SeqIO
|
|
13 from ectyper import definitions, subprocess_util
|
|
14
|
|
15 LOG = logging.getLogger(__name__)
|
|
16
|
|
17
|
|
18 def get_files_as_list(file_or_directory):
|
|
19 """
|
|
20 Creates a list of files from either the given file, or all files within the
|
|
21 directory specified (where each file name is its absolute path).
|
|
22
|
|
23 Args:
|
|
24 file_or_directory (str): file or directory name given on commandline
|
|
25
|
|
26 Returns:
|
|
27 files_list (list(str)): List of all the files found.
|
|
28
|
|
29 """
|
|
30
|
|
31 files_list = []
|
|
32 if file_or_directory == '':
|
|
33 return files_list
|
|
34
|
|
35 if os.path.isdir(file_or_directory):
|
|
36 LOG.info("Gathering genomes from directory " + file_or_directory)
|
|
37
|
|
38 # Create a list containing the file names
|
|
39 for root, dirs, files in os.walk(file_or_directory):
|
|
40 for filename in files:
|
|
41 files_list.append(os.path.join(root, filename))
|
|
42 # check if input is concatenated file locations
|
|
43 elif ',' in file_or_directory:
|
|
44 LOG.info("Using genomes in the input list")
|
|
45 for filename in file_or_directory.split(','):
|
|
46 files_list.append(os.path.abspath(filename))
|
|
47 else:
|
|
48 LOG.info("Using genomes in file " + file_or_directory)
|
|
49 files_list.append(os.path.abspath(file_or_directory))
|
|
50
|
|
51 return sorted(files_list)
|
|
52
|
|
53
|
|
54 def get_valid_format(file):
|
|
55 """
|
|
56 Check using SeqIO if files are valid fasta/fastq format, returns the format.
|
|
57
|
|
58 Args:
|
|
59 file (str): path of file
|
|
60
|
|
61 Returns:
|
|
62 fm (str or None): the file format if 'fasta' or 'fastq', otherwise None
|
|
63 """
|
|
64 for fm in ['fastq', 'fasta']:
|
|
65 try:
|
|
66 with open(file, "r") as handle:
|
|
67 data = SeqIO.parse(handle, fm)
|
|
68 if any(data):
|
|
69 if is_tarfile(file):
|
|
70 LOG.warning("Compressed file is not supported: {}".format(file))
|
|
71 return None
|
|
72 return fm
|
|
73 except FileNotFoundError as err:
|
|
74 LOG.warning("{0} is not found".format(file))
|
|
75 return None
|
|
76 except UnicodeDecodeError as err:
|
|
77 LOG.warning("{0} is not a valid file".format(file))
|
|
78 return None
|
|
79 except:
|
|
80 LOG.warning("{0} is an unexpected file".format(file))
|
|
81 return None
|
|
82 LOG.warning("{0} is not a fasta/fastq file".format(file))
|
|
83 return None
|
|
84
|
|
85
|
|
86 def get_genome_names_from_files(files, temp_dir):
|
|
87 """
|
|
88 For each file:
|
|
89 Takes the first header from a fasta file and sends it to the get_genome_name
|
|
90 function. Returns the name of the genome. If the name of the file is to be
|
|
91 used as the genome name, creates a temporary file using >lcl|filename as the
|
|
92 first part of the header.
|
|
93
|
|
94 Args:
|
|
95 files (list): The list of files to get the genome names for
|
|
96 tempdir (str): A tempdir where the copied files will be stored
|
|
97
|
|
98 Returns:
|
|
99 tuple(list(str), list(str)): first list is genome names, second list is file names
|
|
100 """
|
|
101
|
|
102 list_of_genomes = []
|
|
103 list_of_files = []
|
|
104 for file in files:
|
|
105 # Always to use the filename as the genome name.
|
|
106 # This means we also need to create a new file adding
|
|
107 # the filename to each of the headers, so that downstream applications
|
|
108 # (eg. BLAST) can be used with the filename as genome name.
|
|
109
|
|
110 # get only the name of the file for use in the fasta header
|
|
111 file_base_name = os.path.basename(file)
|
|
112 file_path_name = os.path.splitext(file_base_name)[0]
|
|
113 n_name = file_path_name.replace(' ', '_')
|
|
114
|
|
115 # create a new file for the updated fasta headers
|
|
116 new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
|
|
117
|
|
118 # add the new name to the list of files and genomes
|
|
119 list_of_files.append(new_file)
|
|
120 list_of_genomes.append(n_name)
|
|
121
|
|
122 with open(new_file, "w") as outfile:
|
|
123 with open(file) as infile:
|
|
124 for record in SeqIO.parse(infile, "fasta"):
|
|
125 outfile.write(">lcl|" + n_name + "|" + record.description + "\n")
|
|
126 outfile.write(str(record.seq) + "\n")
|
|
127
|
|
128 return list_of_genomes, list_of_files
|
|
129
|
|
130
|
|
131 def get_genome_name(header):
|
|
132 """
|
|
133 Getting the name of the genome by hierarchy. This requires reading the first
|
|
134 fasta header from the file. It also assumes a single genome per file.
|
|
135
|
|
136 Args:
|
|
137 header (str): The header containing the record.
|
|
138
|
|
139 Returns:
|
|
140 genomeName (str): Name of the genome contained in the header.
|
|
141 """
|
|
142
|
|
143 re_patterns = (
|
|
144 # Look for lcl followed by the possible genome name
|
|
145 re.compile('(lcl\|[\w\-\.]+)'),
|
|
146
|
|
147 # Look for contigs in the wwwwdddddd format
|
|
148 re.compile('([A-Za-z]{4}\d{2})\d{6}'),
|
|
149
|
|
150 # Look for a possible genome name at the beginning of the record ID
|
|
151 re.compile('^(\w{8}\.\d)'),
|
|
152
|
|
153 # Look for ref, gb, emb or dbj followed by the possible genome name
|
|
154 re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'),
|
|
155
|
|
156 # Look for gi followed by the possible genome name
|
|
157 re.compile('(gi\|\d{8})'),
|
|
158
|
|
159
|
|
160 # Look for name followed by space, then description
|
|
161 re.compile('^([\w\-\.]+)\s+[\w\-\.]+')
|
|
162 )
|
|
163
|
|
164 # if nothing matches, use the full header as genome_name
|
|
165 genome_name = header
|
|
166 for rep in re_patterns:
|
|
167 m = rep.search(header)
|
|
168
|
|
169 if m:
|
|
170 genome_name = m.group(1)
|
|
171 break
|
|
172
|
|
173 return str(genome_name)
|
|
174
|
|
175 def assemble_reads(reads, reference, temp_dir):
|
|
176 '''
|
|
177 Return path to assembled reads.
|
|
178 Assemble short reads by mapping to a reference genome.
|
|
179 Default output is the same as reads file
|
|
180 (basename+'iden.fasta' and basename+'pred.fasta').
|
|
181
|
|
182 Args:
|
|
183 reads (str): FASTQ/FQ format reads file
|
|
184 reference (str): FASTA format reference file
|
|
185 temp_dir (str): temp_dir for storing assembled files
|
|
186
|
|
187 Returns:
|
|
188 tuple(str, str): identifcation and prediction fasta file
|
|
189 '''
|
|
190 output = os.path.join(
|
|
191 temp_dir,
|
|
192 os.path.splitext(os.path.basename(reads))[0]+'.fasta'
|
|
193 )
|
|
194
|
|
195 # run once if index reference does not exist
|
|
196 index_path = \
|
|
197 os.path.join(
|
|
198 definitions.DATA_DIR,
|
|
199 'bowtie_index',
|
|
200 os.path.splitext(os.path.basename(reference))[0]
|
|
201 )
|
|
202 index_dir = os.path.split(index_path)[0]
|
|
203 if not os.path.isdir(index_dir):
|
|
204 LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir))
|
|
205 if not os.path.exists(index_dir):
|
|
206 os.makedirs(index_dir)
|
|
207 cmd1 = [
|
|
208 'bowtie2-build',
|
|
209 reference,
|
|
210 index_path
|
|
211 ]
|
|
212 subprocess_util.run_subprocess(cmd1)
|
|
213
|
|
214 cmd2 = [
|
|
215 'bowtie2',
|
|
216 '--score-min L,1,-0.5',
|
|
217 '--np 5',
|
|
218 '-x', index_path,
|
|
219 '-U', reads,
|
|
220 '-S', os.path.join(temp_dir, 'reads.sam')
|
|
221 ]
|
|
222 subprocess_util.run_subprocess(cmd2)
|
|
223
|
|
224 cmd3 = [
|
|
225 definitions.SAMTOOLS, 'view',
|
|
226 '-F 4',
|
|
227 '-q 1',
|
|
228 '-bS', os.path.join(temp_dir, 'reads.sam'),
|
|
229 '-o', os.path.join(temp_dir, 'reads.bam')
|
|
230 ]
|
|
231 subprocess_util.run_subprocess(cmd3)
|
|
232 cmd4 = [
|
|
233 definitions.SAMTOOLS, 'sort',
|
|
234 os.path.join(temp_dir, 'reads.bam'),
|
|
235 '-o', os.path.join(temp_dir, 'reads.sorted.bam'),
|
|
236 ]
|
|
237 subprocess_util.run_subprocess(cmd4)
|
|
238
|
|
239 shell_cmd = [
|
|
240 definitions.SAMTOOLS+' mpileup -uf', # mpileup
|
|
241 reference,
|
|
242 os.path.join(temp_dir, 'reads.sorted.bam'),
|
|
243 '|',
|
|
244 'bcftools call -c', # variant calling
|
|
245 '|',
|
|
246 'vcfutils.pl vcf2fq', # vcf to fq
|
|
247 '|',
|
|
248 'seqtk seq -A -', # fq to fasta
|
|
249 '>',
|
|
250 output
|
|
251 ]
|
|
252 subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True)
|
|
253 return split_mapped_output(output)
|
|
254
|
|
255 def split_mapped_output(file):
|
|
256 '''
|
|
257 Splits given fasta files into two file based on 'lcl' tags
|
|
258 in the seq header
|
|
259
|
|
260 Args:
|
|
261 file (str): path to input fasta file
|
|
262
|
|
263 Returns:
|
|
264 (str): path to ecoli identification fasta seq
|
|
265 (str): path to serotype prediction fasta seq
|
|
266 '''
|
|
267 identif_file = os.path.splitext(file)[0]+'.iden.fasta'
|
|
268 predict_file = os.path.splitext(file)[0]+'.pred.fasta'
|
|
269 identif_seqs = []
|
|
270 predict_seqs = []
|
|
271 for record in SeqIO.parse(file, 'fasta'):
|
|
272 if 'lcl' in record.description:
|
|
273 identif_seqs.append(record)
|
|
274 else:
|
|
275 predict_seqs.append(record)
|
|
276 with open(identif_file, "w") as output_handle:
|
|
277 SeqIO.write(identif_seqs, output_handle, "fasta")
|
|
278 with open(predict_file, "w") as output_handle:
|
|
279 SeqIO.write(predict_seqs, output_handle, "fasta")
|
|
280 return identif_file, predict_file
|