comparison ecoli_serotyping/genomeFunctions.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:a202cc394af8 6:fe3ceb5c4214
1 '''
2 Genome Utilities
3 '''
4 #!/usr/bin/env python
5
6 import logging
7 import os
8 import re
9 import tempfile
10 from tarfile import is_tarfile
11
12 from Bio import SeqIO
13 from ectyper import definitions, subprocess_util
14
15 LOG = logging.getLogger(__name__)
16
17
18 def get_files_as_list(file_or_directory):
19 """
20 Creates a list of files from either the given file, or all files within the
21 directory specified (where each file name is its absolute path).
22
23 Args:
24 file_or_directory (str): file or directory name given on commandline
25
26 Returns:
27 files_list (list(str)): List of all the files found.
28
29 """
30
31 files_list = []
32 if file_or_directory == '':
33 return files_list
34
35 if os.path.isdir(file_or_directory):
36 LOG.info("Gathering genomes from directory " + file_or_directory)
37
38 # Create a list containing the file names
39 for root, dirs, files in os.walk(file_or_directory):
40 for filename in files:
41 files_list.append(os.path.join(root, filename))
42 # check if input is concatenated file locations
43 elif ',' in file_or_directory:
44 LOG.info("Using genomes in the input list")
45 for filename in file_or_directory.split(','):
46 files_list.append(os.path.abspath(filename))
47 else:
48 LOG.info("Using genomes in file " + file_or_directory)
49 files_list.append(os.path.abspath(file_or_directory))
50
51 return sorted(files_list)
52
53
54 def get_valid_format(file):
55 """
56 Check using SeqIO if files are valid fasta/fastq format, returns the format.
57
58 Args:
59 file (str): path of file
60
61 Returns:
62 fm (str or None): the file format if 'fasta' or 'fastq', otherwise None
63 """
64 for fm in ['fastq', 'fasta']:
65 try:
66 with open(file, "r") as handle:
67 data = SeqIO.parse(handle, fm)
68 if any(data):
69 if is_tarfile(file):
70 LOG.warning("Compressed file is not supported: {}".format(file))
71 return None
72 return fm
73 except FileNotFoundError as err:
74 LOG.warning("{0} is not found".format(file))
75 return None
76 except UnicodeDecodeError as err:
77 LOG.warning("{0} is not a valid file".format(file))
78 return None
79 except:
80 LOG.warning("{0} is an unexpected file".format(file))
81 return None
82 LOG.warning("{0} is not a fasta/fastq file".format(file))
83 return None
84
85
86 def get_genome_names_from_files(files, temp_dir):
87 """
88 For each file:
89 Takes the first header from a fasta file and sends it to the get_genome_name
90 function. Returns the name of the genome. If the name of the file is to be
91 used as the genome name, creates a temporary file using >lcl|filename as the
92 first part of the header.
93
94 Args:
95 files (list): The list of files to get the genome names for
96 tempdir (str): A tempdir where the copied files will be stored
97
98 Returns:
99 tuple(list(str), list(str)): first list is genome names, second list is file names
100 """
101
102 list_of_genomes = []
103 list_of_files = []
104 for file in files:
105 # Always to use the filename as the genome name.
106 # This means we also need to create a new file adding
107 # the filename to each of the headers, so that downstream applications
108 # (eg. BLAST) can be used with the filename as genome name.
109
110 # get only the name of the file for use in the fasta header
111 file_base_name = os.path.basename(file)
112 file_path_name = os.path.splitext(file_base_name)[0]
113 n_name = file_path_name.replace(' ', '_')
114
115 # create a new file for the updated fasta headers
116 new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
117
118 # add the new name to the list of files and genomes
119 list_of_files.append(new_file)
120 list_of_genomes.append(n_name)
121
122 with open(new_file, "w") as outfile:
123 with open(file) as infile:
124 for record in SeqIO.parse(infile, "fasta"):
125 outfile.write(">lcl|" + n_name + "|" + record.description + "\n")
126 outfile.write(str(record.seq) + "\n")
127
128 return list_of_genomes, list_of_files
129
130
131 def get_genome_name(header):
132 """
133 Getting the name of the genome by hierarchy. This requires reading the first
134 fasta header from the file. It also assumes a single genome per file.
135
136 Args:
137 header (str): The header containing the record.
138
139 Returns:
140 genomeName (str): Name of the genome contained in the header.
141 """
142
143 re_patterns = (
144 # Look for lcl followed by the possible genome name
145 re.compile('(lcl\|[\w\-\.]+)'),
146
147 # Look for contigs in the wwwwdddddd format
148 re.compile('([A-Za-z]{4}\d{2})\d{6}'),
149
150 # Look for a possible genome name at the beginning of the record ID
151 re.compile('^(\w{8}\.\d)'),
152
153 # Look for ref, gb, emb or dbj followed by the possible genome name
154 re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'),
155
156 # Look for gi followed by the possible genome name
157 re.compile('(gi\|\d{8})'),
158
159
160 # Look for name followed by space, then description
161 re.compile('^([\w\-\.]+)\s+[\w\-\.]+')
162 )
163
164 # if nothing matches, use the full header as genome_name
165 genome_name = header
166 for rep in re_patterns:
167 m = rep.search(header)
168
169 if m:
170 genome_name = m.group(1)
171 break
172
173 return str(genome_name)
174
175 def assemble_reads(reads, reference, temp_dir):
176 '''
177 Return path to assembled reads.
178 Assemble short reads by mapping to a reference genome.
179 Default output is the same as reads file
180 (basename+'iden.fasta' and basename+'pred.fasta').
181
182 Args:
183 reads (str): FASTQ/FQ format reads file
184 reference (str): FASTA format reference file
185 temp_dir (str): temp_dir for storing assembled files
186
187 Returns:
188 tuple(str, str): identifcation and prediction fasta file
189 '''
190 output = os.path.join(
191 temp_dir,
192 os.path.splitext(os.path.basename(reads))[0]+'.fasta'
193 )
194
195 # run once if index reference does not exist
196 index_path = \
197 os.path.join(
198 definitions.DATA_DIR,
199 'bowtie_index',
200 os.path.splitext(os.path.basename(reference))[0]
201 )
202 index_dir = os.path.split(index_path)[0]
203 if not os.path.isdir(index_dir):
204 LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir))
205 if not os.path.exists(index_dir):
206 os.makedirs(index_dir)
207 cmd1 = [
208 'bowtie2-build',
209 reference,
210 index_path
211 ]
212 subprocess_util.run_subprocess(cmd1)
213
214 cmd2 = [
215 'bowtie2',
216 '--score-min L,1,-0.5',
217 '--np 5',
218 '-x', index_path,
219 '-U', reads,
220 '-S', os.path.join(temp_dir, 'reads.sam')
221 ]
222 subprocess_util.run_subprocess(cmd2)
223
224 cmd3 = [
225 definitions.SAMTOOLS, 'view',
226 '-F 4',
227 '-q 1',
228 '-bS', os.path.join(temp_dir, 'reads.sam'),
229 '-o', os.path.join(temp_dir, 'reads.bam')
230 ]
231 subprocess_util.run_subprocess(cmd3)
232 cmd4 = [
233 definitions.SAMTOOLS, 'sort',
234 os.path.join(temp_dir, 'reads.bam'),
235 '-o', os.path.join(temp_dir, 'reads.sorted.bam'),
236 ]
237 subprocess_util.run_subprocess(cmd4)
238
239 shell_cmd = [
240 definitions.SAMTOOLS+' mpileup -uf', # mpileup
241 reference,
242 os.path.join(temp_dir, 'reads.sorted.bam'),
243 '|',
244 'bcftools call -c', # variant calling
245 '|',
246 'vcfutils.pl vcf2fq', # vcf to fq
247 '|',
248 'seqtk seq -A -', # fq to fasta
249 '>',
250 output
251 ]
252 subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True)
253 return split_mapped_output(output)
254
255 def split_mapped_output(file):
256 '''
257 Splits given fasta files into two file based on 'lcl' tags
258 in the seq header
259
260 Args:
261 file (str): path to input fasta file
262
263 Returns:
264 (str): path to ecoli identification fasta seq
265 (str): path to serotype prediction fasta seq
266 '''
267 identif_file = os.path.splitext(file)[0]+'.iden.fasta'
268 predict_file = os.path.splitext(file)[0]+'.pred.fasta'
269 identif_seqs = []
270 predict_seqs = []
271 for record in SeqIO.parse(file, 'fasta'):
272 if 'lcl' in record.description:
273 identif_seqs.append(record)
274 else:
275 predict_seqs.append(record)
276 with open(identif_file, "w") as output_handle:
277 SeqIO.write(identif_seqs, output_handle, "fasta")
278 with open(predict_file, "w") as output_handle:
279 SeqIO.write(predict_seqs, output_handle, "fasta")
280 return identif_file, predict_file