Mercurial > repos > jpetteng > ectyper
comparison ecoli_serotyping/genomeFunctions.py @ 6:fe3ceb5c4214 draft
Uploaded
author | jpetteng |
---|---|
date | Fri, 05 Jan 2018 15:43:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:a202cc394af8 | 6:fe3ceb5c4214 |
---|---|
1 ''' | |
2 Genome Utilities | |
3 ''' | |
4 #!/usr/bin/env python | |
5 | |
6 import logging | |
7 import os | |
8 import re | |
9 import tempfile | |
10 from tarfile import is_tarfile | |
11 | |
12 from Bio import SeqIO | |
13 from ectyper import definitions, subprocess_util | |
14 | |
15 LOG = logging.getLogger(__name__) | |
16 | |
17 | |
18 def get_files_as_list(file_or_directory): | |
19 """ | |
20 Creates a list of files from either the given file, or all files within the | |
21 directory specified (where each file name is its absolute path). | |
22 | |
23 Args: | |
24 file_or_directory (str): file or directory name given on commandline | |
25 | |
26 Returns: | |
27 files_list (list(str)): List of all the files found. | |
28 | |
29 """ | |
30 | |
31 files_list = [] | |
32 if file_or_directory == '': | |
33 return files_list | |
34 | |
35 if os.path.isdir(file_or_directory): | |
36 LOG.info("Gathering genomes from directory " + file_or_directory) | |
37 | |
38 # Create a list containing the file names | |
39 for root, dirs, files in os.walk(file_or_directory): | |
40 for filename in files: | |
41 files_list.append(os.path.join(root, filename)) | |
42 # check if input is concatenated file locations | |
43 elif ',' in file_or_directory: | |
44 LOG.info("Using genomes in the input list") | |
45 for filename in file_or_directory.split(','): | |
46 files_list.append(os.path.abspath(filename)) | |
47 else: | |
48 LOG.info("Using genomes in file " + file_or_directory) | |
49 files_list.append(os.path.abspath(file_or_directory)) | |
50 | |
51 return sorted(files_list) | |
52 | |
53 | |
54 def get_valid_format(file): | |
55 """ | |
56 Check using SeqIO if files are valid fasta/fastq format, returns the format. | |
57 | |
58 Args: | |
59 file (str): path of file | |
60 | |
61 Returns: | |
62 fm (str or None): the file format if 'fasta' or 'fastq', otherwise None | |
63 """ | |
64 for fm in ['fastq', 'fasta']: | |
65 try: | |
66 with open(file, "r") as handle: | |
67 data = SeqIO.parse(handle, fm) | |
68 if any(data): | |
69 if is_tarfile(file): | |
70 LOG.warning("Compressed file is not supported: {}".format(file)) | |
71 return None | |
72 return fm | |
73 except FileNotFoundError as err: | |
74 LOG.warning("{0} is not found".format(file)) | |
75 return None | |
76 except UnicodeDecodeError as err: | |
77 LOG.warning("{0} is not a valid file".format(file)) | |
78 return None | |
79 except: | |
80 LOG.warning("{0} is an unexpected file".format(file)) | |
81 return None | |
82 LOG.warning("{0} is not a fasta/fastq file".format(file)) | |
83 return None | |
84 | |
85 | |
86 def get_genome_names_from_files(files, temp_dir): | |
87 """ | |
88 For each file: | |
89 Takes the first header from a fasta file and sends it to the get_genome_name | |
90 function. Returns the name of the genome. If the name of the file is to be | |
91 used as the genome name, creates a temporary file using >lcl|filename as the | |
92 first part of the header. | |
93 | |
94 Args: | |
95 files (list): The list of files to get the genome names for | |
96 tempdir (str): A tempdir where the copied files will be stored | |
97 | |
98 Returns: | |
99 tuple(list(str), list(str)): first list is genome names, second list is file names | |
100 """ | |
101 | |
102 list_of_genomes = [] | |
103 list_of_files = [] | |
104 for file in files: | |
105 # Always to use the filename as the genome name. | |
106 # This means we also need to create a new file adding | |
107 # the filename to each of the headers, so that downstream applications | |
108 # (eg. BLAST) can be used with the filename as genome name. | |
109 | |
110 # get only the name of the file for use in the fasta header | |
111 file_base_name = os.path.basename(file) | |
112 file_path_name = os.path.splitext(file_base_name)[0] | |
113 n_name = file_path_name.replace(' ', '_') | |
114 | |
115 # create a new file for the updated fasta headers | |
116 new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name | |
117 | |
118 # add the new name to the list of files and genomes | |
119 list_of_files.append(new_file) | |
120 list_of_genomes.append(n_name) | |
121 | |
122 with open(new_file, "w") as outfile: | |
123 with open(file) as infile: | |
124 for record in SeqIO.parse(infile, "fasta"): | |
125 outfile.write(">lcl|" + n_name + "|" + record.description + "\n") | |
126 outfile.write(str(record.seq) + "\n") | |
127 | |
128 return list_of_genomes, list_of_files | |
129 | |
130 | |
131 def get_genome_name(header): | |
132 """ | |
133 Getting the name of the genome by hierarchy. This requires reading the first | |
134 fasta header from the file. It also assumes a single genome per file. | |
135 | |
136 Args: | |
137 header (str): The header containing the record. | |
138 | |
139 Returns: | |
140 genomeName (str): Name of the genome contained in the header. | |
141 """ | |
142 | |
143 re_patterns = ( | |
144 # Look for lcl followed by the possible genome name | |
145 re.compile('(lcl\|[\w\-\.]+)'), | |
146 | |
147 # Look for contigs in the wwwwdddddd format | |
148 re.compile('([A-Za-z]{4}\d{2})\d{6}'), | |
149 | |
150 # Look for a possible genome name at the beginning of the record ID | |
151 re.compile('^(\w{8}\.\d)'), | |
152 | |
153 # Look for ref, gb, emb or dbj followed by the possible genome name | |
154 re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'), | |
155 | |
156 # Look for gi followed by the possible genome name | |
157 re.compile('(gi\|\d{8})'), | |
158 | |
159 | |
160 # Look for name followed by space, then description | |
161 re.compile('^([\w\-\.]+)\s+[\w\-\.]+') | |
162 ) | |
163 | |
164 # if nothing matches, use the full header as genome_name | |
165 genome_name = header | |
166 for rep in re_patterns: | |
167 m = rep.search(header) | |
168 | |
169 if m: | |
170 genome_name = m.group(1) | |
171 break | |
172 | |
173 return str(genome_name) | |
174 | |
175 def assemble_reads(reads, reference, temp_dir): | |
176 ''' | |
177 Return path to assembled reads. | |
178 Assemble short reads by mapping to a reference genome. | |
179 Default output is the same as reads file | |
180 (basename+'iden.fasta' and basename+'pred.fasta'). | |
181 | |
182 Args: | |
183 reads (str): FASTQ/FQ format reads file | |
184 reference (str): FASTA format reference file | |
185 temp_dir (str): temp_dir for storing assembled files | |
186 | |
187 Returns: | |
188 tuple(str, str): identifcation and prediction fasta file | |
189 ''' | |
190 output = os.path.join( | |
191 temp_dir, | |
192 os.path.splitext(os.path.basename(reads))[0]+'.fasta' | |
193 ) | |
194 | |
195 # run once if index reference does not exist | |
196 index_path = \ | |
197 os.path.join( | |
198 definitions.DATA_DIR, | |
199 'bowtie_index', | |
200 os.path.splitext(os.path.basename(reference))[0] | |
201 ) | |
202 index_dir = os.path.split(index_path)[0] | |
203 if not os.path.isdir(index_dir): | |
204 LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir)) | |
205 if not os.path.exists(index_dir): | |
206 os.makedirs(index_dir) | |
207 cmd1 = [ | |
208 'bowtie2-build', | |
209 reference, | |
210 index_path | |
211 ] | |
212 subprocess_util.run_subprocess(cmd1) | |
213 | |
214 cmd2 = [ | |
215 'bowtie2', | |
216 '--score-min L,1,-0.5', | |
217 '--np 5', | |
218 '-x', index_path, | |
219 '-U', reads, | |
220 '-S', os.path.join(temp_dir, 'reads.sam') | |
221 ] | |
222 subprocess_util.run_subprocess(cmd2) | |
223 | |
224 cmd3 = [ | |
225 definitions.SAMTOOLS, 'view', | |
226 '-F 4', | |
227 '-q 1', | |
228 '-bS', os.path.join(temp_dir, 'reads.sam'), | |
229 '-o', os.path.join(temp_dir, 'reads.bam') | |
230 ] | |
231 subprocess_util.run_subprocess(cmd3) | |
232 cmd4 = [ | |
233 definitions.SAMTOOLS, 'sort', | |
234 os.path.join(temp_dir, 'reads.bam'), | |
235 '-o', os.path.join(temp_dir, 'reads.sorted.bam'), | |
236 ] | |
237 subprocess_util.run_subprocess(cmd4) | |
238 | |
239 shell_cmd = [ | |
240 definitions.SAMTOOLS+' mpileup -uf', # mpileup | |
241 reference, | |
242 os.path.join(temp_dir, 'reads.sorted.bam'), | |
243 '|', | |
244 'bcftools call -c', # variant calling | |
245 '|', | |
246 'vcfutils.pl vcf2fq', # vcf to fq | |
247 '|', | |
248 'seqtk seq -A -', # fq to fasta | |
249 '>', | |
250 output | |
251 ] | |
252 subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True) | |
253 return split_mapped_output(output) | |
254 | |
255 def split_mapped_output(file): | |
256 ''' | |
257 Splits given fasta files into two file based on 'lcl' tags | |
258 in the seq header | |
259 | |
260 Args: | |
261 file (str): path to input fasta file | |
262 | |
263 Returns: | |
264 (str): path to ecoli identification fasta seq | |
265 (str): path to serotype prediction fasta seq | |
266 ''' | |
267 identif_file = os.path.splitext(file)[0]+'.iden.fasta' | |
268 predict_file = os.path.splitext(file)[0]+'.pred.fasta' | |
269 identif_seqs = [] | |
270 predict_seqs = [] | |
271 for record in SeqIO.parse(file, 'fasta'): | |
272 if 'lcl' in record.description: | |
273 identif_seqs.append(record) | |
274 else: | |
275 predict_seqs.append(record) | |
276 with open(identif_file, "w") as output_handle: | |
277 SeqIO.write(identif_seqs, output_handle, "fasta") | |
278 with open(predict_file, "w") as output_handle: | |
279 SeqIO.write(predict_seqs, output_handle, "fasta") | |
280 return identif_file, predict_file |