Mercurial > repos > jpetteng > ectyper
comparison ecoli_serotyping/genomeFunctions.py @ 6:fe3ceb5c4214 draft
Uploaded
| author | jpetteng |
|---|---|
| date | Fri, 05 Jan 2018 15:43:14 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:a202cc394af8 | 6:fe3ceb5c4214 |
|---|---|
| 1 ''' | |
| 2 Genome Utilities | |
| 3 ''' | |
| 4 #!/usr/bin/env python | |
| 5 | |
| 6 import logging | |
| 7 import os | |
| 8 import re | |
| 9 import tempfile | |
| 10 from tarfile import is_tarfile | |
| 11 | |
| 12 from Bio import SeqIO | |
| 13 from ectyper import definitions, subprocess_util | |
| 14 | |
| 15 LOG = logging.getLogger(__name__) | |
| 16 | |
| 17 | |
| 18 def get_files_as_list(file_or_directory): | |
| 19 """ | |
| 20 Creates a list of files from either the given file, or all files within the | |
| 21 directory specified (where each file name is its absolute path). | |
| 22 | |
| 23 Args: | |
| 24 file_or_directory (str): file or directory name given on commandline | |
| 25 | |
| 26 Returns: | |
| 27 files_list (list(str)): List of all the files found. | |
| 28 | |
| 29 """ | |
| 30 | |
| 31 files_list = [] | |
| 32 if file_or_directory == '': | |
| 33 return files_list | |
| 34 | |
| 35 if os.path.isdir(file_or_directory): | |
| 36 LOG.info("Gathering genomes from directory " + file_or_directory) | |
| 37 | |
| 38 # Create a list containing the file names | |
| 39 for root, dirs, files in os.walk(file_or_directory): | |
| 40 for filename in files: | |
| 41 files_list.append(os.path.join(root, filename)) | |
| 42 # check if input is concatenated file locations | |
| 43 elif ',' in file_or_directory: | |
| 44 LOG.info("Using genomes in the input list") | |
| 45 for filename in file_or_directory.split(','): | |
| 46 files_list.append(os.path.abspath(filename)) | |
| 47 else: | |
| 48 LOG.info("Using genomes in file " + file_or_directory) | |
| 49 files_list.append(os.path.abspath(file_or_directory)) | |
| 50 | |
| 51 return sorted(files_list) | |
| 52 | |
| 53 | |
| 54 def get_valid_format(file): | |
| 55 """ | |
| 56 Check using SeqIO if files are valid fasta/fastq format, returns the format. | |
| 57 | |
| 58 Args: | |
| 59 file (str): path of file | |
| 60 | |
| 61 Returns: | |
| 62 fm (str or None): the file format if 'fasta' or 'fastq', otherwise None | |
| 63 """ | |
| 64 for fm in ['fastq', 'fasta']: | |
| 65 try: | |
| 66 with open(file, "r") as handle: | |
| 67 data = SeqIO.parse(handle, fm) | |
| 68 if any(data): | |
| 69 if is_tarfile(file): | |
| 70 LOG.warning("Compressed file is not supported: {}".format(file)) | |
| 71 return None | |
| 72 return fm | |
| 73 except FileNotFoundError as err: | |
| 74 LOG.warning("{0} is not found".format(file)) | |
| 75 return None | |
| 76 except UnicodeDecodeError as err: | |
| 77 LOG.warning("{0} is not a valid file".format(file)) | |
| 78 return None | |
| 79 except: | |
| 80 LOG.warning("{0} is an unexpected file".format(file)) | |
| 81 return None | |
| 82 LOG.warning("{0} is not a fasta/fastq file".format(file)) | |
| 83 return None | |
| 84 | |
| 85 | |
| 86 def get_genome_names_from_files(files, temp_dir): | |
| 87 """ | |
| 88 For each file: | |
| 89 Takes the first header from a fasta file and sends it to the get_genome_name | |
| 90 function. Returns the name of the genome. If the name of the file is to be | |
| 91 used as the genome name, creates a temporary file using >lcl|filename as the | |
| 92 first part of the header. | |
| 93 | |
| 94 Args: | |
| 95 files (list): The list of files to get the genome names for | |
| 96 tempdir (str): A tempdir where the copied files will be stored | |
| 97 | |
| 98 Returns: | |
| 99 tuple(list(str), list(str)): first list is genome names, second list is file names | |
| 100 """ | |
| 101 | |
| 102 list_of_genomes = [] | |
| 103 list_of_files = [] | |
| 104 for file in files: | |
| 105 # Always to use the filename as the genome name. | |
| 106 # This means we also need to create a new file adding | |
| 107 # the filename to each of the headers, so that downstream applications | |
| 108 # (eg. BLAST) can be used with the filename as genome name. | |
| 109 | |
| 110 # get only the name of the file for use in the fasta header | |
| 111 file_base_name = os.path.basename(file) | |
| 112 file_path_name = os.path.splitext(file_base_name)[0] | |
| 113 n_name = file_path_name.replace(' ', '_') | |
| 114 | |
| 115 # create a new file for the updated fasta headers | |
| 116 new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name | |
| 117 | |
| 118 # add the new name to the list of files and genomes | |
| 119 list_of_files.append(new_file) | |
| 120 list_of_genomes.append(n_name) | |
| 121 | |
| 122 with open(new_file, "w") as outfile: | |
| 123 with open(file) as infile: | |
| 124 for record in SeqIO.parse(infile, "fasta"): | |
| 125 outfile.write(">lcl|" + n_name + "|" + record.description + "\n") | |
| 126 outfile.write(str(record.seq) + "\n") | |
| 127 | |
| 128 return list_of_genomes, list_of_files | |
| 129 | |
| 130 | |
| 131 def get_genome_name(header): | |
| 132 """ | |
| 133 Getting the name of the genome by hierarchy. This requires reading the first | |
| 134 fasta header from the file. It also assumes a single genome per file. | |
| 135 | |
| 136 Args: | |
| 137 header (str): The header containing the record. | |
| 138 | |
| 139 Returns: | |
| 140 genomeName (str): Name of the genome contained in the header. | |
| 141 """ | |
| 142 | |
| 143 re_patterns = ( | |
| 144 # Look for lcl followed by the possible genome name | |
| 145 re.compile('(lcl\|[\w\-\.]+)'), | |
| 146 | |
| 147 # Look for contigs in the wwwwdddddd format | |
| 148 re.compile('([A-Za-z]{4}\d{2})\d{6}'), | |
| 149 | |
| 150 # Look for a possible genome name at the beginning of the record ID | |
| 151 re.compile('^(\w{8}\.\d)'), | |
| 152 | |
| 153 # Look for ref, gb, emb or dbj followed by the possible genome name | |
| 154 re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'), | |
| 155 | |
| 156 # Look for gi followed by the possible genome name | |
| 157 re.compile('(gi\|\d{8})'), | |
| 158 | |
| 159 | |
| 160 # Look for name followed by space, then description | |
| 161 re.compile('^([\w\-\.]+)\s+[\w\-\.]+') | |
| 162 ) | |
| 163 | |
| 164 # if nothing matches, use the full header as genome_name | |
| 165 genome_name = header | |
| 166 for rep in re_patterns: | |
| 167 m = rep.search(header) | |
| 168 | |
| 169 if m: | |
| 170 genome_name = m.group(1) | |
| 171 break | |
| 172 | |
| 173 return str(genome_name) | |
| 174 | |
| 175 def assemble_reads(reads, reference, temp_dir): | |
| 176 ''' | |
| 177 Return path to assembled reads. | |
| 178 Assemble short reads by mapping to a reference genome. | |
| 179 Default output is the same as reads file | |
| 180 (basename+'iden.fasta' and basename+'pred.fasta'). | |
| 181 | |
| 182 Args: | |
| 183 reads (str): FASTQ/FQ format reads file | |
| 184 reference (str): FASTA format reference file | |
| 185 temp_dir (str): temp_dir for storing assembled files | |
| 186 | |
| 187 Returns: | |
| 188 tuple(str, str): identifcation and prediction fasta file | |
| 189 ''' | |
| 190 output = os.path.join( | |
| 191 temp_dir, | |
| 192 os.path.splitext(os.path.basename(reads))[0]+'.fasta' | |
| 193 ) | |
| 194 | |
| 195 # run once if index reference does not exist | |
| 196 index_path = \ | |
| 197 os.path.join( | |
| 198 definitions.DATA_DIR, | |
| 199 'bowtie_index', | |
| 200 os.path.splitext(os.path.basename(reference))[0] | |
| 201 ) | |
| 202 index_dir = os.path.split(index_path)[0] | |
| 203 if not os.path.isdir(index_dir): | |
| 204 LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir)) | |
| 205 if not os.path.exists(index_dir): | |
| 206 os.makedirs(index_dir) | |
| 207 cmd1 = [ | |
| 208 'bowtie2-build', | |
| 209 reference, | |
| 210 index_path | |
| 211 ] | |
| 212 subprocess_util.run_subprocess(cmd1) | |
| 213 | |
| 214 cmd2 = [ | |
| 215 'bowtie2', | |
| 216 '--score-min L,1,-0.5', | |
| 217 '--np 5', | |
| 218 '-x', index_path, | |
| 219 '-U', reads, | |
| 220 '-S', os.path.join(temp_dir, 'reads.sam') | |
| 221 ] | |
| 222 subprocess_util.run_subprocess(cmd2) | |
| 223 | |
| 224 cmd3 = [ | |
| 225 definitions.SAMTOOLS, 'view', | |
| 226 '-F 4', | |
| 227 '-q 1', | |
| 228 '-bS', os.path.join(temp_dir, 'reads.sam'), | |
| 229 '-o', os.path.join(temp_dir, 'reads.bam') | |
| 230 ] | |
| 231 subprocess_util.run_subprocess(cmd3) | |
| 232 cmd4 = [ | |
| 233 definitions.SAMTOOLS, 'sort', | |
| 234 os.path.join(temp_dir, 'reads.bam'), | |
| 235 '-o', os.path.join(temp_dir, 'reads.sorted.bam'), | |
| 236 ] | |
| 237 subprocess_util.run_subprocess(cmd4) | |
| 238 | |
| 239 shell_cmd = [ | |
| 240 definitions.SAMTOOLS+' mpileup -uf', # mpileup | |
| 241 reference, | |
| 242 os.path.join(temp_dir, 'reads.sorted.bam'), | |
| 243 '|', | |
| 244 'bcftools call -c', # variant calling | |
| 245 '|', | |
| 246 'vcfutils.pl vcf2fq', # vcf to fq | |
| 247 '|', | |
| 248 'seqtk seq -A -', # fq to fasta | |
| 249 '>', | |
| 250 output | |
| 251 ] | |
| 252 subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True) | |
| 253 return split_mapped_output(output) | |
| 254 | |
| 255 def split_mapped_output(file): | |
| 256 ''' | |
| 257 Splits given fasta files into two file based on 'lcl' tags | |
| 258 in the seq header | |
| 259 | |
| 260 Args: | |
| 261 file (str): path to input fasta file | |
| 262 | |
| 263 Returns: | |
| 264 (str): path to ecoli identification fasta seq | |
| 265 (str): path to serotype prediction fasta seq | |
| 266 ''' | |
| 267 identif_file = os.path.splitext(file)[0]+'.iden.fasta' | |
| 268 predict_file = os.path.splitext(file)[0]+'.pred.fasta' | |
| 269 identif_seqs = [] | |
| 270 predict_seqs = [] | |
| 271 for record in SeqIO.parse(file, 'fasta'): | |
| 272 if 'lcl' in record.description: | |
| 273 identif_seqs.append(record) | |
| 274 else: | |
| 275 predict_seqs.append(record) | |
| 276 with open(identif_file, "w") as output_handle: | |
| 277 SeqIO.write(identif_seqs, output_handle, "fasta") | |
| 278 with open(predict_file, "w") as output_handle: | |
| 279 SeqIO.write(predict_seqs, output_handle, "fasta") | |
| 280 return identif_file, predict_file |
