Mercurial > repos > jpetteng > ectyper
changeset 6:fe3ceb5c4214 draft
Uploaded
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/blastFunctions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,113 @@ +#!/usr/bin/env python + +""" +Functions for setting up, running, and parsing blast +""" +import logging +import os + +from ectyper import subprocess_util + +LOG = logging.getLogger(__name__) + + +def create_blast_db(filelist, temp_dir): + """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given + Creating a blast DB using the makeblastdb command. + The database is created in the temporary folder of the system. + + Args: + filelist: list of genomes that was given by the user on the command line. + temp_dir: temporary directory to store the blastdb in. + + Returns: + Full path to the DB + """ + blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb') + + LOG.debug("Generating the blast db at {0}".format(blast_db_path)) + cmd = [ + "makeblastdb", + "-in", ' '.join(filelist), + "-dbtype", "nucl", + "-title", "ectyper_blastdb", + "-out", blast_db_path] + subprocess_util.run_subprocess(cmd) + + return blast_db_path + + +def run_blast(query_file, blast_db, args): + """ + Execute a blastn run given the query files and blastdb + + Args: + query_file (str): one or both of the VF / Serotype input files + blast_db (str): validated fasta files from the user, in DB form + args (Namespace object): parsed commadnline options from the user + chunck_size: number of genomes in the database + + Returns: + The blast output file + """ + percent_identity = args.percentIdentity + percent_length = args.percentLength + + LOG.debug('Running blast query {0} against database {1} '.format( + query_file, blast_db)) + + blast_output_file = blast_db + '.output' + + cmd = [ + "blastn", + "-query", query_file, + "-db", blast_db, + "-out", blast_output_file, + '-perc_identity', str(percent_identity), + '-qcov_hsp_perc', str(percent_length), + '-max_hsps', '1', # each allele only need to hit once + # use default max_target_seqs=500 + "-outfmt", + '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp', + "-word_size", "11" + ] + subprocess_util.run_subprocess(cmd) + with open(blast_output_file, mode='rb') as fh: + for line in fh: + LOG.debug(line.decode('ascii')) + return blast_output_file + +def run_blast_for_identification(query_file, blast_db): + """ + Execute a blastn run given the query files and blastdb + with special configuration for high performance identification + + Args: + query_file: one or both of the VF / Serotype input files + blast_db: validated fasta files from the user, in DB form + + Returns: + blast_output_file (str): path to the blast output file + """ + + LOG.debug('Running blast query {0} against database {1} '.format( + query_file, blast_db)) + + blast_output_file = blast_db + '.output' + + cmd = [ + "blastn", + "-query", query_file, + "-db", blast_db, + "-out", blast_output_file, + '-perc_identity', '90', + '-qcov_hsp_perc', '90', + '-max_target_seqs', '1', # we only want to know hit/no hit + # 10 query seq, we want at most 1 hit each + "-outfmt", + '6 qseqid qlen sseqid length pident sstart send sframe', + "-word_size", "11" + ] + subprocess_util.run_subprocess(cmd) + + return blast_output_file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/commandLineOptions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,85 @@ +#!/usr/bin/env python + +import argparse + + +def parse_command_line(args=None): + """ + The options for both the serotyper, and virulence finder. + The returned object is used by both, but the options do not + necessarily apply to both. + + Args: + args: Optional args to be passed to argparse.parse_args() + + Returns: + The populated argparse Namespace + """ + + def check_percentage(value): + """ + type checker for percentage input + """ + ivalue = int(value) + if ivalue <= 0 or ivalue > 100: + raise argparse.ArgumentTypeError( + "{0} is an invalid positive int percentage value".format(value) + ) + return ivalue + + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--input", + help="Location of new file(s). Can be a single file or \ + a directory", + required=True + ) + + parser.add_argument( + "-d", + "--percentIdentity", + type=check_percentage, + help="Percentage of identity wanted to use against the\ + database. From 0 to 100, default is 90%%.", + default=90 + ) + + parser.add_argument( + "-l", + "--percentLength", + type=check_percentage, + help="Percentage of length wanted to use against the \ + database. From 0 to 100, default is 50%%.", + default=50 + ) + + parser.add_argument( + "--verify", + action="store_true", + help="Enable E. Coli. verification" + ) + + parser.add_argument( + "-s", + "--species", + action="store_true", + help="Enable species identification when non-ecoli genome is found\n\ + Note: refseq downloading is required when running this option for the first time." + ) + + parser.add_argument( + '--detailed', + action='store_true', + help='Enable detailed output' + ) + + parser.add_argument( + "-o", + "--output", + help="Directory location of output files." + ) + + if args is None: + return parser.parse_args() + return parser.parse_args(args)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/definitions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,20 @@ +#!/usr/bin/env python + +""" + Definitions for the ectyper project +""" + +import os + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(ROOT_DIR, 'Data') +WORKPLACE_DIR = os.getcwd() + +SEROTYPE_FILE = os.path.join(DATA_DIR, 'ectyper_data.fasta') +SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_dict.json') +COMBINED = os.path.join(DATA_DIR, 'combined.fasta') + +ECOLI_MARKERS = os.path.join(DATA_DIR, 'ecoli_specific_markers.fasta') +SAMTOOLS = 'samtools' +REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt') +REFSEQ_SKETCH = os.path.join(DATA_DIR, 'refseq.genomes.k21s1000.msh')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/ectyper.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,303 @@ +#!/usr/bin/env python +""" + Predictive serotyping for _E. coli_. +""" +import logging +import os +import sys +import tempfile +import datetime +from urllib.request import urlretrieve + +from ectyper import (blastFunctions, commandLineOptions, definitions, + genomeFunctions, loggingFunctions, predictionFunctions, + speciesIdentification) + +LOG_FILE = loggingFunctions.initialize_logging() +LOG = logging.getLogger(__name__) + + +def run_program(): + """ + Wrapper for both the serotyping and virulence finder + The program needs to do the following + (1) Filter genome files based on format + (2) Filter genome files based on species + (3) Map FASTQ files to FASTA seq + (1) Get names of all genomes being tested + (2) Create a BLAST database of those genomes + (3) Query for serotype + (4) Parse the results + (5) Display the results + :return: success or failure + + """ + # Initialize the program + args = commandLineOptions.parse_command_line() + LOG.info('Starting ectyper\nSerotype prediction with input:\n \ + {0}\n \ + Log file is: {1}'.format(args, LOG_FILE)) + LOG.debug(args) + + ## Initialize temporary directories for the scope of this program + with tempfile.TemporaryDirectory() as temp_dir: + temp_files = create_tmp_files(temp_dir, output_dir=args.output) + LOG.debug(temp_files) + + # Download refseq file if speicies identification is enabled + if args.species: + download_refseq() + + LOG.info("Gathering genome files") + raw_genome_files = genomeFunctions.get_files_as_list(args.input) + LOG.debug(raw_genome_files) + + LOG.info("Removing invalid file types") + raw_files_dict = get_raw_files(raw_genome_files) + LOG.debug(raw_files_dict) + + # Assembling fastq + verify ecoli genome + LOG.info("Preparing genome files for blast alignment") + final_fasta_files = filter_for_ecoli_files( + raw_files_dict, temp_files, verify=args.verify, species=args.species + ) + LOG.debug(final_fasta_files) + if len(final_fasta_files) is 0: + LOG.info("No valid genome files. Terminating the program.") + exit(0) + + LOG.info("Standardizing the genome headers") + (all_genomes_list, all_genomes_files) = \ + genomeFunctions.get_genome_names_from_files( + final_fasta_files, temp_files['fasta_temp_dir']) + LOG.debug(all_genomes_list) + LOG.debug(all_genomes_files) + + # Main prediction function + predictions_file = run_prediction(all_genomes_files, args, + temp_files['output_file']) + + # Add empty rows for genomes without blast result + predictions_file = predictionFunctions.add_non_predicted( + all_genomes_list, predictions_file) + LOG.info('Outputs stored in {0}'.format(temp_files['output_dir'])) + + # Store most recent result in working directory + LOG.info('\nReporting result...') + predictionFunctions.report_result(predictions_file) + +def download_refseq(): + '''Download refseq file with progress bar + ''' + + + def reporthook(blocknum, blocksize, totalsize): + ''' + https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads + ''' + readsofar = blocknum * blocksize + if totalsize > 0: + s = "\r {:5.1%} {:{}d} / {:d}".format( + readsofar/totalsize, readsofar, + len(str(totalsize)), + totalsize + ) + sys.stderr.write(s) + if readsofar >= totalsize: # near the end + sys.stderr.write("\n") + else: # total size is unknown + sys.stderr.write("read {}\n".format(readsofar)) + + if not os.path.isfile(definitions.REFSEQ_SKETCH): + refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' + LOG.info("No refseq found. Downloading reference file for species identification...") + urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook) + LOG.info("Download complete.") + +def create_tmp_files(temp_dir, output_dir=None): + """Create a dictionary of temporary files used by ectyper + + Args: + temp_dir: program scope temporary directory + output_dir(str, optional): + directory to store output + + Return: + a dictionary of temporary files + example: + {'assemble_temp_dir': 'test/temp/assemblies', + 'fasta_temp_dir': 'test/temp/fastas', + 'output_dir': os.path.abspath('output')+'/', + 'output_file': os.path.abspath('output/output.csv')} + """ + + # Get the correct files and directories + files_and_dirs = { + 'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'), + 'fasta_temp_dir': os.path.join(temp_dir, 'fastas'), + } + if output_dir is None: + output_dir = ''.join([ + str(datetime.datetime.now().date()), + '_', + str(datetime.datetime.now().time()).replace(':', '.') + ]) + if os.path.isabs(output_dir): + pass + else: + output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir) + + output_file = os.path.join(output_dir, 'output.csv') + if os.path.isfile(output_file): + os.remove(output_file) + for d in [ + output_dir, files_and_dirs['assemble_temp_dir'], + files_and_dirs['fasta_temp_dir'] + ]: + if not os.path.exists(d): + os.makedirs(d) + + # Finalize the tmp_files dictionary + files_and_dirs['output_file'] = output_file + files_and_dirs['output_dir'] = output_dir + + LOG.info("Temporary files and directory created") + return files_and_dirs + + +def run_prediction(genome_files, args, predictions_file): + '''Core prediction functionality + + Args: + genome_files: + list of genome files + args: + commandline arguments + predictions_file: + filename of prediction output + + Returns: + predictions_file with prediction written in it + ''' + query_file = definitions.SEROTYPE_FILE + ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON + # create a temp dir for blastdb + with tempfile.TemporaryDirectory() as temp_dir: + # Divide genome files into chunks + chunk_size = 50 + genome_chunks = [ + genome_files[i:i + chunk_size] + for i in range(0, len(genome_files), chunk_size) + ] + for index, chunk in enumerate(genome_chunks): + LOG.info("Start creating blast database #{0}".format(index + 1)) + blast_db = blastFunctions.create_blast_db(chunk, temp_dir) + + LOG.info("Start blast alignment on database #{0}".format(index + 1)) + blast_output_file = blastFunctions.run_blast(query_file, blast_db, args) + LOG.info("Start serotype prediction for database #{0}".format(index + 1)) + predictions_file = predictionFunctions.predict_serotype( + blast_output_file, ectyper_dict_file, predictions_file, + args.detailed) + return predictions_file + + +def get_raw_files(raw_files): + """Take all the raw files, and filter not fasta / fastq + + Args: + raw_files(str): list of files from user input + + Returns: + A dictitionary collection of fasta and fastq files + example: + {'raw_fasta_files':[], + 'raw_fastq_files':[]} + """ + fasta_files = [] + fastq_files = [] + + for file in raw_files: + file_format = genomeFunctions.get_valid_format(file) + if file_format == 'fasta': + fasta_files.append(file) + elif file_format == 'fastq': + fastq_files.append(file) + + LOG.debug('raw fasta files: {}'.format(fasta_files)) + LOG.debug('raw fastq files: {}'.format(fastq_files)) + + return({'fasta':fasta_files, 'fastq':fastq_files}) + + +def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False): + """filter ecoli, identify species, assemble fastq + Assemble fastq files to fasta files, + then filter all files by reference method if verify is enabled, + if identified as non-ecoli, identify species by mash method if species is enabled. + + Args: + raw_dict{fasta:list_of_files, fastq:list_of_files}: + dictionary collection of fasta and fastq files + temp_file: temporary directory + verify(bool): + whether to perform ecoli verification + species(bool): + whether to perform species identification for non-ecoli genome + Returns: + List of filtered and assembled genome files in fasta format + """ + final_files = [] + for f in raw_dict.keys(): + temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir'] + + for ffile in raw_dict[f]: + filtered_file = filter_file_by_species( + ffile, f, temp_dir, verify=verify, species=species) + if filtered_file is not None and \ + genomeFunctions.get_valid_format(filtered_file) is not None: + final_files.append(filtered_file) + + LOG.info('{} final fasta files'.format(len(final_files))) + return final_files + +def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False): + """filter ecoli, identify species, assemble fastq + Assemble fastq file to fasta file, + then filter the file by reference method if verify is enabled, + if identified as non-ecoli, identify species by mash method if species is enabled. + + Args: + genome_file: input genome file + genome_format(str): fasta or fastq + temp_file: temporary directory + verify(bool): + whether to perform ecoli verification + species(bool): + whether to perform species identification for non-ecoli genome + Returns: + The filtered and assembled genome files in fasta format + """ + combined_file = definitions.COMBINED + filtered_file = None + if genome_format == 'fastq': + iden_file, pred_file = \ + genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir) + # If no alignment resut, the file is definitely not E.Coli + if genomeFunctions.get_valid_format(iden_file) is None: + LOG.warning( + "{} is filtered out because no identification alignment found".format(genome_file)) + return filtered_file + if not (verify or species) or speciesIdentification.is_ecoli_genome( + iden_file, genome_file, mash=species): + # final check before adding the alignment for prediction + if genomeFunctions.get_valid_format(iden_file) != 'fasta': + LOG.warning( + "{0} is filtered out because no prediction alignment found".format(genome_file)) + return filtered_file + filtered_file = pred_file + if genome_format == 'fasta': + if not (verify or species) \ + or speciesIdentification.is_ecoli_genome(genome_file, mash=species): + filtered_file = genome_file + return filtered_file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/genomeFunctions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,280 @@ +''' +Genome Utilities +''' +#!/usr/bin/env python + +import logging +import os +import re +import tempfile +from tarfile import is_tarfile + +from Bio import SeqIO +from ectyper import definitions, subprocess_util + +LOG = logging.getLogger(__name__) + + +def get_files_as_list(file_or_directory): + """ + Creates a list of files from either the given file, or all files within the + directory specified (where each file name is its absolute path). + + Args: + file_or_directory (str): file or directory name given on commandline + + Returns: + files_list (list(str)): List of all the files found. + + """ + + files_list = [] + if file_or_directory == '': + return files_list + + if os.path.isdir(file_or_directory): + LOG.info("Gathering genomes from directory " + file_or_directory) + + # Create a list containing the file names + for root, dirs, files in os.walk(file_or_directory): + for filename in files: + files_list.append(os.path.join(root, filename)) + # check if input is concatenated file locations + elif ',' in file_or_directory: + LOG.info("Using genomes in the input list") + for filename in file_or_directory.split(','): + files_list.append(os.path.abspath(filename)) + else: + LOG.info("Using genomes in file " + file_or_directory) + files_list.append(os.path.abspath(file_or_directory)) + + return sorted(files_list) + + +def get_valid_format(file): + """ + Check using SeqIO if files are valid fasta/fastq format, returns the format. + + Args: + file (str): path of file + + Returns: + fm (str or None): the file format if 'fasta' or 'fastq', otherwise None + """ + for fm in ['fastq', 'fasta']: + try: + with open(file, "r") as handle: + data = SeqIO.parse(handle, fm) + if any(data): + if is_tarfile(file): + LOG.warning("Compressed file is not supported: {}".format(file)) + return None + return fm + except FileNotFoundError as err: + LOG.warning("{0} is not found".format(file)) + return None + except UnicodeDecodeError as err: + LOG.warning("{0} is not a valid file".format(file)) + return None + except: + LOG.warning("{0} is an unexpected file".format(file)) + return None + LOG.warning("{0} is not a fasta/fastq file".format(file)) + return None + + +def get_genome_names_from_files(files, temp_dir): + """ + For each file: + Takes the first header from a fasta file and sends it to the get_genome_name + function. Returns the name of the genome. If the name of the file is to be + used as the genome name, creates a temporary file using >lcl|filename as the + first part of the header. + + Args: + files (list): The list of files to get the genome names for + tempdir (str): A tempdir where the copied files will be stored + + Returns: + tuple(list(str), list(str)): first list is genome names, second list is file names + """ + + list_of_genomes = [] + list_of_files = [] + for file in files: + # Always to use the filename as the genome name. + # This means we also need to create a new file adding + # the filename to each of the headers, so that downstream applications + # (eg. BLAST) can be used with the filename as genome name. + + # get only the name of the file for use in the fasta header + file_base_name = os.path.basename(file) + file_path_name = os.path.splitext(file_base_name)[0] + n_name = file_path_name.replace(' ', '_') + + # create a new file for the updated fasta headers + new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name + + # add the new name to the list of files and genomes + list_of_files.append(new_file) + list_of_genomes.append(n_name) + + with open(new_file, "w") as outfile: + with open(file) as infile: + for record in SeqIO.parse(infile, "fasta"): + outfile.write(">lcl|" + n_name + "|" + record.description + "\n") + outfile.write(str(record.seq) + "\n") + + return list_of_genomes, list_of_files + + +def get_genome_name(header): + """ + Getting the name of the genome by hierarchy. This requires reading the first + fasta header from the file. It also assumes a single genome per file. + + Args: + header (str): The header containing the record. + + Returns: + genomeName (str): Name of the genome contained in the header. + """ + + re_patterns = ( + # Look for lcl followed by the possible genome name + re.compile('(lcl\|[\w\-\.]+)'), + + # Look for contigs in the wwwwdddddd format + re.compile('([A-Za-z]{4}\d{2})\d{6}'), + + # Look for a possible genome name at the beginning of the record ID + re.compile('^(\w{8}\.\d)'), + + # Look for ref, gb, emb or dbj followed by the possible genome name + re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'), + + # Look for gi followed by the possible genome name + re.compile('(gi\|\d{8})'), + + + # Look for name followed by space, then description + re.compile('^([\w\-\.]+)\s+[\w\-\.]+') + ) + + # if nothing matches, use the full header as genome_name + genome_name = header + for rep in re_patterns: + m = rep.search(header) + + if m: + genome_name = m.group(1) + break + + return str(genome_name) + +def assemble_reads(reads, reference, temp_dir): + ''' + Return path to assembled reads. + Assemble short reads by mapping to a reference genome. + Default output is the same as reads file + (basename+'iden.fasta' and basename+'pred.fasta'). + + Args: + reads (str): FASTQ/FQ format reads file + reference (str): FASTA format reference file + temp_dir (str): temp_dir for storing assembled files + + Returns: + tuple(str, str): identifcation and prediction fasta file + ''' + output = os.path.join( + temp_dir, + os.path.splitext(os.path.basename(reads))[0]+'.fasta' + ) + + # run once if index reference does not exist + index_path = \ + os.path.join( + definitions.DATA_DIR, + 'bowtie_index', + os.path.splitext(os.path.basename(reference))[0] + ) + index_dir = os.path.split(index_path)[0] + if not os.path.isdir(index_dir): + LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir)) + if not os.path.exists(index_dir): + os.makedirs(index_dir) + cmd1 = [ + 'bowtie2-build', + reference, + index_path + ] + subprocess_util.run_subprocess(cmd1) + + cmd2 = [ + 'bowtie2', + '--score-min L,1,-0.5', + '--np 5', + '-x', index_path, + '-U', reads, + '-S', os.path.join(temp_dir, 'reads.sam') + ] + subprocess_util.run_subprocess(cmd2) + + cmd3 = [ + definitions.SAMTOOLS, 'view', + '-F 4', + '-q 1', + '-bS', os.path.join(temp_dir, 'reads.sam'), + '-o', os.path.join(temp_dir, 'reads.bam') + ] + subprocess_util.run_subprocess(cmd3) + cmd4 = [ + definitions.SAMTOOLS, 'sort', + os.path.join(temp_dir, 'reads.bam'), + '-o', os.path.join(temp_dir, 'reads.sorted.bam'), + ] + subprocess_util.run_subprocess(cmd4) + + shell_cmd = [ + definitions.SAMTOOLS+' mpileup -uf', # mpileup + reference, + os.path.join(temp_dir, 'reads.sorted.bam'), + '|', + 'bcftools call -c', # variant calling + '|', + 'vcfutils.pl vcf2fq', # vcf to fq + '|', + 'seqtk seq -A -', # fq to fasta + '>', + output + ] + subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True) + return split_mapped_output(output) + +def split_mapped_output(file): + ''' + Splits given fasta files into two file based on 'lcl' tags + in the seq header + + Args: + file (str): path to input fasta file + + Returns: + (str): path to ecoli identification fasta seq + (str): path to serotype prediction fasta seq + ''' + identif_file = os.path.splitext(file)[0]+'.iden.fasta' + predict_file = os.path.splitext(file)[0]+'.pred.fasta' + identif_seqs = [] + predict_seqs = [] + for record in SeqIO.parse(file, 'fasta'): + if 'lcl' in record.description: + identif_seqs.append(record) + else: + predict_seqs.append(record) + with open(identif_file, "w") as output_handle: + SeqIO.write(identif_seqs, output_handle, "fasta") + with open(predict_file, "w") as output_handle: + SeqIO.write(predict_seqs, output_handle, "fasta") + return identif_file, predict_file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/loggingFunctions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,45 @@ +#!/usr/bin/env python +""" + Set up the logging +""" + +import logging +import os + +import ectyper.definitions as D + + +def initialize_logging(): + """ + Set up the screen and file logging. + + Args: + None + + Returns: + log_file (str): The log filename + """ + + # set up DEBUG logging to file, INFO logging to STDERR + log_file = os.path.join(D.WORKPLACE_DIR, 'ectyper.log') + + formatter = logging.Formatter( + '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') + + # logging to file + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', + datefmt='%m-%d %H:%M', + filename=log_file, + filemode='w') + + # define a Handler which writes INFO messages or higher to the sys.stderr + console = logging.StreamHandler() + console.setFormatter(formatter) + console.setLevel(logging.INFO) + + # add the handler to the root logger + logging.getLogger('').addHandler(console) + + return log_file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/predictionFunctions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,231 @@ +#!/usr/bin/env python + +import json +import logging +import os +from collections import defaultdict + +import pandas as pd + +LOG = logging.getLogger(__name__) + +""" + Serotype prediction for E. coli +""" + +def predict_serotype(blast_output_file, ectyper_dict_file, predictions_file, detailed=False): + """Make serotype prediction for all genomes based on blast output + + Args: + blast_output_file(str): + blastn output with outfmt: + "6 qseqid qlen sseqid length pident sstart send sframe qcovhsp -word_size 11" + ectyper_dict_file(str): + mapping file used to associate allele id to allele informations + predictions_file(str): + csv file to store result + detailed(bool, optional): + whether to generate detailed output or not + + Returns: + predictions_file + """ + basename, extension = os.path.splitext(predictions_file) + parsed_output_file = ''.join([basename, '_raw', extension]) + + LOG.info("Predicting serotype from blast output") + output_df = blast_output_to_df(blast_output_file) + ectyper_df = ectyper_dict_to_df(ectyper_dict_file) + # Merge output_df and ectyper_df + output_df = output_df.merge(ectyper_df, left_on='qseqid', right_on='name', how='left') + predictions_dict = {} + # Select individual genomes + output_df['genome_name'] = output_df['sseqid'].str.split('|').str[1] + # Initialize constants + gene_pairs = {'wzx':'wzy', 'wzy':'wzx', 'wzm':'wzt', 'wzt':'wzm'} + predictions_columns = ['O_prediction', 'O_info', 'H_prediction', 'H_info'] + gene_list = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA'] + if detailed: + # Add gene lists for detailed output report + for gene in gene_list: + predictions_columns.append(gene) + for genome_name, per_genome_df in output_df.groupby('genome_name'): + # Make prediction for each genome based on blast output + predictions_dict[genome_name] = get_prediction( + per_genome_df, predictions_columns, gene_pairs, detailed) + predictions_df = pd.DataFrame(predictions_dict).transpose() + if predictions_df.empty: + predictions_df = pd.DataFrame(columns=predictions_columns) + predictions_df = predictions_df[predictions_columns] + store_df(output_df, parsed_output_file) + store_df(predictions_df, predictions_file) + LOG.info("Serotype prediction completed") + return predictions_file + +def get_prediction(per_genome_df, predictions_columns, gene_pairs, detailed, ): + """Make serotype prediction for single genomes based on blast output + + Args: + per_genome_df(DataFrame): + blastn outputs for the given genome + predictions_columns(dict): + columns to be filled by prediction function + gene_pairs(dict): + dict of pair genes used for paired logic + detailed(bool): + whether to generate detailed output or not + Return: + Prediction dictionary for the given genome + """ + # Extract potential predictors + useful_columns = [ + 'gene', 'serotype', 'score', 'name', 'desc', 'pident', 'qcovhsp', 'qseqid', 'sseqid' + ] + per_genome_df = per_genome_df.sort_values(['gene', 'serotype', 'score'], ascending=False) + per_genome_df = per_genome_df[~per_genome_df.duplicated(['gene', 'serotype'])] + predictors_df = per_genome_df[useful_columns] + predictors_df = predictors_df.sort_values('score', ascending=False) + predictions = {} + for column in predictions_columns: + predictions[column] = None + for predicting_antigen in ['O', 'H']: + genes_pool = defaultdict(list) + for index, row in predictors_df.iterrows(): + gene = row['gene'] + if detailed: + predictions[gene] = True + if not predictions[predicting_antigen+'_prediction']: + serotype = row['serotype'] + if serotype[0] is not predicting_antigen: + continue + genes_pool[gene].append(serotype) + prediction = None + if len(serotype) < 1: + continue + antigen = serotype[0].upper() + if antigen != predicting_antigen: + continue + if gene in gene_pairs.keys(): + predictions[antigen+'_info'] = 'Only unpaired alignments found' + # Pair gene logic + potential_pairs = genes_pool.get(gene_pairs.get(gene)) + if potential_pairs is None: + continue + if serotype in potential_pairs: + prediction = serotype + else: + # Normal logic + prediction = serotype + if prediction is None: + continue + predictions[antigen+'_info'] = 'Alignment found' + predictions[predicting_antigen+'_prediction'] = prediction + # No alignment found + ## Make prediction based on non-paired gene + ## if only one non-paired gene is avaliable + if predictions.get(predicting_antigen+'_prediction') is None: + if len(genes_pool) == 1: + serotypes = list(genes_pool.values())[0] + if len(serotypes) == 1: + predictions[antigen+'_info'] = 'Lone unpaired alignment found' + predictions[predicting_antigen+'_prediction'] = serotypes[0] + return predictions + +def blast_output_to_df(blast_output_file): + '''Convert raw blast output file to DataFrame + Args: + blast_output_file(str): location of blast output + + Returns: + DataFrame: + DataFrame that contains all informations from blast output + ''' + # Load blast output + output_data = [] + with open(blast_output_file, 'r') as fh: + for line in fh: + fields = line.strip().split() + entry = { + 'qseqid': fields[0], + 'qlen': fields[1], + 'sseqid': fields[2], + 'length': fields[3], + 'pident': fields[4], + 'sstart': fields[5], + 'send': fields[6], + 'sframe': fields[7], + 'qcovhsp': fields[8] + } + output_data.append(entry) + df = pd.DataFrame(output_data) + if not output_data: + LOG.info("No hit found for this blast query") + # Return empty dataframe with correct columns + return pd.DataFrame( + columns=[ + 'length', 'pident', 'qcovhsp', + 'qlen', 'qseqid', 'send', + 'sframe', 'sseqid', 'sstart' + ]) + df['score'] = df['pident'].astype(float)*df['qcovhsp'].astype(float)/10000 + return df + +def ectyper_dict_to_df(ectyper_dict_file): + # Read ectyper dict + with open(ectyper_dict_file) as fh: + ectyper_dict = json.load(fh) + temp_list = [] + for antigen, alleles in ectyper_dict.items(): + for name, allele in alleles.items(): + new_entry = { + 'serotype': allele.get('allele'), + 'name': name, + 'gene': allele.get('gene'), + 'desc': allele.get('desc') + } + temp_list.append(new_entry) + df = pd.DataFrame(temp_list) + return df + +def store_df(src_df, dst_file): + """Append dataframe to a file if it exists, otherwise, make a new file + Args: + src_df(str): dataframe object to be stored + dst_file(str): dst_file to be modified/created + """ + if os.path.isfile(dst_file): + with open(dst_file, 'a') as fh: + src_df.to_csv(fh, header=False) + else: + with open(dst_file, 'w') as fh: + src_df.to_csv(fh, header=True, index_label='genome') + +def report_result(csv_file): + '''Report the content of dataframe in log + + Args: + csv_file(str): location of the prediction file + ''' + df = pd.read_csv(csv_file) + if df.empty: + LOG.info('No prediction was made becuase no alignment was found') + return + LOG.info('\n{0}'.format(df.to_string(index=False))) + +def add_non_predicted(all_genomes_list, predictions_file): + '''Add genomes that do not show up in blast result to prediction file + + Args: + all_genome_list(list): + list of genomes from user input + predictions_file(str): + location of the prediction file + Returns: + str: location of the prediction file + ''' + df = pd.read_csv(predictions_file) + df = df.merge(pd.DataFrame(all_genomes_list, columns=['genome']), on='genome', how='outer') + df.fillna({'O_info':'No alignment found', 'H_info':'No alignment found'}, inplace=True) + df.fillna('-', inplace=True) + df.to_csv(predictions_file, index=False) + return predictions_file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/speciesIdentification.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,152 @@ +import logging +import multiprocessing +import os +import tempfile +from Bio import SeqIO + +from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util + +LOG = logging.getLogger(__name__) + +def is_ecoli_genome(iden_file, genome_file=None, mash=False): + ''' + Return True if file is classified as ecoli by ecoli markers, otherwise False + + Args: + iden_file (str): path to valid fasta genome file + genome_file (str): Optional path to valid fastq file for reads + mash (bool): Optional input to decide whether to use mash if genome is + identified as non-ecoli + + Returns: + bool: True if iden_file is ecoli, False otherwise + ''' + if genome_file is None: + genome_file = iden_file + num_hit = get_num_hits(iden_file) + if num_hit < 3: + LOG.info( + "{0} is identified as " + "an invalid e.coli genome file " + "by marker approach".format(os.path.basename(iden_file))) + if mash: + species = get_species(genome_file) + LOG.info( + "{0} is identified as genome of " + "{1} by mash approach".format(os.path.basename(iden_file), species)) + return False + LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file))) + return True + +def get_num_hits(target): + ''' + Return number of matching hits when query the reference genome + on the target genome + + Args: + target (str): target genome + + Returns: + int: number of hits found + ''' + num_hit = 0 + try: + with tempfile.TemporaryDirectory() as temp_dir: + blast_db = blastFunctions.create_blast_db([target], temp_dir) + result = blastFunctions.run_blast_for_identification( + definitions.ECOLI_MARKERS, + blast_db + ) + with open(result) as handler: + LOG.debug("get_num_hits() output:") + for line in handler: + LOG.debug(line) + num_hit += 1 + LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit)) + except SystemExit: + pass + return num_hit + +def get_species(file): + ''' + Given a fasta/fastq file, return the most likely species identification + + Args: + file (str): fasta/fastq file input + + Returns: + str: name of estimated species + ''' + LOG.info("Identifying species for {0}".format(file)) + if not os.path.isfile(definitions.REFSEQ_SKETCH): + LOG.warning("No refseq found.") + return None + species = 'unknown' + if genomeFunctions.get_valid_format(file) == 'fasta': + tmp_file = tempfile.NamedTemporaryFile().name + basename = os.path.basename(file).replace(' ', '_') + with open(tmp_file, 'w') as new_fh: + header = '> {0}\n'.format(basename) + new_fh.write(header) + for record in SeqIO.parse(file, 'fasta'): + new_fh.write(str(record.seq)) + new_fh.write('nnnnnnnnnnnnnnnnnnnn') + try: + species = get_species_helper(tmp_file) + except Exception: + pass + if genomeFunctions.get_valid_format(file) == 'fastq': + species = get_species_helper(file) + return species + +def get_species_helper(file): + ''' + Given a fasta/fastq file with one sequence, return the most likely species + identification + + Args: + file (str): fasta/fastq file input + + Returns: + str: name of estimated species + ''' + species = 'unknown' + cmd = [ + 'mash', 'dist', + file, + definitions.REFSEQ_SKETCH, + '|', + 'sort -gk3 -', + '|', + 'head -1 -' + ] + try: + mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) + ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2]) + cmd = [ + 'grep -E', + ass_acc_num, + definitions.REFSEQ_SUMMARY + ] + summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) + species = summary_output.split('\t')[7] + return species + except Exception as err: + LOG.warning('No matching species found with distance estimation:{0}'.format(err)) + try: + cmd = [ + 'mash screen', + '-w', + '-p', str(multiprocessing.cpu_count()//2), + definitions.REFSEQ_SKETCH, + file, + '| sort -gr - | head -1 -' + ] + screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) + LOG.debug(screen_output.split('\t')) + species = screen_output.split('\t')[5].split('\n')[0] + except Exception as err2: + LOG.warning( + 'No matching species found with distance screening either:{}'.format(err2) + ) + return species
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/subprocess_util.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,34 @@ +''' +Utilities +''' +import logging +import subprocess +import timeit + +LOG = logging.getLogger(__name__) + +def run_subprocess(cmd, is_shell=False): + ''' + Run cmd command on subprocess + + Args: + cmd (str): cmd command + + Returns: + stdout (str): The stdout of cmd + ''' + start_time = timeit.default_timer() + LOG.debug("Running: {0}".format(cmd)) + comp_proc = subprocess.run( + cmd, + shell=is_shell, + universal_newlines=True, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + stderr = comp_proc.stderr + stdout = comp_proc.stdout + elapsed_time = timeit.default_timer() - start_time + LOG.debug("Subprocess finished successfully in {:0.3f} sec.".format(elapsed_time)) + return stdout