Mercurial > repos > jpetteng > ectyper
diff ecoli_serotyping/speciesIdentification.py @ 6:fe3ceb5c4214 draft
Uploaded
author | jpetteng |
---|---|
date | Fri, 05 Jan 2018 15:43:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/speciesIdentification.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,152 @@ +import logging +import multiprocessing +import os +import tempfile +from Bio import SeqIO + +from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util + +LOG = logging.getLogger(__name__) + +def is_ecoli_genome(iden_file, genome_file=None, mash=False): + ''' + Return True if file is classified as ecoli by ecoli markers, otherwise False + + Args: + iden_file (str): path to valid fasta genome file + genome_file (str): Optional path to valid fastq file for reads + mash (bool): Optional input to decide whether to use mash if genome is + identified as non-ecoli + + Returns: + bool: True if iden_file is ecoli, False otherwise + ''' + if genome_file is None: + genome_file = iden_file + num_hit = get_num_hits(iden_file) + if num_hit < 3: + LOG.info( + "{0} is identified as " + "an invalid e.coli genome file " + "by marker approach".format(os.path.basename(iden_file))) + if mash: + species = get_species(genome_file) + LOG.info( + "{0} is identified as genome of " + "{1} by mash approach".format(os.path.basename(iden_file), species)) + return False + LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file))) + return True + +def get_num_hits(target): + ''' + Return number of matching hits when query the reference genome + on the target genome + + Args: + target (str): target genome + + Returns: + int: number of hits found + ''' + num_hit = 0 + try: + with tempfile.TemporaryDirectory() as temp_dir: + blast_db = blastFunctions.create_blast_db([target], temp_dir) + result = blastFunctions.run_blast_for_identification( + definitions.ECOLI_MARKERS, + blast_db + ) + with open(result) as handler: + LOG.debug("get_num_hits() output:") + for line in handler: + LOG.debug(line) + num_hit += 1 + LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit)) + except SystemExit: + pass + return num_hit + +def get_species(file): + ''' + Given a fasta/fastq file, return the most likely species identification + + Args: + file (str): fasta/fastq file input + + Returns: + str: name of estimated species + ''' + LOG.info("Identifying species for {0}".format(file)) + if not os.path.isfile(definitions.REFSEQ_SKETCH): + LOG.warning("No refseq found.") + return None + species = 'unknown' + if genomeFunctions.get_valid_format(file) == 'fasta': + tmp_file = tempfile.NamedTemporaryFile().name + basename = os.path.basename(file).replace(' ', '_') + with open(tmp_file, 'w') as new_fh: + header = '> {0}\n'.format(basename) + new_fh.write(header) + for record in SeqIO.parse(file, 'fasta'): + new_fh.write(str(record.seq)) + new_fh.write('nnnnnnnnnnnnnnnnnnnn') + try: + species = get_species_helper(tmp_file) + except Exception: + pass + if genomeFunctions.get_valid_format(file) == 'fastq': + species = get_species_helper(file) + return species + +def get_species_helper(file): + ''' + Given a fasta/fastq file with one sequence, return the most likely species + identification + + Args: + file (str): fasta/fastq file input + + Returns: + str: name of estimated species + ''' + species = 'unknown' + cmd = [ + 'mash', 'dist', + file, + definitions.REFSEQ_SKETCH, + '|', + 'sort -gk3 -', + '|', + 'head -1 -' + ] + try: + mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) + ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2]) + cmd = [ + 'grep -E', + ass_acc_num, + definitions.REFSEQ_SUMMARY + ] + summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) + species = summary_output.split('\t')[7] + return species + except Exception as err: + LOG.warning('No matching species found with distance estimation:{0}'.format(err)) + try: + cmd = [ + 'mash screen', + '-w', + '-p', str(multiprocessing.cpu_count()//2), + definitions.REFSEQ_SKETCH, + file, + '| sort -gr - | head -1 -' + ] + screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True) + LOG.debug(screen_output.split('\t')) + species = screen_output.split('\t')[5].split('\n')[0] + except Exception as err2: + LOG.warning( + 'No matching species found with distance screening either:{}'.format(err2) + ) + return species