diff ecoli_serotyping/genomeFunctions.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/genomeFunctions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,280 @@
+'''
+Genome Utilities
+'''
+#!/usr/bin/env python
+
+import logging
+import os
+import re
+import tempfile
+from tarfile import is_tarfile
+
+from Bio import SeqIO
+from ectyper import definitions, subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+
+def get_files_as_list(file_or_directory):
+    """
+    Creates a list of files from either the given file, or all files within the
+    directory specified (where each file name is its absolute path).
+
+    Args:
+        file_or_directory (str): file or directory name given on commandline
+
+    Returns:
+        files_list (list(str)): List of all the files found.
+
+    """
+
+    files_list = []
+    if file_or_directory == '':
+        return files_list
+
+    if os.path.isdir(file_or_directory):
+        LOG.info("Gathering genomes from directory " + file_or_directory)
+
+        # Create a list containing the file names
+        for root, dirs, files in os.walk(file_or_directory):
+            for filename in files:
+                files_list.append(os.path.join(root, filename))
+    # check if input is concatenated file locations
+    elif ',' in file_or_directory:
+        LOG.info("Using genomes in the input list")
+        for filename in file_or_directory.split(','):
+            files_list.append(os.path.abspath(filename))
+    else:
+        LOG.info("Using genomes in file " + file_or_directory)
+        files_list.append(os.path.abspath(file_or_directory))
+
+    return sorted(files_list)
+
+
+def get_valid_format(file):
+    """
+    Check using SeqIO if files are valid fasta/fastq format, returns the format.
+
+    Args:
+        file (str): path of file
+
+    Returns:
+        fm (str or None): the file format if 'fasta' or 'fastq', otherwise None
+    """
+    for fm in ['fastq', 'fasta']:
+        try:
+            with open(file, "r") as handle:
+                data = SeqIO.parse(handle, fm)
+                if any(data):
+                    if is_tarfile(file):
+                        LOG.warning("Compressed file is not supported: {}".format(file))
+                        return None
+                    return fm
+        except FileNotFoundError as err:
+            LOG.warning("{0} is not found".format(file))
+            return None
+        except UnicodeDecodeError as err:
+            LOG.warning("{0} is not a valid file".format(file))
+            return None
+        except:
+            LOG.warning("{0} is an unexpected file".format(file))
+            return None
+    LOG.warning("{0} is not a fasta/fastq file".format(file))
+    return None
+
+
+def get_genome_names_from_files(files, temp_dir):
+    """
+    For each file:
+    Takes the first header from a fasta file and sends it to the get_genome_name
+    function. Returns the name of the genome. If the name of the file is to be
+    used as the genome name, creates a temporary file using >lcl|filename as the
+    first part of the header.
+
+    Args:
+        files (list): The list of files to get the genome names for
+        tempdir (str): A tempdir where the copied files will be stored
+
+    Returns:
+        tuple(list(str), list(str)): first list is genome names, second list is file names
+    """
+
+    list_of_genomes = []
+    list_of_files = []
+    for file in files:
+        # Always to use the filename as the genome name.
+        # This means we also need to create a new file adding
+        # the filename to each of the headers, so that downstream applications
+        # (eg. BLAST) can be used with the filename as genome name.
+
+        # get only the name of the file for use in the fasta header
+        file_base_name = os.path.basename(file)
+        file_path_name = os.path.splitext(file_base_name)[0]
+        n_name = file_path_name.replace(' ', '_')
+
+        # create a new file for the updated fasta headers
+        new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
+
+        # add the new name to the list of files and genomes
+        list_of_files.append(new_file)
+        list_of_genomes.append(n_name)
+
+        with open(new_file, "w") as outfile:
+            with open(file) as infile:
+                for record in SeqIO.parse(infile, "fasta"):
+                    outfile.write(">lcl|" + n_name + "|" + record.description + "\n")
+                    outfile.write(str(record.seq) + "\n")
+
+    return list_of_genomes, list_of_files
+
+
+def get_genome_name(header):
+    """
+    Getting the name of the genome by hierarchy. This requires reading the first
+    fasta header from the file. It also assumes a single genome per file.
+
+    Args:
+        header (str): The header containing the record.
+
+    Returns:
+        genomeName (str): Name of the genome contained in the header.
+    """
+
+    re_patterns = (
+        # Look for lcl followed by the possible genome name
+        re.compile('(lcl\|[\w\-\.]+)'),
+
+        # Look for contigs in the wwwwdddddd format
+        re.compile('([A-Za-z]{4}\d{2})\d{6}'),
+
+        # Look for a possible genome name at the beginning of the record ID
+        re.compile('^(\w{8}\.\d)'),
+
+        # Look for ref, gb, emb or dbj followed by the possible genome name
+        re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'),
+
+        # Look for gi followed by the possible genome name
+        re.compile('(gi\|\d{8})'),
+
+
+        # Look for name followed by space, then description
+        re.compile('^([\w\-\.]+)\s+[\w\-\.]+')
+    )
+
+    # if nothing matches, use the full header as genome_name
+    genome_name = header
+    for rep in re_patterns:
+        m = rep.search(header)
+
+        if m:
+            genome_name = m.group(1)
+            break
+
+    return str(genome_name)
+
+def assemble_reads(reads, reference, temp_dir):
+    '''
+    Return path to assembled reads.
+    Assemble short reads by mapping to a reference genome.
+    Default output is the same as reads file
+        (basename+'iden.fasta' and basename+'pred.fasta').
+
+    Args:
+        reads (str): FASTQ/FQ format reads file
+        reference (str): FASTA format reference file
+        temp_dir (str): temp_dir for storing assembled files
+
+    Returns:
+        tuple(str, str): identifcation and prediction fasta file
+    '''
+    output = os.path.join(
+        temp_dir,
+        os.path.splitext(os.path.basename(reads))[0]+'.fasta'
+    )
+
+    # run once if index reference does not exist
+    index_path = \
+        os.path.join(
+            definitions.DATA_DIR,
+            'bowtie_index',
+            os.path.splitext(os.path.basename(reference))[0]
+        )
+    index_dir = os.path.split(index_path)[0]
+    if not os.path.isdir(index_dir):
+        LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir))
+        if not os.path.exists(index_dir):
+            os.makedirs(index_dir)
+        cmd1 = [
+            'bowtie2-build',
+            reference,
+            index_path
+        ]
+        subprocess_util.run_subprocess(cmd1)
+
+    cmd2 = [
+        'bowtie2',
+        '--score-min L,1,-0.5',
+        '--np 5',
+        '-x', index_path,
+        '-U', reads,
+        '-S', os.path.join(temp_dir, 'reads.sam')
+    ]
+    subprocess_util.run_subprocess(cmd2)
+
+    cmd3 = [
+        definitions.SAMTOOLS, 'view',
+        '-F 4',
+        '-q 1',
+        '-bS', os.path.join(temp_dir, 'reads.sam'),
+        '-o', os.path.join(temp_dir, 'reads.bam')
+    ]
+    subprocess_util.run_subprocess(cmd3)
+    cmd4 = [
+        definitions.SAMTOOLS, 'sort',
+        os.path.join(temp_dir, 'reads.bam'),
+        '-o', os.path.join(temp_dir, 'reads.sorted.bam'),
+    ]
+    subprocess_util.run_subprocess(cmd4)
+
+    shell_cmd = [
+        definitions.SAMTOOLS+' mpileup -uf', # mpileup
+        reference,
+        os.path.join(temp_dir, 'reads.sorted.bam'),
+        '|',
+        'bcftools call -c', # variant calling
+        '|',
+        'vcfutils.pl vcf2fq', # vcf to fq
+        '|',
+        'seqtk seq -A -', # fq to fasta
+        '>',
+        output
+    ]
+    subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True)
+    return split_mapped_output(output)
+
+def split_mapped_output(file):
+    '''
+    Splits given fasta files into two file based on 'lcl' tags
+        in the seq header
+
+    Args:
+        file (str): path to input fasta file
+
+    Returns:
+        (str): path to ecoli identification fasta seq
+        (str): path to serotype prediction fasta seq
+    '''
+    identif_file = os.path.splitext(file)[0]+'.iden.fasta'
+    predict_file = os.path.splitext(file)[0]+'.pred.fasta'
+    identif_seqs = []
+    predict_seqs = []
+    for record in SeqIO.parse(file, 'fasta'):
+        if 'lcl' in record.description:
+            identif_seqs.append(record)
+        else:
+            predict_seqs.append(record)
+    with open(identif_file, "w") as output_handle:
+        SeqIO.write(identif_seqs, output_handle, "fasta")
+    with open(predict_file, "w") as output_handle:
+        SeqIO.write(predict_seqs, output_handle, "fasta")
+    return identif_file, predict_file