view tools/seq_length/seq_length.py @ 5:ea3c01e08251 draft default tip

Remove legacy tool_dependencies.xml
author peterjc
date Thu, 30 Nov 2023 09:58:47 +0000
parents 17caf7a7c2c5
children
line wrap: on
line source

#!/usr/bin/env python
"""Compute length of FASTA, QUAL, FASTQ or SSF sequences.

Takes three command line options: input sequence filename, input type
(e.g. FASTA or SFF) and the output filename (tabular).

This tool is a short Python script which requires Biopython 1.54 or later
for SFF file support. If you use this tool in scientific work leading to a
publication, please cite the Biopython application note:

Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
"""

from __future__ import print_function

import sys
from collections import defaultdict
from optparse import OptionParser

usage = r"""Use as follows to compute all the lengths in a sequence file:

$ python seq_length.py -i example.fasta -f fasta -o lengths.tsv
"""

parser = OptionParser(usage=usage)
parser.add_option(
    "-i",
    "--input",
    dest="input",
    default=None,
    help="Input sequence filename (FASTA, FASTQ, etc)",
    metavar="FILE",
)
parser.add_option(
    "-f",
    "--format",
    dest="format",
    default=None,
    help="Input sequence format (FASTA, QUAL, FASTQ, SFF)",
)
parser.add_option(
    "-o",
    "--output",
    dest="output",
    default=None,
    help="Output filename (tabular)",
    metavar="FILE",
)
parser.add_option(
    "-s",
    "--stats",
    dest="stats",
    default=False,
    action="store_true",
    help="Compute statistics (median, N50 - will require much more RAM).",
)
parser.add_option(
    "-v",
    "--version",
    dest="version",
    default=False,
    action="store_true",
    help="Show version and quit",
)
options, args = parser.parse_args()

if options.version:
    print("v0.0.4")
    sys.exit(0)
if not options.input:
    sys.exit("Require an input filename")
if not options.format:
    sys.exit("Require the input format")
if not options.output:
    sys.exit("Require an output filename")

try:
    from Bio import SeqIO
except ImportError:
    sys.exit("Missing required Python library Biopython.")

try:
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
except ImportError:
    sys.exit("Biopython tool old?, missing Bio.SeqIO.QualityIO.FastqGeneralIterator")

try:
    from Bio.SeqIO.FastaIO import SimpleFastaParser
except ImportError:
    sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")

in_file = options.input
out_file = options.output

if options.format.startswith("fastq"):
    # We don't care about the quality score encoding, just
    # need to translate Galaxy format name into something
    # Biopython will accept:
    format = "fastq"
elif options.format.lower() == "csfasta":
    # I have not tested with colour space FASTA
    format = "fasta"
elif options.format.lower() == "sff":
    # The masked/trimmed numbers are more interesting
    format = "sff-trim"
elif options.format.lower() in ["fasta", "qual"]:
    format = options.format.lower()
else:
    # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
    sys.exit("Unexpected format argument: %r" % options.format)


def median_from_counts_dict(counts_dict, count=None):
    sorted_lengths = sorted(counts_dict)  # i.e. sort the keys
    if count is None:
        count = sum(counts_dict.values())
    index = count / 2
    if count % 2:
        # Odd, easy case - will be an exact value
        # within one of the tally entries
        for value in sorted_lengths:
            index -= counts_dict[value]
            if index < 0:
                return value
    else:
        # Even, hard case - may have to take mean
        special = None
        for value in sorted_lengths:
            if special is not None:
                # We were right at boundary
                return (special + value) / 2.0
            index -= counts_dict[value]
            if index == 0:
                # Special case, want mean of this value
                # (final entry of this tally) and the next
                # value (first entry of the next tally).
                special = value
            elif index < 0:
                # Typical case, the two middle values
                # are equal and fall into same dict entry
                return value
    return None


if sys.version_info[0] >= 3:
    from statistics import median

    for test in [
        {1: 4, 2: 3},
        {1: 4, 3: 6},
        {1: 4, 5: 4},
        {0: 5, 1: 1, 2: 1, 3: 5},
        {0: 5, 1: 1, 2: 1, 3: 1, 4: 5},
    ]:
        test_list = []
        for v, c in test.items():
            test_list.extend([v] * c)
        # print(test)
        # print(test_list)
        assert median_from_counts_dict(test) == median(test_list)


def n50_from_counts_dict(counts_dict):
    """Calculate N50.

    N50 is a statistical measure of average length of a set of sequences.
    It is used widely in genomics, especially in reference to contig or
    supercontig lengths within a draft assembly.

    Given a set of sequences of varying lengths, the N50 length is defined
    as the length N for which 50% of all bases in the sequences are in a
    sequence of length L < N. This can be found mathematically as follows:
    Take a list L of positive integers. Create another list L' , which is
    identical to L, except that every element n in L has been replaced with
    n copies of itself. Then the median of L' is the N50 of L. For example:
    If L = {2, 2, 2, 3, 3, 4, 8, 8}, then L' consists of six 2's, six 3's,
    four 4's, and sixteen 8's; the N50 of L is the median of L' , which is 6.

    https://web.archive.org/web/20160726124802/http://www.broadinstitute.org/crd/wiki/index.php/N50
    """
    # Continuing the above example, input L would be {2:3, 3:2, 4:1, 8:2}
    # and L' becomes {2:6, 3:6, 4:4, 8:16}} as tally tables.
    l_prime = {v: v * c for v, c in counts_dict.items()}
    return median_from_counts_dict(l_prime)


count = 0
total = 0
stats = bool(options.stats)
length_counts = defaultdict(int)  # used if stats requested
length_min = sys.maxsize  # used if stats not requested
length_max = 0

with open(out_file, "w") as out_handle:
    out_handle.write("#Identifier\tLength\n")
    if format == "fastq":
        with open(in_file) as in_handle:
            for title, seq, qual in FastqGeneralIterator(in_handle):
                count += 1
                length = len(seq)
                total += length
                identifier = title.split(None, 1)[0]
                out_handle.write("%s\t%i\n" % (identifier, length))
                if stats:
                    length_counts[length] += 1
                else:
                    length_min = min(length_min, length)
                    length_max = max(length_max, length)
    elif format == "fasta":
        with open(in_file) as in_handle:
            for title, seq in SimpleFastaParser(in_handle):
                count += 1
                length = len(seq)
                total += length
                identifier = title.split(None, 1)[0]
                out_handle.write("%s\t%i\n" % (identifier, length))
                if stats:
                    length_counts[length] += 1
                else:
                    length_min = min(length_min, length)
                    length_max = max(length_max, length)
    else:
        for record in SeqIO.parse(in_file, format):
            count += 1
            length = len(record)
            total += length
            out_handle.write("%s\t%i\n" % (record.id, length))
            if stats:
                length_counts[length] += 1
            else:
                length_min = min(length_min, length)
                length_max = max(length_max, length)
print(
    "%i sequences, total length %i, mean %0.1f" % (count, total, float(total) / count)
)
if not count:
    pass
elif not stats:
    print("Shortest %i, longest %i" % (length_min, length_max))
elif count and stats:
    print("Shortest %i, longest %i" % (min(length_counts), max(length_counts)))
    median = median_from_counts_dict(length_counts, count)
    n50 = n50_from_counts_dict(length_counts)
    print("Median length %0.1f, N50 %i" % (median, n50))