Mercurial > repos > peterjc > seq_length
diff tools/seq_length/seq_length.py @ 0:c323e29a8248 draft
Initial release v0.0.1
author | peterjc |
---|---|
date | Tue, 08 May 2018 09:35:45 -0400 |
parents | |
children | 458f987918a6 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_length/seq_length.py Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/env python +"""Compute length of FASTA, QUAL, FASTQ or SSF sequences. + +Takes three command line options: input sequence filename, input type +(e.g. FASTA or SFF) and the output filename (tabular). + +This tool is a short Python script which requires Biopython 1.54 or later +for SFF file support. If you use this tool in scientific work leading to a +publication, please cite the Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2018 by Peter Cock, The James Hutton Institute UK. +All rights reserved. See accompanying text file for licence details (MIT +license). +""" + +from __future__ import print_function + +import sys + +if "-v" in sys.argv or "--version" in sys.argv: + print("v0.0.1") + sys.exit(0) + +try: + from Bio import SeqIO +except ImportError: + sys.exit("Missing required Python library Biopython.") + + +# Parse Command Line +try: + in_file, seq_format, out_file = sys.argv[1:] +except ValueError: + sys.exit("Expected three arguments (input file, format, output file), " + "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) + + +if seq_format.startswith("fastq"): + # We don't care about the quality score encoding, just + # need to translate Galaxy format name into something + # Biopython will accept: + format = "fastq" +elif seq_format.lower() == "csfasta": + # I have not tested with colour space FASTA + format = "fasta" +elif seq_format.lower == "sff": + # The masked/trimmed numbers are more interesting + format = "sff-trim" +elif seq_format.lower() in ["fasta", "qual"]: + format = seq_format.lower() +else: + # TODO: Does Galaxy understand GenBank, EMBL, etc yet? + sys.exit("Unexpected format argument: %r" % seq_format) + + +count = 0 +total = 0 +with open(out_file, "w") as out_handle: + out_handle.write("#Identifier\tLength\n") + for record in SeqIO.parse(in_file, format): + count += 1 + length = len(record) + total += length + out_handle.write("%s\t%i\n" % (record.id, length)) +print("%i sequences, total length %i" % (count, total))