comparison tools/seq_length/seq_length.py @ 0:c323e29a8248 draft

Initial release v0.0.1
author peterjc
date Tue, 08 May 2018 09:35:45 -0400
parents
children 458f987918a6
comparison
equal deleted inserted replaced
-1:000000000000 0:c323e29a8248
1 #!/usr/bin/env python
2 """Compute length of FASTA, QUAL, FASTQ or SSF sequences.
3
4 Takes three command line options: input sequence filename, input type
5 (e.g. FASTA or SFF) and the output filename (tabular).
6
7 This tool is a short Python script which requires Biopython 1.54 or later
8 for SFF file support. If you use this tool in scientific work leading to a
9 publication, please cite the Biopython application note:
10
11 Cock et al 2009. Biopython: freely available Python tools for computational
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
14
15 This script is copyright 2018 by Peter Cock, The James Hutton Institute UK.
16 All rights reserved. See accompanying text file for licence details (MIT
17 license).
18 """
19
20 from __future__ import print_function
21
22 import sys
23
24 if "-v" in sys.argv or "--version" in sys.argv:
25 print("v0.0.1")
26 sys.exit(0)
27
28 try:
29 from Bio import SeqIO
30 except ImportError:
31 sys.exit("Missing required Python library Biopython.")
32
33
34 # Parse Command Line
35 try:
36 in_file, seq_format, out_file = sys.argv[1:]
37 except ValueError:
38 sys.exit("Expected three arguments (input file, format, output file), "
39 "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
40
41
42 if seq_format.startswith("fastq"):
43 # We don't care about the quality score encoding, just
44 # need to translate Galaxy format name into something
45 # Biopython will accept:
46 format = "fastq"
47 elif seq_format.lower() == "csfasta":
48 # I have not tested with colour space FASTA
49 format = "fasta"
50 elif seq_format.lower == "sff":
51 # The masked/trimmed numbers are more interesting
52 format = "sff-trim"
53 elif seq_format.lower() in ["fasta", "qual"]:
54 format = seq_format.lower()
55 else:
56 # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
57 sys.exit("Unexpected format argument: %r" % seq_format)
58
59
60 count = 0
61 total = 0
62 with open(out_file, "w") as out_handle:
63 out_handle.write("#Identifier\tLength\n")
64 for record in SeqIO.parse(in_file, format):
65 count += 1
66 length = len(record)
67 total += length
68 out_handle.write("%s\t%i\n" % (record.id, length))
69 print("%i sequences, total length %i" % (count, total))