diff fasta_report_sequence_lengths.py @ 0:4b01f0d7b350 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/sshmm/ commit b578a90031fd7061fbdaef48b6a66d895ac077c3
author rnateam
date Fri, 06 Jul 2018 09:01:40 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_report_sequence_lengths.py	Fri Jul 06 09:01:40 2018 -0400
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import sys
+
+"""
+Input: FASTA file
+Output: Print sequence ID and corresponding sequence length
+
+Example output:
+chr1	248956422
+chr2	242193529
+chr3	198295559
+...
+
+"""
+
+# Check input.
+if not len(sys.argv) == 2:
+    exit("Usage: fasta_report_sequence_lengths.py <fasta_file>")
+
+fasta_file = sys.argv[1]
+
+seq_id = "id"
+seq_len = 0
+
+# Go through FASTA file, extract sequence lengths.
+with open(fasta_file) as f:
+    for line in f:
+        if line.startswith(">"):
+            new_id = line[1:].strip()
+            if seq_len:
+                print("%s\t%i" % (seq_id, seq_len))
+            seq_len = 0
+            seq_id = new_id
+        else:
+            seq_len += len(line.strip())
+
+# Print last sequence length.
+if seq_len:
+    print("%s\t%i" % (seq_id, seq_len))
+