diff utils/fasta_to_len.py @ 0:ece409f6573c draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:34:12 -0400
parents
children e12f68d2cc4e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/fasta_to_len.py	Mon May 19 12:34:12 2014 -0400
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+Input: fasta file, output path, int (title characters to keep; 0 keeps all)
+Output: tabular (title, sequence length)
+Write each title with the length of the corresponding sequence
+"""
+
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )  # refuse to load on Python older than 2.4
+
+def compute_fasta_length( fasta_file, out_file, keep_first_char, keep_first_word=False ):
+    """Write one "title<TAB>sequence length" line to out_file per record of fasta_file.
+
+    keep_first_char (int or int-like string) is the number of title characters
+    kept after the leading '>'; 0 keeps the whole title. If keep_first_word is
+    true the title is first cut at its first whitespace. Blank lines and lines
+    starting with '#' are skipped; input without any '>' header writes nothing.
+    """
+    keep_first_char = int( keep_first_char )
+    # slice end for the title: None keeps everything, +1 skips the leading '>'
+    title_end = None
+    if keep_first_char != 0:
+        title_end = keep_first_char + 1
+
+    def format_entry( title, seq_len ):
+        if keep_first_word:
+            title = title.split()[0]
+        return "%s\t%d\n" % ( title[ 1:title_end ], seq_len )
+
+    fasta_title, seq_len = None, 0  # title stays None until a '>' header is seen
+    out = open( out_file, 'w' )
+    try:
+        infile = open( fasta_file )
+        try:
+            for line in infile:
+                line = line.strip()
+                if line.startswith( '>' ):
+                    if fasta_title is not None:
+                        out.write( format_entry( fasta_title, seq_len ) )
+                    fasta_title, seq_len = line, 0
+                elif not line.startswith( '#' ):  # blank lines add 0, '#' lines are skipped
+                    seq_len += len( line )
+            if fasta_title is not None:  # last fasta-entry, if the input had any
+                out.write( format_entry( fasta_title, seq_len ) )
+        finally:
+            infile.close()
+    finally:
+        out.close()
+
+if __name__ == "__main__":  # usage: fasta_to_len.py <fasta> <out> <keep_first_char>
+    compute_fasta_length( fasta_file=sys.argv[1], out_file=sys.argv[2], keep_first_char=sys.argv[3], keep_first_word=True )
\ No newline at end of file