diff fasta_filter_by_length.py @ 0:16679a7f554a draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:33:03 -0400
parents
children e626b3ff9922
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_filter_by_length.py	Mon May 19 12:33:03 2014 -0400
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+Input: fasta, minimal length, maximal length
+Output: fasta
+Return sequences whose lengths are within the range.
+"""
+
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    input_filename = sys.argv[1]
+    try:
+        min_length = int( sys.argv[2] )
+    except:
+        stop_err( "Minimal length of the return sequence requires a numerical value." )
+    try:
+        max_length = int( sys.argv[3] )
+    except:
+        stop_err( "Maximum length of the return sequence requires a numerical value." )
+    output_filename = sys.argv[4]
+    output_handle = open( output_filename, 'w' )
+    tmp_size = 0 #-1
+    tmp_buf = ''
+    at_least_one = 0
+    for line in file(input_filename):
+        if not line or line.startswith('#'):
+            continue
+        if line[0] == '>':
+            if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0):
+                output_handle.write(tmp_buf)
+                at_least_one = 1
+            tmp_buf = line
+            tmp_size = 0                                                       
+        else:
+            if max_length == 0 or tmp_size < max_length:
+                tmp_size += len(line.rstrip('\r\n'))
+                tmp_buf += line
+    # final flush of buffer
+    if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0):
+        output_handle.write(tmp_buf.rstrip('\r\n'))
+        at_least_one = 1
+    output_handle.close()
+    if at_least_one == 0:
+        print "There is no sequence that falls within your range."
+
+if __name__ == "__main__" : __main__()