# HG changeset patch # User devteam # Date 1400517252 14400 # Node ID ece409f6573cc053d2d8ddc0960934d8b3ec9421 Imported from capsule None diff -r 000000000000 -r ece409f6573c fasta_compute_length.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_compute_length.py Mon May 19 12:34:12 2014 -0400 @@ -0,0 +1,9 @@ +#!/usr/bin/env python +""" +Uses fasta_to_len converter code. +""" + +import sys +from utils.fasta_to_len import compute_fasta_length + +compute_fasta_length( sys.argv[1], sys.argv[2], sys.argv[3], False ) \ No newline at end of file diff -r 000000000000 -r ece409f6573c fasta_compute_length.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_compute_length.xml Mon May 19 12:34:12 2014 -0400 @@ -0,0 +1,51 @@ + + + fasta_compute_length.py $input $output $keep_first + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool computes the length of each FASTA sequence in the file. The output file has two tab-separated columns per line: the FASTA title and the length of the sequence. The option *How many characters to keep?* lets you keep only a specified number of characters from the beginning of each FASTA title. 
#!/usr/bin/env python
"""
Input: fasta, int
Output: tabular
Return titles with lengths of corresponding seq
"""

import os
import sys

assert sys.version_info[:2] >= ( 2, 4 )


def _format_record( fasta_title, seq_len, title_end, keep_first_word ):
    """Return one output row: the trimmed title, a tab, the length, a newline."""
    if keep_first_word:
        # A header line always has a first word (it starts with '>'), but the
        # title is still '' when sequence data appears without any header.
        words = fasta_title.split()
        if words:
            fasta_title = words[0]
    # Index 0 of a header is the leading '>', so the kept title starts at 1.
    return "%s\t%d\n" % ( fasta_title[ 1:title_end ], seq_len )


def compute_fasta_length( fasta_file, out_file, keep_first_char, keep_first_word=False ):
    """
    Write a two-column table (title, sequence length) for a FASTA file.

    Blank lines and lines starting with '#' are skipped.  Every line starting
    with '>' opens a new record; the stripped lengths of the lines that follow
    it are summed to give that record's sequence length.

    fasta_file      -- path of the FASTA file to read
    out_file        -- path of the tab-separated file to write (overwritten)
    keep_first_char -- number of title characters to keep, not counting the
                       leading '>'; 0 keeps the whole title.  Passed through
                       int(), so a numeric string is accepted.
    keep_first_word -- if true, cut the title at its first whitespace before
                       applying keep_first_char

    Raises ValueError if keep_first_char cannot be converted to an integer,
    and whatever open() raises if either file cannot be opened.

    An input that contains neither a header nor sequence data produces an
    empty output file.

    Example, with keep_first_char=14:

        >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
        TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
        TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
        >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
        AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA

    produces:

        EYKX4VC02EQLO5<TAB>108
        EYKX4VC02D4GS2<TAB>60
    """
    keep_first_char = int( keep_first_char )

    # Slice end for the title: +1 because the slice starts after the '>'.
    if keep_first_char == 0:
        title_end = None
    else:
        title_end = keep_first_char + 1

    fasta_title = ''
    seq_len = 0
    seen_header = False

    # try/finally rather than 'with' so the module still parses on the
    # interpreter floor asserted above; both handles are closed on any exit.
    out = open( out_file, 'w' )
    try:
        in_handle = open( fasta_file )
        try:
            for line in in_handle:
                line = line.strip()
                if not line or line.startswith( '#' ):
                    continue
                if line.startswith( '>' ):
                    # A new header closes the previous record, if there was one.
                    if seen_header:
                        out.write( _format_record( fasta_title, seq_len, title_end, keep_first_word ) )
                    seen_header = True
                    fasta_title = line
                    seq_len = 0
                else:
                    seq_len += len( line )
        finally:
            in_handle.close()

        # Last record.  Skipped only when the input held no record at all, so
        # an empty input no longer yields a bogus row with an empty title.
        if seen_header or seq_len:
            out.write( _format_record( fasta_title, seq_len, title_end, keep_first_word ) )
    finally:
        out.close()


if __name__ == "__main__":
    compute_fasta_length( sys.argv[1], sys.argv[2], sys.argv[3], True )