Mercurial > repos > devteam > fasta_compute_length
changeset 4:e12f68d2cc4e draft
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_compute_length commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author | devteam |
---|---|
date | Sun, 01 Mar 2020 07:24:10 -0500 |
parents | 2051602a5f97 |
children | 7d37cfda8e00 |
files | fasta_compute_length.py fasta_compute_length.xml utils/fasta_to_len.py |
diffstat | 3 files changed, 38 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- a/fasta_compute_length.py Wed Sep 11 09:41:59 2019 -0400 +++ b/fasta_compute_length.py Sun Mar 01 07:24:10 2020 -0500 @@ -6,4 +6,5 @@ import sys from utils.fasta_to_len import compute_fasta_length -compute_fasta_length( sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] == 'id_only' ) + +compute_fasta_length(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] == 'id_only')
--- a/fasta_compute_length.xml Wed Sep 11 09:41:59 2019 -0400 +++ b/fasta_compute_length.xml Sun Mar 01 07:24:10 2020 -0500 @@ -1,11 +1,13 @@ -<?xml version="1.0"?> -<tool id="fasta_compute_length" name="Compute sequence length" version="1.0.2"> +<tool id="fasta_compute_length" name="Compute sequence length" version="1.0.2" profile="16.04"> <description></description> + <requirements> + <requirement type="package" version="3.7">python</requirement> + </requirements> <command> #if $ref.ref_source == 'dbkey': cp '${ref.index.fields.len_path}' '$output' #else: - python $__tool_directory__/fasta_compute_length.py + python '$__tool_directory__/fasta_compute_length.py' #if $ref.ref_source == 'history': '$input' #else: @@ -85,7 +87,7 @@ <output name="output" file="merged.tab" /> </test> </tests> - <help> + <help><![CDATA[ **What it does** @@ -97,7 +99,7 @@ Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ + >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ @@ -110,10 +112,10 @@ However, if your IDs are not all the same length, you may wish to just keep the fasta ID, and not the description:: - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ + >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG - >EYKX4VC length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ + >EYKX4VC length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAAfa Running this tool with **Strip fasta description from header** set to **True** and **How many characters to keep?** set to **0** will produce:: @@ -122,7 +124,7 @@ EYKX4VC 60 - </help> + ]]></help> <citations> <citation type="doi">10.1093/bioinformatics/btq281</citation> </citations>
--- a/utils/fasta_to_len.py Wed Sep 11 09:41:59 2019 -0400 +++ b/utils/fasta_to_len.py Sun Mar 01 07:24:10 2020 -0500 @@ -5,16 +5,13 @@ Return titles with lengths of corresponding seq """ -import sys, os +import sys -assert sys.version_info[:2] >= ( 2, 4 ) +assert sys.version_info[:2] >= (2, 4) -def compute_fasta_length( fasta_file, out_file, keep_first_char, keep_first_word=False ): - infile = fasta_file - out = open( out_file, 'w') - keep_first_char = int( keep_first_char ) - +def compute_fasta_length(fasta_file, out_file, keep_first_char, keep_first_word=False): + keep_first_char = int(keep_first_char) fasta_title = '' seq_len = 0 @@ -25,28 +22,28 @@ keep_first_char += 1 first_entry = True - - for line in open( infile ): - line = line.strip() - if not line or line.startswith( '#' ): - continue - if line[0] == '>': - if first_entry == False: - if keep_first_word: - fasta_title = fasta_title.split()[0] - out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) ) + with open(fasta_file) as in_fh, open(out_file, 'w') as out_fh: + for line in in_fh: + line = line.strip() + if not line or line.startswith('#'): + continue + if line[0] == '>': + if first_entry is False: + if keep_first_word: + fasta_title = fasta_title.split()[0] + out_fh.write("%s\t%d\n" % (fasta_title[1:keep_first_char], seq_len)) + else: + first_entry = False + fasta_title = line + seq_len = 0 else: - first_entry = False - fasta_title = line - seq_len = 0 - else: - seq_len += len(line) + seq_len += len(line) - # last fasta-entry - if keep_first_word: - fasta_title = fasta_title.split()[0] - out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) ) - out.close() + # last fasta-entry + if keep_first_word: + fasta_title = fasta_title.split()[0] + out_fh.write("%s\t%d\n" % (fasta_title[1:keep_first_char], seq_len)) -if __name__ == "__main__" : - compute_fasta_length( sys.argv[1], sys.argv[2], sys.argv[3], True ) \ No newline at end of file + +if __name__ == "__main__": + compute_fasta_length(sys.argv[1], sys.argv[2], sys.argv[3], True)