Mercurial > repos > iuc > fasta_stats
changeset 2:cd0874854f51 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit adc5e3616c1849551c9a712b651b0d1c6b0e88f1"
author | iuc |
---|---|
date | Mon, 26 Apr 2021 10:01:43 +0000 |
parents | 16f1f3e2de42 |
children | 56022eb50bbd |
files | fasta-stats.pl fasta-stats.xml test-data/ng50_input.fasta test-data/ng50_out.txt test-data/test_out.txt |
diffstat | 5 files changed, 117 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/fasta-stats.pl Wed Apr 21 09:10:46 2021 +0000 +++ b/fasta-stats.pl Mon Apr 26 10:01:43 2021 +0000 @@ -47,7 +47,7 @@ # sort length array # (should use hash here for efficiency with huge no of short reads?) -@len = sort { $a <=> $b } @len; +@len = sort { $b <=> $a } @len; # compute more stats @@ -62,12 +62,12 @@ # calculate n50 my $thresh = int 0.5 * $stat{'num_bp'}; - $stat{'len_N50'} = &calc_x50(@len, $thresh); + ($stat{'len_N50'}, $stat{'L50'}) = &calc_x50(\@len, $thresh); #calculate NG50 if ($calc_ng50) { - my $thresh = int 0.5 * $genome_size * 1000000; - $stat{'len_NG50'} = &calc_x50(@len, $thresh); + my $thresh = int 0.5 * $genome_size; + ($stat{'len_NG50'}, $stat{'LG50'}) = &calc_x50(\@len, $thresh); } } @@ -101,15 +101,16 @@ # N50/NG50 calculation sub sub calc_x50{ - my @x = shift; + my $ref = shift; + my @x = @$ref; my $thresh = shift; my $cum=0; for my $i (0 .. $#x) { $cum += $x[$i]; if ($cum >= $thresh) { - return $x[$i]; + return $x[$i], $i+1; } } - return 0; + return (0,0); }
--- a/fasta-stats.xml Wed Apr 21 09:10:46 2021 +0000 +++ b/fasta-stats.xml Mon Apr 26 10:01:43 2021 +0000 @@ -14,7 +14,7 @@ </command> <inputs> <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/> - <param name="genome_size" type="float" optional="True" label="Genome size estimate (optional)" help="Estimate of the genome size in megabases (MB). If specified, NG50 will be calculated."/> + <param name="genome_size" type="float" optional="True" label="Genome size estimate (optional)" help="Estimate of the genome size in bases. If specified, NG50 and LG50 will be calculated."/> </inputs> <outputs> <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/> @@ -25,8 +25,8 @@ <output name="stats" file="test_out.txt"/> </test> <test> - <param name="dataset" value="test.fasta"/> - <param name="genome_size" value="5.0"/> + <param name="dataset" value="ng50_input.fasta"/> + <param name="genome_size" value="4000"/> <output name="stats" file="ng50_out.txt"/> </test> </tests>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ng50_input.fasta Mon Apr 26 10:01:43 2021 +0000 @@ -0,0 +1,88 @@ +>1 +ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG +GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT +GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT +CCCTGTTCGTGAGGTCTGTCCAGTGACCCATCGTCCAGCCCTATACCGGG +ACCCTGTTACAGACATACCCTATGCCACTGCTCGAGCCTTCAAGATCATT +CGTGAGGCTTACAAGAAGTACATTACTGCCCATGGACTGCCGCCCACTGC +CTCAGCCCTGGGCCCCGGCCCGCCACCTCCTGAGCCCCTCCCTGGCTCTG +GGCCCCGAGCCTTGCGCCAGAAAATTGTCATTAAATGA +>2 +ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTT +TGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG +>3 +GTTCTCAGCTTCCTTGCTTCCATGGCTCCAGCACCATTCGAAACCTCAAA +GAGAGGTTCCACATGAGCATGACTGAGGAGCAGCTGCAGCTGCTGGTGGA +GCAGATGGTGGATGGCAGTATGCGGTCTATCACCACCAAACTCTATGACG +GCTTCCAGTACCTCACCAACGGCATCATGTGA +>4 +ATGGAAGCGTTTTTGGGGTCGCGGTCCGGACTTTGGGCGGGGGGTCCGGC +CCCAGGACAGTTTTACCGCATTCCGTCCACTCCCGATTCCTTCATGGATC +CGGCGTCTGCACTTTACAGAGGTCCAATCACGCGGACCCA +>5 +TCTTTTCCTTCTCTACCATTTTCAACAAAGCAGGGGAAATAACTCAGTCT +CAGAAGACAGGAAACATCAACAAGTTGTGATGCCCTTTTCTTCCAATACT +ATTGAGGCTCACAAGTCAGCTCATGTAGACGGATCACTTAAGAGCAACAA +ACTGAAGTCTGCAAGAAAATTCACATTTCTATCTGATGAGGATGACTTAA +GTGCCCATAATCCCCTTTATAAGGAAAACATAAGTCAAGTATCAACAAAT +TCAGACATTTCACAGAGAACAGATTTTGTAGACCCATTTTCACCCAAAAT +ACAAGCCAAGAGTAAGTCTCTGAGGGGCCCAAGAGAAAAGATTCAGAGGC +TGTGGAGTCAGTCAGTCAGCTTACCCAGGAGGCTGATGAGGAAAGTTCCA +AATAGACCAGAGATCATAGATCTGCAGCAGTGGCAAGGCACCAGGCAGAA +AGCTGAAAATGAAAACACTGGAATCTGTACAAACAAAAGAGGTAGCAGCA +ATCCATTGCTTACAACTGAAGAGGCAAATTTGACAGAGAAAGAGGAAATA +AGGCAAGGTGAAACACTGATGATAGAAGGAACAGAACAGTTGAAATCTCT +CTCTTCAGACTCTTCATTTTGCTTTCCCAGGCCTCACTTCTCATTCTCCA +CTTTGCCAACTGTTTCAAGAACTGTGGAACTCAAATCAGAACCTAATGTC +ATCAGTTCTCCTGCTGAGTGTTCCTTGGAACTTTCTCCTTCAAGGCCTTG +TGTTTTACATTCTTCACTCTCTAGGAGAGAGACACCTATTTGTATGTTAC +CTATTGAAACCGAAAGAAATATTTTTGAAAATTTTGCCCATCCACCAAAC +ATCTCTCCTTCTGCCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNACATTTTTTCCACTTTCCGTTTCAACGTCTGGTCCCCC +AACAccacctcttctacctccatttccaactcctcttcctccaccacctc +cttctattccttgccctccacctccttcAGCTTCATTTCTGTCCACAGAG +TGTGTCTGTATAACAGGTGTTAAATGCACGACCAACTTGATGCCTGCCGA +GAAAATTAAGTCCTCTATGACACAGCTATCAACAACGACAGTGTGTAAAA +CAGACCCTCAGAGAGAACCAAAAGGCATCCTCAGACACGTTAAAAACTTA +GCAGAACTTGAAAAATCAGTAGCTAACATGTACAGTCAAATAGAAAAAAA +CTATCTACGCACAAATGTTTCAGAACTTCAAACTATGTGCCCTTCAGAAG +TAACAAATATGGAAATCACATCTGAACAAAACAAGGGGAGTTTGAACAAT +ATTGTCGAGGGAACTGAAAAACAATCTCACAGTCAATCTACTTCACTGTA +A +>6 +ATCCAATGGATTTGAACAGAAGCGCTTTGCCAGGCTTGCCAGCAAGAAGG +CAGTGGAGGAACTTGCCTACAAATGGAGTGTTGAGGATATGTAA +>7 +ATGCAGCCCCGGGTACTCCTTGTTGTTGCCCTCCTGGCGCTCCTGGCCTC +TGCCC +>8 +CCTAAAGCTCCTTGACAACTGGGACAGCGTGACCTCCACCTTCAGCAAGC +TGCGCGAACAGCTCGGCCCTGTGACCCAGGAGTTCTGGGATAACCTGGAA +AAGGAGACAGAGGGCCTGAGGCAGGAGATGAGCAAGGATCTGGAGGAGGT +GAAGGCCAAGGTGCAGCCCTACCTGGACGACTTCCAGAAGAAGTGGCAGG +AGGAGATGGAGCTCTACCGCCAGAAGGTGGAGCCGCTGCGCGCAGAGCTC +CAAGAGGGCGCGCGCCAGAAGCTGCACGAGCTGCAAGAGAAGCTGAGCCC +ACTGGGCGAGGAGATGCGCGACCGCGCGCGCGCCCATGTGGACGCGCTGC +GCACGCATCTGGCCCCCTACAGCGACGAGCTGCGCCAGCGCTTGGCCGCG +CGCCTTGAGGCTCTCAAGGAGAACGGCGGCGCCAGACTGGCCGAGTACCA +CGCCAAGGCCACCGAGCATCTGAGCACGCTCAGCGAGAAGGCCAAGCCCG +CGCTCGAGGACCTCCGCCAAGGCCTGCTGCCCGTGCTGGAGAGCTTCAAG +GTCAGCTTCCTGAGCGCTCTCGAGGAGTACACTAAGAAGCTCAACACCCA +GTGA +>9 +ATGCTCCACCTGCATGGCTGGCAAACCATG +>10 +GAGCTTTCTTCCTCTATGCTGGATTTGCTGCTGTGGGACTCCTTTTCATC +TATGGCTGTCTTCCTGAGACCAAAGGCAAAAAATTAGAGGAAATTGAATC +ACTCTTTGACAACAGGCTATGTACATGTGGCACTTCAGATTCTGATGAAG +GGAGATATATTGAATATATTCGGGTAAAGGGAAGTAACTATCATCTTTCT +GACAATGATGCTTCTGATGTGGAATAA +>11 +ATGAACTCACCAGAGGCGAGGCTCTGCGTTGCTCAATGCAGAGACTCTTA +CCCAGGGTGTCAGCCTCTGAAAGATACACGTGCCTGGGCCTCTTCCCTGA +AGATGGACCCGGCAGGTCTGGAGGGAGGCCCCCGTGATGAATCCCGTGAT +GAGCCGCCGATCCGAGCTCAGGCTGCGTCATGGGACCAGCCACAAGGTTG +CCTGACCTATAAAGGTCGCAGGAGTGCCTCAGGGACACAGAAGCAGTTAC +AGCTGCCAG \ No newline at end of file
--- a/test-data/ng50_out.txt Wed Apr 21 09:10:46 2021 +0000 +++ b/test-data/ng50_out.txt Mon Apr 26 10:01:43 2021 +0000 @@ -1,15 +1,17 @@ -GC_content 52.0 -len_N50 194780 -len_NG50 0 -len_max 194780 -len_mean 194780 -len_median 194780 -len_min 194780 -num_A 46297 -num_C 50626 -num_G 50678 -num_N 0 -num_T 47179 -num_bp 194780 -num_bp_not_N 194780 -num_seq 1 +GC_content 51.1 +L50 2 +LG50 2 +len_N50 604 +len_NG50 604 +len_max 30 +len_mean 324 +len_median 182 +len_min 1501 +num_A 895 +num_C 940 +num_G 807 +num_N 145 +num_T 778 +num_bp 3565 +num_bp_not_N 3420 +num_seq 11