changeset 2:cd0874854f51 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit adc5e3616c1849551c9a712b651b0d1c6b0e88f1"
author iuc
date Mon, 26 Apr 2021 10:01:43 +0000
parents 16f1f3e2de42
children 56022eb50bbd
files fasta-stats.pl fasta-stats.xml test-data/ng50_input.fasta test-data/ng50_out.txt test-data/test_out.txt
diffstat 5 files changed, 117 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/fasta-stats.pl	Wed Apr 21 09:10:46 2021 +0000
+++ b/fasta-stats.pl	Mon Apr 26 10:01:43 2021 +0000
@@ -47,7 +47,7 @@
 # sort length array 
 # (should use hash here for efficiency with huge no of short reads?)
 
-@len = sort { $a <=> $b } @len;
+@len = sort { $b <=> $a } @len;
 
 # compute more stats
 
@@ -62,12 +62,12 @@
   
   # calculate n50
   my $thresh = int 0.5 * $stat{'num_bp'};
-  $stat{'len_N50'} = &calc_x50(@len, $thresh);
+  ($stat{'len_N50'}, $stat{'L50'}) = &calc_x50(\@len, $thresh);
   
   #calculate NG50
   if ($calc_ng50) {
-    my $thresh = int 0.5 * $genome_size * 1000000;
-    $stat{'len_NG50'} = &calc_x50(@len, $thresh);
+    my $thresh = int 0.5 * $genome_size;
+    ($stat{'len_NG50'}, $stat{'LG50'}) = &calc_x50(\@len, $thresh);
   }
 }
 
@@ -101,15 +101,16 @@
 # N50/NG50 calculation sub
 
 sub calc_x50{
-  my @x = shift;
+  my $ref = shift;
+  my @x = @$ref;
   my $thresh = shift;
   my $cum=0;
   for my $i (0 .. $#x) {
     $cum += $x[$i];
     if ($cum >= $thresh) {
-      return $x[$i];
+      return $x[$i], $i+1;
     }
   }
-  return 0;
+  return (0,0);
 }
 
--- a/fasta-stats.xml	Wed Apr 21 09:10:46 2021 +0000
+++ b/fasta-stats.xml	Mon Apr 26 10:01:43 2021 +0000
@@ -14,7 +14,7 @@
     </command>
     <inputs>
         <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/>
-        <param name="genome_size" type="float" optional="True" label="Genome size estimate (optional)" help="Estimate of the genome size in megabases (MB). If specified, NG50 will be calculated."/>
+        <param name="genome_size" type="float" optional="True" label="Genome size estimate (optional)" help="Estimate of the genome size in bases. If specified, NG50 and LG50 will be calculated."/>
     </inputs>
     <outputs>
         <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/>
@@ -25,8 +25,8 @@
             <output name="stats" file="test_out.txt"/>
         </test>
         <test>
-            <param name="dataset" value="test.fasta"/>
-            <param name="genome_size" value="5.0"/>
+            <param name="dataset" value="ng50_input.fasta"/>
+            <param name="genome_size" value="4000"/>
             <output name="stats" file="ng50_out.txt"/>
         </test>
     </tests>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ng50_input.fasta	Mon Apr 26 10:01:43 2021 +0000
@@ -0,0 +1,88 @@
+>1
+ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG
+GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT
+GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT
+CCCTGTTCGTGAGGTCTGTCCAGTGACCCATCGTCCAGCCCTATACCGGG
+ACCCTGTTACAGACATACCCTATGCCACTGCTCGAGCCTTCAAGATCATT
+CGTGAGGCTTACAAGAAGTACATTACTGCCCATGGACTGCCGCCCACTGC
+CTCAGCCCTGGGCCCCGGCCCGCCACCTCCTGAGCCCCTCCCTGGCTCTG
+GGCCCCGAGCCTTGCGCCAGAAAATTGTCATTAAATGA
+>2
+ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTT
+TGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+>3
+GTTCTCAGCTTCCTTGCTTCCATGGCTCCAGCACCATTCGAAACCTCAAA
+GAGAGGTTCCACATGAGCATGACTGAGGAGCAGCTGCAGCTGCTGGTGGA
+GCAGATGGTGGATGGCAGTATGCGGTCTATCACCACCAAACTCTATGACG
+GCTTCCAGTACCTCACCAACGGCATCATGTGA
+>4
+ATGGAAGCGTTTTTGGGGTCGCGGTCCGGACTTTGGGCGGGGGGTCCGGC
+CCCAGGACAGTTTTACCGCATTCCGTCCACTCCCGATTCCTTCATGGATC
+CGGCGTCTGCACTTTACAGAGGTCCAATCACGCGGACCCA
+>5
+TCTTTTCCTTCTCTACCATTTTCAACAAAGCAGGGGAAATAACTCAGTCT
+CAGAAGACAGGAAACATCAACAAGTTGTGATGCCCTTTTCTTCCAATACT
+ATTGAGGCTCACAAGTCAGCTCATGTAGACGGATCACTTAAGAGCAACAA
+ACTGAAGTCTGCAAGAAAATTCACATTTCTATCTGATGAGGATGACTTAA
+GTGCCCATAATCCCCTTTATAAGGAAAACATAAGTCAAGTATCAACAAAT
+TCAGACATTTCACAGAGAACAGATTTTGTAGACCCATTTTCACCCAAAAT
+ACAAGCCAAGAGTAAGTCTCTGAGGGGCCCAAGAGAAAAGATTCAGAGGC
+TGTGGAGTCAGTCAGTCAGCTTACCCAGGAGGCTGATGAGGAAAGTTCCA
+AATAGACCAGAGATCATAGATCTGCAGCAGTGGCAAGGCACCAGGCAGAA
+AGCTGAAAATGAAAACACTGGAATCTGTACAAACAAAAGAGGTAGCAGCA
+ATCCATTGCTTACAACTGAAGAGGCAAATTTGACAGAGAAAGAGGAAATA
+AGGCAAGGTGAAACACTGATGATAGAAGGAACAGAACAGTTGAAATCTCT
+CTCTTCAGACTCTTCATTTTGCTTTCCCAGGCCTCACTTCTCATTCTCCA
+CTTTGCCAACTGTTTCAAGAACTGTGGAACTCAAATCAGAACCTAATGTC
+ATCAGTTCTCCTGCTGAGTGTTCCTTGGAACTTTCTCCTTCAAGGCCTTG
+TGTTTTACATTCTTCACTCTCTAGGAGAGAGACACCTATTTGTATGTTAC
+CTATTGAAACCGAAAGAAATATTTTTGAAAATTTTGCCCATCCACCAAAC
+ATCTCTCCTTCTGCCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNACATTTTTTCCACTTTCCGTTTCAACGTCTGGTCCCCC
+AACAccacctcttctacctccatttccaactcctcttcctccaccacctc
+cttctattccttgccctccacctccttcAGCTTCATTTCTGTCCACAGAG
+TGTGTCTGTATAACAGGTGTTAAATGCACGACCAACTTGATGCCTGCCGA
+GAAAATTAAGTCCTCTATGACACAGCTATCAACAACGACAGTGTGTAAAA
+CAGACCCTCAGAGAGAACCAAAAGGCATCCTCAGACACGTTAAAAACTTA
+GCAGAACTTGAAAAATCAGTAGCTAACATGTACAGTCAAATAGAAAAAAA
+CTATCTACGCACAAATGTTTCAGAACTTCAAACTATGTGCCCTTCAGAAG
+TAACAAATATGGAAATCACATCTGAACAAAACAAGGGGAGTTTGAACAAT
+ATTGTCGAGGGAACTGAAAAACAATCTCACAGTCAATCTACTTCACTGTA
+A
+>6
+ATCCAATGGATTTGAACAGAAGCGCTTTGCCAGGCTTGCCAGCAAGAAGG
+CAGTGGAGGAACTTGCCTACAAATGGAGTGTTGAGGATATGTAA
+>7
+ATGCAGCCCCGGGTACTCCTTGTTGTTGCCCTCCTGGCGCTCCTGGCCTC
+TGCCC
+>8
+CCTAAAGCTCCTTGACAACTGGGACAGCGTGACCTCCACCTTCAGCAAGC
+TGCGCGAACAGCTCGGCCCTGTGACCCAGGAGTTCTGGGATAACCTGGAA
+AAGGAGACAGAGGGCCTGAGGCAGGAGATGAGCAAGGATCTGGAGGAGGT
+GAAGGCCAAGGTGCAGCCCTACCTGGACGACTTCCAGAAGAAGTGGCAGG
+AGGAGATGGAGCTCTACCGCCAGAAGGTGGAGCCGCTGCGCGCAGAGCTC
+CAAGAGGGCGCGCGCCAGAAGCTGCACGAGCTGCAAGAGAAGCTGAGCCC
+ACTGGGCGAGGAGATGCGCGACCGCGCGCGCGCCCATGTGGACGCGCTGC
+GCACGCATCTGGCCCCCTACAGCGACGAGCTGCGCCAGCGCTTGGCCGCG
+CGCCTTGAGGCTCTCAAGGAGAACGGCGGCGCCAGACTGGCCGAGTACCA
+CGCCAAGGCCACCGAGCATCTGAGCACGCTCAGCGAGAAGGCCAAGCCCG
+CGCTCGAGGACCTCCGCCAAGGCCTGCTGCCCGTGCTGGAGAGCTTCAAG
+GTCAGCTTCCTGAGCGCTCTCGAGGAGTACACTAAGAAGCTCAACACCCA
+GTGA
+>9
+ATGCTCCACCTGCATGGCTGGCAAACCATG
+>10
+GAGCTTTCTTCCTCTATGCTGGATTTGCTGCTGTGGGACTCCTTTTCATC
+TATGGCTGTCTTCCTGAGACCAAAGGCAAAAAATTAGAGGAAATTGAATC
+ACTCTTTGACAACAGGCTATGTACATGTGGCACTTCAGATTCTGATGAAG
+GGAGATATATTGAATATATTCGGGTAAAGGGAAGTAACTATCATCTTTCT
+GACAATGATGCTTCTGATGTGGAATAA
+>11
+ATGAACTCACCAGAGGCGAGGCTCTGCGTTGCTCAATGCAGAGACTCTTA
+CCCAGGGTGTCAGCCTCTGAAAGATACACGTGCCTGGGCCTCTTCCCTGA
+AGATGGACCCGGCAGGTCTGGAGGGAGGCCCCCGTGATGAATCCCGTGAT
+GAGCCGCCGATCCGAGCTCAGGCTGCGTCATGGGACCAGCCACAAGGTTG
+CCTGACCTATAAAGGTCGCAGGAGTGCCTCAGGGACACAGAAGCAGTTAC
+AGCTGCCAG
\ No newline at end of file
--- a/test-data/ng50_out.txt	Wed Apr 21 09:10:46 2021 +0000
+++ b/test-data/ng50_out.txt	Mon Apr 26 10:01:43 2021 +0000
@@ -1,15 +1,17 @@
-GC_content	52.0
-len_N50	194780
-len_NG50	0
-len_max	194780
-len_mean	194780
-len_median	194780
-len_min	194780
-num_A	46297
-num_C	50626
-num_G	50678
-num_N	0
-num_T	47179
-num_bp	194780
-num_bp_not_N	194780
-num_seq	1
+GC_content	51.1
+L50	2
+LG50	2
+len_N50	604
+len_NG50	604
+len_max	30
+len_mean	324
+len_median	182
+len_min	1501
+num_A	895
+num_C	940
+num_G	807
+num_N	145
+num_T	778
+num_bp	3565
+num_bp_not_N	3420
+num_seq	11
--- a/test-data/test_out.txt	Wed Apr 21 09:10:46 2021 +0000
+++ b/test-data/test_out.txt	Mon Apr 26 10:01:43 2021 +0000
@@ -1,4 +1,5 @@
 GC_content	52.0
+L50	1
 len_N50	194780
 len_max	194780
 len_mean	194780