Mercurial > repos > nml > combine_assemblystats

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_stats.pl	Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,103 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+
+#quick and dirty script to combine a list of assembly stats files into a single tab-separated file where each row is one strain
+
+
+#parse the command line: strain name => stats file, plus the output path
+my ($files,$output) = prepare_inputs();
+
+
+#sort the strain names so rows are always written in the same order
+my @strains = sort { $a cmp $b } keys %{$files};
+
+
+#autodie (enabled at the top of the script) aborts if the file cannot be opened
+open my $out,'>',$output;
+
+
+#the header row is taken from the first strain's file only, so columns line
+#up only when every file lists the same statistics in the same order
+my $print_header = 1;
+
+foreach my $name (@strains) {
+    process($name,$files->{$name},$out,$print_header);
+    $print_header = 0;
+}
+
+
+
+close $out;
+
+
+exit;
+
+# Read one assembly-stats report and write it to $out as a single tab-separated row.
+#   $name   - strain label, emitted in the leading "Strain" column
+#   $file   - stats file to read (handed straight to open)
+#   $out    - open, writable filehandle for the combined table
+#   $header - when true, print the column header line before the data row
+# Report lines look like "<tab>Key:<tab>Value"; section titles ("Statistics for ...:")
+# carry no value and are skipped. Parsing stops at the dinucleotide-repeat section.
+sub process {
+    my ($name,$file,$out,$header) = @_;
+
+    my @header = ("Strain");
+    my @values = ($name);
+
+    open my $in,'<',$file;
+    while ( my $line = <$in> ) {
+        chomp $line;
+        next if length $line == 0;
+
+        #if we hit this section, we are done reading this file since the rest we do not care about
+        last if $line =~ /Simple Din.*repeats/;
+
+        #split on the first colon only so values that themselves contain ':' are kept whole
+        my ($key,$value) = split /:/, $line, 2;
+
+        #lines without any colon have no value to report
+        next if !defined $value;
+
+        #trim out the tabs
+        $key   =~ s/\t//g;
+        $value =~ s/\t//g;
+
+        #test length, not truth, so a legitimate value of "0" keeps its column
+        if ( length $value ) {
+            push @header,$key;
+            push @values,$value;
+        }
+    }
+    close $in;
+
+    print {$out} join("\t",@header) . "\n" if $header;
+    print {$out} join("\t",@values) . "\n";
+
+    return;
+}
+
+
+# Parse the command line and return (\%files, $output).
+#   --stats NAME=PATH   repeatable; each pair maps a strain name to its stats file
+#   --output PATH       file the combined table is written to
+# Dies when an option is malformed, no --stats pair is given or --output is missing.
+sub prepare_inputs {
+
+    my ($output,%files);
+
+    GetOptions('stats=s'  => \%files,
+               'output=s' => \$output)
+        or die "Invalid options given\n";
+
+    die "No files given\n" if !%files;
+
+    #without this check the caller would try to open an undefined path
+    die "No output file given\n"
+        if !defined $output || length $output == 0;
+
+    return (\%files,$output);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_stats.xml	Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,45 @@
+<tool id="combine_stats" name="Combine AssemblyStats" version="1.0">
+  <description>Combines List Collection Assembly Statistics </description>
+  <requirements>
+    <requirement type="package" version="2.49">perl-getopt-long</requirement>
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+  perl '$__tool_directory__/combine_stats.pl'
+
+  --output '$result'
+    #for $f in $texts.keys# --stats '$f=$texts[$f]' #end for#
+  ]]></command>
+  <inputs>
+    <param name="texts" type="data_collection" label="List Collection of Assembly Statistics" help="" optional="false" collection_type="list" />
+  </inputs>
+  <outputs>
+    <data name="result"  format="tabular" ></data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="texts">
+        <collection type="list">
+          <element name="first" value="first.txt" />
+          <element name="second" value="second.txt" />
+        </collection>
+      </param>
+      <output name="result" file="results.tabular"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+
+  **What it does**
+
+  This tool takes one or more assemblystats (https://toolshed.g2.bx.psu.edu/view/nml/assemblystats/) results and combines them into a single tabular file in which each result is represented by a single line.
+
+  ]]>
+  </help>
+    <citations>
+        <citation type="bibtex">@ARTICLE{a1,
+            title = {Combine AssemblyStats},
+            author = {Mariam Iskander and Philip Mabon},
+            url = {https://github.com/phac-nml/galaxy_tools/}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/first.txt	Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,34 @@
+Statistics for contig lengths:
+	Min contig length:	1,134
+	Max contig length:	101,601
+	Mean contig length:	24480.24
+	Standard deviation of contig length:	23362.52
+	Median contig length:	16,139
+	N50 contig length:	45,147
+
+Statistics for numbers of contigs:
+	Number of contigs:	186
+	Number of contigs >=1kb:	186
+	Number of contigs in N50:	35
+
+Statistics for bases in the contigs:
+	Number of bases in all contigs:	4,553,325
+	Number of bases in contigs >=1kb:	4,553,325
+	GC Content of contigs:	47.56 %
+
+Simple Dinucleotide repeats:
+	Number of contigs with over 70% dinucleotode repeats:	0.00 % (0 contigs)
+	AT:	0.00 % (0 contigs)
+	CG:	0.00 % (0 contigs)
+	AC:	0.00 % (0 contigs)
+	TG:	0.00 % (0 contigs)
+	AG:	0.00 % (0 contigs)
+	TC:	0.00 % (0 contigs)
+
+Simple mononucleotide repeats:
+	Number of contigs with over 50% mononucleotode repeats:	0.00 % (0 contigs)
+	AA:	0.00 % (0 contigs)
+	TT:	0.00 % (0 contigs)
+	CC:	0.00 % (0 contigs)
+	GG:	0.00 % (0 contigs)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results.tabular	Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,3 @@
+Strain	Min contig length	Max contig length	Mean contig length	Standard deviation of contig length	Median contig length	N50 contig length	Number of contigs	Number of contigs >=1kb	Number of contigs in N50	Number of bases in all contigs	Number of bases in contigs >=1kb	GC Content of contigs
+first	1,134	101,601	24480.24	23362.52	16,139	45,147	186	186	35	4,553,325	4,553,325	47.56 %
+second	1,134	101,601	24967.61	23081.09	16,588	44,563	185	185	37	4,619,008	4,619,008	47.50 %
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/second.txt	Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,34 @@
+Statistics for contig lengths:
+	Min contig length:	1,134
+	Max contig length:	101,601
+	Mean contig length:	24967.61
+	Standard deviation of contig length:	23081.09
+	Median contig length:	16,588
+	N50 contig length:	44,563
+
+Statistics for numbers of contigs:
+	Number of contigs:	185
+	Number of contigs >=1kb:	185
+	Number of contigs in N50:	37
+
+Statistics for bases in the contigs:
+	Number of bases in all contigs:	4,619,008
+	Number of bases in contigs >=1kb:	4,619,008
+	GC Content of contigs:	47.50 %
+
+Simple Dinucleotide repeats:
+	Number of contigs with over 70% dinucleotode repeats:	0.00 % (0 contigs)
+	AT:	0.00 % (0 contigs)
+	CG:	0.00 % (0 contigs)
+	AC:	0.00 % (0 contigs)
+	TG:	0.00 % (0 contigs)
+	AG:	0.00 % (0 contigs)
+	TC:	0.00 % (0 contigs)
+
+Simple mononucleotide repeats:
+	Number of contigs with over 50% mononucleotode repeats:	0.00 % (0 contigs)
+	AA:	0.00 % (0 contigs)
+	TT:	0.00 % (0 contigs)
+	CC:	0.00 % (0 contigs)
+	GG:	0.00 % (0 contigs)
+