Mercurial > repos > nml > combine_assemblystats
changeset 0:1855203c2e6c draft default tip
planemo upload for repository https://github.com/phac-nml/galaxy_tools commit 132092ff7fe1c4810d1221054419389180b81657
author | nml |
---|---|
date | Wed, 08 Nov 2017 16:38:50 -0500 |
parents | |
children | |
files | combine_stats.pl combine_stats.xml test-data/first.txt test-data/results.tabular test-data/second.txt |
diffstat | 5 files changed, 219 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env perl
use strict;
use warnings;
use autodie;
use Getopt::Long;

# combine_stats.pl
#
# Combine a set of assembly-statistics text reports into one tab-separated
# table in which every row is one strain.
#
# Usage:
#   combine_stats.pl --output OUT.tabular --stats NAME=FILE [--stats NAME=FILE ...]
#
# The column header is taken from the first report (strains are processed in
# sorted name order) and written once, at the top of the output.

my ( $files, $output ) = prepare_inputs();

# Sorted so the row order is deterministic regardless of hash ordering.
my @strains = sort keys %{$files};

open my $out, '>', $output;

# Tab-joined column names of the first report; every later report is compared
# against it so that a silently misaligned row at least produces a warning.
my $expected_columns;

for my $name (@strains) {
    my $is_first = !defined $expected_columns;
    my @columns  = process( $name, $files->{$name}, $out, $is_first );
    my $columns  = join "\t", @columns;

    if ($is_first) {
        $expected_columns = $columns;
    }
    elsif ( $columns ne $expected_columns ) {
        warn "Columns found for '$name' differ from the header row; "
          . "its values may be misaligned\n";
    }
}

close $out;

exit;

# process($name, $file, $out, $header)
#
# Read one assembly-stats report and write it to $out as a single
# tab-separated row whose first cell is $name.
#
#   $name   - strain name, used as the first cell of the row
#   $file   - path of the report to read
#   $out    - open, writable filehandle
#   $header - when true, a header row ("Strain" plus every key found in this
#             report) is written before the data row
#
# Only "key: value" lines that appear before the "Simple Dinucleotide repeats"
# section are used. Returns the list of column names found in this report
# (starting with "Strain"). Dies (via autodie) if $file cannot be read.
sub process {
    my ( $name, $file, $out, $header ) = @_;

    my @header = ('Strain');
    my @values = ($name);

    open my $in, '<', $file;

  LINE:
    while ( my $line = <$in> ) {
        chomp $line;

        next LINE if length $line == 0;

        # Everything from the repeat sections onward is not wanted.
        last LINE if $line =~ /Simple Din.*repeats/;

        # Limit of 2 keeps any further ':' inside the value intact.
        my ( $key, $value ) = split /:/, $line, 2;

        # A line without ':' carries no value at all.
        next LINE if !defined $value;

        # Trim out the tabs.
        $key   =~ tr/\t//d;
        $value =~ tr/\t//d;

        # Section headings ("Statistics for ...:") have an empty value.
        # Test the length, not the truthiness, so a value of "0" is kept.
        next LINE if length $value == 0;

        push @header, $key;
        push @values, $value;
    }

    close $in;

    if ($header) {
        print {$out} join( "\t", @header ), "\n";
    }
    print {$out} join( "\t", @values ), "\n";

    return @header;
}

# prepare_inputs()
#
# Parse the command line.
#
#   --stats NAME=FILE   may be repeated; maps a strain name to its report
#   --output FILE       path of the combined table to write
#
# Returns (\%files, $output). Dies if the options are invalid, if no --stats
# pair was given, or if --output is missing.
sub prepare_inputs {

    my ( $output, %files );

    GetOptions(
        'stats=s'  => \%files,
        'output=s' => \$output,
    ) or die "Invalid options given\n";

    if ( !%files ) {
        die "No files given\n";
    }

    # Without this check open() would be handed undef as the file name, which
    # Perl treats as a request for an anonymous temporary file: the run would
    # succeed and the results would be thrown away.
    if ( !defined $output || length $output == 0 ) {
        die "No output file given\n";
    }

    return ( \%files, $output );
}
<!-- combine_stats.xml: Galaxy tool wrapper around combine_stats.pl -->
<tool id="combine_stats" name="Combine AssemblyStats" version="1.0">
    <description>Combines List Collection Assembly Statistics</description>
    <requirements>
        <requirement type="package" version="2.49">perl-getopt-long</requirement>
    </requirements>
    <!-- One "element name=dataset path" pair is passed via its own stats option for each collection element. -->
    <command detect_errors="exit_code"><![CDATA[
        perl '$__tool_directory__/combine_stats.pl'

        --output '$result'
        #for $f in $texts.keys# --stats '$f=$texts[$f]' #end for#
    ]]></command>
    <inputs>
        <param name="texts" type="data_collection" label="List Collection of Assembly Statistics" help="" optional="false" collection_type="list" />
    </inputs>
    <outputs>
        <data name="result" format="tabular" ></data>
    </outputs>
    <tests>
        <test>
            <param name="texts">
                <collection type="list">
                    <element name="first" value="first.txt" />
                    <element name="second" value="second.txt" />
                </collection>
            </param>
            <output name="result" file="results.tabular"/>
        </test>
    </tests>
    <help><![CDATA[

**What it does**

Takes one or more https://toolshed.g2.bx.psu.edu/view/nml/assemblystats/ results and combines them into a single tabular file in which each result is represented by one line.

    ]]>
    </help>
    <citations>
        <!-- BibTeX separates authors with "and"; the entry is closed by exactly one brace. -->
        <citation type="bibtex">@ARTICLE{a1,
            title = {Combine AssemblyStats},
            author = {Mariam Iskander and Philip Mabon},
            url = {https://github.com/phac-nml/galaxy_tools/}
            }
        </citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/first.txt Wed Nov 08 16:38:50 2017 -0500 @@ -0,0 +1,34 @@ +Statistics for contig lengths: + Min contig length: 1,134 + Max contig length: 101,601 + Mean contig length: 24480.24 + Standard deviation of contig length: 23362.52 + Median contig length: 16,139 + N50 contig length: 45,147 + +Statistics for numbers of contigs: + Number of contigs: 186 + Number of contigs >=1kb: 186 + Number of contigs in N50: 35 + +Statistics for bases in the contigs: + Number of bases in all contigs: 4,553,325 + Number of bases in contigs >=1kb: 4,553,325 + GC Content of contigs: 47.56 % + +Simple Dinucleotide repeats: + Number of contigs with over 70% dinucleotode repeats: 0.00 % (0 contigs) + AT: 0.00 % (0 contigs) + CG: 0.00 % (0 contigs) + AC: 0.00 % (0 contigs) + TG: 0.00 % (0 contigs) + AG: 0.00 % (0 contigs) + TC: 0.00 % (0 contigs) + +Simple mononucleotide repeats: + Number of contigs with over 50% mononucleotode repeats: 0.00 % (0 contigs) + AA: 0.00 % (0 contigs) + TT: 0.00 % (0 contigs) + CC: 0.00 % (0 contigs) + GG: 0.00 % (0 contigs) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/results.tabular Wed Nov 08 16:38:50 2017 -0500 @@ -0,0 +1,3 @@ +Strain Min contig length Max contig length Mean contig length Standard deviation of contig length Median contig length N50 contig length Number of contigs Number of contigs >=1kb Number of contigs in N50 Number of bases in all contigs Number of bases in contigs >=1kb GC Content of contigs +first 1,134 101,601 24480.24 23362.52 16,139 45,147 186 186 35 4,553,325 4,553,325 47.56 % +second 1,134 101,601 24967.61 23081.09 16,588 44,563 185 185 37 4,619,008 4,619,008 47.50 %
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/second.txt Wed Nov 08 16:38:50 2017 -0500 @@ -0,0 +1,34 @@ +Statistics for contig lengths: + Min contig length: 1,134 + Max contig length: 101,601 + Mean contig length: 24967.61 + Standard deviation of contig length: 23081.09 + Median contig length: 16,588 + N50 contig length: 44,563 + +Statistics for numbers of contigs: + Number of contigs: 185 + Number of contigs >=1kb: 185 + Number of contigs in N50: 37 + +Statistics for bases in the contigs: + Number of bases in all contigs: 4,619,008 + Number of bases in contigs >=1kb: 4,619,008 + GC Content of contigs: 47.50 % + +Simple Dinucleotide repeats: + Number of contigs with over 70% dinucleotode repeats: 0.00 % (0 contigs) + AT: 0.00 % (0 contigs) + CG: 0.00 % (0 contigs) + AC: 0.00 % (0 contigs) + TG: 0.00 % (0 contigs) + AG: 0.00 % (0 contigs) + TC: 0.00 % (0 contigs) + +Simple mononucleotide repeats: + Number of contigs with over 50% mononucleotode repeats: 0.00 % (0 contigs) + AA: 0.00 % (0 contigs) + TT: 0.00 % (0 contigs) + CC: 0.00 % (0 contigs) + GG: 0.00 % (0 contigs) +