Mercurial > repos > konradpaszkiewicz > velvetoptimiser
comparison VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm @ 0:50ae1360fbbe default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
| author | konradpaszkiewicz |
|---|---|
| date | Tue, 07 Jun 2011 18:07:56 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:50ae1360fbbe |
|---|---|
| 1 # | |
| 2 # VelvetOpt::Utils.pm | |
| 3 # | |
| 4 # Copyright 2008,2009,2010 Simon Gladman <simon.gladman@csiro.au> | |
| 5 # | |
| 6 # This program is free software; you can redistribute it and/or modify | |
| 7 # it under the terms of the GNU General Public License as published by | |
| 8 # the Free Software Foundation; either version 2 of the License, or | |
| 9 # (at your option) any later version. | |
| 10 # | |
| 11 # This program is distributed in the hope that it will be useful, | |
| 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 # GNU General Public License for more details. | |
| 15 # | |
| 16 # You should have received a copy of the GNU General Public License | |
| 17 # along with this program; if not, write to the Free Software | |
| 18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 19 # MA 02110-1301, USA. | |
| 20 | |
| 21 # Version 2.1.3 | |
| 22 | |
| 23 # Changes for Version 2.0.1 | |
| 24 # Added Mikael Brandstrom Durling's numCpus and freeMem for the Mac. | |
| 25 # | |
| 26 # Changes for Version 2.1.0 | |
| 27 # Fixed bug in estExpCov so it now correctly uses all short read categories not just the first two. | |
| 28 # | |
| 29 # Changes for Version 2.1.2 | |
| 30 # Fixed bug in estExpCov so it now won't take columns with "N/A" or "Inf" into account | |
| 31 # | |
| 32 # Changes for Version 2.1.3 | |
| 33 # Changed the minimum contig size to use for estimating expected coverage to 3*kmer size -1 and set the minimum coverage to 2 instead of 0. | |
| 34 # This should get rid of exp_covs of 1 when it should be very high for assembling reads that weren't ampped to a reference using one of the standard read mapping programs | |
| 35 | |
| 36 | |
| 37 package VelvetOpt::Utils; | |
| 38 | |
| 39 use strict; | |
| 40 use lib "/usr/local/lib/perl5/site_perl/5.8.8"; | |
| 41 use warnings; | |
| 42 use POSIX qw(ceil floor); | |
| 43 use Carp; | |
| 44 use List::Util qw(max); | |
| 45 use Bio::SeqIO; | |
| 46 | |
| 47 # num_cpu | |
| 48 # It returns the number of cpus present in the system if linux. | |
| 49 # If it is MAC then it returns the number of cores present. | |
| 50 # If the OS is not linux or Mac then it returns 1. | |
| 51 # Written by Torsten Seemann 2009 (linux) and Mikael Brandstrom Durling 2009 (Mac). | |
| 52 | |
| 53 sub num_cpu { | |
| 54 if ( $^O =~ m/linux/i ) { | |
| 55 my ($num) = qx(grep -c ^processor /proc/cpuinfo); | |
| 56 chomp $num; | |
| 57 return $num if $num =~ m/^\d+/; | |
| 58 } | |
| 59 elsif( $^O =~ m/darwin/i){ | |
| 60 my ($num) = qx(system_profiler SPHardwareDataType | grep Cores); | |
| 61 $num =~ /.*Cores: (\d+)/; | |
| 62 $num =$1; | |
| 63 return $num; | |
| 64 } | |
| 65 return 1; | |
| 66 } | |
| 67 | |
| 68 # free_mem | |
| 69 # Returns the current amount of free memory | |
| 70 # Mac Section written by Mikael Brandstrom Durling 2009 (Mac). | |
| 71 | |
| 72 sub free_mem { | |
| 73 if( $^O =~ m/linux/i ) { | |
| 74 my $x = `free | grep '^Mem:' | sed 's/ */~/g' | cut -d '~' -f 4,7`; | |
| 75 my @tmp = split "~", $x; | |
| 76 my $total = $tmp[0] + $tmp[1]; | |
| 77 my $totalGB = $total / 1024 / 1024; | |
| 78 return $totalGB; | |
| 79 } | |
| 80 elsif( $^O =~ m/darwin/i){ | |
| 81 my ($tmp) = qx(vm_stat | grep size); | |
| 82 $tmp =~ /.*size of (\d+) bytes.*/; | |
| 83 my $page_size = $1; | |
| 84 ($tmp) = qx(vm_stat | grep free); | |
| 85 $tmp =~ /[^0-9]+(\d+).*/; | |
| 86 my $free_pages = $1; | |
| 87 my $totalGB = ($free_pages * $page_size) / 1024 / 1024 / 1024; | |
| 88 return $totalGB; | |
| 89 } | |
| 90 } | |
| 91 | |
| 92 # estExpCov | |
| 93 # it returns the expected coverage of short reads from an assembly by | |
| 94 # performing a math mode on the stats.txt file supplied.. It looks at | |
| 95 # all the short_cov? columns.. Uses minimum contig length and minimum coverage. | |
| 96 # needs the stats.txt file path and name, and the k-value used in the assembly. | |
| 97 # Original algorithm by Torsten Seemann 2009 under the GPL. | |
| 98 # Adapted by Simon Gladman 2009. | |
| 99 # It does a weighted mode... | |
| 100 | |
| 101 sub estExpCov { | |
| 102 use List::Util qw(max); | |
| 103 my $file = shift; | |
| 104 my $kmer = shift; | |
| 105 my $minlen = 3 * $kmer - 1; | |
| 106 my $mincov = 2; | |
| 107 my $fh; | |
| 108 unless ( open IN, $file ) { | |
| 109 croak "Unable to open $file for exp_cov determination.\n"; | |
| 110 } | |
| 111 my @cov; | |
| 112 while (<IN>) { | |
| 113 chomp; | |
| 114 my @x = split m/\t/; | |
| 115 my $len = scalar @x; | |
| 116 next unless @x >= 7; | |
| 117 next unless $x[1] =~ m/^\d+$/; | |
| 118 next unless $x[1] >= $minlen; | |
| 119 | |
| 120 #add all the short_cov columns.. | |
| 121 my $cov = 0; | |
| 122 for(my $i = 5; $i < $len; $i += 2){ | |
| 123 if($x[$i] =~ /\d/){ | |
| 124 $cov += $x[$i]; | |
| 125 } | |
| 126 } | |
| 127 next unless $cov > $mincov; | |
| 128 push @cov, ( ( int($cov) ) x $x[1] ); | |
| 129 } | |
| 130 | |
| 131 my %freq_of; | |
| 132 map { $freq_of{$_}++ } @cov; | |
| 133 my $mode = 0; | |
| 134 $freq_of{$mode} = 0; # sentinel | |
| 135 for my $x ( keys %freq_of ) { | |
| 136 $mode = $x if $freq_of{$x} > $freq_of{$mode}; | |
| 137 } | |
| 138 return $mode; | |
| 139 } | |
| 140 | |
| 141 # estVelvetMemUse | |
| 142 # returns the estimated memory usage for velvet in GB | |
| 143 | |
| 144 sub estVelvetMemUse { | |
| 145 my ($readsize, $genomesize, $numreads, $k) = @_; | |
| 146 my $velvetgmem = -109635 + 18977*$readsize + 86326*$genomesize + 233353*$numreads - 51092*$k; | |
| 147 my $out = ($velvetgmem/1024) / 1024; | |
| 148 return $out; | |
| 149 } | |
| 150 | |
| 151 # getReadSizeNum | |
| 152 # returns the number of reads and average size in the short and shortPaired categories... | |
| 153 | |
| 154 sub getReadSizeNum { | |
| 155 my $f = shift; | |
| 156 my %reads; | |
| 157 my $num = 0; | |
| 158 my $currentfiletype = "fasta"; | |
| 159 #first pull apart the velveth string and get the short and shortpaired filenames... | |
| 160 my @l = split /\s+/, $f; | |
| 161 my $i = 0; | |
| 162 foreach (@l){ | |
| 163 if(/^-/){ | |
| 164 if(/^-fasta/){ | |
| 165 $currentfiletype = "fasta"; | |
| 166 } | |
| 167 elsif(/^-fastq/){ | |
| 168 $currentfiletype = "fastq"; | |
| 169 } | |
| 170 elsif(/(-eland)|(-gerald)|(-fasta.gz)|(-fastq.gz)/) { | |
| 171 croak "Cannot estimate memory usage from file types other than fasta or fastq..\n"; | |
| 172 } | |
| 173 } | |
| 174 elsif(-r $_){ | |
| 175 my $file = $_; | |
| 176 if($currentfiletype eq "fasta"){ | |
| 177 my $x = `grep -c "^>" $file`; | |
| 178 chomp($x); | |
| 179 $num += $x; | |
| 180 my $l = &getReadLength($file, 'Fasta'); | |
| 181 $reads{$l} += $x; | |
| 182 print STDERR "File: $file has $x reads of length $l\n"; | |
| 183 } | |
| 184 else { | |
| 185 my $x = `grep -c "^@" $file`; | |
| 186 chomp($x); | |
| 187 $num += $x; | |
| 188 my $l = &getReadLength($file, 'Fastq'); | |
| 189 $reads{$l} += $x; | |
| 190 print STDERR "File: $file has $x reads of length $l\n"; | |
| 191 } | |
| 192 } | |
| 193 $i ++; | |
| 194 } | |
| 195 my $totlength = 0; | |
| 196 foreach my $k (keys %reads){ | |
| 197 $totlength += ($reads{$k} * $k); | |
| 198 } | |
| 199 | |
| 200 | |
| 201 my @results; | |
| 202 push @results, floor($totlength/$num); | |
| 203 push @results, ($num/1000000); | |
| 204 printf STDERR "Total reads: %.1f million. Avg length: %.1f\n",($num/1000000), ($totlength/$num); | |
| 205 return @results; | |
| 206 } | |
| 207 | |
| 208 # getReadLength - returns the length of the first read in a file of type fasta or fastq.. | |
| 209 # | |
| 210 sub getReadLength { | |
| 211 my ($f, $t) = @_; | |
| 212 my $sio = Bio::SeqIO->new(-file => $f, -format => $t); | |
| 213 my $s = $sio->next_seq() or croak "Something went bad while reading file $f!\n"; | |
| 214 return $s->length; | |
| 215 } | |
| 216 | |
| 217 return 1; | |
| 218 |
