Mercurial > repos > dcouvin > nuclescore
comparison nuclescore.sample @ 0:82dce1eb9074 draft default tip
Uploaded
| author | dcouvin |
|---|---|
| date | Fri, 03 Sep 2021 22:36:56 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:82dce1eb9074 |
|---|---|
| 1 <tool id="nuclescoretool" name="nuclescore" version="0.1.0"> | |
| 2 <description>nuclescore</description> | |
| 3 | |
| 4 | |
| 5 <command detect_errors="aggressive"><![CDATA[ | |
| 6 | |
| 7 #import re | |
| 8 ## Creates symlinks for each input file based on the Galaxy 'element_identifier' | |
| 9 ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat') | |
| 10 #set $named_input_files = '' | |
| 11 #for $input_file in $input_files | |
| 12 ## Add single quotes around each input file identifier | |
| 13 #set $_input_file = "'{}'".format($input_file.element_identifier) | |
| 14 ln -s '${input_file}' ${_input_file} && | |
| 15 #set $named_input_files = $named_input_files + ' ' + $_input_file | |
| 16 #end for | |
| 17 | |
| 18 perl '$__tool_directory__/nucleScore.pl' $_input_file $output | |
| 19 | |
| 20 | |
| 21 | |
| 22 | |
| 23 ]]></command> | |
| 24 <!-- ./nuclescore.sh ${named_input_files} > "$output" --> | |
| 25 | |
| 26 <inputs> | |
| 27 <param format="fasta" name="input_files" type="data" label="Genome fasta file : " multiple="true" display="checkboxes"/> | |
| 28 </inputs> | |
| 29 | |
| 30 <outputs> | |
| 31 <data format="tabular" name="output" /> | |
| 32 </outputs> | |
| 33 | |
| 34 <help> | |
| 35 No documentation | |
| 36 </help> | |
| 37 | |
| 38 </tool> | |
| 39 | |
| 40 | |
| 41 ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
| 42 | |
| 43 #!/usr/bin/perl | |
| 44 | |
| 45 use strict; | |
| 46 use warnings; | |
| 47 use Bio::SeqIO; | |
| 48 use Shannon::Entropy qw/entropy/; | |
| 49 use File::Basename; | |
| 50 #use Bio::Species; | |
| 51 | |
| 52 #use FindBin; | |
| 53 #use lib "$FindBin::RealBin/../perl5"; | |
| 54 | |
| 55 my $input = $ARGV[0]; | |
| 56 #chercher comment faire une liste perl pour input | |
| 57 my @liste = split(/,/, $input); | |
| 58 my $recap_total_seq = $ARGV[1]; | |
| 59 | |
| 60 #my ($input, $recap_total_seq) = @ARGV; | |
| 61 | |
| 62 my $start = time(); | |
| 63 | |
| 64 #my $file = ""; #= $ARGV[0]; | |
| 65 #my $recap_total_seq = "nucleScore_result.xls"; | |
| 66 | |
| 67 open (RECAP,'>', $recap_total_seq) or die "could not open $!"; | |
| 68 print RECAP "File\tA percent\tT percent\tC percent\tG percent\tGC percent\tAT/GC ratio\tNucleScore\tShannon Entropy\tAAA\tAAT\n"; | |
| 69 close(RECAP); | |
| 70 | |
| 71 | |
| 72 #FASTA files | |
| 73 #if(@ARGV){ | |
| 74 | |
| 75 #for (my $i = 0; $i <= $#ARGV; $i++) { | |
| 76 #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { | |
| 77 # $recap_total_seq = $ARGV[$i+1]; | |
| 78 #} | |
| 79 #} | |
| 80 | |
| 81 | |
| 82 open (RECAP,'>>', $recap_total_seq) or die "could not open $!"; | |
| 83 | |
| 84 #refaire le for pour la liste input | |
| 85 #for my $arg (@ARGV){ | |
| 86 for my $arg (@liste){ | |
| 87 # if ($arg =~ m/.fasta/ or $arg =~ m/.fna/ or $arg =~ m/.fa/){ | |
| 88 | |
| 89 #print "Traitement du fichier de sequence: $arg\n"; | |
| 90 print "Traitement du fichier de sequence: $arg\n"; | |
| 91 #my $file = $arg; | |
| 92 my $file = $arg; | |
| 93 | |
| 94 | |
| 95 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file); | |
| 96 my $globalSeq = ""; | |
| 97 while (my $seq = $seqIO->next_seq()) { | |
| 98 my $seqID = $seq->id; | |
| 99 my $seqNuc = $seq->seq; | |
| 100 $globalSeq .= $seqNuc; | |
| 101 #push @arrayID, $seqID; | |
| 102 #$hSeq{$seqID} = $seqNuc; | |
| 103 #my @seqArray = split //, $seqNuc; | |
| 104 } | |
| 105 | |
| 106 my $gcpercent = gc_percent($globalSeq); | |
| 107 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file); | |
| 108 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); | |
| 109 | |
| 110 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt); | |
| 111 | |
| 112 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent); | |
| 113 | |
| 114 my $variance = shift_data_variance(@percentList); | |
| 115 my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length); | |
| 116 my $entropy = entropy($globalSeq); | |
| 117 | |
| 118 print "The sequence length for $file is: $length\n"; | |
| 119 print "A percent: $aPercent\n"; | |
| 120 print "T percent: $tPercent\n"; | |
| 121 print "G percent: $gPercent\n"; | |
| 122 print "C percent: $cPercent\n"; | |
| 123 print "N percent: $nPercent\n"; | |
| 124 | |
| 125 print "GC percent: $gcpercent\n"; | |
| 126 | |
| 127 print "AT/GC ratio: $atgcRatio\n"; | |
| 128 | |
| 129 print "NucleScore: $nucleScore\n"; | |
| 130 | |
| 131 print "Shannon Entropy: $entropy\n\n"; | |
| 132 | |
| 133 print "3 digits:\n"; | |
| 134 my @trinucs=($globalSeq=~/(?=(.{3}))/g); | |
| 135 my %tri_count=(); | |
| 136 $tri_count{$_}++ for @trinucs; | |
| 137 print $_,":",$tri_count{$_},"\n" for sort keys(%tri_count); | |
| 138 print "\n2 digits:\n"; | |
| 139 my @trinucs2=($globalSeq=~/(?=(.{2}))/g); | |
| 140 my %tri_count2=(); | |
| 141 $tri_count2{$_}++ for @trinucs2; | |
| 142 print $_,":",$tri_count2{$_},"\n" for sort keys(%tri_count2); | |
| 143 | |
| 144 my $aaa = $tri_count{'AAA'}; | |
| 145 my $aat = $tri_count{'AAT'}; | |
| 146 | |
| 147 print "--------------------------------------\n\n"; | |
| 148 | |
| 149 my $label = basename($file); | |
| 150 | |
| 151 | |
| 152 #Summary file | |
| 153 #print RECAP "$file\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n"; | |
| 154 print RECAP "$label\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n"; | |
| 155 #} | |
| 156 } | |
| 157 close (RECAP) or die "close file error : $!"; | |
| 158 #} | |
| 159 | |
| 160 my $end = time(); | |
| 161 | |
| 162 my $total = $end - $start; | |
| 163 | |
| 164 print "***** Total time (in seconds) is: $total *****\n"; | |
| 165 | |
| 166 #------------------------------------------------------------------------------ | |
| 167 # number nucleotid and length | |
| 168 sub number_nuc_length_seq { | |
| 169 my ($fastaFile) = @_; | |
| 170 my $ade = 0; | |
| 171 my $thy = 0; | |
| 172 my $gua = 0; | |
| 173 my $cyt = 0; | |
| 174 my $n = 0; | |
| 175 my $length = 0; | |
| 176 | |
| 177 open (FASTA, "<", $fastaFile) or die "Could not open $!"; | |
| 178 while (<FASTA>) { | |
| 179 chomp; | |
| 180 if ($_ !~ />/) { | |
| 181 my @seq = split //, $_; | |
| 182 | |
| 183 for my $nuc (@seq) { | |
| 184 $length +=1 ; | |
| 185 if ($nuc =~ /a/i) {$ade+=1;} | |
| 186 elsif ($nuc =~ /t/i) {$thy+=1;} | |
| 187 elsif ($nuc =~ /g/i) {$gua+=1;} | |
| 188 elsif ($nuc =~ /c/i) {$cyt+=1;} | |
| 189 elsif ($nuc =~ /n/i) {$n+=1;} | |
| 190 } | |
| 191 } | |
| 192 } | |
| 193 close(FASTA) or die "Error close file :$!"; | |
| 194 return ($ade, $thy, $gua, $cyt, $n, $length); | |
| 195 | |
| 196 } | |
| 197 | |
| 198 #------------------------------------------------------------------------------ | |
| 199 # compute percentage of nucleotid | |
| 200 sub nucleotid_percent { | |
| 201 my($ade, $thy, $gua, $cyt, $n, $length) = @_; | |
| 202 | |
| 203 my $adePercent = $ade / $length * 100; | |
| 204 my $thyPercent = $thy / $length * 100; | |
| 205 my $guaPercent = $gua / $length * 100; | |
| 206 my $cytPercent = $cyt / $length * 100; | |
| 207 my $nPercent = $n / $length * 100; | |
| 208 | |
| 209 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent); | |
| 210 | |
| 211 } | |
| 212 | |
| 213 #------------------------------------------------------------------------------ | |
| 214 # compute GC pourcent | |
| 215 sub gc_percent { | |
| 216 my ($seq) = @_; | |
| 217 | |
| 218 my @charSeq = split(//, uc($seq)); | |
| 219 my %hashFlank = (); | |
| 220 | |
| 221 foreach my $v (@charSeq) { | |
| 222 $hashFlank{$v} += 1; | |
| 223 } | |
| 224 | |
| 225 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;} | |
| 226 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;} | |
| 227 | |
| 228 if(length($seq) == 0) { | |
| 229 return 0; | |
| 230 } | |
| 231 else { | |
| 232 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100; | |
| 233 } | |
| 234 | |
| 235 } | |
| 236 #------------------------------------------------------------------------------ | |
| 237 # compute ATGC ratio | |
| 238 sub atgc_ratio { | |
| 239 my ($ade, $thy, $gua, $cyt) = @_; | |
| 240 | |
| 241 return (($ade + $thy) / ($gua + $cyt)); | |
| 242 | |
| 243 } | |
| 244 #------------------------------------------------------------------------------ | |
| 245 # variance | |
| 246 sub shift_data_variance { | |
| 247 my (@data) = @_; | |
| 248 | |
| 249 if ($#data + 1 < 2) { return 0.0; } | |
| 250 | |
| 251 my $K = $data[0]; | |
| 252 my ($n, $Ex, $Ex2) = 0.0; | |
| 253 | |
| 254 for my $x (@data) { | |
| 255 $n = $n + 1; | |
| 256 $Ex += $x - $K; | |
| 257 $Ex2 += ($x - $K) * ($x - $K); | |
| 258 } | |
| 259 | |
| 260 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1) | |
| 261 | |
| 262 return $variance; | |
| 263 | |
| 264 } | |
| 265 #------------------------------------------------------------------------------ | |
| 266 # nucle score | |
| 267 #sub nucle_score { | |
| 268 # my ($variance, $gcPercent, $atgcRatio, $length) = @_; | |
| 269 # | |
| 270 # return (($variance * $gcPercent * $atgcRatio) / $length); | |
| 271 #} | |
| 272 sub nucle_score { | |
| 273 my ($variance, $gcPercent, $atgcRatio, $length) = @_; | |
| 274 return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length)); | |
| 275 } | |
| 276 | |
| 277 #------------------------------------------------------------------------------ | |
| 278 sub log2 { | |
| 279 my $n = shift; | |
| 280 return (log($n) / log(2)); | |
| 281 } |
