Mercurial > repos > melpetera > acorf
changeset 0:d03fcbeb0a77 draft
Uploaded
author | melpetera |
---|---|
date | Fri, 18 Oct 2019 04:59:51 -0400 |
parents | |
children | 26aa3a8f95ce |
files | ACF/Analytic_correlation_filtration.pl ACF/README.md ACF/analytic_correlation_filtration.xml ACF/data/default_list.csv ACF/lib/IonFiltration.pm ACF/static/images/Adduct_fragment_list.JPG ACF/static/images/similarity_matrix.JPG |
diffstat | 7 files changed, 1305 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl
#
# ACF/Analytic_correlation_filtration.pl
#
# Groups analytically correlated ions and flags one representative ion per
# group. Steps, in the order they run below:
#   1. load the list of known mass differences (options 2 and 3 only);
#   2. call IonFiltration::sifTableCreation, which returns the path of the sif
#      table of ion pairs and a hash giving the rt and mz of each ion;
#   3. group the pairs of the sif table that satisfy the RT and/or mass
#      criteria selected with '-o';
#   4. annotate each ion with the known mass differences to its group mates;
#   5. choose the representative ion of each group according to '-r';
#   6. annotate each ion relative to the representative of its group;
#   7. append the resulting columns to the variableMetadata table.

### Perl modules
use warnings;
use strict;
use Getopt::Long qw(GetOptions);    # Creation of script options
use Pod::Usage qw(pod2usage);       # Creation of script options

# Personal packages
use FindBin;                        # Locates the directory of the original perl script
use lib "$FindBin::Bin/lib";
use IonFiltration;

# Options to complete
my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt,
    $rt_threshold, $mass_threshold, $output_sif, $output_tabular,
    $correl_threshold, $intensity_threshold, $intensity_pourc);

# '-r' value => name of the representative column written in the output header.
my %REPRES_COLUMN = (
    "intensity"              => "intensity_repres",
    "mass"                   => "mass_repres",
    "mixt"                   => "mass2intens_repres",
    "max_intensity_max_mass" => "max_intensity_max_mass_repres",
);

# '-r' value => per-ion statistic that is maximised to pick the representative
# ("max_intensity_max_mass" has its own selection routine).
my %REPRES_FIELD = (
    "intensity" => "intensity",
    "mass"      => "mass",
    "mixt"      => "squaredmassint",
);

########################
### Options and help ###
########################

GetOptions(
    "f=s"              => \$file,
    "m=s"              => \$mass_file,
    "o=s"              => \$opt,
    "d=s"              => \$dataMatrix,
    "v=s"              => \$combined_DMVM,
    "r=s"              => \$repres_opt,
    "rt=f"             => \$rt_threshold,
    "mass=f"           => \$mass_threshold,
    "output_sif=s"     => \$output_sif,
    "output_tabular=s" => \$output_tabular,
    "correl=s"         => \$correl_threshold,
    "IT=f"             => \$intensity_threshold,
    "IP=f"             => \$intensity_pourc,
) or pod2usage(2);

### Check required parameters :
pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file;
pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation.
#It should be a number (1 ; 2 or 3) :
#	1 : Don't take into acount mass information (only RT) ;
#	2 : Check that all mass differences are include in a specific list and taking into acount RT information
#	3 : Check that all mass differences are include in a specific list, ignoring RT information
#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt;
pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation.
It should be one of the 4 options below :
	"mass" : choose the ion with the highest mass as the representant
	"intensity" : choose the ion with the highest intensity as the representant
	"mixt" : choose the ion with the highest (mass^2 * intensity) as the representant
	"max_intensity_max_mass" : choose the ion with the highest mass among the 3 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt;
pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix;
pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM;
pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold;
pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular;
pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif;

# '-o' is compared numerically everywhere below, so reject anything else early.
pod2usage({-message=>q{Argument '-o' must be 1, 2 or 3}, -exitval=>1, -verbose=>0}) unless $opt =~ /\A[123]\z/;

# Thresholds are only mandatory for the grouping options that use them.
# 'defined' is used because 0 is a legitimate threshold.
if ($opt != 1) {
    pod2usage({-message=>q{Mandatory argument '-m' is missing (required with '-o 2' and '-o 3')}, -exitval=>1, -verbose=>0}) unless defined $mass_file;
    pod2usage({-message=>q{Mandatory argument '-mass' is missing (required with '-o 2' and '-o 3')}, -exitval=>1, -verbose=>0}) unless defined $mass_threshold;
}
if ($opt != 3) {
    pod2usage({-message=>q{Mandatory argument '-rt' is missing (required with '-o 1' and '-o 2')}, -exitval=>1, -verbose=>0}) unless defined $rt_threshold;
}

if (!exists $REPRES_COLUMN{$repres_opt}) {
    print "you must indicate \"mass\", \"intensity\", \"mixt\" or \"max_intensity_max_mass\" for the --r otpion\n";
    exit 1;    # non-zero so that the caller sees the failure
}


#########################################################################
#### Creation of a hash containing all possible adducts and fragments ###
#########################################################################

my %hmass;
if ($opt != 1) {
    %hmass = IonFiltration::MassCollecting($mass_file);
}

# Flat, sorted view of %hmass as [ label, mass difference ] pairs. %hmass is
# read as label => { mass difference => ... }. Sorting makes the order of the
# annotations reproducible from one run to the next.
my @known_diffs;
foreach my $label (sort keys %hmass) {
    foreach my $mass (sort { $a <=> $b } keys %{ $hmass{$label} }) {
        push @known_diffs, [ $label, $mass ];
    }
}

print "Création of a hash containing all adduits and fragments possible\n";


########################################################
### Creation of a sif table + correlation filtration ###
########################################################

my %hrtmz;    # ion id => { rt => ..., mz => ... }
($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass);
print "Creation of a sif table + correlation filtration done\n";


######################################################
### Analytic correlation filtrering follow options ###
######################################################

my %hcorrelgroup;    # group name => { ion id => 1, later replaced by its annotation }
my %group_of;        # ion id => name of the single group that holds it
my $groupct = 1;     # number given to the next group created

open(my $sif_fh, '<', $output_sif) or die "Impossible to open $output_sif\n";

while (my $line = <$sif_fh>) {
    chomp $line;
    my @tline = split(/\t/, $line);
    next if scalar(@tline) < 3;    # blank or truncated line: no ion pair to read

    my ($ion_a, $ion_b) = @tline[0, 2];

    # Options 1 and 2: both ions must elute within +/- rt_threshold.
    if ($opt != 3) {
        my $rt_a = $hrtmz{$ion_a}{rt};
        my $rt_b = $hrtmz{$ion_b}{rt};
        next if ($rt_a < $rt_b - $rt_threshold) || ($rt_a > $rt_b + $rt_threshold);
    }

    # Options 2 and 3: the mass difference must be in the list of known ones.
    if ($opt != 1) {
        my $diff = abs($hrtmz{$ion_a}{mz} - $hrtmz{$ion_b}{mz});
        my @hits = matching_differences($diff, $mass_threshold, \@known_diffs);
        next unless @hits;
    }

    link_ions(\%hcorrelgroup, \%group_of, \$groupct, $ion_a, $ion_b);
}
close($sif_fh);

print "Analytic correlation filtrering follow options done\n";


#############################################
### Join groups that have been subdivided ###
#############################################

# Nothing left to do here: link_ions() merges two groups as soon as a pair
# connects them, so every ion already belongs to exactly one group.
# The message is kept so that the log stays the same.
print "Join groups that have been subdivided done\n";


#######################################################
### Addition of annotation information among groups ###
#######################################################

if (@known_diffs) {
    foreach my $group (keys %hcorrelgroup) {
        my @members = sort keys %{ $hcorrelgroup{$group} };
        foreach my $ion (@members) {
            foreach my $partner (@members) {
                next if $partner eq $ion;

                my $delta = $hrtmz{$ion}{mz} - $hrtmz{$partner}{mz};
                my $sign  = $delta < 0 ? "-" : "+";
                my $diff  = abs($delta);
                my $count = 0;    # matches already written for this partner

                foreach my $hit (matching_differences($diff, $mass_threshold, \@known_diffs)) {
                    my ($label, $mass) = @{$hit};
                    my $diff_list = sprintf("%0.6f", abs($diff - $mass));
                    my $val = $sign."(".$label.")(".$diff_list.")|";

                    # The partner id is written once, in front of its first match.
                    $val = '@'.$partner."|".$val if $count == 0;

                    if ($hcorrelgroup{$group}{$ion} eq 1) {
                        $hcorrelgroup{$group}{$ion} = $val;    # replaces the placeholder
                    }
                    else {
                        $hcorrelgroup{$group}{$ion} .= $val;
                    }
                    $count++;
                }
            }
        }
    }
}

print "Addition of annotation information among groups done\n";


####################################################
### Choose the representative ion for each group ###
####################################################

my %hgrouprepres;    # group name => { ion id => { mass, intensity, squaredmassint } }

open(my $dm_fh, '<', $dataMatrix) or die "Impossible to open $dataMatrix\n";

while (my $line = <$dm_fh>) {
    chomp $line;
    my ($ion, @values) = split(/\t/, $line);

    next unless defined $ion && exists $group_of{$ion};
    next unless @values;    # no intensity column: the mean is undefined

    my $total = 0;
    $total += $_ foreach @values;

    my $mass = $hrtmz{$ion}{mz};
    my $mean = $total / scalar(@values);

    $hgrouprepres{ $group_of{$ion} }{$ion} = {
        mass           => $mass,
        intensity      => $mean,
        # mass^2 * intensity, as stated in the usage text and in the tool help
        squaredmassint => ($mass ** 2) * $mean,
    };
}
close($dm_fh);

my %hrepres;    # group name => id of its representative ion ("" when none qualifies)

foreach my $group (keys %hgrouprepres) {
    if ($repres_opt eq "max_intensity_max_mass") {
        $hrepres{$group} = pick_max_mass_among_top_intensity(
            $hgrouprepres{$group},
            defined $intensity_threshold ? $intensity_threshold : 0,
            defined $intensity_pourc     ? $intensity_pourc     : 0,
        );
    }
    else {
        $hrepres{$group} = pick_highest($hgrouprepres{$group}, $REPRES_FIELD{$repres_opt});
    }
}

print "Choose the representative ion for each group done\n";


#############################################################################
### Addition of annotation information relative to the representative ion ###
#############################################################################

my %hreprescomparison;    # group name => { ion id => { repres_diff => annotation } }

if ($opt != 1) {
    foreach my $group (keys %hcorrelgroup) {
        my $representative = $hrepres{$group};

        # Groups without a usable representative are left unannotated.
        next unless defined $representative && $representative ne "";

        foreach my $ion (keys %{ $hcorrelgroup{$group} }) {
            if ($ion eq $representative) {
                $hreprescomparison{$group}{$ion}{repres_diff} = "M";
                next;
            }

            my $delta = $hrtmz{$ion}{mz} - $hrtmz{$representative}{mz};
            my $sign  = $delta < 0 ? "-" : "+";

            foreach my $hit (matching_differences(abs($delta), $mass_threshold, \@known_diffs)) {
                $hreprescomparison{$group}{$ion}{repres_diff} .= "[M ".$sign."(".$hit->[0].")]|";
            }
        }
    }
}

print "Addition of annotation information relative to the representative ion done\n";


##############################
### Print in result file ! ###
##############################

open(my $out_fh, '>', $output_tabular) or die "Impossible to open $output_tabular\n";
open(my $vm_fh, '<', $combined_DMVM) or die "Impossible to open $combined_DMVM\n";

my $line_nb = 0;
while (my $line = <$vm_fh>) {
    chomp $line;

    if ($line_nb == 0) {
        # Header: the mass-related columns only exist when masses are used.
        my @columns = ("ACorF_groups");
        push @columns, 'isotopes_adducts_fragments_[@id|annotation(delta_annotation)]' if $opt != 1;
        push @columns, "ACorF_filter", $REPRES_COLUMN{$repres_opt};
        push @columns, "annotation_relative_to_representative" if $opt != 1;
        print {$out_fh} join("\t", $line, @columns), "\n";
    }
    else {
        my ($ion) = split(/\t/, $line);
        $ion = "" unless defined $ion;

        my $group = $group_of{$ion};

        if (defined $group && exists $hgrouprepres{$group}{$ion}) {
            my $representative = $hrepres{$group};

            my @cells = ($group);
            push @cells, $hcorrelgroup{$group}{$ion} if $opt != 1;
            push @cells, ($ion eq $representative ? 1 : 0), $representative;

            # The relative annotation column only exists when masses are used,
            # so every row has as many columns as the header.
            if ($opt != 1) {
                my $relative = $hreprescomparison{$group}{$ion}{repres_diff};
                push @cells, defined $relative ? $relative : "-";
            }
            print {$out_fh} join("\t", $line, @cells), "\n";
        }
        else {
            # Ion that belongs to no group: it gets a group of its own.
            $groupct++;
            my @cells = ("group".$groupct, ("-") x ($opt != 1 ? 4 : 2));
            print {$out_fh} join("\t", $line, @cells), "\n";
        }
    }
    $line_nb++;
}

close($vm_fh);
close($out_fh) or die "Impossible to write $output_tabular: $!\n";

print "Print in result file done\n";

print "All steps done\n";


##########################
### Helper subroutines ###
##########################

# Returns the [ label, mass ] pairs of $known_diffs whose mass is within
# +/- $tolerance of $diff (bounds included).
sub matching_differences {
    my ($diff, $tolerance, $known_diffs) = @_;

    my @hits = grep {
        ($diff <= $_->[1] + $tolerance) && ($diff >= $_->[1] - $tolerance)
    } @{$known_diffs};

    return @hits;
}

# Puts $ion_a and $ion_b in the same group.
#   $groups      : hash ref, group name => { ion id => 1 }
#   $group_of    : hash ref, ion id => group name (index of $groups)
#   $next_id_ref : scalar ref, number to use for the next new group
# A new group is created when neither ion is grouped yet. When the two ions
# already sit in different groups, the smaller group is emptied into the
# larger one, so an ion is never held by two groups.
sub link_ions {
    my ($groups, $group_of, $next_id_ref, $ion_a, $ion_b) = @_;

    my $group_a = $group_of->{$ion_a};
    my $group_b = $group_of->{$ion_b};
    my $target;

    if (defined $group_a && defined $group_b && $group_a ne $group_b) {
        my $size_a = scalar(keys %{ $groups->{$group_a} });
        my $size_b = scalar(keys %{ $groups->{$group_b} });
        my $source;
        ($target, $source) = $size_a >= $size_b ? ($group_a, $group_b) : ($group_b, $group_a);

        foreach my $ion (keys %{ $groups->{$source} }) {
            $groups->{$target}{$ion} = 1;
            $group_of->{$ion} = $target;
        }
        delete $groups->{$source};
    }
    elsif (defined $group_a) {
        $target = $group_a;
    }
    elsif (defined $group_b) {
        $target = $group_b;
    }
    else {
        $target = "group" . ${$next_id_ref};
        ${$next_id_ref}++;
    }

    foreach my $ion ($ion_a, $ion_b) {
        $groups->{$target}{$ion} = 1;
        $group_of->{$ion} = $target;
    }
    return;
}

# Returns the ion of $ions (ion id => { field => value }) with the highest
# strictly positive value of $field, or "" when no value is above 0.
# Ions are visited in sorted order so that ties are resolved reproducibly.
sub pick_highest {
    my ($ions, $field) = @_;

    my ($best_ion, $best_value) = ("", 0);
    foreach my $ion (sort keys %{$ions}) {
        if ($ions->{$ion}{$field} > $best_value) {
            ($best_ion, $best_value) = ($ion, $ions->{$ion}{$field});
        }
    }
    return $best_ion;
}

# Returns the representative for the "max_intensity_max_mass" method.
# The most intense ion is the default choice. One of the next two most intense
# ions replaces it when it is heavier than the current choice AND its intensity
# is above both $min_intensity and $min_fraction * (highest intensity).
# $min_intensity is ignored for a group whose most intense ion is itself below
# it; this is decided per group and never changes the caller's threshold.
sub pick_max_mass_among_top_intensity {
    my ($ions, $min_intensity, $min_fraction) = @_;

    my @ranked = sort {
        $ions->{$b}{intensity} <=> $ions->{$a}{intensity} or $a cmp $b
    } keys %{$ions};
    return "" unless @ranked;
    splice(@ranked, 3) if scalar(@ranked) > 3;

    my $leader     = shift @ranked;
    my $intens_max = $ions->{$leader}{intensity};
    my $threshold  = $min_intensity > $intens_max ? 0 : $min_intensity;

    my ($best_ion, $best_mass) = ($leader, $ions->{$leader}{mass});
    foreach my $ion (@ranked) {
        next unless $ions->{$ion}{mass} > $best_mass;
        next unless $ions->{$ion}{intensity} > $threshold;
        next unless $ions->{$ion}{intensity} > $intens_max * $min_fraction;
        ($best_ion, $best_mass) = ($ion, $ions->{$ion}{mass});
    }
    return $best_ion;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/README.md Fri Oct 18 04:59:51 2019 -0400 @@ -0,0 +1,45 @@ +Analytical Correlation Filtration +======= + +Metadata +----------- + + * **@name**: ACorF + * **@version**: 2019-06-20 + * **@authors**: <stephanie.monnerie@inra.fr> + * **@date creation**: 2018/11/17 + * **@main usage**: Reduction of analytical redundancies in Metabolomics data + + +Configuration +----------- + +### Requirement: + * perl + + +### Deploy: + + +### Warnings: + + +Services provided +----------- + + + +Technical description +----------- + + +Notes +----------- + + + + +License (optional) +----------- + +This code is published under CECILL 2.1.
<!--
  Galaxy wrapper for ACF/Analytic_correlation_filtration.pl.
  The script is invoked once; the Cheetah branches below only choose the
  grouping option (-o) and the thresholds that go with it:
    -o 1 : retention time only (no mass information)
    -o 2 : known mass differences and retention time
    -o 3 : known mass differences only
-->
<tool id="Analytic_correlation_filtration" name="Analytic correlation filtration" version="2019-06-20">
  <description>
        : Detect analytic correlation among data and remove them.
  </description>

  <command><![CDATA[
        perl "$__tool_directory__/Analytic_correlation_filtration.pl"
            -f "$file_in"
            -d "$dataMatrix_in"
            -v "$variableMetadata_in"

        #if str($mass_file.mass_choice)=="false":
            ## No mass information: an "ignored" retention time is emulated with a huge threshold.
            -o 1
            #if str($rt_cond.rt_choice)=="false":
                -rt 9999999999
            #else:
                -rt "$rt_cond.rt_threshold"
            #end if
        #else:
            #if str($mass_file.liste.mass_list)=="true":
                -m "$mass_file.liste.mass_file_in"
            #else:
                -m "$__tool_directory__/data/default_list.csv"
            #end if
            -mass "$mass_file.mass_threshold"
            #if str($rt_cond.rt_choice)=="true":
                -o 2
                -rt "$rt_cond.rt_threshold"
            #else:
                -o 3
            #end if
        #end if

        -r "$repres_opt.repres_opt_selector"

        #if str($repres_opt.repres_opt_selector)=="max_intensity_max_mass":
            -IT "$repres_opt.int_threshold"
            -IP "$repres_opt.int_percentage"
        #end if
        -correl "$correl_threshold"
        -output_sif "$sif_out"
        -output_tabular "$variableMetadata_out"

  ]]></command>

  <inputs>
    <param type="data" name="file_in" format="txt" help="The .txt similarity table (you can obtain it by using the Between-table Correlation tool or for example the cor() function in R) " label="Correlation table file" />
    <param type="data" name="dataMatrix_in" format="tabular" help="" label="dataMatrix file" />
    <param type="data" name="variableMetadata_in" format="tabular" help="" label="variableMetadata file" />

    <param help="Define the minimum similarity threshold accepted to determine analytic correlation" label="Correlation threshold" type="float" name="correl_threshold" value="0.90"/>

    <conditional name="mass_file">
      <param name="mass_choice" checked="true" falsevalue="false" help="'YES' if you want to take it into account; 'NO' if you don't want to take into account mass information" label="Do you want to take into account mass differences between 2 ions?" truevalue="true" type="boolean"/>
      <when value="true">
        <conditional name="liste">
          <param name="mass_list" checked="true" falsevalue="false" help="'YES' if you have your own list to upload; 'NO' if you want to use a default list" label="Do you have your own list of mass differences or do you want to use a default list ?" truevalue="true" type="boolean"/>
          <when value="false">

          </when>
          <when value="true">
            <param type="data" name="mass_file_in" format="tabular,csv" help="The file containing all your report and known mass differences (cf help for file example) " label="Mass differences table (format: tabular or csv) " />
          </when>
        </conditional>
        <param help="2 ions need to have a difference mass included in the list at +/- mass difference range to be considered as analytically correlated | Value recommendation : 0.005" label="Mass difference range" type="float" name="mass_threshold" value="0.005"/>
      </when>
      <when value="false">

      </when>
    </conditional>

    <conditional name="rt_cond">
      <param checked="true" falsevalue="false" help="'YES' if want to take into account retention time information; 'NO' if you don't want to take into account retention time information" label="Do you want to take into account retention time differences between 2 ions? " name="rt_choice" truevalue="true" type="boolean"/>
      <when value="true">
        <param help="Choose a retention time difference threshold between 2 ions considered as analytically correlated | Value recommendation : 0.1" label="Retention time difference threshold" type="float" name="rt_threshold" value="0.1"/>
      </when>
      <when value="false">

      </when>
    </conditional>

    <conditional name="repres_opt">
      <param name="repres_opt_selector" label="Which representative ion do you want to select for each group" type="select" display="radio" help="">
        <option value="intensity">Highest intensity</option>
        <option value="mass">Highest mass</option>
        <option value="mixt">Highest (mass2 x intensity) </option>
        <option value="max_intensity_max_mass">Highest mass between the 3 highest intensity (following intensity threshold and rules ==> see help) </option>
      </param>
      <when value="max_intensity_max_mass">
        <param help="" label="Minimum intensity threshold for the representative ion" type="float" name="int_threshold" value="1000"/>
        <param help="Example: ion A has the highest intensity of a group but not the highest mass, B is an ion that has the second highest intensity in the group and a higher mass than A; to choose B as the representative ion of the group, its intensity needs to be at least 50% of the intensity of A." label="Percentage of highest intensity of the group accept for the new representative ion. This option allow to avoid isotope selection. " type="float" name="int_percentage" value="0.5"/>
      </when>
      <when value="intensity">
      </when>
      <when value="mass">
      </when>
      <when value="mixt">
      </when>
    </conditional>

  </inputs>

  <outputs>
    <data format="sif" label="${file_in.name}_sif" name="sif_out"/>
    <data format="tabular" label="${variableMetadata_in.name}_representative_ion" name="variableMetadata_out"/>
  </outputs>

  <help><![CDATA[

.. class:: infomark

**Contact** : **Stephanie Monnerie**, **Estelle Pujos-Guillot**

---------------------------------------------------

.. class:: infomark

**References** :

---------------------------------------------------

-----------
Input files
-----------

+-----------------------------------------+---------------+
| File                                    | Format        |
+=========================================+===============+
| 1) Similarity matrix                    | txt           |
+-----------------------------------------+---------------+
| 2) Data matrix                          | tabular       |
+-----------------------------------------+---------------+
| 3) Variable metadata                    | tabular       |
+-----------------------------------------+---------------+
| **Optional file**                       | **Format**    |
+-----------------------------------------+---------------+
| 4) Optional : Mass differences list     | csv/tabular   |
+-----------------------------------------+---------------+

---------------------------------------------------

-------------
Files content
-------------

Similarity matrix
    * File organisation : one line per similarity pair with the first ion ID, the similarity value and the second ion ID, tabular separated ==> First_Ion_ID \\t Similarity_Value \\t Second_Ion_ID
    * Example:

.. image:: similarity_matrix.JPG
    :width: 800

Data matrix file
    * "variable x sample" **dataMatrix** : tabular separated file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the variable metadata (see below)

Variable metadata file
    * "variable x metadata" **variableMetadata** tabular separated file of the numeric and/or character variable metadata, with . as decimal and NA for missing values

.. class:: warningmark

For more information about input files, refer to the corresponding "W4M HowTo" page:
http://workflow4metabolomics.org/sites/workflow4metabolomics.org/files/files/w4m_TableFormatForGalaxy_150908.pdf


Mass differences list
    * A file containing list of known adducts, fragments or isotopes with the mass differences linked to them
    * Example:

.. image:: Adduct_fragment_list.JPG
    :width: 350

---------------------------------------------------

----------
Parameters
----------

Take into account mass differences between 2 ions :
    * You can enter a list of mass differences that are known. The file must be organized with a first column for the mass difference type (isotope, fragment, etc...), a second column with the mass difference chemical formula (H+, -2H+K, etc...) and a third column for the mass difference value
    * If you are choosing to use a mass differences table, you have to choose a mass difference range that will be a threshold to accept or not a difference value as true (recognize a mass difference value in the file +/- this threshold).

Take into account retention time :
    * You can use retention time as a criterion to group ions. You have to choose a value that will be used as an interval : 2 ions are grouped when their retention times are equal +/- the threshold.

Choose the representative ion for each group, there are 4 possibilities to determine the representative ion :
    * The ion with the highest intensity (recommended for LC/MS)
    * The ion with the highest mass
    * The ion with the highest "mass2 * intensity" value
    * The ion with the highest mass between the 3 highest intensity of the group, except if the highest mass ion have an intensity < determined percentage of the highest intensity ion one (for example 50%) (recommended for GC/MS)


---------------------------------------------------

--------------
Example of use
--------------

For UPLC/HRMS data, default parameters can be the following:
    * If a Pearson correlation is used, the default threshold can be set at 0.90
    * A delta RT of 0.1 min or adjusted depending on chromatographic systems
    * The use of the list of known adduct/isotope mass differences with a mass delta of 0.005 Da or adjusted depending on MS resolution
    * The choice of the ion with the highest intensity as the representative ion.

For GC/HRMS dataset, we recommend to use the same parameters but ignoring the list of mass difference and to choose the ion with the highest mass among the top highest intensity as representative.



  ]]></help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/data/default_list.csv Fri Oct 18 04:59:51 2019 -0400 @@ -0,0 +1,225 @@ +adduit -2H+Na+K 59.9378259 +adduit H 1.007825032 +adduit -H+K 37.95588165 +adduit -H+Na 21.98194425 +adduit -3H+3Na 65.94583274 +adduit -4H+4K 151.8235266 +adduit -4H+4Na 87.92777699 +adduit -3H+3K 113.8676449 +adduit -2H+2K 75.9117633 +adduit -2H+2Na 43.9638885 +adduit 2H 2.015650064 +adduit Cl 34.96885268 +adduit -2H+Ca 37.94694092 +isotope 13C db 0.501677419 +isotope 13C 1.003354838 +isotope 15N 0.997034893 +isotope 18O 2.00424638 +isotope 34S 1.9957959 +isotope 41K 1.99811908 +isotope 37Cl 1.99704991 +isotope 13C2 2.006709676 +isotope 13C3 3.010064513 +isotope 13C+37Cl 3.000404748 +isotope 13C+18O 3.007601218 +isotope 13C+34S 2.999150738 +isotope 44Ca 3.99289082 +adduit CH3OH 32.02621475 +adduit CH3CN 41.0265491 +adduit H2O 18.01056468 +adduit 2(H2O) 36.02112937 +adduit NaCl 57.95862196 +adduit HCOOH 46.0054793 +adduit +(HCOOH)+(HCOOK) 129.9668403 +adduit +(HCOOH)+(HCOONa) 113.9929029 +adduit +(HCOOH)+2(HCOONa) 181.9803264 +adduit HCOOK 83.96136095 +adduit +(HCOOK)+(HCOONa) 151.9487845 +adduit HCOONa 67.98742355 +adduit 2(HCOOH) 92.01095861 +adduit +2(HCOOH)+(HCOOK) 175.9723196 +adduit +2(HCOOH)+(HCOONa) 159.9983822 +adduit 2(HCOOK) 167.9227219 +adduit 2(HCOONa) 135.9748471 +fragment C11H18O9 294.0950822 +fragment C12H16O12 352.064176 +fragment C12H20O9 308.1107322 +fragment C2H2O 42.01056468 +fragment C2H3. 27.0229265 +fragment C2H3N 41.0265491 +fragment C2H3NO3 89.01129296 +fragment C2H3O. 43.01784112 +fragment C2H4 28.03130013 +fragment C2H4N. 42.03382553 +fragment C2H4O 44.02621475 +fragment C2H5. 29.03857656 +fragment C2H5N 43.04219916 +fragment C2H5NO2 75.0320284 +fragment C2H5O. 45.03349118 +fragment C2H5O6P 155.9823745 +fragment C2H6 30.04695019 +fragment C2H7N 45.05784922 +fragment C2HNO2 71.00072827 +fragment C3H4O3 88.01604399 +fragment C3H5. 
41.03857656 +fragment C3H5NO2 87.0320284 +fragment -(C3H5O2NS)-(NH3) 136.0306485 +fragment C3H5O2NS 119.0040994 +fragment C3H6 42.04695019 +fragment C3H6O3 90.03169405 +fragment C3H7. 43.05422662 +fragment C3H7O2N 89.04767846 +fragment C3H7O2NS 121.0197495 +fragment C3H7O6P 169.9980246 +fragment C4H6 54.04695019 +fragment C4H6O2 86.03677943 +fragment C4H6O4 118.0266087 +fragment C4H7. 55.05422662 +fragment C4H8O3 104.0473441 +fragment C4H9 57.07042529 +fragment C5H7O3N 129.0425931 +fragment C5H8O3NS 162.0224891 +fragment C5H8O4 132.0422587 +fragment C6H10O4 146.0579088 +fragment -(C6H10O5)-(H2O) 180.0633881 +fragment C6H10O5 162.0528234 +fragment C6H10O7 194.0426527 +fragment C6H8O6 176.032088 +fragment CH2O 30.01056468 +fragment -(CH2S)-(HCOOH) 91.99320037 +fragment -(CH2S)-(NH3) 63.01427016 +fragment CH2S 45.98772106 +fragment CH3. 15.0229265 +fragment CH3COO. 59.01275574 +fragment CH3COOH 60.02112937 +fragment CH3N 29.0265491 +fragment CH3O. 31.01784112 +fragment CH3OH 32.02621475 +fragment CH4 16.03130013 +fragment CH4N. 30.03382553 +fragment -(CH4S)-(HCOOH) 94.00885043 +fragment -(CH4S)-(NH3) 65.02992022 +fragment CH4S 48.00337113 +fragment CH5N 31.04219916 +fragment Cl. 34.96830408 +fragment CO 27.99491462 +fragment -(CO2)-(CO) 71.98474386 +fragment CO2 43.98982924 +fragment -(H2)-(NH3) 19.04219916 +fragment H2 2.015650064 +fragment -(H2O)-(CO2) 62.00039392 +fragment -(H2O)-(HCOOH) 64.01604399 +fragment -(H2O)-(NH3) 35.03711378 +fragment H2O 18.01056468 +fragment -(H2O)-2(CO2) 105.9902232 +fragment -(H2S)-(H2O) 51.99828575 +fragment H2S 33.98772106 +fragment H2SO4 97.96737954 +fragment H3PO4 97.97689521 +fragment HCl 35.97667771 +fragment HCN 27.01089903 +fragment -(HCOOH)-(HCN) 73.01637834 +fragment HCOOH 46.0054793 +fragment HS. 32.97934743 +fragment -(NC3H9)-(CH3COOH) 119.0946287 +fragment -(NC3H9)-(H2O) 77.08406397 +fragment -(NC3H9)-(HCOOH) 105.0789786 +fragment NC3H9 59.07349929 +fragment NaCl 57.95862196 +fragment NH2CO. 
44.01309008 +fragment -(NH3)-(CO2)-(H2O) 79.02694302 +fragment -(NH3)-(CO2) 61.01637834 +fragment -(NH3)-(CONH) 60.03236275 +fragment -(NH3)-(HCOOH) 63.0320284 +fragment NH3 17.0265491 +fragment NH3CO 45.02146372 +fragment NHCO 43.00581365 +fragment OH. 17.00219105 +fragment PO3 78.95850549 +fragment SO2 63.96190024 +fragment SO3 79.95681486 +fragment -2(H2O)-(CO2) 80.01095861 +fragment -2(H2O)-(HCOOH)-(NH3) 99.05315777 +fragment -2(H2O)-(HCOOH) 82.02660867 +fragment 2(H2O) 36.02112937 +fragment 2(HCOOH) 92.01095861 +fragment -2(NH3)-(CO)-(CO2) 106.0378421 +fragment -2(NH3)-(CO) 62.04801281 +fragment 2(NH3) 34.05309819 +fragment 3(H2O) 54.03169405 +fragment 3(NH3) 51.07964729 +fragment 4(H2O) 72.04225874 +fragment C10H11O3N5 249.0861892 +fragment C10H13O4N5 267.0967539 +fragment C10H14O7N5P 347.0630844 +fragment C10H15O5N5 285.1073186 +fragment C2H3NO2 73.01637834 +fragment C2H4O2 60.02112937 +fragment C2H5NO3 91.02694302 +fragment C2H6O2 62.03677943 +fragment C2H6O3 78.03169405 +fragment -(C2H6O3)-(H2O) 96.04225874 +fragment C2H6O4 94.02660867 +fragment C2H7NO2 77.04767846 +fragment C3H10O5 126.0528234 +fragment -(C3H6O3)-(CHNO) 133.0375077 +fragment C3H6O4 106.0266087 +fragment C3H8O3 92.04734412 +fragment C3H8O4 108.0422587 +fragment C4H10O5 138.0528234 +fragment C4H5NO3 115.026943 +fragment C4H8O4 120.0422587 +fragment C5H10O4 134.0579088 +fragment C5H13O4N 151.0844579 +fragment C6H11O4N 161.0688078 +fragment C6H11O5N 177.0637225 +fragment C6H13O5N 179.0793725 +fragment C5H10O5 150.0528234 +fragment C5H10O6 166.047738 +fragment C5H12O2 104.0837296 +fragment -(C5H12O2)-(H2O) 122.0942943 +fragment C5H5N5 135.0544952 +fragment C5H5ON5 151.0494098 +fragment C5H6O2 98.03677943 +fragment C5H7O2N5 169.0599745 +fragment -(C5H7O3N)-(CO2) 173.0324223 +fragment -(C5H7O3N)-(H2O) 147.0531578 +fragment C5H8N3 110.0718223 +fragment C5H8O3 116.0473441 +fragment C5H8O5N5P 249.026305 +fragment C5H9O3 117.0551691 +fragment C5H9O6P 196.0136746 +fragment C5H9O7P 212.0085893 
+fragment C6H10O3 130.0629942 +fragment -(C6H10O3)-(H2O) 148.0735589 +fragment C6H11O4N3PS 252.0207885 +fragment C6H11O4NPS 224.0146405 +fragment C6H12O5 164.0684735 +fragment C6H14O6 182.0790382 +fragment C6H14O7 198.0739528 +fragment C6H16O7 200.0896029 +fragment C6H16O8 216.0845175 +fragment C6H8N3 122.0718223 +fragment C6H8NS 126.0377453 +fragment C7H5ON5 175.0494098 +fragment C7H6ON6 190.0603088 +fragment C7H7O2N5 193.0599745 +fragment C7H11O6N 205.0586371 +fragment C8H14O7 222.0739528 +fragment C8H5O3N5 219.039239 +fragment C8H7O4N5 237.0498037 +fragment C9H10O4N2 210.0640568 +fragment C9H11O3N3 209.0800412 +fragment C9H11O4N3 225.0749558 +fragment C9H12O5N2 228.0746215 +fragment C9H12O6N3P 289.0463717 +fragment C9H13O4N3 227.0906059 +fragment C9H14O7N3P 307.0569364 +fragment C9H16O8 252.0845175 +fragment CH2N2 42.02179806 +fragment -(CH2O)-(H2O) 48.02112937 +fragment CH5NO 47.03711378 +fragment -(H3PO4)-(CHNO) 140.9827089 +fragment -(H3PO4)-(H2O) 115.9874599 +fragment -(H3PO4)-(NH3) 115.0034443 +fragment HPO3 79.96633052
#!/usr/bin/perl
# ACF/lib/IonFiltration.pm
package IonFiltration;

### Perl modules
use strict;
use warnings;


########################################################################
### Creation of a hash containing all possible adducts and fragments ###
########################################################################

# MassCollecting($mass_file)
#
# Reads a list of known mass differences. Each line holds three fields
# separated by a tab or a semicolon: type, name, mass.
#
# Parameters:
#   $mass_file - path of the list to read. The calling script does not
#                enforce its '-m' option, so an undefined or empty path
#                is accepted and simply yields an empty result.
#
# Returns:
#   A hash (as a flat list) such that $hmass{name}{mass} = type.
#   When the same name/mass pair appears several times, the last type
#   read is the one kept.
#
# Side effects:
#   Prints one message on the selected output handle every time a mass
#   that was already read shows up again.
#
# Dies:
#   When a path is given but the file cannot be opened.
sub MassCollecting {
    my ($mass_file) = @_;
    my %hmass;

    # No list supplied: nothing to read.
    return %hmass unless defined $mass_file && length $mass_file;

    open(my $mass_fh, '<', $mass_file)
        or die "Impossible to open $mass_file\n";

    # Masses already met. The result hash is keyed by name first, so it
    # cannot be used to spot a repeated mass; a dedicated index is needed.
    my %seen_mass;

    while (my $line = <$mass_fh>) {
        chomp $line;
        my @fields = split(/[\t;]/, $line);

        # Blank or truncated lines carry no usable mass difference.
        next unless defined $fields[2];

        my ($type, $name, $mass) = @fields[0 .. 2];

        if ($seen_mass{$mass}++) {
            print "The mass difference already exists : $mass !\n";
        }
        $hmass{$name}{$mass} = $type;
    }

    close $mass_fh;
    return %hmass;
}


########################################################
### Creation of a sif table + correlation filtration ###
########################################################

# sifTableCreation($file, $output_sif, ..., $correl_threshold, ...,
#                  $combined_DMVM, ...)
#
# Turns a tab-separated similarity matrix into a sif table keeping only
# the pairs whose coefficient is strictly above the correlation
# threshold, and collects the m/z and retention time of every row of the
# variable metadata file.
#
# Positional parameters (only the ones marked * are read here):
#   0 * $file              similarity matrix; first line = column
#                          identifiers, first field of each following
#                          line = row identifier
#   1 * $output_sif        path of the sif table to write
#   2   $opt
#   3   $rt_threshold
#   4   $mass_threshold
#   5 * $correl_threshold  a pair is written when coef > threshold
#   6   $dataMatrix
#   7   $output_tabular
#   8 * $combined_DMVM     tab-separated variable metadata with a header
#                          line; the first column is used as identifier
#   9   $repres_opt
#   10  $intensity_threshold
#   11  $intensity_pourc
#   12  $refhmass
#
# Returns:
#   ($output_sif, %hrtmz) where $hrtmz{id}{mz} and $hrtmz{id}{rt} hold
#   the values of the 'mzmed' (else 'mz') and 'rtmed' (else 'rt')
#   columns. A value is undef when neither column exists.
#
# Dies:
#   When one of the three files cannot be opened, or when the sif table
#   cannot be completely written.
sub sifTableCreation {
    my $file             = $_[0];
    my $output_sif       = $_[1];
    my $correl_threshold = $_[5];
    my $combined_DMVM    = $_[8];

    ######################################
    ### Reading of m/z and RT per ion  ###
    ######################################

    die "Impossible to open the variable metadata file : no path given\n"
        unless defined $combined_DMVM;
    open(my $meta_fh, '<', $combined_DMVM)
        or die "Impossible to open $combined_DMVM\n";

    my %hrtmz;
    my ($mz_col, $rt_col);
    my $meta_header_read = 0;

    while (my $line = <$meta_fh>) {
        chomp $line;
        my @fields = split(/\t/, $line);

        if (!$meta_header_read) {
            # Header line: remember where each column name stands, then
            # pick the m/z and RT columns once for the whole file.
            my %col_of;
            for my $i (0 .. $#fields) {
                $col_of{ $fields[$i] } = $i;
            }
            $mz_col = defined $col_of{mzmed} ? $col_of{mzmed} : $col_of{mz};
            $rt_col = defined $col_of{rtmed} ? $col_of{rtmed} : $col_of{rt};
            $meta_header_read = 1;
            next;
        }

        # A blank line describes no ion.
        next unless @fields;

        my $ion = $fields[0];
        # Without a matching column the value stays undefined instead of
        # silently borrowing the content of the identifier column.
        $hrtmz{$ion}{mz} = defined $mz_col ? $fields[$mz_col] : undef;
        $hrtmz{$ion}{rt} = defined $rt_col ? $fields[$rt_col] : undef;
    }
    close $meta_fh;

    ###############################
    ### Creation of a sif table ###
    ###############################

    open(my $matrix_fh, '<', $file)
        or die "Impossible to open $file\n";
    open(my $sif_fh, '>', $output_sif)
        or die "Impossible to open $output_sif\n";

    my @column_ids;            # identifiers read on the matrix header line
    my %hduplicate;            # "row/column" pairs already handled
    my $matrix_header_read = 0;

    while (my $line = <$matrix_fh>) {
        chomp $line;
        my @fields = split(/\t/, $line);

        if (!$matrix_header_read) {
            @column_ids         = @fields;
            $matrix_header_read = 1;
            next;
        }

        my $row_id = $fields[0];

        for my $i (1 .. $#fields) {
            my $col_id = $column_ids[$i];
            my $coef   = $fields[$i];

            # Correlation of an ion with itself.
            next if $row_id eq $col_id;

            #########################
            ### Remove duplicates ###
            #########################

            # The matrix holds every pair twice (A/B and B/A); only the
            # first occurrence met is considered.
            my $pair    = $row_id . "/" . $col_id;
            my $reverse = $col_id . "/" . $row_id;
            next if defined $hduplicate{$pair} || defined $hduplicate{$reverse};
            $hduplicate{$pair} = 1;

            # The signed coefficient is compared: negative correlations
            # are not considered.
            if ($coef > $correl_threshold) {
                print {$sif_fh} "$row_id\t$coef\t$col_id\n";
            }
        }
    }

    close $matrix_fh;
    # Buffered write errors only show up when the handle is closed.
    close($sif_fh)
        or die "Impossible to write $output_sif : $!\n";

    return ($output_sif, %hrtmz);
}


1;