diff ACorF/Analytic_correlation_filtration.pl @ 1:26aa3a8f95ce draft

Uploaded
author melpetera
date Wed, 30 Oct 2019 06:54:20 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ACorF/Analytic_correlation_filtration.pl	Wed Oct 30 06:54:20 2019 -0400
@@ -0,0 +1,651 @@
+#!usr/bin/perl
+
+### Perl modules
+use warnings;
+use strict;
+use Getopt::Long qw(GetOptions); #Creation of script options
+use Pod::Usage qw(pod2usage); #Creation of script options
+
+#Personnal packages
+use FindBin ; ## Allows you to locate the directory of original perl script
+#use lib $FindBin::Bin;
+use lib "$FindBin::Bin/lib";
+use IonFiltration;
+
+my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete
+
+########################
+### Options and help ###
+########################
+
+GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2);
+
+### Check required parameters :
+pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file;
+#pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file;
+pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation.
+#It should be a number (1 ; 2 or 3) :
+#	1 : Don't take into acount mass information (only RT) ;
+#	2 : Check that all mass differences are include in a specific list and taking into acount RT information
+#	3 : Check that all mass differences are include in a specific list, ignoring RT information
+#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt;
+pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation.
+It should be one of the 3 options below :
+	"mass" : choose the ion with the highest mass as the representant
+	"intensity" : choose the ion with the highest intensity as the representant
+	"mixt" : choose the ion with the highest (mass^2 * intensity) as the representant
+	"max_intensity_max_mass" : choose tha ion witht he highest intenisty among the 5 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt;
+pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix;
+pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM;
+#pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold;
+#pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold;
+pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold;
+pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular;
+pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif;
+
+
+#if(($opt != 1) && ($opt != 2) && ($opt != 3)){
+#	print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n";
+#	exit;
+#}
+
+	
+
+if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){
+	print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n";
+	exit;
+}
+
+
+
+#########################################################################
+#### Création of a hash containing all adduits and fragments possible ###
+#########################################################################
+
+my %hmass;
+if($opt != 1){
+	%hmass = IonFiltration::MassCollecting($mass_file);
+	
+}
+
+my $refhmass = \%hmass;
+
+print "Création of a hash containing all adduits and fragments possible\n";
+
+	
+########################################################
+### Creation of a sif table + correlation filtration ###
+########################################################
+
+my %hrtmz;
+($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass);
+print "Creation of a sif table + correlation filtration done\n";
+
+
+######################################################
+### Analytic correlation filtrering follow options ###
+######################################################
+
+my %hheader_file;
+my %hduplicate;
+	
+my %hcorrelgroup;
+my $groupct=1;
+
+my $linenb3=0;
+my %hheader_line;
+
+
+
+open (F1, $output_sif) or die "Impossible to open $output_sif\n";
+	
+while(my $line = <F1>){
+	my $count=0;
+	chomp $line;
+	my @tline = split(/\t/, $line);
+	my $a = $tline[0];
+	my $b = $tline[2];
+							
+	my $amass=$hrtmz{$a}{mz};
+	my $atemp=$hrtmz{$a}{rt};
+	my $bmass= $hrtmz{$b}{mz};
+	my $btemp=$hrtmz{$b}{rt};
+	my $diff = $amass-$bmass;
+	$diff = abs($diff);
+								
+	### Option 1: Don't take into acount mass information ###
+					
+	if($opt == 1){
+		my $btplus = $btemp + $rt_threshold;
+		my $btmoins = $btemp - $rt_threshold;
+		if(($btmoins <= $atemp) && ($atemp <= $btplus)){
+			foreach my $k (keys %hcorrelgroup){
+				if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
+					$hcorrelgroup{$k}{$a}=1;
+					$hcorrelgroup{$k}{$b}=1;
+					$count++;
+					last;
+				}
+			}
+			if($count == 0){
+				my $groupnb="group".$groupct;
+				$hcorrelgroup{$groupnb}{$a}=1;
+				$hcorrelgroup{$groupnb}{$b}=1;
+				$groupct ++;
+			}
+		}
+	}
+
+									
+								
+	### Option 2: Check that all mass differences are include in a specific list taking into account RT information ###
+									
+	elsif($opt == 2){
+										
+		my $print = 0;
+		foreach my $s (keys %{$refhmass}){
+			foreach my $r (keys %{$refhmass->{$s}}){
+				my $rm = $r - $mass_threshold;
+				my $rp = $r + $mass_threshold;
+				if(($diff <= $rp) && ($diff >= $rm)){
+					if($print == 0){
+						my $btplus = $btemp + $rt_threshold;
+						my $btmoins = $btemp - $rt_threshold;
+														
+						if(($btmoins <= $atemp) && ($atemp <= $btplus)){
+							foreach my $k (keys %hcorrelgroup){
+								if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
+									$hcorrelgroup{$k}{$a}=1;
+									$hcorrelgroup{$k}{$b}=1;
+									$count++;
+									last;
+								}
+							}
+							if($count == 0){
+								my $groupnb="group".$groupct;
+								$hcorrelgroup{$groupnb}{$a}=1;
+								$hcorrelgroup{$groupnb}{$b}=1;
+								$groupct ++;
+							}
+							$print = 1;
+						}
+					}
+				}
+			}
+		}
+	}
+									
+								
+	### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ###
+					
+	elsif($opt == 3){
+										
+		my $print = 0;
+		foreach my $s (keys %{$refhmass}){
+			foreach my $r (keys %{$refhmass->{$s}}){
+				my $rm = $r - $mass_threshold;
+				my $rp = $r + $mass_threshold;
+				if(($diff <= $rp) && ($diff >= $rm)){
+					if($print == 0){
+							
+						foreach my $k (keys %hcorrelgroup){
+							if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
+								$hcorrelgroup{$k}{$a}=1;
+								$hcorrelgroup{$k}{$b}=1;
+								$count++;
+								last;
+							}
+						}
+						if($count == 0){
+							my $groupnb="group".$groupct;
+							$hcorrelgroup{$groupnb}{$a}=1;
+							$hcorrelgroup{$groupnb}{$b}=1;
+							$groupct ++;
+						}
+						$print = 1;
+					}
+				}
+			}
+		}
+	}
+}
+close F1;
+
+print "Analytic correlation filtrering follow options done\n";
+	
+	
+#############################################
+### Join groups that have been subdivided ###
+#############################################
+
+my @tdelete;
+
+foreach my $k (keys %hcorrelgroup){
+	foreach my $i (keys %{$hcorrelgroup{$k}}){
+		foreach my $v (keys %hcorrelgroup){
+			my $count = 0;
+			if ($v ne $k){
+				foreach my $w (keys %{$hcorrelgroup{$v}}){
+					if($w eq $i){
+						$count = 1;
+						push(@tdelete, $v);
+					}	
+				}
+			}
+			if($count == 1){
+				foreach my $w (keys %{$hcorrelgroup{$v}}){
+					$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
+				}
+				delete($hcorrelgroup{$v});
+			}
+		}
+	}
+}
+	
+foreach my $t (@tdelete){
+	delete($hcorrelgroup{$t});
+}
+
+
+### Do it twice to see if it fix the problem of unmerge groups
+
+foreach my $k (keys %hcorrelgroup){
+	foreach my $i (keys %{$hcorrelgroup{$k}}){
+		foreach my $v (keys %hcorrelgroup){
+			my $count = 0;
+			if ($v ne $k){
+				foreach my $w (keys %{$hcorrelgroup{$v}}){
+					if($w eq $i){
+						$count = 1;
+						push(@tdelete, $v);
+					}	
+				}
+			}
+			if($count == 1){
+				foreach my $w (keys %{$hcorrelgroup{$v}}){
+					$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
+				}
+				delete($hcorrelgroup{$v});
+			}
+		}
+	}
+}
+	
+foreach my $t (@tdelete){
+	delete($hcorrelgroup{$t});
+}
+	
+print "Join groups that have been subdivided done\n";
+	
+#######################################################
+### Addition of annotation information among groups ###
+#######################################################
+
+foreach my $k (keys %hcorrelgroup){
+	foreach my $i (keys %{$hcorrelgroup{$k}}){
+		foreach my $j (keys %{$hcorrelgroup{$k}}){
+			my $count = 0;
+			if ($i ne $j){
+				
+				my $a = $hrtmz{$i}{mz};
+				my $b = $hrtmz{$j}{mz};
+				
+				my $diff = $a - $b;
+				my $sign;
+				if($diff>0){
+					$sign="+";
+				}
+				if($diff<0){
+					$sign="-";
+				}
+				$diff = abs($diff);
+				
+				foreach my $z (keys %{$refhmass}){
+					
+					foreach my $y (keys %{$refhmass->{$z}}){
+						my $ym = $y - $mass_threshold;
+						my $yp = $y + $mass_threshold;
+						
+						
+						if(($diff <= $yp) && ($diff >= $ym)){
+							my $diff_list = $diff - $y;
+							$diff_list = abs($diff_list);
+							$diff_list = sprintf ("%0.6f", $diff_list);
+							
+							if($hcorrelgroup{$k}{$i} eq 1){
+								my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
+								$hcorrelgroup{$k}{$i}=$val;
+								$count ++;
+							}
+							else{
+								if($count == 0){
+									my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
+									$hcorrelgroup{$k}{$i}.=$val;
+									$count ++;
+								}
+								else{
+									my $val = $sign."(".$z.")(".$diff_list.")|";
+									$hcorrelgroup{$k}{$i}.=$val;
+									$count ++;
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
+	
+
+print "Addition of annotation information among groups done\n";
+	
+	
+####################################################
+### Choose the representative ion for each group ###
+####################################################
+
+my %hgrouprepres;
+	
+open(F3, $dataMatrix);
+
+while (my $line = <F3>){
+	chomp $line;
+	
+	my @tline = split (/\t/, $line);
+	
+	foreach my $k (keys %hcorrelgroup){
+		foreach my $i (keys %{$hcorrelgroup{$k}}){
+			if($tline[0] eq $i){
+				$hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz};
+				my $intensity;
+				my $nbsubjects=0;
+				for(my $y=1;$y<scalar(@tline);$y++){
+					$intensity += $tline[$y];
+					$nbsubjects ++;
+				}
+				my $meanintensity = $intensity/$nbsubjects;
+				$hgrouprepres{$k}{$i}{intensity}=$meanintensity;
+				$hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity});
+			}
+		}
+	}
+}
+close F3;
+	
+foreach my $z (keys %hgrouprepres){
+	my $max_intensity =  0;
+	my $max_int_ion = "";
+	my $max_mass = 0;
+	my $max_mass_ion = "";
+	my $max_squared = 0;
+	my $max_squared_ion = "";
+	foreach my $w (keys %{$hgrouprepres{$z}}){
+		if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){
+			$max_intensity = $hgrouprepres{$z}{$w}{intensity};
+			$max_int_ion = $w;
+		}
+		if($hgrouprepres{$z}{$w}{mass} > $max_mass){
+			$max_mass = $hgrouprepres{$z}{$w}{mass};
+			$max_mass_ion = $w;
+		}
+		if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){
+			$max_squared = $hgrouprepres{$z}{$w}{squaredmassint};
+			$max_squared_ion = $w;
+		}
+	}
+	
+	my $max_int_max_mass_ion="";
+	
+	if($repres_opt eq "max_intensity_max_mass"){
+		my %hfirst;
+		my $first=0;
+		foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){
+			$first ++;
+			if ($first <= 3){
+				$hfirst{$w} = $hgrouprepres{$z}{$w}{intensity};
+			}
+		}
+		
+		my $first_2 = 0;
+		my $intens_max = 0;
+		my $mass_max = 0;
+		
+		foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){
+			
+			$first_2 ++;
+			if($first_2 == 1){
+				$intens_max = $hfirst{$y};
+				if($intensity_threshold > $intens_max){
+					$intensity_threshold = 0;
+				}
+				$max_int_max_mass_ion = $y;
+				$mass_max = $hgrouprepres{$z}{$y}{mass};
+			}
+			if($hgrouprepres{$z}{$y}{mass} > $mass_max){
+				if($hfirst{$y}>$intensity_threshold){
+					my $a = $intens_max * $intensity_pourc;
+					if($hfirst{$y} > $a){
+						$max_int_max_mass_ion = $y;
+						$mass_max = $hgrouprepres{$z}{$y}{mass};
+					}
+				}
+			}
+		}
+	}
+	
+	$hgrouprepres{$z}{max_int}=$max_int_ion;
+	$hgrouprepres{$z}{max_mass}=$max_mass_ion;
+	$hgrouprepres{$z}{max_squared}=$max_squared_ion;
+	$hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion;
+	
+}
+
+
+print "Choose the representative ion for each group done\n";
+
+#############################################################################
+### Addition of annotation information relative to the representative ion ###
+#############################################################################
+	
+my %hreprescomparison;
+
+my $representative="";
+
+if($opt != 1){
+	foreach my $k (keys %hcorrelgroup){
+		foreach my $i (keys %{$hcorrelgroup{$k}}){
+												
+			if($repres_opt eq "mass"){$representative = $hgrouprepres{$k}{max_mass}}
+			if($repres_opt eq "intensity"){$representative = $hgrouprepres{$k}{max_int}}
+			if($repres_opt eq "mixt"){$representative = $hgrouprepres{$k}{max_squared}}
+			if($repres_opt eq "max_intensity_max_mass"){$representative = $hgrouprepres{$k}{max_int_max_mass}}
+			
+			
+			my $count = 0;
+			if ($i ne $representative){
+					
+				my $a = $hrtmz{$i}{mz};
+				my $b = $hrtmz{$representative}{mz};
+				
+				my $diff = $a - $b;
+				my $sign;
+				if($diff>0){
+					$sign="+";
+				}
+				if($diff<0){
+					$sign="-";
+				}
+				$diff = abs($diff);
+				
+				foreach my $z (keys %{$refhmass}){
+					
+					foreach my $y (keys %{$refhmass->{$z}}){
+						my $ym = $y - $mass_threshold;
+						my $yp = $y + $mass_threshold;
+						
+						if(($diff <= $yp) && ($diff >= $ym)){
+							my $diff_list = $diff - $y;
+							$diff_list = abs($diff_list);
+							$diff_list = sprintf ("%0.4f", $diff_list);
+							if($hcorrelgroup{$k}{$i} eq 1){
+								my $valrep = "[M ".$sign."(".$z.")]|";
+								$hreprescomparison{$k}{$i}{repres_diff}=$valrep;
+								$count ++;
+							}
+							else{
+								if($count == 0){
+									my $valrep = "[M ".$sign."(".$z.")]|";
+									$hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
+									$count ++;
+								}
+								else{
+									my $valrep = "[M ".$sign."(".$z.")]|";
+									$hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
+									$count ++;
+								}
+							}
+						}
+					}
+				}
+			}
+			else{
+				$hreprescomparison{$k}{$i}{repres_diff}="M";
+			}
+		}
+	}
+}
+	
+
+print "Addition of annotation information relative to the representative ion done\n";
+	
+##############################
+### Print in result file ! ###
+##############################
+
+open(F4, ">$output_tabular");
+open(F5, $combined_DMVM);
+
+my $line_nb = 0;
+my %hheader;
+while (my $line = <F5>){
+	chomp $line;
+	
+	
+	my @tline = split (/\t/, $line);
+	
+	if($line_nb == 0){
+		print F4 "$line\tACorF_groups";
+		if($opt == 1){
+			if($repres_opt eq "intensity"){print F4 "\tACorF_filter\tintensity_repres\n"}
+			if($repres_opt eq "mass"){print F4 "\tACorF_filter\tmass_repres\n"}
+			if($repres_opt eq "mixt"){print F4 "\tACorF_filter\tmass2intens_repres\n"}
+			if($repres_opt eq "max_intensity_max_mass"){print F4 "\tACorF_filter\tmax_intensity_max_mass_repres\n"}
+			}
+		else{
+			if($repres_opt eq "intensity"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tintensity_repres\tannotation_relative_to_representative\n"}
+			if($repres_opt eq "mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass_repres\tannotation_relative_to_representative\n"}
+			if($repres_opt eq "mixt"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass2intens_repres\tannotation_relative_to_representative\n"}
+			if($repres_opt eq "max_intensity_max_mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmax_intensity_max_mass_repres\tannotation_relative_to_representative\n"}
+		}
+		
+		
+		### Creation of a header hash
+		for(my $i=0; $i<scalar(@tline);$i++){
+			my $a = $tline[$i];
+			$hheader{$a}=$i;
+		}
+	}
+	
+	else{
+		my $find = 0;
+		foreach my $v (keys %hcorrelgroup){
+			if(defined($hgrouprepres{$v}{$tline[0]})){
+				print F4 "$line\t$v";
+					
+				if($opt != 1){
+					if(defined($hcorrelgroup{$v}{$tline[0]})){
+						print F4 "\t$hcorrelgroup{$v}{$tline[0]}\t";
+							
+					}
+					else{
+						print F4 "\t";
+					}
+				}
+				else{
+					print F4 "\t";
+				}
+					
+				if($repres_opt eq "intensity"){
+					if($tline[0] eq $hgrouprepres{$v}{max_int}){
+						print F4 "1\t";
+					}
+					else{
+						print F4 "0\t";
+					}
+					$find = 1;
+				}
+				if($repres_opt eq "mass"){
+					if($tline[0] eq $hgrouprepres{$v}{max_mass}){
+						print F4 "1\t";
+					}
+					else{
+						print F4 "0\t";
+					}
+					$find = 1;
+				}
+				if($repres_opt eq "mixt"){
+					if($tline[0] eq $hgrouprepres{$v}{max_squared}){
+						print F4 "1\t";
+					}
+					else{
+						print F4 "0\t";
+					}
+					$find = 1;
+				}
+				if($repres_opt eq "max_intensity_max_mass"){
+					if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){
+						print F4 "1\t";
+					}
+					else{
+						print F4 "0\t";
+					}
+					$find = 1;
+				}
+					
+				if($repres_opt eq "intensity"){print F4 "$hgrouprepres{$v}{max_int}\t"}
+				if($repres_opt eq "mass"){print F4 "$hgrouprepres{$v}{max_mass}\t"}
+				if($repres_opt eq "mixt"){print F4 "$hgrouprepres{$v}{max_squared}\t"}
+				if($repres_opt eq "max_intensity_max_mass"){print F4 "$hgrouprepres{$v}{max_int_max_mass}\t"}
+				
+				if($opt != 1){
+					if(defined($hreprescomparison{$v}{$tline[0]}{repres_diff})){
+						print F4 "$hreprescomparison{$v}{$tline[0]}{repres_diff}\n";
+					}
+					else{
+						print F4 "-\n";
+					}
+				}
+				else{
+					print F4 "\n";
+				}
+			}
+		}
+		if($find == 0){
+			$groupct ++;
+			my $group = "group".$groupct;
+			if($opt != 1){
+				print F4 "$line\t$group\t-\t-\t-\t-\n";
+			}
+			else{
+				print F4 "$line\t$group\t-\t-\n";
+			}	
+		}
+	}
+	$line_nb ++;
+}
+
+print "Print in result file done\n";
+
+print "All steps done\n";
+