view ACorF/Analytic_correlation_filtration.pl @ 1:26aa3a8f95ce draft

Uploaded
author melpetera
date Wed, 30 Oct 2019 06:54:20 -0400
parents
children
line wrap: on
line source

#!usr/bin/perl

### Perl modules
use warnings;
use strict;
use Getopt::Long qw(GetOptions); #Creation of script options
use Pod::Usage qw(pod2usage); #Creation of script options

#Personnal packages
use FindBin ; ## Allows you to locate the directory of original perl script
#use lib $FindBin::Bin;
use lib "$FindBin::Bin/lib";
use IonFiltration;

my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete

########################
### Options and help ###
########################

GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2);

### Check required parameters :
pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file;
#pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file;
pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation.
#It should be a number (1 ; 2 or 3) :
#	1 : Don't take into acount mass information (only RT) ;
#	2 : Check that all mass differences are include in a specific list and taking into acount RT information
#	3 : Check that all mass differences are include in a specific list, ignoring RT information
#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt;
pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation.
It should be one of the 3 options below :
	"mass" : choose the ion with the highest mass as the representant
	"intensity" : choose the ion with the highest intensity as the representant
	"mixt" : choose the ion with the highest (mass^2 * intensity) as the representant
	"max_intensity_max_mass" : choose tha ion witht he highest intenisty among the 5 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt;
pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix;
pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM;
#pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold;
#pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold;
pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold;
pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular;
pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif;


#if(($opt != 1) && ($opt != 2) && ($opt != 3)){
#	print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n";
#	exit;
#}

	

if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){
	print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n";
	exit;
}



#########################################################################
#### Création of a hash containing all adduits and fragments possible ###
#########################################################################

my %hmass;
if($opt != 1){
	%hmass = IonFiltration::MassCollecting($mass_file);
	
}

my $refhmass = \%hmass;

print "Création of a hash containing all adduits and fragments possible\n";

	
########################################################
### Creation of a sif table + correlation filtration ###
########################################################

my %hrtmz;
($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass);
print "Creation of a sif table + correlation filtration done\n";


######################################################
### Analytic correlation filtrering follow options ###
######################################################

my %hheader_file;
my %hduplicate;
	
my %hcorrelgroup;
my $groupct=1;

my $linenb3=0;
my %hheader_line;



open (F1, $output_sif) or die "Impossible to open $output_sif\n";
	
while(my $line = <F1>){
	my $count=0;
	chomp $line;
	my @tline = split(/\t/, $line);
	my $a = $tline[0];
	my $b = $tline[2];
							
	my $amass=$hrtmz{$a}{mz};
	my $atemp=$hrtmz{$a}{rt};
	my $bmass= $hrtmz{$b}{mz};
	my $btemp=$hrtmz{$b}{rt};
	my $diff = $amass-$bmass;
	$diff = abs($diff);
								
	### Option 1: Don't take into acount mass information ###
					
	if($opt == 1){
		my $btplus = $btemp + $rt_threshold;
		my $btmoins = $btemp - $rt_threshold;
		if(($btmoins <= $atemp) && ($atemp <= $btplus)){
			foreach my $k (keys %hcorrelgroup){
				if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
					$hcorrelgroup{$k}{$a}=1;
					$hcorrelgroup{$k}{$b}=1;
					$count++;
					last;
				}
			}
			if($count == 0){
				my $groupnb="group".$groupct;
				$hcorrelgroup{$groupnb}{$a}=1;
				$hcorrelgroup{$groupnb}{$b}=1;
				$groupct ++;
			}
		}
	}

									
								
	### Option 2: Check that all mass differences are include in a specific list taking into account RT information ###
									
	elsif($opt == 2){
										
		my $print = 0;
		foreach my $s (keys %{$refhmass}){
			foreach my $r (keys %{$refhmass->{$s}}){
				my $rm = $r - $mass_threshold;
				my $rp = $r + $mass_threshold;
				if(($diff <= $rp) && ($diff >= $rm)){
					if($print == 0){
						my $btplus = $btemp + $rt_threshold;
						my $btmoins = $btemp - $rt_threshold;
														
						if(($btmoins <= $atemp) && ($atemp <= $btplus)){
							foreach my $k (keys %hcorrelgroup){
								if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
									$hcorrelgroup{$k}{$a}=1;
									$hcorrelgroup{$k}{$b}=1;
									$count++;
									last;
								}
							}
							if($count == 0){
								my $groupnb="group".$groupct;
								$hcorrelgroup{$groupnb}{$a}=1;
								$hcorrelgroup{$groupnb}{$b}=1;
								$groupct ++;
							}
							$print = 1;
						}
					}
				}
			}
		}
	}
									
								
	### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ###
					
	elsif($opt == 3){
										
		my $print = 0;
		foreach my $s (keys %{$refhmass}){
			foreach my $r (keys %{$refhmass->{$s}}){
				my $rm = $r - $mass_threshold;
				my $rp = $r + $mass_threshold;
				if(($diff <= $rp) && ($diff >= $rm)){
					if($print == 0){
							
						foreach my $k (keys %hcorrelgroup){
							if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
								$hcorrelgroup{$k}{$a}=1;
								$hcorrelgroup{$k}{$b}=1;
								$count++;
								last;
							}
						}
						if($count == 0){
							my $groupnb="group".$groupct;
							$hcorrelgroup{$groupnb}{$a}=1;
							$hcorrelgroup{$groupnb}{$b}=1;
							$groupct ++;
						}
						$print = 1;
					}
				}
			}
		}
	}
}
close F1;

print "Analytic correlation filtrering follow options done\n";
	
	
#############################################
### Join groups that have been subdivided ###
#############################################

my @tdelete;

foreach my $k (keys %hcorrelgroup){
	foreach my $i (keys %{$hcorrelgroup{$k}}){
		foreach my $v (keys %hcorrelgroup){
			my $count = 0;
			if ($v ne $k){
				foreach my $w (keys %{$hcorrelgroup{$v}}){
					if($w eq $i){
						$count = 1;
						push(@tdelete, $v);
					}	
				}
			}
			if($count == 1){
				foreach my $w (keys %{$hcorrelgroup{$v}}){
					$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
				}
				delete($hcorrelgroup{$v});
			}
		}
	}
}
	
foreach my $t (@tdelete){
	delete($hcorrelgroup{$t});
}


### Do it twice to see if it fix the problem of unmerge groups

foreach my $k (keys %hcorrelgroup){
	foreach my $i (keys %{$hcorrelgroup{$k}}){
		foreach my $v (keys %hcorrelgroup){
			my $count = 0;
			if ($v ne $k){
				foreach my $w (keys %{$hcorrelgroup{$v}}){
					if($w eq $i){
						$count = 1;
						push(@tdelete, $v);
					}	
				}
			}
			if($count == 1){
				foreach my $w (keys %{$hcorrelgroup{$v}}){
					$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
				}
				delete($hcorrelgroup{$v});
			}
		}
	}
}
	
foreach my $t (@tdelete){
	delete($hcorrelgroup{$t});
}
	
print "Join groups that have been subdivided done\n";
	
#######################################################
### Addition of annotation information among groups ###
#######################################################

foreach my $k (keys %hcorrelgroup){
	foreach my $i (keys %{$hcorrelgroup{$k}}){
		foreach my $j (keys %{$hcorrelgroup{$k}}){
			my $count = 0;
			if ($i ne $j){
				
				my $a = $hrtmz{$i}{mz};
				my $b = $hrtmz{$j}{mz};
				
				my $diff = $a - $b;
				my $sign;
				if($diff>0){
					$sign="+";
				}
				if($diff<0){
					$sign="-";
				}
				$diff = abs($diff);
				
				foreach my $z (keys %{$refhmass}){
					
					foreach my $y (keys %{$refhmass->{$z}}){
						my $ym = $y - $mass_threshold;
						my $yp = $y + $mass_threshold;
						
						
						if(($diff <= $yp) && ($diff >= $ym)){
							my $diff_list = $diff - $y;
							$diff_list = abs($diff_list);
							$diff_list = sprintf ("%0.6f", $diff_list);
							
							if($hcorrelgroup{$k}{$i} eq 1){
								my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
								$hcorrelgroup{$k}{$i}=$val;
								$count ++;
							}
							else{
								if($count == 0){
									my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
									$hcorrelgroup{$k}{$i}.=$val;
									$count ++;
								}
								else{
									my $val = $sign."(".$z.")(".$diff_list.")|";
									$hcorrelgroup{$k}{$i}.=$val;
									$count ++;
								}
							}
						}
					}
				}
			}
		}
	}
}
	

print "Addition of annotation information among groups done\n";
	
	
####################################################
### Choose the representative ion for each group ###
####################################################

my %hgrouprepres;
	
open(F3, $dataMatrix);

while (my $line = <F3>){
	chomp $line;
	
	my @tline = split (/\t/, $line);
	
	foreach my $k (keys %hcorrelgroup){
		foreach my $i (keys %{$hcorrelgroup{$k}}){
			if($tline[0] eq $i){
				$hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz};
				my $intensity;
				my $nbsubjects=0;
				for(my $y=1;$y<scalar(@tline);$y++){
					$intensity += $tline[$y];
					$nbsubjects ++;
				}
				my $meanintensity = $intensity/$nbsubjects;
				$hgrouprepres{$k}{$i}{intensity}=$meanintensity;
				$hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity});
			}
		}
	}
}
close F3;
	
foreach my $z (keys %hgrouprepres){
	my $max_intensity =  0;
	my $max_int_ion = "";
	my $max_mass = 0;
	my $max_mass_ion = "";
	my $max_squared = 0;
	my $max_squared_ion = "";
	foreach my $w (keys %{$hgrouprepres{$z}}){
		if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){
			$max_intensity = $hgrouprepres{$z}{$w}{intensity};
			$max_int_ion = $w;
		}
		if($hgrouprepres{$z}{$w}{mass} > $max_mass){
			$max_mass = $hgrouprepres{$z}{$w}{mass};
			$max_mass_ion = $w;
		}
		if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){
			$max_squared = $hgrouprepres{$z}{$w}{squaredmassint};
			$max_squared_ion = $w;
		}
	}
	
	my $max_int_max_mass_ion="";
	
	if($repres_opt eq "max_intensity_max_mass"){
		my %hfirst;
		my $first=0;
		foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){
			$first ++;
			if ($first <= 3){
				$hfirst{$w} = $hgrouprepres{$z}{$w}{intensity};
			}
		}
		
		my $first_2 = 0;
		my $intens_max = 0;
		my $mass_max = 0;
		
		foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){
			
			$first_2 ++;
			if($first_2 == 1){
				$intens_max = $hfirst{$y};
				if($intensity_threshold > $intens_max){
					$intensity_threshold = 0;
				}
				$max_int_max_mass_ion = $y;
				$mass_max = $hgrouprepres{$z}{$y}{mass};
			}
			if($hgrouprepres{$z}{$y}{mass} > $mass_max){
				if($hfirst{$y}>$intensity_threshold){
					my $a = $intens_max * $intensity_pourc;
					if($hfirst{$y} > $a){
						$max_int_max_mass_ion = $y;
						$mass_max = $hgrouprepres{$z}{$y}{mass};
					}
				}
			}
		}
	}
	
	$hgrouprepres{$z}{max_int}=$max_int_ion;
	$hgrouprepres{$z}{max_mass}=$max_mass_ion;
	$hgrouprepres{$z}{max_squared}=$max_squared_ion;
	$hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion;
	
}


print "Choose the representative ion for each group done\n";

#############################################################################
### Addition of annotation information relative to the representative ion ###
#############################################################################
	
my %hreprescomparison;

my $representative="";

if($opt != 1){
	foreach my $k (keys %hcorrelgroup){
		foreach my $i (keys %{$hcorrelgroup{$k}}){
												
			if($repres_opt eq "mass"){$representative = $hgrouprepres{$k}{max_mass}}
			if($repres_opt eq "intensity"){$representative = $hgrouprepres{$k}{max_int}}
			if($repres_opt eq "mixt"){$representative = $hgrouprepres{$k}{max_squared}}
			if($repres_opt eq "max_intensity_max_mass"){$representative = $hgrouprepres{$k}{max_int_max_mass}}
			
			
			my $count = 0;
			if ($i ne $representative){
					
				my $a = $hrtmz{$i}{mz};
				my $b = $hrtmz{$representative}{mz};
				
				my $diff = $a - $b;
				my $sign;
				if($diff>0){
					$sign="+";
				}
				if($diff<0){
					$sign="-";
				}
				$diff = abs($diff);
				
				foreach my $z (keys %{$refhmass}){
					
					foreach my $y (keys %{$refhmass->{$z}}){
						my $ym = $y - $mass_threshold;
						my $yp = $y + $mass_threshold;
						
						if(($diff <= $yp) && ($diff >= $ym)){
							my $diff_list = $diff - $y;
							$diff_list = abs($diff_list);
							$diff_list = sprintf ("%0.4f", $diff_list);
							if($hcorrelgroup{$k}{$i} eq 1){
								my $valrep = "[M ".$sign."(".$z.")]|";
								$hreprescomparison{$k}{$i}{repres_diff}=$valrep;
								$count ++;
							}
							else{
								if($count == 0){
									my $valrep = "[M ".$sign."(".$z.")]|";
									$hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
									$count ++;
								}
								else{
									my $valrep = "[M ".$sign."(".$z.")]|";
									$hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
									$count ++;
								}
							}
						}
					}
				}
			}
			else{
				$hreprescomparison{$k}{$i}{repres_diff}="M";
			}
		}
	}
}
	

print "Addition of annotation information relative to the representative ion done\n";
	
##############################
### Print in result file ! ###
##############################

open(F4, ">$output_tabular");
open(F5, $combined_DMVM);

my $line_nb = 0;
my %hheader;
while (my $line = <F5>){
	chomp $line;
	
	
	my @tline = split (/\t/, $line);
	
	if($line_nb == 0){
		print F4 "$line\tACorF_groups";
		if($opt == 1){
			if($repres_opt eq "intensity"){print F4 "\tACorF_filter\tintensity_repres\n"}
			if($repres_opt eq "mass"){print F4 "\tACorF_filter\tmass_repres\n"}
			if($repres_opt eq "mixt"){print F4 "\tACorF_filter\tmass2intens_repres\n"}
			if($repres_opt eq "max_intensity_max_mass"){print F4 "\tACorF_filter\tmax_intensity_max_mass_repres\n"}
			}
		else{
			if($repres_opt eq "intensity"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tintensity_repres\tannotation_relative_to_representative\n"}
			if($repres_opt eq "mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass_repres\tannotation_relative_to_representative\n"}
			if($repres_opt eq "mixt"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass2intens_repres\tannotation_relative_to_representative\n"}
			if($repres_opt eq "max_intensity_max_mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmax_intensity_max_mass_repres\tannotation_relative_to_representative\n"}
		}
		
		
		### Creation of a header hash
		for(my $i=0; $i<scalar(@tline);$i++){
			my $a = $tline[$i];
			$hheader{$a}=$i;
		}
	}
	
	else{
		my $find = 0;
		foreach my $v (keys %hcorrelgroup){
			if(defined($hgrouprepres{$v}{$tline[0]})){
				print F4 "$line\t$v";
					
				if($opt != 1){
					if(defined($hcorrelgroup{$v}{$tline[0]})){
						print F4 "\t$hcorrelgroup{$v}{$tline[0]}\t";
							
					}
					else{
						print F4 "\t";
					}
				}
				else{
					print F4 "\t";
				}
					
				if($repres_opt eq "intensity"){
					if($tline[0] eq $hgrouprepres{$v}{max_int}){
						print F4 "1\t";
					}
					else{
						print F4 "0\t";
					}
					$find = 1;
				}
				if($repres_opt eq "mass"){
					if($tline[0] eq $hgrouprepres{$v}{max_mass}){
						print F4 "1\t";
					}
					else{
						print F4 "0\t";
					}
					$find = 1;
				}
				if($repres_opt eq "mixt"){
					if($tline[0] eq $hgrouprepres{$v}{max_squared}){
						print F4 "1\t";
					}
					else{
						print F4 "0\t";
					}
					$find = 1;
				}
				if($repres_opt eq "max_intensity_max_mass"){
					if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){
						print F4 "1\t";
					}
					else{
						print F4 "0\t";
					}
					$find = 1;
				}
					
				if($repres_opt eq "intensity"){print F4 "$hgrouprepres{$v}{max_int}\t"}
				if($repres_opt eq "mass"){print F4 "$hgrouprepres{$v}{max_mass}\t"}
				if($repres_opt eq "mixt"){print F4 "$hgrouprepres{$v}{max_squared}\t"}
				if($repres_opt eq "max_intensity_max_mass"){print F4 "$hgrouprepres{$v}{max_int_max_mass}\t"}
				
				if($opt != 1){
					if(defined($hreprescomparison{$v}{$tline[0]}{repres_diff})){
						print F4 "$hreprescomparison{$v}{$tline[0]}{repres_diff}\n";
					}
					else{
						print F4 "-\n";
					}
				}
				else{
					print F4 "\n";
				}
			}
		}
		if($find == 0){
			$groupct ++;
			my $group = "group".$groupct;
			if($opt != 1){
				print F4 "$line\t$group\t-\t-\t-\t-\n";
			}
			else{
				print F4 "$line\t$group\t-\t-\n";
			}	
		}
	}
	$line_nb ++;
}

print "Print in result file done\n";

print "All steps done\n";