comparison ACF/Analytic_correlation_filtration.pl @ 2:15430e89c97d draft default tip

Uploaded
author melpetera
date Thu, 07 Nov 2019 03:42:14 -0500
parents
children
comparison
equal deleted inserted replaced
1:26aa3a8f95ce 2:15430e89c97d
1 #!usr/bin/perl
2
3 ### Perl modules
4 use warnings;
5 use strict;
6 use Getopt::Long qw(GetOptions); #Creation of script options
7 use Pod::Usage qw(pod2usage); #Creation of script options
8
9 #Personnal packages
10 use FindBin ; ## Allows you to locate the directory of original perl script
11 #use lib $FindBin::Bin;
12 use lib "$FindBin::Bin/lib";
13 use IonFiltration;
14
15 my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete
16
17 ########################
18 ### Options and help ###
19 ########################
20
21 GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2);
22
23 ### Check required parameters :
24 pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file;
25 #pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file;
26 pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation.
27 #It should be a number (1 ; 2 or 3) :
28 # 1 : Don't take into acount mass information (only RT) ;
29 # 2 : Check that all mass differences are include in a specific list and taking into acount RT information
30 # 3 : Check that all mass differences are include in a specific list, ignoring RT information
31 #To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt;
32 pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation.
33 It should be one of the 3 options below :
34 "mass" : choose the ion with the highest mass as the representant
35 "intensity" : choose the ion with the highest intensity as the representant
36 "mixt" : choose the ion with the highest (mass^2 * intensity) as the representant
37 "max_intensity_max_mass" : choose tha ion witht he highest intenisty among the 5 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt;
38 pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix;
39 pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM;
40 #pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold;
41 #pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold;
42 pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold;
43 pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular;
44 pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif;
45
46
47 #if(($opt != 1) && ($opt != 2) && ($opt != 3)){
48 # print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n";
49 # exit;
50 #}
51
52
53
54 if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){
55 print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n";
56 exit;
57 }
58
59
60
61 #########################################################################
62 #### Création of a hash containing all adduits and fragments possible ###
63 #########################################################################
64
65 my %hmass;
66 if($opt != 1){
67 %hmass = IonFiltration::MassCollecting($mass_file);
68
69 }
70
71 my $refhmass = \%hmass;
72
73 print "Création of a hash containing all adduits and fragments possible\n";
74
75
76 ########################################################
77 ### Creation of a sif table + correlation filtration ###
78 ########################################################
79
80 my %hrtmz;
81 ($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass);
82 print "Creation of a sif table + correlation filtration done\n";
83
84
85 ######################################################
86 ### Analytic correlation filtrering follow options ###
87 ######################################################
88
89 my %hheader_file;
90 my %hduplicate;
91
92 my %hcorrelgroup;
93 my $groupct=1;
94
95 my $linenb3=0;
96 my %hheader_line;
97
98
99
100 open (F1, $output_sif) or die "Impossible to open $output_sif\n";
101
102 while(my $line = <F1>){
103 my $count=0;
104 chomp $line;
105 my @tline = split(/\t/, $line);
106 my $a = $tline[0];
107 my $b = $tline[2];
108
109 my $amass=$hrtmz{$a}{mz};
110 my $atemp=$hrtmz{$a}{rt};
111 my $bmass= $hrtmz{$b}{mz};
112 my $btemp=$hrtmz{$b}{rt};
113 my $diff = $amass-$bmass;
114 $diff = abs($diff);
115
116 ### Option 1: Don't take into acount mass information ###
117
118 if($opt == 1){
119 my $btplus = $btemp + $rt_threshold;
120 my $btmoins = $btemp - $rt_threshold;
121 if(($btmoins <= $atemp) && ($atemp <= $btplus)){
122 foreach my $k (keys %hcorrelgroup){
123 if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
124 $hcorrelgroup{$k}{$a}=1;
125 $hcorrelgroup{$k}{$b}=1;
126 $count++;
127 last;
128 }
129 }
130 if($count == 0){
131 my $groupnb="group".$groupct;
132 $hcorrelgroup{$groupnb}{$a}=1;
133 $hcorrelgroup{$groupnb}{$b}=1;
134 $groupct ++;
135 }
136 }
137 }
138
139
140
141 ### Option 2: Check that all mass differences are include in a specific list taking into account RT information ###
142
143 elsif($opt == 2){
144
145 my $print = 0;
146 foreach my $s (keys %{$refhmass}){
147 foreach my $r (keys %{$refhmass->{$s}}){
148 my $rm = $r - $mass_threshold;
149 my $rp = $r + $mass_threshold;
150 if(($diff <= $rp) && ($diff >= $rm)){
151 if($print == 0){
152 my $btplus = $btemp + $rt_threshold;
153 my $btmoins = $btemp - $rt_threshold;
154
155 if(($btmoins <= $atemp) && ($atemp <= $btplus)){
156 foreach my $k (keys %hcorrelgroup){
157 if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
158 $hcorrelgroup{$k}{$a}=1;
159 $hcorrelgroup{$k}{$b}=1;
160 $count++;
161 last;
162 }
163 }
164 if($count == 0){
165 my $groupnb="group".$groupct;
166 $hcorrelgroup{$groupnb}{$a}=1;
167 $hcorrelgroup{$groupnb}{$b}=1;
168 $groupct ++;
169 }
170 $print = 1;
171 }
172 }
173 }
174 }
175 }
176 }
177
178
179 ### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ###
180
181 elsif($opt == 3){
182
183 my $print = 0;
184 foreach my $s (keys %{$refhmass}){
185 foreach my $r (keys %{$refhmass->{$s}}){
186 my $rm = $r - $mass_threshold;
187 my $rp = $r + $mass_threshold;
188 if(($diff <= $rp) && ($diff >= $rm)){
189 if($print == 0){
190
191 foreach my $k (keys %hcorrelgroup){
192 if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
193 $hcorrelgroup{$k}{$a}=1;
194 $hcorrelgroup{$k}{$b}=1;
195 $count++;
196 last;
197 }
198 }
199 if($count == 0){
200 my $groupnb="group".$groupct;
201 $hcorrelgroup{$groupnb}{$a}=1;
202 $hcorrelgroup{$groupnb}{$b}=1;
203 $groupct ++;
204 }
205 $print = 1;
206 }
207 }
208 }
209 }
210 }
211 }
212 close F1;
213
214 print "Analytic correlation filtrering follow options done\n";
215
216
217 #############################################
218 ### Join groups that have been subdivided ###
219 #############################################
220
221 my @tdelete;
222
223 foreach my $k (keys %hcorrelgroup){
224 foreach my $i (keys %{$hcorrelgroup{$k}}){
225 foreach my $v (keys %hcorrelgroup){
226 my $count = 0;
227 if ($v ne $k){
228 foreach my $w (keys %{$hcorrelgroup{$v}}){
229 if($w eq $i){
230 $count = 1;
231 push(@tdelete, $v);
232 }
233 }
234 }
235 if($count == 1){
236 foreach my $w (keys %{$hcorrelgroup{$v}}){
237 $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
238 }
239 delete($hcorrelgroup{$v});
240 }
241 }
242 }
243 }
244
245 foreach my $t (@tdelete){
246 delete($hcorrelgroup{$t});
247 }
248
249
250 ### Do it twice to see if it fix the problem of unmerge groups
251
252 foreach my $k (keys %hcorrelgroup){
253 foreach my $i (keys %{$hcorrelgroup{$k}}){
254 foreach my $v (keys %hcorrelgroup){
255 my $count = 0;
256 if ($v ne $k){
257 foreach my $w (keys %{$hcorrelgroup{$v}}){
258 if($w eq $i){
259 $count = 1;
260 push(@tdelete, $v);
261 }
262 }
263 }
264 if($count == 1){
265 foreach my $w (keys %{$hcorrelgroup{$v}}){
266 $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
267 }
268 delete($hcorrelgroup{$v});
269 }
270 }
271 }
272 }
273
274 foreach my $t (@tdelete){
275 delete($hcorrelgroup{$t});
276 }
277
278 print "Join groups that have been subdivided done\n";
279
280 #######################################################
281 ### Addition of annotation information among groups ###
282 #######################################################
283
284 foreach my $k (keys %hcorrelgroup){
285 foreach my $i (keys %{$hcorrelgroup{$k}}){
286 foreach my $j (keys %{$hcorrelgroup{$k}}){
287 my $count = 0;
288 if ($i ne $j){
289
290 my $a = $hrtmz{$i}{mz};
291 my $b = $hrtmz{$j}{mz};
292
293 my $diff = $a - $b;
294 my $sign;
295 if($diff>0){
296 $sign="+";
297 }
298 if($diff<0){
299 $sign="-";
300 }
301 $diff = abs($diff);
302
303 foreach my $z (keys %{$refhmass}){
304
305 foreach my $y (keys %{$refhmass->{$z}}){
306 my $ym = $y - $mass_threshold;
307 my $yp = $y + $mass_threshold;
308
309
310 if(($diff <= $yp) && ($diff >= $ym)){
311 my $diff_list = $diff - $y;
312 $diff_list = abs($diff_list);
313 $diff_list = sprintf ("%0.6f", $diff_list);
314
315 if($hcorrelgroup{$k}{$i} eq 1){
316 my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
317 $hcorrelgroup{$k}{$i}=$val;
318 $count ++;
319 }
320 else{
321 if($count == 0){
322 my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
323 $hcorrelgroup{$k}{$i}.=$val;
324 $count ++;
325 }
326 else{
327 my $val = $sign."(".$z.")(".$diff_list.")|";
328 $hcorrelgroup{$k}{$i}.=$val;
329 $count ++;
330 }
331 }
332 }
333 }
334 }
335 }
336 }
337 }
338 }
339
340
341 print "Addition of annotation information among groups done\n";
342
343
344 ####################################################
345 ### Choose the representative ion for each group ###
346 ####################################################
347
348 my %hgrouprepres;
349
350 open(F3, $dataMatrix);
351
352 while (my $line = <F3>){
353 chomp $line;
354
355 my @tline = split (/\t/, $line);
356
357 foreach my $k (keys %hcorrelgroup){
358 foreach my $i (keys %{$hcorrelgroup{$k}}){
359 if($tline[0] eq $i){
360 $hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz};
361 my $intensity;
362 my $nbsubjects=0;
363 for(my $y=1;$y<scalar(@tline);$y++){
364 $intensity += $tline[$y];
365 $nbsubjects ++;
366 }
367 my $meanintensity = $intensity/$nbsubjects;
368 $hgrouprepres{$k}{$i}{intensity}=$meanintensity;
369 $hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity});
370 }
371 }
372 }
373 }
374 close F3;
375
376 foreach my $z (keys %hgrouprepres){
377 my $max_intensity = 0;
378 my $max_int_ion = "";
379 my $max_mass = 0;
380 my $max_mass_ion = "";
381 my $max_squared = 0;
382 my $max_squared_ion = "";
383 foreach my $w (keys %{$hgrouprepres{$z}}){
384 if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){
385 $max_intensity = $hgrouprepres{$z}{$w}{intensity};
386 $max_int_ion = $w;
387 }
388 if($hgrouprepres{$z}{$w}{mass} > $max_mass){
389 $max_mass = $hgrouprepres{$z}{$w}{mass};
390 $max_mass_ion = $w;
391 }
392 if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){
393 $max_squared = $hgrouprepres{$z}{$w}{squaredmassint};
394 $max_squared_ion = $w;
395 }
396 }
397
398 my $max_int_max_mass_ion="";
399
400 if($repres_opt eq "max_intensity_max_mass"){
401 my %hfirst;
402 my $first=0;
403 foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){
404 $first ++;
405 if ($first <= 3){
406 $hfirst{$w} = $hgrouprepres{$z}{$w}{intensity};
407 }
408 }
409
410 my $first_2 = 0;
411 my $intens_max = 0;
412 my $mass_max = 0;
413
414 foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){
415
416 $first_2 ++;
417 if($first_2 == 1){
418 $intens_max = $hfirst{$y};
419 if($intensity_threshold > $intens_max){
420 $intensity_threshold = 0;
421 }
422 $max_int_max_mass_ion = $y;
423 $mass_max = $hgrouprepres{$z}{$y}{mass};
424 }
425 if($hgrouprepres{$z}{$y}{mass} > $mass_max){
426 if($hfirst{$y}>$intensity_threshold){
427 my $a = $intens_max * $intensity_pourc;
428 if($hfirst{$y} > $a){
429 $max_int_max_mass_ion = $y;
430 $mass_max = $hgrouprepres{$z}{$y}{mass};
431 }
432 }
433 }
434 }
435 }
436
437 $hgrouprepres{$z}{max_int}=$max_int_ion;
438 $hgrouprepres{$z}{max_mass}=$max_mass_ion;
439 $hgrouprepres{$z}{max_squared}=$max_squared_ion;
440 $hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion;
441
442 }
443
444
445 print "Choose the representative ion for each group done\n";
446
447 #############################################################################
448 ### Addition of annotation information relative to the representative ion ###
449 #############################################################################
450
451 my %hreprescomparison;
452
453 my $representative="";
454
455 if($opt != 1){
456 foreach my $k (keys %hcorrelgroup){
457 foreach my $i (keys %{$hcorrelgroup{$k}}){
458
459 if($repres_opt eq "mass"){$representative = $hgrouprepres{$k}{max_mass}}
460 if($repres_opt eq "intensity"){$representative = $hgrouprepres{$k}{max_int}}
461 if($repres_opt eq "mixt"){$representative = $hgrouprepres{$k}{max_squared}}
462 if($repres_opt eq "max_intensity_max_mass"){$representative = $hgrouprepres{$k}{max_int_max_mass}}
463
464
465 my $count = 0;
466 if ($i ne $representative){
467
468 my $a = $hrtmz{$i}{mz};
469 my $b = $hrtmz{$representative}{mz};
470
471 my $diff = $a - $b;
472 my $sign;
473 if($diff>0){
474 $sign="+";
475 }
476 if($diff<0){
477 $sign="-";
478 }
479 $diff = abs($diff);
480
481 foreach my $z (keys %{$refhmass}){
482
483 foreach my $y (keys %{$refhmass->{$z}}){
484 my $ym = $y - $mass_threshold;
485 my $yp = $y + $mass_threshold;
486
487 if(($diff <= $yp) && ($diff >= $ym)){
488 my $diff_list = $diff - $y;
489 $diff_list = abs($diff_list);
490 $diff_list = sprintf ("%0.4f", $diff_list);
491 if($hcorrelgroup{$k}{$i} eq 1){
492 my $valrep = "[M ".$sign."(".$z.")]|";
493 $hreprescomparison{$k}{$i}{repres_diff}=$valrep;
494 $count ++;
495 }
496 else{
497 if($count == 0){
498 my $valrep = "[M ".$sign."(".$z.")]|";
499 $hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
500 $count ++;
501 }
502 else{
503 my $valrep = "[M ".$sign."(".$z.")]|";
504 $hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
505 $count ++;
506 }
507 }
508 }
509 }
510 }
511 }
512 else{
513 $hreprescomparison{$k}{$i}{repres_diff}="M";
514 }
515 }
516 }
517 }
518
519
520 print "Addition of annotation information relative to the representative ion done\n";
521
522 ##############################
523 ### Print in result file ! ###
524 ##############################
525
526 open(F4, ">$output_tabular");
527 open(F5, $combined_DMVM);
528
529 my $line_nb = 0;
530 my %hheader;
531 while (my $line = <F5>){
532 chomp $line;
533
534
535 my @tline = split (/\t/, $line);
536
537 if($line_nb == 0){
538 print F4 "$line\tACorF_groups";
539 if($opt == 1){
540 if($repres_opt eq "intensity"){print F4 "\tACorF_filter\tintensity_repres\n"}
541 if($repres_opt eq "mass"){print F4 "\tACorF_filter\tmass_repres\n"}
542 if($repres_opt eq "mixt"){print F4 "\tACorF_filter\tmass2intens_repres\n"}
543 if($repres_opt eq "max_intensity_max_mass"){print F4 "\tACorF_filter\tmax_intensity_max_mass_repres\n"}
544 }
545 else{
546 if($repres_opt eq "intensity"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tintensity_repres\tannotation_relative_to_representative\n"}
547 if($repres_opt eq "mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass_repres\tannotation_relative_to_representative\n"}
548 if($repres_opt eq "mixt"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass2intens_repres\tannotation_relative_to_representative\n"}
549 if($repres_opt eq "max_intensity_max_mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmax_intensity_max_mass_repres\tannotation_relative_to_representative\n"}
550 }
551
552
553 ### Creation of a header hash
554 for(my $i=0; $i<scalar(@tline);$i++){
555 my $a = $tline[$i];
556 $hheader{$a}=$i;
557 }
558 }
559
560 else{
561 my $find = 0;
562 foreach my $v (keys %hcorrelgroup){
563 if(defined($hgrouprepres{$v}{$tline[0]})){
564 print F4 "$line\t$v";
565
566 if($opt != 1){
567 if(defined($hcorrelgroup{$v}{$tline[0]})){
568 print F4 "\t$hcorrelgroup{$v}{$tline[0]}\t";
569
570 }
571 else{
572 print F4 "\t";
573 }
574 }
575 else{
576 print F4 "\t";
577 }
578
579 if($repres_opt eq "intensity"){
580 if($tline[0] eq $hgrouprepres{$v}{max_int}){
581 print F4 "1\t";
582 }
583 else{
584 print F4 "0\t";
585 }
586 $find = 1;
587 }
588 if($repres_opt eq "mass"){
589 if($tline[0] eq $hgrouprepres{$v}{max_mass}){
590 print F4 "1\t";
591 }
592 else{
593 print F4 "0\t";
594 }
595 $find = 1;
596 }
597 if($repres_opt eq "mixt"){
598 if($tline[0] eq $hgrouprepres{$v}{max_squared}){
599 print F4 "1\t";
600 }
601 else{
602 print F4 "0\t";
603 }
604 $find = 1;
605 }
606 if($repres_opt eq "max_intensity_max_mass"){
607 if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){
608 print F4 "1\t";
609 }
610 else{
611 print F4 "0\t";
612 }
613 $find = 1;
614 }
615
616 if($repres_opt eq "intensity"){print F4 "$hgrouprepres{$v}{max_int}\t"}
617 if($repres_opt eq "mass"){print F4 "$hgrouprepres{$v}{max_mass}\t"}
618 if($repres_opt eq "mixt"){print F4 "$hgrouprepres{$v}{max_squared}\t"}
619 if($repres_opt eq "max_intensity_max_mass"){print F4 "$hgrouprepres{$v}{max_int_max_mass}\t"}
620
621 if($opt != 1){
622 if(defined($hreprescomparison{$v}{$tline[0]}{repres_diff})){
623 print F4 "$hreprescomparison{$v}{$tline[0]}{repres_diff}\n";
624 }
625 else{
626 print F4 "-\n";
627 }
628 }
629 else{
630 print F4 "\n";
631 }
632 }
633 }
634 if($find == 0){
635 $groupct ++;
636 my $group = "group".$groupct;
637 if($opt != 1){
638 print F4 "$line\t$group\t-\t-\t-\t-\n";
639 }
640 else{
641 print F4 "$line\t$group\t-\t-\n";
642 }
643 }
644 }
645 $line_nb ++;
646 }
647
648 print "Print in result file done\n";
649
650 print "All steps done\n";
651