comparison carpet-src-1/tools/CARPET/PeakPeaker2.pl @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:cdd489d98766
1 #! /usr/bin/perl
2
3 # Copyright 2009 Matteo Cesaroni, Lucilla Luzi
4 #
5 # This program is free software; ; you can redistribute it and/or modify
6 # it under the terms of the GNU Lesser General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or (at your
8 # option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14
15
16 use Getopt::Long;
17
18 GetOptions (
19 "help" => \$OPT{help},
20 "in=s" => \$OPT{fname},
21 "perc=s" => \$OPT{value},
22 "fc=s" => \$OPT{fc_value},
23 "log=s" => \$OPT{valore_log},
24 "t=s" => \$OPT{type_peak},
25 "num=s" => \$OPT{num_probe},
26 "dist=s" => \$OPT{dist_max},
27 "dist_peaks=s" => \$OPT{dist_max_peaks},
28 "w=s"=> \$OPT{s_windows},
29 "col3=s" => \$OPT{col3},
30 "f_pv=s" => \$OPT{file_pv},
31 "out=s"=> \$OPT{out}
32
33 )|| printusage();
34
35 # opzioni da linea di comando
36 my $value=$OPT{value};
37 my $fc_value=$OPT{fc_value};
38 my $valore_log=$OPT{valore_log};
39 my $type_peak=$OPT{type_peak};
40 my $num_probe=$OPT{num_probe};
41 my $dist_max=$OPT{dist_max};
42 my $dist_max_peaks=$OPT{dist_max_peaks};
43 my $fname=$OPT{fname};
44 my $help=$OPT{help};
45 my $s_windows=$OPT{s_windows};
46 my $col3=$OPT{col3};
47 my $outfile=$OPT{out};
48 my $outfile_pv=$OPT{file_pv},
49
50 # "usage" se c'e' un help
51 $help and printusage();
52
53 # "usage" se non ci sono opzioni
54 if (!$s_windows || !$fname || (!$value && !$fc_value) || !$type_peak || !$num_probe || !$dist_max || (($type_peak eq "p") && !$valore_log)){
55 &printusage()
56 };
57
58
59 qx {sort -k 1,1 -k 4,4n $fname >$fname.sortato};
60
61
62 open(FILE, "<$fname.sortato") or die "Cannot find file $fname.sortato\n";
63 open(pvalue, ">$outfile_pv") or die "Cannot open file $outfile_pv: $!\n";
64 #open(buonitutti, "> buoni_$fname") or die "Cannot find file $fname: $!\n";
65 #loop th rought line-by-line until the end of the file and then push each line into an empty array, called @array#
66 print pvalue "#p-value track ($value) \n";
67
68
69 @array= ();
70 $conto_tutti_sopra=0;
71 $conto_tutti=0;
72 while ($line = <FILE>){
73 chomp ($line);
74 if ($line=~/track/g){next;}
75 if ($line=~/#/g){next;}
76 if ($line=~/^\s+$/g){next;}
77 push (@array,$line);
78 @value_perc=split("\t",$line);
79 push (@percentile,$value_perc[5]);
80 if ($value_perc[5]>=$fc_value){
81 $conto_tutti_sopra++;
82 }
83 elsif ($value_perc[5]<$fc_value){
84 $conto_tutti++;
85 }
86 }
87
88 close (FILE);
89
90 if (!$fc_value){
91 @perc_ordine=sort(@percentile);
92 $position=(($value*($#perc_ordine+1))-1);
93 $valore_percentile=$perc_ordine[$position];
94 #print "il percentile è $valore_percentile\n";
95 #print "il max è $perc_ordine[$#perc_ordine]\n";
96 #print "il min è $perc_ordine[1]\n";
97 $probabilita=1-$value;
98 }
99 else {
100 $valore_percentile=$fc_value;
101 $tuttitutti=($conto_tutti+$conto_tutti_sopra);
102 #print"il numero totaledi probe è $tuttitutti\n";
103 #print"il numero totale di probe sopra è $conto_tutti_sopra\n";
104 $probabilita=$conto_tutti_sopra/($conto_tutti+$conto_tutti_sopra);
105 #print "la proba= $probabilita\n";
106 }
107
108 #print buonitutti"il percentile e $valore_percentile\n";
109
110
111
112 ($one_ref,$two_ref)=&probe_cutoff(@array);
113
114 if ($type_peak eq "p"){
115 @forse_niente=&peak(@$one_ref);
116 }
117 elsif ($type_peak eq "s"){
118 @forse_niente=&peak(@$two_ref);
119 }
120
121 &double_peak(@forse_niente);
122
123 #
124 # End process
125 #
126 close OUTFILE;
127 unlink "$fname.sortato";
128 exit 0;
129
130 ################################################################################################
131 ######################## SUBROUTINE ###############################
132 ################################################################################################
133
134 sub probe_cutoff
135 {
136
137 $c=-6;
138
139 foreach $linea(@array){
140 @subarray1 = split("\t",$linea);
141 $inizio=(((($subarray1[4]-$subarray1[3])/2)+$subarray1[3])-($s_windows/2)); #per modificare la windows cambia il 500 e il 1000
142 $fine=$inizio+$s_windows; #windows 500 -->250 e 500 windows 1000 --> 500 e 1000
143 $counter=0;
144 $counter_value=0;
145
146 if($subarray1[5]>=$valore_percentile){
147 push (@array_probe,[@subarray1]);
148 print buonitutti "$linea\n";
149 }
150
151 for($i=$c;$i<=$#array;$i++) {
152 if ($i<0){
153 next;
154 }
155 @subarray = split("\t",$array[$i]);
156 if (($subarray[3]>=$inizio)&&($subarray[3]<$fine)&&("$subarray[0]" eq "$subarray1[0]")){
157 $counter++;
158 if ($subarray[5]>= $valore_percentile){
159 $counter_value++;
160 }
161 }
162 if (($subarray[3]>$fine)||("$subarray[0]" ne "$subarray1[0]")){
163 $cazzo=$subarray[3];
164 last;
165 }
166
167 }
168 $c++;
169 if ($counter !=0){
170 $chi_sq = (((($counter_value - ($probabilita*$counter))**2)/($probabilita*$counter))+(((($counter-$counter_value)-((1-$probabilita)*$counter))**2)/((1-$probabilita)*$counter)));
171 }
172 else{
173 $chi_sq = NA;
174 }
175
176 use Statistics::Distributions;
177 $chisprob=Statistics::Distributions::chisqrprob (1,$chi_sq);
178 if ($chisprob == 0){
179 $log_10=100;
180 }
181 else {
182 $log_10 = -log($chisprob)/log(10);
183 }
184
185 print pvalue "$subarray1[0]\t$subarray1[1]\tpv_$subarray1[2]\t$subarray1[3]\t$subarray1[4]\t$log_10\t$subarray1[6]\t$subarray1[7]\t$subarray1[5]\n";
186
187 if ($log_10>=$valore_log){
188 #print"$subarray1[3],$inizio,$fine,$counter,$counter_value,$chi_sq,$chisprob,$log_10\n";
189 @proviamo=($subarray1[0],$subarray1[1],$subarray1[2],$inizio,$fine,$log_10,$subarray1[6],$subarray1[7],$subarray1[5]);
190 push (@matrix_pvalue,[@proviamo]);
191
192 }
193 }
194 @result_table=@matrix_pvalue;
195 @result_table2=@array_probe;
196 return(\@result_table,\@result_table2);
197
198 }
199
200
201
202
203
204 ################################################################################################
205
206
207
208 sub peak
209 {
210 @matrix_value=@_;
211
212 $stop=$#matrix_value;
213
214 for ($cio=0; $cio<=$stop;$cio++) {
215 @log_ratio=();
216 @chilosa=();
217 $inizio=$matrix_value[$cio][3];
218 $somma=0;
219 $media=0;
220 $sommachilo=0;
221 $mediachilo=0;
222 $diviso=0;
223 push(@log_ratio,$matrix_value[$cio][5]);
224 if ($matrix_value[$cio][8]>=$valore_percentile){
225 push(@chilosa,$matrix_value[$cio][8]);
226 }
227 for ($j=0; $j<=$stop;$j++){
228 $distanza=($matrix_value[$cio+1][3]-$matrix_value[$cio][4]);
229 if (($distanza > $dist_max) || ($cio==$stop)||(!("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]"))){
230 $j=$stop;
231 }
232 elsif (($distanza <= $dist_max)&&("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]")){
233 $cio++;
234 push(@log_ratio,$matrix_value[$cio][5]);
235 $fine=$matrix_value[$cio][4];
236 $inizio3=$inizio;
237 $j++;
238 if ($matrix_value[$cio][8]>=$valore_percentile){
239 push(@chilosa,$matrix_value[$cio][8]);
240 }
241 }
242
243
244 }
245 $numeroprobes=($#chilosa+1);
246 if ((($#log_ratio+1)>=$num_probe) && (($type_peak eq "s") ||(($#chilosa+1)>=$num_probe))){
247 foreach $n (@log_ratio) {
248 $somma+=$n;
249 }
250 if($type_peak eq "s"){
251 @chilosa=@log_ratio;
252 }
253 foreach $nchilo (@chilosa) {
254 $sommachilo+=$nchilo;
255 }
256 $diviso=$#log_ratio+1;
257 $media = $somma/(@log_ratio);
258 $mediachilo=$sommachilo/(@chilosa);
259 #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo);
260 #$moltipli=(($#log_ratio+1)*$media);
261 #$somma_media=((sqrt($#log_ratio+1))+$media);
262 if ($type_peak eq "p"){
263 $inizio3=int($inizio3+(($s_windows/2)-25));
264 $fine=int($fine-(($s_windows/2)-25));
265 }
266 @proviamo_double=($matrix_value[$cio][0],$matrix_value[$cio][1],$col3,$inizio3,$fine,$media,$matrix_value[$cio][6],$matrix_value[$cio][7],$mediachilo,$diviso);
267 push (@matrix_for_double,[@proviamo_double]);
268
269 }
270 }
271
272 @result_table=@matrix_for_double;
273 return(@result_table);
274 }
275
276
277 ################################################################################################
278
279
280
281 sub double_peak
282 {
283 @matrix_value=@_;
284
285 $stop=$#matrix_value;
286 $i=0;
287 $j=0;
288 #print $stop;
289
290 #
291 # Print output file
292 #
293 if ($outfile){
294 open (OUTFILE, ">$outfile");
295 }
296
297 #
298 # print File Header
299 #
300 my $header=<<e0c6654;
301 \# infile: $fname
302 \# percentile value: $value=$valore_percentile
303 \# fold change: $fc_value
304 \# type of analysis: $type_peak
305 \# log(pval analysis): $valore_log
306 \# num (probe defining a peak): $num_probe
307 \# dist: $dist_max
308 \# window length: $s_windows
309 track name=$col3 description="peaks find" visibility=2
310 e0c6654
311
312 if ($outfile){
313 print OUTFILE "$header";
314 }else{
315 print "$header";
316 }
317 if ($type_peak eq "p"){
318 print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys =p-value (log=$valore_log), dist peaks=$dist_max_peaks";
319 }
320 if ($type_peak eq "s"){
321 print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys=score, dist peaks=$dist_max_peaks";
322 }
323 for ($i=0; $i<=$stop;$i++) {
324 @log_ratio=();
325 @log_ratio2=();
326 @diviso2=();
327 $inizio=$matrix_value[$i][3];
328 $somma=0;
329 $somma2=0;
330 $media=0;
331 $media2=0;
332 $diviso2=0;
333 push(@log_ratio,$matrix_value[$i][5]);
334 push(@log_ratio2,$matrix_value[$i][8]);
335 push(@diviso2,$matrix_value[$i][9]);
336 #print "giro $i\n";
337 for ($j=0; $j<=$stop;$j++){
338 # if ("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"){
339 $distanza=($matrix_value[$i+1][3]-$matrix_value[$i][4]);
340 #print "distanza fra i due $distanza\n";
341 #exit;
342 if (($distanza > $dist_max_peaks) || ($i==$stop)||(!("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"))){
343 $j=$stop;
344 $fine=$matrix_value[$i][4];
345 }
346 elsif (($distanza <= $dist_max_peaks)&&("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]")){
347 $i++;
348 push(@log_ratio,$matrix_value[$i][5]);
349 push(@log_ratio2,$matrix_value[$i][8]);
350 push(@diviso2,$matrix_value[$i][9]);
351 $fine=$matrix_value[$i][4];
352 $j++;
353 }
354
355 # }
356 }
357 foreach $n (@log_ratio) {
358 $somma+=$n;
359 }
360 foreach $n1 (@log_ratio2) {
361 $somma2+=$n1;
362 }
363 foreach $n2 (@diviso2) {
364 $diviso2+=$n2;
365 }
366 $media = $somma/(@log_ratio);
367 $media2 = $somma2/(@log_ratio2);
368 $somma_media=(sqrt($diviso2)+$media);
369 $somma_score=(sqrt($diviso2)+$media2);
370 $somma_media = sprintf "%.2f", $somma_media;
371 $somma_score = sprintf "%.2f", $somma_score;
372 #$somma_score = $somma_score.$i
373 #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo);
374 if ($outfile){
375 print OUTFILE "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n";
376 }else{
377 print "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n";
378 }
379
380
381 }
382 }
383
384 ####################################################################################################################################
385
386 sub printusage {
387
388 print<<eoc22334;
389
390
391
392 ***************************************************
393 :: N i m b l e G e n C h i p A n a l y s i s ::
394 ***************************************************
395
396
397 USAGE SUMMARY
398 ---------------------------------------------------------------------------------
399 This program utilizes NimbleGen ratio files in gff format as INPUT FILE and
400 provides a table of the computed picks in the same gff file format.
401
402
403
404 --in [input filename]
405 --perc [percentile value, it is used to calculate the threshold rate based
406 on dataset distribution to filter out background ratio; i.e. 0.99]
407 OR
408 --fc [fold change value, it is used as fixed threshold to filter out
409 background ratio, i.e. 2]
410 --t [type of analysis; p, performs peaks determination based on p-value inference;
411 s, performs peaks determination based on a scoring system]
412 --log (required only for p-value based analysis)
413 [log2(p-value), cutoff integer to be used to identify a significant peak; i.e. 5]
414 --num [minimal number of consecutive probes used to define a peak; i.e. 3]
415 --dist [greatest nucleotide distance between two probes or between two peaks that
416 allow to consider the signals as belonging to the same peak]
417 --w [window length]
418
419 --out [output filename (optional)]
420 -----------------------------------------------------------------------------------------
421 EXAMPLES:
422
423 perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500
424 OR
425 perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500
426
427 -----------------------------------------------------------------------------------------
428
429 eoc22334
430 exit 0;
431
432 }