Mercurial > repos > matces > carpet_toolsuite
comparison carpet-src-1/tools/CARPET/PeakPeaker2.pl @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cdd489d98766 |
---|---|
1 #! /usr/bin/perl | |
2 | |
3 # Copyright 2009 Matteo Cesaroni, Lucilla Luzi | |
4 # | |
5 # This program is free software; ; you can redistribute it and/or modify | |
6 # it under the terms of the GNU Lesser General Public License as published by | |
7 # the Free Software Foundation; either version 3 of the License, or (at your | |
8 # option) any later version. | |
9 # | |
10 # This program is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 | |
15 | |
16 use Getopt::Long; | |
17 | |
18 GetOptions ( | |
19 "help" => \$OPT{help}, | |
20 "in=s" => \$OPT{fname}, | |
21 "perc=s" => \$OPT{value}, | |
22 "fc=s" => \$OPT{fc_value}, | |
23 "log=s" => \$OPT{valore_log}, | |
24 "t=s" => \$OPT{type_peak}, | |
25 "num=s" => \$OPT{num_probe}, | |
26 "dist=s" => \$OPT{dist_max}, | |
27 "dist_peaks=s" => \$OPT{dist_max_peaks}, | |
28 "w=s"=> \$OPT{s_windows}, | |
29 "col3=s" => \$OPT{col3}, | |
30 "f_pv=s" => \$OPT{file_pv}, | |
31 "out=s"=> \$OPT{out} | |
32 | |
33 )|| printusage(); | |
34 | |
35 # opzioni da linea di comando | |
36 my $value=$OPT{value}; | |
37 my $fc_value=$OPT{fc_value}; | |
38 my $valore_log=$OPT{valore_log}; | |
39 my $type_peak=$OPT{type_peak}; | |
40 my $num_probe=$OPT{num_probe}; | |
41 my $dist_max=$OPT{dist_max}; | |
42 my $dist_max_peaks=$OPT{dist_max_peaks}; | |
43 my $fname=$OPT{fname}; | |
44 my $help=$OPT{help}; | |
45 my $s_windows=$OPT{s_windows}; | |
46 my $col3=$OPT{col3}; | |
47 my $outfile=$OPT{out}; | |
48 my $outfile_pv=$OPT{file_pv}, | |
49 | |
50 # "usage" se c'e' un help | |
51 $help and printusage(); | |
52 | |
53 # "usage" se non ci sono opzioni | |
54 if (!$s_windows || !$fname || (!$value && !$fc_value) || !$type_peak || !$num_probe || !$dist_max || (($type_peak eq "p") && !$valore_log)){ | |
55 &printusage() | |
56 }; | |
57 | |
58 | |
59 qx {sort -k 1,1 -k 4,4n $fname >$fname.sortato}; | |
60 | |
61 | |
62 open(FILE, "<$fname.sortato") or die "Cannot find file $fname.sortato\n"; | |
63 open(pvalue, ">$outfile_pv") or die "Cannot open file $outfile_pv: $!\n"; | |
64 #open(buonitutti, "> buoni_$fname") or die "Cannot find file $fname: $!\n"; | |
65 #loop th rought line-by-line until the end of the file and then push each line into an empty array, called @array# | |
66 print pvalue "#p-value track ($value) \n"; | |
67 | |
68 | |
69 @array= (); | |
70 $conto_tutti_sopra=0; | |
71 $conto_tutti=0; | |
72 while ($line = <FILE>){ | |
73 chomp ($line); | |
74 if ($line=~/track/g){next;} | |
75 if ($line=~/#/g){next;} | |
76 if ($line=~/^\s+$/g){next;} | |
77 push (@array,$line); | |
78 @value_perc=split("\t",$line); | |
79 push (@percentile,$value_perc[5]); | |
80 if ($value_perc[5]>=$fc_value){ | |
81 $conto_tutti_sopra++; | |
82 } | |
83 elsif ($value_perc[5]<$fc_value){ | |
84 $conto_tutti++; | |
85 } | |
86 } | |
87 | |
88 close (FILE); | |
89 | |
90 if (!$fc_value){ | |
91 @perc_ordine=sort(@percentile); | |
92 $position=(($value*($#perc_ordine+1))-1); | |
93 $valore_percentile=$perc_ordine[$position]; | |
94 #print "il percentile è $valore_percentile\n"; | |
95 #print "il max è $perc_ordine[$#perc_ordine]\n"; | |
96 #print "il min è $perc_ordine[1]\n"; | |
97 $probabilita=1-$value; | |
98 } | |
99 else { | |
100 $valore_percentile=$fc_value; | |
101 $tuttitutti=($conto_tutti+$conto_tutti_sopra); | |
102 #print"il numero totaledi probe è $tuttitutti\n"; | |
103 #print"il numero totale di probe sopra è $conto_tutti_sopra\n"; | |
104 $probabilita=$conto_tutti_sopra/($conto_tutti+$conto_tutti_sopra); | |
105 #print "la proba= $probabilita\n"; | |
106 } | |
107 | |
108 #print buonitutti"il percentile e $valore_percentile\n"; | |
109 | |
110 | |
111 | |
112 ($one_ref,$two_ref)=&probe_cutoff(@array); | |
113 | |
114 if ($type_peak eq "p"){ | |
115 @forse_niente=&peak(@$one_ref); | |
116 } | |
117 elsif ($type_peak eq "s"){ | |
118 @forse_niente=&peak(@$two_ref); | |
119 } | |
120 | |
121 &double_peak(@forse_niente); | |
122 | |
123 # | |
124 # End process | |
125 # | |
126 close OUTFILE; | |
127 unlink "$fname.sortato"; | |
128 exit 0; | |
129 | |
130 ################################################################################################ | |
131 ######################## SUBROUTINE ############################### | |
132 ################################################################################################ | |
133 | |
134 sub probe_cutoff | |
135 { | |
136 | |
137 $c=-6; | |
138 | |
139 foreach $linea(@array){ | |
140 @subarray1 = split("\t",$linea); | |
141 $inizio=(((($subarray1[4]-$subarray1[3])/2)+$subarray1[3])-($s_windows/2)); #per modificare la windows cambia il 500 e il 1000 | |
142 $fine=$inizio+$s_windows; #windows 500 -->250 e 500 windows 1000 --> 500 e 1000 | |
143 $counter=0; | |
144 $counter_value=0; | |
145 | |
146 if($subarray1[5]>=$valore_percentile){ | |
147 push (@array_probe,[@subarray1]); | |
148 print buonitutti "$linea\n"; | |
149 } | |
150 | |
151 for($i=$c;$i<=$#array;$i++) { | |
152 if ($i<0){ | |
153 next; | |
154 } | |
155 @subarray = split("\t",$array[$i]); | |
156 if (($subarray[3]>=$inizio)&&($subarray[3]<$fine)&&("$subarray[0]" eq "$subarray1[0]")){ | |
157 $counter++; | |
158 if ($subarray[5]>= $valore_percentile){ | |
159 $counter_value++; | |
160 } | |
161 } | |
162 if (($subarray[3]>$fine)||("$subarray[0]" ne "$subarray1[0]")){ | |
163 $cazzo=$subarray[3]; | |
164 last; | |
165 } | |
166 | |
167 } | |
168 $c++; | |
169 if ($counter !=0){ | |
170 $chi_sq = (((($counter_value - ($probabilita*$counter))**2)/($probabilita*$counter))+(((($counter-$counter_value)-((1-$probabilita)*$counter))**2)/((1-$probabilita)*$counter))); | |
171 } | |
172 else{ | |
173 $chi_sq = NA; | |
174 } | |
175 | |
176 use Statistics::Distributions; | |
177 $chisprob=Statistics::Distributions::chisqrprob (1,$chi_sq); | |
178 if ($chisprob == 0){ | |
179 $log_10=100; | |
180 } | |
181 else { | |
182 $log_10 = -log($chisprob)/log(10); | |
183 } | |
184 | |
185 print pvalue "$subarray1[0]\t$subarray1[1]\tpv_$subarray1[2]\t$subarray1[3]\t$subarray1[4]\t$log_10\t$subarray1[6]\t$subarray1[7]\t$subarray1[5]\n"; | |
186 | |
187 if ($log_10>=$valore_log){ | |
188 #print"$subarray1[3],$inizio,$fine,$counter,$counter_value,$chi_sq,$chisprob,$log_10\n"; | |
189 @proviamo=($subarray1[0],$subarray1[1],$subarray1[2],$inizio,$fine,$log_10,$subarray1[6],$subarray1[7],$subarray1[5]); | |
190 push (@matrix_pvalue,[@proviamo]); | |
191 | |
192 } | |
193 } | |
194 @result_table=@matrix_pvalue; | |
195 @result_table2=@array_probe; | |
196 return(\@result_table,\@result_table2); | |
197 | |
198 } | |
199 | |
200 | |
201 | |
202 | |
203 | |
204 ################################################################################################ | |
205 | |
206 | |
207 | |
208 sub peak | |
209 { | |
210 @matrix_value=@_; | |
211 | |
212 $stop=$#matrix_value; | |
213 | |
214 for ($cio=0; $cio<=$stop;$cio++) { | |
215 @log_ratio=(); | |
216 @chilosa=(); | |
217 $inizio=$matrix_value[$cio][3]; | |
218 $somma=0; | |
219 $media=0; | |
220 $sommachilo=0; | |
221 $mediachilo=0; | |
222 $diviso=0; | |
223 push(@log_ratio,$matrix_value[$cio][5]); | |
224 if ($matrix_value[$cio][8]>=$valore_percentile){ | |
225 push(@chilosa,$matrix_value[$cio][8]); | |
226 } | |
227 for ($j=0; $j<=$stop;$j++){ | |
228 $distanza=($matrix_value[$cio+1][3]-$matrix_value[$cio][4]); | |
229 if (($distanza > $dist_max) || ($cio==$stop)||(!("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]"))){ | |
230 $j=$stop; | |
231 } | |
232 elsif (($distanza <= $dist_max)&&("$matrix_value[$cio+1][0]" eq "$matrix_value[$cio][0]")){ | |
233 $cio++; | |
234 push(@log_ratio,$matrix_value[$cio][5]); | |
235 $fine=$matrix_value[$cio][4]; | |
236 $inizio3=$inizio; | |
237 $j++; | |
238 if ($matrix_value[$cio][8]>=$valore_percentile){ | |
239 push(@chilosa,$matrix_value[$cio][8]); | |
240 } | |
241 } | |
242 | |
243 | |
244 } | |
245 $numeroprobes=($#chilosa+1); | |
246 if ((($#log_ratio+1)>=$num_probe) && (($type_peak eq "s") ||(($#chilosa+1)>=$num_probe))){ | |
247 foreach $n (@log_ratio) { | |
248 $somma+=$n; | |
249 } | |
250 if($type_peak eq "s"){ | |
251 @chilosa=@log_ratio; | |
252 } | |
253 foreach $nchilo (@chilosa) { | |
254 $sommachilo+=$nchilo; | |
255 } | |
256 $diviso=$#log_ratio+1; | |
257 $media = $somma/(@log_ratio); | |
258 $mediachilo=$sommachilo/(@chilosa); | |
259 #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo); | |
260 #$moltipli=(($#log_ratio+1)*$media); | |
261 #$somma_media=((sqrt($#log_ratio+1))+$media); | |
262 if ($type_peak eq "p"){ | |
263 $inizio3=int($inizio3+(($s_windows/2)-25)); | |
264 $fine=int($fine-(($s_windows/2)-25)); | |
265 } | |
266 @proviamo_double=($matrix_value[$cio][0],$matrix_value[$cio][1],$col3,$inizio3,$fine,$media,$matrix_value[$cio][6],$matrix_value[$cio][7],$mediachilo,$diviso); | |
267 push (@matrix_for_double,[@proviamo_double]); | |
268 | |
269 } | |
270 } | |
271 | |
272 @result_table=@matrix_for_double; | |
273 return(@result_table); | |
274 } | |
275 | |
276 | |
277 ################################################################################################ | |
278 | |
279 | |
280 | |
281 sub double_peak | |
282 { | |
283 @matrix_value=@_; | |
284 | |
285 $stop=$#matrix_value; | |
286 $i=0; | |
287 $j=0; | |
288 #print $stop; | |
289 | |
290 # | |
291 # Print output file | |
292 # | |
293 if ($outfile){ | |
294 open (OUTFILE, ">$outfile"); | |
295 } | |
296 | |
297 # | |
298 # print File Header | |
299 # | |
300 my $header=<<e0c6654; | |
301 \# infile: $fname | |
302 \# percentile value: $value=$valore_percentile | |
303 \# fold change: $fc_value | |
304 \# type of analysis: $type_peak | |
305 \# log(pval analysis): $valore_log | |
306 \# num (probe defining a peak): $num_probe | |
307 \# dist: $dist_max | |
308 \# window length: $s_windows | |
309 track name=$col3 description="peaks find" visibility=2 | |
310 e0c6654 | |
311 | |
312 if ($outfile){ | |
313 print OUTFILE "$header"; | |
314 }else{ | |
315 print "$header"; | |
316 } | |
317 if ($type_peak eq "p"){ | |
318 print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys =p-value (log=$valore_log), dist peaks=$dist_max_peaks"; | |
319 } | |
320 if ($type_peak eq "s"){ | |
321 print "perc value=$value=$valore_percentile, #probes=$num_probe (dist=$dist_max), window=$s_windows, type analisys=score, dist peaks=$dist_max_peaks"; | |
322 } | |
323 for ($i=0; $i<=$stop;$i++) { | |
324 @log_ratio=(); | |
325 @log_ratio2=(); | |
326 @diviso2=(); | |
327 $inizio=$matrix_value[$i][3]; | |
328 $somma=0; | |
329 $somma2=0; | |
330 $media=0; | |
331 $media2=0; | |
332 $diviso2=0; | |
333 push(@log_ratio,$matrix_value[$i][5]); | |
334 push(@log_ratio2,$matrix_value[$i][8]); | |
335 push(@diviso2,$matrix_value[$i][9]); | |
336 #print "giro $i\n"; | |
337 for ($j=0; $j<=$stop;$j++){ | |
338 # if ("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"){ | |
339 $distanza=($matrix_value[$i+1][3]-$matrix_value[$i][4]); | |
340 #print "distanza fra i due $distanza\n"; | |
341 #exit; | |
342 if (($distanza > $dist_max_peaks) || ($i==$stop)||(!("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]"))){ | |
343 $j=$stop; | |
344 $fine=$matrix_value[$i][4]; | |
345 } | |
346 elsif (($distanza <= $dist_max_peaks)&&("$matrix_value[$i+1][0]" eq "$matrix_value[$i][0]")){ | |
347 $i++; | |
348 push(@log_ratio,$matrix_value[$i][5]); | |
349 push(@log_ratio2,$matrix_value[$i][8]); | |
350 push(@diviso2,$matrix_value[$i][9]); | |
351 $fine=$matrix_value[$i][4]; | |
352 $j++; | |
353 } | |
354 | |
355 # } | |
356 } | |
357 foreach $n (@log_ratio) { | |
358 $somma+=$n; | |
359 } | |
360 foreach $n1 (@log_ratio2) { | |
361 $somma2+=$n1; | |
362 } | |
363 foreach $n2 (@diviso2) { | |
364 $diviso2+=$n2; | |
365 } | |
366 $media = $somma/(@log_ratio); | |
367 $media2 = $somma2/(@log_ratio2); | |
368 $somma_media=(sqrt($diviso2)+$media); | |
369 $somma_score=(sqrt($diviso2)+$media2); | |
370 $somma_media = sprintf "%.2f", $somma_media; | |
371 $somma_score = sprintf "%.2f", $somma_score; | |
372 #$somma_score = $somma_score.$i | |
373 #$somma_media_chilo=((sqrt($#chilosa+1))+$mediachilo); | |
374 if ($outfile){ | |
375 print OUTFILE "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n"; | |
376 }else{ | |
377 print "$matrix_value[$i][0]\t$matrix_value[$i][1]\t$matrix_value[$i][2]\t$inizio\t$fine\t$somma_media\t$matrix_value[$i][6]\t$matrix_value[$i][7]\t$somma_score$i\n"; | |
378 } | |
379 | |
380 | |
381 } | |
382 } | |
383 | |
384 #################################################################################################################################### | |
385 | |
386 sub printusage { | |
387 | |
388 print<<eoc22334; | |
389 | |
390 | |
391 | |
392 *************************************************** | |
393 :: N i m b l e G e n C h i p A n a l y s i s :: | |
394 *************************************************** | |
395 | |
396 | |
397 USAGE SUMMARY | |
398 --------------------------------------------------------------------------------- | |
399 This program utilizes NimbleGen ratio files in gff format as INPUT FILE and | |
400 provides a table of the computed picks in the same gff file format. | |
401 | |
402 | |
403 | |
404 --in [input filename] | |
405 --perc [percentile value, it is used to calculate the threshold rate based | |
406 on dataset distribution to filter out background ratio; i.e. 0.99] | |
407 OR | |
408 --fc [fold change value, it is used as fixed threshold to filter out | |
409 background ratio, i.e. 2] | |
410 --t [type of analysis; p, performs peaks determination based on p-value inference; | |
411 s, performs peaks determination based on a scoring system] | |
412 --log (required only for p-value based analysis) | |
413 [log2(p-value), cutoff integer to be used to identify a significant peak; i.e. 5] | |
414 --num [minimal number of consecutive probes used to define a peak; i.e. 3] | |
415 --dist [greatest nucleotide distance between two probes or between two peaks that | |
416 allow to consider the signals as belonging to the same peak] | |
417 --w [window length] | |
418 | |
419 --out [output filename (optional)] | |
420 ----------------------------------------------------------------------------------------- | |
421 EXAMPLES: | |
422 | |
423 perl program.pl --in 75340_ratio.gff --perc 0.98 --t s --num 3 --dist 250 --w 500 | |
424 OR | |
425 perl program.pl --in 75340_ratio.gff --perc 0.98 --t p --log 5 --num 3 --dist 250 --w 500 | |
426 | |
427 ----------------------------------------------------------------------------------------- | |
428 | |
429 eoc22334 | |
430 exit 0; | |
431 | |
432 } |