Mercurial > repos > mcharles > rapsosnp
comparison rapsodyn/mpileupfilterandstat.pl @ 0:442a7c88b886 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 10 Sep 2014 09:18:15 -0400 |
parents | |
children | 3f7b0788a1c4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:442a7c88b886 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use Getopt::Long; | |
4 | |
5 # | |
6 # Filter a pileup file on forward/reverse presence and %read having the variant | |
7 # The error code | |
8 # 1 : multiple variant type detected insertion/deletion/mutation | |
9 # 1i : inconsistency in insertion | |
10 # 1d : inconsistency in deletion | |
11 # 1m : inconsistency in mutation | |
12 # 2 : insufficient depth | |
13 # 3 : insufficient variant frequency | |
14 # 4 : variant position not covered by forward and reverse reads | |
15 # 5 : variant with other variant in neighbourhood | |
16 # 6 : too much depth | |
17 # 8 : parsing error (couldn't parse the mpileup line correctly) | |
18 # 9 : parsing error (couldn't parse the readbase string correctly) | |
19 | |
20 | |
21 my $inputfile; | |
22 my $logfile; | |
23 my $MIN_DISTANCE=0; | |
24 my $MIN_VARIANTFREQUENCY=0; | |
25 my $MIN_FORWARDREVERSE=0; | |
26 my $MIN_DEPTH=0; | |
27 my $MAX_DEPTH=500; | |
28 my $VERBOSE=0; | |
29 my $ONLY_UNFILTERED_VARIANT="OFF"; | |
30 my $DO_STAT="NO"; | |
31 | |
32 | |
33 my $STAT_MIN_DEPTH_MIN = 2; | |
34 my $STAT_MIN_DEPTH_MAX = 10; | |
35 my $STAT_MIN_DEPTH_STEP = 2; | |
36 my $STAT_MAX_DEPTH_MIN = 100; | |
37 my $STAT_MAX_DEPTH_MAX = 200; | |
38 my $STAT_MAX_DEPTH_STEP = 100; | |
39 my $STAT_FREQ_MIN = 0.8; | |
40 my $STAT_FREQ_MAX = 1; | |
41 my $STAT_FREQ_STEP = 0.1; | |
42 my $STAT_DIST_MIN = 0; | |
43 my $STAT_DIST_MAX = 50; | |
44 my $STAT_DIST_STEP = 50; | |
45 | |
46 GetOptions ( | |
47 "input_file=s" => \$inputfile, | |
48 "log_file=s" => \$logfile, | |
49 "min_depth=i" => \$MIN_DEPTH, | |
50 "max_depth=i" => \$MAX_DEPTH, | |
51 "min_frequency=f" => \$MIN_VARIANTFREQUENCY, | |
52 "min_distance=i" => \$MIN_DISTANCE, | |
53 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, | |
54 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT, | |
55 "v=i" => \$VERBOSE, | |
56 "do_stat=s" => \$DO_STAT, | |
57 "stat_min_depth_min=i" => \$STAT_MIN_DEPTH_MIN, | |
58 "stat_min_depth_max=i" => \$STAT_MIN_DEPTH_MAX, | |
59 "stat_min_depth_step=i" => \$STAT_MIN_DEPTH_STEP, | |
60 "stat_max_depth_min=i" => \$STAT_MAX_DEPTH_MIN, | |
61 "stat_max_depth_max=i" => \$STAT_MAX_DEPTH_MAX, | |
62 "stat_max_depth_step=i" => \$STAT_MAX_DEPTH_STEP, | |
63 "stat_freq_min=f" => \$STAT_FREQ_MIN, | |
64 "stat_freq_max=f" => \$STAT_FREQ_MAX, | |
65 "stat_freq_step=f" => \$STAT_FREQ_STEP, | |
66 "stat_dist_min=i" => \$STAT_DIST_MIN, | |
67 "stat_dist_max=i" => \$STAT_DIST_MAX, | |
68 "stat_dist_step=i" => \$STAT_DIST_STEP | |
69 ) or die("Error in command line arguments\n"); | |
70 | |
71 open(IF, $inputfile) or die("Can't open $inputfile\n"); | |
72 | |
73 my @tbl_line; | |
74 my %USR_PARAM; | |
75 $USR_PARAM{"min_depth"} = $MIN_DEPTH; | |
76 $USR_PARAM{"max_depth"} = $MAX_DEPTH; | |
77 $USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY; | |
78 $USR_PARAM{"min_dist"} = $MIN_DISTANCE; | |
79 $USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE; | |
80 | |
81 | |
82 | |
83 #Extraction des variants | |
84 my $nb_line=0; | |
85 while (my $line=<IF>){ | |
86 $nb_line++; | |
87 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ | |
88 print "$nb_line\n"; | |
89 } | |
90 my $error_code=0; | |
91 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ | |
92 my $current_chromosome = $1; | |
93 my $current_position = $2; | |
94 my $current_refbase = $3; | |
95 my $current_coverage = $4; | |
96 my $current_readbase_string = $5; | |
97 my $current_quality_string = $6; | |
98 | |
99 #Suppression of mPileUp special character | |
100 $current_readbase_string =~ s/\$//g; #the read start at this position | |
101 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char | |
102 | |
103 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ | |
104 my %variant; | |
105 $variant{"line"} = $line; | |
106 $variant{"chr"} = $current_chromosome; | |
107 $variant{"pos"} = $current_position; | |
108 $variant{"refbase"} = $current_refbase; | |
109 $variant{"coverage"} = $current_coverage; | |
110 $variant{"readbase"} = $current_readbase_string; | |
111 $variant{"quality"} = $current_quality_string; | |
112 push(@tbl_line,\%variant); | |
113 | |
114 if ($ONLY_UNFILTERED_VARIANT eq "ON"){ | |
115 print $line; | |
116 } | |
117 | |
118 } | |
119 else { | |
120 #Position with no variant | |
121 } | |
122 | |
123 } | |
124 else { | |
125 #Error Parsing | |
126 print STDERR "$line #8"; | |
127 } | |
128 } | |
129 close(IF); | |
130 | |
131 if ($ONLY_UNFILTERED_VARIANT eq "ON"){ | |
132 exit(0); | |
133 } | |
134 | |
135 ####Checking the distance between variant and other filter | |
136 | |
137 | |
138 my @error; | |
139 for (my $i=0;$i<=$#tbl_line;$i++){ | |
140 # print "ligne : $tbl_line[$i]\n"; | |
141 my $before=""; | |
142 my $after=""; | |
143 my %line = %{$tbl_line[$i]}; | |
144 | |
145 if ($tbl_line[$i-1]){ | |
146 $before = $tbl_line[$i-1]; | |
147 } | |
148 if ($tbl_line[$i+1]){ | |
149 $after = $tbl_line[$i+1]; | |
150 } | |
151 my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM); | |
152 if ($error_code == 0){ | |
153 print $line{"line"}; | |
154 } | |
155 else { | |
156 push(@error,$error_code,"\t",$line{"line"}); | |
157 } | |
158 } | |
159 | |
160 ### LOG | |
161 open(LF,">$logfile") or die ("Can't open $logfile\n"); | |
162 | |
163 if ($DO_STAT eq "YES"){ | |
164 for (my $idx_min_depth=$STAT_MIN_DEPTH_MIN;$idx_min_depth<=$STAT_MIN_DEPTH_MAX;$idx_min_depth = $idx_min_depth + $STAT_MIN_DEPTH_STEP ){ | |
165 for (my $idx_max_depth=$STAT_MAX_DEPTH_MIN;$idx_max_depth<=$STAT_MAX_DEPTH_MAX;$idx_max_depth = $idx_max_depth + $STAT_MAX_DEPTH_STEP ){ | |
166 for (my $idx_freq = $STAT_FREQ_MIN;$idx_freq<=$STAT_FREQ_MAX;$idx_freq= $idx_freq+$STAT_FREQ_STEP){ | |
167 for (my $idx_dist=$STAT_DIST_MIN;$idx_dist<=$STAT_DIST_MAX;$idx_dist = $idx_dist + $STAT_DIST_STEP){ | |
168 for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){ | |
169 my %stat_param; | |
170 $stat_param{"min_depth"}=$idx_min_depth; | |
171 $stat_param{"max_depth"}=$idx_max_depth; | |
172 $stat_param{"min_freq"}=$idx_freq; | |
173 $stat_param{"min_fr"}=$idx_fr; | |
174 $stat_param{"min_dist"}=$idx_dist; | |
175 | |
176 print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n"; | |
177 } | |
178 } | |
179 } | |
180 print "\n"; | |
181 } | |
182 } | |
183 } | |
184 | |
185 | |
186 for (my $i=0;$i<=$#error;$i++){ | |
187 print LF $error[$i]; | |
188 } | |
189 close (LF); | |
190 | |
191 | |
192 | |
193 | |
194 sub test_check{ | |
195 my $ref_tbl_line = shift; | |
196 my $ref_param = shift; | |
197 my @tbl_line = @$ref_tbl_line; | |
198 my %param = %$ref_param; | |
199 my $nb=0; | |
200 | |
201 for (my $i=0;$i<=$#tbl_line;$i++){ | |
202 my $before=""; | |
203 my $after=""; | |
204 my %line = %{$tbl_line[$i]}; | |
205 | |
206 if ($tbl_line[$i-1]){ | |
207 $before = $tbl_line[$i-1]; | |
208 } | |
209 if ($tbl_line[$i+1]){ | |
210 $after = $tbl_line[$i+1]; | |
211 } | |
212 my $error_code = check_error($tbl_line[$i],$before,$after,\%param); | |
213 if ($error_code == 0){ | |
214 $nb++; | |
215 } | |
216 } | |
217 | |
218 return $nb; | |
219 } | |
220 | |
221 sub check_error{ | |
222 my $refline = shift; | |
223 my %line = %$refline; | |
224 my $refbefore = shift; | |
225 my $refafter = shift; | |
226 my $refparam = shift; | |
227 my %param = %$refparam; | |
228 | |
229 | |
230 my $current_chromosome = $line{"chr"}; | |
231 my $current_position = $line{"pos"}; | |
232 my $current_refbase = $line{"refbase"}; | |
233 my $current_coverage = $line{"coverage"}; | |
234 my $current_readbase_string = $line{"readbase"}; | |
235 | |
236 | |
237 my $min_depth = $param{"min_depth"}; | |
238 my $max_depth = $param{"max_depth"}; | |
239 my $min_variant_frequency = $param{"min_freq"}; | |
240 my $min_forward_reverse = $param{"min_fr"}; | |
241 my $min_dist = $param{"min_dist"}; | |
242 | |
243 #Verification of neightbourhood | |
244 if ($refbefore){ | |
245 my %compareline = %$refbefore; | |
246 my $compare_chromosome = $compareline{"chr"}; | |
247 my $compare_position = $compareline{"pos"}; | |
248 my $compare_refbase = $compareline{"refbase"}; | |
249 my $compare_coverage = $compareline{"coverage"}; | |
250 my $compare_readbase_string = $compareline{"readbase"}; | |
251 | |
252 if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){ | |
253 return 5; | |
254 } | |
255 } | |
256 | |
257 if ($refafter){ | |
258 my %compareline = %$refafter; | |
259 my $compare_chromosome = $compareline{"chr"}; | |
260 my $compare_position = $compareline{"pos"}; | |
261 my $compare_refbase = $compareline{"refbase"}; | |
262 my $compare_coverage = $compareline{"coverage"}; | |
263 my $compare_readbase_string = $compareline{"readbase"}; | |
264 | |
265 if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){ | |
266 return 5; | |
267 } | |
268 } | |
269 | |
270 | |
271 | |
272 | |
273 #Extraction of insertions | |
274 | |
275 ################################################################## | |
276 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; | |
277 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; | |
278 # print "IN : @IN\n"; | |
279 # print "DEL :@DEL\n"; | |
280 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; | |
281 ################################################################## | |
282 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct | |
283 ################################################################## | |
284 | |
285 # => parcours de boucle | |
286 my @readbase = split(//,$current_readbase_string); | |
287 my $cleaned_readbase_string=""; | |
288 my @IN; | |
289 my @DEL; | |
290 my $current_IN=""; | |
291 my $current_DEL=""; | |
292 my $current_size=0; | |
293 | |
294 for (my $i=0;$i<=$#readbase;$i++){ | |
295 if ($readbase[$i] eq "+"){ | |
296 #Ouverture de IN | |
297 $current_IN="+"; | |
298 | |
299 #Recuperation de la taille | |
300 my $sub = substr $current_readbase_string,$i; | |
301 if ($sub=~/^\+(\d+)/){ | |
302 $current_size = $1; | |
303 } | |
304 my $remaining_size = $current_size; | |
305 while (($remaining_size>0)&&($i<=$#readbase)){ | |
306 $i++; | |
307 $current_IN.=$readbase[$i]; | |
308 if ($readbase[$i]=~ /[ATGCNatgcn]/){ | |
309 $remaining_size--; | |
310 } | |
311 } | |
312 push(@IN,$current_IN); | |
313 } | |
314 elsif ($readbase[$i] eq "-"){ | |
315 #Ouverture de DEL | |
316 $current_DEL="-"; | |
317 | |
318 #Recuperation de la taille | |
319 my $sub = substr $current_readbase_string,$i; | |
320 if ($sub=~/^\-(\d+)/){ | |
321 $current_size = $1; | |
322 } | |
323 my $remaining_size = $current_size; | |
324 while (($remaining_size>0)&&($i<=$#readbase)){ | |
325 $i++; | |
326 $current_DEL.=$readbase[$i]; | |
327 if ($readbase[$i]=~ /[ATGCNatgcn]/){ | |
328 $remaining_size--; | |
329 } | |
330 } | |
331 push(@DEL,$current_DEL); | |
332 | |
333 } | |
334 else { | |
335 #Ajout a la string | |
336 $cleaned_readbase_string .= $readbase[$i]; | |
337 } | |
338 } | |
339 | |
340 | |
341 # print "IN : @IN\n"; | |
342 # print "DEL :@DEL\n"; | |
343 # print "$cleaned_readbase_string\n"; | |
344 | |
345 my @current_readbase_array = split(//,$cleaned_readbase_string); | |
346 | |
347 #Filtering : error detection | |
348 | |
349 if ($#current_readbase_array+1 != $current_coverage){ | |
350 return 9; | |
351 #parsing error (couldn't parse the readbase string correctly) | |
352 } | |
353 elsif ($current_coverage<$min_depth){ | |
354 return 2; | |
355 # 2 : insufficient depth | |
356 } | |
357 elsif ($current_coverage>$max_depth){ | |
358 return 6; | |
359 # 6 : too much depth | |
360 } | |
361 else { | |
362 if ($#IN>=0){ | |
363 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ | |
364 return 1; | |
365 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) | |
366 } | |
367 else { | |
368 ########## TEST de coherence des insertions ################ | |
369 # for (my $i=0;$i<=$#IN;$i++){ | |
370 # if (uc($IN[0]) ne uc($IN[$i])){ | |
371 # print uc($IN[0]),"\n"; | |
372 # print uc($IN[$i]),"\n"; | |
373 # return "1i"; | |
374 # } | |
375 # } | |
376 ########################################################### | |
377 | |
378 if($#IN+1 < $current_coverage*$min_variant_frequency ){ | |
379 return 3; | |
380 # 3 : insufficient variant frequency | |
381 } | |
382 } | |
383 } | |
384 elsif ($#DEL>=0){ | |
385 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ | |
386 return 1; | |
387 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) | |
388 } | |
389 else { | |
390 ########## TEST de coherence des deletions ################ | |
391 # for (my $i=0;$i<=$#DEL;$i++){ | |
392 # if (uc($DEL[0]) ne uc($DEL[$i])){ | |
393 # print uc($DEL[0]),"\n"; | |
394 # print uc($DEL[$i]),"\n"; | |
395 # return "1d"; | |
396 # } | |
397 # } | |
398 ########################################################### | |
399 | |
400 if($#DEL+1 < $current_coverage*$min_variant_frequency){ | |
401 return 3; | |
402 # 3 : insufficient variant frequency | |
403 } | |
404 } | |
405 } | |
406 else { | |
407 my $nbA=0; | |
408 $nbA++ while ($current_readbase_string =~ m/A/g); | |
409 my $nbC=0; | |
410 $nbC++ while ($current_readbase_string =~ m/C/g); | |
411 my $nbT=0; | |
412 $nbT++ while ($current_readbase_string =~ m/T/g); | |
413 my $nbG=0; | |
414 $nbG++ while ($current_readbase_string =~ m/G/g); | |
415 my $nbN=0; | |
416 $nbN++ while ($current_readbase_string =~ m/N/g); | |
417 my $nba=0; | |
418 $nba++ while ($current_readbase_string =~ m/a/g); | |
419 my $nbc=0; | |
420 $nbc++ while ($current_readbase_string =~ m/c/g); | |
421 my $nbt=0; | |
422 $nbt++ while ($current_readbase_string =~ m/t/g); | |
423 my $nbg=0; | |
424 $nbg++ while ($current_readbase_string =~ m/g/g); | |
425 my $nbn=0; | |
426 $nbn++ while ($current_readbase_string =~ m/n/g); | |
427 | |
428 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ | |
429 return "1m"; | |
430 } | |
431 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ | |
432 return "1m"; | |
433 } | |
434 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ | |
435 return "1m"; | |
436 } | |
437 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ | |
438 return "1m"; | |
439 } | |
440 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ | |
441 return "1m"; | |
442 } | |
443 | |
444 if ($nbA+$nba >= $current_coverage*$min_variant_frequency){ | |
445 if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){ | |
446 return 4; | |
447 # 4 : variant position not covered by forward and reverse reads | |
448 } | |
449 } | |
450 elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){ | |
451 if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){ | |
452 return 4; | |
453 # 4 : variant position not covered by forward and reverse reads | |
454 } | |
455 } | |
456 elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){ | |
457 if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){ | |
458 return 4; | |
459 # 4 : variant position not covered by forward and reverse reads | |
460 } | |
461 } | |
462 elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){ | |
463 if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){ | |
464 return 4; | |
465 # 4 : variant position not covered by forward and reverse reads | |
466 } | |
467 } | |
468 elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){ | |
469 if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){ | |
470 return 4; | |
471 # 4 : variant position not covered by forward and reverse reads | |
472 } | |
473 } | |
474 else { | |
475 return 3; | |
476 # 3 : insufficient variant frequency | |
477 } | |
478 } | |
479 } | |
480 | |
481 return 0; | |
482 } |