Mercurial > repos > mcharles > rapsodyn
comparison rapsodyn/mpileupfilterandstat.pl @ 24:e8e6b962c1f2 draft
Uploaded
author | mcharles |
---|---|
date | Fri, 05 Sep 2014 06:12:10 -0400 |
parents | |
children | 39376c7204be |
comparison
equal
deleted
inserted
replaced
23:edddaa8ab855 | 24:e8e6b962c1f2 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use Getopt::Long; | |
4 | |
5 # | |
6 # Filter a pileup file on forward/reverse presence and %read having the variant | |
7 # The error code | |
8 # 1 : multiple variant type detected insertion/deletion/mutation | |
9 # 1i : inconsistency in insertion | |
10 # 1d : inconsistency in deletion | |
11 # 1m : inconsistency in mutation | |
12 # 2 : insufficient depth | |
13 # 3 : insufficient variant frequency | |
14 # 4 : variant position not covered by forward and reverse reads | |
15 # 5 : variant with other variant in neighbourhood | |
16 # 6 : too much depth | |
17 # 8 : parsing error (couldn't parse the mpileup line correctly) | |
18 # 9 : parsing error (couldn't parse the readbase string correctly) | |
19 | |
20 | |
21 my $inputfile; | |
22 my $logfile; | |
23 my $MIN_DISTANCE=0; | |
24 my $MIN_VARIANTFREQUENCY=0; | |
25 my $MIN_FORWARDREVERSE=0; | |
26 my $MIN_DEPTH=0; | |
27 my $MAX_DEPTH=500; | |
28 my $VERBOSE=0; | |
29 my $ONLY_UNFILTERED_VARIANT="OFF"; | |
30 my $DO_STAT="NO"; | |
31 | |
32 if ($#ARGV<0){ | |
33 print "\n"; | |
34 print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n"; | |
35 print "-input_file \tinputfile in mpileup format\n"; | |
36 print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n"; | |
37 print "-min_depth \tminimum depth required [1]\n"; | |
38 print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n"; | |
39 print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n"; | |
40 print "-min_distance \tminimum distance between variant [0]\n"; | |
41 print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n"; | |
42 print "\n"; | |
43 exit(0); | |
44 } | |
45 | |
46 GetOptions ( | |
47 "input_file=s" => \$inputfile, | |
48 "log_file=s" => \$logfile, | |
49 "min_depth=i" => \$MIN_DEPTH, | |
50 "max_depth=i" => \$MAX_DEPTH, | |
51 "min_frequency=f" => \$MIN_VARIANTFREQUENCY, | |
52 "min_distance=i" => \$MIN_DISTANCE, | |
53 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, | |
54 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT, | |
55 "v=i" => \$VERBOSE, | |
56 "do_stat=s" => \$DO_STAT | |
57 ) or die("Error in command line arguments\n"); | |
58 | |
59 | |
60 open(IF, $inputfile) or die("Can't open $inputfile\n"); | |
61 | |
62 my @tbl_line; | |
63 my %USR_PARAM; | |
64 $USR_PARAM{"min_depth"} = $MIN_DEPTH; | |
65 $USR_PARAM{"max_depth"} = $MAX_DEPTH; | |
66 $USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY; | |
67 $USR_PARAM{"min_dist"} = $MIN_DISTANCE; | |
68 $USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE; | |
69 | |
70 | |
71 | |
72 #Extraction des variants | |
73 my $nb_line=0; | |
74 while (my $line=<IF>){ | |
75 $nb_line++; | |
76 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ | |
77 print "$nb_line\n"; | |
78 } | |
79 my $error_code=0; | |
80 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ | |
81 my $current_chromosome = $1; | |
82 my $current_position = $2; | |
83 my $current_refbase = $3; | |
84 my $current_coverage = $4; | |
85 my $current_readbase_string = $5; | |
86 my $current_quality_string = $6; | |
87 | |
88 #Suppression of mPileUp special character | |
89 $current_readbase_string =~ s/\$//g; #the read start at this position | |
90 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char | |
91 | |
92 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ | |
93 my %variant; | |
94 $variant{"line"} = $line; | |
95 $variant{"chr"} = $current_chromosome; | |
96 $variant{"pos"} = $current_position; | |
97 $variant{"refbase"} = $current_refbase; | |
98 $variant{"coverage"} = $current_coverage; | |
99 $variant{"readbase"} = $current_readbase_string; | |
100 $variant{"quality"} = $current_quality_string; | |
101 push(@tbl_line,\%variant); | |
102 | |
103 if ($ONLY_UNFILTERED_VARIANT eq "ON"){ | |
104 print $line; | |
105 } | |
106 | |
107 } | |
108 else { | |
109 #Position with no variant | |
110 } | |
111 | |
112 } | |
113 else { | |
114 #Error Parsing | |
115 print STDERR "$line #8"; | |
116 } | |
117 } | |
118 close(IF); | |
119 | |
120 if ($ONLY_UNFILTERED_VARIANT eq "ON"){ | |
121 exit(0); | |
122 } | |
123 | |
124 ####Checking the distance between variant and other filter | |
125 | |
126 | |
127 my @error; | |
128 for (my $i=0;$i<=$#tbl_line;$i++){ | |
129 # print "ligne : $tbl_line[$i]\n"; | |
130 my $before=""; | |
131 my $after=""; | |
132 my %line = %{$tbl_line[$i]}; | |
133 | |
134 if ($tbl_line[$i-1]){ | |
135 $before = $tbl_line[$i-1]; | |
136 } | |
137 if ($tbl_line[$i+1]){ | |
138 $after = $tbl_line[$i+1]; | |
139 } | |
140 my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM); | |
141 if ($error_code == 0){ | |
142 print $line{"line"}; | |
143 } | |
144 else { | |
145 push(@error,$error_code,"\t",$line{"line"}); | |
146 } | |
147 } | |
148 | |
149 ### LOG | |
150 open(LF,">$logfile") or die ("Can't open $logfile\n"); | |
151 | |
152 if ($DO_STAT eq "YES"){ | |
153 for (my $idx_min_depth=2;$idx_min_depth<=32;$idx_min_depth = $idx_min_depth*2){ | |
154 for (my $idx_freq = 0.5;$idx_freq<=1;$idx_freq= $idx_freq+0.1){ | |
155 for (my $idx_dist=0;$idx_dist<=50;$idx_dist = $idx_dist + 50){ | |
156 for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){ | |
157 my %stat_param; | |
158 $stat_param{"min_depth"}=$idx_min_depth; | |
159 $stat_param{"max_depth"}=250; | |
160 $stat_param{"min_freq"}=$idx_freq; | |
161 $stat_param{"min_fr"}=$idx_fr; | |
162 $stat_param{"min_dist"}=$idx_dist; | |
163 | |
164 print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n"; | |
165 } | |
166 } | |
167 } | |
168 print "\n"; | |
169 } | |
170 } | |
171 | |
172 | |
173 for (my $i=0;$i<=$#error;$i++){ | |
174 print LF $error[$i]; | |
175 } | |
176 close (LF); | |
177 | |
178 | |
179 | |
180 | |
181 sub test_check{ | |
182 my $ref_tbl_line = shift; | |
183 my $ref_param = shift; | |
184 my @tbl_line = @$ref_tbl_line; | |
185 my %param = %$ref_param; | |
186 my $nb=0; | |
187 | |
188 for (my $i=0;$i<=$#tbl_line;$i++){ | |
189 my $before=""; | |
190 my $after=""; | |
191 my %line = %{$tbl_line[$i]}; | |
192 | |
193 if ($tbl_line[$i-1]){ | |
194 $before = $tbl_line[$i-1]; | |
195 } | |
196 if ($tbl_line[$i+1]){ | |
197 $after = $tbl_line[$i+1]; | |
198 } | |
199 my $error_code = check_error($tbl_line[$i],$before,$after,\%param); | |
200 if ($error_code == 0){ | |
201 $nb++; | |
202 } | |
203 } | |
204 | |
205 return $nb; | |
206 } | |
207 | |
208 sub check_error{ | |
209 my $refline = shift; | |
210 my %line = %$refline; | |
211 my $refbefore = shift; | |
212 my $refafter = shift; | |
213 my $refparam = shift; | |
214 my %param = %$refparam; | |
215 | |
216 | |
217 my $current_chromosome = $line{"chr"}; | |
218 my $current_position = $line{"pos"}; | |
219 my $current_refbase = $line{"refbase"}; | |
220 my $current_coverage = $line{"coverage"}; | |
221 my $current_readbase_string = $line{"readbase"}; | |
222 | |
223 | |
224 my $min_depth = $param{"min_depth"}; | |
225 my $max_depth = $param{"max_depth"}; | |
226 my $min_variant_frequency = $param{"min_freq"}; | |
227 my $min_forward_reverse = $param{"min_fr"}; | |
228 my $min_dist = $param{"min_dist"}; | |
229 | |
230 #Verification of neightbourhood | |
231 if ($refbefore){ | |
232 my %compareline = %$refbefore; | |
233 my $compare_chromosome = $compareline{"chr"}; | |
234 my $compare_position = $compareline{"pos"}; | |
235 my $compare_refbase = $compareline{"refbase"}; | |
236 my $compare_coverage = $compareline{"coverage"}; | |
237 my $compare_readbase_string = $compareline{"readbase"}; | |
238 | |
239 if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){ | |
240 return 5; | |
241 } | |
242 } | |
243 | |
244 if ($refafter){ | |
245 my %compareline = %$refafter; | |
246 my $compare_chromosome = $compareline{"chr"}; | |
247 my $compare_position = $compareline{"pos"}; | |
248 my $compare_refbase = $compareline{"refbase"}; | |
249 my $compare_coverage = $compareline{"coverage"}; | |
250 my $compare_readbase_string = $compareline{"readbase"}; | |
251 | |
252 if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){ | |
253 return 5; | |
254 } | |
255 } | |
256 | |
257 | |
258 | |
259 | |
260 #Extraction of insertions | |
261 | |
262 ################################################################## | |
263 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; | |
264 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; | |
265 # print "IN : @IN\n"; | |
266 # print "DEL :@DEL\n"; | |
267 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; | |
268 ################################################################## | |
269 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct | |
270 ################################################################## | |
271 | |
272 # => parcours de boucle | |
273 my @readbase = split(//,$current_readbase_string); | |
274 my $cleaned_readbase_string=""; | |
275 my @IN; | |
276 my @DEL; | |
277 my $current_IN=""; | |
278 my $current_DEL=""; | |
279 my $current_size=0; | |
280 | |
281 for (my $i=0;$i<=$#readbase;$i++){ | |
282 if ($readbase[$i] eq "+"){ | |
283 #Ouverture de IN | |
284 $current_IN="+"; | |
285 | |
286 #Recuperation de la taille | |
287 my $sub = substr $current_readbase_string,$i; | |
288 if ($sub=~/^\+(\d+)/){ | |
289 $current_size = $1; | |
290 } | |
291 my $remaining_size = $current_size; | |
292 while (($remaining_size>0)&&($i<=$#readbase)){ | |
293 $i++; | |
294 $current_IN.=$readbase[$i]; | |
295 if ($readbase[$i]=~ /[ATGCNatgcn]/){ | |
296 $remaining_size--; | |
297 } | |
298 } | |
299 push(@IN,$current_IN); | |
300 } | |
301 elsif ($readbase[$i] eq "-"){ | |
302 #Ouverture de DEL | |
303 $current_DEL="-"; | |
304 | |
305 #Recuperation de la taille | |
306 my $sub = substr $current_readbase_string,$i; | |
307 if ($sub=~/^\-(\d+)/){ | |
308 $current_size = $1; | |
309 } | |
310 my $remaining_size = $current_size; | |
311 while (($remaining_size>0)&&($i<=$#readbase)){ | |
312 $i++; | |
313 $current_DEL.=$readbase[$i]; | |
314 if ($readbase[$i]=~ /[ATGCNatgcn]/){ | |
315 $remaining_size--; | |
316 } | |
317 } | |
318 push(@DEL,$current_DEL); | |
319 | |
320 } | |
321 else { | |
322 #Ajout a la string | |
323 $cleaned_readbase_string .= $readbase[$i]; | |
324 } | |
325 } | |
326 | |
327 | |
328 # print "IN : @IN\n"; | |
329 # print "DEL :@DEL\n"; | |
330 # print "$cleaned_readbase_string\n"; | |
331 | |
332 my @current_readbase_array = split(//,$cleaned_readbase_string); | |
333 | |
334 #Filtering : error detection | |
335 | |
336 if ($#current_readbase_array+1 != $current_coverage){ | |
337 return 9; | |
338 #parsing error (couldn't parse the readbase string correctly) | |
339 } | |
340 elsif ($current_coverage<$min_depth){ | |
341 return 2; | |
342 # 2 : insufficient depth | |
343 } | |
344 elsif ($current_coverage>$max_depth){ | |
345 return 6; | |
346 # 6 : too much depth | |
347 } | |
348 else { | |
349 if ($#IN>=0){ | |
350 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ | |
351 return 1; | |
352 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) | |
353 } | |
354 else { | |
355 ########## TEST de coherence des insertions ################ | |
356 # for (my $i=0;$i<=$#IN;$i++){ | |
357 # if (uc($IN[0]) ne uc($IN[$i])){ | |
358 # print uc($IN[0]),"\n"; | |
359 # print uc($IN[$i]),"\n"; | |
360 # return "1i"; | |
361 # } | |
362 # } | |
363 ########################################################### | |
364 | |
365 if($#IN+1 < $current_coverage*$min_variant_frequency ){ | |
366 return 3; | |
367 # 3 : insufficient variant frequency | |
368 } | |
369 } | |
370 } | |
371 elsif ($#DEL>=0){ | |
372 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ | |
373 return 1; | |
374 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) | |
375 } | |
376 else { | |
377 ########## TEST de coherence des deletions ################ | |
378 # for (my $i=0;$i<=$#DEL;$i++){ | |
379 # if (uc($DEL[0]) ne uc($DEL[$i])){ | |
380 # print uc($DEL[0]),"\n"; | |
381 # print uc($DEL[$i]),"\n"; | |
382 # return "1d"; | |
383 # } | |
384 # } | |
385 ########################################################### | |
386 | |
387 if($#DEL+1 < $current_coverage*$min_variant_frequency){ | |
388 return 3; | |
389 # 3 : insufficient variant frequency | |
390 } | |
391 } | |
392 } | |
393 else { | |
394 my $nbA=0; | |
395 $nbA++ while ($current_readbase_string =~ m/A/g); | |
396 my $nbC=0; | |
397 $nbC++ while ($current_readbase_string =~ m/C/g); | |
398 my $nbT=0; | |
399 $nbT++ while ($current_readbase_string =~ m/T/g); | |
400 my $nbG=0; | |
401 $nbG++ while ($current_readbase_string =~ m/G/g); | |
402 my $nbN=0; | |
403 $nbN++ while ($current_readbase_string =~ m/N/g); | |
404 my $nba=0; | |
405 $nba++ while ($current_readbase_string =~ m/a/g); | |
406 my $nbc=0; | |
407 $nbc++ while ($current_readbase_string =~ m/c/g); | |
408 my $nbt=0; | |
409 $nbt++ while ($current_readbase_string =~ m/t/g); | |
410 my $nbg=0; | |
411 $nbg++ while ($current_readbase_string =~ m/g/g); | |
412 my $nbn=0; | |
413 $nbn++ while ($current_readbase_string =~ m/n/g); | |
414 | |
415 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ | |
416 return "1m"; | |
417 } | |
418 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ | |
419 return "1m"; | |
420 } | |
421 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ | |
422 return "1m"; | |
423 } | |
424 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ | |
425 return "1m"; | |
426 } | |
427 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ | |
428 return "1m"; | |
429 } | |
430 | |
431 if ($nbA+$nba >= $current_coverage*$min_variant_frequency){ | |
432 if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){ | |
433 return 4; | |
434 # 4 : variant position not covered by forward and reverse reads | |
435 } | |
436 } | |
437 elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){ | |
438 if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){ | |
439 return 4; | |
440 # 4 : variant position not covered by forward and reverse reads | |
441 } | |
442 } | |
443 elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){ | |
444 if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){ | |
445 return 4; | |
446 # 4 : variant position not covered by forward and reverse reads | |
447 } | |
448 } | |
449 elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){ | |
450 if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){ | |
451 return 4; | |
452 # 4 : variant position not covered by forward and reverse reads | |
453 } | |
454 } | |
455 elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){ | |
456 if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){ | |
457 return 4; | |
458 # 4 : variant position not covered by forward and reverse reads | |
459 } | |
460 } | |
461 else { | |
462 return 3; | |
463 # 3 : insufficient variant frequency | |
464 } | |
465 } | |
466 } | |
467 | |
468 return 0; | |
469 } |