comparison rapsodyn/mpileupfilterandstat.pl @ 24:e8e6b962c1f2 draft

Uploaded
author mcharles
date Fri, 05 Sep 2014 06:12:10 -0400
parents
children 39376c7204be
comparison
equal deleted inserted replaced
23:edddaa8ab855 24:e8e6b962c1f2
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Long;
4
5 #
6 # Filter a pileup file on forward/reverse presence and %read having the variant
7 # The error code
8 # 1 : multiple variant type detected insertion/deletion/mutation
9 # 1i : inconsistency in insertion
10 # 1d : inconsistency in deletion
11 # 1m : inconsistency in mutation
12 # 2 : insufficient depth
13 # 3 : insufficient variant frequency
14 # 4 : variant position not covered by forward and reverse reads
15 # 5 : variant with other variant in neighbourhood
16 # 6 : too much depth
17 # 8 : parsing error (couldn't parse the mpileup line correctly)
18 # 9 : parsing error (couldn't parse the readbase string correctly)
19
20
21 my $inputfile;
22 my $logfile;
23 my $MIN_DISTANCE=0;
24 my $MIN_VARIANTFREQUENCY=0;
25 my $MIN_FORWARDREVERSE=0;
26 my $MIN_DEPTH=0;
27 my $MAX_DEPTH=500;
28 my $VERBOSE=0;
29 my $ONLY_UNFILTERED_VARIANT="OFF";
30 my $DO_STAT="NO";
31
32 if ($#ARGV<0){
33 print "\n";
34 print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n";
35 print "-input_file \tinputfile in mpileup format\n";
36 print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n";
37 print "-min_depth \tminimum depth required [1]\n";
38 print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n";
39 print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n";
40 print "-min_distance \tminimum distance between variant [0]\n";
41 print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n";
42 print "\n";
43 exit(0);
44 }
45
46 GetOptions (
47 "input_file=s" => \$inputfile,
48 "log_file=s" => \$logfile,
49 "min_depth=i" => \$MIN_DEPTH,
50 "max_depth=i" => \$MAX_DEPTH,
51 "min_frequency=f" => \$MIN_VARIANTFREQUENCY,
52 "min_distance=i" => \$MIN_DISTANCE,
53 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE,
54 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT,
55 "v=i" => \$VERBOSE,
56 "do_stat=s" => \$DO_STAT
57 ) or die("Error in command line arguments\n");
58
59
60 open(IF, $inputfile) or die("Can't open $inputfile\n");
61
62 my @tbl_line;
63 my %USR_PARAM;
64 $USR_PARAM{"min_depth"} = $MIN_DEPTH;
65 $USR_PARAM{"max_depth"} = $MAX_DEPTH;
66 $USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY;
67 $USR_PARAM{"min_dist"} = $MIN_DISTANCE;
68 $USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE;
69
70
71
72 #Extraction des variants
73 my $nb_line=0;
74 while (my $line=<IF>){
75 $nb_line++;
76 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){
77 print "$nb_line\n";
78 }
79 my $error_code=0;
80 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){
81 my $current_chromosome = $1;
82 my $current_position = $2;
83 my $current_refbase = $3;
84 my $current_coverage = $4;
85 my $current_readbase_string = $5;
86 my $current_quality_string = $6;
87
88 #Suppression of mPileUp special character
89 $current_readbase_string =~ s/\$//g; #the read start at this position
90 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char
91
92 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){
93 my %variant;
94 $variant{"line"} = $line;
95 $variant{"chr"} = $current_chromosome;
96 $variant{"pos"} = $current_position;
97 $variant{"refbase"} = $current_refbase;
98 $variant{"coverage"} = $current_coverage;
99 $variant{"readbase"} = $current_readbase_string;
100 $variant{"quality"} = $current_quality_string;
101 push(@tbl_line,\%variant);
102
103 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
104 print $line;
105 }
106
107 }
108 else {
109 #Position with no variant
110 }
111
112 }
113 else {
114 #Error Parsing
115 print STDERR "$line #8";
116 }
117 }
118 close(IF);
119
120 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
121 exit(0);
122 }
123
124 ####Checking the distance between variant and other filter
125
126
127 my @error;
128 for (my $i=0;$i<=$#tbl_line;$i++){
129 # print "ligne : $tbl_line[$i]\n";
130 my $before="";
131 my $after="";
132 my %line = %{$tbl_line[$i]};
133
134 if ($tbl_line[$i-1]){
135 $before = $tbl_line[$i-1];
136 }
137 if ($tbl_line[$i+1]){
138 $after = $tbl_line[$i+1];
139 }
140 my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM);
141 if ($error_code == 0){
142 print $line{"line"};
143 }
144 else {
145 push(@error,$error_code,"\t",$line{"line"});
146 }
147 }
148
149 ### LOG
150 open(LF,">$logfile") or die ("Can't open $logfile\n");
151
152 if ($DO_STAT eq "YES"){
153 for (my $idx_min_depth=2;$idx_min_depth<=32;$idx_min_depth = $idx_min_depth*2){
154 for (my $idx_freq = 0.5;$idx_freq<=1;$idx_freq= $idx_freq+0.1){
155 for (my $idx_dist=0;$idx_dist<=50;$idx_dist = $idx_dist + 50){
156 for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){
157 my %stat_param;
158 $stat_param{"min_depth"}=$idx_min_depth;
159 $stat_param{"max_depth"}=250;
160 $stat_param{"min_freq"}=$idx_freq;
161 $stat_param{"min_fr"}=$idx_fr;
162 $stat_param{"min_dist"}=$idx_dist;
163
164 print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n";
165 }
166 }
167 }
168 print "\n";
169 }
170 }
171
172
173 for (my $i=0;$i<=$#error;$i++){
174 print LF $error[$i];
175 }
176 close (LF);
177
178
179
180
181 sub test_check{
182 my $ref_tbl_line = shift;
183 my $ref_param = shift;
184 my @tbl_line = @$ref_tbl_line;
185 my %param = %$ref_param;
186 my $nb=0;
187
188 for (my $i=0;$i<=$#tbl_line;$i++){
189 my $before="";
190 my $after="";
191 my %line = %{$tbl_line[$i]};
192
193 if ($tbl_line[$i-1]){
194 $before = $tbl_line[$i-1];
195 }
196 if ($tbl_line[$i+1]){
197 $after = $tbl_line[$i+1];
198 }
199 my $error_code = check_error($tbl_line[$i],$before,$after,\%param);
200 if ($error_code == 0){
201 $nb++;
202 }
203 }
204
205 return $nb;
206 }
207
208 sub check_error{
209 my $refline = shift;
210 my %line = %$refline;
211 my $refbefore = shift;
212 my $refafter = shift;
213 my $refparam = shift;
214 my %param = %$refparam;
215
216
217 my $current_chromosome = $line{"chr"};
218 my $current_position = $line{"pos"};
219 my $current_refbase = $line{"refbase"};
220 my $current_coverage = $line{"coverage"};
221 my $current_readbase_string = $line{"readbase"};
222
223
224 my $min_depth = $param{"min_depth"};
225 my $max_depth = $param{"max_depth"};
226 my $min_variant_frequency = $param{"min_freq"};
227 my $min_forward_reverse = $param{"min_fr"};
228 my $min_dist = $param{"min_dist"};
229
230 #Verification of neightbourhood
231 if ($refbefore){
232 my %compareline = %$refbefore;
233 my $compare_chromosome = $compareline{"chr"};
234 my $compare_position = $compareline{"pos"};
235 my $compare_refbase = $compareline{"refbase"};
236 my $compare_coverage = $compareline{"coverage"};
237 my $compare_readbase_string = $compareline{"readbase"};
238
239 if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){
240 return 5;
241 }
242 }
243
244 if ($refafter){
245 my %compareline = %$refafter;
246 my $compare_chromosome = $compareline{"chr"};
247 my $compare_position = $compareline{"pos"};
248 my $compare_refbase = $compareline{"refbase"};
249 my $compare_coverage = $compareline{"coverage"};
250 my $compare_readbase_string = $compareline{"readbase"};
251
252 if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){
253 return 5;
254 }
255 }
256
257
258
259
260 #Extraction of insertions
261
262 ##################################################################
263 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g;
264 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g;
265 # print "IN : @IN\n";
266 # print "DEL :@DEL\n";
267 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g;
268 ##################################################################
269 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct
270 ##################################################################
271
272 # => parcours de boucle
273 my @readbase = split(//,$current_readbase_string);
274 my $cleaned_readbase_string="";
275 my @IN;
276 my @DEL;
277 my $current_IN="";
278 my $current_DEL="";
279 my $current_size=0;
280
281 for (my $i=0;$i<=$#readbase;$i++){
282 if ($readbase[$i] eq "+"){
283 #Ouverture de IN
284 $current_IN="+";
285
286 #Recuperation de la taille
287 my $sub = substr $current_readbase_string,$i;
288 if ($sub=~/^\+(\d+)/){
289 $current_size = $1;
290 }
291 my $remaining_size = $current_size;
292 while (($remaining_size>0)&&($i<=$#readbase)){
293 $i++;
294 $current_IN.=$readbase[$i];
295 if ($readbase[$i]=~ /[ATGCNatgcn]/){
296 $remaining_size--;
297 }
298 }
299 push(@IN,$current_IN);
300 }
301 elsif ($readbase[$i] eq "-"){
302 #Ouverture de DEL
303 $current_DEL="-";
304
305 #Recuperation de la taille
306 my $sub = substr $current_readbase_string,$i;
307 if ($sub=~/^\-(\d+)/){
308 $current_size = $1;
309 }
310 my $remaining_size = $current_size;
311 while (($remaining_size>0)&&($i<=$#readbase)){
312 $i++;
313 $current_DEL.=$readbase[$i];
314 if ($readbase[$i]=~ /[ATGCNatgcn]/){
315 $remaining_size--;
316 }
317 }
318 push(@DEL,$current_DEL);
319
320 }
321 else {
322 #Ajout a la string
323 $cleaned_readbase_string .= $readbase[$i];
324 }
325 }
326
327
328 # print "IN : @IN\n";
329 # print "DEL :@DEL\n";
330 # print "$cleaned_readbase_string\n";
331
332 my @current_readbase_array = split(//,$cleaned_readbase_string);
333
334 #Filtering : error detection
335
336 if ($#current_readbase_array+1 != $current_coverage){
337 return 9;
338 #parsing error (couldn't parse the readbase string correctly)
339 }
340 elsif ($current_coverage<$min_depth){
341 return 2;
342 # 2 : insufficient depth
343 }
344 elsif ($current_coverage>$max_depth){
345 return 6;
346 # 6 : too much depth
347 }
348 else {
349 if ($#IN>=0){
350 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
351 return 1;
352 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
353 }
354 else {
355 ########## TEST de coherence des insertions ################
356 # for (my $i=0;$i<=$#IN;$i++){
357 # if (uc($IN[0]) ne uc($IN[$i])){
358 # print uc($IN[0]),"\n";
359 # print uc($IN[$i]),"\n";
360 # return "1i";
361 # }
362 # }
363 ###########################################################
364
365 if($#IN+1 < $current_coverage*$min_variant_frequency ){
366 return 3;
367 # 3 : insufficient variant frequency
368 }
369 }
370 }
371 elsif ($#DEL>=0){
372 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
373 return 1;
374 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
375 }
376 else {
377 ########## TEST de coherence des deletions ################
378 # for (my $i=0;$i<=$#DEL;$i++){
379 # if (uc($DEL[0]) ne uc($DEL[$i])){
380 # print uc($DEL[0]),"\n";
381 # print uc($DEL[$i]),"\n";
382 # return "1d";
383 # }
384 # }
385 ###########################################################
386
387 if($#DEL+1 < $current_coverage*$min_variant_frequency){
388 return 3;
389 # 3 : insufficient variant frequency
390 }
391 }
392 }
393 else {
394 my $nbA=0;
395 $nbA++ while ($current_readbase_string =~ m/A/g);
396 my $nbC=0;
397 $nbC++ while ($current_readbase_string =~ m/C/g);
398 my $nbT=0;
399 $nbT++ while ($current_readbase_string =~ m/T/g);
400 my $nbG=0;
401 $nbG++ while ($current_readbase_string =~ m/G/g);
402 my $nbN=0;
403 $nbN++ while ($current_readbase_string =~ m/N/g);
404 my $nba=0;
405 $nba++ while ($current_readbase_string =~ m/a/g);
406 my $nbc=0;
407 $nbc++ while ($current_readbase_string =~ m/c/g);
408 my $nbt=0;
409 $nbt++ while ($current_readbase_string =~ m/t/g);
410 my $nbg=0;
411 $nbg++ while ($current_readbase_string =~ m/g/g);
412 my $nbn=0;
413 $nbn++ while ($current_readbase_string =~ m/n/g);
414
415 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
416 return "1m";
417 }
418 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
419 return "1m";
420 }
421 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){
422 return "1m";
423 }
424 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){
425 return "1m";
426 }
427 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){
428 return "1m";
429 }
430
431 if ($nbA+$nba >= $current_coverage*$min_variant_frequency){
432 if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){
433 return 4;
434 # 4 : variant position not covered by forward and reverse reads
435 }
436 }
437 elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){
438 if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){
439 return 4;
440 # 4 : variant position not covered by forward and reverse reads
441 }
442 }
443 elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){
444 if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){
445 return 4;
446 # 4 : variant position not covered by forward and reverse reads
447 }
448 }
449 elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){
450 if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){
451 return 4;
452 # 4 : variant position not covered by forward and reverse reads
453 }
454 }
455 elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){
456 if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){
457 return 4;
458 # 4 : variant position not covered by forward and reverse reads
459 }
460 }
461 else {
462 return 3;
463 # 3 : insufficient variant frequency
464 }
465 }
466 }
467
468 return 0;
469 }