comparison rapsodyn/PrepareFastqLight.pl~ @ 15:56d328bce3a7 draft default tip

Uploaded
author mcharles
date Thu, 29 Jan 2015 08:54:06 -0500
parents
children
comparison
equal deleted inserted replaced
14:93e6f2af1ce2 15:56d328bce3a7
1 #!/usr/bin/perl
2 #V1.0.2 added auto type detection
3 #V1.0.1 added log, option parameters
4 use strict;
5 use warnings;
6 use Getopt::Long;
7
8 my $read1_file;
9 my $read2_file;
10 my $log_file;
11 my $output1_file;
12 my $output2_file;
13
14 my $TYPE="sanger";
15 my $MIN_LENGTH=30;
16 my $MIN_QUALITY=30;
17
18 my $VERBOSE = "OFF";
19
20 GetOptions (
21 "read1_file=s" => \$read1_file,
22 "read2_file=s" => \$read2_file,
23 "log_file=s" => \$log_file,
24 "output1_file=s" => \$output1_file,
25 "output2_file=s" => \$output2_file,
26 "type=s" => \$TYPE,
27 "min_length=i" => \$MIN_LENGTH,
28 "min_quality=i" => \$MIN_QUALITY,
29 "verbose=s" => \$VERBOSE
30 ) or die("Error in command line arguments\n");
31
32
33 my $nb_read1=0;
34 my $nb_base_read1=0;
35 my $nb_read2=0;
36 my $nb_base_read2=0;
37
38 my $nb_read1_t=0;
39 my $nb_base_read1_t=0;
40 my $nb_read2_t=0;
41 my $nb_base_read2_t=0;
42
43 my $nb_base_current_t=0;
44
45 open(READ1, $read1_file) or die ("Can't open $read1_file\n");
46 open(READ2, $read2_file) or die ("Can't open $read2_file\n");
47 open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
48 open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
49 open (LF,">$log_file") or die("Can't open $log_file\n");
50
51
52 my $error1=0;
53 my $error2=0;
54 my $error3=0;
55 my $error4=0;
56 my $error5=0;
57 my $error6=0;
58 my $error7=0;
59 my $error8=0;
60 my $error9=0;
61 my $error10=0;
62
63 my $auto_type="";
64 my %qual;
65 if ($TYPE eq "auto"){
66 my $compt=0;
67 open(DETECT, $read1_file) or die ("Can't open $read1_file\n");
68 while (my $ligne1_r1 =<DETECT>){
69 my $ligne2_r1 =<DETECT>;
70 my $ligne3_r1 =<DETECT>;
71 my $ligne4_r1 =<DETECT>;
72 $compt++;
73 if ($ligne4_r1 =~ /^(.*)\s*$/i){
74 my $qual = $1;
75 my @q = split(//,$qual);
76 for (my $i=0;$i<=$#q;$i++){
77 my $num = ord($q[$i]);
78 if ($qual{$num}){
79 $qual{$num}++;
80 }
81 else {
82 $qual{$num} = 1;
83 }
84 #range sanger / illumina 1.8+ : 33->94
85 #range illumina 1.3->1.7 : 64->105
86 if ($num > 94){$auto_type = "illumina";last;}
87 if ($num < 64){$auto_type = "sanger";last;}
88 }
89 }
90 else {
91 print STDERR "Error in format detection : quality not recognized\n$ligne4_r1";
92 exit(0);
93 }
94
95 if ($auto_type ne ""){
96 last;
97 }
98
99 }
100 close (DETECT);
101 if ($auto_type eq ""){
102 print STDERR "Error in format detection : type not recognized parsing read1\n";
103 foreach my $key (sort {$a <=> $b} keys %qual){
104 print "$key\t:\t",$qual{$key},"\n";
105 }
106 exit(0);
107 }
108 else {
109 $TYPE = $auto_type;
110 }
111 }
112
113
114
115
116 while (my $ligne1_r1 =<READ1>){
117 my $ligne2_r1 =<READ1>;
118 my $ligne3_r1 =<READ1>;
119 my $ligne4_r1 =<READ1>;
120 my $ligne1_r2 =<READ2>;
121 my $ligne2_r2 =<READ2>;
122 my $ligne3_r2 =<READ2>;
123 my $ligne4_r2 =<READ2>;
124
125 $nb_read1++;
126 $nb_read2++;
127
128 #@ 1 sec
129 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
130 if ($VERBOSE eq "ON"){
131 print "Error in file format";
132 if ($ligne1_r1){print $ligne1_r1;}
133 if ($ligne2_r1){print $ligne2_r1;}
134 if ($ligne3_r1){print $ligne3_r1;}
135 if ($ligne4_r1){print $ligne4_r1;}
136 if ($ligne1_r2){print $ligne1_r2;}
137 if ($ligne2_r2){print $ligne2_r2;}
138 if ($ligne3_r2){print $ligne3_r2;}
139 if ($ligne4_r2){print $ligne4_r2;}
140 print "\n";
141 }
142 $error1++;
143 }
144 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
145 if ($VERBOSE eq "ON"){
146 print "Error in header : format\n";
147 print $ligne1_r1;
148 print $ligne2_r1;
149 print $ligne3_r1;
150 print $ligne4_r1;
151 print $ligne1_r2;
152 print $ligne2_r2;
153 print $ligne3_r2;
154 print $ligne4_r2;
155 print "\n";
156 }
157 $error2++;
158 }
159 #@ 1 - 2 sec
160 else {
161
162 my $length_seq1 = length($ligne2_r1);
163 my $length_qual1 =length($ligne4_r1);
164 my $seq1;
165 my $qual1;
166
167 my $length_seq2 = length($ligne2_r2);
168 my $length_qual2 =length($ligne4_r2);
169 my $seq2;
170 my $qual2;
171 my $header1="";
172 my $header2="";
173 my $repheader1="";
174 my $repheader2="";
175
176
177 if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){
178 $header1 = $1;
179 }
180
181 if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){
182 $repheader1 = $1;
183 }
184
185 if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){
186 $header2 = $1;
187 }
188
189 if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){
190 $repheader2 = $1;
191 }
192 #@ 2 sec
193
194 ### Verification de la coherence sequence /qualité @ 1 sec
195 if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
196 if ($VERBOSE eq "ON"){
197 print "Error in header : empty\n";
198 print $ligne1_r1;
199 print $ligne2_r1;
200 print $ligne3_r1;
201 print $ligne4_r1;
202 print $ligne1_r2;
203 print $ligne2_r2;
204 print $ligne3_r2;
205 print $ligne4_r2;
206 print "\n";
207 }
208 $error3++;
209 }
210 elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
211 if ($VERBOSE eq "ON"){
212 print "Error in header refgsd : empty\n";
213 print $ligne1_r1;
214 print $ligne2_r1;
215 print $ligne3_r1;
216 print $ligne4_r1;
217 print $ligne1_r2;
218 print $ligne2_r2;
219 print $ligne3_r2;
220 print $ligne4_r2;
221 print "\n";
222 }
223 $error3++;
224 }
225 elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
226 if ($VERBOSE eq "ON"){
227 print "Error in header : different\n";
228 print $ligne1_r1;
229 print $ligne2_r1;
230 print $ligne3_r1;
231 print $ligne4_r1;
232 print $ligne1_r2;
233 print $ligne2_r2;
234 print $ligne3_r2;
235 print $ligne4_r2;
236 print "\n";
237 }
238 $error4++;
239 }
240 elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
241 if ($VERBOSE eq "ON"){
242 print "Error in header : different\n";
243 print $ligne1_r1;
244 print $ligne2_r1;
245 print $ligne3_r1;
246 print $ligne4_r1;
247 print $ligne1_r2;
248 print $ligne2_r2;
249 print $ligne3_r2;
250 print $ligne4_r2;
251 print "\n";
252 }
253 $error4++;
254 }
255 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
256 if ($VERBOSE eq "ON"){
257 print "Error in seq/qual length\n";
258 print $ligne1_r1;
259 print $ligne2_r1;
260 print $ligne3_r1;
261 print $ligne4_r1;
262 print $ligne1_r2;
263 print $ligne2_r2;
264 print $ligne3_r2;
265 print $ligne4_r2;
266 print "\n";
267 }
268 $error5++;
269 }
270 #@ 1 - 2 sec
271 else {
272 ### Parsing sequence & qualité
273 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
274 $seq1 = $1;
275 $nb_base_read1 += length($seq1);
276 }
277 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
278 $seq2 = $1;
279 $nb_base_read2 += length($seq2);
280 }
281 if ($ligne4_r1 =~ /^(.*)\s*$/i){
282 $qual1 = $1;
283 }
284 if ($ligne4_r2 =~ /^(.*)\s*$/i){
285 $qual2 = $1;
286 }
287 #@ 2 sec
288 ### Verification du parsing et de la coherence sequence /qualité (n°2)
289 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
290 if ($VERBOSE eq "ON"){
291 print "Error parsing seq / quality \n";
292 print $ligne1_r1;
293 print $ligne2_r1;
294 print $ligne3_r1;
295 print $ligne4_r1;
296 print $ligne1_r2;
297 print $ligne2_r2;
298 print $ligne3_r2;
299 print $ligne4_r2;
300 print "\n";
301 }
302 $error6++;
303 }
304 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
305 if ($VERBOSE eq "ON"){
306 print "Error in seq/qual length after parsing\n";
307 print $ligne1_r1;
308 print $ligne2_r1;
309 print $ligne3_r1;
310 print $ligne4_r1;
311 print $ligne1_r2;
312 print $ligne2_r2;
313 print $ligne3_r2;
314 print $ligne4_r2;
315 print "\n";
316 }
317 $error7++;
318 }
319 #@ <1 sec
320 else {
321 my $fastq_lines_r1="";
322 my $fastq_lines_r2="";
323 my $nb_base_current_read1_t = 0;
324 my $nb_base_current_read2_t = 0;
325
326 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
327 $nb_base_current_read1_t = $nb_base_current_t;
328 if ($fastq_lines_r1){
329 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
330 $nb_base_current_read2_t = $nb_base_current_t;
331 }
332 if ($fastq_lines_r2){
333 print OUT1 $fastq_lines_r1;
334 print OUT2 $fastq_lines_r2;
335
336 $nb_read1_t++;
337 $nb_read2_t++;
338 $nb_base_read1_t += $nb_base_current_read1_t;
339 $nb_base_read2_t += $nb_base_current_read2_t;
340
341
342 }
343 }
344 }
345
346
347 #@ 7 sec
348 }
349 }
350
351 close (READ1);
352 close (READ2);
353 close (OUT1);
354 close (OUT2);
355
356 print LF "\n####\t Fastq preparation \n";
357 print LF "Fastq format : $TYPE\n";
358 print LF "## Before preparation\n";
359 print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
360 print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
361 print LF "## After preparation\n";
362 print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
363 print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
364 close (LF);
365
366
367 sub grooming_and_trimming{
368 my $header = shift;
369 my $seq = shift;
370 my $quality = shift;
371 my $quality_converted="";
372 my $quality_ori=$quality;
373
374 my $lengthseq = length($seq);
375 my $startTrim = 0;
376 my $stopTrim = length($quality)-1;
377 my $startnoN = $startTrim;
378 my $stopnoN = $stopTrim;
379
380
381 my $chercheN = $seq;
382 my @bad_position_N;
383 my @bad_position_Q;
384 my $current_index = index($chercheN,"N");
385 my $abs_index = $current_index;
386 while ($current_index >=0){
387 push (@bad_position_N,$abs_index);
388
389 if ($current_index<length($seq)){
390 $chercheN = substr($chercheN,$current_index+1);
391 $current_index = index($chercheN,"N");
392 $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
393 }
394 else {
395 last;
396 }
397 }
398
399 my @q = split(//,$quality);
400 for (my $i=0;$i<=$#q;$i++){
401 my $chr = $q[$i];
402 my $num = ord($q[$i]);
403 if ($TYPE eq "illumina"){
404 $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
405 $quality_converted .= chr($num);
406 }
407
408 if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
409 push(@bad_position_Q,$i);
410 }
411 }
412 if ($quality_converted){$quality = $quality_converted;}
413
414 my @bad_position = (@bad_position_N, @bad_position_Q);
415
416 if ($#bad_position>=0){
417 @bad_position = sort {$a <=> $b} @bad_position;
418 my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
419 $startTrim = $coord{"start"};
420 $stopTrim = $coord{"stop"};
421 #print "$startTrim .. $stopTrim\n";
422
423 }
424 my $lengthTrim = $stopTrim - $startTrim +1;
425
426 #if ($stats_length{$lengthTrim}){
427 # $stats_length{$lengthTrim} = 1;
428 #}
429 #else {
430 # $stats_length{$lengthTrim}++;
431 #}
432 my $fastq_lines="";
433
434 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
435 # print "HEAD:\t$header";
436 # print "SEQ:\n$seq\n";
437 # print "$quality_ori\n";
438 # print "$quality\n";
439 # for (my $i=0;$i<=$#bad_position;$i++){
440 # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
441 # }
442 # print "\n";
443 # print "$startTrim .. $stopTrim / $lengthTrim \n";
444 # print $fastq_lines;
445 # print "\n";
446 # }
447
448 #for (my $i=$startTrim;$i<=$stopTrim;$i++){
449 # if ($stats_quality{ord($q{$i])}){
450 # $stats_quality{ord($q{$i])}=1;
451 # }
452 # else {
453 # $stats_quality{ord($q{$i])}++;
454 # }
455 #}
456
457 if ($lengthTrim >= $MIN_LENGTH){
458 $fastq_lines .= $header;
459 my $new_seq = substr($seq,$startTrim,$lengthTrim);
460 $nb_base_current_t = length($new_seq);
461 $fastq_lines .= $new_seq."\n";
462 $fastq_lines .= "+\n";
463 my $new_q = substr($quality,$startTrim,$lengthTrim);
464 $fastq_lines .= $new_q."\n";
465 return $fastq_lines;
466
467 }
468 else {
469 #print "Insufficient length after trimming\n";
470 return "";
471 }
472 }
473
474 sub extract_longer_string_coordinates_from_bad_position{
475 my $start=shift;
476 my $stop =shift;
477 my $refbad = shift;
478 my @bad_position = @$refbad;
479 my %coord;
480
481 my $current_start = $start;
482 my $current_stop = $bad_position[0]-1;
483 if ($current_stop < $start){$current_stop = $start;}
484
485
486 #debut -> premier N
487 my $current_length = $current_stop - $current_start +1;
488 my $test_length;
489
490 #entre les N
491 for (my $i=1;$i<=$#bad_position;$i++){
492 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
493 if ( $test_length > $current_length){
494 $current_start = $bad_position[$i-1]+1;
495 $current_stop = $bad_position[$i]-1;
496 $current_length = $current_stop - $current_start +1;
497 }
498 }
499
500 #dernier N -> fin
501 $test_length = $stop-$bad_position[$#bad_position]+1;
502 if ( $test_length > $current_length){
503 $current_start = $bad_position[$#bad_position]+1;
504 if ($current_start > $stop){$current_start=$stop;}
505 $current_stop = $stop;
506 }
507 $coord{"start"}=$current_start;
508 $coord{"stop"}= $current_stop;
509 $coord{"lenght"}=$current_stop-$current_start+1;
510
511 return \%coord;
512 }