Mercurial > repos > mcharles > rapsosnp
comparison rapsodyn/PrepareFastqLight.pl~ @ 15:56d328bce3a7 draft default tip
Uploaded
author | mcharles |
---|---|
date | Thu, 29 Jan 2015 08:54:06 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
14:93e6f2af1ce2 | 15:56d328bce3a7 |
---|---|
1 #!/usr/bin/perl | |
2 #V1.0.2 added auto type detection | |
3 #V1.0.1 added log, option parameters | |
4 use strict; | |
5 use warnings; | |
6 use Getopt::Long; | |
7 | |
8 my $read1_file; | |
9 my $read2_file; | |
10 my $log_file; | |
11 my $output1_file; | |
12 my $output2_file; | |
13 | |
14 my $TYPE="sanger"; | |
15 my $MIN_LENGTH=30; | |
16 my $MIN_QUALITY=30; | |
17 | |
18 my $VERBOSE = "OFF"; | |
19 | |
20 GetOptions ( | |
21 "read1_file=s" => \$read1_file, | |
22 "read2_file=s" => \$read2_file, | |
23 "log_file=s" => \$log_file, | |
24 "output1_file=s" => \$output1_file, | |
25 "output2_file=s" => \$output2_file, | |
26 "type=s" => \$TYPE, | |
27 "min_length=i" => \$MIN_LENGTH, | |
28 "min_quality=i" => \$MIN_QUALITY, | |
29 "verbose=s" => \$VERBOSE | |
30 ) or die("Error in command line arguments\n"); | |
31 | |
32 | |
33 my $nb_read1=0; | |
34 my $nb_base_read1=0; | |
35 my $nb_read2=0; | |
36 my $nb_base_read2=0; | |
37 | |
38 my $nb_read1_t=0; | |
39 my $nb_base_read1_t=0; | |
40 my $nb_read2_t=0; | |
41 my $nb_base_read2_t=0; | |
42 | |
43 my $nb_base_current_t=0; | |
44 | |
45 open(READ1, $read1_file) or die ("Can't open $read1_file\n"); | |
46 open(READ2, $read2_file) or die ("Can't open $read2_file\n"); | |
47 open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n"); | |
48 open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n"); | |
49 open (LF,">$log_file") or die("Can't open $log_file\n"); | |
50 | |
51 | |
52 my $error1=0; | |
53 my $error2=0; | |
54 my $error3=0; | |
55 my $error4=0; | |
56 my $error5=0; | |
57 my $error6=0; | |
58 my $error7=0; | |
59 my $error8=0; | |
60 my $error9=0; | |
61 my $error10=0; | |
62 | |
63 my $auto_type=""; | |
64 my %qual; | |
65 if ($TYPE eq "auto"){ | |
66 my $compt=0; | |
67 open(DETECT, $read1_file) or die ("Can't open $read1_file\n"); | |
68 while (my $ligne1_r1 =<DETECT>){ | |
69 my $ligne2_r1 =<DETECT>; | |
70 my $ligne3_r1 =<DETECT>; | |
71 my $ligne4_r1 =<DETECT>; | |
72 $compt++; | |
73 if ($ligne4_r1 =~ /^(.*)\s*$/i){ | |
74 my $qual = $1; | |
75 my @q = split(//,$qual); | |
76 for (my $i=0;$i<=$#q;$i++){ | |
77 my $num = ord($q[$i]); | |
78 if ($qual{$num}){ | |
79 $qual{$num}++; | |
80 } | |
81 else { | |
82 $qual{$num} = 1; | |
83 } | |
84 #range sanger / illumina 1.8+ : 33->94 | |
85 #range illumina 1.3->1.7 : 64->105 | |
86 if ($num > 94){$auto_type = "illumina";last;} | |
87 if ($num < 64){$auto_type = "sanger";last;} | |
88 } | |
89 } | |
90 else { | |
91 print STDERR "Error in format detection : quality not recognized\n$ligne4_r1"; | |
92 exit(0); | |
93 } | |
94 | |
95 if ($auto_type ne ""){ | |
96 last; | |
97 } | |
98 | |
99 } | |
100 close (DETECT); | |
101 if ($auto_type eq ""){ | |
102 print STDERR "Error in format detection : type not recognized parsing read1\n"; | |
103 foreach my $key (sort {$a <=> $b} keys %qual){ | |
104 print "$key\t:\t",$qual{$key},"\n"; | |
105 } | |
106 exit(0); | |
107 } | |
108 else { | |
109 $TYPE = $auto_type; | |
110 } | |
111 } | |
112 | |
113 | |
114 | |
115 | |
116 while (my $ligne1_r1 =<READ1>){ | |
117 my $ligne2_r1 =<READ1>; | |
118 my $ligne3_r1 =<READ1>; | |
119 my $ligne4_r1 =<READ1>; | |
120 my $ligne1_r2 =<READ2>; | |
121 my $ligne2_r2 =<READ2>; | |
122 my $ligne3_r2 =<READ2>; | |
123 my $ligne4_r2 =<READ2>; | |
124 | |
125 $nb_read1++; | |
126 $nb_read2++; | |
127 | |
128 #@ 1 sec | |
129 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ | |
130 if ($VERBOSE eq "ON"){ | |
131 print "Error in file format"; | |
132 if ($ligne1_r1){print $ligne1_r1;} | |
133 if ($ligne2_r1){print $ligne2_r1;} | |
134 if ($ligne3_r1){print $ligne3_r1;} | |
135 if ($ligne4_r1){print $ligne4_r1;} | |
136 if ($ligne1_r2){print $ligne1_r2;} | |
137 if ($ligne2_r2){print $ligne2_r2;} | |
138 if ($ligne3_r2){print $ligne3_r2;} | |
139 if ($ligne4_r2){print $ligne4_r2;} | |
140 print "\n"; | |
141 } | |
142 $error1++; | |
143 } | |
144 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ | |
145 if ($VERBOSE eq "ON"){ | |
146 print "Error in header : format\n"; | |
147 print $ligne1_r1; | |
148 print $ligne2_r1; | |
149 print $ligne3_r1; | |
150 print $ligne4_r1; | |
151 print $ligne1_r2; | |
152 print $ligne2_r2; | |
153 print $ligne3_r2; | |
154 print $ligne4_r2; | |
155 print "\n"; | |
156 } | |
157 $error2++; | |
158 } | |
159 #@ 1 - 2 sec | |
160 else { | |
161 | |
162 my $length_seq1 = length($ligne2_r1); | |
163 my $length_qual1 =length($ligne4_r1); | |
164 my $seq1; | |
165 my $qual1; | |
166 | |
167 my $length_seq2 = length($ligne2_r2); | |
168 my $length_qual2 =length($ligne4_r2); | |
169 my $seq2; | |
170 my $qual2; | |
171 my $header1=""; | |
172 my $header2=""; | |
173 my $repheader1=""; | |
174 my $repheader2=""; | |
175 | |
176 | |
177 if ($ligne1_r1 =~/^\@(.*?)[\s\/]/){ | |
178 $header1 = $1; | |
179 } | |
180 | |
181 if ($ligne3_r1 =~/^\+(.*?)[\s\/]/){ | |
182 $repheader1 = $1; | |
183 } | |
184 | |
185 if ($ligne1_r2 =~/^\@(.*?)[\s\/]/){ | |
186 $header2 = $1; | |
187 } | |
188 | |
189 if ($ligne3_r2 =~/^\+(.*?)[\s\/]/){ | |
190 $repheader2 = $1; | |
191 } | |
192 #@ 2 sec | |
193 | |
194 ### Verification de la coherence sequence /qualité @ 1 sec | |
195 if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ | |
196 if ($VERBOSE eq "ON"){ | |
197 print "Error in header : empty\n"; | |
198 print $ligne1_r1; | |
199 print $ligne2_r1; | |
200 print $ligne3_r1; | |
201 print $ligne4_r1; | |
202 print $ligne1_r2; | |
203 print $ligne2_r2; | |
204 print $ligne3_r2; | |
205 print $ligne4_r2; | |
206 print "\n"; | |
207 } | |
208 $error3++; | |
209 } | |
210 elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ | |
211 if ($VERBOSE eq "ON"){ | |
212 print "Error in header refgsd : empty\n"; | |
213 print $ligne1_r1; | |
214 print $ligne2_r1; | |
215 print $ligne3_r1; | |
216 print $ligne4_r1; | |
217 print $ligne1_r2; | |
218 print $ligne2_r2; | |
219 print $ligne3_r2; | |
220 print $ligne4_r2; | |
221 print "\n"; | |
222 } | |
223 $error3++; | |
224 } | |
225 elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ | |
226 if ($VERBOSE eq "ON"){ | |
227 print "Error in header : different\n"; | |
228 print $ligne1_r1; | |
229 print $ligne2_r1; | |
230 print $ligne3_r1; | |
231 print $ligne4_r1; | |
232 print $ligne1_r2; | |
233 print $ligne2_r2; | |
234 print $ligne3_r2; | |
235 print $ligne4_r2; | |
236 print "\n"; | |
237 } | |
238 $error4++; | |
239 } | |
240 elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ | |
241 if ($VERBOSE eq "ON"){ | |
242 print "Error in header : different\n"; | |
243 print $ligne1_r1; | |
244 print $ligne2_r1; | |
245 print $ligne3_r1; | |
246 print $ligne4_r1; | |
247 print $ligne1_r2; | |
248 print $ligne2_r2; | |
249 print $ligne3_r2; | |
250 print $ligne4_r2; | |
251 print "\n"; | |
252 } | |
253 $error4++; | |
254 } | |
255 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ | |
256 if ($VERBOSE eq "ON"){ | |
257 print "Error in seq/qual length\n"; | |
258 print $ligne1_r1; | |
259 print $ligne2_r1; | |
260 print $ligne3_r1; | |
261 print $ligne4_r1; | |
262 print $ligne1_r2; | |
263 print $ligne2_r2; | |
264 print $ligne3_r2; | |
265 print $ligne4_r2; | |
266 print "\n"; | |
267 } | |
268 $error5++; | |
269 } | |
270 #@ 1 - 2 sec | |
271 else { | |
272 ### Parsing sequence & qualité | |
273 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ | |
274 $seq1 = $1; | |
275 $nb_base_read1 += length($seq1); | |
276 } | |
277 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ | |
278 $seq2 = $1; | |
279 $nb_base_read2 += length($seq2); | |
280 } | |
281 if ($ligne4_r1 =~ /^(.*)\s*$/i){ | |
282 $qual1 = $1; | |
283 } | |
284 if ($ligne4_r2 =~ /^(.*)\s*$/i){ | |
285 $qual2 = $1; | |
286 } | |
287 #@ 2 sec | |
288 ### Verification du parsing et de la coherence sequence /qualité (n°2) | |
289 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ | |
290 if ($VERBOSE eq "ON"){ | |
291 print "Error parsing seq / quality \n"; | |
292 print $ligne1_r1; | |
293 print $ligne2_r1; | |
294 print $ligne3_r1; | |
295 print $ligne4_r1; | |
296 print $ligne1_r2; | |
297 print $ligne2_r2; | |
298 print $ligne3_r2; | |
299 print $ligne4_r2; | |
300 print "\n"; | |
301 } | |
302 $error6++; | |
303 } | |
304 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ | |
305 if ($VERBOSE eq "ON"){ | |
306 print "Error in seq/qual length after parsing\n"; | |
307 print $ligne1_r1; | |
308 print $ligne2_r1; | |
309 print $ligne3_r1; | |
310 print $ligne4_r1; | |
311 print $ligne1_r2; | |
312 print $ligne2_r2; | |
313 print $ligne3_r2; | |
314 print $ligne4_r2; | |
315 print "\n"; | |
316 } | |
317 $error7++; | |
318 } | |
319 #@ <1 sec | |
320 else { | |
321 my $fastq_lines_r1=""; | |
322 my $fastq_lines_r2=""; | |
323 my $nb_base_current_read1_t = 0; | |
324 my $nb_base_current_read2_t = 0; | |
325 | |
326 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); | |
327 $nb_base_current_read1_t = $nb_base_current_t; | |
328 if ($fastq_lines_r1){ | |
329 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); | |
330 $nb_base_current_read2_t = $nb_base_current_t; | |
331 } | |
332 if ($fastq_lines_r2){ | |
333 print OUT1 $fastq_lines_r1; | |
334 print OUT2 $fastq_lines_r2; | |
335 | |
336 $nb_read1_t++; | |
337 $nb_read2_t++; | |
338 $nb_base_read1_t += $nb_base_current_read1_t; | |
339 $nb_base_read2_t += $nb_base_current_read2_t; | |
340 | |
341 | |
342 } | |
343 } | |
344 } | |
345 | |
346 | |
347 #@ 7 sec | |
348 } | |
349 } | |
350 | |
351 close (READ1); | |
352 close (READ2); | |
353 close (OUT1); | |
354 close (OUT2); | |
355 | |
356 print LF "\n####\t Fastq preparation \n"; | |
357 print LF "Fastq format : $TYPE\n"; | |
358 print LF "## Before preparation\n"; | |
359 print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n"; | |
360 print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n"; | |
361 print LF "## After preparation\n"; | |
362 print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n"; | |
363 print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n"; | |
364 close (LF); | |
365 | |
366 | |
367 sub grooming_and_trimming{ | |
368 my $header = shift; | |
369 my $seq = shift; | |
370 my $quality = shift; | |
371 my $quality_converted=""; | |
372 my $quality_ori=$quality; | |
373 | |
374 my $lengthseq = length($seq); | |
375 my $startTrim = 0; | |
376 my $stopTrim = length($quality)-1; | |
377 my $startnoN = $startTrim; | |
378 my $stopnoN = $stopTrim; | |
379 | |
380 | |
381 my $chercheN = $seq; | |
382 my @bad_position_N; | |
383 my @bad_position_Q; | |
384 my $current_index = index($chercheN,"N"); | |
385 my $abs_index = $current_index; | |
386 while ($current_index >=0){ | |
387 push (@bad_position_N,$abs_index); | |
388 | |
389 if ($current_index<length($seq)){ | |
390 $chercheN = substr($chercheN,$current_index+1); | |
391 $current_index = index($chercheN,"N"); | |
392 $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1; | |
393 } | |
394 else { | |
395 last; | |
396 } | |
397 } | |
398 | |
399 my @q = split(//,$quality); | |
400 for (my $i=0;$i<=$#q;$i++){ | |
401 my $chr = $q[$i]; | |
402 my $num = ord($q[$i]); | |
403 if ($TYPE eq "illumina"){ | |
404 $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40) | |
405 $quality_converted .= chr($num); | |
406 } | |
407 | |
408 if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger | |
409 push(@bad_position_Q,$i); | |
410 } | |
411 } | |
412 if ($quality_converted){$quality = $quality_converted;} | |
413 | |
414 my @bad_position = (@bad_position_N, @bad_position_Q); | |
415 | |
416 if ($#bad_position>=0){ | |
417 @bad_position = sort {$a <=> $b} @bad_position; | |
418 my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)}; | |
419 $startTrim = $coord{"start"}; | |
420 $stopTrim = $coord{"stop"}; | |
421 #print "$startTrim .. $stopTrim\n"; | |
422 | |
423 } | |
424 my $lengthTrim = $stopTrim - $startTrim +1; | |
425 | |
426 #if ($stats_length{$lengthTrim}){ | |
427 # $stats_length{$lengthTrim} = 1; | |
428 #} | |
429 #else { | |
430 # $stats_length{$lengthTrim}++; | |
431 #} | |
432 my $fastq_lines=""; | |
433 | |
434 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ | |
435 # print "HEAD:\t$header"; | |
436 # print "SEQ:\n$seq\n"; | |
437 # print "$quality_ori\n"; | |
438 # print "$quality\n"; | |
439 # for (my $i=0;$i<=$#bad_position;$i++){ | |
440 # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t"; | |
441 # } | |
442 # print "\n"; | |
443 # print "$startTrim .. $stopTrim / $lengthTrim \n"; | |
444 # print $fastq_lines; | |
445 # print "\n"; | |
446 # } | |
447 | |
448 #for (my $i=$startTrim;$i<=$stopTrim;$i++){ | |
449 # if ($stats_quality{ord($q{$i])}){ | |
450 # $stats_quality{ord($q{$i])}=1; | |
451 # } | |
452 # else { | |
453 # $stats_quality{ord($q{$i])}++; | |
454 # } | |
455 #} | |
456 | |
457 if ($lengthTrim >= $MIN_LENGTH){ | |
458 $fastq_lines .= $header; | |
459 my $new_seq = substr($seq,$startTrim,$lengthTrim); | |
460 $nb_base_current_t = length($new_seq); | |
461 $fastq_lines .= $new_seq."\n"; | |
462 $fastq_lines .= "+\n"; | |
463 my $new_q = substr($quality,$startTrim,$lengthTrim); | |
464 $fastq_lines .= $new_q."\n"; | |
465 return $fastq_lines; | |
466 | |
467 } | |
468 else { | |
469 #print "Insufficient length after trimming\n"; | |
470 return ""; | |
471 } | |
472 } | |
473 | |
474 sub extract_longer_string_coordinates_from_bad_position{ | |
475 my $start=shift; | |
476 my $stop =shift; | |
477 my $refbad = shift; | |
478 my @bad_position = @$refbad; | |
479 my %coord; | |
480 | |
481 my $current_start = $start; | |
482 my $current_stop = $bad_position[0]-1; | |
483 if ($current_stop < $start){$current_stop = $start;} | |
484 | |
485 | |
486 #debut -> premier N | |
487 my $current_length = $current_stop - $current_start +1; | |
488 my $test_length; | |
489 | |
490 #entre les N | |
491 for (my $i=1;$i<=$#bad_position;$i++){ | |
492 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; | |
493 if ( $test_length > $current_length){ | |
494 $current_start = $bad_position[$i-1]+1; | |
495 $current_stop = $bad_position[$i]-1; | |
496 $current_length = $current_stop - $current_start +1; | |
497 } | |
498 } | |
499 | |
500 #dernier N -> fin | |
501 $test_length = $stop-$bad_position[$#bad_position]+1; | |
502 if ( $test_length > $current_length){ | |
503 $current_start = $bad_position[$#bad_position]+1; | |
504 if ($current_start > $stop){$current_start=$stop;} | |
505 $current_stop = $stop; | |
506 } | |
507 $coord{"start"}=$current_start; | |
508 $coord{"stop"}= $current_stop; | |
509 $coord{"lenght"}=$current_stop-$current_start+1; | |
510 | |
511 return \%coord; | |
512 } |