Mercurial > repos > mcharles > rapsosnp
comparison rapsodyn/PrepareFastqLight.pl~ @ 7:3f7b0788a1c4 draft
Uploaded
author | mcharles |
---|---|
date | Tue, 07 Oct 2014 10:34:34 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:1776b8ddd87e | 7:3f7b0788a1c4 |
---|---|
1 #!/usr/bin/perl | |
2 #V1.0.1 added log, option parameters | |
3 use strict; | |
4 use warnings; | |
5 use Getopt::Long; | |
6 | |
7 my $read1_file; | |
8 my $read2_file; | |
9 my $log_file; | |
10 my $output1_file; | |
11 my $output2_file; | |
12 | |
13 my $TYPE="sanger"; | |
14 my $MIN_LENGTH=30; | |
15 my $MIN_QUALITY=30; | |
16 | |
17 my $VERBOSE = "OFF"; | |
18 | |
19 GetOptions ( | |
20 "read1_file=s" => \$read1_file, | |
21 "read2_file=s" => \$read2_file, | |
22 "log_file=s" => \$log_file, | |
23 "output1_file=s" => \$output1_file, | |
24 "output2_file=s" => \$output2_file, | |
25 "type=s" => \$TYPE, | |
26 "min_length=i" => \$MIN_LENGTH, | |
27 "min_quality=i" => \$MIN_QUALITY, | |
28 "verbose=s" => \$VERBOSE | |
29 ) or die("Error in command line arguments\n"); | |
30 | |
31 | |
32 my $nb_read1=0; | |
33 my $nb_base_read1=0; | |
34 my $nb_read2=0; | |
35 my $nb_base_read2=0; | |
36 | |
37 my $nb_read1_t=0; | |
38 my $nb_base_read1_t=0; | |
39 my $nb_read2_t=0; | |
40 my $nb_base_read2_t=0; | |
41 | |
42 my $nb_base_current_t=0; | |
43 | |
44 | |
45 open(READ1, $read1_file) or die ("Can't open $read1_file\n"); | |
46 open(READ2, $read2_file) or die ("Can't open $read2_file\n"); | |
47 open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n"); | |
48 open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n"); | |
49 | |
50 my $error1=0; | |
51 my $error2=0; | |
52 my $error3=0; | |
53 my $error4=0; | |
54 my $error5=0; | |
55 my $error6=0; | |
56 my $error7=0; | |
57 my $error8=0; | |
58 my $error9=0; | |
59 my $error10=0; | |
60 | |
61 while (my $ligne1_r1 =<READ1>){ | |
62 my $ligne2_r1 =<READ1>; | |
63 my $ligne3_r1 =<READ1>; | |
64 my $ligne4_r1 =<READ1>; | |
65 my $ligne1_r2 =<READ2>; | |
66 my $ligne2_r2 =<READ2>; | |
67 my $ligne3_r2 =<READ2>; | |
68 my $ligne4_r2 =<READ2>; | |
69 | |
70 $nb_read1++; | |
71 $nb_read2++; | |
72 | |
73 #@ 1 sec | |
74 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ | |
75 if ($VERBOSE eq "ON"){ | |
76 print "Error in file format"; | |
77 if ($ligne1_r1){print $ligne1_r1;} | |
78 if ($ligne2_r1){print $ligne2_r1;} | |
79 if ($ligne3_r1){print $ligne3_r1;} | |
80 if ($ligne4_r1){print $ligne4_r1;} | |
81 if ($ligne1_r2){print $ligne1_r2;} | |
82 if ($ligne2_r2){print $ligne2_r2;} | |
83 if ($ligne3_r2){print $ligne3_r2;} | |
84 if ($ligne4_r2){print $ligne4_r2;} | |
85 print "\n"; | |
86 } | |
87 $error1++; | |
88 } | |
89 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ | |
90 if ($VERBOSE eq "ON"){ | |
91 print "Error in header : format\n"; | |
92 print $ligne1_r1; | |
93 print $ligne2_r1; | |
94 print $ligne3_r1; | |
95 print $ligne4_r1; | |
96 print $ligne1_r2; | |
97 print $ligne2_r2; | |
98 print $ligne3_r2; | |
99 print $ligne4_r2; | |
100 print "\n"; | |
101 } | |
102 $error2++; | |
103 } | |
104 #@ 1 - 2 sec | |
105 else { | |
106 | |
107 my $length_seq1 = length($ligne2_r1); | |
108 my $length_qual1 =length($ligne4_r1); | |
109 my $seq1; | |
110 my $qual1; | |
111 | |
112 my $length_seq2 = length($ligne2_r2); | |
113 my $length_qual2 =length($ligne4_r2); | |
114 my $seq2; | |
115 my $qual2; | |
116 my $header1=""; | |
117 my $header2=""; | |
118 my $repheader1=""; | |
119 my $repheader2=""; | |
120 | |
121 | |
122 if ($ligne1_r1 =~/^\@(.*?)\#/){ | |
123 $header1 = $1; | |
124 } | |
125 | |
126 if ($ligne3_r1 =~/^\+(.*?)\#/){ | |
127 $repheader1 = $1; | |
128 } | |
129 | |
130 if ($ligne1_r2 =~/^\@(.*?)\#/){ | |
131 $header2 = $1; | |
132 } | |
133 | |
134 if ($ligne3_r2 =~/^\+(.*?)\#/){ | |
135 $repheader2 = $1; | |
136 } | |
137 #@ 2 sec | |
138 | |
139 ### Verification de la coherence sequence /qualité @ 1 sec | |
140 if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ | |
141 if ($VERBOSE eq "ON"){ | |
142 print "Error in header : empty\n"; | |
143 print $ligne1_r1; | |
144 print $ligne2_r1; | |
145 print $ligne3_r1; | |
146 print $ligne4_r1; | |
147 print $ligne1_r2; | |
148 print $ligne2_r2; | |
149 print $ligne3_r2; | |
150 print $ligne4_r2; | |
151 print "\n"; | |
152 } | |
153 $error3++; | |
154 } | |
155 elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ | |
156 if ($VERBOSE eq "ON"){ | |
157 print "Error in header refgsd : empty\n"; | |
158 print $ligne1_r1; | |
159 print $ligne2_r1; | |
160 print $ligne3_r1; | |
161 print $ligne4_r1; | |
162 print $ligne1_r2; | |
163 print $ligne2_r2; | |
164 print $ligne3_r2; | |
165 print $ligne4_r2; | |
166 print "\n"; | |
167 } | |
168 $error3++; | |
169 } | |
170 elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ | |
171 if ($VERBOSE eq "ON"){ | |
172 print "Error in header : different\n"; | |
173 print $ligne1_r1; | |
174 print $ligne2_r1; | |
175 print $ligne3_r1; | |
176 print $ligne4_r1; | |
177 print $ligne1_r2; | |
178 print $ligne2_r2; | |
179 print $ligne3_r2; | |
180 print $ligne4_r2; | |
181 print "\n"; | |
182 } | |
183 $error4++; | |
184 } | |
185 elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ | |
186 if ($VERBOSE eq "ON"){ | |
187 print "Error in header : different\n"; | |
188 print $ligne1_r1; | |
189 print $ligne2_r1; | |
190 print $ligne3_r1; | |
191 print $ligne4_r1; | |
192 print $ligne1_r2; | |
193 print $ligne2_r2; | |
194 print $ligne3_r2; | |
195 print $ligne4_r2; | |
196 print "\n"; | |
197 } | |
198 $error4++; | |
199 } | |
200 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ | |
201 if ($VERBOSE eq "ON"){ | |
202 print "Error in seq/qual length\n"; | |
203 print $ligne1_r1; | |
204 print $ligne2_r1; | |
205 print $ligne3_r1; | |
206 print $ligne4_r1; | |
207 print $ligne1_r2; | |
208 print $ligne2_r2; | |
209 print $ligne3_r2; | |
210 print $ligne4_r2; | |
211 print "\n"; | |
212 } | |
213 $error5++; | |
214 } | |
215 #@ 1 - 2 sec | |
216 else { | |
217 ### Parsing sequence & qualité | |
218 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ | |
219 $seq1 = $1; | |
220 $nb_base_read1 += length($seq1); | |
221 } | |
222 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ | |
223 $seq2 = $1; | |
224 $nb_base_read2 += length($seq2); | |
225 } | |
226 if ($ligne4_r1 =~ /^(.*)\s*$/i){ | |
227 $qual1 = $1; | |
228 } | |
229 if ($ligne4_r2 =~ /^(.*)\s*$/i){ | |
230 $qual2 = $1; | |
231 } | |
232 #@ 2 sec | |
233 ### Verification du parsing et de la coherence sequence /qualité (n°2) | |
234 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ | |
235 if ($VERBOSE eq "ON"){ | |
236 print "Error parsing seq / quality \n"; | |
237 print $ligne1_r1; | |
238 print $ligne2_r1; | |
239 print $ligne3_r1; | |
240 print $ligne4_r1; | |
241 print $ligne1_r2; | |
242 print $ligne2_r2; | |
243 print $ligne3_r2; | |
244 print $ligne4_r2; | |
245 print "\n"; | |
246 } | |
247 $error6++; | |
248 } | |
249 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ | |
250 if ($VERBOSE eq "ON"){ | |
251 print "Error in seq/qual length after parsing\n"; | |
252 print $ligne1_r1; | |
253 print $ligne2_r1; | |
254 print $ligne3_r1; | |
255 print $ligne4_r1; | |
256 print $ligne1_r2; | |
257 print $ligne2_r2; | |
258 print $ligne3_r2; | |
259 print $ligne4_r2; | |
260 print "\n"; | |
261 } | |
262 $error7++; | |
263 } | |
264 #@ <1 sec | |
265 else { | |
266 my $fastq_lines_r1=""; | |
267 my $fastq_lines_r2=""; | |
268 my $nb_base_current_read1_t = 0; | |
269 my $nb_base_current_read2_t = 0; | |
270 | |
271 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); | |
272 $nb_base_current_read1_t = $nb_base_current_t; | |
273 if ($fastq_lines_r1){ | |
274 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); | |
275 $nb_base_current_read2_t = $nb_base_current_t; | |
276 } | |
277 if ($fastq_lines_r2){ | |
278 print OUT1 $fastq_lines_r1; | |
279 print OUT2 $fastq_lines_r2; | |
280 | |
281 $nb_read1_t++; | |
282 $nb_read2_t++; | |
283 $nb_base_read1_t += $nb_base_current_read1_t; | |
284 $nb_base_read2_t += $nb_base_current_read2_t; | |
285 | |
286 | |
287 } | |
288 } | |
289 } | |
290 | |
291 | |
292 #@ 7 sec | |
293 } | |
294 } | |
295 | |
296 close (READ1); | |
297 close (READ2); | |
298 close (OUT1); | |
299 close (OUT2); | |
300 | |
301 open (LF,">$log_file") or die("Can't open $log_file\n"); | |
302 print LF "\n####\t Fastq preparation \n"; | |
303 print LF "## Before preparation\n"; | |
304 print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n"; | |
305 print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n"; | |
306 print LF "## After preparation\n"; | |
307 print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n"; | |
308 print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n"; | |
309 close (LF); | |
310 | |
311 | |
312 sub grooming_and_trimming{ | |
313 my $header = shift; | |
314 my $seq = shift; | |
315 my $quality = shift; | |
316 my $quality_converted=""; | |
317 my $quality_ori=$quality; | |
318 | |
319 my $lengthseq = length($seq); | |
320 my $startTrim = 0; | |
321 my $stopTrim = length($quality)-1; | |
322 my $startnoN = $startTrim; | |
323 my $stopnoN = $stopTrim; | |
324 | |
325 | |
326 my $chercheN = $seq; | |
327 my @bad_position_N; | |
328 my @bad_position_Q; | |
329 my $current_index = index($chercheN,"N"); | |
330 my $abs_index = $current_index; | |
331 while ($current_index >=0){ | |
332 push (@bad_position_N,$abs_index); | |
333 | |
334 if ($current_index<length($seq)){ | |
335 $chercheN = substr($chercheN,$current_index+1); | |
336 $current_index = index($chercheN,"N"); | |
337 $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1; | |
338 } | |
339 else { | |
340 last; | |
341 } | |
342 } | |
343 | |
344 my @q = split(//,$quality); | |
345 for (my $i=0;$i<=$#q;$i++){ | |
346 my $chr = $q[$i]; | |
347 my $num = ord($q[$i]); | |
348 if ($TYPE eq "illumina"){ | |
349 $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40) | |
350 $quality_converted .= chr($num); | |
351 } | |
352 | |
353 if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger | |
354 push(@bad_position_Q,$i); | |
355 } | |
356 } | |
357 if ($quality_converted){$quality = $quality_converted;} | |
358 | |
359 my @bad_position = (@bad_position_N, @bad_position_Q); | |
360 | |
361 if ($#bad_position>=0){ | |
362 @bad_position = sort {$a <=> $b} @bad_position; | |
363 my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)}; | |
364 $startTrim = $coord{"start"}; | |
365 $stopTrim = $coord{"stop"}; | |
366 #print "$startTrim .. $stopTrim\n"; | |
367 | |
368 } | |
369 my $lengthTrim = $stopTrim - $startTrim +1; | |
370 | |
371 #if ($stats_length{$lengthTrim}){ | |
372 # $stats_length{$lengthTrim} = 1; | |
373 #} | |
374 #else { | |
375 # $stats_length{$lengthTrim}++; | |
376 #} | |
377 my $fastq_lines=""; | |
378 | |
379 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ | |
380 # print "HEAD:\t$header"; | |
381 # print "SEQ:\n$seq\n"; | |
382 # print "$quality_ori\n"; | |
383 # print "$quality\n"; | |
384 # for (my $i=0;$i<=$#bad_position;$i++){ | |
385 # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t"; | |
386 # } | |
387 # print "\n"; | |
388 # print "$startTrim .. $stopTrim / $lengthTrim \n"; | |
389 # print $fastq_lines; | |
390 # print "\n"; | |
391 # } | |
392 | |
393 #for (my $i=$startTrim;$i<=$stopTrim;$i++){ | |
394 # if ($stats_quality{ord($q{$i])}){ | |
395 # $stats_quality{ord($q{$i])}=1; | |
396 # } | |
397 # else { | |
398 # $stats_quality{ord($q{$i])}++; | |
399 # } | |
400 #} | |
401 | |
402 if ($lengthTrim >= $MIN_LENGTH){ | |
403 $fastq_lines .= $header; | |
404 my $new_seq = substr($seq,$startTrim,$lengthTrim); | |
405 # $nb_base_current_t = length($new_seq); | |
406 $fastq_lines .= $new_seq."\n"; | |
407 $fastq_lines .= "+\n"; | |
408 my $new_q = substr($quality,$startTrim,$lengthTrim); | |
409 $fastq_lines .= $new_q."\n"; | |
410 return $fastq_lines; | |
411 | |
412 } | |
413 else { | |
414 #print "Insufficient length after trimming\n"; | |
415 return ""; | |
416 } | |
417 } | |
418 | |
419 sub extract_longer_string_coordinates_from_bad_position{ | |
420 my $start=shift; | |
421 my $stop =shift; | |
422 my $refbad = shift; | |
423 my @bad_position = @$refbad; | |
424 my %coord; | |
425 | |
426 my $current_start = $start; | |
427 my $current_stop = $bad_position[0]-1; | |
428 if ($current_stop < $start){$current_stop = $start;} | |
429 | |
430 | |
431 #debut -> premier N | |
432 my $current_length = $current_stop - $current_start +1; | |
433 my $test_length; | |
434 | |
435 #entre les N | |
436 for (my $i=1;$i<=$#bad_position;$i++){ | |
437 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; | |
438 if ( $test_length > $current_length){ | |
439 $current_start = $bad_position[$i-1]+1; | |
440 $current_stop = $bad_position[$i]-1; | |
441 $current_length = $current_stop - $current_start +1; | |
442 } | |
443 } | |
444 | |
445 #dernier N -> fin | |
446 $test_length = $stop-$bad_position[$#bad_position]+1; | |
447 if ( $test_length > $current_length){ | |
448 $current_start = $bad_position[$#bad_position]+1; | |
449 if ($current_start > $stop){$current_start=$stop;} | |
450 $current_stop = $stop; | |
451 } | |
452 $coord{"start"}=$current_start; | |
453 $coord{"stop"}= $current_stop; | |
454 $coord{"lenght"}=$current_stop-$current_start+1; | |
455 | |
456 return \%coord; | |
457 } |