Mercurial > repos > mcharles > rapsosnp
comparison rapsodyn/PrepareFastqLight.pl @ 7:3f7b0788a1c4 draft
Uploaded
author | mcharles |
---|---|
date | Tue, 07 Oct 2014 10:34:34 -0400 |
parents | b0cbb9d21aa9 |
children | d857538d9fea |
comparison
equal
deleted
inserted
replaced
6:1776b8ddd87e | 7:3f7b0788a1c4 |
---|---|
1 #!/usr/bin/perl | 1 #!/usr/bin/perl |
2 #V1.0.1 added log, option parameters | |
2 use strict; | 3 use strict; |
3 use warnings; | 4 use warnings; |
4 | 5 use Getopt::Long; |
5 my $read1 = $ARGV[0]; | 6 |
6 my $read2 = $ARGV[1]; | 7 my $read1_file; |
7 | 8 my $read2_file; |
8 my $output1 = $ARGV[2]; | 9 my $log_file; |
9 my $output2 = $ARGV[3]; | 10 my $output1_file; |
10 | 11 my $output2_file; |
11 my $TYPE = $ARGV[4]; | 12 |
12 my $MIN_LENGTH = $ARGV[5]; | 13 my $TYPE="sanger"; |
13 my $MIN_QUALITY = $ARGV[6]; | 14 my $MIN_LENGTH=30; |
14 | 15 my $MIN_QUALITY=30; |
15 my $VERBOSE = $ARGV[7]; | 16 |
16 | 17 my $VERBOSE = "OFF"; |
17 if (!$VERBOSE){ | 18 |
18 $VERBOSE ="OFF"; | 19 GetOptions ( |
19 } | 20 "read1_file=s" => \$read1_file, |
20 | 21 "read2_file=s" => \$read2_file, |
21 open(READ1, $read1) or die ("Can't open $read1\n"); | 22 "log_file=s" => \$log_file, |
22 open(READ2, $read2) or die ("Can't open $read2\n"); | 23 "output1_file=s" => \$output1_file, |
23 open(OUT1, ">$output1") or die ("Can't open $output1\n"); | 24 "output2_file=s" => \$output2_file, |
24 open(OUT2, ">$output2") or die ("Can't open $output2\n"); | 25 "type=s" => \$TYPE, |
25 | 26 "min_length=i" => \$MIN_LENGTH, |
27 "min_quality=i" => \$MIN_QUALITY, | |
28 "verbose=s" => \$VERBOSE | |
29 ) or die("Error in command line arguments\n"); | |
30 | |
31 | |
32 my $nb_read1=0; | |
33 my $nb_base_read1=0; | |
34 my $nb_read2=0; | |
35 my $nb_base_read2=0; | |
36 | |
37 my $nb_read1_t=0; | |
38 my $nb_base_read1_t=0; | |
39 my $nb_read2_t=0; | |
40 my $nb_base_read2_t=0; | |
41 | |
42 my $nb_base_current_t=0; | |
43 | |
44 | |
45 open(READ1, $read1_file) or die ("Can't open $read1_file\n"); | |
46 open(READ2, $read2_file) or die ("Can't open $read2_file\n"); | |
47 open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n"); | |
48 open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n"); | |
26 | 49 |
27 my $error1=0; | 50 my $error1=0; |
28 my $error2=0; | 51 my $error2=0; |
29 my $error3=0; | 52 my $error3=0; |
30 my $error4=0; | 53 my $error4=0; |
41 my $ligne4_r1 =<READ1>; | 64 my $ligne4_r1 =<READ1>; |
42 my $ligne1_r2 =<READ2>; | 65 my $ligne1_r2 =<READ2>; |
43 my $ligne2_r2 =<READ2>; | 66 my $ligne2_r2 =<READ2>; |
44 my $ligne3_r2 =<READ2>; | 67 my $ligne3_r2 =<READ2>; |
45 my $ligne4_r2 =<READ2>; | 68 my $ligne4_r2 =<READ2>; |
69 | |
70 $nb_read1++; | |
71 $nb_read2++; | |
46 | 72 |
47 #@ 1 sec | 73 #@ 1 sec |
48 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ | 74 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ |
49 if ($VERBOSE eq "ON"){ | 75 if ($VERBOSE eq "ON"){ |
50 print "Error in file format"; | 76 print "Error in file format"; |
89 my $qual2; | 115 my $qual2; |
90 my $header1=""; | 116 my $header1=""; |
91 my $header2=""; | 117 my $header2=""; |
92 my $repheader1=""; | 118 my $repheader1=""; |
93 my $repheader2=""; | 119 my $repheader2=""; |
120 | |
94 | 121 |
95 if ($ligne1_r1 =~/^\@(.*?)\#/){ | 122 if ($ligne1_r1 =~/^\@(.*?)\#/){ |
96 $header1 = $1; | 123 $header1 = $1; |
97 } | 124 } |
98 | 125 |
188 #@ 1 - 2 sec | 215 #@ 1 - 2 sec |
189 else { | 216 else { |
190 ### Parsing sequence & qualité | 217 ### Parsing sequence & qualité |
191 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ | 218 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ |
192 $seq1 = $1; | 219 $seq1 = $1; |
220 $nb_base_read1 += length($seq1); | |
193 } | 221 } |
194 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ | 222 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ |
195 $seq2 = $1; | 223 $seq2 = $1; |
224 $nb_base_read2 += length($seq2); | |
196 } | 225 } |
197 if ($ligne4_r1 =~ /^(.*)\s*$/i){ | 226 if ($ligne4_r1 =~ /^(.*)\s*$/i){ |
198 $qual1 = $1; | 227 $qual1 = $1; |
199 } | 228 } |
200 if ($ligne4_r2 =~ /^(.*)\s*$/i){ | 229 if ($ligne4_r2 =~ /^(.*)\s*$/i){ |
234 } | 263 } |
235 #@ <1 sec | 264 #@ <1 sec |
236 else { | 265 else { |
237 my $fastq_lines_r1=""; | 266 my $fastq_lines_r1=""; |
238 my $fastq_lines_r2=""; | 267 my $fastq_lines_r2=""; |
268 my $nb_base_current_read1_t = 0; | |
269 my $nb_base_current_read2_t = 0; | |
270 | |
239 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); | 271 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); |
272 $nb_base_current_read1_t = $nb_base_current_t; | |
240 if ($fastq_lines_r1){ | 273 if ($fastq_lines_r1){ |
241 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); | 274 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); |
275 $nb_base_current_read2_t = $nb_base_current_t; | |
242 } | 276 } |
243 if ($fastq_lines_r2){ | 277 if ($fastq_lines_r2){ |
244 print OUT1 $fastq_lines_r1; | 278 print OUT1 $fastq_lines_r1; |
245 print OUT2 $fastq_lines_r2; | 279 print OUT2 $fastq_lines_r2; |
280 | |
281 $nb_read1_t++; | |
282 $nb_read2_t++; | |
283 $nb_base_read1_t += $nb_base_current_read1_t; | |
284 $nb_base_read2_t += $nb_base_current_read2_t; | |
285 | |
286 | |
246 } | 287 } |
247 } | 288 } |
248 } | 289 } |
249 | 290 |
250 # print OUT1 $ligne1_r1; | 291 |
251 # print OUT1 $ligne2_r1; | |
252 # print OUT1 $ligne3_r1; | |
253 # print OUT1 $ligne4_r1; | |
254 # print OUT2 $ligne1_r2; | |
255 # print OUT2 $ligne2_r2; | |
256 # print OUT2 $ligne3_r2; | |
257 # print OUT2 $ligne4_r2; | |
258 | |
259 #@ 7 sec | 292 #@ 7 sec |
260 } | 293 } |
261 } | 294 } |
262 | |
263 | |
264 | 295 |
265 close (READ1); | 296 close (READ1); |
266 close (READ2); | 297 close (READ2); |
267 close (OUT1); | 298 close (OUT1); |
268 close (OUT2); | 299 close (OUT2); |
269 | 300 |
301 open (LF,">$log_file") or die("Can't open $log_file\n"); | |
302 print LF "\n####\t Fastq preparation \n"; | |
303 print LF "## Before preparation\n"; | |
304 print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n"; | |
305 print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n"; | |
306 print LF "## After preparation\n"; | |
307 print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n"; | |
308 print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n"; | |
309 close (LF); | |
310 | |
270 | 311 |
271 sub grooming_and_trimming{ | 312 sub grooming_and_trimming{ |
272 my $header = shift; | 313 my $header = shift; |
273 my $seq = shift; | 314 my $seq = shift; |
274 my $quality = shift; | 315 my $quality = shift; |
324 $stopTrim = $coord{"stop"}; | 365 $stopTrim = $coord{"stop"}; |
325 #print "$startTrim .. $stopTrim\n"; | 366 #print "$startTrim .. $stopTrim\n"; |
326 | 367 |
327 } | 368 } |
328 my $lengthTrim = $stopTrim - $startTrim +1; | 369 my $lengthTrim = $stopTrim - $startTrim +1; |
329 | 370 |
371 #if ($stats_length{$lengthTrim}){ | |
372 # $stats_length{$lengthTrim} = 1; | |
373 #} | |
374 #else { | |
375 # $stats_length{$lengthTrim}++; | |
376 #} | |
330 my $fastq_lines=""; | 377 my $fastq_lines=""; |
331 | 378 |
332 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ | 379 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ |
333 # print "HEAD:\t$header"; | 380 # print "HEAD:\t$header"; |
334 # print "SEQ:\n$seq\n"; | 381 # print "SEQ:\n$seq\n"; |
341 # print "$startTrim .. $stopTrim / $lengthTrim \n"; | 388 # print "$startTrim .. $stopTrim / $lengthTrim \n"; |
342 # print $fastq_lines; | 389 # print $fastq_lines; |
343 # print "\n"; | 390 # print "\n"; |
344 # } | 391 # } |
345 | 392 |
393 #for (my $i=$startTrim;$i<=$stopTrim;$i++){ | |
394 # if ($stats_quality{ord($q{$i])}){ | |
395 # $stats_quality{ord($q{$i])}=1; | |
396 # } | |
397 # else { | |
398 # $stats_quality{ord($q{$i])}++; | |
399 # } | |
400 #} | |
401 | |
346 if ($lengthTrim >= $MIN_LENGTH){ | 402 if ($lengthTrim >= $MIN_LENGTH){ |
347 $fastq_lines .= $header; | 403 $fastq_lines .= $header; |
348 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n"; | 404 my $new_seq = substr($seq,$startTrim,$lengthTrim); |
405 $nb_base_current_t = length($new_seq); | |
406 $fastq_lines .= $new_seq."\n"; | |
349 $fastq_lines .= "+\n"; | 407 $fastq_lines .= "+\n"; |
350 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n"; | 408 my $new_q = substr($quality,$startTrim,$lengthTrim); |
409 $fastq_lines .= $new_q."\n"; | |
351 return $fastq_lines; | 410 return $fastq_lines; |
352 | 411 |
353 } | 412 } |
354 else { | 413 else { |
355 #print "Insufficient length after trimming\n"; | 414 #print "Insufficient length after trimming\n"; |