0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4
|
|
5 my $read1 = $ARGV[0];
|
|
6 my $read2 = $ARGV[1];
|
|
7
|
|
8 my $output1 = $ARGV[2];
|
|
9 my $output2 = $ARGV[3];
|
|
10
|
|
11 my $TYPE = $ARGV[4];
|
|
12 my $MIN_LENGTH = $ARGV[5];
|
|
13 my $MIN_QUALITY = $ARGV[6];
|
|
14
|
|
15 my $VERBOSE = $ARGV[7];
|
|
16
|
|
17 if (!$VERBOSE){
|
|
18 $VERBOSE ="OFF";
|
|
19 }
|
|
20
|
|
21 open(READ1, $read1) or die ("Can't open $read1\n");
|
|
22 open(READ2, $read2) or die ("Can't open $read2\n");
|
|
23 open(OUT1, ">$output1") or die ("Can't open $output1\n");
|
|
24 open(OUT2, ">$output2") or die ("Can't open $output2\n");
|
|
25
|
|
26
|
|
27 my $error1=0;
|
|
28 my $error2=0;
|
|
29 my $error3=0;
|
|
30 my $error4=0;
|
|
31 my $error5=0;
|
|
32 my $error6=0;
|
|
33 my $error7=0;
|
|
34 my $error8=0;
|
|
35 my $error9=0;
|
|
36 my $error10=0;
|
|
37
|
|
38 while (my $ligne1_r1 =<READ1>){
|
|
39 my $ligne2_r1 =<READ1>;
|
|
40 my $ligne3_r1 =<READ1>;
|
|
41 my $ligne4_r1 =<READ1>;
|
|
42 my $ligne1_r2 =<READ2>;
|
|
43 my $ligne2_r2 =<READ2>;
|
|
44 my $ligne3_r2 =<READ2>;
|
|
45 my $ligne4_r2 =<READ2>;
|
|
46
|
|
47 #@ 1 sec
|
|
48 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
|
|
49 if ($VERBOSE eq "ON"){
|
|
50 print "Error in file format";
|
|
51 if ($ligne1_r1){print $ligne1_r1;}
|
|
52 if ($ligne2_r1){print $ligne2_r1;}
|
|
53 if ($ligne3_r1){print $ligne3_r1;}
|
|
54 if ($ligne4_r1){print $ligne4_r1;}
|
|
55 if ($ligne1_r2){print $ligne1_r2;}
|
|
56 if ($ligne2_r2){print $ligne2_r2;}
|
|
57 if ($ligne3_r2){print $ligne3_r2;}
|
|
58 if ($ligne4_r2){print $ligne4_r2;}
|
|
59 print "\n";
|
|
60 }
|
|
61 $error1++;
|
|
62 }
|
|
63 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
|
|
64 if ($VERBOSE eq "ON"){
|
|
65 print "Error in header : format\n";
|
|
66 print $ligne1_r1;
|
|
67 print $ligne2_r1;
|
|
68 print $ligne3_r1;
|
|
69 print $ligne4_r1;
|
|
70 print $ligne1_r2;
|
|
71 print $ligne2_r2;
|
|
72 print $ligne3_r2;
|
|
73 print $ligne4_r2;
|
|
74 print "\n";
|
|
75 }
|
|
76 $error2++;
|
|
77 }
|
|
78 #@ 1 - 2 sec
|
|
79 else {
|
|
80
|
|
81 my $length_seq1 = length($ligne2_r1);
|
|
82 my $length_qual1 =length($ligne4_r1);
|
|
83 my $seq1;
|
|
84 my $qual1;
|
|
85
|
|
86 my $length_seq2 = length($ligne2_r2);
|
|
87 my $length_qual2 =length($ligne4_r2);
|
|
88 my $seq2;
|
|
89 my $qual2;
|
|
90 my $header1="";
|
|
91 my $header2="";
|
|
92 my $repheader1="";
|
|
93 my $repheader2="";
|
|
94
|
|
95 if ($ligne1_r1 =~/^\@(.*?)\#/){
|
|
96 $header1 = $1;
|
|
97 }
|
|
98
|
|
99 if ($ligne3_r1 =~/^\+(.*?)\#/){
|
|
100 $repheader1 = $1;
|
|
101 }
|
|
102
|
|
103 if ($ligne1_r2 =~/^\@(.*?)\#/){
|
|
104 $header2 = $1;
|
|
105 }
|
|
106
|
|
107 if ($ligne3_r2 =~/^\+(.*?)\#/){
|
|
108 $repheader2 = $1;
|
|
109 }
|
|
110 #@ 2 sec
|
|
111
|
|
112 ### Verification de la coherence sequence /qualité @ 1 sec
|
|
113 if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){
|
|
114 if ($VERBOSE eq "ON"){
|
|
115 print "Error in header : empty\n";
|
|
116 print $ligne1_r1;
|
|
117 print $ligne2_r1;
|
|
118 print $ligne3_r1;
|
|
119 print $ligne4_r1;
|
|
120 print $ligne1_r2;
|
|
121 print $ligne2_r2;
|
|
122 print $ligne3_r2;
|
|
123 print $ligne4_r2;
|
|
124 print "\n";
|
|
125 }
|
|
126 $error3++;
|
|
127 }
|
|
128 elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){
|
|
129 if ($VERBOSE eq "ON"){
|
|
130 print "Error in header refgsd : empty\n";
|
|
131 print $ligne1_r1;
|
|
132 print $ligne2_r1;
|
|
133 print $ligne3_r1;
|
|
134 print $ligne4_r1;
|
|
135 print $ligne1_r2;
|
|
136 print $ligne2_r2;
|
|
137 print $ligne3_r2;
|
|
138 print $ligne4_r2;
|
|
139 print "\n";
|
|
140 }
|
|
141 $error3++;
|
|
142 }
|
|
143 elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){
|
|
144 if ($VERBOSE eq "ON"){
|
|
145 print "Error in header : different\n";
|
|
146 print $ligne1_r1;
|
|
147 print $ligne2_r1;
|
|
148 print $ligne3_r1;
|
|
149 print $ligne4_r1;
|
|
150 print $ligne1_r2;
|
|
151 print $ligne2_r2;
|
|
152 print $ligne3_r2;
|
|
153 print $ligne4_r2;
|
|
154 print "\n";
|
|
155 }
|
|
156 $error4++;
|
|
157 }
|
|
158 elsif (($TYPE eq "sanger")&&($header1 ne $header2)){
|
|
159 if ($VERBOSE eq "ON"){
|
|
160 print "Error in header : different\n";
|
|
161 print $ligne1_r1;
|
|
162 print $ligne2_r1;
|
|
163 print $ligne3_r1;
|
|
164 print $ligne4_r1;
|
|
165 print $ligne1_r2;
|
|
166 print $ligne2_r2;
|
|
167 print $ligne3_r2;
|
|
168 print $ligne4_r2;
|
|
169 print "\n";
|
|
170 }
|
|
171 $error4++;
|
|
172 }
|
|
173 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
|
|
174 if ($VERBOSE eq "ON"){
|
|
175 print "Error in seq/qual length\n";
|
|
176 print $ligne1_r1;
|
|
177 print $ligne2_r1;
|
|
178 print $ligne3_r1;
|
|
179 print $ligne4_r1;
|
|
180 print $ligne1_r2;
|
|
181 print $ligne2_r2;
|
|
182 print $ligne3_r2;
|
|
183 print $ligne4_r2;
|
|
184 print "\n";
|
|
185 }
|
|
186 $error5++;
|
|
187 }
|
|
188 #@ 1 - 2 sec
|
|
189 else {
|
|
190 ### Parsing sequence & qualité
|
|
191 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
|
|
192 $seq1 = $1;
|
|
193 }
|
|
194 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
|
|
195 $seq2 = $1;
|
|
196 }
|
|
197 if ($ligne4_r1 =~ /^(.*)\s*$/i){
|
|
198 $qual1 = $1;
|
|
199 }
|
|
200 if ($ligne4_r2 =~ /^(.*)\s*$/i){
|
|
201 $qual2 = $1;
|
|
202 }
|
|
203 #@ 2 sec
|
|
204 ### Verification du parsing et de la coherence sequence /qualité (n°2)
|
|
205 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
|
|
206 if ($VERBOSE eq "ON"){
|
|
207 print "Error parsing seq / quality \n";
|
|
208 print $ligne1_r1;
|
|
209 print $ligne2_r1;
|
|
210 print $ligne3_r1;
|
|
211 print $ligne4_r1;
|
|
212 print $ligne1_r2;
|
|
213 print $ligne2_r2;
|
|
214 print $ligne3_r2;
|
|
215 print $ligne4_r2;
|
|
216 print "\n";
|
|
217 }
|
|
218 $error6++;
|
|
219 }
|
|
220 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
|
|
221 if ($VERBOSE eq "ON"){
|
|
222 print "Error in seq/qual length after parsing\n";
|
|
223 print $ligne1_r1;
|
|
224 print $ligne2_r1;
|
|
225 print $ligne3_r1;
|
|
226 print $ligne4_r1;
|
|
227 print $ligne1_r2;
|
|
228 print $ligne2_r2;
|
|
229 print $ligne3_r2;
|
|
230 print $ligne4_r2;
|
|
231 print "\n";
|
|
232 }
|
|
233 $error7++;
|
|
234 }
|
|
235 #@ <1 sec
|
|
236 else {
|
|
237 my $fastq_lines_r1="";
|
|
238 my $fastq_lines_r2="";
|
|
239 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
|
|
240 if ($fastq_lines_r1){
|
|
241 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
|
|
242 }
|
|
243 if ($fastq_lines_r2){
|
|
244 print OUT1 $fastq_lines_r1;
|
|
245 print OUT2 $fastq_lines_r2;
|
|
246 }
|
|
247 }
|
|
248 }
|
|
249
|
|
250 # print OUT1 $ligne1_r1;
|
|
251 # print OUT1 $ligne2_r1;
|
|
252 # print OUT1 $ligne3_r1;
|
|
253 # print OUT1 $ligne4_r1;
|
|
254 # print OUT2 $ligne1_r2;
|
|
255 # print OUT2 $ligne2_r2;
|
|
256 # print OUT2 $ligne3_r2;
|
|
257 # print OUT2 $ligne4_r2;
|
|
258
|
|
259 #@ 7 sec
|
|
260 }
|
|
261 }
|
|
262
|
|
263
|
|
264
|
|
265 close (READ1);
|
|
266 close (READ2);
|
|
267 close (OUT1);
|
|
268 close (OUT2);
|
|
269
|
|
270
|
|
271
|
|
272
|
|
273 sub grooming_and_trimming{
|
|
274 my $header = shift;
|
|
275 my $seq = shift;
|
|
276 my $quality = shift;
|
|
277 my $quality_converted="";
|
|
278
|
|
279 my $startnoN = 0;
|
|
280 my $stopnoN = length($quality)-1;
|
|
281
|
4
|
282 #print "HEAD:\t$header";
|
|
283 #print "SEQ:\t$seq\n";
|
0
|
284
|
|
285 my $chercheN = $seq;
|
|
286 my @bad_position;
|
|
287 my $current_index = index($chercheN,"N");
|
|
288 my $abs_index = $current_index;
|
|
289 while ($current_index >=0){
|
|
290 push (@bad_position,$abs_index);
|
|
291
|
|
292 if ($current_index<length($seq)){
|
|
293 $chercheN = substr($chercheN,$current_index+1);
|
|
294 $current_index = index($chercheN,"N");
|
|
295 $abs_index = $current_index + $bad_position[$#bad_position]+1;
|
|
296 }
|
|
297 else {
|
|
298 last;
|
|
299 }
|
|
300 }
|
|
301
|
|
302
|
|
303 if ($#bad_position>=0){
|
|
304 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
|
|
305 $startnoN = $coord{"start"};
|
|
306 $stopnoN = $coord{"stop"};
|
|
307 }
|
|
308 my $lengthnoN = $stopnoN - $startnoN + 1;
|
|
309 my $seqnoN = substr($seq,$startnoN,$lengthnoN);
|
4
|
310 # print "SEQnoN\t:$seqnoN\n";
|
|
311 # for (my $i=0;$i<=$#bad_position;$i++){
|
|
312 # print $bad_position[$i]."\t";
|
|
313 # }
|
|
314 # print "\n";
|
0
|
315
|
|
316 if ($lengthnoN >= $MIN_LENGTH){
|
|
317 my $startTrim = $startnoN;
|
|
318 my $stopTrim = $stopnoN;
|
|
319
|
|
320 my $quality_converted="";
|
4
|
321 #my @bad_position;
|
0
|
322
|
|
323 my @q = split(//,$quality);
|
|
324 #print "QUALITY\n";
|
|
325 #print "$quality\n";
|
|
326 for (my $i=0;$i<=$stopnoN;$i++){
|
|
327 my $chr = $q[$i];
|
|
328 my $num = ord($q[$i]);
|
|
329 if ($TYPE eq "illumina"){
|
|
330 $num = $num -64+33;
|
|
331 $quality_converted .= chr($num);
|
|
332 }
|
|
333
|
|
334 if ($num <$MIN_QUALITY + 64 - 33 ){
|
|
335 push(@bad_position,$i+$startnoN);
|
|
336 }
|
|
337 }
|
|
338 if ($quality_converted){$quality = $quality_converted;}
|
|
339 #print "$quality\n";
|
|
340
|
|
341
|
|
342
|
|
343 if ($#bad_position>=0){
|
4
|
344 @bad_position = sort {$a <=> $b} @bad_position;
|
|
345 # for (my $i=0;$i<=$#bad_position;$i++){
|
|
346 # print $bad_position[$i]."\t";
|
|
347 # }
|
|
348 # print "\n";
|
0
|
349 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)};
|
|
350 $startTrim = $coord{"start"};
|
|
351 $stopTrim = $coord{"stop"};
|
|
352 #print "$startTrim .. $stopTrim\n";
|
|
353
|
|
354 }
|
|
355 my $lengthTrim = $stopTrim - $startTrim +1;
|
|
356
|
|
357
|
|
358 my $fastq_lines="";
|
|
359
|
|
360 if ($lengthTrim >= $MIN_LENGTH){
|
|
361 $fastq_lines .= $header;
|
|
362 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n";
|
|
363 $fastq_lines .= "+\n";
|
|
364 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n";
|
4
|
365 # print $fastq_lines;
|
0
|
366 return $fastq_lines;
|
4
|
367
|
0
|
368 }
|
|
369 else {
|
|
370 return "";
|
|
371 }
|
|
372
|
|
373
|
|
374
|
|
375 }
|
|
376 else {
|
|
377 return "";
|
|
378 }
|
|
379
|
|
380
|
|
381 # my @s = split(//,$seq);
|
|
382 # my $sanger_quality="";
|
|
383
|
|
384
|
|
385
|
|
386
|
|
387 # return $sanger_quality;
|
|
388 }
|
|
389
|
|
390 sub extract_longer_string_coordinates_from_bad_position{
|
|
391 my $start=shift;
|
|
392 my $stop =shift;
|
|
393 my $refbad = shift;
|
|
394 my @bad_position = @$refbad;
|
|
395 my %coord;
|
|
396
|
|
397 my $current_start = $start;
|
|
398 my $current_stop = $bad_position[0]-1;
|
|
399 if ($current_stop < $start){$current_stop = $start;}
|
|
400
|
|
401
|
|
402 #debut -> premier N
|
|
403 my $current_length = $current_stop - $current_start +1;
|
|
404 my $test_length;
|
|
405
|
|
406 #entre les N
|
|
407 for (my $i=1;$i<=$#bad_position;$i++){
|
|
408 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
|
|
409 if ( $test_length > $current_length){
|
|
410 $current_start = $bad_position[$i-1]+1;
|
|
411 $current_stop = $bad_position[$i]-1;
|
|
412 $current_length = $current_stop - $current_start +1;
|
|
413 }
|
|
414 }
|
|
415
|
|
416 #dernier N -> fin
|
|
417 $test_length = $stop-$bad_position[$#bad_position]+1;
|
|
418 if ( $test_length > $current_length){
|
|
419 $current_start = $bad_position[$#bad_position]+1;
|
|
420 if ($current_start > $stop){$current_start=$stop;}
|
|
421 $current_stop = $stop;
|
|
422 }
|
|
423 $coord{"start"}=$current_start;
|
|
424 $coord{"stop"}= $current_stop;
|
|
425 $coord{"lenght"}=$current_stop-$current_start+1;
|
|
426
|
|
427 return \%coord;
|
|
428 }
|