1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # load modules
|
|
4 use Getopt::Std;
|
|
5 use threads;
|
|
6 use Thread::Queue;
|
|
7 use threads::shared;
|
|
8 use File::Basename;
|
|
9 use List::Util qw(max);
|
|
10
|
|
11 $now = time ;
|
|
12
|
|
13 $|++;
|
|
14 # variables
|
|
15 my $paired :shared;
|
|
16 $paired = 0;
|
|
17 my $fq :shared;
|
|
18 my $rq :shared;
|
|
19 my $ofq :shared;
|
|
20 my $orq :shared;
|
|
21 my $trimq :shared;
|
|
22 my $failedfile :shared;
|
|
23 my $done :shared;
|
|
24 $done = 0;
|
|
25 my $nrin :shared;
|
|
26 $nrin = 0;
|
|
27
|
|
28 my $rqz :shared;
|
|
29 my $fqz :shared;
|
|
30 my $rand :shared;
|
|
31 my $discarded :shared;
|
|
32 $discarded = 0;
|
|
33 my $verbose :shared;
|
|
34 $verbose = 1;
|
|
35 my $trim3 :shared;
|
|
36 $trim3 = 1;
|
|
37 my $trim5 :shared;
|
|
38 $trim5 = 1;
|
|
39 # create file lock in the tmp dir
|
|
40 $rand = int(rand(10000));
|
|
41 while (-e "/tmp/$rand.lock") {
|
|
42 $rand = int(rand(10000));
|
|
43 }
|
|
44 system("touch '/tmp/$rand.lock'");
|
|
45 if (!-e "/tmp/sg.write.lock") {
|
|
46 system("touch '/tmp/sg.write.lock'");
|
|
47 system("chmod 777 '/tmp/sg.write.lock'");
|
|
48 }
|
|
49
|
|
50 # opts
|
|
51 # i : path to forward read FASTQ file
|
|
52 # I : path to reverse read FASTQ file
|
|
53 # q : quality threshold in phred.
|
|
54 # n : string representing common read name parts
|
|
55 # s : trimming style : simple/bwa
|
|
56 # o : output file for forward reads
|
|
57 # O : output file for reverse reads
|
|
58 # F : output file with fully failed reads pairs.
|
|
59 # v : verbose: defaults to on (0/1)
|
|
60 # S : Trim on side: 5/3/b(oth)
|
|
61 # m : minimal length.
|
|
62 getopts('i:I:q:n:s:o:O:F:v:S:m:', \%opts) ;
|
|
63
|
|
64
|
|
65 if (defined($opts{'v'})) {
|
|
66 $verbose = $opts{'v'};
|
|
67 }
|
|
68
|
|
69 if (defined($opts{'s'})) {
|
|
70 if ($opts{'s'} eq 'bwa') {
|
|
71 if ($verbose == 1) {
|
|
72 print "Using BWA style trimming\n";
|
|
73 }
|
|
74 $style = 'bwa';
|
|
75 }
|
|
76 else {
|
|
77 if ($verbose == 1) {
|
|
78 print "Using simple 1bp Quality trimming\n";
|
|
79 }
|
|
80 $style = 'simple';
|
|
81 }
|
|
82 }
|
|
83 else {
|
|
84 if ($verbose == 1){
|
|
85 print "Using simple 1bp Quality trimming\n";
|
|
86 }
|
|
87 $style = 'simple';
|
|
88 }
|
|
89
|
|
90 if (!defined($opts{'i'})) { die('Forward reads fastq is mandatory');}
|
|
91 $fq = $opts{'i'};
|
|
92 $ofq = $opts{'o'};
|
|
93
|
|
94 ## gzipped infile?
|
|
95 my $out = `file $fq`;
|
|
96 if ($out =~ m/gzip compressed/) {
|
|
97 if ($verbose == 1) {
|
|
98 print "Forward reads in gzipped format.\n";
|
|
99 }
|
|
100 $fqz = 1;
|
|
101 }
|
|
102 else {
|
|
103 if ($verbose == 1) {
|
|
104 print "Forward reads in plain fastq format.\n";
|
|
105 }
|
|
106 $fqz = 0;
|
|
107 }
|
|
108
|
|
109
|
|
110 if (defined($opts{'I'})) {
|
|
111 if ($verbose == 1) {
|
|
112 print "Reverse reads provided. Performing paired-end trimming\n";
|
|
113 }
|
|
114 $paired = 1;
|
|
115 $rq = $opts{'I'};
|
|
116 $orq = $opts{'O'};
|
|
117 # gzipped ?
|
|
118 my $out = `file $rq`;
|
|
119 if ($out =~ m/gzip compressed/) {
|
|
120 if ($verbose == 1) {
|
|
121 print "Reverse reads in gzipped format.\n";
|
|
122 }
|
|
123 $rqz = 1;
|
|
124 }
|
|
125 else {
|
|
126 if ($verbose == 1) {
|
|
127 print "Reverse reads in plain fastq format.\n";
|
|
128 }
|
|
129 $rqz = 0;
|
|
130 }
|
|
131 }
|
|
132 ## FAILED FILENAME
|
|
133 $failedfile = $opts{'F'};
|
|
134
|
|
135 ## set the sides to trim
|
|
136 my $trimside = '';
|
|
137 if (defined($opts{'S'})) {
|
|
138 if ($opts{'S'} eq '3') {
|
|
139 $trimside = "Trimming reads from the 3' end only\n";
|
|
140 $trim3 = 1;
|
|
141 $trim5 = 0;
|
|
142 }
|
|
143 elsif ($opts{'S'} eq '5') {
|
|
144 $trimside = "Trimming reads from the 5' end only\n";
|
|
145 $trim3 = 0;
|
|
146 $trim5 = 1;
|
|
147 }
|
|
148 elsif ($opts{'S'} eq 'b') {
|
|
149 $trimside = "Trimming reads both from 5' and 3'\n";
|
|
150 $trim3 = 1;
|
|
151 $trim5 = 1;
|
|
152 }
|
|
153 else {
|
|
154 $trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
|
|
155 $trim3 = 1;
|
|
156 $trim5 = 1;
|
|
157 }
|
|
158 }
|
|
159 else {
|
|
160 $trimside = "Invalid trimming side specification. Setting to both sides trimming\n";
|
|
161 $trim3 = 1;
|
|
162 $trim5 = 1;
|
|
163 }
|
|
164 if ($verbose == 1) {
|
|
165 print $trimside;
|
|
166 }
|
|
167
|
|
168 # SET MINIMAL LENGTH
|
|
169 my $minlength :shared;
|
|
170 if (exists($opts{'m'})) {
|
|
171 $minlength = $opts{'m'};
|
|
172 }
|
|
173 else {
|
|
174 $minlength = 0;
|
|
175 }
|
|
176
|
|
177 ## CHECK IF OUTPUT FILESNAMES ARE PRESENT
|
|
178 if ($ofq eq '') {
|
|
179 $ofq = "$fq.$style.trimmed";
|
|
180 if ($verbose == 1) {
|
|
181 print "No Forward output name specified. Forward reads will be printed to:\n\t$ofq\n";
|
|
182 }
|
|
183 }
|
|
184 if ($paired == 1 && $orq eq '') {
|
|
185 $orq = "$rq.$style.trimmed";
|
|
186 if ($verbose == 1) {
|
|
187 print "No Reverse output name specified. Reverse reads will be printed to:\n\t$orq\n";
|
|
188 }
|
|
189 }
|
|
190 if ($failedfile eq '') {
|
|
191 $failedfile = "$fq.Failed_Reads";
|
|
192 if ($verbose == 1) {
|
|
193 print "No failed ouput name specified. Failed reads will be printed to:\n\t$failedfile\n";
|
|
194 }
|
|
195 }
|
|
196
|
|
197 ## set trimming threshold
|
|
198 if (!defined($opts{'q'}) or !($opts{'q'} =~ m/^[0-9]+$/)) {
|
|
199 $trimq = 20;
|
|
200 }
|
|
201 else {
|
|
202 $trimq = $opts{'q'};
|
|
203 }
|
|
204
|
|
205 ## set read delimiter
|
|
206 if (defined($opts{'n'})) {
|
|
207 $indelim = $opts{'n'};
|
|
208 # galaxy replaced @ by __at__
|
|
209 if (substr($indelim,0,6) eq '__at__') {
|
|
210 $indelim = '@'.substr($indelim,6);
|
|
211 }
|
|
212 }
|
|
213 else {
|
|
214 print "No indelim defined\n";
|
|
215 $indelim = "@"; # has risk, fails if @ is first qual item
|
|
216 }
|
|
217
|
|
218 ## NEW : REPLACE if indelim = @ by the first 5 chars of first line of forward-fastq.
|
|
219 if ($indelim eq '@') {
|
|
220 if ($fqz == 0) {
|
|
221 open IN, $fq;
|
|
222 }
|
|
223 else {
|
|
224 open(IN, "gunzip -c $fq | ");
|
|
225 }
|
|
226 $head = <IN>;
|
|
227 $indelim = substr($head,0,5);
|
|
228 print "\n";
|
|
229 print "WARNING:\n";
|
|
230 print "########\n";
|
|
231 print " Setting Delimiter as '\@' is prone to errors\n";
|
|
232 print " as it will incorrectly split if '\@' is the firt value in the quality string !!\n";
|
|
233 print ' It is recommended to use eg -n \'\@SRR301\''."\n";
|
|
234 print " Therefore, the delimiter was changed to the first 5 characters of the first forward fastq file.\n";
|
|
235 print " => Delimiter : $indelim\n";
|
|
236 print "\n\n";
|
|
237 sleep 3;
|
|
238 }
|
|
239 ## create queues
|
|
240 my $totrim = Thread::Queue->new();
|
|
241 my $toprint = Thread::Queue->new();
|
|
242 my $dummy = Thread::Queue->new();
|
|
243 my $failed = Thread::Queue->new();
|
|
244
|
|
245 # monitor thread
|
|
246 my $finish :shared;
|
|
247 $finish = 0;
|
|
248 $mon = threads->create('monitor');
|
|
249
|
|
250 # start FASTQ reading thread
|
|
251 if ($verbose == 1) {
|
|
252 print "start reading file\n";
|
|
253 }
|
|
254 $reading = threads->create('readfile');
|
|
255
|
|
256 ## give time to build reading buffer
|
|
257 sleep 10;
|
|
258
|
|
259 # start trimming threads
|
|
260 if ($verbose == 1) {
|
|
261 print "start trimming\n";
|
|
262 }
|
|
263 if ($paired == 1) {
|
|
264 if ($style eq 'bwa') {
|
|
265 $trimming1 = threads->create('bwapaired');
|
|
266 $trimming2 = threads->create('bwapaired');
|
|
267
|
|
268 }
|
|
269 else {
|
|
270 $trimming1 = threads->create('simplepaired');
|
|
271 $trimming2 = threads->create('simplepaired');
|
|
272 }
|
|
273
|
|
274 }
|
|
275 else {
|
|
276 if ($style eq 'bwa') {
|
|
277 #print "not available yet, switching to simple trimming\n";
|
|
278 #$style = 'simple';
|
|
279 $trimming1 = threads->create('bwasingle');
|
|
280 $trimming2 = threads->create('bwasingle');
|
|
281
|
|
282 }
|
|
283 if ($style eq 'simple') {
|
|
284 $trimming1 = threads->create('simplesingle');
|
|
285 $trimming2 = threads->create('simplesingle');
|
|
286 }
|
|
287 }
|
|
288 # start printing threads
|
|
289 $printout = threads->create('printout');
|
|
290 $printfail = threads->create('printfailed');
|
|
291 # end threads
|
|
292 $reading->join();
|
|
293 ## all reads are queued for trimming, add undef to trimming queues to end threads.
|
|
294 $totrim->enqueue(undef);
|
|
295 $totrim->enqueue(undef);
|
|
296 $trimming1->join();
|
|
297 $trimming2->join();
|
|
298 ## all reads are trimmed and queued from printing, send undef to end printing threads.
|
|
299 $toprint->enqueue(undef);
|
|
300 $failed->enqueue(undef);
|
|
301 $printout->join();
|
|
302 $printfail->join();
|
|
303 if ($verbose == 1) {
|
|
304 print "Waiting for status monitor to shut down\n";
|
|
305 }
|
|
306 $mon->join();
|
|
307 ## copy files from /tmp to destination folder
|
|
308 if ($verbose == 1) {
|
|
309 print "Copying files to destination: Forward..";
|
|
310 }
|
|
311 if ($fqz == 0) {
|
|
312 system("cp -f /tmp/$rand.fq $ofq");
|
|
313 unlink("/tmp/$rand.fq");
|
|
314 }
|
|
315 else {
|
|
316 system("gzip -c /tmp/$rand.fq > $ofq");
|
|
317 unlink("/tmp/$rand.fq");
|
|
318 }
|
|
319 if ($paired == 1) {
|
|
320 if ($verbose == 1) {
|
|
321 print " Reverse..";
|
|
322 }
|
|
323 if ($rqz == 0) {
|
|
324 system("cp -f /tmp/$rand.rq $orq");
|
|
325 unlink("/tmp/$rand.rq");
|
|
326 }
|
|
327 else {
|
|
328 system("gzip -c /tmp/$rand.rq > $orq");
|
|
329 unlink("/tmp/$rand.rq");
|
|
330 }
|
|
331 }
|
|
332
|
|
333 if ($verbose == 1) {
|
|
334 print " Failed items..";
|
|
335 }
|
|
336 system("cp -f /tmp/$rand.failed $failedfile");
|
|
337 unlink("/tmp/$rand.failed");
|
|
338 if ($verbose == 1) {
|
|
339 print " Done\n";
|
|
340 }
|
|
341
|
|
342 # remove lock file
|
|
343 unlink("/tmp/$rand.lock");
|
|
344
|
|
345 ########################
|
|
346 ## print run details: ##
|
|
347 ########################
|
|
348 my $string = sprintf("%.2f",$done/1000000)."M Reads in (each) FASTQ file processed.\n";
|
|
349 if ($paired == 1) {
|
|
350 $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M read pairs in output FASTQ's.\n";
|
|
351 $string .= sprintf("%.2f",$discarded/1000000)."M reads pairs discarded\n";
|
|
352 }
|
|
353 else {
|
|
354 $string .= sprintf ("%.2f",($done-$discarded)/1000000)."M reads in output FASTQ.\n";
|
|
355 $string .= sprintf("%.2f",$discarded/1000000)."M reads discarded\n";
|
|
356 }
|
|
357 print $string;
|
|
358 ##################
|
|
359 # PRINT RUN-TIME #
|
|
360 ##################
|
|
361 $now = time - $now;
|
|
362 printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));
|
|
363
|
|
364
|
|
365 exit();
|
|
366
|
|
367
|
|
368
|
|
369 ## SUBROUTINE : READ FASTQ FILES
|
|
370 ################################
|
|
371 sub readfile {
|
|
372 if ($paired == 1) {
|
|
373 if ($fqz == 0) {
|
|
374 open IN1, $fq;
|
|
375 }
|
|
376 else {
|
|
377 open(IN1, "gunzip -c $fq | ");
|
|
378 }
|
|
379 if ($rqz == 0) {
|
|
380 open IN2, $rq;
|
|
381 }
|
|
382 else {
|
|
383 open(IN2, "gunzip -c $rq | ");
|
|
384 }
|
|
385 $/ = "\n$indelim";
|
|
386 $deliml = length($indelim);
|
|
387 # compose pack ignore string
|
|
388 my $dx = '';
|
|
389 for (my $idx = 0; $idx<length($indelim); $idx++) {
|
|
390 $dx .= 'X';
|
|
391 }
|
|
392 my $f = <IN1>;
|
|
393 my $r = <IN2>;
|
|
394 # the reads contain the delimiter on the end (cf \n on regular file read line endings.)
|
|
395 # so we strip teh delimiter using pack, leaving just the \n.
|
|
396 # this means that in subsequent reads, we need to add the delimiter to the front of the read.
|
|
397 my $read = substr(pack("A*$dx",$f),$deliml) . $r; #pack("A*$dx",$r) ;#. '|';
|
|
398 my $counter = 1;
|
|
399 my $fr = '';
|
|
400 my $rr = '';
|
|
401 my $reads = '';
|
|
402 my $lastf = '';
|
|
403 while ($fr = <IN1>) {
|
|
404 $rr = <IN2>;
|
|
405 $lastf = $fr;
|
|
406 ## add delim to front, trim from back. add our delim.
|
|
407 ## done in next iteration, because last is different.
|
|
408 # the delimiter from the forward read is used to complete the reverse read.
|
|
409 $reads .= $indelim.pack("A*$dx",$read) .'|';
|
|
410 # concat reads
|
|
411 $read = $fr.$rr;
|
|
412 ## add string to queue if 20,000 reads in array
|
|
413 if ($counter == 20000) {
|
|
414 $nrin = $nrin + 20000;
|
|
415 $totrim->enqueue(pack("A*X",$reads)); # pack("A*X") strips last |
|
|
416 $reads = '';
|
|
417 $counter = 0;
|
|
418 }
|
|
419 $counter++;
|
|
420
|
|
421 ## dummy (see monitor).
|
|
422 my $d = $dummy->dequeue_nb();
|
|
423 }
|
|
424 # last reads in files need special care.
|
|
425 $reads .= $indelim.$lastf.$indelim.$rr;
|
|
426 close IN1;
|
|
427 close IN2;
|
|
428 # submit last reads to queue
|
|
429 if ($reads ne '') {
|
|
430 #$totrim->enqueue(pack("A*X",$reads));
|
|
431 $totrim->enqueue($reads);
|
|
432 }
|
|
433 if ($verbose == 1) {
|
|
434 print "All reads queued for trimming\n";
|
|
435 }
|
|
436 }
|
|
437 else {
|
|
438 if ($fqz == 0) {
|
|
439 open IN1, $fq;
|
|
440 }
|
|
441 else {
|
|
442 open(IN1, "gunzip -c $fq | ");
|
|
443 }
|
|
444 $/ = "\n$indelim";
|
|
445 $deliml = length($indelim);
|
|
446 # compose pack ignore string
|
|
447 my $dx = '';
|
|
448 for (my $idx = 0; $idx<length($indelim); $idx++) {
|
|
449 $dx .= 'X';
|
|
450 }
|
|
451 my $f = <IN1>;
|
|
452 # the reads contain the delimiter on the end (cf \n on regular file read line endings.)
|
|
453 # so we strip teh delimiter using pack, leaving just the \n.
|
|
454 # this means that in subsequent reads, we need to add the delimiter to the front of the read.
|
|
455 my $read = substr($f,$deliml);
|
|
456 my $counter = 1;
|
|
457 my $fr = '';
|
|
458 my $reads = '';
|
|
459 while ($fr = <IN1>) {
|
|
460 ## add delim to front, trim from back. add our delim.
|
|
461 ## done in next iteration, because last is different.
|
|
462 $reads .= $indelim.pack("A*$dx",$read) .'|';
|
|
463 # concat reads
|
|
464 $read = $fr;
|
|
465 ## add string to queue if 10,000 reads in array
|
|
466 if ($counter == 40000) {
|
|
467 $nrin = $nrin + 40000;
|
|
468 $totrim->enqueue(pack("A*X",$reads));
|
|
469 $reads = '';
|
|
470 $counter = 0;
|
|
471 }
|
|
472 $counter++;
|
|
473
|
|
474 ## dummy (see monitor).
|
|
475 my $d = $dummy->dequeue_nb();
|
|
476 }
|
|
477 $reads .= $indelim.$read;
|
|
478 close IN1;
|
|
479 # submit last reads to queue
|
|
480 if ($reads ne '') {
|
|
481 #$totrim->enqueue(pack("A*X",$reads));
|
|
482 $totrim->enqueue($reads);
|
|
483 }
|
|
484 if ($verbose == 1) {
|
|
485 print "All reads queued for trimming\n";
|
|
486 }
|
|
487
|
|
488 }
|
|
489 }
|
|
490
|
|
491 sub bwapaired {
|
|
492 my $tq = $trimq;
|
|
493 while ( defined(my $rstring = $totrim->dequeue())) {
|
|
494 my @reads = split(/\|/,$rstring); # each item has 20.000 reads
|
|
495 my $printstring = '';
|
|
496 my $failedstring = '';
|
|
497 foreach (@reads) {
|
|
498 my $read = $_;
|
|
499 my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
|
|
500 ## skip too short reads
|
|
501 if (length($q1) < $minlength || length($q2) < $minlength) {
|
|
502 $discarded++;
|
|
503 $failedstring .= $read ;#. '|';
|
|
504 next;
|
|
505 }
|
|
506 # initial check for maximal quality value.
|
|
507 ## quality arrays
|
|
508 my @q1array = unpack("C*",$q1);
|
|
509 if (max(@q1array) < $tq + 33) {
|
|
510 #print "Discarding on failed forward qualities: ".max(@q1array). " vs $tq + 33)\n$read\n";
|
|
511 $discarded++;
|
|
512 $failedstring .= $read ;#. '|';
|
|
513 next;
|
|
514 }
|
|
515 my @q2array = unpack("C*",$q2);
|
|
516 if (max(@q2array) < $trimq + 33) {
|
|
517 #print "Discarding on failed reverse qualities\n$read\n";
|
|
518 $discarded++;
|
|
519 $failedstring .= $read ;#. '|';
|
|
520 next;
|
|
521 }
|
|
522
|
|
523 #############
|
|
524 ## Trim 3' ##
|
|
525 #############
|
|
526 if ($trim3 == 1) {
|
|
527 #trim first
|
|
528 my $pos = length($q1) - 1;
|
|
529 my $maxPos = $pos;
|
|
530 ## skip empty read.
|
|
531 #if ($pos <= 0) {
|
|
532 # $discarded++;
|
|
533 # $failedstring .= $read . '|';
|
|
534 # #print "skip empty : \n$read\n";
|
|
535 # next;
|
|
536 #}
|
|
537 my $area = 0;
|
|
538 my $maxArea = 0;
|
|
539 ## unpack qual-string to array of phred scores
|
|
540 #my @qarray = unpack("C*",$q1); # use from initial check.
|
|
541 while ($pos>0 && $area>=0) {
|
|
542 $area += $tq - ($q1array[($pos)] - 33);
|
|
543 ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read.
|
|
544 if ($area < $maxArea) {
|
|
545 $pos--;
|
|
546 next;
|
|
547 }
|
|
548 else {
|
|
549 $maxArea = $area;
|
|
550 $maxPos = $pos;
|
|
551 $pos--;
|
|
552 next;
|
|
553 }
|
|
554 }
|
|
555 if ($maxPos==0) { # scanned whole read and didn't integrate to maximum? skip (no output for this read)
|
|
556 $discarded++;
|
|
557 $failedstring .= $read ;#. '|';
|
|
558 next;
|
|
559 }
|
|
560 # otherwise trim before position where area reached a maximum
|
|
561 # $maxPos as length of substr gives positions zero to just before the maxpos.
|
|
562 #$s1 = substr($s1,0,$maxPos);
|
|
563 #$q1 = substr($q1,0,$maxPos);
|
|
564 $s1 = unpack("A$maxPos",$s1);
|
|
565 $q1 = unpack("A$maxPos",$q1);
|
|
566
|
|
567 #trim second
|
|
568 $pos = length($q2);
|
|
569 $maxPos = $pos;
|
|
570 ## skip empty read.
|
|
571 #if ($pos <= 0) {
|
|
572 # $discarded++;
|
|
573 # $failedstring .= $read . '|';
|
|
574 # next;
|
|
575 #}
|
|
576 $area = 0;
|
|
577 $maxArea = 0;
|
|
578 #@qarray = unpack("C*",$q2);
|
|
579 while ($pos>0 && $area>=0) {
|
|
580 $area += $tq - ($q2array[($pos)] - 33);
|
|
581 if ($area < $maxArea) {
|
|
582 $pos--;
|
|
583 next;
|
|
584 }
|
|
585 else {
|
|
586 $maxArea = $area;
|
|
587 $maxPos = $pos;
|
|
588 $pos--;
|
|
589 next;
|
|
590 }
|
|
591 }
|
|
592 if ($maxPos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
|
|
593 $discarded++;
|
|
594 $failedstring .= $read ;#. '|';
|
|
595 next;
|
|
596 }
|
|
597 # Otherwise trim before position where area reached a maximum
|
|
598 $s2 = unpack("A$maxPos",$s2);
|
|
599 $q2 = unpack("A$maxPos",$q2);
|
|
600 #$s2 = substr($s2,0,$maxPos);
|
|
601 #$q2 = substr($q2,0,$maxPos);
|
|
602
|
|
603 }
|
|
604 #############
|
|
605 ## TRIM 5' ##
|
|
606 #############
|
|
607 if ($trim5 == 1) {
|
|
608 #trim first
|
|
609 my $skip = 0; #length($q1);
|
|
610 my $maxPos = $skip;
|
|
611 my $area = 0;
|
|
612 my $maxArea = 0;
|
|
613 my @qarray = unpack("C*",$q1);
|
|
614 while ($skip<length($q1) && $area>=0) {
|
|
615 $area += $tq - ($qarray[($skip)] - 33);
|
|
616 if ($area < $maxArea) {
|
|
617 $skip++;
|
|
618 next;
|
|
619 }
|
|
620 else {
|
|
621 $maxArea = $area;
|
|
622 $maxPos = $skip;
|
|
623 $skip++;
|
|
624 next;
|
|
625 }
|
|
626 }
|
|
627 if ($maxPos==length($q1)- 1) { # => maximal value at end of string => no point of no return found.
|
|
628 $discarded++;
|
|
629 $failedstring .= $read ;#. '|';
|
|
630 next;
|
|
631 }
|
|
632 # trim after position where area reached a maximum
|
|
633 #$s1 = substr($s1,($maxPos+1));
|
|
634 #$q1 = substr($q1,($maxPos+1));
|
|
635 $s1 = unpack("x$maxPos A*",$s1);
|
|
636 $q1 = unpack("x$maxPos A*",$q1);
|
|
637
|
|
638
|
|
639 #trim second
|
|
640 $skip = 0 ;#length($q2);
|
|
641 $maxPos = $skip;
|
|
642 $area = 0;
|
|
643 $maxArea = 0;
|
|
644 @qarray = unpack("C*",$q2);
|
|
645
|
|
646 while ($skip < length($q2) && $area>=0) {
|
|
647 $area += $tq - ($qarray[($skip)] - 33);
|
|
648
|
|
649 if ($area < $maxArea) {
|
|
650 $skip++;
|
|
651 next;
|
|
652 }
|
|
653 else {
|
|
654 $maxArea = $area;
|
|
655 $maxPos = $skip;
|
|
656 $skip++;
|
|
657 next;
|
|
658 }
|
|
659 }
|
|
660 if ($maxPos==(length($q2)-1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
|
|
661 $discarded++;
|
|
662 $failedstring .= $read ;#. '|';
|
|
663 next;
|
|
664 }
|
|
665 # trim after position where area reached a maximum
|
|
666 #$s2 = substr($s2,($maxPos+1));
|
|
667 #$q2 = substr($q2,($maxPos+1));
|
|
668 $s2 = unpack("x$maxPos A*",$s2);
|
|
669 $q2 = unpack("x$maxPos A*",$q2);
|
|
670 }
|
|
671
|
|
672 ## DISCARD IF TOO SHORT ##
|
|
673 if (length($s1) < $minlength || length($s2) < $minlength) {
|
|
674 $discarded++;
|
|
675 $failedstring .= $read ;#. '|';
|
|
676 next;
|
|
677
|
|
678 }
|
|
679 # queue for printing
|
|
680 my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
|
|
681 $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator
|
|
682 # | seperates pairs of reads; ~ seperates lines of single read entry.
|
|
683 #print "$printstring\n";
|
|
684 }
|
|
685
|
|
686 if ($printstring ne '') {
|
|
687 $toprint->enqueue(pack("A*X",$printstring)); # removes trailing |
|
|
688 #$toprint->enqueue($printstring);
|
|
689 }
|
|
690 if ($failedstring ne '') {
|
|
691 #$failed->enqueue(pack("A*X",$failedstring));
|
|
692 $failed->enqueue($failedstring);
|
|
693 }
|
|
694
|
|
695
|
|
696 }
|
|
697 }
|
|
698 sub simplepaired {
|
|
699 # local variables speed up threading
|
|
700 my $tq = $trimq;
|
|
701 while ( defined(my $rstring = $totrim->dequeue())){
|
|
702 my @reads = split(/\|/,$rstring);
|
|
703 my $printstring = '';
|
|
704 my $failedstring = '';
|
|
705 foreach (@reads) {
|
|
706 my $read = $_;
|
|
707 my ($h1a, $s1,$h1b,$q1,$h2a,$s2,$h2b,$q2) = split(/\n/,$read);
|
|
708 ## skip too short reads
|
|
709 if (length($q1) < $minlength || length($q2) < $minlength) {
|
|
710 $discarded++;
|
|
711 $failedstring .= $read; # . '|';
|
|
712 next;
|
|
713 }
|
|
714
|
|
715 # initial check for maximal quality value.
|
|
716 ## quality arrays
|
|
717 my @q1array = unpack("C*",$q1);
|
|
718 if (max(@q1array) < $trimq + 33) {
|
|
719 #print "Discarding on failed forward qualities\n$read\n";
|
|
720 $discarded++;
|
|
721 $failedstring .= $read;# . '|';
|
|
722 next;
|
|
723 }
|
|
724 my @q2array = unpack("C*",$q2);
|
|
725 if (max(@q2array) < $trimq + 33) {
|
|
726 #print "Discarding on failed reverse qualities\n$read\n";
|
|
727 $discarded++;
|
|
728 $failedstring .= $read;# . '|';
|
|
729 next;
|
|
730 }
|
|
731
|
|
732 #############
|
|
733 ## trim 3' ##
|
|
734 #############
|
|
735 if ($trim3 == 1) {
|
|
736 #trim first
|
|
737 my $pos = length($q1) - 1;
|
|
738 my $maxPos = $pos;
|
|
739 #if ($pos <= 0) {
|
|
740 # $discarded++;
|
|
741 # $failedstring .= $read . '|';
|
|
742 # next;
|
|
743 #}
|
|
744 #my @qarray = unpack("C*",$q1);
|
|
745 while ( $pos>0 ) {
|
|
746 if ($tq > $q1array[$pos] - 33) {
|
|
747 # score at position is below threshold
|
|
748 $pos--;
|
|
749 next;
|
|
750 }
|
|
751 else {
|
|
752 # score is at or above threshold
|
|
753 $maxPos = $pos;
|
|
754 last;
|
|
755 }
|
|
756 }
|
|
757 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
|
|
758 $discarded++;
|
|
759 $failedstring .= $read ;#. '|';
|
|
760 next;
|
|
761 }
|
|
762 # integrated to zero? trim before position where quality was below threshold
|
|
763 # unpack is one based
|
|
764 #$maxPos++;
|
|
765 #$s1 = substr($s1,0,$maxPos);
|
|
766 $s1 = unpack("A$maxPos",$s1);
|
|
767 #$q1 = substr($q1,0,$maxPos);
|
|
768 $q1 = unpack("A$maxPos",$q1);
|
|
769
|
|
770 #trim second
|
|
771 $pos = length($q2) -1;
|
|
772 $maxPos = $pos;
|
|
773 if ($pos <= 0) {
|
|
774 $discarded++;
|
|
775 $failedstring .= $read ;#. '|';
|
|
776 next;
|
|
777 }
|
|
778 #my @qarray = unpack("C*",$q2);
|
|
779
|
|
780 while ($pos>0) {
|
|
781 if ($tq > $q2array[$pos] - 33) {
|
|
782 # score at position is below threshold
|
|
783 $pos--;
|
|
784 next;
|
|
785 }
|
|
786 else {
|
|
787 # score is at or above threshold
|
|
788 $maxPos = $pos;
|
|
789 last;
|
|
790 }
|
|
791 }
|
|
792 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
|
|
793 $discarded++;
|
|
794 $failedstring .= $read ;#. '|';
|
|
795 next;
|
|
796 }
|
|
797 # integrated to zero? trim before position where score below threshold
|
|
798 #$maxPos++;
|
|
799 $s2 = unpack("A$maxPos",$s2);
|
|
800 $q2 = unpack("A$maxPos",$q2);
|
|
801 #$s2 = substr($s2,0,$maxPos);
|
|
802 #$q2 = substr($q2,0,$maxPos);
|
|
803
|
|
804 }
|
|
805 #############
|
|
806 ## trim 5' ##
|
|
807 #############
|
|
808 if ($trim5 == 1) {
|
|
809 # first
|
|
810 my $skip = 0; #length($q1);
|
|
811 #my $maxPos = $skip;
|
|
812 my @qarray = unpack("C*",$q1);
|
|
813 while ( $skip < length($q1) ) {
|
|
814 if ($tq > $qarray[$skip] - 33) {
|
|
815 # score at position is below threshold
|
|
816 $skip++;
|
|
817 next;
|
|
818 }
|
|
819 else {
|
|
820 # score is at or above threshold
|
|
821 #$maxPos = $skip;
|
|
822 last;
|
|
823 }
|
|
824 }
|
|
825 if ($skip == length($q1)) { # should not fail, as that would have happened on 3' trim
|
|
826 $discarded++;
|
|
827 $failedstring .= $read ;#. '|';
|
|
828 next;
|
|
829 }
|
|
830 # integrated to zero? trim before position where quality was below threshold
|
|
831 # unpack is one based
|
|
832 $s1 = unpack("x$skip A*",$s1);
|
|
833 $q1 = unpack("x$skip A*",$q1);
|
|
834 #$s1 = substr($s1,$skip);
|
|
835 #$q1 = substr($q1,$skip);
|
|
836
|
|
837 # second
|
|
838 $skip = 0; #length($q1);
|
|
839 my @qarray = unpack("C*",$q2);
|
|
840 #$maxPos = $skip;
|
|
841 while ( $skip < length($q2) ) {
|
|
842 if ($tq > $qarray[$skip] - 33) {
|
|
843 # score at position is below threshold
|
|
844 $skip++;
|
|
845 next;
|
|
846 }
|
|
847 else {
|
|
848 # score is at or above threshold
|
|
849 #$maxPos = $skip;
|
|
850 last;
|
|
851 }
|
|
852 }
|
|
853 if ($skip == length($q2)) { # should not fail, as that would have happened on 3' trim
|
|
854 $discarded++;
|
|
855 $failedstring .= $read ;#. '|';
|
|
856 next;
|
|
857 }
|
|
858 # integrated to zero? trim before position where quality was below threshold
|
|
859 # unpack is one based
|
|
860 $s2 = unpack("x$skip A*",$s2);
|
|
861 $q2 = unpack("x$skip A*",$q2);
|
|
862 #$s2 = substr($s2,$skip);
|
|
863 #$q2 = substr($q2,$skip);
|
|
864
|
|
865 }
|
|
866 ## DISCARD IF TOO SHORT ##
|
|
867 if (length($s1) < $minlength || length($s2) < $minlength) {
|
|
868 $discarded++;
|
|
869 $failedstring .= $read .#. '|';
|
|
870 next;
|
|
871
|
|
872 }
|
|
873
|
|
874 # queue for printing
|
|
875 my @arr = ($h1a,$s1,$h1b,$q1,$h2a,$s2,$h2b,$q2);
|
|
876 $printstring .= join('~',@arr) . '|'; # these two are not in quality string, and can thus be used as seperator
|
|
877 # | seperates pairs of reads; ~ seperates lines of single read entry.
|
|
878 }
|
|
879 if ($printstring ne '') {
|
|
880 $toprint->enqueue(pack("A*X",$printstring)); # removes trailing |
|
|
881 #$toprint->enqueue($printstring);
|
|
882 }
|
|
883 if ($failedstring ne '') {
|
|
884 #$failed->enqueue(pack("A*X",$failedstring));
|
|
885 $failed->enqueue($failedstring);
|
|
886 }
|
|
887 }
|
|
888 }
|
|
889
|
|
890 sub bwasingle {
|
|
891 # local variables speed up threading
|
|
892 my $tq = $trimq;
|
|
893 while ( defined(my $rstring = $totrim->dequeue())){
|
|
894 my @reads = split(/\|/,$rstring);
|
|
895 my $printstring = '';
|
|
896 my $failedstring = '';
|
|
897 foreach (@reads) {
|
|
898 my $read = $_;
|
|
899 my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
|
|
900 ## skip too short reads
|
|
901 if (length($q1) < $minlength ) {
|
|
902
|
|
903 $discarded++;
|
|
904 $failedstring .= $read ;#. '|';
|
|
905 next;
|
|
906 }
|
|
907 # initial check for maximal quality value.
|
|
908 ## quality arrays
|
|
909 my @q1array = unpack("C*",$q1);
|
|
910 if (max(@q1array) < $tq + 33) {
|
|
911 print "Discarding on failed qualities\n$read\n";
|
|
912 $discarded++;
|
|
913 $failedstring .= $read;# . '|';
|
|
914 next;
|
|
915 }
|
|
916
|
|
917 #############
|
|
918 ## trim 3' ##
|
|
919 #############
|
|
920 if ($trim3 == 1) {
|
|
921 my $pos = length($q1) - 1;
|
|
922 my $maxPos = $pos;
|
|
923 ## skip empty read.
|
|
924 #if ($pos <= 0) {
|
|
925 # $discarded++;
|
|
926 # $failedstring .= $read . '|';
|
|
927 # next;
|
|
928 #}
|
|
929 my $area = 0;
|
|
930 my $maxArea = 0;
|
|
931 #my @qarray = unpack("C*",$q1);
|
|
932 while ($pos>0 && $area>=0) {
|
|
933 $area += $tq - ($q1array[($pos)] - 33);
|
|
934 ## not <= for more aggressive trimming : if multiple equal maxima => use shortest read.
|
|
935 if ($area < $maxArea) {
|
|
936 $pos--;
|
|
937 next;
|
|
938 }
|
|
939 else {
|
|
940 $maxArea = $area;
|
|
941 $maxPos = $pos;
|
|
942 $pos--;
|
|
943 next;
|
|
944 }
|
|
945 }
|
|
946 ## The online perl approach is wrong if the high-quality part is not long enough to reach 0.
|
|
947 if ($maxPos == 0) { # maximal badness on position 0 => no point of no return reached.
|
|
948 $discarded++;
|
|
949 $failedstring .= $read ;#. '|';
|
|
950 next;
|
|
951 }
|
|
952 # otherwise trim before position where area reached a maximum
|
|
953 # $maxPos as length of substr gives positions zero to just before the maxpos.
|
|
954 $s1 = unpack("A$maxPos",$s1);
|
|
955 $q1 = unpack("A$maxPos",$q1);
|
|
956
|
|
957 }
|
|
958 #############
|
|
959 ## trim 5' ##
|
|
960 #############
|
|
961 if ($trim5 == 1) {
|
|
962 my $skip = 0; #length($q1);
|
|
963 my $maxPos = $skip;
|
|
964 my $area = 0;
|
|
965 my $maxArea = 0;
|
|
966 my @qarray = unpack("C*",$q1);
|
|
967 while ($skip<length($q1) && $area>=0) {
|
|
968 #$area += $tq - (ord(unpack("x$skip A1",$q1)) - 33);
|
|
969 $area += $tq - ($qarray[($skip)] - 33);
|
|
970 if ($area < $maxArea) {
|
|
971 $skip++;
|
|
972 next;
|
|
973 }
|
|
974 else {
|
|
975 $maxArea = $area;
|
|
976 $maxPos = $skip;
|
|
977 $skip++;
|
|
978 next;
|
|
979 }
|
|
980 }
|
|
981 #if ($skip==length($q1)) { # scanned whole read and didn't integrate to zero? skip (no output for this read
|
|
982 if ($maxPos == (length($q1)-1)) { # => maximal value at end of string => no point of no return found.
|
|
983 $discarded++;
|
|
984 $failedstring .= $read ;#. '|';
|
|
985 next;
|
|
986 }
|
|
987 # trim after position where area reached a maximum
|
|
988 $s1 = unpack("x$maxPos A*",$s1);
|
|
989 $q1 = unpack("x$maxPos A*",$q1);
|
|
990 #$s1 = substr($s1,($maxPos+1));
|
|
991 #$q1 = substr($q1,($maxPos+1));
|
|
992 }
|
|
993 ## DISCARD IF TOO SHORT ##
|
|
994 if (length($s1) < $minlength) {
|
|
995 $discarded++;
|
|
996 $failedstring .= $read ;#. '|';
|
|
997 next;
|
|
998
|
|
999 }
|
|
1000
|
|
1001 # queue for printing
|
|
1002 my @arr = ($h1a,$s1,$h1b,$q1);
|
|
1003 $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing
|
|
1004 }
|
|
1005 if ($printstring ne '') {
|
|
1006 #$toprint->enqueue(pack("A*X",$printstring)); # removes trailing |
|
|
1007 $toprint->enqueue($printstring);
|
|
1008 }
|
|
1009 if ($failedstring ne '') {
|
|
1010 #$failed->enqueue(pack("A*X",$failedstring));
|
|
1011 $failed->enqueue($failedstring);
|
|
1012 }
|
|
1013
|
|
1014 }
|
|
1015 }
|
|
1016
|
|
1017
|
|
1018 sub simplesingle {
|
|
1019 # local variables speed up threading
|
|
1020 my $tq = $trimq;
|
|
1021 while ( defined(my $rstring = $totrim->dequeue())){
|
|
1022 my @reads = split(/\|/,$rstring);
|
|
1023 my $printstring = '';
|
|
1024 my $failedstring = '';
|
|
1025 foreach (@reads) {
|
|
1026 my $read = $_;
|
|
1027 my ($h1a,$s1,$h1b,$q1) = split(/\n/,$read);
|
|
1028 ## skip too short reads
|
|
1029 if (length($q1) < $minlength ) {
|
|
1030 $discarded++;
|
|
1031 $failedstring .= $read ;#. '|';
|
|
1032 next;
|
|
1033 }
|
|
1034 # initial check for maximal quality value.
|
|
1035 ## quality arrays
|
|
1036 my @q1array = unpack("C*",$q1);
|
|
1037 if (max(@q1array) < $trimq + 33) {
|
|
1038 #print "Discarding on failed qualities\n$read\n";
|
|
1039 $discarded++;
|
|
1040 $failedstring .= $read ;#. '|';
|
|
1041 next;
|
|
1042 }
|
|
1043
|
|
1044 #############
|
|
1045 ## trim 3' ##
|
|
1046 #############
|
|
1047 if ($trim3 == 1) {
|
|
1048 my $pos = length($q1);
|
|
1049 my $maxPos = $pos;
|
|
1050 #if ($pos <= 0) {
|
|
1051 # $discarded++;
|
|
1052 # $failedstring .= $read . '|';
|
|
1053 # next;
|
|
1054 #}
|
|
1055 #my @qarray = unpack("C*",$q1);
|
|
1056
|
|
1057 while ( $pos>0 ) {
|
|
1058 if ($tq > $q1array[$pos] - 33) {
|
|
1059 # score at position is below threshold
|
|
1060 $pos--;
|
|
1061 next;
|
|
1062 }
|
|
1063 else {
|
|
1064 # score is at or above threshold
|
|
1065 $maxPos = $pos;
|
|
1066 last;
|
|
1067 }
|
|
1068 }
|
|
1069 if ($pos==0) { # scanned whole read and didn't integrate to zero? skip (no output for this read)
|
|
1070 $discarded++;
|
|
1071 $failedstring .= $read ;#. '|';
|
|
1072 next;
|
|
1073 }
|
|
1074 # integrated to zero? trim before position where quality was below threshold
|
|
1075 # unpack is one based
|
|
1076 $s1 = unpack("A$maxPos",$s1);
|
|
1077 $q1 = unpack("A$maxPos",$q1);
|
|
1078 #$s1 = substr($s1,0,$maxPos);
|
|
1079 #$q1 = substr($q1,0,$maxPos);
|
|
1080
|
|
1081 }
|
|
1082 #############
|
|
1083 ## trim 5' ##
|
|
1084 #############
|
|
1085 if ($trim5 == 1) {
|
|
1086 my $skip = 0; #length($q1);
|
|
1087 #my $maxPos = $skip;
|
|
1088 my @qarray = unpack("C*",$q1);
|
|
1089
|
|
1090 while ( $skip < length($q1) ) {
|
|
1091 if ($tq > $qarray[$skip] - 33) {
|
|
1092 # score at position is below threshold
|
|
1093 $skip++;
|
|
1094 next;
|
|
1095 }
|
|
1096 else {
|
|
1097 # score is at or above threshold
|
|
1098 #$maxPos = $skip;
|
|
1099 last;
|
|
1100 }
|
|
1101 }
|
|
1102 if ($skip == length($q1)) {
|
|
1103 $discarded++;
|
|
1104 $failedstring .= $read ;#. '|';
|
|
1105 next;
|
|
1106 }
|
|
1107 # integrated to zero? trim before position where quality was below threshold
|
|
1108 # unpack is one based
|
|
1109 #$s1 = substr($s1,$skip);
|
|
1110 #$q1 = substr($q1,$skip);
|
|
1111 $s1 = unpack("x$skip A*",$s1);
|
|
1112 $q1 = unpack("x$skip A*",$q1);
|
|
1113
|
|
1114
|
|
1115 }
|
|
1116 ## DISCARD IF TOO SHORT ##
|
|
1117 if (length($s1) < $minlength) {
|
|
1118 $discarded++;
|
|
1119 $failedstring .= $read ;#. '|';
|
|
1120 next;
|
|
1121
|
|
1122 }
|
|
1123
|
|
1124
|
|
1125 # queue for printing
|
|
1126 my @arr = ($h1a,$s1,$h1b,$q1);
|
|
1127 $printstring .= join("\n",@arr) . "\n"; # join components by newline character for printing
|
|
1128 }
|
|
1129 if ($printstring ne '') {
|
|
1130 $toprint->enqueue($printstring); # single reads => no | present, just \n
|
|
1131 }
|
|
1132 if ($failedstring ne '') {
|
|
1133 #$failed->enqueue(pack("A*X",$failedstring));
|
|
1134 $failed->enqueue($failedstring);
|
|
1135 }
|
|
1136
|
|
1137 }
|
|
1138 }
|
|
1139
|
|
1140 sub printout {
|
|
1141 if ($rand eq '' || !defined($rand)) {
|
|
1142 die('Random value not passed on!');
|
|
1143 }
|
|
1144 if ($paired == 0) {
|
|
1145 my $counter = 0;
|
|
1146 my $string1 = '';
|
|
1147 if (-e "/tmp/$rand.fq") {
|
|
1148 unlink("/tmp/$rand.fq");
|
|
1149 }
|
|
1150 while (defined(my $string = $toprint->dequeue())) { # each item represents 10000 input reads (might be less by trimming)
|
|
1151 #my @array = split(/\|/,$printstring);
|
|
1152 #$done = $done + scalar(@array);
|
|
1153 $done = $done + 10000;
|
|
1154 $string1 .= $string;
|
|
1155 $counter++;
|
|
1156 #print batches of 500,000 reads
|
|
1157 if ($counter == 50) {
|
|
1158 open OUT1, ">>/tmp/$rand.fq";
|
|
1159 print OUT1 $string1;
|
|
1160 close OUT1;
|
|
1161 $string1 = '';
|
|
1162 #$string2 = '';
|
|
1163 $counter = 0;
|
|
1164 }
|
|
1165 }
|
|
1166 ## print last
|
|
1167 open OUT1, ">>/tmp/$rand.fq";
|
|
1168 print OUT1 $string1;
|
|
1169 close OUT1;
|
|
1170 $finish = 1;
|
|
1171
|
|
1172 }
|
|
1173 else {
|
|
1174 my $counter = 0;
|
|
1175 my $string1 = '';
|
|
1176 my $string2 = '';
|
|
1177 if (-e "/tmp/$rand.fq") {
|
|
1178 unlink("/tmp/$rand.fq");
|
|
1179 }
|
|
1180 if (-e "/tmp/$rand.fq") {
|
|
1181 unlink("/tmp/$rand.rq");
|
|
1182 }
|
|
1183 while (defined(my $printstring = $toprint->dequeue())) { # each item represents 10000 input reads
|
|
1184 my @array = split(/\|/,$printstring);
|
|
1185 $done = $done + scalar(@array);
|
|
1186 foreach (@array) {
|
|
1187 my @subarr = split(/\~/,$_);
|
|
1188 $string1 .= join("\n",@subarr[0..3])."\n";
|
|
1189 $string2 .= join("\n",@subarr[4..7])."\n";
|
|
1190 }
|
|
1191 $counter++;
|
|
1192 if ($counter == 50) {
|
|
1193 #print "Printing!\n";
|
|
1194 flock("/tmp/sg.write.lock",2);
|
|
1195 open OUT1, ">>/tmp/$rand.fq";
|
|
1196 print OUT1 $string1;
|
|
1197 close OUT1;
|
|
1198 open OUT2, ">>/tmp/$rand.rq";
|
|
1199 print OUT2 $string2;
|
|
1200 close OUT2;
|
|
1201 flock("/tmp/sg.write.lock",8);
|
|
1202 $string1 = '';
|
|
1203 $string2 = '';
|
|
1204 $counter = 0;
|
|
1205 }
|
|
1206 }
|
|
1207 ## print last
|
|
1208 flock("/tmp/sg.write.lock",2);
|
|
1209 open OUT1, ">>/tmp/$rand.fq";
|
|
1210 print OUT1 $string1;
|
|
1211 close OUT1;
|
|
1212 open OUT2, ">>/tmp/$rand.rq";
|
|
1213 print OUT2 $string2;
|
|
1214 close OUT2;
|
|
1215 flock("/tmp/sg.write.lock",8);
|
|
1216 $finish = 1;
|
|
1217 }
|
|
1218
|
|
1219 }
|
|
1220
|
|
1221 sub printfailed {
|
|
1222 if (-e "/tmp/$rand.failed") {
|
|
1223 unlink("/tmp/$rand.failed");
|
|
1224 }
|
|
1225 #open FAIL, ">/tmp/$rand.failed" or die("could not open file with failed reads, '/tmp/$rand.failed'");
|
|
1226 #FAIL->autoflush(1);
|
|
1227 my $nr = 0;
|
|
1228 my $string = '';
|
|
1229 while ( defined(my $rstring = $failed->dequeue())) {
|
|
1230 #my @array = split(/\|/,$rstring);
|
|
1231 #foreach (@array) {
|
|
1232 $string .= $rstring;
|
|
1233 #}
|
|
1234 $nr++;
|
|
1235 if ($nr == 50) {
|
|
1236 open FAIL, ">>/tmp/$rand.failed";
|
|
1237 print FAIL $string;
|
|
1238 close FAIL;
|
|
1239 $nr = 0;
|
|
1240 $string = '';
|
|
1241 }
|
|
1242 #close FAIL;
|
|
1243 }
|
|
1244 open FAIL, ">>/tmp/$rand.failed";
|
|
1245 print FAIL $string;
|
|
1246 close FAIL;
|
|
1247 }
|
|
1248 sub monitor {
|
|
1249 # checks the file read monitor
|
|
1250 my $counter = 0;
|
|
1251 while ($finish == 0) {
|
|
1252 $counter++;
|
|
1253 if ($totrim->pending() > 200) {
|
|
1254 lock($dummy);
|
|
1255 #print "Putting the file reading to sleep for 10 seconds.\n";
|
|
1256 sleep 5;
|
|
1257 }
|
|
1258 else {
|
|
1259 sleep 5;
|
|
1260 }
|
|
1261 if ($counter == 6) {
|
|
1262 if ($verbose == 1) {
|
|
1263 my $string = ($done/1000000)."M Reads Passed, ".($discarded/1000000)."M reads discarded (".$totrim->pending().".10^4 pending)\n";
|
|
1264 print $string ;
|
|
1265 }
|
|
1266 $counter = 0;
|
|
1267 }
|
|
1268
|
|
1269 }
|
|
1270 }
|