comparison multiplicom_primer_trimming.pl @ 0:fadef644b886 draft

Uploaded
author geert-vandeweyer
date Fri, 22 May 2015 08:27:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:fadef644b886
1 #!/usr/bin/perl
2
3 ## needs twoBitToFa v285, or other versions supporting the -bed option.
4
5 # load modules
6 use Getopt::Std;
7
##########
## opts ##
##########
## input files
# i : input fastq 1 (forward reads)
# I : input fastq 2 (reverse reads of the pair)
# b : bed file with amplicon positions
# r : read length (default 250)
# o : output fastq1 (trimmed forward reads)
# O : output fastq2 (trimmed reverse reads)
# F : failed readpairs (interlaced fastq)
# R : short report
# t : 2bit file location
# w : working directory (defaults to a fresh directory under /tmp)

# Every option takes a value. 'w:' has to be part of the specification,
# otherwise -w is rejected as an unknown switch and $opts{'w'} is never set.
getopts('i:I:b:r:o:O:F:R:t:w:', \%opts) ;
24
# check input values
# All options are validated before any work is done, so that a broken
# command line fails immediately instead of after the reads were processed.
if (!exists($opts{'i'}) || !-e $opts{'i'}) {
    die('Fastq File not found');
}
if (!exists($opts{'I'}) || !-e $opts{'I'}) {
    die('FastQ for paired end reads not found');
}
if (!exists($opts{'o'})) {
    die('No output file specified for forward reads');
}
if (!exists($opts{'O'})) {
    die('No output file specified for reverse reads');
}
if (!exists($opts{'F'})) {
    die('No output file specified for failed pairs');
}
# The report is opened for writing at the very end of the run; without this
# check a missing -R is only noticed once all trimming has been done.
if (!exists($opts{'R'})) {
    die('No output file specified for the report');
}

if (!exists($opts{'b'}) || !-e $opts{'b'}) {
    die('BED-File not found');
}

#if (exists($opts{'m'})) {
#	$minmap = $opts{'m'};
#}
#else {
#	$minmap = 3;
#}

# $readl : sequenced read length, used to decide whether a read runs past
# the end of its amplicon.
if (exists($opts{'r'})) {
    $readl = $opts{'r'};
}
else {
    print "Assuming default read length of 2x250bp\n";
    $readl = 250;
}

# $tobit : path of the 2bit reference the primer sequences are extracted from.
if (exists($opts{'t'}) && -e $opts{'t'}) {
    $tobit = $opts{'t'};
}
else {
    die("2BIT reference not found");
}
65
# Locate the twoBitToFa binary: first through the PATH (via `which`), then
# at the fixed fallback location. Abort when neither yields an executable.
chomp(my $tbtf = `which twoBitToFa`);
if ($tbtf eq '') {
    $tbtf = "/opt/software/bin/twoBitToFa";
    die("Could not find a twoBitToFa executable.\n") unless (-e $tbtf);
}
76
77
# make output directory in (tmp) working dir
# $wd is either the existing directory given with -w, or a newly created
# private directory under /tmp.
our $wd;
if (exists($opts{'w'}) && -d $opts{'w'}) {
    $wd = $opts{'w'};
}
else {
    # mkdir itself is the existence test: it fails when the name is already
    # taken, so there is no window between "check" and "create" in which
    # another process can claim the same directory.
    my $created = 0;
    for my $attempt (1 .. 100) {
        $wd = "/tmp/Trim." . $$ . "." . int(rand(1000000));
        if (mkdir($wd, 0700)) {
            $created = 1;
            last;
        }
        my $err = $!;
        # a name clash is retried with a new name, anything else is fatal
        die("Could not create working directory $wd: $err\n") unless (-e $wd);
    }
    die("Could not find an unused working directory name in /tmp\n") unless ($created);
}
#print "Using wd : $wd\n";
90
91
## build sequence hash.
# Reads the amplicon BED file and writes "$wd/bedfile.zero.bed", holding one
# forward and one reverse primer interval per amplicon.
#   %alen : length of the amplicon without primers, keyed "chr:start-end"
#   %flen : forward primer length, keyed "F:chr:start-end"
#   %rlen : reverse primer length, keyed "R:chr:start-end"
#   $minf / $minr : shortest forward / reverse primer seen (start value 999)
my (%alen, %flen, %rlen);
open(my $bed_in, '<', $opts{'b'})
    or die("Could not open BED file $opts{'b'}: $!\n");
open(my $bed_out, '>', "$wd/bedfile.zero.bed")
    or die("Could not write $wd/bedfile.zero.bed: $!\n");
my $minf = 999;
my $minr = 999;
while (my $line = <$bed_in>) {
    # only lines that start with "chr", 1-2 further characters and a numeric start
    if ($line !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) {
        next;
    }
    chomp($line);
    my @p = split(/\t/, $line);
    # columns 7 and 8 are needed below; without them every length is garbage
    if (scalar(@p) < 8) {
        warn("Skipping BED line with less than 8 columns: $line\n");
        next;
    }
    my $amplicon = "$p[0]:$p[1]-$p[2]";
    my $fl = $p[6] - $p[1]; # p6 holds first non-primer position
    my $rl = $p[2] - $p[7]; # p7 holds last non-primer position
    ## lengths
    $alen{$amplicon}     = $p[7] - $p[6] + 1;
    $flen{"F:$amplicon"} = $fl;
    $rlen{"R:$amplicon"} = $rl;
    if ($fl < $minf) {
        $minf = $fl;
    }
    if ($rl < $minr) {
        $minr = $rl;
    }
    ## primer intervals, named after the primer they describe
    print $bed_out "$p[0]\t".($p[1]-1)."\t".($p[6]-1)."\tF:$amplicon\n";
    print $bed_out "$p[0]\t".$p[7]."\t".$p[2]."\tR:$amplicon\n";
}
close($bed_in);
close($bed_out) or die("Could not finish writing $wd/bedfile.zero.bed: $!\n");
122
# Extract the primer sequences for the intervals in bedfile.zero.bed.
# A failing twoBitToFa would leave the primer tables empty and silently send
# every read pair to the failed file, so its exit status is checked.
system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa") == 0
    or die("twoBitToFa failed (exit status ".($? >> 8).")\n");

# Load the primers:
#   %fseq / %rseq : primer sequence per primer name; reverse primers are
#                   stored reverse-complemented
#   %fmm / %rmm   : per primer name, every 10-mer that equals the first 10
#                   bases of the primer with one position replaced by A, C,
#                   T or G (the unchanged 10-mer included); the value is the
#                   full primer sequence
# NOTE(review): a record is read as one header line plus one sequence line;
# confirm that twoBitToFa never wraps the primer sequences over more lines.
open(my $fasta, '<', "$wd/amplicons.zero.fa")
    or die("Could not open $wd/amplicons.zero.fa: $!\n");
my (%fseq, %rseq, %fmm, %rmm);
my @nts = ('A','C','T','G');

while (my $header = <$fasta>) {
    my $seq = <$fasta>;
    chomp($header);
    chomp($seq);
    my $pr = substr($header,1);    # drop the leading '>'
    my $is_forward = (substr($pr,0,1) eq 'F');
    my $primer = $is_forward ? $seq : rc($seq);
    my ($seq_of, $mm_of) = $is_forward ? (\%fseq, \%fmm) : (\%rseq, \%rmm);
    $seq_of->{$pr} = $primer;
    for my $pos (0 .. 9) {
        foreach my $nt (@nts) {
            # prefix and suffix both come from this primer itself; the forward
            # table used to take its suffix from the (empty) reverse entry
            my $mut = substr($primer,0,$pos).$nt.substr($primer,$pos+1,9-$pos);
            $mm_of->{$pr}{$mut} = $primer;
        }
    }
}
close($fasta);
157
###############################
## generate smallest overlap F##
###############################
# Find the shortest prefix length $ntf (starting at the shortest forward
# primer) for which every forward primer has a unique prefix.
#   %fmin   : prefix => forward primer name
#   %fpairs : forward primer name => reverse primer name(s), joined by '|'
# Amplicons that share the very same forward primer (same chromosome, start
# and primer length) are folded into the first primer seen: %fmin keeps that
# single name, so it can still be used as a key in %flen, %alen and %fpairs,
# and the extra reverse primer is appended to that primer's %fpairs entry.
$ntf = $minf;
my %fmin = ();
my %fpairs = ();
BUILDMIN: while (1) {
    # every attempt starts from empty tables
    %fmin = ();
    %fpairs = ();
    foreach my $primer (keys(%fseq)) {
        my $sub = substr($fseq{$primer},0,$ntf);
        my $rn = $primer;
        $rn =~ s/F:/R:/;
        if (!exists($fmin{$sub})) {
            $fmin{$sub} = $primer;
            $fpairs{$primer} = "$rn";
            next;
        }
        ## clash: check if not identical (yes, this is possible...) (same start + same length.)
        my $pprim = $fmin{$sub};
        my ($cchr, $cstart) = ($primer =~ m/F:chr(.{1,2}):(\d+)-(\d+)/);
        my ($pchr, $pstart) = ($pprim =~ m/F:chr(.{1,2}):(\d+)-(\d+)/);
        my $cl = $flen{$primer};
        my $pl = $flen{$pprim};
        if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
            ## different primers => increase nt.
            # A longer prefix only helps while one of the two sequences still
            # has bases left; otherwise this would loop forever.
            if ($ntf >= length($fseq{$primer}) && $ntf >= length($fseq{$pprim})) {
                die("Forward primers $primer and $pprim have identical sequences, no unique seed possible\n");
            }
            $ntf++;
            next BUILDMIN;
        }
        ## identical primer: remember the additional reverse mate.
        $fpairs{$pprim} .= "|$rn";
    }
    last BUILDMIN;
}

print "Minimal number of nucleotides needed for forward: $ntf\n";
201
## allow one mismatch.
# %mmhash maps a read seed onto the forward seed (key of %fmin) it stands
# for: every seed maps onto itself, every single-base variant onto the seed
# it was derived from. A variant that was already claimed and is not a seed
# itself is removed again, a variant that is a seed of another primer is
# left untouched.
# NOTE(review): a removed variant can be claimed again by a later seed;
# confirm whether variants shared by three or more seeds matter in practice.
my %mmhash = ();
for my $seed (keys(%fmin)) {
    my $last_pos = length($seed) - 1;
    for my $pos (0 .. $last_pos) {
        my $seed_nt = substr($seed, $pos, 1);
        for my $nt (@nts) {
            # the unchanged base: register the seed itself
            if ($nt eq $seed_nt) {
                $mmhash{$seed} = $seed;
                next;
            }
            my $variant = substr($seed, 0, $pos) . $nt . substr($seed, $pos + 1);
            # first time this variant shows up: it belongs to this seed
            unless (exists($mmhash{$variant})) {
                $mmhash{$variant} = $seed;
                next;
            }
            # seen before and not a seed itself => ambiguous, drop it
            delete($mmhash{$variant}) unless (exists($fmin{$variant}));
        }
    }
}
230
###############################
## generate smallest overlap R##
###############################
# Find the shortest prefix length $ntr (starting at the shortest reverse
# primer) for which every reverse primer has a unique prefix.
#   %rmin   : prefix => reverse primer name
#   %rpairs : reverse primer name => forward primer name(s), joined by '|'
# The reverse primer is taken from the end of the amplicon, so two entries
# describe the same primer when chromosome, amplicon end and primer length
# agree. Such entries are folded into the first primer seen: %rmin keeps that
# single name, so it can still be used as a key in %rlen, %alen and %rpairs,
# and the extra forward primer is appended to that primer's %rpairs entry.
$ntr = $minr;
my %rmin = ();
my %rpairs = ();
BUILDMINR: while (1) {
    # every attempt starts from empty tables
    %rmin = ();
    %rpairs = ();
    foreach my $primer (keys(%rseq)) {
        my $sub = substr($rseq{$primer},0,$ntr);
        my $fn = $primer;
        $fn =~ s/R:/F:/;
        if (!exists($rmin{$sub})) {
            $rmin{$sub} = $primer;
            $rpairs{$primer} = "$fn";
            next;
        }
        ## clash: check if not identical (same end + same length.)
        my $pprim = $rmin{$sub};
        my ($cchr, undef, $cend) = ($primer =~ m/R:chr(.{1,2}):(\d+)-(\d+)/);
        my ($pchr, undef, $pend) = ($pprim =~ m/R:chr(.{1,2}):(\d+)-(\d+)/);
        my $cl = $rlen{$primer};
        my $pl = $rlen{$pprim};
        if ("$cchr" ne "$pchr" || "$cend" ne "$pend" || "$cl" ne "$pl") {
            ## different primers => increase nt.
            # A longer prefix only helps while one of the two sequences still
            # has bases left; otherwise this would loop forever.
            if ($ntr >= length($rseq{$primer}) && $ntr >= length($rseq{$pprim})) {
                die("Reverse primers $primer and $pprim have identical sequences, no unique seed possible\n");
            }
            $ntr++;
            next BUILDMINR;
        }
        ## identical primer: remember the additional forward mate.
        $rpairs{$pprim} .= "|$fn";
    }
    last BUILDMINR;
}

print "Minimal number of nucleotides needed for reverse: $ntr\n";
274
## allow one mismatch.
# %rmmhash maps a read seed onto the reverse seed (key of %rmin) it stands
# for: every seed maps onto itself, every single-base variant onto the seed
# it was derived from. A variant that was already claimed and is not a seed
# itself is removed again, a variant that is a seed of another primer is
# left untouched.
# NOTE(review): a removed variant can be claimed again by a later seed;
# confirm whether variants shared by three or more seeds matter in practice.
my %rmmhash = ();
for my $seed (keys(%rmin)) {
    my $last_pos = length($seed) - 1;
    for my $pos (0 .. $last_pos) {
        my $seed_nt = substr($seed, $pos, 1);
        for my $nt (@nts) {
            # the unchanged base: register the seed itself
            if ($nt eq $seed_nt) {
                $rmmhash{$seed} = $seed;
                next;
            }
            my $variant = substr($seed, 0, $pos) . $nt . substr($seed, $pos + 1);
            # first time this variant shows up: it belongs to this seed
            unless (exists($rmmhash{$variant})) {
                $rmmhash{$variant} = $seed;
                next;
            }
            # seen before and not a seed itself => ambiguous, drop it
            delete($rmmhash{$variant}) unless (exists($rmin{$variant}));
        }
    }
}
303
##########################
## process fastq files. ##
##########################
# Walks through both fastq files in parallel, four lines per read. A pair is
# anchored on the forward seed if that is known, otherwise on the reverse
# seed; pairs matching neither go to the failed file in interlaced format.
# For an anchored pair the primer is cut from the 5' end of both reads, and
# a read that is longer than primer + amplicon is cut down to the amplicon
# length (incorrect if indels !). The mate is identified by comparing its
# first 10 bases with the one-mismatch tables of the partner primers; if no
# partner matches, the longest partner primer length is removed.
open(my $fq_fwd, '<', $opts{'i'}) or die("Could not open forward reads $opts{'i'}: $!\n");
open(my $fq_rev, '<', $opts{'I'}) or die("Could not open reverse reads $opts{'I'}: $!\n");
open(my $out_fwd, '>', $opts{'o'}) or die("Could not write forward reads to $opts{'o'}: $!\n");
open(my $out_rev, '>', $opts{'O'}) or die("Could not write reverse reads to $opts{'O'}: $!\n");
open(my $out_fail, '>', $opts{'F'}) or die("Could not write failed pairs to $opts{'F'}: $!\n"); # keep non-matches in a separate file => for debug?
my ($outf, $outr, $failout) = ('', '', '');    # buffer output to speedup I/O
my ($count, $failcount) = (0, 0);              # track output buffer
my ($nrmissed, $nrfound) = (0, 0);             # statistics
my ($foundboth, $byf, $byr, $toolongf, $toolongr) = (0, 0, 0, 0, 0); # statistics
my ($bptrimmed, $totalbp) = (0, 0);            # statistics
while (my $r1 = <$fq_fwd>) {
    # read in forward
    my $s1 = <$fq_fwd>;
    chomp($s1);
    $totalbp += length($s1);
    my $plus1 = <$fq_fwd>;    # separator line, always written back as "+"
    my $q1 = <$fq_fwd>;
    chomp($q1);
    # read in reverse
    my $r2 = <$fq_rev>;
    # both files must hold the same reads in the same order
    die("Reverse fastq ends before forward fastq at read $r1") unless (defined($r2));
    my $s2 = <$fq_rev>;
    chomp($s2);
    $totalbp += length($s2);
    my $plus2 = <$fq_rev>;
    my $q2 = <$fq_rev>;
    chomp($q2);
    ## the seeds : first positions of reads, length is determined above.
    my $fseed = substr($s1,0,$ntf);
    my $rseed = substr($s2,0,$ntr);
    my $fknown = exists($mmhash{$fseed});
    my $rknown = exists($rmmhash{$rseed});

    if (!$fknown && !$rknown) {
        # neither seed found : print out to failed reads in interlaced format
        $nrmissed++;
        $failout .= "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n";
        $failcount++;
        if ($failcount > 50000) {
            chomp($failout);
            print $out_fail $failout;
            $failout = "\n";
            $failcount = 0;
        }
        next;
    }

    my ($removed, $clipped);
    if ($fknown) {
        ## anchored on the forward read: mmhash points to the seed, fmin to the primer
        my $fprim  = $fmin{$mmhash{$fseed}};
        my $insert = $alen{substr($fprim,2)};    # amplicon key is the primer name without "F:"
        ($s1, $q1, $removed, $clipped) = _trim_read($s1, $q1, $flen{$fprim}, $insert, $readl);
        $bptrimmed += $removed;
        $toolongf  += $clipped;
        $byf++;
        ## trim reverse 5'
        my $matelen = 0;
        my $revok = 0;
        foreach my $rprim (split(/\|/,$fpairs{$fprim})) {
            if (exists($rmm{$rprim}{substr($s2,0,10)})) {
                $matelen = $rlen{$rprim};
                $revok = 1;
                last;
            }
            # no match: fall back to the longest of the should-be reverses
            if ($matelen < $rlen{$rprim}) {
                $matelen = $rlen{$rprim};
            }
        }
        $foundboth++ if ($revok);
        # The insert length belongs to the amplicon of the anchoring forward
        # primer; the reverse seed may be unknown in this branch and must
        # not be used to look it up.
        ($s2, $q2, $removed, $clipped) = _trim_read($s2, $q2, $matelen, $insert, $readl);
        $bptrimmed += $removed;
        $toolongr  += $clipped;
    }
    else {
        ## anchored on the reverse read: rmmhash points to the seed, rmin to the primer
        my $rprim  = $rmin{$rmmhash{$rseed}};
        my $insert = $alen{substr($rprim,2)};    # amplicon key is the primer name without "R:"
        ($s2, $q2, $removed, $clipped) = _trim_read($s2, $q2, $rlen{$rprim}, $insert, $readl);
        $bptrimmed += $removed;
        $toolongr  += $clipped;
        $byr++;
        ## trim forward 5'
        my $matelen = 0;
        my $forok = 0;
        foreach my $fprim (split(/\|/,$rpairs{$rprim})) {
            if (exists($fmm{$fprim}{substr($s1,0,10)})) {
                $matelen = $flen{$fprim};
                $forok = 1;
                last;
            }
            # no match: fall back to the longest of the should-be forwards
            if ($matelen < $flen{$fprim}) {
                $matelen = $flen{$fprim};
            }
        }
        $foundboth++ if ($forok);
        ($s1, $q1, $removed, $clipped) = _trim_read($s1, $q1, $matelen, $insert, $readl);
        $bptrimmed += $removed;
        $toolongf  += $clipped;
    }
    $nrfound++;

    $outf .= "$r1$s1\n+\n$q1\n";
    $outr .= "$r2$s2\n+\n$q2\n";
    $count++;
    if ($count > 100000) {
        # The last newline of a flushed buffer is held back and becomes the
        # first character of the next buffer. Writing it here as well puts
        # an empty line between two fastq records.
        chomp($outf);
        chomp($outr);
        print $out_fwd $outf;
        print $out_rev $outr;

        $outf = "\n";
        $outr = "\n";
        $count = 0;
    }
}
# flush what is left; the files end without a trailing newline
chomp($outf);
chomp($outr);
chomp($failout);
print $out_fwd $outf;
print $out_rev $outr;
print $out_fail $failout;
close($fq_fwd);
close($fq_rev);
close($out_fwd) or die("Could not finish writing $opts{'o'}: $!\n");
close($out_rev) or die("Could not finish writing $opts{'O'}: $!\n");
close($out_fail) or die("Could not finish writing $opts{'F'}: $!\n");

# Cut the primer off one read and clip the read to the amplicon.
#   $seq, $qual  : bases and qualities of the read
#   $primer_len  : number of 5' positions to remove
#   $insert_len  : amplicon length without primers
#   $read_len    : sequenced read length
# Returns (trimmed bases, trimmed qualities, number of positions counted as
# trimmed, 1 if the read was clipped to the amplicon length and 0 otherwise).
sub _trim_read {
    my ($seq, $qual, $primer_len, $insert_len, $read_len) = @_;
    $seq  = substr($seq,  $primer_len);
    $qual = substr($qual, $primer_len);
    my $removed = $primer_len;
    my $clipped = 0;
    ## trim if readlength > amplicon size: this removes fragments of the 3' primer.
    if ($read_len > $primer_len + $insert_len) {
        $clipped = 1;
        $removed += length($seq) - $insert_len;
        $seq  = substr($seq,  0, $insert_len);
        $qual = substr($qual, 0, $insert_len);
    }
    return ($seq, $qual, $removed, $clipped);
}
# Write the short report with the counters collected while trimming.
open(my $report, '>', $opts{'R'}) or die ("Could not open report file");
# Without any input base there is nothing to divide by: report 0.00 percent
# instead of dying on a division by zero.
my $pct_trimmed = sprintf("%.2f", ($totalbp > 0 ? ($bptrimmed/$totalbp)*100 : 0));
print $report "Results: \n";
print $report "########\n";
print $report " Read pairs without match: $nrmissed\n";
print $report " Read pairs with a valid match: $nrfound\n";
print $report " Initial match on Forward: $byf\n";
print $report " Initial match on Reverse: $byr\n";
print $report " Both F and R Matched: $foundboth\n";
print $report " Forward reads trimmed to amplicon length: $toolongf\n";
print $report " Reverse reads trimmed to amplicon length: $toolongr\n";
print $report " Total basepairs in fastq files: $totalbp\n";
print $report " Total basepairs trimmed: $bptrimmed (".$pct_trimmed."%)\n";
close($report) or die("Could not finish writing the report file: $!\n");
519
520
521
522
523
524
## clean up
# The working directory is only removed when it is not the directory the
# user handed over with -w. A -w value that was not a directory makes the
# script fall back to a temporary directory, which has to go as well.
# The list form of system() hands $wd to rm as one argument, without a shell.
if (defined($wd) && $wd ne '' && (!exists($opts{'w'}) || $wd ne $opts{'w'})) {
    system('rm', '-Rf', $wd);
}
529
530
# Reverse complement of a sequence: the order of the characters is inverted
# and the upper-case bases A, C, G and T are swapped for their complement.
# All other characters are kept as they are.
sub rc {
    my ($strand) = @_;
    my $flipped = scalar reverse($strand);
    $flipped =~ tr/ACGT/TGCA/;
    return $flipped;
}