Mercurial > repos > geert-vandeweyer > multiplicom_primer_trimming
comparison multiplicom_primer_trimming.pl @ 0:fadef644b886 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Fri, 22 May 2015 08:27:03 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fadef644b886 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ## needs twoBitToFa v285, or other versions supporting the -bed option. | |
4 | |
5 # load modules | |
6 use Getopt::Std; | |
7 | |
##########
## opts ##
##########
## input files
# i : input fastq 1
# I : input fastq 2 (mate file; required, see the checks below)
# b : bed file with amplicon positions
# r : read length (default 250)
## output files
# o : output fastq1
# O : output fastq2
# F : failed readpairs
# R : short report
## environment
# t : 2bit file location
# w : working directory (defaults to a new directory under /tmp)

# 'w:' has to be part of the spec. Without it getopts() treats -w as an
# unknown switch, never fills $opts{'w'} and stops parsing at the directory
# argument that follows it, silently dropping every later option.
getopts('i:I:b:r:o:O:F:R:t:w:', \%opts) ;
24 | |
# check input values
# Every problem is reported before any work is done, so a missing argument
# cannot waste a complete trimming run.
if (!exists($opts{'i'}) || !-e $opts{'i'}) {
    die('Fastq File not found');
}
if (!exists($opts{'I'}) || !-e $opts{'I'}) {
    die('FastQ for paired end reads not found');
}
if (!exists($opts{'o'})) {
    die('No output file specified for forward reads');
}
if (!exists($opts{'O'})) {
    die('No output file specified for reverse reads');
}
if (!exists($opts{'F'})) {
    die('No output file specified for failed pairs');
}
# The report is written unconditionally at the end of the script, so its
# destination is as mandatory as the other outputs.
if (!exists($opts{'R'})) {
    die('No output file specified for the report');
}

if (!exists($opts{'b'}) || !-e $opts{'b'}) {
    die('BED-File not found');
}

#if (exists($opts{'m'})) {
#   $minmap = $opts{'m'};
#}
#else {
#   $minmap = 3;
#}

# nominal read length: used to decide whether a read can run through the
# amplicon into the primer at its other end.
my $readl;
if (exists($opts{'r'})) {
    # a non-numeric value would silently evaluate to 0 in the comparisons
    if ($opts{'r'} !~ m/^\d+$/ || $opts{'r'} < 1) {
        die("Read length (-r) must be a positive integer\n");
    }
    $readl = $opts{'r'};
}
else {
    print "Assuming default read length of 2x250bp\n";
    $readl = 250;
}

# 2bit reference the primer sequences are extracted from
my $tobit;
if (exists($opts{'t'}) && -e $opts{'t'}) {
    $tobit = $opts{'t'};
}
else {
    die("2BIT reference not found");
}
65 | |
# Locate the twoBitToFa executable: prefer the one found on PATH, otherwise
# fall back to the fixed install location.
# stderr of `which` is discarded so a missing tool does not produce noise
# before the explicit error message below.
my $tbtf = `which twoBitToFa 2>/dev/null`;
chomp($tbtf) ;
# -x also rejects anything `which` may have printed that is not a usable path
if ($tbtf eq '' || !-x $tbtf) {
    if (-x "/opt/software/bin/twoBitToFa") {
        $tbtf = "/opt/software/bin/twoBitToFa";
    }
    else {
        die("Could not find a twoBitToFa executable.\n");
    }
}
76 | |
77 | |
# make output directory in (tmp) working dir
# $wd is the directory that receives the intermediate BED and FASTA files.
our $wd;
if (exists($opts{'w'}) && -d $opts{'w'}) {
    # an existing directory supplied by the caller is used as is
    $wd = $opts{'w'};
}
else {
    # File::Temp picks an unused name and creates the directory atomically
    # (it croaks on failure), which replaces the guessable "Trim.<0-999>"
    # name and the unchecked call to mkdir. CLEANUP removes the directory at
    # exit, also when the script dies half-way.
    require File::Temp;
    $wd = File::Temp::tempdir('Trim.XXXXXX', DIR => '/tmp', CLEANUP => 1);
}
#print "Using wd : $wd\n";
90 | |
91 | |
## build sequence hash.
# Reads the amplicon BED file and records, per amplicon "chr:start-end":
#   %alen : length of the amplicon without primers
#   %flen : length of the forward primer (key "F:chr:start-end")
#   %rlen : length of the reverse primer (key "R:chr:start-end")
# $minf / $minr end up as the shortest forward / reverse primer length.
# The primer intervals are written to bedfile.zero.bed and their sequences are
# extracted from the 2bit reference into amplicons.zero.fa.
my (%alen, %flen, %rlen);
my $minf = 999;
my $minr = 999;
open(my $bed_in, '<', $opts{'b'})
    or die("Could not open BED file ".$opts{'b'}.": $!\n");
open(my $bed_out, '>', "$wd/bedfile.zero.bed")
    or die("Could not write $wd/bedfile.zero.bed: $!\n");
my $namplicons = 0;
while (my $line = <$bed_in>) {
    # only lines starting with "chrN<TAB>position<TAB>" are amplicons
    if ($line !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) {
        next;
    }
    # drop the line end, including the \r of files written on Windows
    $line =~ s/\r?\n\z//;
    my @p = split(/\t/,$line);
    if (scalar(@p) < 8) {
        die("BED line $. has fewer than 8 columns, primer positions are missing\n");
    }
    my ($chr, $start, $end) = @p[0,1,2];
    my $first_np = $p[6]; # p6 holds first non-primer position
    my $last_np  = $p[7]; # p7 holds last non-primer position
    my $amp = "$chr:$start-$end";
    my $fl = $first_np - $start;
    my $rl = $end - $last_np;
    ## lengths
    $alen{$amp} = $last_np - $first_np + 1;
    $flen{"F:$amp"} = $fl;
    $rlen{"R:$amp"} = $rl;
    if ($fl < $minf) {
        $minf = $fl;
    }
    if ($rl < $minr) {
        $minr = $rl;
    }
    # forward primer interval, both coordinates shifted by one
    # (presumably 1-based input converted to the 0-based start the tool expects)
    print {$bed_out} "$chr\t".($start-1)."\t".($first_np-1)."\tF:$amp\n";
    # reverse primer interval
    print {$bed_out} "$chr\t".$last_np."\t".$end."\tR:$amp\n";
    $namplicons++;
}
close($bed_in);
close($bed_out) or die("Could not write $wd/bedfile.zero.bed: $!\n");
if ($namplicons == 0) {
    die("No amplicons found in BED file ".$opts{'b'}."\n");
}

# List-form system() bypasses the shell, so paths containing spaces or shell
# characters are safe. Absolute file arguments replace the former "cd $wd",
# which made a relative 2bit path resolve against the wrong directory.
system($tbtf, '-noMask', "-bed=$wd/bedfile.zero.bed", $tobit, "$wd/amplicons.zero.fa") == 0
    or die("twoBitToFa failed (exit status ".($? >> 8).")\n");
124 | |
# Load the primer sequences extracted by twoBitToFa.
#   %fseq / %rseq : primer name => sequence as it appears at the start of a
#                   read (reverse primers are reverse-complemented)
#   %fmm  / %rmm  : primer name => { 10-mer => primer sequence } for the first
#                   ten bases of the primer and all its single-base
#                   substitutions
my (%fseq, %rseq);
my (%fmm, %rmm);
my @nts = ('A','C','T','G');

open(my $fa_in, '<', "$wd/amplicons.zero.fa")
    or die("Could not open $wd/amplicons.zero.fa: $!\n");
# Collect complete records first: a FASTA writer may wrap one sequence over
# several lines, so "header line + one sequence line" cannot be relied on.
my %extracted = ();
my $current;
while (my $line = <$fa_in>) {
    $line =~ s/\r?\n\z//;
    next if ($line eq '');
    if ($line =~ m/^>(.*)/) {
        $current = $1;
        $extracted{$current} = '';
        next;
    }
    next if (!defined($current));
    $extracted{$current} .= $line;
}
close($fa_in);

foreach my $pr (keys(%extracted)) {
    my ($seq, $table);
    if (substr($pr,0,1) eq 'F') {
        $seq = $fseq{$pr} = $extracted{$pr};
        $table = \%fmm;
    }
    else {
        $seq = $rseq{$pr} = rc($extracted{$pr});
        $table = \%rmm;
    }
    # Both halves of each mutant come from the same primer. The forward
    # mutants used to take their tail from %rseq, which has no entry for a
    # forward primer, and therefore came out truncated.
    for (my $i = 0; $i < 10; $i++) {
        last if ($i >= length($seq));
        foreach my $nt (@nts) {
            my $mut = substr($seq,0,$i).$nt.substr($seq,$i+1,9-$i);
            $table->{$pr}{$mut} = $seq;
        }
    }
}
157 | |
###############################
## generate smallest overlap F##
###############################
# Find the smallest seed length $ntf (starting at the shortest forward primer)
# for which the first $ntf bases identify every forward primer.
#   %fmin   : seed => name of the primer it identifies
#   %fpairs : primer name => "|"-separated names of the reverse primers that
#             can accompany it
# Amplicons that share one physical forward primer (same chromosome, same
# start, same primer length) are expected to collide. They are represented by
# the first name seen, which keeps the values of %fmin usable as keys of
# %flen, %alen and %fpairs, and all their reverse primers are collected under
# that name.
my $ntf = $minf;
my $fmaxlen = 0;
foreach my $seq (values(%fseq)) {
    $fmaxlen = length($seq) if (length($seq) > $fmaxlen);
}
my (%fmin, %fpairs);
BUILDMIN: while (1) {
    %fmin = ();
    %fpairs = ();
    foreach my $primer (keys(%fseq)) {
        my $sub = substr($fseq{$primer},0,$ntf);
        my $rn = $primer;
        $rn =~ s/F:/R:/;
        if (!exists($fmin{$sub})) {
            $fmin{$sub} = $primer;
            $fpairs{$primer} = "$rn";
            next;
        }
        ## check if not identical (yes, this is possible...) (same start + same length.)
        my $first = $fmin{$sub};
        my ($cchr, $cstart) = ($primer =~ m/F:chr(.{1,2}):(\d+)-\d+/);
        my ($pchr, $pstart) = ($first =~ m/F:chr(.{1,2}):(\d+)-\d+/);
        my $cl = $flen{$primer};
        my $pl = $flen{$first};
        if ("$cchr" eq "$pchr" && "$cstart" eq "$pstart" && "$cl" eq "$pl") {
            $fpairs{$first} .= "|$rn";
            next;
        }
        ## clash => increase nt.
        # Once the seed covers the longest primer a longer seed cannot
        # separate the two any more: stop instead of looping forever.
        if ($ntf >= $fmaxlen) {
            die("Forward primers $first and $primer have identical sequences, cannot tell them apart\n");
        }
        $ntf++;
        next BUILDMIN;
    }
    last BUILDMIN;
}

print "Minimal number of nucleotides needed for forward: $ntf\n";
201 | |
## allow one mismatch.
# %mmhash maps every forward seed, and every seed with exactly one substituted
# base, to the unmodified seed (a key of %fmin).
#  - a variant that equals the exact seed of another primer belongs to that
#    primer;
#  - a variant that can be reached from more than one seed is ambiguous and is
#    left out. %ambiguous remembers these, so a third seed producing the same
#    variant cannot put it back.
my %mmhash =();
my %ambiguous = ();
foreach my $orig (keys(%fmin)) {
    $mmhash{$orig} = $orig;
}
foreach my $orig (keys(%fmin)) {
    for (my $i = 0; $i < length($orig); $i++) {
        foreach my $nt (@nts) {
            next if ($nt eq substr($orig,$i,1));
            my $mut = substr($orig,0,$i).$nt.substr($orig,$i+1);
            ## mut == original from oth primer => do not add reference.
            next if (exists($fmin{$mut}));
            next if (exists($ambiguous{$mut}));
            if (exists($mmhash{$mut})) {
                ## already claimed by another seed => clash after mutation.
                delete($mmhash{$mut});
                $ambiguous{$mut} = 1;
            }
            else {
                $mmhash{$mut} = $orig;
            }
        }
    }
}
230 | |
###############################
## generate smallest overlap R##
###############################
# Find the smallest seed length $ntr (starting at the shortest reverse primer)
# for which the first $ntr bases identify every reverse primer.
#   %rmin   : seed => name of the primer it identifies
#   %rpairs : primer name => "|"-separated names of the forward primers that
#             can accompany it
# Fixes relative to the earlier version of this section: the hashes were
# misspelled (%frairs, %Rlen, %Rmin), so shared primers were never recognised
# and every collision lengthened the seed without end. A reverse primer lies
# at the END of its amplicon, so two amplicons share it when chromosome,
# amplicon end and primer length agree (the start says nothing about it).
my $ntr = $minr;
my $rmaxlen = 0;
foreach my $seq (values(%rseq)) {
    $rmaxlen = length($seq) if (length($seq) > $rmaxlen);
}
my (%rmin, %rpairs);
BUILDMINR: while (1) {
    %rmin = ();
    %rpairs = ();
    foreach my $primer (keys(%rseq)) {
        my $sub = substr($rseq{$primer},0,$ntr);
        my $fn = $primer;
        $fn =~ s/R:/F:/;
        if (!exists($rmin{$sub})) {
            $rmin{$sub} = $primer;
            $rpairs{$primer} = "$fn";
            next;
        }
        ## check if not identical (yes, this is possible...) (same end + same length.)
        my $first = $rmin{$sub};
        my ($cchr, $cend) = ($primer =~ m/R:chr(.{1,2}):\d+-(\d+)/);
        my ($pchr, $pend) = ($first =~ m/R:chr(.{1,2}):\d+-(\d+)/);
        my $cl = $rlen{$primer};
        my $pl = $rlen{$first};
        if ("$cchr" eq "$pchr" && "$cend" eq "$pend" && "$cl" eq "$pl") {
            # represented by the first name seen, so that the values of %rmin
            # stay usable as keys of %rlen, %alen and %rpairs
            $rpairs{$first} .= "|$fn";
            next;
        }
        ## clash => increase nt.
        # Once the seed covers the longest primer a longer seed cannot
        # separate the two any more: stop instead of looping forever.
        if ($ntr >= $rmaxlen) {
            die("Reverse primers $first and $primer have identical sequences, cannot tell them apart\n");
        }
        $ntr++;
        next BUILDMINR;
    }
    last BUILDMINR;
}

print "Minimal number of nucleotides needed for reverse: $ntr\n";
274 | |
## allow one mismatch.
# %rmmhash maps every reverse seed, and every seed with exactly one
# substituted base, to the unmodified seed (a key of %rmin).
#  - a variant that equals the exact seed of another primer belongs to that
#    primer;
#  - a variant that can be reached from more than one seed is ambiguous and is
#    left out. %rambiguous remembers these, so a third seed producing the same
#    variant cannot put it back.
my %rmmhash =();
my %rambiguous = ();
foreach my $orig (keys(%rmin)) {
    $rmmhash{$orig} = $orig;
}
foreach my $orig (keys(%rmin)) {
    for (my $i = 0; $i < length($orig); $i++) {
        foreach my $nt (@nts) {
            next if ($nt eq substr($orig,$i,1));
            my $mut = substr($orig,0,$i).$nt.substr($orig,$i+1);
            ## mut == original from oth primer => do not add reference.
            next if (exists($rmin{$mut}));
            next if (exists($rambiguous{$mut}));
            if (exists($rmmhash{$mut})) {
                ## already claimed by another seed => clash after mutation.
                delete($rmmhash{$mut});
                $rambiguous{$mut} = 1;
            }
            else {
                $rmmhash{$mut} = $orig;
            }
        }
    }
}
303 | |
##########################
## process fastq files. ##
##########################
# Walks through both FASTQ files in parallel, one read pair at a time.
#  1. The first $ntf bases of the forward read are looked up in %mmhash; if
#     that fails, the first $ntr bases of the reverse read are looked up in
#     %rmmhash. Pairs matching neither go to the failed-pairs file
#     (interlaced: forward record followed by reverse record).
#  2. The primer of the other read is searched among the partners of the
#     primer found in step 1, using the first 10 bases of that read.
#  3. Both reads lose their 5' primer. A read whose nominal length ($readl)
#     exceeds primer + amplicon is also cut to the amplicon length, which
#     removes the primer at its 3' end (incorrect if indels !).
# Records are printed as they are produced: the filehandles are buffered
# already, and the former hand-made buffer put an empty line into the output
# after every 100000 pairs. Every record now ends with a newline.
open(my $in_fwd, '<', $opts{'i'})
    or die("Could not open forward reads ".$opts{'i'}.": $!\n");
open(my $in_rev, '<', $opts{'I'})
    or die("Could not open reverse reads ".$opts{'I'}.": $!\n");
open(my $out_fwd, '>', $opts{'o'})
    or die("Could not write forward reads to ".$opts{'o'}.": $!\n");
open(my $out_rev, '>', $opts{'O'})
    or die("Could not write reverse reads to ".$opts{'O'}.": $!\n");
open(my $out_fail, '>', $opts{'F'})
    or die("Could not write failed pairs to ".$opts{'F'}.": $!\n");

# statistics, read again when the report is written
my ($nrmissed, $nrfound) = (0, 0);
my ($foundboth, $byf, $byr, $toolongf, $toolongr) = (0, 0, 0, 0, 0);
my ($bptrimmed, $totalbp) = (0, 0);

# Trim one read in place (sequence and quality string alike).
#   $primer_len   : bases to remove from the 5' end
#   $amplicon_len : amplicon length without primers, undef if unknown
# Returns (number of bases actually removed, 1 if the read counted as longer
# than primer + amplicon, otherwise 0).
my $trim_read = sub {
    my ($seq_ref, $qual_ref, $primer_len, $amplicon_len) = @_;
    $primer_len = 0 if (!defined($primer_len) || $primer_len < 0);
    my $before = length($$seq_ref);
    my $too_long = 0;
    if (defined($amplicon_len) && $amplicon_len >= 0 && $readl > $primer_len + $amplicon_len) {
        $too_long = 1;
    }
    foreach my $ref ($seq_ref, $qual_ref) {
        # a read shorter than its primer is emptied, not cut outside the string
        $$ref = ($primer_len >= length($$ref)) ? '' : substr($$ref, $primer_len);
        $$ref = substr($$ref, 0, $amplicon_len) if ($too_long);
    }
    # counting what was really removed keeps the total from going down when a
    # read is shorter than the amplicon
    return ($before - length($$seq_ref), $too_long);
};

# Search the primer of the second read of a pair.
#   $candidates : "|"-separated primer names (entry of %fpairs or %rpairs)
#   $mm_table   : \%fmm or \%rmm
#   $len_of     : \%flen or \%rlen
#   $read_start : first 10 bases of the read
# Returns (name of the matching primer or undef, longest primer length among
# the candidates that did not match; used for trimming when nothing matches).
my $find_mate = sub {
    my ($candidates, $mm_table, $len_of, $read_start) = @_;
    my $longest = 0;
    $candidates = '' if (!defined($candidates));
    foreach my $cand (split(/\|/, $candidates)) {
        next if ($cand eq '');
        if (exists($mm_table->{$cand}) && exists($mm_table->{$cand}{$read_start})) {
            return ($cand, $longest);
        }
        my $len = $len_of->{$cand};
        $longest = $len if (defined($len) && $len > $longest);
    }
    return (undef, $longest);
};

while (my $r1 = <$in_fwd>) {
    next if ($r1 !~ m/\S/); # tolerate empty lines, e.g. at the end of the file
    # read in forward
    my $s1 = <$in_fwd>;
    my $d1 = <$in_fwd>;
    my $q1 = <$in_fwd>;
    # read in reverse
    my $r2 = <$in_rev>;
    my $s2 = <$in_rev>;
    my $d2 = <$in_rev>;
    my $q2 = <$in_rev>;
    if (!defined($q1) || !defined($q2)) {
        die("Truncated FASTQ record or unequal number of reads, at forward read $r1");
    }
    chomp($s1, $q1, $s2, $q2);
    $totalbp += length($s1) + length($s2);
    ## the seeds : first positions of reads, length is determined above.
    my $fseed = substr($s1,0,$ntf);
    my $rseed = substr($s2,0,$ntr);

    my ($fprimer, $rprimer, $fallback, $lead);
    if (exists($mmhash{$fseed})) {
        ## seed found in the "mutated" forward primer list
        # only the first name is used in case several were joined with "|"
        ($fprimer) = split(/\|/, $fmin{$mmhash{$fseed}});
        ($rprimer, $fallback) = $find_mate->($fpairs{$fprimer}, \%rmm, \%rlen, substr($s2,0,10));
        $lead = $fprimer;
        $byf++;
    }
    elsif (exists($rmmhash{$rseed})) {
        ## not found : the reverse primers do know this pair.
        ($rprimer) = split(/\|/, $rmin{$rmmhash{$rseed}});
        ($fprimer, $fallback) = $find_mate->($rpairs{$rprimer}, \%fmm, \%flen, substr($s1,0,10));
        $lead = $rprimer;
        $byr++;
    }
    else {
        # not found either : print out to failed reads in interlaced format
        $nrmissed++;
        print {$out_fail} "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n";
        next;
    }

    # Amplicon the pair belongs to. The primer of the second read names it
    # exactly, which matters when amplicons share the primer found first;
    # otherwise the amplicon of the primer found first is used. This replaces
    # the reverse-seed lookups that were used for pairs found by their forward
    # seed, where the reverse seed is not known to be valid.
    my $mate;
    if (defined($fprimer) && defined($rprimer)) {
        $foundboth++;
        $mate = ($lead eq $fprimer) ? $rprimer : $fprimer;
    }
    my $amplicon = substr(defined($mate) ? $mate : $lead, 2);

    ## trim forward; without a match : by max length of should be forwards.
    my ($cut, $long) = $trim_read->(\$s1, \$q1,
        defined($fprimer) ? $flen{$fprimer} : $fallback, $alen{$amplicon});
    $bptrimmed += $cut;
    $toolongf += $long;
    ## trim reverse; without a match : by max length of should be reverses.
    ($cut, $long) = $trim_read->(\$s2, \$q2,
        defined($rprimer) ? $rlen{$rprimer} : $fallback, $alen{$amplicon});
    $bptrimmed += $cut;
    $toolongr += $long;
    $nrfound++;

    print {$out_fwd} "$r1$s1\n+\n$q1\n";
    print {$out_rev} "$r2$s2\n+\n$q2\n";
}
close($in_fwd);
close($in_rev);
# errors of buffered writes (disk full, ...) only surface when closing
close($out_fwd) or die("Could not write forward reads to ".$opts{'o'}.": $!\n");
close($out_rev) or die("Could not write reverse reads to ".$opts{'O'}.": $!\n");
close($out_fail) or die("Could not write failed pairs to ".$opts{'F'}.": $!\n");
# Write the short report with the counters collected while trimming.
if (!defined($opts{'R'}) || $opts{'R'} eq '') {
    die ("Could not open report file");
}
open(my $report, '>', $opts{'R'}) or die ("Could not open report file");
# without any input bases there is nothing to divide by
my $pcttrimmed = ($totalbp > 0) ? ($bptrimmed/$totalbp)*100 : 0;
print {$report} "Results: \n";
print {$report} "########\n";
print {$report} " Read pairs without match: $nrmissed\n";
print {$report} " Read pairs with a valid match: $nrfound\n";
print {$report} " Initial match on Forward: $byf\n";
print {$report} " Initial match on Reverse: $byr\n";
print {$report} " Both F and R Matched: $foundboth\n";
print {$report} " Forward reads trimmed to amplicon length: $toolongf\n";
print {$report} " Reverse reads trimmed to amplicon length: $toolongr\n";
print {$report} " Total basepairs in fastq files: $totalbp\n";
print {$report} " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",$pcttrimmed)."%)\n";
close($report) or die("Could not write report file: $!\n");
519 | |
520 | |
521 | |
522 | |
523 | |
524 | |
# Remove the working directory, but only when this script created it: the
# test is the exact opposite of the one that accepts a directory passed with
# -w, so a temporary directory made because -w was unusable is removed too.
if (!(exists($opts{'w'}) && -d $opts{'w'})) {
    ## clean up
    if (defined($wd) && $wd ne '' && -d $wd) {
        # list form: the path is handed to rm as one argument, not to a shell
        system('rm', '-Rf', $wd);
    }
}
529 | |
530 | |
# Return the reverse complement of a DNA sequence.
#   $seq : string of nucleotides. A, C, G and T are complemented in upper and
#          in lower case; any other character (e.g. N) is kept unchanged.
# Returns the complemented string in reversed order ('' for an undefined
# argument).
sub rc {
    my ($seq) = @_;
    return '' if (!defined($seq));
    $seq =~ tr/ACGTacgt/TGCAtgca/;
    # scalar() forces string reversal: in list context reverse() would return
    # the one-element argument list unchanged
    return scalar(reverse($seq));
}