annotate multiplicom_primer_trimming.pl @ 0:fadef644b886 draft

Uploaded
author geert-vandeweyer
date Fri, 22 May 2015 08:27:03 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
1 #!/usr/bin/perl
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
2
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
3 ## needs twoBitToFa v285, or other versions supporting the -bed option.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
4
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
5 # load modules
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
6 use Getopt::Std;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
7
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
8 ##########
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
9 ## opts ##
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
10 ##########
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
11 ## input files
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
12 # i : input fastq 1
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
13 # I : input fastq 2 (if paired)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
14 # b : bed file with amplicon positions
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
15 # r : read length (default 250)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
16 # o : output fastq1
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
17 # O : output fastq2
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
18 # F : failed readpairs
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
19 # R : short report
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
20 # t : 2bit file location
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
21 # w : working directory (defaults to tmp)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
22
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
23 getopts('i:I:b:r:o:O:F:R:t:', \%opts) ;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
24
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
25 # check input values
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
26 if (!exists($opts{'i'}) || !-e $opts{'i'}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
27 die('Fastq File not found');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
28 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
29 if (!exists($opts{'I'}) || !-e $opts{'I'}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
30 die('FastQ for paired end reads not found');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
31 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
32 if (!exists($opts{'o'})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
33 die('No output file specified for forward reads');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
34 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
35 if (!exists($opts{'O'})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
36 die('No output file specified for reverse reads');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
37 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
38 if (!exists($opts{'F'})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
39 die('No output file specified for failed pairs');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
40 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
41
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
42 if (!exists($opts{'b'}) || !-e $opts{'b'}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
43 die('BED-File not found');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
44 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
45
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
46 #if (exists($opts{'m'})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
47 # $minmap = $opts{'m'};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
48 #}
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
49 #else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
50 # $minmap = 3;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
51 #}
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
52 if (exists($opts{'r'})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
53 $readl = $opts{'r'};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
54 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
55 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
56 print "Assuming default read length of 2x250bp\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
57 $readl = 250;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
58 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
59 if (exists($opts{'t'}) && -e $opts{'t'}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
60 $tobit = $opts{'t'};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
61 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
62 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
63 die("2BIT reference not found");
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
64 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
65
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
66 my $tbtf = `which twoBitToFa`;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
67 chomp($tbtf) ;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
68 if ($tbtf eq '') {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
69 if (-e "/opt/software/bin/twoBitToFa") {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
70 $tbtf = "/opt/software/bin/twoBitToFa";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
71 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
72 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
73 die("Could not find a twoBitToFa executable.\n");
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
74 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
75 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
76
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
77
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
78 # make output directory in (tmp) working dir
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
79 if (exists($opts{'w'}) && -d $opts{'w'}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
80 our $wd = $opts{'w'};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
81 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
82 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
83 our $wd = "/tmp/Trim.".int(rand(1000));
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
84 while (-d $wd) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
85 $wd = "/tmp/Trim.".int(rand(1000));
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
86 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
87 system("mkdir $wd");
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
88 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
89 #print "Using wd : $wd\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
90
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
91
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
92 ## build sequence hash.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
93 my %alen = %flen = %rlen = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
94 open BED, $opts{'b'};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
95 open OUT, ">$wd/bedfile.zero.bed";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
96 my $minf = 999;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
97 my $minr = 999;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
98 while (<BED>) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
99 if ($_ !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
100 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
101 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
102 chomp($_);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
103 my @p = split(/\t/,$_);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
104 my $fl = $p[6] - $p[1]; # p6 holds first non-primer position
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
105 my $rl = $p[2] - $p[7]; # p7 hold last non-primer position
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
106 ## lengths
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
107 $alen{"$p[0]:$p[1]-$p[2]"} = $p[7] - $p[6] + 1;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
108 $flen{"F:$p[0]:$p[1]-$p[2]"} = $fl;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
109 if ($fl < $minf) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
110 $minf = $fl;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
111 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
112 if ($rl < $minr) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
113 $minr = $rl;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
114 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
115 $rlen{"R:$p[0]:$p[1]-$p[2]"} = $rl;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
116 print OUT "$p[0]\t".($p[1]-1)."\t".($p[6]-1)."\tF:$p[0]:$p[1]-$p[2]\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
117 print OUT "$p[0]\t".$p[7]."\t".$p[2]."\tR:$p[0]:$p[1]-$p[2]\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
118
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
119 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
120 close BED;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
121 close OUT;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
122
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
123 system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa");
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
124
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
125 open IN, "$wd/amplicons.zero.fa";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
126 my %fseq = %rseq = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
127 my %rmm = %fmm = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
128 my @nts = ('A','C','T','G');
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
129
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
130 while(<IN>) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
131 my $pr = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
132 my $seq = <IN>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
133 chomp($pr);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
134 chomp($seq);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
135 $pr = substr($pr,1);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
136 if (substr($pr,0,1) eq 'F') {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
137 $fseq{$pr} = $seq;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
138 for ($i = 0; $i < 10; $i++) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
139 foreach(@nts) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
140 my $mut = substr($fseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
141 $fmm{$pr}{$mut} = $fseq{$pr};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
142 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
143 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
144 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
145 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
146 $rseq{$pr} = rc($seq);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
147 for ($i = 0; $i< 10;$i++){
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
148 foreach(@nts) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
149 my $mut = substr($rseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
150 $rmm{$pr}{$mut} = $rseq{$pr};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
151 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
152 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
153
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
154 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
155 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
156 close IN;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
157
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
158 ###############################
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
159 ## generate smallest overlap F##
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
160 ###############################
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
161 $ntf = $minf;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
162 BUILDMIN:
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
163 my %fmin = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
164 my %fpairs = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
165 foreach( keys(%fseq)) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
166 my $sub = substr($fseq{$_},0,$ntf);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
167 ## clash => increase nt.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
168 if (exists($fmin{$sub})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
169 ## check if not identical (yes, this is possible...) (same start + same length.)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
170 $_ =~ m/F:chr(.{1,2}):(\d+)-(\d+)/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
171 my $cchr = $1;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
172 my $cstart = $2;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
173 my $cl = $flen{$_};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
174 my @prev = split(/\|/,$fmin{$sub});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
175 my $pprim = $prev[0];
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
176 $pprim =~ m/F:chr(.{1,2}):(\d+)-(\d+)/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
177 my $pchr = $1;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
178 my $pstart = $2;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
179 my $pl = $flen{$pprim};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
180 if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
181 $ntf++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
182 goto BUILDMIN;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
183 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
184 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
185 $fmin{$sub} .= "|$_";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
186 my $rn = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
187 $rn =~ s/F:/R:/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
188 $fpairs{$_} .= "|$rn";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
189 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
190 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
191 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
192 $fmin{$sub} = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
193 my $rn = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
194 $rn =~ s/F:/R:/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
195 $fpairs{$_} = "$rn";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
196
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
197 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
198 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
199
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
200 print "Minimal number of nucleotides needed for forward: $ntf\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
201
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
202 ## allow one mismatch.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
203 my %mmhash =();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
204 foreach (keys(%fmin)) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
205 my $orig = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
206 for ($i = 0; $i< length($orig);$i++){
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
207 foreach(@nts) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
208 if ($_ eq substr($orig,$i,1)) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
209 $mmhash{$orig} = $orig;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
210 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
211 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
212 my $mut = substr($orig,0,$i).$_.substr($orig,$i+1);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
213 ## if in mmhash && not in $fmin => clash after mutation => delete from mmhash.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
214 if (exists($mmhash{$mut}) ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
215 if (!exists($fmin{$mut})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
216 delete($mmhash{$mut});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
217 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
218 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
219 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
220 ## mut == original from oth primer => do not add reference.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
221 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
222 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
223 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
224 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
225 $mmhash{$mut} = $orig;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
226 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
227 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
228 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
229 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
230
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
231 ###############################
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
232 ## generate smallest overlap R##
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
233 ###############################
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
234 $ntr = $minr;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
235 BUILDMINR:
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
236 my %rmin = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
237 my %frairs = ();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
238 foreach( keys(%rseq)) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
239 my $sub = substr($rseq{$_},0,$ntr);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
240 ## clash => increase nt.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
241 if (exists($rmin{$sub})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
242 ## check if not identical (yes, this is possible...) (same start + same length.)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
243 $_ =~ m/R:chr(.{1,2}):(\d+)-(\d+)/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
244 my $cchr = $1;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
245 my $cstart = $2;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
246 my $cl = $rlen{$_};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
247 my @prev = split(/\|/,$rmin{$sub});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
248 my $pprim = $prev[0];
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
249 $pprim =~ m/R:chr(.{1,2}):(\d+)-(\d+)/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
250 my $pchr = $1;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
251 my $pstart = $2;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
252 my $pl = $Rlen{$pprim};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
253 if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
254 $ntr++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
255 goto BUILDMINR;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
256 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
257 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
258 $Rmin{$sub} .= "|$_";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
259 my $fn = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
260 $fn =~ s/R:/F:/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
261 $rpairs{$_} .= "|$fn";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
262 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
263 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
264 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
265 $rmin{$sub} = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
266 my $fn = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
267 $fn =~ s/R:/F:/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
268 $rpairs{$_} = "$fn";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
269
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
270 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
271 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
272
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
273 print "Minimal number of nucleotides needed for reverse: $ntr\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
274
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
275 ## allow one mismatch.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
276 my %rmmhash =();
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
277 foreach (keys(%rmin)) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
278 my $orig = $_;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
279 for ($i = 0; $i< length($orig);$i++){
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
280 foreach(@nts) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
281 if ($_ eq substr($orig,$i,1)) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
282 $rmmhash{$orig} = $orig;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
283 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
284 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
285 my $mut = substr($orig,0,$i).$_.substr($orig,$i+1);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
286 ## if in mmhash && not in $fmin => clash after mutation => delete from mmhash.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
287 if (exists($rmmhash{$mut}) ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
288 if (!exists($rmin{$mut})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
289 delete($rmmhash{$mut});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
290 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
291 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
292 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
293 ## mut == original from oth primer => do not add reference.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
294 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
295 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
296 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
297 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
298 $rmmhash{$mut} = $orig;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
299 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
300 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
301 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
302 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
303
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
304 ##########################
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
305 ## process fastq files. ##
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
306 ##########################
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
307 open IN, $opts{'i'}; # open forward reads
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
308 open INR, $opts{'I'}; # open reverse reads
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
309 open OUTF, ">".$opts{'o'}; # where to put trimmed forward reads
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
310 open OUTR, ">".$opts{'O'}; ## where to put trimmed Reverse reads
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
311 open FAIL, ">$opts{'F'}"; #$wd/failed.interlaced.fq"; # keep non-matches to seperate file => for debug?
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
312 my $outf = $outr = $failout =''; # buffer output to speedup I/O
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
313 my $count = $failcount = 0 ; # track output buffer
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
314 my $nrmissed = $nrfound = 0; # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
315 my $foundboth = $byf = $byr = $toolongf = $toolongr = 0; # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
316 my $bptrimmed = $totalbp = 0; # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
317 while (my $r1 = <IN>) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
318 # read in forward
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
319 my $s1 = <IN>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
320 chomp($s1);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
321 $totalbp += length($s1);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
322 my $d = <IN>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
323 my $q1 = <IN>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
324 chomp($q1);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
325 # read in reverse
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
326 my $r2 = <INR>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
327 my $s2 = <INR>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
328 chomp($s2);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
329 $totalbp += length($s2);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
330 my $d = <INR>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
331 my $q2 = <INR>;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
332 chomp($q2);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
333 ## the seeds : first positions of reads, lenght is determined above.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
334 my $fseed = substr($s1,0,$ntf);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
335 my $rseed = substr($s2,0,$ntr);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
336 ## check if the seed exists in the "mutated" forward primer list
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
337 if (!exists($mmhash{$fseed})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
338 ## not found : try the reverse primers.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
339 if (!exists($rmmhash{$rseed})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
340 # not found either : print out to failed reads in interlaced format
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
341 $nrmissed++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
342 $failout .= "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
343 $failcount++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
344 if ($failcount > 50000) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
345 chomp($failout);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
346 print FAIL $failout;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
347 $failout = "\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
348 $failcount = 0;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
349 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
350 next;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
351 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
352 ## trim reverse 5'
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
353 $s2 = substr($s2,$rlen{$rmin{$rmmhash{$rseed}}}); # from position *length of reverse primer* : rseed points to the original primer in the rmm hash
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
354 $q2 = substr($q2,$rlen{$rmin{$rmmhash{$rseed}}});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
355 ## statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
356 $bptrimmed += $rlen{$rmin{$rmmhash{$rseed}}};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
357 ## trim if readlength > amplicon size (without primers)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
358 if ($readl > $rlen{$rmin{$rmmhash{$rseed}}} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
359 # trim to alength (incorrect if indels !)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
360 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
361 $toolongr++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
362 $bptrimmed += length($s2) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
363 ## 5' primer is already trimmed. This removes fragments of 3' primer.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
364 $s2 = substr($s2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
365 $q2 = substr($q2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
366 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
367 $byr++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
368 ## trim forward 5'
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
369 my @fps = split(/\|/,$rpairs{$rmin{$rmmhash{$rseed}}});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
370 my $forok = 0;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
371 my $forl = 0;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
372 foreach(@fps) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
373 if (exists($fmm{$_}{substr($s1,0,10)})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
374 $s1 = substr($s1,$flen{$_});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
375 $q1 = substr($q1,$flen{$_});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
376 #statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
377 $bptrimmed += $flen{$_};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
378 if ($readl > $flen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
379 # trim to alength (incorrect if indels !)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
380 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
381 $toolongf++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
382 $bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
383
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
384 $s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
385 $q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
386 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
387
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
388 $forok++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
389 $foundboth++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
390 last;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
391 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
392 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
393 if ($forl < $flen{$_}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
394 $forl = $flen{$_};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
395 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
396 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
397 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
398 if ($forok == 0) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
399 ## trim by max length of should be forwards.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
400 $s1 = substr($s1,$forl);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
401 $q1 = substr($q1,$forl);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
402 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
403 $bptrimmed += $forl;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
404 if ($readl > $forl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
405 # trim to alength (incorrect if indels !)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
406 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
407 $toolongf++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
408 $bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
409 $s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
410 $q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
411 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
412
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
413 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
414 $nrfound++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
415 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
416 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
417 ## trim forward 5'
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
418 $s1 = substr($s1,$flen{$fmin{$mmhash{$fseed}}});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
419 $q1 = substr($q1,$flen{$fmin{$mmhash{$fseed}}});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
420 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
421 $bptrimmed += $flen{$fmin{$mmhash{$fseed}}};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
422
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
423 if ($readl > $flen{$fmin{$mmhash{$fseed}}} + $alen{substr($fmin{$mmhash{$fseed}},2)}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
424 # trim to alength (incorrect if indels !)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
425 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
426 $toolongf++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
427 $bptrimmed += length($s1) - $alen{substr($fmin{$mmhash{$fseed}},2)};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
428
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
429 $s1 = substr($s1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
430 $q1 = substr($q1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
431 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
432 $byf++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
433 ## trim reverse 5'
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
434 my @rps = split(/\|/,$fpairs{$fmin{$mmhash{$fseed}}});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
435 $revok = 0;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
436 my $revl = 0;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
437 foreach(@rps) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
438 if (exists($rmm{$_}{substr($s2,0,10)})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
439 $s2 = substr($s2,$rlen{$_});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
440 $q2 = substr($q2,$rlen{$_});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
441 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
442 $bptrimmed += $rlen{$_};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
443 if ($readl > $rlen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
444 # trim to alength (incorrect if indels !)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
445 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
446 $toolongr++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
447 $bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
448
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
449 $s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
450 $q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
451 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
452
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
453 $revok++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
454 $foundboth++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
455 last;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
456 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
457 else {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
458 if ($revl < $rlen{$_}) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
459 $revl = $rlen{$_};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
460 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
461 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
462 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
463 if ($revok == 0) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
464 # trim by max length of should be reverses.
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
465 $s2 = substr($s2,$revl);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
466 $q2 = substr($q2,$revl);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
467 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
468 $bptrimmed += $revl;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
469 if ($readl > $revl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
470 # trim to alength (incorrect if indels !)
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
471 # statistics
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
472 $toolongr++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
473 $bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
474 $s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
475 $q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
476 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
477
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
478 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
479 $nrfound++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
480 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
481 $outf .= "$r1$s1\n+\n$q1\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
482 $outr .= "$r2$s2\n+\n$q2\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
483 $count++;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
484 if ($count > 100000) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
485 print OUTF $outf;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
486 print OUTR $outr;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
487
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
488 $outf = "\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
489 $outr = "\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
490 $count = 0;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
491 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
492
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
493
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
494 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
495 chomp($outf);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
496 chomp($outr);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
497 chomp($failout);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
498 print OUTF $outf;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
499 print OUTR $outr;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
500 print FAIL $failout;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
501 close IN;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
502 close INR;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
503 close OUTF;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
504 close OUTR;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
505 close FAIL;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
506 open REPORT, ">$opts{'R'}" or die ("Could not open report file");
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
507 print REPORT "Results: \n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
508 print REPORT "########\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
509 print REPORT " Read pairs without match: $nrmissed\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
510 print REPORT " Read pairs with a valid match: $nrfound\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
511 print REPORT " Initial match on Forward: $byf\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
512 print REPORT " Initial match on Reverse: $byr\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
513 print REPORT " Both F and R Matched: $foundboth\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
514 print REPORT " Forward reads trimmed to amplicon length: $toolongf\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
515 print REPORT " Reverse reads trimmed to amplicon length: $toolongr\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
516 print REPORT " Total basepairs in fastq files: $totalbp\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
517 print REPORT " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",($bptrimmed/$totalbp)*100)."%)\n";
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
518 close REPORT;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
519
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
520
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
521
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
522
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
523
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
524
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
525 if (!exists($opts{'w'})) {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
526 ## clean up
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
527 system("rm -Rf $wd");
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
528 }
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
529
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
530
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
531 sub rc {
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
532 my $seq = shift;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
533 $seq =~ tr/ACGT/TGCA/;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
534 $seq = reverse($seq);
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
535 return $seq;
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
536
fadef644b886 Uploaded
geert-vandeweyer
parents:
diff changeset
537 }