0
|
1 #!/usr/bin/perl
|
|
2 #Name : 1_Filter_by_truncation.pl
|
|
3 #Author : Morgan, Matthew
|
|
4 #Created : 02/2013
|
|
5 #Modified : 08/2013
|
|
6 #Purpose : Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences
|
|
7 #Syntax : perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length]
|
|
8 #Further info : Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
|
|
9 #Acknowledgment : Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/
|
|
10 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.
|
|
11
|
|
12 #########################################################################################################################################################
|
|
13 # #
|
|
14 #CSIRO Open Source Software License Agreement (GPLv3) #
|
|
15 # #
|
|
16 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. #
|
|
17 # #
|
|
18 #All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3 #
|
|
19 # as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material. #
|
|
20 #The following additional terms apply under clause 7 of that license: #
|
|
21 # #
|
|
22 #EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS #
|
|
23 #CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, #
|
|
24 #WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, #
|
|
25 #NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. #
|
|
26 # #
|
|
27 #TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT #
|
|
28 #LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED. #
|
|
29 #WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, #
|
|
30 #DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; #
|
|
31 #OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE #
|
|
32 #OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF #
|
|
33 #SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. #
|
|
34 # #
|
|
35 #APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS #
|
|
36 #OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR #
|
|
37 #MODIFIED "CONSUMER GUARANTEES". IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL #
|
|
38 #EXTENT PERMITTED BY THE APPLICABLE LEGISLATION. WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF #
|
|
39 #THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM: #
|
|
40 #1. THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; #
|
|
41 #2. THE REPAIR OF THE SOFTWARE; #
|
|
42 #3. THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, #
|
|
43 # OR HAVING THE SOFTWARE REPAIRED. #
|
|
44 # #
|
|
45 #########################################################################################################################################################
|
|
46
|
|
47 use warnings;
|
|
48 use strict;
|
|
49
|
|
50 my $rawdata = $ARGV[0];
|
|
51 my (%FORMID, %REVMID);
|
|
52
|
|
53 #Store FMIDs and RMIDs
|
|
54
|
|
55 if ($ARGV[2]){my $midfile = $ARGV[2];
|
|
56 open (INFILE, "<$midfile")||die("Can't open MID file");
|
|
57 while (<INFILE>){
|
|
58 s/\r?\n//;
|
|
59 chomp $_;
|
|
60 my @parts = split(/\t/, $_);
|
|
61 my $midname = $parts[0];
|
|
62 my $midsequence = $parts[1];
|
|
63 $FORMID{$midname} = $midsequence;
|
|
64 print "$midname\t$midsequence\n";
|
|
65 }
|
|
66 }
|
|
67 else {
|
|
68 $FORMID{'NOFMID'} = ();
|
|
69 }
|
|
70 close (INFILE);
|
|
71
|
|
72 if ($ARGV[3]) {my $rmidfile = $ARGV[3];
|
|
73 open (RMIDS, "<$rmidfile")||die("Can't open RMID file");
|
|
74 while (<RMIDS>){
|
|
75 s/\r?\n//;
|
|
76 chomp;
|
|
77 my @parts = split(/\t/, $_);
|
|
78 my $midname = $parts[0];
|
|
79 my $midsequence = $parts[1];
|
|
80 $REVMID{$midname} = $midsequence;
|
|
81 print "$midname\t$midsequence\n";
|
|
82 }
|
|
83 }
|
|
84 else {
|
|
85 $REVMID{'NORMID'} = ();
|
|
86 }
|
|
87 my $n;
|
|
88 #read truncation length
|
|
89 if ($ARGV[4]){
|
|
90 $n = $ARGV[4];
|
|
91 }
|
|
92 close (IN);
|
|
93
|
|
94 #add gasket file
|
|
95 my @garray;
|
|
96 if ($ARGV[1]){my $gasketin = $ARGV[1];
|
|
97 open (G, "<$gasketin")||die("Can't open gasket file");
|
|
98 while (<G>){
|
|
99 chomp;
|
|
100 push @garray, $_;
|
|
101 }
|
|
102 }
|
|
103 else {
|
|
104 push @garray, "NOGASKET";
|
|
105 }
|
|
106
|
|
107 #open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence.
|
|
108
|
|
109 #convert fasta to tabular format
|
|
110 my $count = 0;
|
|
111 open (IN, "<$rawdata") || die ("Can't open sequence data");
|
|
112 open OUT, ">Sequence_data.txt";
|
|
113 while (<IN>) {
|
|
114 s/\r?\n//;
|
|
115 s/\t/ /g;
|
|
116 if (s/^>//) {
|
|
117 if ($. != 1) {
|
|
118 print OUT "\n"
|
|
119 }
|
|
120 $count++;
|
|
121 s/ |$/\t/;
|
|
122 $_ .= "\t";
|
|
123 }
|
|
124 else {
|
|
125 s/ //g;
|
|
126 }
|
|
127 print OUT $_;
|
|
128 }
|
|
129 print "Converted $count raw sequences in fasta format to tab-delimited text\n\n";
|
|
130 close (IN);
|
|
131 close (OUT);
|
|
132
|
|
133 # read sequence name, sequence, assign to MID group, remove MID and primer sequence.
|
|
134
|
|
135 my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep);
|
|
136 my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0;
|
|
137 open (INFILE, "<Sequence_data.txt") || die ("Can't open infile");
|
|
138 open OUT, ">Read_status.txt";
|
|
139 open OUT1, ">Read_status_summary.txt";
|
|
140 open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt";
|
|
141 open OUTFILE2, ">Filtered_unique_sequences.txt";
|
|
142 open READS, ">Reads_by_sequence.txt";
|
|
143 while (<INFILE>){
|
|
144 my (%fmid, %rmid, $f, $r);
|
|
145 my $fmid_count = 0;
|
|
146 my $rmid_count = 0;
|
|
147 chomp $_;
|
|
148 my @nameparts = split(/\t/, $_);
|
|
149 $seqname = $nameparts[0];
|
|
150 $sequence = $nameparts[2];
|
|
151 if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";}
|
|
152 foreach my $formidname (sort keys %FORMID)
|
|
153 {
|
|
154 my $formidsequence = $FORMID{$formidname};
|
|
155 if ( $sequence =~ /$formidsequence/ ) {
|
|
156 $fmid_count++;
|
|
157 $fmid{$formidname}=1;
|
|
158 }
|
|
159 else {
|
|
160 $fmid{$formidname}=0;
|
|
161 }
|
|
162 }
|
|
163 if ($fmid_count>1){
|
|
164 $multiple_fmids++;
|
|
165 print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n";
|
|
166 }
|
|
167 elsif ($fmid_count==0){
|
|
168 $no_mids++;
|
|
169 print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n";
|
|
170 }
|
|
171 elsif ($fmid_count==1){
|
|
172 for my $formid (keys %fmid){
|
|
173 if ($fmid{$formid}==1){
|
|
174 $f=$formid;
|
|
175 }
|
|
176 }
|
|
177 my $partition = substr( $seqname, 0, 9 );
|
|
178 my $comb = $partition . $f;
|
|
179 my $formidsequence = $FORMID{$f};
|
|
180 $accepted++;
|
|
181 print OUT "$seqname\tACCEPTED\t$comb\n";
|
|
182 $sequence =~ s/(.*)$formidsequence//;
|
|
183 if (length($sequence)<$n){
|
|
184 $tooshort++;
|
|
185 next;
|
|
186 }
|
|
187 else {
|
|
188 my $fragment=substr($sequence, 0, $n);
|
|
189 $sequence=$fragment;
|
|
190 }
|
|
191 #want count of how many times seen this sequence with these mids
|
|
192 if (defined($hash{$sequence}{$comb})){
|
|
193 $hash{$sequence}{$comb} ++;
|
|
194 }
|
|
195 else {
|
|
196 $hash{$sequence}{$comb} = 1;
|
|
197 }
|
|
198 #get total read count for sequence
|
|
199 if (defined($totalreads{$sequence})){
|
|
200 $totalreads{$sequence} ++;
|
|
201 }
|
|
202 else {
|
|
203 $totalreads{$sequence} = 1;
|
|
204 }
|
|
205 if (defined($rep{$sequence})){
|
|
206 my @rep_array = @{$rep{$sequence}};
|
|
207 push @rep_array, $seqname;
|
|
208 $rep{$sequence} = \@rep_array;
|
|
209 }
|
|
210 else {
|
|
211 my @new_rep;
|
|
212 push @new_rep, $seqname;
|
|
213 $rep{$sequence} = \@new_rep;
|
|
214 }
|
|
215
|
|
216 }
|
|
217
|
|
218 }#closes while loop
|
|
219 close (OUT);
|
|
220 my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids);
|
|
221 print OUT1 "Reads accepted:\t$accepted\n\n";
|
|
222 print OUT1 "Reads rejected:\t$rej\n\n";
|
|
223 print OUT1 "\tneither MID:\t$no_mids\n\n";
|
|
224 print OUT1 "\tno forward MID:\t$no_fmid\n\n";
|
|
225 print OUT1 "\tno reverse MID:\t$no_rmid\n\n";
|
|
226 print OUT1 "\tunused mid combination:\t$unused_mids\n\n";
|
|
227 print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n";
|
|
228 print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n";
|
|
229 close (OUT1);
|
|
230 #enters zeroes for all unobserved MID combinations in submitted txt file
|
|
231
|
|
232 foreach my $gasket (@garray) {
|
|
233 foreach my $formidname (sort keys %FORMID) {
|
|
234 my $comb = $gasket . $formidname;
|
|
235 foreach $sequence (sort keys %hash) {
|
|
236 if (defined( $hash{$sequence}{$comb} )) {
|
|
237 next;
|
|
238 }
|
|
239
|
|
240 else {
|
|
241 $hash{$sequence}{$comb} = 0;
|
|
242 }
|
|
243 }
|
|
244
|
|
245 }
|
|
246 }
|
|
247
|
|
248 #print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads
|
|
249
|
|
250 my $seqcount;
|
|
251 foreach my $sequence (sort keys %hash) {
|
|
252 my $independenttotal = $totalreads{$sequence};
|
|
253 my $l = length($sequence);
|
|
254 $seqcount++;
|
|
255 my $mids = $hash{$sequence};
|
|
256 local $" = "\t";
|
|
257 my @sorted_keys = sort keys %$mids;
|
|
258 my @values = @$mids{@sorted_keys};
|
|
259
|
|
260 my @rep_reads = sort(@{$rep{$sequence}});
|
|
261 my $check_total = scalar(@rep_reads);
|
|
262 if ($check_total != $independenttotal){
|
|
263 print "Oops: read totals don't match for $sequence\n\n";
|
|
264 }
|
|
265 my $rep_read = $rep_reads[0];
|
|
266
|
|
267 if ( $seqcount == 1 ) {
|
|
268 print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n";
|
|
269 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
|
|
270 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
|
|
271 }
|
|
272 else {
|
|
273 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
|
|
274 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
|
|
275 }
|
|
276 print READS "$sequence\t$check_total\t@rep_reads\n";
|
|
277 }
|
|
278
|
|
279 close (INFILE);
|
|
280 close (OUTFILE1);
|
|
281
|
|
282 open (IN, "<Reads_with_forward_and_reverse_MIDs.txt")||die("can't open Reads by MID file");
|
|
283 open OUT, ">Rejected_reads.txt";
|
|
284 while (<IN>){
|
|
285 chomp;
|
|
286 my @t = split(/\t/,$_);
|
|
287 if ($_ =~ /^name/){
|
|
288 print OUT "$_\n";
|
|
289 print OUTFILE2 "$_\n";
|
|
290 }
|
|
291 elsif ($_ =~ /UNUSED_MIDS/){
|
|
292 print OUT "$_\n";
|
|
293 }
|
|
294 elsif ($t[3]=~/N/){
|
|
295 print OUT "$_\tAMBIGUOUS_BASE_CALLS\n";
|
|
296 }
|
|
297 elsif (length($t[3])<=40){
|
|
298 print OUT "$_\tTOO_SHORT\n";
|
|
299 }
|
|
300 elsif ($t[4] == 1){
|
|
301 print OUT "$_\tSINGLETON\n";
|
|
302 }
|
|
303 else {
|
|
304 print OUTFILE2 "$_\n";
|
|
305 }
|
|
306 }
|
|
307 close (IN);
|
|
308 close (OUT);
|
|
309 close (OUTFILE2);
|
|
310 my (@F, $s);
|
|
311
|
|
312 #tab back to fasta
|
|
313
|
|
314 open IN, "<Reads_with_forward_and_reverse_MIDs.txt";
|
|
315 open OUT, ">Reads_with_forward_and_reverse_MIDs.fasta";
|
|
316 while (<IN>) {
|
|
317 unless ($_ =~ /^name/){
|
|
318 chomp;
|
|
319 my @A = split(/\t/,$_);
|
|
320 my $name = $A[0];
|
|
321 my $sequence = $A[1];
|
|
322 print OUT ">$name\n";
|
|
323 print OUT "$sequence\n";
|
|
324 }
|
|
325 }
|
|
326 print "Converted $. sequences from MID-filtered reads to FASTA format\n\n";
|
|
327 close (IN);
|
|
328 close (OUT);
|
|
329
|
|
330 open IN, "<Filtered_unique_sequences.txt";
|
|
331 open OUT, ">Filtered_unique_sequences.fasta";
|
|
332 while (<IN>) {
|
|
333 unless ($_ =~ /^name/){
|
|
334 chomp;
|
|
335 my @A = split(/\t/,$_);
|
|
336 my $name = $A[0];
|
|
337 my $sequence = $A[3];
|
|
338 print OUT ">$name\n";
|
|
339 print OUT "$sequence\n";
|
|
340 }
|
|
341 }
|
|
342 print "Converted $. sequences from valid reads to FASTA format\n\n";
|
|
343 close (IN);
|
|
344 close (OUT);
|
|
345 print "Reads rejected as $n: $tooshort\n";
|
|
346 print time - $^T . "\n";
|