comparison 1_Filter_by_truncation.pl @ 0:c9680c78151b draft default tip

Uploaded
author caitlin-potter
date Mon, 23 May 2016 11:05:49 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c9680c78151b
1 #!/usr/bin/perl
2 #Name : 1_Filter_by_truncation.pl
3 #Author : Morgan, Matthew
4 #Created : 02/2013
5 #Modified : 08/2013
6 #Purpose : Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences
7 #Syntax : perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length]
8 #Further info : Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
9 #Acknowledgment : Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/
10 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.
11
12 #########################################################################################################################################################
13 # #
14 #CSIRO Open Source Software License Agreement (GPLv3) #
15 # #
16 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. #
17 # #
18 #All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3 #
19 # as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material. #
20 #The following additional terms apply under clause 7 of that license: #
21 # #
22 #EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS #
23 #CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, #
24 #WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, #
25 #NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. #
26 # #
27 #TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT #
28 #LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED. #
29 #WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, #
30 #DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; #
31 #OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE #
32 #OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF #
33 #SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. #
34 # #
35 #APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS #
36 #OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR #
37 #MODIFIED "CONSUMER GUARANTEES". IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL #
38 #EXTENT PERMITTED BY THE APPLICABLE LEGISLATION. WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF #
39 #THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM: #
40 #1. THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; #
41 #2. THE REPAIR OF THE SOFTWARE; #
42 #3. THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, #
43 # OR HAVING THE SOFTWARE REPAIRED. #
44 # #
45 #########################################################################################################################################################
46
47 use warnings;
48 use strict;
49
50 my $rawdata = $ARGV[0];
51 my (%FORMID, %REVMID);
52
53 #Store FMIDs and RMIDs
54
55 if ($ARGV[2]){my $midfile = $ARGV[2];
56 open (INFILE, "<$midfile")||die("Can't open MID file");
57 while (<INFILE>){
58 s/\r?\n//;
59 chomp $_;
60 my @parts = split(/\t/, $_);
61 my $midname = $parts[0];
62 my $midsequence = $parts[1];
63 $FORMID{$midname} = $midsequence;
64 print "$midname\t$midsequence\n";
65 }
66 }
67 else {
68 $FORMID{'NOFMID'} = ();
69 }
70 close (INFILE);
71
72 if ($ARGV[3]) {my $rmidfile = $ARGV[3];
73 open (RMIDS, "<$rmidfile")||die("Can't open RMID file");
74 while (<RMIDS>){
75 s/\r?\n//;
76 chomp;
77 my @parts = split(/\t/, $_);
78 my $midname = $parts[0];
79 my $midsequence = $parts[1];
80 $REVMID{$midname} = $midsequence;
81 print "$midname\t$midsequence\n";
82 }
83 }
84 else {
85 $REVMID{'NORMID'} = ();
86 }
87 my $n;
88 #read truncation length
89 if ($ARGV[4]){
90 $n = $ARGV[4];
91 }
92 close (IN);
93
94 #add gasket file
95 my @garray;
96 if ($ARGV[1]){my $gasketin = $ARGV[1];
97 open (G, "<$gasketin")||die("Can't open gasket file");
98 while (<G>){
99 chomp;
100 push @garray, $_;
101 }
102 }
103 else {
104 push @garray, "NOGASKET";
105 }
106
107 #open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence.
108
109 #convert fasta to tabular format
110 my $count = 0;
111 open (IN, "<$rawdata") || die ("Can't open sequence data");
112 open OUT, ">Sequence_data.txt";
113 while (<IN>) {
114 s/\r?\n//;
115 s/\t/ /g;
116 if (s/^>//) {
117 if ($. != 1) {
118 print OUT "\n"
119 }
120 $count++;
121 s/ |$/\t/;
122 $_ .= "\t";
123 }
124 else {
125 s/ //g;
126 }
127 print OUT $_;
128 }
129 print "Converted $count raw sequences in fasta format to tab-delimited text\n\n";
130 close (IN);
131 close (OUT);
132
133 # read sequence name, sequence, assign to MID group, remove MID and primer sequence.
134
135 my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep);
136 my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0;
137 open (INFILE, "<Sequence_data.txt") || die ("Can't open infile");
138 open OUT, ">Read_status.txt";
139 open OUT1, ">Read_status_summary.txt";
140 open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt";
141 open OUTFILE2, ">Filtered_unique_sequences.txt";
142 open READS, ">Reads_by_sequence.txt";
143 while (<INFILE>){
144 my (%fmid, %rmid, $f, $r);
145 my $fmid_count = 0;
146 my $rmid_count = 0;
147 chomp $_;
148 my @nameparts = split(/\t/, $_);
149 $seqname = $nameparts[0];
150 $sequence = $nameparts[2];
151 if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";}
152 foreach my $formidname (sort keys %FORMID)
153 {
154 my $formidsequence = $FORMID{$formidname};
155 if ( $sequence =~ /$formidsequence/ ) {
156 $fmid_count++;
157 $fmid{$formidname}=1;
158 }
159 else {
160 $fmid{$formidname}=0;
161 }
162 }
163 if ($fmid_count>1){
164 $multiple_fmids++;
165 print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n";
166 }
167 elsif ($fmid_count==0){
168 $no_mids++;
169 print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n";
170 }
171 elsif ($fmid_count==1){
172 for my $formid (keys %fmid){
173 if ($fmid{$formid}==1){
174 $f=$formid;
175 }
176 }
177 my $partition = substr( $seqname, 0, 9 );
178 my $comb = $partition . $f;
179 my $formidsequence = $FORMID{$f};
180 $accepted++;
181 print OUT "$seqname\tACCEPTED\t$comb\n";
182 $sequence =~ s/(.*)$formidsequence//;
183 if (length($sequence)<$n){
184 $tooshort++;
185 next;
186 }
187 else {
188 my $fragment=substr($sequence, 0, $n);
189 $sequence=$fragment;
190 }
191 #want count of how many times seen this sequence with these mids
192 if (defined($hash{$sequence}{$comb})){
193 $hash{$sequence}{$comb} ++;
194 }
195 else {
196 $hash{$sequence}{$comb} = 1;
197 }
198 #get total read count for sequence
199 if (defined($totalreads{$sequence})){
200 $totalreads{$sequence} ++;
201 }
202 else {
203 $totalreads{$sequence} = 1;
204 }
205 if (defined($rep{$sequence})){
206 my @rep_array = @{$rep{$sequence}};
207 push @rep_array, $seqname;
208 $rep{$sequence} = \@rep_array;
209 }
210 else {
211 my @new_rep;
212 push @new_rep, $seqname;
213 $rep{$sequence} = \@new_rep;
214 }
215
216 }
217
218 }#closes while loop
219 close (OUT);
220 my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids);
221 print OUT1 "Reads accepted:\t$accepted\n\n";
222 print OUT1 "Reads rejected:\t$rej\n\n";
223 print OUT1 "\tneither MID:\t$no_mids\n\n";
224 print OUT1 "\tno forward MID:\t$no_fmid\n\n";
225 print OUT1 "\tno reverse MID:\t$no_rmid\n\n";
226 print OUT1 "\tunused mid combination:\t$unused_mids\n\n";
227 print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n";
228 print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n";
229 close (OUT1);
230 #enters zeroes for all unobserved MID combinations in submitted txt file
231
232 foreach my $gasket (@garray) {
233 foreach my $formidname (sort keys %FORMID) {
234 my $comb = $gasket . $formidname;
235 foreach $sequence (sort keys %hash) {
236 if (defined( $hash{$sequence}{$comb} )) {
237 next;
238 }
239
240 else {
241 $hash{$sequence}{$comb} = 0;
242 }
243 }
244
245 }
246 }
247
248 #print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads
249
250 my $seqcount;
251 foreach my $sequence (sort keys %hash) {
252 my $independenttotal = $totalreads{$sequence};
253 my $l = length($sequence);
254 $seqcount++;
255 my $mids = $hash{$sequence};
256 local $" = "\t";
257 my @sorted_keys = sort keys %$mids;
258 my @values = @$mids{@sorted_keys};
259
260 my @rep_reads = sort(@{$rep{$sequence}});
261 my $check_total = scalar(@rep_reads);
262 if ($check_total != $independenttotal){
263 print "Oops: read totals don't match for $sequence\n\n";
264 }
265 my $rep_read = $rep_reads[0];
266
267 if ( $seqcount == 1 ) {
268 print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n";
269 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
270 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
271 }
272 else {
273 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
274 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
275 }
276 print READS "$sequence\t$check_total\t@rep_reads\n";
277 }
278
279 close (INFILE);
280 close (OUTFILE1);
281
282 open (IN, "<Reads_with_forward_and_reverse_MIDs.txt")||die("can't open Reads by MID file");
283 open OUT, ">Rejected_reads.txt";
284 while (<IN>){
285 chomp;
286 my @t = split(/\t/,$_);
287 if ($_ =~ /^name/){
288 print OUT "$_\n";
289 print OUTFILE2 "$_\n";
290 }
291 elsif ($_ =~ /UNUSED_MIDS/){
292 print OUT "$_\n";
293 }
294 elsif ($t[3]=~/N/){
295 print OUT "$_\tAMBIGUOUS_BASE_CALLS\n";
296 }
297 elsif (length($t[3])<=40){
298 print OUT "$_\tTOO_SHORT\n";
299 }
300 elsif ($t[4] == 1){
301 print OUT "$_\tSINGLETON\n";
302 }
303 else {
304 print OUTFILE2 "$_\n";
305 }
306 }
307 close (IN);
308 close (OUT);
309 close (OUTFILE2);
310 my (@F, $s);
311
312 #tab back to fasta
313
314 open IN, "<Reads_with_forward_and_reverse_MIDs.txt";
315 open OUT, ">Reads_with_forward_and_reverse_MIDs.fasta";
316 while (<IN>) {
317 unless ($_ =~ /^name/){
318 chomp;
319 my @A = split(/\t/,$_);
320 my $name = $A[0];
321 my $sequence = $A[1];
322 print OUT ">$name\n";
323 print OUT "$sequence\n";
324 }
325 }
326 print "Converted $. sequences from MID-filtered reads to FASTA format\n\n";
327 close (IN);
328 close (OUT);
329
330 open IN, "<Filtered_unique_sequences.txt";
331 open OUT, ">Filtered_unique_sequences.fasta";
332 while (<IN>) {
333 unless ($_ =~ /^name/){
334 chomp;
335 my @A = split(/\t/,$_);
336 my $name = $A[0];
337 my $sequence = $A[3];
338 print OUT ">$name\n";
339 print OUT "$sequence\n";
340 }
341 }
342 print "Converted $. sequences from valid reads to FASTA format\n\n";
343 close (IN);
344 close (OUT);
345 print "Reads rejected as $n: $tooshort\n";
346 print time - $^T . "\n";