Mercurial > repos > caitlin-potter > cp_test
comparison 1_Filter_by_truncation.pl @ 0:c9680c78151b draft default tip
Uploaded
author | caitlin-potter |
---|---|
date | Mon, 23 May 2016 11:05:49 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c9680c78151b |
---|---|
1 #!/usr/bin/perl | |
2 #Name : 1_Filter_by_truncation.pl | |
3 #Author : Morgan, Matthew | |
4 #Created : 02/2013 | |
5 #Modified : 08/2013 | |
6 #Purpose : Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences | |
7 #Syntax : perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length] | |
8 #Further info : Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review) | |
9 #Acknowledgment : Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/ | |
10 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. | |
11 | |
12 ######################################################################################################################################################### | |
13 # # | |
14 #CSIRO Open Source Software License Agreement (GPLv3) # | |
15 # # | |
16 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. # | |
17 # # | |
18 #All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3 # | |
19 # as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material. # | |
20 #The following additional terms apply under clause 7 of that license: # | |
21 # # | |
22 #EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS # | |
23 #CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, # | |
24 #WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, # | |
25 #NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. # | |
26 # # | |
27 #TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT # | |
28 #LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED. # | |
29 #WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, # | |
30 #DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; # | |
31 #OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE # | |
32 #OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF # | |
33 #SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. # | |
34 # # | |
35 #APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS # | |
36 #OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR # | |
37 #MODIFIED "CONSUMER GUARANTEES". IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL # | |
38 #EXTENT PERMITTED BY THE APPLICABLE LEGISLATION. WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF # | |
39 #THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM: # | |
40 #1. THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; # | |
41 #2. THE REPAIR OF THE SOFTWARE; # | |
42 #3. THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, # | |
43 # OR HAVING THE SOFTWARE REPAIRED. # | |
44 # # | |
45 ######################################################################################################################################################### | |
46 | |
47 use warnings; | |
48 use strict; | |
49 | |
50 my $rawdata = $ARGV[0]; | |
51 my (%FORMID, %REVMID); | |
52 | |
53 #Store FMIDs and RMIDs | |
54 | |
55 if ($ARGV[2]){my $midfile = $ARGV[2]; | |
56 open (INFILE, "<$midfile")||die("Can't open MID file"); | |
57 while (<INFILE>){ | |
58 s/\r?\n//; | |
59 chomp $_; | |
60 my @parts = split(/\t/, $_); | |
61 my $midname = $parts[0]; | |
62 my $midsequence = $parts[1]; | |
63 $FORMID{$midname} = $midsequence; | |
64 print "$midname\t$midsequence\n"; | |
65 } | |
66 } | |
67 else { | |
68 $FORMID{'NOFMID'} = (); | |
69 } | |
70 close (INFILE); | |
71 | |
72 if ($ARGV[3]) {my $rmidfile = $ARGV[3]; | |
73 open (RMIDS, "<$rmidfile")||die("Can't open RMID file"); | |
74 while (<RMIDS>){ | |
75 s/\r?\n//; | |
76 chomp; | |
77 my @parts = split(/\t/, $_); | |
78 my $midname = $parts[0]; | |
79 my $midsequence = $parts[1]; | |
80 $REVMID{$midname} = $midsequence; | |
81 print "$midname\t$midsequence\n"; | |
82 } | |
83 } | |
84 else { | |
85 $REVMID{'NORMID'} = (); | |
86 } | |
87 my $n; | |
88 #read truncation length | |
89 if ($ARGV[4]){ | |
90 $n = $ARGV[4]; | |
91 } | |
92 close (IN); | |
93 | |
94 #add gasket file | |
95 my @garray; | |
96 if ($ARGV[1]){my $gasketin = $ARGV[1]; | |
97 open (G, "<$gasketin")||die("Can't open gasket file"); | |
98 while (<G>){ | |
99 chomp; | |
100 push @garray, $_; | |
101 } | |
102 } | |
103 else { | |
104 push @garray, "NOGASKET"; | |
105 } | |
106 | |
107 #open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence. | |
108 | |
109 #convert fasta to tabular format | |
110 my $count = 0; | |
111 open (IN, "<$rawdata") || die ("Can't open sequence data"); | |
112 open OUT, ">Sequence_data.txt"; | |
113 while (<IN>) { | |
114 s/\r?\n//; | |
115 s/\t/ /g; | |
116 if (s/^>//) { | |
117 if ($. != 1) { | |
118 print OUT "\n" | |
119 } | |
120 $count++; | |
121 s/ |$/\t/; | |
122 $_ .= "\t"; | |
123 } | |
124 else { | |
125 s/ //g; | |
126 } | |
127 print OUT $_; | |
128 } | |
129 print "Converted $count raw sequences in fasta format to tab-delimited text\n\n"; | |
130 close (IN); | |
131 close (OUT); | |
132 | |
133 # read sequence name, sequence, assign to MID group, remove MID and primer sequence. | |
134 | |
135 my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep); | |
136 my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0; | |
137 open (INFILE, "<Sequence_data.txt") || die ("Can't open infile"); | |
138 open OUT, ">Read_status.txt"; | |
139 open OUT1, ">Read_status_summary.txt"; | |
140 open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt"; | |
141 open OUTFILE2, ">Filtered_unique_sequences.txt"; | |
142 open READS, ">Reads_by_sequence.txt"; | |
143 while (<INFILE>){ | |
144 my (%fmid, %rmid, $f, $r); | |
145 my $fmid_count = 0; | |
146 my $rmid_count = 0; | |
147 chomp $_; | |
148 my @nameparts = split(/\t/, $_); | |
149 $seqname = $nameparts[0]; | |
150 $sequence = $nameparts[2]; | |
151 if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";} | |
152 foreach my $formidname (sort keys %FORMID) | |
153 { | |
154 my $formidsequence = $FORMID{$formidname}; | |
155 if ( $sequence =~ /$formidsequence/ ) { | |
156 $fmid_count++; | |
157 $fmid{$formidname}=1; | |
158 } | |
159 else { | |
160 $fmid{$formidname}=0; | |
161 } | |
162 } | |
163 if ($fmid_count>1){ | |
164 $multiple_fmids++; | |
165 print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n"; | |
166 } | |
167 elsif ($fmid_count==0){ | |
168 $no_mids++; | |
169 print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n"; | |
170 } | |
171 elsif ($fmid_count==1){ | |
172 for my $formid (keys %fmid){ | |
173 if ($fmid{$formid}==1){ | |
174 $f=$formid; | |
175 } | |
176 } | |
177 my $partition = substr( $seqname, 0, 9 ); | |
178 my $comb = $partition . $f; | |
179 my $formidsequence = $FORMID{$f}; | |
180 $accepted++; | |
181 print OUT "$seqname\tACCEPTED\t$comb\n"; | |
182 $sequence =~ s/(.*)$formidsequence//; | |
183 if (length($sequence)<$n){ | |
184 $tooshort++; | |
185 next; | |
186 } | |
187 else { | |
188 my $fragment=substr($sequence, 0, $n); | |
189 $sequence=$fragment; | |
190 } | |
191 #want count of how many times seen this sequence with these mids | |
192 if (defined($hash{$sequence}{$comb})){ | |
193 $hash{$sequence}{$comb} ++; | |
194 } | |
195 else { | |
196 $hash{$sequence}{$comb} = 1; | |
197 } | |
198 #get total read count for sequence | |
199 if (defined($totalreads{$sequence})){ | |
200 $totalreads{$sequence} ++; | |
201 } | |
202 else { | |
203 $totalreads{$sequence} = 1; | |
204 } | |
205 if (defined($rep{$sequence})){ | |
206 my @rep_array = @{$rep{$sequence}}; | |
207 push @rep_array, $seqname; | |
208 $rep{$sequence} = \@rep_array; | |
209 } | |
210 else { | |
211 my @new_rep; | |
212 push @new_rep, $seqname; | |
213 $rep{$sequence} = \@new_rep; | |
214 } | |
215 | |
216 } | |
217 | |
218 }#closes while loop | |
219 close (OUT); | |
220 my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids); | |
221 print OUT1 "Reads accepted:\t$accepted\n\n"; | |
222 print OUT1 "Reads rejected:\t$rej\n\n"; | |
223 print OUT1 "\tneither MID:\t$no_mids\n\n"; | |
224 print OUT1 "\tno forward MID:\t$no_fmid\n\n"; | |
225 print OUT1 "\tno reverse MID:\t$no_rmid\n\n"; | |
226 print OUT1 "\tunused mid combination:\t$unused_mids\n\n"; | |
227 print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n"; | |
228 print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n"; | |
229 close (OUT1); | |
230 #enters zeroes for all unobserved MID combinations in submitted txt file | |
231 | |
232 foreach my $gasket (@garray) { | |
233 foreach my $formidname (sort keys %FORMID) { | |
234 my $comb = $gasket . $formidname; | |
235 foreach $sequence (sort keys %hash) { | |
236 if (defined( $hash{$sequence}{$comb} )) { | |
237 next; | |
238 } | |
239 | |
240 else { | |
241 $hash{$sequence}{$comb} = 0; | |
242 } | |
243 } | |
244 | |
245 } | |
246 } | |
247 | |
248 #print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads | |
249 | |
250 my $seqcount; | |
251 foreach my $sequence (sort keys %hash) { | |
252 my $independenttotal = $totalreads{$sequence}; | |
253 my $l = length($sequence); | |
254 $seqcount++; | |
255 my $mids = $hash{$sequence}; | |
256 local $" = "\t"; | |
257 my @sorted_keys = sort keys %$mids; | |
258 my @values = @$mids{@sorted_keys}; | |
259 | |
260 my @rep_reads = sort(@{$rep{$sequence}}); | |
261 my $check_total = scalar(@rep_reads); | |
262 if ($check_total != $independenttotal){ | |
263 print "Oops: read totals don't match for $sequence\n\n"; | |
264 } | |
265 my $rep_read = $rep_reads[0]; | |
266 | |
267 if ( $seqcount == 1 ) { | |
268 print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n"; | |
269 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount; | |
270 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n"; | |
271 } | |
272 else { | |
273 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount; | |
274 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n"; | |
275 } | |
276 print READS "$sequence\t$check_total\t@rep_reads\n"; | |
277 } | |
278 | |
279 close (INFILE); | |
280 close (OUTFILE1); | |
281 | |
282 open (IN, "<Reads_with_forward_and_reverse_MIDs.txt")||die("can't open Reads by MID file"); | |
283 open OUT, ">Rejected_reads.txt"; | |
284 while (<IN>){ | |
285 chomp; | |
286 my @t = split(/\t/,$_); | |
287 if ($_ =~ /^name/){ | |
288 print OUT "$_\n"; | |
289 print OUTFILE2 "$_\n"; | |
290 } | |
291 elsif ($_ =~ /UNUSED_MIDS/){ | |
292 print OUT "$_\n"; | |
293 } | |
294 elsif ($t[3]=~/N/){ | |
295 print OUT "$_\tAMBIGUOUS_BASE_CALLS\n"; | |
296 } | |
297 elsif (length($t[3])<=40){ | |
298 print OUT "$_\tTOO_SHORT\n"; | |
299 } | |
300 elsif ($t[4] == 1){ | |
301 print OUT "$_\tSINGLETON\n"; | |
302 } | |
303 else { | |
304 print OUTFILE2 "$_\n"; | |
305 } | |
306 } | |
307 close (IN); | |
308 close (OUT); | |
309 close (OUTFILE2); | |
310 my (@F, $s); | |
311 | |
312 #tab back to fasta | |
313 | |
314 open IN, "<Reads_with_forward_and_reverse_MIDs.txt"; | |
315 open OUT, ">Reads_with_forward_and_reverse_MIDs.fasta"; | |
316 while (<IN>) { | |
317 unless ($_ =~ /^name/){ | |
318 chomp; | |
319 my @A = split(/\t/,$_); | |
320 my $name = $A[0]; | |
321 my $sequence = $A[1]; | |
322 print OUT ">$name\n"; | |
323 print OUT "$sequence\n"; | |
324 } | |
325 } | |
326 print "Converted $. sequences from MID-filtered reads to FASTA format\n\n"; | |
327 close (IN); | |
328 close (OUT); | |
329 | |
330 open IN, "<Filtered_unique_sequences.txt"; | |
331 open OUT, ">Filtered_unique_sequences.fasta"; | |
332 while (<IN>) { | |
333 unless ($_ =~ /^name/){ | |
334 chomp; | |
335 my @A = split(/\t/,$_); | |
336 my $name = $A[0]; | |
337 my $sequence = $A[3]; | |
338 print OUT ">$name\n"; | |
339 print OUT "$sequence\n"; | |
340 } | |
341 } | |
342 print "Converted $. sequences from valid reads to FASTA format\n\n"; | |
343 close (IN); | |
344 close (OUT); | |
345 print "Reads rejected as $n: $tooshort\n"; | |
346 print time - $^T . "\n"; |