# HG changeset patch # User caitlin-potter # Date 1464015949 14400 # Node ID c9680c78151b7ec84064e42e01aa711f3f4f6e4c Uploaded diff -r 000000000000 -r c9680c78151b 1_Filter_by_truncation.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/1_Filter_by_truncation.pl Mon May 23 11:05:49 2016 -0400 @@ -0,0 +1,346 @@ +#!/usr/bin/perl +#Name : 1_Filter_by_truncation.pl +#Author : Morgan, Matthew +#Created : 02/2013 +#Modified : 08/2013 +#Purpose : Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences +#Syntax : perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length] +#Further info : Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review) +#Acknowledgment : Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/ +#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. + +######################################################################################################################################################### +# # +#CSIRO Open Source Software License Agreement (GPLv3) # +# # +#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. # +# # +#All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3 # +# as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material. # +#The following additional terms apply under clause 7 of that license: # +# # +#EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS # +#CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, # +#WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, # +#NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. # +# # +#TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT # +#LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED. # +#WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, # +#DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; # +#OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE # +#OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF # +#SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. # +# # +#APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS # +#OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR # +#MODIFIED "CONSUMER GUARANTEES". IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL # +#EXTENT PERMITTED BY THE APPLICABLE LEGISLATION. WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF # +#THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM: # +#1. THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; # +#2. THE REPAIR OF THE SOFTWARE; # +#3. THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, # +# OR HAVING THE SOFTWARE REPAIRED. # +# # +######################################################################################################################################################### + +use warnings; +use strict; + +my $rawdata = $ARGV[0]; +my (%FORMID, %REVMID); + +#Store FMIDs and RMIDs + +if ($ARGV[2]){my $midfile = $ARGV[2]; + open (INFILE, "<$midfile")||die("Can't open MID file"); + while (){ + s/\r?\n//; + chomp $_; + my @parts = split(/\t/, $_); + my $midname = $parts[0]; + my $midsequence = $parts[1]; + $FORMID{$midname} = $midsequence; + print "$midname\t$midsequence\n"; + } +} +else { + $FORMID{'NOFMID'} = (); +} +close (INFILE); + +if ($ARGV[3]) {my $rmidfile = $ARGV[3]; + open (RMIDS, "<$rmidfile")||die("Can't open RMID file"); + while (){ + s/\r?\n//; + chomp; + my @parts = split(/\t/, $_); + my $midname = $parts[0]; + my $midsequence = $parts[1]; + $REVMID{$midname} = $midsequence; + print "$midname\t$midsequence\n"; + } +} +else { + $REVMID{'NORMID'} = (); +} +my $n; +#read truncation length +if ($ARGV[4]){ + $n = $ARGV[4]; +} +close (IN); + +#add gasket file +my @garray; +if ($ARGV[1]){my $gasketin = $ARGV[1]; + open (G, "<$gasketin")||die("Can't open gasket file"); + while (){ + chomp; + push @garray, $_; + } +} +else { + push @garray, "NOGASKET"; +} + +#open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence. + +#convert fasta to tabular format +my $count = 0; +open (IN, "<$rawdata") || die ("Can't open sequence data"); +open OUT, ">Sequence_data.txt"; +while () { + s/\r?\n//; + s/\t/ /g; + if (s/^>//) { + if ($. != 1) { + print OUT "\n" + } + $count++; + s/ |$/\t/; + $_ .= "\t"; + } + else { + s/ //g; + } + print OUT $_; +} +print "Converted $count raw sequences in fasta format to tab-delimited text\n\n"; +close (IN); +close (OUT); + +# read sequence name, sequence, assign to MID group, remove MID and primer sequence. + +my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep); +my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0; + open (INFILE, "Read_status.txt"; + open OUT1, ">Read_status_summary.txt"; + open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt"; + open OUTFILE2, ">Filtered_unique_sequences.txt"; + open READS, ">Reads_by_sequence.txt"; + while (){ + my (%fmid, %rmid, $f, $r); + my $fmid_count = 0; + my $rmid_count = 0; + chomp $_; + my @nameparts = split(/\t/, $_); + $seqname = $nameparts[0]; + $sequence = $nameparts[2]; + if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";} + foreach my $formidname (sort keys %FORMID) + { + my $formidsequence = $FORMID{$formidname}; + if ( $sequence =~ /$formidsequence/ ) { + $fmid_count++; + $fmid{$formidname}=1; + } + else { + $fmid{$formidname}=0; + } + } + if ($fmid_count>1){ + $multiple_fmids++; + print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n"; + } + elsif ($fmid_count==0){ + $no_mids++; + print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n"; + } + elsif ($fmid_count==1){ + for my $formid (keys %fmid){ + if ($fmid{$formid}==1){ + $f=$formid; + } + } + my $partition = substr( $seqname, 0, 9 ); + my $comb = $partition . $f; + my $formidsequence = $FORMID{$f}; + $accepted++; + print OUT "$seqname\tACCEPTED\t$comb\n"; + $sequence =~ s/(.*)$formidsequence//; + if (length($sequence)<$n){ + $tooshort++; + next; + } + else { + my $fragment=substr($sequence, 0, $n); + $sequence=$fragment; + } + #want count of how many times seen this sequence with these mids + if (defined($hash{$sequence}{$comb})){ + $hash{$sequence}{$comb} ++; + } + else { + $hash{$sequence}{$comb} = 1; + } + #get total read count for sequence + if (defined($totalreads{$sequence})){ + $totalreads{$sequence} ++; + } + else { + $totalreads{$sequence} = 1; + } + if (defined($rep{$sequence})){ + my @rep_array = @{$rep{$sequence}}; + push @rep_array, $seqname; + $rep{$sequence} = \@rep_array; + } + else { + my @new_rep; + push @new_rep, $seqname; + $rep{$sequence} = \@new_rep; + } + + } + +}#closes while loop +close (OUT); +my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids); +print OUT1 "Reads accepted:\t$accepted\n\n"; +print OUT1 "Reads rejected:\t$rej\n\n"; +print OUT1 "\tneither MID:\t$no_mids\n\n"; +print OUT1 "\tno forward MID:\t$no_fmid\n\n"; +print OUT1 "\tno reverse MID:\t$no_rmid\n\n"; +print OUT1 "\tunused mid combination:\t$unused_mids\n\n"; +print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n"; +print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n"; +close (OUT1); +#enters zeroes for all unobserved MID combinations in submitted txt file + +foreach my $gasket (@garray) { + foreach my $formidname (sort keys %FORMID) { + my $comb = $gasket . $formidname; + foreach $sequence (sort keys %hash) { + if (defined( $hash{$sequence}{$comb} )) { + next; + } + + else { + $hash{$sequence}{$comb} = 0; + } + } + + } +} + +#print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads + +my $seqcount; +foreach my $sequence (sort keys %hash) { + my $independenttotal = $totalreads{$sequence}; + my $l = length($sequence); + $seqcount++; + my $mids = $hash{$sequence}; + local $" = "\t"; + my @sorted_keys = sort keys %$mids; + my @values = @$mids{@sorted_keys}; + + my @rep_reads = sort(@{$rep{$sequence}}); + my $check_total = scalar(@rep_reads); + if ($check_total != $independenttotal){ + print "Oops: read totals don't match for $sequence\n\n"; + } + my $rep_read = $rep_reads[0]; + + if ( $seqcount == 1 ) { + print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n"; + printf OUTFILE1 "$rawdata\_%07d\t", $seqcount; + print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n"; + } + else { + printf OUTFILE1 "$rawdata\_%07d\t", $seqcount; + print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n"; + } + print READS "$sequence\t$check_total\t@rep_reads\n"; +} + +close (INFILE); +close (OUTFILE1); + +open (IN, "Rejected_reads.txt"; +while (){ + chomp; + my @t = split(/\t/,$_); + if ($_ =~ /^name/){ + print OUT "$_\n"; + print OUTFILE2 "$_\n"; + } + elsif ($_ =~ /UNUSED_MIDS/){ + print OUT "$_\n"; + } + elsif ($t[3]=~/N/){ + print OUT "$_\tAMBIGUOUS_BASE_CALLS\n"; + } + elsif (length($t[3])<=40){ + print OUT "$_\tTOO_SHORT\n"; + } + elsif ($t[4] == 1){ + print OUT "$_\tSINGLETON\n"; + } + else { + print OUTFILE2 "$_\n"; + } +} +close (IN); +close (OUT); +close (OUTFILE2); +my (@F, $s); + +#tab back to fasta + +open IN, "Reads_with_forward_and_reverse_MIDs.fasta"; +while () { + unless ($_ =~ /^name/){ + chomp; + my @A = split(/\t/,$_); + my $name = $A[0]; + my $sequence = $A[1]; + print OUT ">$name\n"; + print OUT "$sequence\n"; + } +} +print "Converted $. sequences from MID-filtered reads to FASTA format\n\n"; +close (IN); +close (OUT); + +open IN, "Filtered_unique_sequences.fasta"; +while () { + unless ($_ =~ /^name/){ + chomp; + my @A = split(/\t/,$_); + my $name = $A[0]; + my $sequence = $A[3]; + print OUT ">$name\n"; + print OUT "$sequence\n"; + } +} +print "Converted $. sequences from valid reads to FASTA format\n\n"; +close (IN); +close (OUT); +print "Reads rejected as $n: $tooshort\n"; +print time - $^T . "\n";