Mercurial > repos > caitlin-potter > cp_test
view 1_Filter_by_truncation.pl @ 0:c9680c78151b draft default tip
Uploaded
author | caitlin-potter |
---|---|
date | Mon, 23 May 2016 11:05:49 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl #Name : 1_Filter_by_truncation.pl #Author : Morgan, Matthew #Created : 02/2013 #Modified : 08/2013 #Purpose : Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences #Syntax : perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length] #Further info : Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review) #Acknowledgment : Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/ #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. ######################################################################################################################################################### # # #CSIRO Open Source Software License Agreement (GPLv3) # # # #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. # # # #All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3 # # as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material. # #The following additional terms apply under clause 7 of that license: # # # #EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS # #CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, # #WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, # #NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. # # # #TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT # #LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED. # #WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, # #DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; # #OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE # #OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF # #SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. # # # #APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS # #OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR # #MODIFIED "CONSUMER GUARANTEES". IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL # #EXTENT PERMITTED BY THE APPLICABLE LEGISLATION. WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF # #THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM: # #1. THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; # #2. THE REPAIR OF THE SOFTWARE; # #3. THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, # # OR HAVING THE SOFTWARE REPAIRED. # # # ######################################################################################################################################################### use warnings; use strict; my $rawdata = $ARGV[0]; my (%FORMID, %REVMID); #Store FMIDs and RMIDs if ($ARGV[2]){my $midfile = $ARGV[2]; open (INFILE, "<$midfile")||die("Can't open MID file"); while (<INFILE>){ s/\r?\n//; chomp $_; my @parts = split(/\t/, $_); my $midname = $parts[0]; my $midsequence = $parts[1]; $FORMID{$midname} = $midsequence; print "$midname\t$midsequence\n"; } } else { $FORMID{'NOFMID'} = (); } close (INFILE); if ($ARGV[3]) {my $rmidfile = $ARGV[3]; open (RMIDS, "<$rmidfile")||die("Can't open RMID file"); while (<RMIDS>){ s/\r?\n//; chomp; my @parts = split(/\t/, $_); my $midname = $parts[0]; my $midsequence = $parts[1]; $REVMID{$midname} = $midsequence; print "$midname\t$midsequence\n"; } } else { $REVMID{'NORMID'} = (); } my $n; #read truncation length if ($ARGV[4]){ $n = $ARGV[4]; } close (IN); #add gasket file my @garray; if ($ARGV[1]){my $gasketin = $ARGV[1]; open (G, "<$gasketin")||die("Can't open gasket file"); while (<G>){ chomp; push @garray, $_; } } else { push @garray, "NOGASKET"; } #open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence. #convert fasta to tabular format my $count = 0; open (IN, "<$rawdata") || die ("Can't open sequence data"); open OUT, ">Sequence_data.txt"; while (<IN>) { s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) { print OUT "\n" } $count++; s/ |$/\t/; $_ .= "\t"; } else { s/ //g; } print OUT $_; } print "Converted $count raw sequences in fasta format to tab-delimited text\n\n"; close (IN); close (OUT); # read sequence name, sequence, assign to MID group, remove MID and primer sequence. my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep); my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0; open (INFILE, "<Sequence_data.txt") || die ("Can't open infile"); open OUT, ">Read_status.txt"; open OUT1, ">Read_status_summary.txt"; open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt"; open OUTFILE2, ">Filtered_unique_sequences.txt"; open READS, ">Reads_by_sequence.txt"; while (<INFILE>){ my (%fmid, %rmid, $f, $r); my $fmid_count = 0; my $rmid_count = 0; chomp $_; my @nameparts = split(/\t/, $_); $seqname = $nameparts[0]; $sequence = $nameparts[2]; if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";} foreach my $formidname (sort keys %FORMID) { my $formidsequence = $FORMID{$formidname}; if ( $sequence =~ /$formidsequence/ ) { $fmid_count++; $fmid{$formidname}=1; } else { $fmid{$formidname}=0; } } if ($fmid_count>1){ $multiple_fmids++; print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n"; } elsif ($fmid_count==0){ $no_mids++; print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n"; } elsif ($fmid_count==1){ for my $formid (keys %fmid){ if ($fmid{$formid}==1){ $f=$formid; } } my $partition = substr( $seqname, 0, 9 ); my $comb = $partition . $f; my $formidsequence = $FORMID{$f}; $accepted++; print OUT "$seqname\tACCEPTED\t$comb\n"; $sequence =~ s/(.*)$formidsequence//; if (length($sequence)<$n){ $tooshort++; next; } else { my $fragment=substr($sequence, 0, $n); $sequence=$fragment; } #want count of how many times seen this sequence with these mids if (defined($hash{$sequence}{$comb})){ $hash{$sequence}{$comb} ++; } else { $hash{$sequence}{$comb} = 1; } #get total read count for sequence if (defined($totalreads{$sequence})){ $totalreads{$sequence} ++; } else { $totalreads{$sequence} = 1; } if (defined($rep{$sequence})){ my @rep_array = @{$rep{$sequence}}; push @rep_array, $seqname; $rep{$sequence} = \@rep_array; } else { my @new_rep; push @new_rep, $seqname; $rep{$sequence} = \@new_rep; } } }#closes while loop close (OUT); my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids); print OUT1 "Reads accepted:\t$accepted\n\n"; print OUT1 "Reads rejected:\t$rej\n\n"; print OUT1 "\tneither MID:\t$no_mids\n\n"; print OUT1 "\tno forward MID:\t$no_fmid\n\n"; print OUT1 "\tno reverse MID:\t$no_rmid\n\n"; print OUT1 "\tunused mid combination:\t$unused_mids\n\n"; print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n"; print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n"; close (OUT1); #enters zeroes for all unobserved MID combinations in submitted txt file foreach my $gasket (@garray) { foreach my $formidname (sort keys %FORMID) { my $comb = $gasket . $formidname; foreach $sequence (sort keys %hash) { if (defined( $hash{$sequence}{$comb} )) { next; } else { $hash{$sequence}{$comb} = 0; } } } } #print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads my $seqcount; foreach my $sequence (sort keys %hash) { my $independenttotal = $totalreads{$sequence}; my $l = length($sequence); $seqcount++; my $mids = $hash{$sequence}; local $" = "\t"; my @sorted_keys = sort keys %$mids; my @values = @$mids{@sorted_keys}; my @rep_reads = sort(@{$rep{$sequence}}); my $check_total = scalar(@rep_reads); if ($check_total != $independenttotal){ print "Oops: read totals don't match for $sequence\n\n"; } my $rep_read = $rep_reads[0]; if ( $seqcount == 1 ) { print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n"; printf OUTFILE1 "$rawdata\_%07d\t", $seqcount; print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n"; } else { printf OUTFILE1 "$rawdata\_%07d\t", $seqcount; print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n"; } print READS "$sequence\t$check_total\t@rep_reads\n"; } close (INFILE); close (OUTFILE1); open (IN, "<Reads_with_forward_and_reverse_MIDs.txt")||die("can't open Reads by MID file"); open OUT, ">Rejected_reads.txt"; while (<IN>){ chomp; my @t = split(/\t/,$_); if ($_ =~ /^name/){ print OUT "$_\n"; print OUTFILE2 "$_\n"; } elsif ($_ =~ /UNUSED_MIDS/){ print OUT "$_\n"; } elsif ($t[3]=~/N/){ print OUT "$_\tAMBIGUOUS_BASE_CALLS\n"; } elsif (length($t[3])<=40){ print OUT "$_\tTOO_SHORT\n"; } elsif ($t[4] == 1){ print OUT "$_\tSINGLETON\n"; } else { print OUTFILE2 "$_\n"; } } close (IN); close (OUT); close (OUTFILE2); my (@F, $s); #tab back to fasta open IN, "<Reads_with_forward_and_reverse_MIDs.txt"; open OUT, ">Reads_with_forward_and_reverse_MIDs.fasta"; while (<IN>) { unless ($_ =~ /^name/){ chomp; my @A = split(/\t/,$_); my $name = $A[0]; my $sequence = $A[1]; print OUT ">$name\n"; print OUT "$sequence\n"; } } print "Converted $. sequences from MID-filtered reads to FASTA format\n\n"; close (IN); close (OUT); open IN, "<Filtered_unique_sequences.txt"; open OUT, ">Filtered_unique_sequences.fasta"; while (<IN>) { unless ($_ =~ /^name/){ chomp; my @A = split(/\t/,$_); my $name = $A[0]; my $sequence = $A[3]; print OUT ">$name\n"; print OUT "$sequence\n"; } } print "Converted $. sequences from valid reads to FASTA format\n\n"; close (IN); close (OUT); print "Reads rejected as $n: $tooshort\n"; print time - $^T . "\n";