view 1_Filter_by_truncation.pl @ 0:c9680c78151b draft default tip

Uploaded
author caitlin-potter
date Mon, 23 May 2016 11:05:49 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
#Name    	: 1_Filter_by_truncation.pl
#Author  	: Morgan, Matthew
#Created 	: 02/2013
#Modified	: 08/2013
#Purpose 	: Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences
#Syntax  	: perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length]
#Further info	: Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
#Acknowledgment	: Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.

#########################################################################################################################################################	
#																			#
#CSIRO Open Source Software License Agreement (GPLv3)													#
#																			#
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.					#
#																			#
#All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3				#
# as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material.	#
#The following additional terms apply under clause 7 of that license:											#
#																			#
#EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS		#
#CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS,	#
#WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,		#
#NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.				#
#																			#
#TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT		#
#LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.		#
#WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS,		#
#DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS;		#
#OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE		#
#OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF		#
#SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.														#
#																			#
#APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS		#
#OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR			#
#MODIFIED "CONSUMER GUARANTEES".  IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL		#
#EXTENT PERMITTED BY THE APPLICABLE LEGISLATION.  WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF		#
#THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM:							#
#1.          THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN;					#
#2.          THE REPAIR OF THE SOFTWARE; 														#
#3.          THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN,		#
#	     OR HAVING THE SOFTWARE REPAIRED.														#
#																			#
#########################################################################################################################################################

use warnings;
use strict;

my $rawdata = $ARGV[0];
my (%FORMID, %REVMID);

#Store FMIDs and RMIDs 

if ($ARGV[2]){my $midfile = $ARGV[2];
	open (INFILE, "<$midfile")||die("Can't open MID file");
	while (<INFILE>){
		s/\r?\n//;
		chomp $_;
		my @parts = split(/\t/, $_);
		my $midname = $parts[0];
		my $midsequence = $parts[1];
		$FORMID{$midname} = $midsequence;
		print "$midname\t$midsequence\n";
		}
}
else {
	$FORMID{'NOFMID'} = ();
}
close (INFILE);

if ($ARGV[3]) {my $rmidfile = $ARGV[3];
	open (RMIDS, "<$rmidfile")||die("Can't open RMID file");
	while (<RMIDS>){
		s/\r?\n//;
		chomp;
		my @parts = split(/\t/, $_);
                my $midname = $parts[0];
                my $midsequence = $parts[1];
		$REVMID{$midname} = $midsequence;
		print "$midname\t$midsequence\n";
		}
}
else {
        $REVMID{'NORMID'} = ();
}
my $n;
#read truncation length
if ($ARGV[4]){ 
	$n = $ARGV[4];
}
close (IN);

#add gasket file
my @garray;
if ($ARGV[1]){my $gasketin = $ARGV[1];
	open (G, "<$gasketin")||die("Can't open gasket file");
	while (<G>){
		chomp;
		push @garray, $_;
	}
}
else {
	push @garray, "NOGASKET";
}

#open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence.

#convert fasta to tabular format
my $count = 0;
open (IN, "<$rawdata") || die ("Can't open sequence data");
open OUT, ">Sequence_data.txt";
while (<IN>) {
	s/\r?\n//;
	s/\t/ /g; 
	if (s/^>//) { 
		if ($. != 1) { 
			print OUT "\n" 
		} 
		$count++;
		s/ |$/\t/; 
		$_ .= "\t"; 
	}
	else { 
		s/ //g; 		
	} 
	print OUT $_; 
} 
print "Converted $count raw sequences in fasta format to tab-delimited text\n\n";
close (IN);
close (OUT);

# read sequence name, sequence, assign to MID group, remove MID and primer sequence.

my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep);
my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0;
	open (INFILE, "<Sequence_data.txt") || die ("Can't open infile");
	open OUT, ">Read_status.txt";
	open OUT1, ">Read_status_summary.txt";
	open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt";
	open OUTFILE2, ">Filtered_unique_sequences.txt";
	open READS, ">Reads_by_sequence.txt";
	while (<INFILE>){
		my (%fmid, %rmid, $f, $r);
		my $fmid_count = 0;
		my $rmid_count = 0;
		chomp $_;
		my @nameparts = split(/\t/, $_);
		$seqname = $nameparts[0];
		$sequence = $nameparts[2];
		if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";}
	foreach my $formidname (sort keys %FORMID)
		{
		my $formidsequence = $FORMID{$formidname};
		if ( $sequence  =~ /$formidsequence/ ) {
			$fmid_count++;
			$fmid{$formidname}=1;
		}
		else {
			$fmid{$formidname}=0;
		}
	}
	if ($fmid_count>1){
		$multiple_fmids++;
		print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n";
	}
	elsif ($fmid_count==0){
		$no_mids++;
		print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n";
	}
	elsif ($fmid_count==1){
                for my $formid (keys %fmid){
			if ($fmid{$formid}==1){
				$f=$formid;
			}
		}
		my $partition      = substr( $seqname, 0, 9 );		
		my $comb = $partition . $f;
		my $formidsequence = $FORMID{$f};
		$accepted++;
		print OUT "$seqname\tACCEPTED\t$comb\n";
		$sequence =~ s/(.*)$formidsequence//;
                if (length($sequence)<$n){
			$tooshort++;				
			next;
		}
		else {
			my $fragment=substr($sequence, 0, $n);
			$sequence=$fragment;
		}
                #want count of how many times seen this sequence with these mids
                if (defined($hash{$sequence}{$comb})){
                       	$hash{$sequence}{$comb} ++;
                }
                else {
                        $hash{$sequence}{$comb} = 1;
		}
                #get total read count for sequence
                if (defined($totalreads{$sequence})){
	                $totalreads{$sequence} ++;
		}
                else { 
                        $totalreads{$sequence} = 1;
                }
		if (defined($rep{$sequence})){
			my @rep_array = @{$rep{$sequence}};
			push @rep_array, $seqname;
			$rep{$sequence} = \@rep_array;
		}
		else {
			my @new_rep;
			push @new_rep, $seqname;
			$rep{$sequence} = \@new_rep;
		}
		
	}

}#closes while loop
close (OUT);
my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids);
print OUT1 "Reads accepted:\t$accepted\n\n";
print OUT1 "Reads rejected:\t$rej\n\n";
print OUT1 "\tneither MID:\t$no_mids\n\n";
print OUT1 "\tno forward MID:\t$no_fmid\n\n";
print OUT1 "\tno reverse MID:\t$no_rmid\n\n";
print OUT1 "\tunused mid combination:\t$unused_mids\n\n";
print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n";
print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n";
close (OUT1);
#enters zeroes for all unobserved MID combinations in submitted txt file

foreach my $gasket (@garray) {
	foreach my $formidname (sort keys %FORMID) {
		my $comb = $gasket . $formidname;
			foreach $sequence (sort keys %hash) {
				if (defined( $hash{$sequence}{$comb} )) {
					next;
				}

				else {
					$hash{$sequence}{$comb} = 0;
				}
			}
			
	}
}

#print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads

my $seqcount;
foreach my $sequence (sort keys %hash) {
        my $independenttotal = $totalreads{$sequence};
	my $l = length($sequence);
	$seqcount++;
        my $mids = $hash{$sequence};
        local $" = "\t";
	my @sorted_keys = sort keys %$mids;
        my @values = @$mids{@sorted_keys};
	
	my @rep_reads = sort(@{$rep{$sequence}});
	my $check_total = scalar(@rep_reads);
	if ($check_total != $independenttotal){
		print "Oops: read totals don't match for $sequence\n\n";
	}
	my $rep_read = $rep_reads[0];
	
	if ( $seqcount == 1 ) {
                        print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n";
                        printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
			print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
                }
                else {
                        printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
			print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
                }
	print READS "$sequence\t$check_total\t@rep_reads\n";
}

close (INFILE);
close (OUTFILE1);

open (IN, "<Reads_with_forward_and_reverse_MIDs.txt")||die("can't open Reads by MID file");
open OUT, ">Rejected_reads.txt";
while (<IN>){
	chomp;
	my @t = split(/\t/,$_);
	if ($_ =~ /^name/){
		print OUT "$_\n";
		print OUTFILE2 "$_\n";
	}
	elsif ($_ =~ /UNUSED_MIDS/){
		print OUT "$_\n";	
	}
	elsif ($t[3]=~/N/){
		print OUT "$_\tAMBIGUOUS_BASE_CALLS\n";
	}
	elsif (length($t[3])<=40){
		print OUT "$_\tTOO_SHORT\n";
	}
	elsif ($t[4] == 1){
		print OUT "$_\tSINGLETON\n";
	}
	else {
		print OUTFILE2 "$_\n";
	}
}
close (IN);
close (OUT);
close (OUTFILE2);
my (@F, $s);

#tab back to fasta

open IN, "<Reads_with_forward_and_reverse_MIDs.txt";
open OUT, ">Reads_with_forward_and_reverse_MIDs.fasta";
while (<IN>) {
        unless ($_ =~ /^name/){
		chomp;
        	my @A = split(/\t/,$_);
        	my $name = $A[0];
        	my $sequence = $A[1];
        	print OUT ">$name\n";
        	print OUT "$sequence\n";
	}
}
print "Converted $. sequences from MID-filtered reads to FASTA format\n\n";
close (IN);
close (OUT);

open IN, "<Filtered_unique_sequences.txt";
open OUT, ">Filtered_unique_sequences.fasta";
while (<IN>) {
        unless ($_ =~ /^name/){
                chomp;
                my @A = split(/\t/,$_);
                my $name = $A[0];
                my $sequence = $A[3];
                print OUT ">$name\n";
                print OUT "$sequence\n";
        }
}
print "Converted $. sequences from valid reads to FASTA format\n\n";
close (IN);
close (OUT);
print "Reads rejected as $n: $tooshort\n";
print time - $^T . "\n";