diff phyloconversion/length_outliers.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/length_outliers.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,59 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use Bio::DB::Fasta;
+use Bio::SeqIO;
+use Bio::Seq;
+
+#inputs
+my $infile=shift(@ARGV);
+my $outfile=shift(@ARGV);
+my $deloutfile=shift(@ARGV);
+my $percent=shift(@ARGV);
+
+my $seqid;
+my $newnumbers=1; 		#for sequential renumbering of header
+
+open FILE, ">$outfile" or die $!;
+open DELFILE, ">$deloutfile" or die $!;
+
+
+# open infile fasta file to get average length
+my $in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta');
+
+my $seqcount;
+my $seqsum = 0;
+my $avelen;
+while (my $seq = $in_obj->next_seq() ) {
+	my $sequence = $seq->seq;
+	my $seqlen = length($sequence);
+	$seqcount++;
+	$seqsum = $seqsum + $seqlen;
+}
+$avelen = $seqsum/$seqcount;
+print "AVE= $avelen \n";
+
+
+# open infile fasta file to get average length
+$in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta');
+
+while (my $seq = $in_obj->next_seq() ) {
+	my $sequence = $seq->seq;
+	$seqid = $seq->id;
+	$sequence =~ s/\n//g;
+	$sequence =~ tr/a-z/A-Z/;
+	my $seqlen = length($sequence);
+
+	if($seqlen > ($avelen * ($percent/100) ) ){
+		print FILE ">";
+		print FILE $seqid." ".$seq->desc."\n".$sequence."\n";
+	}else{
+print "Writing sequence of $seqlen to DELFILE\n";
+		print DELFILE ">";
+		print DELFILE $seqid." ".$seq->desc."\n".$sequence."\n";
+	}		
+}
+
+close FILE;
+close DELFILE;