view phyloconversion/remove_phytab_dupes.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;



my $infile=$ARGV[0];
my $keeplongest=$ARGV[1];
my $ignoregaps=$ARGV[2];
my $uniout=$ARGV[3];
my $dupout=$ARGV[4];

open IN, $infile or die "Cannot open $infile\n";

my %UniquesHash;
my @DupeArray;

while(<IN>){
	my $row = $_;
	chomp($row);
	my @column = split(/\t/, $row);
	my $species = $column[0];
	my $partition = $column[1];
	my $id = $column[2];
	my $sequence = $column[3];

	if(exists $UniquesHash{$species}{$partition}){
		my @dupeseq = split(/\t/, $UniquesHash{$species}{$partition});
		my ($savlen,$curlen);
		if($ignoregaps==1){
			my $nogapsav = $dupeseq[1];
			my $nogapcur = $sequence;
			$nogapsav =~ s/\-//g;
			$nogapcur =~ s/\-//g;
			$savlen = length($nogapsav);
			$curlen = length($nogapcur);
		}else{
			$savlen = length($dupeseq[1]);
			$curlen = length($sequence);
		}
		if($curlen > $savlen && $keeplongest==1) { 		#current is longer so keep that one
			my $oldline = $species."\t".$partition."\t".$UniquesHash{$species}{$partition}."\n";
			$UniquesHash{$species}{$partition} = "$id\t$sequence";
			push(@DupeArray, $oldline);
		}else{ 
			push(@DupeArray, "$species\t$partition\t$id\t$sequence\n");
		}
	}else{
		$UniquesHash{$species}{$partition} = "$id\t$sequence";
	}
}

open OUT, ">".$uniout or die "Cannot open $uniout\n";
open DUPES, ">".$dupout or die "Cannot open $dupout\n";

print DUPES @DupeArray;
for my $spname ( keys %UniquesHash ) {
    for my $partname ( keys %{ $UniquesHash{$spname} } ) {
         print OUT "$spname\t$partname\t$UniquesHash{$spname}{$partname}\n";
    }
}