comparison phyloconversion/remove_phytab_dupes.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
#!/usr/bin/perl
#
# remove_phytab_dupes.pl - split a tab-delimited table into unique and
# duplicate rows, keyed on the first two columns.
#
# Usage:
#   remove_phytab_dupes.pl INFILE KEEPLONGEST IGNOREGAPS UNIQUES_OUT DUPES_OUT
#
#   INFILE       tab-delimited input; the first four columns of each row are
#                read as species, partition, id and sequence.
#   KEEPLONGEST  1 = when a (species, partition) pair repeats, keep the row
#                with the longest sequence; any other value = keep the row
#                seen first.
#   IGNOREGAPS   1 = do not count '-' characters when comparing sequence
#                lengths; any other value = compare raw lengths.
#   UNIQUES_OUT  receives exactly one row per (species, partition) pair.
#   DUPES_OUT    receives every row that lost out to a kept row, in the order
#                the losers were identified.
#
# Notes:
#   * Rows with fewer than four columns are padded with empty fields.
#   * Blank (whitespace-only) lines are skipped.
#   * On a length tie the row that is already kept stays.
#   * UNIQUES_OUT is written sorted by species, then partition, so repeated
#     runs on the same input give byte-identical output.
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

@ARGV >= 5
    or die "Usage: $0 infile keeplongest ignoregaps uniques_out dupes_out\n";

my $infile      = $ARGV[0];
my $keeplongest = flag_is_set($ARGV[1]);
my $ignoregaps  = flag_is_set($ARGV[2]);
my $uniout      = $ARGV[3];
my $dupout      = $ARGV[4];

# Three-argument open: the file name can never be mistaken for a mode or pipe.
open my $in_fh, '<', $infile or die "Cannot open $infile: $!\n";

my %unique_for;    # $unique_for{$species}{$partition} = [ $id, $sequence ]
my @dupe_rows;     # complete output lines for the duplicates file

ROW:
while (my $row = <$in_fh>) {
    chomp $row;
    next ROW if $row !~ /\S/;    # nothing but whitespace on this line

    my @column = split /\t/, $row;
    push @column, '' while @column < 4;    # tolerate short rows
    my ($species, $partition, $id, $sequence) = @column[0 .. 3];

    my $kept = $unique_for{$species}{$partition};
    if (!defined $kept) {
        # First time this (species, partition) pair is seen.
        $unique_for{$species}{$partition} = [ $id, $sequence ];
        next ROW;
    }

    my $saved_len   = effective_length($kept->[1], $ignoregaps);
    my $current_len = effective_length($sequence,  $ignoregaps);

    if ($keeplongest && $current_len > $saved_len) {
        # Current row is longer: it replaces the kept row, which is demoted.
        push @dupe_rows, join("\t", $species, $partition, @{$kept}) . "\n";
        $unique_for{$species}{$partition} = [ $id, $sequence ];
    }
    else {
        push @dupe_rows,
            join("\t", $species, $partition, $id, $sequence) . "\n";
    }
}
close $in_fh;

# Outputs are opened only after the input has been read completely.
open my $uni_fh, '>', $uniout or die "Cannot open $uniout: $!\n";
open my $dup_fh, '>', $dupout or die "Cannot open $dupout: $!\n";

print {$dup_fh} @dupe_rows;

for my $species (sort keys %unique_for) {
    for my $partition (sort keys %{ $unique_for{$species} }) {
        print {$uni_fh}
            join("\t", $species, $partition,
                 @{ $unique_for{$species}{$partition} }), "\n";
    }
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close $uni_fh or die "Cannot close $uniout: $!\n";
close $dup_fh or die "Cannot close $dupout: $!\n";

# Return 1 when a command-line flag is numerically equal to 1, else 0.
# Undefined or non-numeric values count as "off" without raising a warning.
sub flag_is_set {
    my ($value) = @_;
    return (looks_like_number($value) && $value == 1) ? 1 : 0;
}

# Return the length of $sequence; when $ignore_gaps is true, '-' characters
# are not counted. The sequence itself is not modified.
sub effective_length {
    my ($sequence, $ignore_gaps) = @_;
    my $length = length $sequence;
    $length -= ($sequence =~ tr/-//) if $ignore_gaps;
    return $length;
}