Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison phyloconversion/remove_phytab_dupes.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
#!/usr/bin/perl
# remove_phytab_dupes.pl -- split a tab-delimited table into unique and
# duplicate rows, where "duplicate" means a repeated (species, partition) pair.
#
# Usage:
#   remove_phytab_dupes.pl INFILE KEEPLONGEST IGNOREGAPS UNIQUES_OUT DUPES_OUT
#
#   INFILE       tab-delimited input; the first four columns are read as
#                species, partition, id, sequence (further columns are ignored)
#   KEEPLONGEST  when numerically equal to 1, a later row replaces the kept row
#                for the same species/partition if its sequence is strictly
#                longer; otherwise the first row seen is always kept
#   IGNOREGAPS   when numerically equal to 1, '-' characters are not counted
#                when comparing sequence lengths
#   UNIQUES_OUT  receives one row per species/partition pair
#   DUPES_OUT    receives every row that was not kept, in the order rejected
#
# File names are opened with three-argument open, so they are always treated
# as plain paths (no "-" for STDIN, no pipe or mode characters).
use strict;
use warnings;

# Return 1 when a command-line flag is numerically equal to 1, else 0.
# Undefined and non-numeric values count as "off" without emitting warnings.
sub flag_enabled {
    my ($value) = @_;
    return 0 unless defined $value;
    no warnings 'numeric';
    return $value == 1 ? 1 : 0;
}

# Length of a sequence for comparison purposes; '-' characters are excluded
# when $ignore_gaps is true.
sub comparable_length {
    my ($sequence, $ignore_gaps) = @_;
    my $length = length $sequence;
    # tr/// with an empty replacement list only counts, it does not modify.
    $length -= ($sequence =~ tr/-//) if $ignore_gaps;
    return $length;
}

# Format one output row (newline-terminated) from its four fields.
sub format_row {
    my ($species, $partition, $id, $sequence) = @_;
    return join("\t", $species, $partition, $id, $sequence) . "\n";
}

# Read rows from the filehandle $in and separate unique rows from duplicates.
#
# Parameters:
#   $in           readable filehandle yielding tab-delimited rows
#   $keep_longest true to let a strictly longer later sequence replace the
#                 kept one; false to always keep the first row seen
#   $ignore_gaps  true to exclude '-' characters from the length comparison
#
# Returns a two-element list of array references:
#   (\@unique_rows, \@duplicate_rows)
# Unique rows are sorted by species, then partition, so the output is
# reproducible; duplicate rows are in the order they were rejected.
sub dedupe_phytab {
    my ($in, $keep_longest, $ignore_gaps) = @_;

    my %kept_for;    # $kept_for{$species}{$partition} = { id => ..., sequence => ... }
    my @duplicates;

    ROW:
    while (my $row = <$in>) {
        chomp $row;

        # A line with no content would otherwise become a record whose
        # species and partition are both empty.
        next ROW if $row =~ /\A\s*\z/;

        my @fields = split /\t/, $row;
        my ($species, $partition, $id, $sequence)
            = map { defined $fields[$_] ? $fields[$_] : '' } 0 .. 3;

        my $saved = $kept_for{$species}{$partition};
        if (!defined $saved) {
            $kept_for{$species}{$partition} = { id => $id, sequence => $sequence };
            next ROW;
        }

        # Strict comparison: on a tie the row seen first stays.
        my $current_is_longer
            = comparable_length($sequence, $ignore_gaps)
            > comparable_length($saved->{sequence}, $ignore_gaps);

        if ($keep_longest && $current_is_longer) {
            # The previously kept row is demoted to the duplicates list.
            push @duplicates,
                format_row($species, $partition, $saved->{id}, $saved->{sequence});
            $kept_for{$species}{$partition} = { id => $id, sequence => $sequence };
        }
        else {
            push @duplicates, format_row($species, $partition, $id, $sequence);
        }
    }

    my @uniques;
    for my $species (sort keys %kept_for) {
        for my $partition (sort keys %{ $kept_for{$species} }) {
            my $entry = $kept_for{$species}{$partition};
            push @uniques,
                format_row($species, $partition, $entry->{id}, $entry->{sequence});
        }
    }

    return (\@uniques, \@duplicates);
}

# Command-line entry point: parse arguments, run the de-duplication and write
# both output files.  Dies with a message on any I/O failure; returns 0 on
# success.
sub main {
    my @args = @_;
    die "Usage: $0 INFILE KEEPLONGEST IGNOREGAPS UNIQUES_OUT DUPES_OUT\n"
        if @args < 5;
    my ($infile, $keeplongest, $ignoregaps, $uniout, $dupout) = @args;

    open my $in, '<', $infile or die "Cannot open $infile: $!\n";
    my ($uniques, $duplicates)
        = dedupe_phytab($in, flag_enabled($keeplongest), flag_enabled($ignoregaps));
    close $in or die "Cannot close $infile: $!\n";

    open my $uni_fh, '>', $uniout or die "Cannot open $uniout: $!\n";
    open my $dup_fh, '>', $dupout or die "Cannot open $dupout: $!\n";

    print {$dup_fh} @{$duplicates} or die "Cannot write to $dupout: $!\n";
    print {$uni_fh} @{$uniques}    or die "Cannot write to $uniout: $!\n";

    # Buffered write errors only surface at close, so check it.
    close $dup_fh or die "Cannot close $dupout: $!\n";
    close $uni_fh or die "Cannot close $uniout: $!\n";

    return 0;
}

# Run only when executed as a script, so the subs can be loaded for testing.
exit(main(@ARGV)) unless caller;

1;