Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
view phyloconversion/seqConverterG.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w # # seqConverter.pl v1.2 # Last modified August 12, 2010 21:09 # Minor modifications for running in Galaxy by TH Oakley Jan 2012 # (c) Olaf R.P. Bininda-Emonds # # Input: # Sequence data in any of fasta, GenBank, nexus, (classic or extended) phylip, # or Se-Al formats # # Output: # Sequence data in any of fasta, nexus, (classic or extended) phylip, and/or # Se-Al formats. # # Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g] # [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t] # [-u] [-v] [-h] # [-O]<outfile> specify outfile # options: -a = print out accession numbers instead of taxon labels for nexus # and phylip output # -c<number> = global genetic code (to be used for translation unless # otherwise specified) (default = standard code; end # with * to override local settings) # -d<filename> = file containing raw sequence information; * = batch # convert all specified file types in working # directory (suffixes must be .fasta / .fas, .gb, # .nexus / .nex, .phylip / .ph, or .seal as # appropriate) # -g = do not convert ~ gap characters (e.g., from BioEdit) to - # -G = convert flanking gaps to Ns # -H = convert sequence data to haplotype data # -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g), # nexus (n), phylip (p), or Se-Al (s)); default = # autodetect # -j = produce jackknifed data sets each with a single sequence # deleted # -l<number> = interleave nexus- and/or phylip-formatted output # (default = sequential) with specified sequence length # (between 10 and 100 inclusive; default = 80); # NOTE: also sets default interleave length for fasta # output # -n = convert ambiguous nucleotides to Ns # -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus # (n), classic or extended phylip (pc or pe), # and/or Se-Al (s) formats (NOTE: flag can be # invoked multiple times) # -r<a|i> = order sequences in final output alphabetically by name # (a; default) or in input order from file (i) # -s = split data set into individual partitions according to nexus # charset statements # -t = translate sequences into amino acids # -u = interactive user-input mode # -h = print this message and quit # -v = verbose output use strict; use POSIX; #THO added ability for Galaxy -G to specify outfile name my $OutFile; # Set user-input defaults and associated parameters # Data set variables my $inputType = ""; # Options are "fasta", "GenBank", "nexus", "phylip", and "Se-Al" my ($readFile, @seqFiles); my $dataSource; my $seqType = "nucleotide"; my $globalGenCode = 0; my $globalOverride = 0; my (@accNum, %nameLabel, %sequence, %geneticCode, %accPresent); my (@charsetList, %charsetStart, %charsetEnd); my (%deletedSeq, %finalSeq); my $seqCount; my $ntax; my $nameClean = 1; my $skipDuplicates = 1; # User input variables my $seqOrder = "alphabetical"; # Options are "alphabetical" (default) and "input" my $seqSplit = 0; my $jackknife = 0; my $gapChange = 1; my $ambigChange = 0; my $translateSeqs = 0; my $haploTyping = 0; my $haploFile; my $flankGap = 0; # Translation variables and genetic codes my %transTable = ('1' => 'standard', '2' => 'vertebrate mitochondrial', '3' => 'yeast mitochondrial', '4' => 'mold, protozoan and colenterate mitochondrial and mycoplasam/spiroplasma', '5' => 'invertebrate mitochondrial', '6' => 'ciliate, dasycladacean and hexamita nuclear', '9' => 'echinoderm mitochondrial', '10' => 'euplotid nuclear', '11' => 'bacterial and plant plastid', '12' => 'alternative yeast nuclear', '13' => 'ascidian mitochondrial', '14' => 'alternative flatworm mitochondrial', '15' => 'Blepharisma nuclear', '16' => 'chlorophycean mitochondrial', '21' => 'trematode mitochondrial', '22' => 'Scenedesmus obliquus mitochondrial', '23' => 'Thraustochytrium mitochondrial'); my %gb2seal = ('1' => '0', '2' => '1', '3' => '2', '4' => '3', '4' => '4', '5' => '5', '6' => '6', '9' => '7', '10' => '8', '11' => '9', '12' => '10', '13' => '11', '14' => '12', '15' => '13', '16' => '1', '21' => '1', '22' => '1', '23' => '1'); my %seal2gb = ('0' => '1', '1' => '2', '2' => '3', '3' => '4', '4' => '4', '5' => '5', '6' => '6', '7' => '9', '8' => '10', '9' => '11', '10' => '12', '11' => '13', '12' => '14', '13' => '15'); my %genCodes; foreach my $code (qw(0 1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) { $genCodes{$code} = 1; } my %DNAtoAA; my @ambigList = qw(A C G T M R W S Y K V H D B N); my %ambigCode = ('A' => 'A', 'C' => 'C', 'G' => 'G', 'T' => 'T', 'AC' => 'M', 'AG' => 'R', 'AT' => 'W', 'CG' => 'S', 'CT' => 'Y', 'GT' => 'K', 'ACG' => 'V', 'ACT' => 'H', 'AGT' => 'D', 'CGT' => 'B', 'ACGT' => 'N'); my (%constitNTlist); while (my ($nt, $code) = each %ambigCode) # Where $nt = key and $code = value { push @{$constitNTlist{$code}}, $_ foreach (split("",$nt)); } # Output variables my $maxLength; my $fastaPrint = 0; my $fastaOut; my $nexusPrint = 0; my $nexusOut; my ($phylipTradPrint, $phylipExtPrint) = (0, 0); my $phylipOut; my $sealPrint = 0; my $sealOut; my $outFormat = "sequential"; my $interleaveLength = 80; my $fastaLength = 80; my $accPrint = 0; # Miscellaneous variables my $verbose = 0; my $debug = 0; my $perlScript = "seqConverterG.pl"; my $version = "1.2"; # Read in user input if (not @ARGV or join(' ', @ARGV) =~ /\s-u/ or $ARGV[0] =~ /^-u/) # Enter interactive user-input mode { print "Entering interactive user-input mode. Type \"q\" at any prompt to exit program.\n"; # Get print format my $defaultAcc = ($accPrint) ? "y" : "n"; undef $accPrint; until (defined $accPrint) { print "\tPrint out accession numbers only to nexus- and/or phylip output (y|n) [$defaultAcc]: "; $accPrint = <stdin>; chomp ($accPrint); exit(0) if ($accPrint eq "q"); if (substr($accPrint, 0, 1) =~ /^n/i or $accPrint eq "") { $accPrint = 0; } elsif (substr($accPrint, 0, 1) =~ /^y/i) { $accPrint = 1; } else { print "\t\tInvalid input ($accPrint)\n"; undef $accPrint; } } # Get datafile until (defined $readFile) { print "\tEnter name of data file (* = batch convert): "; $readFile = <stdin>; chomp ($readFile); exit(0) if ($readFile eq "q"); unless (-e $readFile or $readFile eq "*") { print "\t\tFile '$readFile' does not exist\n"; undef $readFile; } } push @seqFiles, $readFile; # Get format of datafile my $defaultInput = "autodetect"; undef $inputType; until (defined $inputType) { print "\tEnter format of file $readFile (fasta|GenBank|nexus|phylip|Se-Al) [$defaultInput]: "; $inputType = <stdin>; chomp ($inputType); exit(0) if ($inputType =~ /^q/i); if (substr($inputType, 0, 1) =~ /^a/i or $inputType eq "") { $inputType = "autodetect"; } elsif (substr($inputType, 0, 1) =~ /^f/i) { $inputType = "fasta"; } elsif (substr($inputType, 0, 1) =~ /^g/i) { $inputType = "GenBank"; } elsif (substr($inputType, 0, 1) =~ /^n/i) { $inputType = "nexus"; } elsif (substr($inputType, 0, 1) =~ /^p/i) { $inputType = "phylip"; } elsif (substr($inputType, 0, 1) =~ /^s/i) { $inputType = "Se-Al"; } else { print "\t\tInvalid input ($inputType)\n"; undef $inputType; } } $inputType = "" if ($inputType eq "autodetect"); # Get whether or not to clean sequence labels my $defaultClean = ($nameClean) ? "y" : "n"; undef $nameClean; until (defined $nameClean) { print "\tClean sequence labels by changing non-alphanumeric characters to underscores (y|n)? [$defaultClean]: "; $nameClean = <stdin>; chomp ($nameClean); exit(0) if ($nameClean =~ /^q/i); if (substr($nameClean, 0, 1) =~ /^y/i or $nameClean eq "") { $nameClean = 1; } elsif (substr($nameClean, 0, 1) =~ /^n/i) { $nameClean = 0; } else { print "\t\tInvalid input ($seqSplit)\n"; undef $nameClean; } } # Get whether or not to split sequences my $defaultSplit = ($seqSplit) ? "y" : "n"; undef $seqSplit; until (defined $seqSplit) { print "\tSplit into individual partitions following charset statements (y|n)? [$defaultSplit]: "; $seqSplit = <stdin>; chomp ($seqSplit); exit(0) if ($seqSplit =~ /^q/i); if (substr($seqSplit, 0, 1) =~ /^n/i or $seqSplit eq "") { $seqSplit = 0; } elsif (substr($seqSplit, 0, 1) =~ /^y/i) { $seqSplit = 1; } else { print "\t\tInvalid input ($seqSplit)\n"; undef $seqSplit; } } # Get whether or not to jackknife sequences my $defaultJack = ($jackknife) ? "y" : "n"; undef $jackknife; until (defined $jackknife) { print "\tProduce replicate, jackknifed data sets (y|n)? [$defaultJack]: "; $jackknife = <stdin>; chomp ($jackknife); exit(0) if ($jackknife =~ /^q/i); if (substr($jackknife, 0, 1) =~ /^n/i or $jackknife eq "") { $jackknife = 0; } elsif (substr($jackknife, 0, 1) =~ /^y/i) { $jackknife = 1; } else { print "\t\tInvalid input ($jackknife)\n"; undef $jackknife; } } # Get whether or not to convert gaps my $defaultGaps = ($gapChange) ? "y" : "n"; undef $gapChange; until (defined $gapChange) { print "\tConvert ~ gap characters to - (y|n)? [$defaultGaps]: "; $gapChange = <stdin>; chomp ($gapChange); exit(0) if ($gapChange =~ /^q/i); if (substr($gapChange, 0, 1) =~ /^n/i) { $gapChange = 0; } elsif (substr($gapChange, 0, 1) =~ /^y/i or $gapChange eq "") { $gapChange = 1; } else { print "\t\tInvalid input ($gapChange)\n"; undef $gapChange; } } # Get whether or not to convert flanking gaps my $defaultFlank = ($flankGap) ? "y" : "n"; undef $flankGap; until (defined $flankGap) { print "\tConvert flanking gap characters to Ns (y|n)? [$defaultFlank]: "; $flankGap = <stdin>; chomp ($flankGap); exit(0) if ($flankGap =~ /^q/i); if (substr($flankGap, 0, 1) =~ /^n/i) { $flankGap = 0; } elsif (substr($flankGap, 0, 1) =~ /^y/i or $flankGap eq "") { $flankGap = 1; } else { print "\t\tInvalid input ($flankGap)\n"; undef $flankGap; } } # Get whether or not to convert ambiguous nucleotides my $defaultAmbig = ($ambigChange) ? "y" : "n"; undef $ambigChange; until (defined $ambigChange) { print "\tConvert ambiguous nucleotides to Ns (y|n)? [$defaultAmbig]: "; $ambigChange = <stdin>; chomp ($ambigChange); exit(0) if ($ambigChange =~ /^q/i); if (substr($ambigChange, 0, 1) =~ /^n/i or $ambigChange eq "") { $ambigChange = 0; } elsif (substr($ambigChange, 0, 1) =~ /^y/i) { $ambigChange = 1; } else { print "\t\tInvalid input ($ambigChange)\n"; undef $ambigChange; } } # Get output order of sequences my $defaultOrder = $seqOrder; undef $seqOrder; until (defined $seqOrder) { print "\tEnter output order for sequences (alphabetical|clustal|input file) [$defaultOrder]: "; $seqOrder = <stdin>; chomp ($seqOrder); exit(0) if ($seqOrder =~ /^q/i); if (substr($seqOrder, 0, 1) =~ /^i/i) { $seqOrder = "input"; } elsif (substr($seqOrder, 0, 1) =~ /^a/i or $seqOrder eq "") { $seqOrder = "alphabetical"; } else { print "\t\tInvalid input ($seqOrder)\n"; undef $seqOrder; } } # Get whether or not to convert to haplotypes my $defaultHaplo = ($haploTyping) ? "y" : "n"; undef $haploTyping; until (defined $haploTyping) { print "\tConvert sequence data to haplotypes (y|n)? [$defaultHaplo]: "; $haploTyping = <stdin>; chomp ($haploTyping); exit(0) if ($haploTyping =~ /^q/i); if (substr($haploTyping, 0, 1) =~ /^n/i or $haploTyping eq "") { $haploTyping = 0; } elsif (substr($haploTyping, 0, 1) =~ /^y/i) { $haploTyping = 1; } else { print "\t\tInvalid input ($haploTyping)\n"; undef $haploTyping; } } # Get whether or not to convert to amino acids my $defaultTranslate = ($translateSeqs) ? "y" : "n"; undef $translateSeqs; until (defined $translateSeqs) { print "\tConvert sequence data to amino acids (y|n)? [$defaultTranslate]: "; $translateSeqs = <stdin>; chomp ($translateSeqs); exit(0) if ($translateSeqs =~ /^q/i); if (substr($translateSeqs, 0, 1) =~ /^n/i or $translateSeqs eq "") { $translateSeqs = 0; } elsif (substr($translateSeqs, 0, 1) =~ /^y/i) { $translateSeqs = 1; } else { print "\t\tInvalid input ($translateSeqs)\n"; undef $translateSeqs; } } # Get genetic code for translation my $defaultCode = $globalGenCode; $globalGenCode = 99; print "\tGenetic codes available for protein translation:\n"; foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) { print "\t\t$code: $transTable{$code}"; print "\n"; } until (defined $genCodes{$globalGenCode}) { $globalOverride = 0; print "\tEnter global genetic code to be applied (0 = no translation; follow with an * to override local code) [$defaultCode]: "; $globalGenCode = <stdin>; chomp ($globalGenCode); exit(0) if ($globalGenCode =~ /^q/i); $globalGenCode = $defaultCode if ($globalGenCode eq ""); if ($globalGenCode =~ /\*$/) { $globalOverride = 1; $globalGenCode =~ s/\*$//; } print "\t\tInvalid input ($globalGenCode)\n" unless (defined $genCodes{$globalGenCode} or $globalGenCode == 0); } # Get output formats my $defaultFasta = ($fastaPrint) ? "y" : "n"; undef $fastaPrint; until (defined $fastaPrint) { print "\tOutput results in fasta format (y|n) [$defaultFasta]: "; $fastaPrint = <stdin>; chomp ($fastaPrint); exit(0) if ($fastaPrint =~ /^q/i); if (substr($fastaPrint, 0, 1) =~ /^y/i) { $fastaPrint = 1; } elsif (substr($fastaPrint, 0, 1) =~ /^n/i or $fastaPrint eq "") { $fastaPrint = 0; } else { print "\t\tInvalid input ($fastaPrint)\n"; undef $fastaPrint; } } my $defaultNexus = ($nexusPrint) ? "y" : "n"; undef $nexusPrint; until (defined $nexusPrint) { print "\tOutput results in nexus format (y|n) [$defaultNexus]: "; $nexusPrint = <stdin>; chomp ($nexusPrint); exit(0) if ($nexusPrint =~ /^q/i); if (substr($nexusPrint, 0, 1) =~ /^y/i) { $nexusPrint = 1; } elsif (substr($nexusPrint, 0, 1) =~ /^n/i or $nexusPrint eq "") { $nexusPrint = 0; } else { print "\t\tInvalid input ($nexusPrint)\n"; undef $nexusPrint; } } my $defaultPhylip = ($phylipTradPrint) ? "y" : "n"; undef $phylipTradPrint; until (defined $phylipTradPrint or $phylipExtPrint) { print "\tOutput results in traditional phylip format (y|n) [$defaultPhylip]: "; $phylipTradPrint = <stdin>; chomp ($phylipTradPrint); exit(0) if ($phylipTradPrint =~ /^q/i); if (substr($phylipTradPrint, 0, 1) =~ /^y/i) { $phylipTradPrint = 1; } elsif (substr($phylipTradPrint, 0, 1) =~ /^n/i or $phylipTradPrint eq "") { $phylipTradPrint = 0; } else { print "\t\tInvalid input ($phylipTradPrint)\n"; undef $phylipTradPrint; } } if ($phylipTradPrint == 0) # Check for extended format { my $defaultPhylip = ($phylipExtPrint) ? "y" : "n"; undef $phylipExtPrint; until (defined $phylipExtPrint or $phylipExtPrint) { print "\tOutput results in extended phylip format (y|n) [$defaultPhylip]: "; $phylipExtPrint = <stdin>; chomp ($phylipExtPrint); exit(0) if ($phylipExtPrint =~ /^q/i); if (substr($phylipExtPrint, 0, 1) =~ /^y/i) { $phylipExtPrint = 1; } elsif (substr($phylipExtPrint, 0, 1) =~ /^n/i or $phylipExtPrint eq "") { $phylipExtPrint = 0; } else { print "\t\tInvalid input ($phylipExtPrint)\n"; undef $phylipExtPrint; } } } my $defaultSeal = ($sealPrint) ? "y" : "n"; undef $sealPrint; until (defined $sealPrint) { print "\tOutput results in Se-Al format (y|n) [$defaultSeal]: "; $sealPrint = <stdin>; chomp ($sealPrint); exit(0) if ($sealPrint =~ /^q/i); if (substr($sealPrint, 0, 1) =~ /^y/i) { $sealPrint = 1; } elsif (substr($sealPrint, 0, 1) =~ /^n/i or $sealPrint eq "") { $sealPrint = 0; } else { print "\t\tInvalid input ($sealPrint)\n"; undef $sealPrint; } } if ($nexusPrint or $phylipTradPrint or $phylipExtPrint) { my $defaultInterleave = ($outFormat) ? "y" : "n"; undef $outFormat; until (defined $outFormat) { print "\tInterleave output for nexus- and/or phylip-formatted output with length (n|10-100) [$defaultInterleave]: "; $outFormat = <stdin>; chomp ($outFormat); exit(0) if ($outFormat =~ /^q/i); if ($outFormat =~ /(\d+)/) { if ($outFormat =~ /\D/ or $outFormat < 10 or $outFormat > 100) { print "\t\tInvalid input ($outFormat)\n"; undef $outFormat; } else { $outFormat = $1; } } elsif (substr($outFormat, 0, 1) =~ /^n/i or $outFormat eq "") { $outFormat = "sequential"; } else { print "\t\tInvalid input ($outFormat)\n"; undef $outFormat; } } if ($outFormat ne "sequential") { $interleaveLength = $outFormat; $outFormat = "interleave"; } } # Get verbose output mode my $defaultVerbose = ($verbose) ? "y" : "n"; undef $verbose; until (defined $verbose) { print "\tOutput verbose results to screen (y|n) [$defaultVerbose]: "; $verbose = <stdin>; chomp ($verbose); exit(0) if ($verbose =~ /^q/i); if (substr($verbose, 0, 1) =~ /^y/i) { $verbose = 1; print "\n"; } elsif (substr($verbose, 0, 1) =~ /^n/i or $verbose eq "") { $verbose = 0; } elsif (substr($verbose, 0, 1) =~ /^x/i or $verbose eq "") { $verbose = $debug = 1; } else { print "\t\tInvalid input ($verbose)\n"; undef $verbose; } } } elsif (join(' ', @ARGV) =~ /\s-h/ or $ARGV[0] =~ /^-h/) # Print help screen { print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n"; print " [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n"; print " [-u] [-v] [-h]\n"; print "Version: $version\n"; print "Options: -a = print out accession numbers instead of taxon labels for nexus\n"; print " and phylip output\n"; print " -c<number> = global genetic code (to be used for translation unless\n"; print " otherwise specified) (default = standard code; end\n"; print " with * to override local settings)\n"; foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) { print " $code: $transTable{$code}"; print "\n"; } print " -d<filename> = file containing raw sequence information; * = batch\n"; print " convert all specified file types in working\n"; print " directory (suffixes must be .fasta / .fas, .gb\n"; print " .nexus / .nex, .phylip / .ph, or .seal as\n"; print " appropriate)\n"; print " -g = do not convert ~ gap characters (e.g., from BioEdit) to -\n"; print " -G = convert flanking gaps to Ns\n"; print " -H = convert sequence data to haplotype data\n"; print " -i<f|g|n|p|s> = format of sequence file (fasta (f), GenBank (g),\n"; print " nexus (n), phylip (p), or Se-Al (s)); default =\n"; print " autodetect\n"; print " -j = produce jackknifed data sets each with a single sequence\n"; print " deleted\n"; print " -l<number> = interleave nexus- and/or phylip-formatted output\n"; print " (default = sequential) with specified sequence length\n"; print " (between 10 and 100 inclusive; default = 80);\n"; print " NOTE: also sets default interleave length for fasta\n"; print " output\n"; print " -n = convert ambiguous nucleotides to Ns\n"; print " -o<f|n|pc|pe|s> = output results in any or all of fasta (f), nexus\n"; print " (n), classic or extended phylip (pc or pe),\n"; print " and/or Se-Al (s) formats (NOTE: flag can be\n"; print " invoked multiple times)\n"; print " -r<a|i> = order sequences in final output alphabetically by name\n"; print " (a; default) or in input order from file (i)\n"; print " -s = split data set into individual partitions according to nexus\n"; print " charset statements\n"; print " -t = translate sequences into amino acids\n"; print " -u = interactive user-input mode\n"; print " -h = print this message and quit\n"; print " -v = verbose output\n"; exit(0); } else # Process switches { for (my $i = 0; $i <= $#ARGV; $i++) { if ($ARGV[$i] eq "-a") { $accPrint = 1; } elsif ($ARGV[$i] =~ /-c(\d+)/) { $globalGenCode = $1; if ($globalGenCode != 0 and not defined $genCodes{$globalGenCode}) { print "Don't understand argument: $ARGV[$i]\n"; print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n"; print " [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t] [-u]\n"; print " [-v] [-h]\n"; print "Version: $version\n"; exit(1); } $globalOverride = 1 if ($ARGV[$i] =~ /\*$/); } elsif ($ARGV[$i] =~ /^-d(.*)/) { $readFile = $1; unless ($readFile eq "*") { die "ERROR: Data file $readFile does not exist.\n" unless (-e $readFile); } push @seqFiles, $readFile; } elsif ($ARGV[$i] eq "-g") { $gapChange = 0; } elsif ($ARGV[$i] eq "-G") { $flankGap = 1; } elsif ($ARGV[$i] eq "-H") { $haploTyping = 1; } elsif ($ARGV[$i] eq "-if") { $inputType = "fasta"; } elsif ($ARGV[$i] eq "-ig") { $inputType = "GenBank"; } elsif ($ARGV[$i] eq "-in") { $inputType = "nexus"; } elsif ($ARGV[$i] eq "-ip") { $inputType = "phylip"; } elsif ($ARGV[$i] eq "-is") { $inputType = "Se-Al"; } elsif ($ARGV[$i] eq "-j") { $jackknife = 1; } elsif ($ARGV[$i] eq "-k") { $nameClean = 0; } elsif ($ARGV[$i] =~ /-l(\d+)?/) { $interleaveLength = $1 if (defined $1); $outFormat = "interleave"; } elsif ($ARGV[$i] eq "-n") { $ambigChange = 1; } elsif ($ARGV[$i] eq "-of") { $fastaPrint = 1; } elsif ($ARGV[$i] eq "-on") { $nexusPrint = 1; } elsif ($ARGV[$i] eq "-opc") { $phylipTradPrint = 1; } elsif ($ARGV[$i] eq "-ope") { $phylipExtPrint = 1; } elsif ($ARGV[$i] eq "-os") { $sealPrint = 1; } elsif ($ARGV[$i] eq "-ra") { $seqOrder = "alphabetical"; } elsif ($ARGV[$i] eq "-ri") { $seqOrder = "input"; } elsif ($ARGV[$i] eq "-s") { $seqSplit = 1; } elsif ($ARGV[$i] eq "-t") { $translateSeqs = 1; $globalGenCode = 1 unless ($globalGenCode); } elsif ($ARGV[$i] eq "-v") { $verbose = 1; } elsif ($ARGV[$i] eq "-x") { $debug = 1; $verbose = 1; } elsif ($ARGV[$i] =~ /^-O(.*)/) { $OutFile = $1; } else { print "Don't understand argument: $ARGV[$i]\n"; print "Usage: seqConverter.pl -d<filename> -o<f|n|pc|pe|s> [-a] [-c<number>] [-g]\n"; print " [-G] [-H] [-i<f|g|n|p|s>] [-j] [-l<number>] [-n] [-r<a|i>] [-s] [-t]\n"; print " [-u] [-v] [-h]\n"; print "Version: $version\n"; exit(1); } } } $accPrint = 0 if ($accPrint eq "n"); geneticCoder() if ($globalGenCode); $fastaLength = $interleaveLength; # Check for I/O errors die "ERROR: Must supply name of file containing sequence data.\n" if (not @seqFiles); die "ERROR: Must supply at least one output format.\n" unless ($fastaPrint or $nexusPrint or $phylipTradPrint or $phylipExtPrint or $sealPrint); die "ERROR: Sequence length for interleaved format must be between 10 and 100 inclusive.\n" if (($fastaPrint or (($nexusPrint or $phylipTradPrint or $phylipExtPrint) and $outFormat eq "interleave")) and ($interleaveLength < 10 or $interleaveLength > 100)); # Read in sequence data if ($seqFiles[0] eq "*") # Batch convert all appropriate files in working directory { if ($inputType) { undef @seqFiles; system("ls -d * > convertList.txt"); setLineBreak("convertList.txt"); open (LIST, "<convertList.txt") or die "Cannot open file containing names of all sequence files, convertList.txt\n"; while (<LIST>) { chomp; next unless ($_); push @seqFiles, $_ if ($inputType eq "Se-Al" and $_ =~ /\.seal$/); push @seqFiles, $_ if ($inputType eq "phylip" and ($_ =~ /\.phylip$/ or $_ =~ /\.ph$/)); push @seqFiles, $_ if ($inputType eq "nexus" and ($_ =~ /\.nexus$/ or $_ =~ /\.nex$/)); push @seqFiles, $_ if ($inputType eq "GenBank" and ($_ =~ /\.gb$/)); push @seqFiles, $_ if ($inputType eq "fasta" and ($_ =~ /\.fasta$/ or $_ =~ /\.fas$/)); } close LIST; unlink ("convertList.txt") unless ($debug); die "ERROR: No files of type $inputType found for batch conversion.\n" if (not @seqFiles); } else { die "ERROR: Must specify input file type for batch conversion\n"; } } foreach my $seqFile (@seqFiles) { print "\nConverting file $seqFile ...\n"; # Set output file names $dataSource = $seqFile; $dataSource =~ s/\.\w+$//; if ($fastaPrint) { $fastaOut = $dataSource . ".fasta"; $fastaOut =~ s/\.fasta$/_new.fasta/ if ($fastaOut eq $seqFile); $fastaOut =~ s/\.fasta$/_haplo.fasta/ if ($haploTyping); } if ($nexusPrint) { # $nexusOut = $dataSource . ".nex"; # Fixed Output File name much easier for Galaxy $nexusOut = $OutFile; $nexusOut =~ s/\.nex$/_new.nex/ if ($nexusOut eq $seqFile); $nexusOut =~ s/\.nex$/_haplo.nex/ if ($haploTyping); } if ($phylipTradPrint or $phylipExtPrint) { # $phylipOut = $dataSource . ".phylip"; # Fixed Output File name much easier for Galaxy $phylipOut = $OutFile; $phylipOut =~ s/\.phylip$/_new.phylip/ if ($phylipOut eq $seqFile); $phylipOut =~ s/\.phylip$/_haplo.phylip/ if ($haploTyping); } if ($sealPrint) { $sealOut = $dataSource . ".seal"; $sealOut =~ s/\.seal$/_new.seal/ if ($sealOut eq $seqFile); $sealOut =~ s/\.seal$/_haplo.seal/ if ($haploTyping); } $haploFile = $dataSource . "_haplotypeSeqs.txt"; $phylipExtPrint = 0 if ($phylipTradPrint); # Read in sequence data # Clear variables undef @accNum; undef %nameLabel; undef %sequence; undef %geneticCode; undef %accPresent; undef %deletedSeq; undef %finalSeq; undef $seqCount; undef $ntax; undef @charsetList; undef %charsetStart; undef %charsetEnd; my (@haploList, %haploID, %haploSeqs); $maxLength = 0; seqRead($seqFile); if (not @accNum) { print "\tERROR: Could not read in sequences from file $seqFile; skipping to next file\n"; next; } # Process for printing my $stopCodonCount; $seqType = "protein" if ($globalGenCode and $translateSeqs); foreach my $seq (@accNum) { $sequence{$seq} =~ s/\~/-/g if ($gapChange); $sequence{$seq} =~ s/R|Y|M|K|S|W|H|B|V|D/N/ig if ($ambigChange and $seqType eq "nucleotide"); $finalSeq{$seq} = $sequence{$seq}; $geneticCode{$seq} = $globalGenCode if ($globalOverride); if ($globalGenCode and $translateSeqs) { $finalSeq{$seq} = translate($sequence{$seq}, $geneticCode{$seq}); $stopCodonCount++ if (($finalSeq{$seq} =~ tr/\*//) > 2); # Check for stop codons } $maxLength = length($finalSeq{$seq}) if (length($finalSeq{$seq}) > $maxLength); $nameLabel{$seq} =~ s/\W+/_/g if ($nameClean); # Clean sequence labels of non-alphanumerics } printf "\n\tWARNING: $stopCodonCount of %s sequences had more than two stop codons; check for\n\t\t1) proper reading frame,\n\t\t2) proper genetic code, or\n\t\t3) that sequences are coding DNA\n", scalar(@accNum) if ($globalGenCode and $stopCodonCount); # Add gaps to end of any sequence less than maximum length foreach my $seq (@accNum) { $finalSeq{$seq} .= "-" x ($maxLength - length($sequence{$seq})); $sequence{$seq} = $finalSeq{$seq}; } # Convert flanking gaps to Ns if desired if ($flankGap) { foreach my $seq (@accNum) { if ($finalSeq{$seq} =~ /^(\-+)/) { my $startGap = $1; my $startN = "N" x length($startGap); $finalSeq{$seq} =~s/^$startGap/$startN/; } if ($finalSeq{$seq} =~ /(\-+)$/) { my $endGap = $1; my $endN = "N" x length($endGap); $finalSeq{$seq} =~s/$endGap$/$endN/; } } } # Determine haplotypes as needed if ($haploTyping) { foreach my $entry (@accNum) { $haploID{$entry} = 0; foreach my $haploType (@haploList) { if ($finalSeq{$entry} eq $finalSeq{$haploType}) # Matches existing haplotype; add to list { $haploID{$entry} = $haploType; push @{$haploSeqs{$haploType}}, $entry; last; } } if (not $haploID{$entry}) # No match to existing haplotype; define new haplotype { $haploID{$entry} = "haplo" . (scalar(@haploList) + 1); push @haploList, $haploID{$entry}; $nameLabel{$haploID{$entry}} = $haploID{$entry}; $finalSeq{$haploID{$entry}} = $finalSeq{$entry}; push @{$haploSeqs{$haploID{$entry}}}, $entry; } } undef @accNum; @accNum = @haploList; open (HAPLO, ">$haploFile") or die "Cannot print to file $haploFile\n"; foreach my $haploType (@haploList) { print HAPLO "$haploType:"; print HAPLO "\t$nameLabel{$_}" foreach @{$haploSeqs{$haploType}}; print HAPLO "\n"; } close HAPLO; } # Print results! print "\nPrinting results ...\n"; @accNum = sort { $nameLabel{$a} cmp $nameLabel{$b} } keys %nameLabel if ($seqOrder eq "alphabetical" and not $haploTyping); # Print full data set $ntax = scalar(@accNum); seqPrint($seqFile); # Print jackknifed data sets if ($jackknife) { my $delseqCount = 0; foreach my $seq (@accNum) { $deletedSeq{$seq} = 1; $delseqCount++; # Change output file names $fastaOut = $dataSource . "_jack$delseqCount.fasta"; $nexusOut = $dataSource . "_jack$delseqCount.nex"; $phylipOut = $dataSource . "_jack$delseqCount.phylip"; $sealOut = $dataSource . "_jack$delseqCount.seal"; $ntax = scalar(@accNum) - 1; seqPrint($seqFile); $deletedSeq{$seq} = 0; # Reset deleted sequence } } # Print data set partitions if (@charsetList and $seqSplit) { my $delCount = 0; foreach my $partition (@charsetList) { $delCount = 0; # Change output file names $fastaOut = $dataSource . "_$partition.fasta"; $nexusOut = $dataSource . "_$partition.nex"; $phylipOut = $dataSource . "_$partition.phylip"; $sealOut = $dataSource . "_$partition.seal"; # Restrict sequence to partition limits foreach my $seq (@accNum) { $deletedSeq{$seq} = 0; # Reset all deleted sequences $finalSeq{$seq} = substr($sequence{$seq}, $charsetStart{$partition} - 1, $charsetEnd{$partition} - $charsetStart{$partition} + 1); # Check that sequence remains informative unless ($finalSeq{$seq} =~ /a/i or $finalSeq{$seq} =~ /c/i or $finalSeq{$seq} =~ /g/i or $finalSeq{$seq} =~ /t/i) { $delCount++; $deletedSeq{$seq} = 1; } } $ntax = scalar(@accNum) - $delCount; $maxLength = $charsetEnd{$partition} - $charsetStart{$partition} + 1; seqPrint($seqFile); } } } exit(0); ### Subroutines used in the program sub setLineBreak # Check line breaks of input files and set input record separator accordingly { my $inFile = shift; $/ ="\n"; open (IN, "<$inFile") or die "Cannot open $inFile to check form of line breaks.\n"; while (<IN>) { if ($_ =~ /\r\n/) { print "\tDOS line breaks detected ...\n" if ($verbose); $/ ="\r\n"; last; } elsif ($_ =~ /\r/) { print "\tMac line breaks detected ...\n" if ($verbose); $/ ="\r"; last; } else { print "\tUnix line breaks detected ...\n" if ($verbose); $/ ="\n"; last; } } close IN; } sub seqRead { my $seqFile = shift; undef %sequence; print "\nReading in sequence data from file $seqFile (type is $inputType) ...\n" if ($inputType); setLineBreak($seqFile); open (SEQ, "<$seqFile") or die "Cannot open file containing sequences, $seqFile\n"; my ($header, $tempAcc, $tempName, $tempSeq); my $fastaAcc; my (%nexusSpecies, %nexusAcc, $nexusRead, $commentFlag); my ($phylipLineCount, $phylipTaxa, $phylipChars, %phylipSeq); my $sealCode; my ($sealDelFlag, $owner) = (0, 0); my ($gbAcc, $gbRead); my $macBlock = 0; my %accCount; while (<SEQ>) { chomp; my $lineRead = $_; next unless ($lineRead); # Autodetect sequence format if (not $inputType) { $inputType = "fasta" if ($lineRead =~ /^>/); $inputType = "nexus" if ($lineRead =~ /\#nexus/i); $inputType = "phylip" if ($lineRead =~ /^\s*\d+\s+\d+/); $inputType = "Se-Al" if ($lineRead =~ /^\s*Database=\{/i); $inputType = "GenBank" if ($lineRead =~ /^\s*LOCUS/); print "\nReading in sequence data from file $seqFile (type determined to be $inputType) ...\n" if ($inputType); } if ($inputType eq "nexus") { # Check if charset statement present if ($lineRead =~ /charset/i) { $lineRead =~ s/\s+//g; $lineRead =~ s/;$//; my ($charsetName, $charsetBounds) = split('=', $lineRead); $charsetName =~ s/charset//i; push @charsetList, $charsetName; if ($charsetBounds =~ /(\d+)-*(\d*)/) { $charsetStart{$charsetName} = $1; if ($2) { $charsetEnd{$charsetName} = $2; } else { $charsetEnd{$charsetName} = $1; } } } # Otherwise block out MacClade / PAUP blocks $macBlock = 1 if ($lineRead =~ /begin macclade;/i or $lineRead =~ /begin paup;/i); $macBlock = 0 if ($macBlock and $lineRead =~ /end;/i); next if ($macBlock); # Otherwise only read in data lines or charset statements if ($lineRead =~ /^\s*matrix/i) { $nexusRead = 1; next; } $nexusRead = 0 if ($lineRead =~ /;\s*$/); next unless ($nexusRead); # Remove MacClade sequence lengths $lineRead =~ s/\[\d+\]$//; $commentFlag = 1 if ($lineRead =~ /\[/); if ($lineRead =~ /\]/) { $commentFlag = 0; next; } next if ($commentFlag); next unless ($lineRead =~ /a/i or $lineRead =~ /c/i or $lineRead =~ /g/i or $lineRead =~ /t/i or $lineRead =~ /n/i or $lineRead =~ /\?/ or $lineRead =~ /\-/); # Clean up input line $lineRead =~ s/^\s+//; $lineRead =~ s/\'//g; my @nexusLine = split(/\s+/, $lineRead); my $species = shift(@nexusLine); $species =~ s/\s+/_/g; $species =~ s/\_+/_/g; my $seq = join('', @nexusLine); $seq =~ s/\s+//g; $seqType = "protein" if ($seq =~ /E/i or $seq =~ /Q/i or $seq =~ /I/i or $seq =~ /L/i or $seq =~ /F/i or $seq =~ /P/i); if (not defined $nexusSpecies{$species}) { $nexusSpecies{$species} = 1; $seqCount++; $nexusAcc{$species} = "tAlign_".$seqCount; push @accNum, $nexusAcc{$species}; $nameLabel{$nexusAcc{$species}} = $species; $sequence{$nexusAcc{$species}} = uc($seq); $geneticCode{$nexusAcc{$species}} = $globalGenCode; } else # Sequences are in interleaved format; append sequence { $sequence{$nexusAcc{$species}} .= uc($seq); } } if ($inputType eq "fasta") { if ($lineRead =~/^\s*>/) { my $species; $seqCount++; (my $tempSpecies = $lineRead) =~ s/^\s*>//; if ($tempSpecies =~ /^Mit\.\s+/) # Entry comes from European RNA project { $tempSpecies =~ s/^Mit\.\s+//i; # To fix entries from European RNA project my @speciesInfo = split(/\s+/, $tempSpecies); $species = join('_', $speciesInfo[0], $speciesInfo[1]); if (defined $speciesInfo[2]) { $fastaAcc = $speciesInfo[2]; $accCount{$fastaAcc}++; if ($accCount{$fastaAcc} > 1) { print "\nWARNING: Accession number $fastaAcc used more than once"; print "; skipping this and subsequent entries" if ($skipDuplicates); print "\n"; } } else { $fastaAcc = "tAlign_".$seqCount; $accCount{$fastaAcc}++; } } else { my @speciesLine = split(/\s+/, $tempSpecies); if ($speciesLine[$#speciesLine] =~ /^\(?\w+\d+\)?$/ and scalar(@speciesLine) > 1) # Check whether last entry is an accession number { $fastaAcc = pop (@speciesLine); $fastaAcc =~ s/^\(//g; $fastaAcc =~ s/\)$//g; $accCount{$fastaAcc}++; if ($accCount{$fastaAcc} > 1) { print "\nWARNING: Accession number $fastaAcc used more than once"; if ($skipDuplicates) { print "; skipping this and subsequent entries\n"; } else { print "; assigning temporary accession number\n"; $fastaAcc = "tAlign_" . $seqCount; } } } else { $fastaAcc = "tAlign_".$seqCount; $accCount{$fastaAcc}++; } $species = join('_', @speciesLine); $species = "Sequence_".$seqCount if ($species eq ""); } push @accNum, $fastaAcc unless ($accCount{$fastaAcc} > 1 and $skipDuplicates); $geneticCode{$fastaAcc} = $globalGenCode; ($nameLabel{$fastaAcc} = $species) =~ s/\s+/_/; $nameLabel{$fastaAcc} =~ s/\_+/_/; } else { next if ($accCount{$fastaAcc} > 1 and $skipDuplicates); $lineRead =~ s/\s+//g; $seqType = "protein" if ($lineRead =~ /E/i or $lineRead =~ /Q/i or $lineRead =~ /I/i or $lineRead =~ /L/i or $lineRead =~ /F/i or $lineRead =~ /P/i); $sequence{$fastaAcc} .= uc($lineRead); } } if ($inputType eq "Se-Al") { my $header; $sealDelFlag = 1 if ($lineRead =~/MCoL/); # Se-Al sometimes places deleted species at end of file; do not read in remainder of file next if ($sealDelFlag == 1); next unless ($lineRead =~/NumSites/i or $lineRead =~/Owner/i or $lineRead =~/Name/i or $lineRead =~/Accession/i or $lineRead =~/Sequence/i or $lineRead =~/GeneticCode/i or $lineRead =~/Frame/i); if ($lineRead =~/Owner\s*\=\s*(\d+)/i) { $owner = $1; } if ($lineRead =~/Accession/i and $owner == 2) { $seqCount++; if ($lineRead =~ /null/ or $lineRead =~ /\"\"/) { $tempAcc = "tAlign_" . $seqCount; $accCount{$tempAcc}++; } else { ($header, $tempAcc) = split (/=/, $lineRead); $tempAcc =~ s/\"//g; $tempAcc =~ s/;//g; $accCount{$tempAcc}++; if ($accCount{$tempAcc} > 1) { print "\nWARNING: Accession number $fastaAcc used more than once"; if ($skipDuplicates) { print "; skipping this and subsequent entries\n"; } else { print "; assigning temporary accession number\n"; $tempAcc = "tAlign_" . $seqCount; } print "\n"; } } push @accNum, $tempAcc unless ($accCount{$tempAcc} > 1 and $skipDuplicates); } if ($lineRead =~/Name/i and $owner == 2) { ($header, $tempName) = split (/=/, $lineRead); $tempName =~ s/\"//g; $tempName =~ s/\s*;//g; $tempName =~ s/\s+/_/g; $tempName =~ s/\_+/_/g; } if ($lineRead =~/GeneticCode=(\d)/i and $owner == 2) { $sealCode = $1; $geneticCode{$tempAcc} = $seal2gb{$sealCode}; } if ($lineRead =~/Sequence/i and $owner == 2) { next if ($accCount{$tempAcc} > 1 and $skipDuplicates); ($header, $tempSeq) = split (/=/, $lineRead); $tempSeq =~ s/\"//g; $tempSeq =~ s/;//g; $tempSeq =~ s/\s+//g; $nameLabel{$tempAcc} = $tempName; $sequence{$tempAcc} = uc($tempSeq); $seqType = "protein" if ($tempSeq =~ /E/i or $tempSeq =~ /Q/i or $tempSeq =~ /I/i or $tempSeq =~ /L/i or $tempSeq =~ /F/i or $tempSeq =~ /P/i); } if ($lineRead =~/Frame=(\d)/i) # Correct for reading frame { my $readingFrame = $1; $sequence{$tempAcc} = "--" . $sequence{$tempAcc} if ($readingFrame == 2); $sequence{$tempAcc} = "-" . $sequence{$tempAcc} if ($readingFrame == 3); } } if ($inputType eq "phylip") { if ($lineRead =~ /^\s*(\d+)\s+(\d+)/) { $phylipTaxa = $1; $phylipChars = $2; $phylipLineCount = 0; } else { $phylipLineCount++; $lineRead =~ s/\s//g; $phylipSeq{$phylipLineCount} .= $lineRead; $phylipLineCount = 0 if ($phylipLineCount == $phylipTaxa); } } if ($inputType eq "GenBank") { # Get species name and accession number # Pure GenBank format $gbAcc = $1 if ($lineRead =~ /^\s*ACCESSION\s+(\w+\d+)/); if ($lineRead =~ /^\s*ORGANISM\s+/) { $seqCount++; $gbAcc = "tAlign_" . $seqCount if (not defined $gbAcc); $lineRead =~ s/^\s+//; my @orgLine = split(/\s+/, $lineRead); my $header = shift (@orgLine); $nameLabel{$gbAcc} = join('_', @orgLine); $accCount{$gbAcc}++; } # BioEdit format if ($lineRead =~ /^\s*TITLE/ and not defined ($gbAcc)) { $seqCount++; $gbAcc = "tAlign_" . $seqCount; my @accLine = split (/\s+/, $lineRead); $gbAcc = $1 if ($accLine[2] =~ /^\((\w+\d+)\)/); $nameLabel{$gbAcc} = $accLine[1]; $accCount{$gbAcc}++; } if ($lineRead =~ /^\s*ORIGIN/) { if ($accCount{$gbAcc} > 1) { print "\nWARNING: Accession number $gbAcc used more than once"; if ($skipDuplicates) { print "; skipping this and subsequent entries\n"; next; } else { print "; assigning temporary accession number\n"; $gbAcc = "tAlign_" . $seqCount; $gbRead = 1; } } else { $gbRead = 1; $seqCount++; } next; } if ($lineRead =~ /^\s*\/\//) # End of accession; process { $gbRead = 0; next unless ($gbAcc); push @accNum, $gbAcc unless ($accCount{$gbAcc} > 1); $geneticCode{$gbAcc} = $globalGenCode; $nameLabel{$gbAcc} =~ s/\s+/_/g; $nameLabel{$gbAcc} =~ s/\_+/_/g; $sequence{$gbAcc} =~ s/\d//g; $sequence{$gbAcc} =~ s/\s+//g; $sequence{$gbAcc} =~ s/\~/-/g if ($gapChange); undef $gbAcc; } next unless ($gbRead); if ($gbRead and $lineRead =~ /^\s+\d+/) { $sequence{$gbAcc} .= uc($lineRead); } } } close SEQ; if ($inputType eq "phylip") # Postprocess input to derive taxon names and sequence; accounts for both sequential and extended formatting { for (my $i = 1; $i <= $phylipTaxa; $i++) { my $phylipAcc = "tAlign_" . $i; push @accNum, $phylipAcc; $geneticCode{$phylipAcc} = $globalGenCode; # Derive taxon name and sequence $sequence{$phylipAcc} = uc(substr($phylipSeq{$i}, 0 - $phylipChars)); $seqType = "protein" if ($sequence{$phylipAcc} =~ /E/i or $sequence{$phylipAcc} =~ /Q/i or $sequence{$phylipAcc} =~ /I/i or $sequence{$phylipAcc} =~ /L/i or $sequence{$phylipAcc} =~ /F/i or $sequence{$phylipAcc} =~ /P/i); $nameLabel{$phylipAcc} = substr($phylipSeq{$i}, 0, length($phylipSeq{$i}) - $phylipChars); $nameLabel{$phylipAcc} =~ s/\s+/_/g; $nameLabel{$phylipAcc} =~ s/\_+/_/g; } } } sub seqPrint { my $seqFile = shift; $interleaveLength = $maxLength if ($outFormat eq "sequential"); # Print fasta-formatted file if ($fastaPrint) { print "\tWriting to fasta-formatted file $fastaOut ...\n"; open (FASTA, ">$fastaOut") or die "Cannot open fasta file for aligned DNA sequences, $fastaOut"; foreach my $entry (@accNum) { # next if ($deletedSeq{$entry} or not defined $sequence{$entry}); print FASTA ">$nameLabel{$entry}"; print FASTA "\t($entry)" unless ($entry =~ /^tAlign/); print FASTA "\n"; # Print sequence my $fastaSeq = $finalSeq{$entry}; for (my $breakpoint = 0; $breakpoint <= length($fastaSeq); $breakpoint += $fastaLength) { print FASTA substr($fastaSeq, $breakpoint, $fastaLength) . "\n"; } # my $fastaSeq = $finalSeq{$entry}; # my $breakPoint = 80; # my $breakCount = 0; # until ($breakPoint > length($fastaSeq)) # { # $breakCount++; # my $replaceString = "\n" . substr($fastaSeq, $breakPoint, 1); # substr($fastaSeq, $breakPoint, 1) = $replaceString; # $breakPoint += 81; # Latter accounts for addition of \n to string # } # print FASTA "\n$fastaSeq\n"; } close FASTA; } # Print nexus-formatted file if ($nexusPrint) { print "\tWriting to nexus file $nexusOut"; print " in interleaved format" if ($outFormat eq "interleave"); print " ...\n"; open (NEX, ">$nexusOut") or die "Cannot open nexus file for aligned DNA sequences, $nexusOut"; print NEX "#NEXUS\n\n"; print NEX "[File created from $seqFile using $perlScript v$version on ".localtime()."]\n\n"; print NEX "begin data;\n"; print NEX "\tdimensions ntax = $ntax nchar = $maxLength;\n"; print NEX "\tformat datatype = $seqType gap = - missing = ?"; print NEX " interleave" if ($outFormat eq "interleave"); print NEX ";\n\n"; print NEX "\tmatrix\n\n"; for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++) { my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1; my $seqEnd = $interleaveCount * $interleaveLength; $seqEnd = $maxLength if ($maxLength <= $seqEnd); my $printLength = $seqEnd - $seqStart + 1; foreach my $entry (@accNum) { next if ($deletedSeq{$entry}); my $nexusName = $nameLabel{$entry}; $nexusName = $entry if ($accPrint); if ($nexusName =~ /\W/) { print NEX "'$nexusName'"; } else { print NEX "$nexusName"; } my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength); if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence { for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10) { next if $gapAdd > $printLength; my $bp = substr($printSeq, $gapAdd - 1, 1) . " "; substr($printSeq, $gapAdd - 1, 1, $bp); } } print NEX "\t$printSeq\n"; } print NEX "\n" if ($interleaveCount * $interleaveLength <= $maxLength); } # foreach my $entry (@accNum) # { # next if ($deletedSeq{$entry}); # if ($nameLabel{$entry} =~ /\W/) # { # print NEX "'$nameLabel{$entry}'"; # } # else # { # print NEX "$nameLabel{$entry}"; # } # print NEX "\t$finalSeq{$entry}\n"; # } print NEX "\t;\nend;\n"; close NEX; if (-e "/Developer/Tools/SetFile") { system ("/Developer/Tools/SetFile -t 'TEXT' $nexusOut"); system ("/Developer/Tools/SetFile -c 'PAUP' $nexusOut"); } } # Print phylip-formatted file (on demand) if ($phylipTradPrint or $phylipExtPrint) { my $maxTaxLength = 50; $maxTaxLength = 10 if ($phylipTradPrint); my $blankName = " " x $maxTaxLength; my %shortNameCount; print "\tWriting to phylip file $phylipOut ...\n"; open (PHYLIP, ">$phylipOut") or die "Cannot open phylip file for aligned DNA sequences, $phylipOut"; print PHYLIP " $ntax $maxLength\n"; for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++) { my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1; my $seqEnd = $interleaveCount * $interleaveLength; $seqEnd = $maxLength if ($maxLength <= $seqEnd); my $printLength = $seqEnd - $seqStart + 1; # foreach my $entry (@accNum) # { # my $trimmedName = substr($nameLabel{$entry}, 0, $maxTaxLength); # $shortNameCount{trimmedName} = 0; # } foreach my $entry (@accNum) { next if ($deletedSeq{$entry}); my $phylipName = $nameLabel{$entry}; $phylipName = $entry if ($accPrint); # Print name label as appropriate; also check label and adjust to proper length if needed if ($interleaveCount == 1) { if (length($phylipName) < $maxTaxLength) { $shortNameCount{$phylipName}++; $phylipName .= " " x ($maxTaxLength - length($phylipName)) if ($phylipTradPrint); # Pad end of name with spaces as needed } else { my $trimmedName = substr($phylipName, 0, $maxTaxLength); $shortNameCount{$trimmedName}++; if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers { $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); $phylipName .= $shortNameCount{$trimmedName}; $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed } else { $phylipName = $trimmedName; } } print PHYLIP "$phylipName"; print PHYLIP " " if ($phylipExtPrint); } else { print PHYLIP "$blankName"; } # Print sequence my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength); if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence { for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10) { next if $gapAdd > $printLength; my $bp = substr($printSeq, $gapAdd - 1, 1) . " "; substr($printSeq, $gapAdd - 1, 1, $bp); } } print PHYLIP "$printSeq\n"; } print PHYLIP "\n" if ($interleaveCount * $interleaveLength <= $maxLength); # foreach my $entry (@accNum) # { # next if ($deletedSeq{$entry}); # # my $phylipName = $nameLabel{$entry}; # # # Check name label and adjust to proper length if needed # if (length($phylipName) < $maxTaxLength) # { # $shortNameCount{$phylipName}++; # $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed # } # else # { # my $trimmedName = substr($phylipName, 0 , $maxTaxLength); # $shortNameCount{$trimmedName}++; # if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers # { # $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); # $phylipName .= $shortNameCount{$trimmedName}; # $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed # } # else # { # $phylipName = $trimmedName; # } # } # # print PHYLIP "$phylipName"; # print PHYLIP " " if ($phylipExtPrint); # print PHYLIP "$finalSeq{$entry}\n"; } close PHYLIP; } # Print Se-Al-formatted file (on demand) if ($sealPrint) { print "\tWriting to Se_Al file $sealOut ...\n"; open (SEAL, ">$sealOut") or die "Cannot open Se-Al file for aligned DNA sequences, $sealOut\n"; print SEAL "Database={\n"; print SEAL "\tID='MLst';\n"; print SEAL "\tOwner=null;\n"; print SEAL "\tName=null;\n"; print SEAL "\tDescription=null;\n"; print SEAL "\tFlags=0;\n"; print SEAL "\tCount=2;\n"; print SEAL "\t{\n\t\t{\n"; print SEAL "\t\t\tID='PAli';\n"; print SEAL "\t\t\tOwner=1;\n"; print SEAL "\t\t\tName=\"$seqFile\";\n"; print SEAL "\t\t\tDescription=null;\n"; print SEAL "\t\t\tFlags=0;\n"; print SEAL "\t\t\tNumSites=$maxLength;\n"; print SEAL "\t\t\tType="; if ($seqType eq "nucleotide") { print SEAL "\"Nucleotide\""; } else { print SEAL "\"Amino Acid\""; } print SEAL ";\n"; print SEAL "\t\t\tFeatures=null;\n"; print SEAL "\t\t\tColourMode="; if ($seqType eq "nucleotide") { print SEAL "1"; } else { print SEAL "2"; } print SEAL ";\n"; print SEAL "\t\t\tLabelMode=0;\n"; print SEAL "\t\t\ttriplets=false;\n"; print SEAL "\t\t\tinverse=true;\n"; print SEAL "\t\t\tCount=$ntax;\n"; print SEAL "\t\t\t{\n"; my $i = 0; foreach my $sequence (@accNum) { next if ($deletedSeq{$sequence}); $i++; print SEAL "\t\t\t\t{\n"; print SEAL "\t\t\t\t\tID='PSeq';\n"; print SEAL "\t\t\t\t\tOwner=2;\n"; print SEAL "\t\t\t\t\tName=\"$nameLabel{$sequence}\";\n"; print SEAL "\t\t\t\t\tDescription=null;\n"; print SEAL "\t\t\t\t\tFlags=0;\n"; print SEAL "\t\t\t\t\tAccession="; if ($sequence =~/^tAlign_/) { print SEAL "null;\n"; } else { print SEAL "$sequence;\n"; } if ($seqType eq "nucleotide") { print SEAL "\t\t\t\t\tType=\"DNA\";\n"; } else { print SEAL "\t\t\t\t\tType=\"AA\";\n"; } print SEAL "\t\t\t\t\tLength=".length($finalSeq{$sequence}).";\n"; print SEAL "\t\t\t\t\tSequence=\"$finalSeq{$sequence}\";\n"; if (defined $geneticCode{$sequence} and $geneticCode{$sequence} != 0) { print SEAL "\t\t\t\t\tGeneticCode=$gb2seal{$geneticCode{$sequence}};\n"; } else { print SEAL "\t\t\t\t\tGeneticCode=-1;\n"; # Default for Se-Al is non-coding } print SEAL "\t\t\t\t\tCodeTable=null;\n"; print SEAL "\t\t\t\t\tFrame=1;\n"; print SEAL "\t\t\t\t\tFeatures=null;\n"; print SEAL "\t\t\t\t\tParent=null;\n"; print SEAL "\t\t\t\t\tComplemented=false;\n"; print SEAL "\t\t\t\t\tReversed=false;\n"; print SEAL "\t\t\t\t}"; print SEAL "," unless ($i == $ntax); print SEAL "\n"; } print SEAL "\t\t\t};\n"; print SEAL "\t\t},\n"; print SEAL "\t\t{\n"; print SEAL "\t\t\tID='MCoL';\n"; print SEAL "\t\t\tOwner=1;\n"; print SEAL "\t\t\tName=\"Genetic Codes\";\n"; print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n"; print SEAL "\t\t\tFlags=0;\n"; print SEAL "\t\t\tCount=0;\n"; print SEAL "\t\t}\n"; print SEAL "\t};\n"; print SEAL "};\n"; close SEAL; if (-e "/Developer/Tools/SetFile") { system ("/Developer/Tools/SetFile -t 'TEXT' $sealOut"); system ("/Developer/Tools/SetFile -c 'SEAL' $sealOut"); } } } sub geneticCoder # Create translation tables for all genetic codes { my %geneticCode = ('1' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '2' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG', '3' => 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '4' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '5' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG', '6' => 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '9' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG', '10' => 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '11' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '12' => 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '13' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG', '14' => 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG', '15' => 'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '16' => 'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '21' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG', '22' => 'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG', '23' => 'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'); foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) { # Establish basic translation table for each genetic code # print "\nEstablishing \"$transTable{$code}\" genetic code ...\n" if ($debug); my $position = 0; foreach my $base1 (qw (T C A G)) { foreach my $base2 (qw (T C A G)) { foreach my $base3 (qw (T C A G)) { my $codon = $base1.$base2.$base3; $DNAtoAA{$code}{$codon} = substr($geneticCode{$code}, $position, 1); # print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug); $position++; } } } # Extend translation table to account for ambiguity codes (note: does not account for gaps) # print "\nExtending translation table to account for ambiguity codes ...\n" if ($debug); foreach my $firstPos (@ambigList) { foreach my $secondPos (@ambigList) { foreach my $thirdPos (@ambigList) { my $codon = $firstPos.$secondPos.$thirdPos; next if (defined $DNAtoAA{$code}{$codon}); my $refAA = ""; foreach my $firstNT (@ {$constitNTlist{$firstPos} }) { last if (defined $DNAtoAA{$code}{$codon}); foreach my $secondNT (@ {$constitNTlist{$secondPos} }) { last if (defined $DNAtoAA{$code}{$codon}); foreach my $thirdNT (@ {$constitNTlist{$thirdPos} }) { my $testCodon = $firstNT.$secondNT.$thirdNT; if (not $refAA) { $refAA = $DNAtoAA{$code}{$testCodon}; } else { if ($DNAtoAA{$code}{$testCodon} ne $refAA) { $DNAtoAA{$code}{$codon} = "?"; last; } } } } } $DNAtoAA{$code}{$codon} = $refAA if (not defined $DNAtoAA{$code}{$codon}); # print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug); } } } } return; } sub translate # Translate a DNA sequence to an AA sequence (note: does not account for gaps) { my $DNAseq = shift; my $userCode = shift; my $protSeq; for (my $codonStart = 0; $codonStart < length($DNAseq); $codonStart += 3) { if (length($DNAseq) - $codonStart >= 3) # Codon is complete; translate { my $codon = substr($DNAseq, $codonStart, 3); if ($codon =~ /-/ or $codon =~ /\./) { $protSeq .= "?"; } else { $protSeq .= $DNAtoAA{$userCode}{$codon}; } } else # Incomplete codon; automatically translates as ? { $protSeq .= "?"; } } return $protSeq; } # Version history # # v1.2 (August 12, 2010) # - added switch to actually allow fasta output to be specified (or not) # - sets TYPE and CREATOR codes for nexus files on Mac systems when # SetFile is present # - can now parse GenBank formatted output (both pure GenBank and BioEdit # versions) # - error checking: warns if same accession number used more than once # and skips subsequent entires # - added ability to: # - detect sequence type of input (nucleotide vs protein) and set as # appropriate in output files # - convert nucleotide input to proteins # - convert sequence input to haplotypes # - interleave nexus- and phylip-formatted output (request by Michael # Craige) and to use inputted value to specify line lengths in fasta # output # - output individual data partitions specified according to nexus- # formatted charset statements # - output jackknifed data sets, each missing a single taxon # - clean sequence labels of non-alphanumeric characters (on by # default) # - convert all ~ gap characters (e.g., from BioEdit) to - # - convert all ambiguous nucleotides to Ns # - change flnaking gaps to Ns (indirect request by Simon Creer) # - fixed classic phylip output such that it now conforms 100% to the # phylip guidelines # - improved parsing of MacClade generating files, including blocking out # MacClade and PAUP blocks when reading in nexus-formatted files # - fixed translation between GenBank and Se-Al genetic codes # - changed all instances of #nexus to #NEXUS in output files for # compatibility with TNT (thanks to Douglas Chester for spotting this) # - minor bug fixes # # v1.1 (March 2, 2006) # - added ability to batch convert all specified file types in working # directory (use -d*) # - updated to seqRead module 1.1.1 (includes autodetection of sequence # format) # - checks that necessary input file(s) exists before proceeding # - added GNU GPL statement # - sets TYPE and CREATOR codes for Se-Al files on Mac systems when # SetFile is present # - minor bug fixes # # v1.0 (May 30, 2005) # - initial release # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # http://www.gnu.org/copyleft/gpl.html or by writing to # the Free Software Foundation, Inc., 51 Franklin Street, # Fifth Floor, Boston, MA, 02110-1301, USA.