Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff alignment/phylocatenator.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phylocatenator.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,413 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +#phylocatenator.pl +#Written by Todd H. Oakley UCSB +#Version 1.0 May, 2012 +# +#phyloconcatenator.pl [infile] [also requires arguments 1-8 detailed below] +# +#Version 1.0.1 Sept 2012 -- fixed a bug that led sometimes to species with no data being retained. +#This bug would not affect the data that was retained, so if raxml ran all was fine. However, +#raxml will not run with species that have completely missing data. Added extra section to remove +#those species. + +#For debugging command line pass, uncomment next +#for(my $i=0; $i < @ARGV; $i++){ +# print "Arg $i: $ARGV[$i] \n\n"; +#} +#exit; + +die "Check arguments" unless @ARGV == 9; + +#Obtain Arguments +my $infile1=shift(@ARGV); #0 input file +my $mingf_persp=shift(@ARGV); #1 minimum number of genefamiles to keep species +my $minsp_pergf=shift(@ARGV); #2 minimum number of species to keep genefamily +my $min_gf_len =shift(@ARGV); #3 minimum gene family length +my $speciesfile=shift(@ARGV); #4 optional species file 'None' if false +my $modelsfile=shift(@ARGV); #5 models file +my $outfile=shift(@ARGV); #6 data outfile +my $partfile=shift(@ARGV); #7 partition file +my $htmlfile=shift(@ARGV); #8 html file + +#Open 2 output files +open (OUT, ">$outfile") or die "Cannot create $outfile \n"; +open (PART, ">$partfile") or die "Cannot create $partfile\n"; +open (TABLE, ">$htmlfile") or die "Cannot create $htmlfile\n"; + +my %HoAdatatable; +my %HoGF; +my %lengthof; +my %ModelFor; +my $line = ""; + +my @specieslist; +my @genelist; +my @nsp; +my $nsl=1; +my @uniquemodels; +my @nullmodels; +my @retainedspecies; + +# read file with species +unless($speciesfile eq 'None'){ + open SPFILE, $speciesfile or die "ERROR: Cannot open file $speciesfile\n\n"; + $nsl=0; +while (<SPFILE>) { + chomp; + my $currentinput = "$_"; + if($currentinput =~m /\w/){ #must have a word otherwise empty + push(@nsp, $currentinput); + }else{ + die "ERROR: Fewer than 4 species meet your specified criteria.\n"; + } +# if(@nsp < 4){ +# die "ERROR: Species file must have more than 4 species.\n"; +# } +} +#print "\n\n"; +} #end of unless + +#Determine models used for each partition, make hash +unless($modelsfile eq 'None'){ + open MODFILE, $modelsfile or die "ERROR: Cannot open file $modelsfile\n\n"; + while (<MODFILE>) { + chomp; + my $currentinput = "$_"; + if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format + if($currentinput =~m /\t\t/){ + print OUT "ERROR: file contains 2 tabs in a row. Check phytab format.\n"; + die; + }else{ + my @genemodel = split(/\t/, $currentinput); + my $genefamily=$genemodel[0]; + my $curmodel = $genemodel[1]; + if (exists $ModelFor{$genefamily}) { + print OUT "ERROR: Model specification for $genefamily is duplicated\n"; + die; + }else{ + $ModelFor{$genefamily}=$curmodel; + push(@uniquemodels,$curmodel); + } + } + }else{ + die "ERROR: Model LUT must be genefamily\tmodel and contain no blank lines\n"; + } + } +} +@uniquemodels = uniq(@uniquemodels); #remove redundant models - uniq is subroutine at end of script + +#Now check that models are valid raxml models NOT DONE YET +checkraxmlmodel(); + +#INPUT ALL DATA +open(INFILE1, $infile1); +foreach $line(<INFILE1>) { + chop($line); + my $getline = $line; + my @column = split(/\t/, $getline); + my $species = $column[0]; + my $genefamily = $column[1]; + my $genename = $column[2]; + my $sequence = $column[3]; + + if (exists $HoAdatatable{$species}{$genefamily}) { + print OUT "ERROR: $species $genefamily is duplicated\n"; + die; + }else{ + $HoAdatatable{$species}{$genefamily} = $sequence; + $HoGF{$genefamily}{$species}=$sequence; + } +} + +#If no models file selected, set every gene family to GTR model +if($modelsfile eq 'None'){ + foreach my $gfkey (sort keys %HoGF){ + $ModelFor{$gfkey} = 'GTR'; + } + push(@uniquemodels, 'GTR'); +} + +#First, keep all species with enough total partitions present +foreach my $specieskey (sort keys %HoAdatatable) +{ + #Count species with minimum gfs + my $ngf_persp=0; + foreach my $genefamilykey (sort keys %{$HoAdatatable{$specieskey}}) + { + $ngf_persp++; + } + unless($ngf_persp < $mingf_persp){ #too few genes for this species, delete the sp + if($nsl){ #No species list supplied, push all species into list + push(@specieslist,$specieskey); + }else{ + #See if current specieskey is in inputted species list + my $nsp; + foreach $nsp(@nsp){ + if (index($nsp,$specieskey) ge 0){ + push(@specieslist,$specieskey); + } + } + } + } +} +print OUT "\n"; +unless (@specieslist){ + print OUT "ERROR: No species with more than $mingf_persp genes\n"; + die; +} +if(@specieslist < 4){ + print OUT "ERROR: Less than 4 species with more than $mingf_persp genes\n"; + die; +} + + +my $oldgenelen = 0; +my $currentseqlen = 0; +foreach my $gfkey (sort keys %HoGF) +{ + $oldgenelen=0; + #Count gfs with minimum species + my $nsp_pergf=0; + for(my $j=0; $j<@specieslist; $j++){ + if(exists $HoGF{$gfkey}{$specieslist[$j]}){ + $nsp_pergf++ ; ###if exists $HoGF{$gfkey}{$specieslist[$j]}; + + + ##get length of gene and check it is consistent + if(exists $lengthof{$gfkey}){ + $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); + # $lengthof{$gfkey} = $currentseqlen; + if($currentseqlen == $oldgenelen){ + #$oldgenelen = $lengthof{$gfkey}; + #okay + }else{ + print OUT "ERROR: $specieslist[$j] $gfkey sequences ". + "different lengths than previous. Sequences must be aligned. If ". + "sequences are aligned, check that the line ". + "does not have an extra data column before the ". + "sequence.\n\n"; + die "ERROR: $specieslist[$j] $gfkey sequences ". + "different lengths than previous. Sequences must be aligned. If ". + "sequences are aligned, check that the line ". + "does not have an extra data column before the ". + "sequence.\n\n"; + } + }else{ + $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); + if($currentseqlen == 0){ + die "ERROR: Zero length sequence in file.\n" + } + $lengthof{$gfkey} = $currentseqlen; + $oldgenelen = $lengthof{$gfkey}; + } + } + } + if($nsp_pergf < $minsp_pergf){ + #too few species for this gene family + }else{ + if(exists $lengthof{$gfkey}){ + if($lengthof{$gfkey}>$min_gf_len){ + push(@genelist,$gfkey); + } + } + } +} +if (@genelist==0){ + print OUT "ERROR: No gene families/partitions meet the specified criteria\n"; + die "ERROR: No gene families/partitions meet the specified criteria\n"; +} + +#Now must delete species that lack any *retained* partitions, ie some species may be all missing +foreach(@specieslist) +{ + my $curspecies=$_; + #Count species with minimum gfs + my $ngf_persp=0; + my $nonmissing=0; + foreach (@genelist) + { + if (exists $HoAdatatable{$curspecies}{$_}){ + my $cursequence = $HoAdatatable{$curspecies}{$_}; + $cursequence =~ s/\?//g; + $cursequence =~ s/\-//g; + #Will not remove N's assumes those are not missing data. Revisit? + my $curlen = length($cursequence); + $nonmissing = $nonmissing + $curlen; + } + } + if($nonmissing > 0) #must remove species as it contains none of the genes retained + { + print "$curspecies has $nonmissing Non-Missing characters\n"; + push(@retainedspecies, $curspecies); + } +} +@specieslist=@retainedspecies; + +#Remove blankspecies from +#print phylip file +#calculate n characters +my $nchar=0; #total characters +my $ncharPs =0; #start count for characters in current partition +my $ncharPe =0; #end count for characters in current partition + +#First count total characters for first line +for(my $k=0; $k<@genelist; $k++){ + if (exists $lengthof{$genelist[$k]}){ #check that length was calculated + $nchar = $nchar + $lengthof{$genelist[$k]}; + }else{ + die "ERROR: $genelist[$k] LENGTH MISSING\n"; + } +} +print OUT @specieslist." ".$nchar."\n"; +htmlheader(); + +#Need to determine gene list order, which will change due to partitioning +#then write header line of gene names, hopefully in correct order +#print TABLE "<td bgcolor=white></td>"; #Blank line in species column +print TABLE "<td style'width:2pc'><font size='-3'>Partition:</td>"; +for(my $part=0; $part < @uniquemodels; $part++){ + for(my $k=0; $k<@genelist; $k++){ + #First check if current gf matches current partition + if(exists $ModelFor{$genelist[$k]}){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + print TABLE "<td style'width:2pc'><font size='-3'>$genelist[$k]</td>"; + } + } + } + } +} +print TABLE "</tr>"; + +#print TABLE "<td bgcolor=white></td>"; #Blank line in species column +print TABLE "<td style'width:2pc'><font size='-3'>Model:</td>"; +for(my $part=0; $part < @uniquemodels; $part++){ + for(my $k=0; $k<@genelist; $k++){ + #First check if current gf matches current partition + if(exists $ModelFor{$genelist[$k]}){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + print TABLE "<td style'width:2pc'><font size='-3'>$uniquemodels[$part]</td>"; + } + } + } + } +} +#End of htmlheader printing + +for(my $j=0; $j<@specieslist; $j++){ + print OUT "$specieslist[$j]\t"; + print TABLE " + <tr> + <td>$specieslist[$j]</td>"; + for(my $part=0; $part < @uniquemodels; $part++){ + for(my $k=0; $k<@genelist; $k++){ + #First check if current gf matches current partition + if(exists $ModelFor{$genelist[$k]}){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + if (exists $HoAdatatable{$specieslist[$j]}{$genelist[$k]}){ + print OUT $HoAdatatable{$specieslist[$j]}{$genelist[$k]}; + print TABLE "<td bgcolor=black></td>"; + $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; + }else{ + if (exists $lengthof{$genelist[$k]}){ + for(my $gap=0; $gap<$lengthof{$genelist[$k]}; $gap++){ + print OUT "?"; + } + $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; + print TABLE "<td bgcolor=white></td>"; + #print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$k]</td>"; + }else{ + die "ERROR: BUG!! $genelist[$k] LENGTH MISSING\n"; + } + } + } + }else{ + die "ERROR: $genelist[$k] is not assigned a model. Check model LUT input.\n"; + } + + } + if($j==0){ #print partitions first time through gene lists + if(($ncharPe==0) || ($ncharPs > $ncharPe)){ + push(@nullmodels, $uniquemodels[$part]); + #print "NOTE: no partitions under the model $uniquemodels[$part] made final dataset\n"; + }else{ + print PART "$uniquemodels[$part], $uniquemodels[$part] = $ncharPs - $ncharPe \n"; + } + $ncharPs=$ncharPe + 1; + } + } + print TABLE "</tr>"; + print OUT "\n"; +} +if($ncharPe/@genelist != $nchar){ + # Have to account for multiple times through gene lists -- ncharPe is summed multiple times but printed once + #die "ERROR: BUG!! Last partition number doesn't match total n characters\n"; +} + +#print Statistics to screen can be redirected for Log File +print "\nSPECIES:\n"; +unless($speciesfile eq 'None'){ + print "Used species file to select species. \n"; +} +print "Number of species with $mingf_persp or more genefamilies/partitions: ".@specieslist."\n"; +print "Species list: @specieslist\n\n\n"; +print "\nPARTITIONS/GENE FAMILIES:\n"; +print "Number genefamilies/partitions longer than $min_gf_len characters and present in at least $minsp_pergf genefamilies/partitions: ".@genelist."\n"; +for(my $i=0; $i < @genelist; $i++){ + print "$genelist[$i] Length:$lengthof{$genelist[$i]}\n"; +} + + #Printing Model stats +print "\nMODELS:\n"; +if($modelsfile eq 'None'){ + print "All partitions set to GTR (no model LUT supplied)\n\n"; +}else{ + print "A LUT of models was supplied.\n"; + print "The following models were present in the model LUT file:\n"; + print join(" ", @uniquemodels), "\n"; + print "\n"; + for(my $i=0; $i < @nullmodels; $i++){ + print "NOTE: no partitions under the model $nullmodels[$i] made final dataset\n"; + } +} +close PART; +close OUT; +close SPFILE; +close TABLE; + +sub uniq { + return keys %{{ map { $_ => 1 } @_ }}; +} + +sub checkraxmlmodel { + my @raxmlmodels = ("DNA","BIN","MULTI","DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "GTR", "MTART", "MTZOA", "FLU","PMB", "HIVB","HIVW","JTTDCMUT"); + return(1); #Not yet implemented +} +sub htmlheader { + print TABLE '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> + <html> + <head> + <title>Dataset Presences and Absences</title> + <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> + </head> + <hr><table border="1"> + <tr> + <th align="center">Species<br>'; + for(my $i=1; $i < @genelist+1; $i++){ +#Genes are printing in wrong order +# print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$i]</td>"; + print TABLE "<td style'width:2pc'><font size='-2'>$i</td>"; + } + print TABLE "</th> + </tr>"; +} +sub htmltail { +print TABLE ' +</body> +</html>'; +}