Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
view alignment/phylocatenator.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line source
#! /usr/bin/perl -w use strict; use warnings; #phylocatenator.pl #Written by Todd H. Oakley UCSB #Version 1.0 May, 2012 # #phyloconcatenator.pl [infile] [also requires arguments 1-8 detailed below] # #Version 1.0.1 Sept 2012 -- fixed a bug that led sometimes to species with no data being retained. #This bug would not affect the data that was retained, so if raxml ran all was fine. However, #raxml will not run with species that have completely missing data. Added extra section to remove #those species. #For debugging command line pass, uncomment next #for(my $i=0; $i < @ARGV; $i++){ # print "Arg $i: $ARGV[$i] \n\n"; #} #exit; die "Check arguments" unless @ARGV == 9; #Obtain Arguments my $infile1=shift(@ARGV); #0 input file my $mingf_persp=shift(@ARGV); #1 minimum number of genefamiles to keep species my $minsp_pergf=shift(@ARGV); #2 minimum number of species to keep genefamily my $min_gf_len =shift(@ARGV); #3 minimum gene family length my $speciesfile=shift(@ARGV); #4 optional species file 'None' if false my $modelsfile=shift(@ARGV); #5 models file my $outfile=shift(@ARGV); #6 data outfile my $partfile=shift(@ARGV); #7 partition file my $htmlfile=shift(@ARGV); #8 html file #Open 2 output files open (OUT, ">$outfile") or die "Cannot create $outfile \n"; open (PART, ">$partfile") or die "Cannot create $partfile\n"; open (TABLE, ">$htmlfile") or die "Cannot create $htmlfile\n"; my %HoAdatatable; my %HoGF; my %lengthof; my %ModelFor; my $line = ""; my @specieslist; my @genelist; my @nsp; my $nsl=1; my @uniquemodels; my @nullmodels; my @retainedspecies; # read file with species unless($speciesfile eq 'None'){ open SPFILE, $speciesfile or die "ERROR: Cannot open file $speciesfile\n\n"; $nsl=0; while (<SPFILE>) { chomp; my $currentinput = "$_"; if($currentinput =~m /\w/){ #must have a word otherwise empty push(@nsp, $currentinput); }else{ die "ERROR: Fewer than 4 species meet your specified criteria.\n"; } # if(@nsp < 4){ # die "ERROR: Species file must have more than 4 species.\n"; # } } #print "\n\n"; } #end of unless #Determine models used for each partition, make hash unless($modelsfile eq 'None'){ open MODFILE, $modelsfile or die "ERROR: Cannot open file $modelsfile\n\n"; while (<MODFILE>) { chomp; my $currentinput = "$_"; if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format if($currentinput =~m /\t\t/){ print OUT "ERROR: file contains 2 tabs in a row. Check phytab format.\n"; die; }else{ my @genemodel = split(/\t/, $currentinput); my $genefamily=$genemodel[0]; my $curmodel = $genemodel[1]; if (exists $ModelFor{$genefamily}) { print OUT "ERROR: Model specification for $genefamily is duplicated\n"; die; }else{ $ModelFor{$genefamily}=$curmodel; push(@uniquemodels,$curmodel); } } }else{ die "ERROR: Model LUT must be genefamily\tmodel and contain no blank lines\n"; } } } @uniquemodels = uniq(@uniquemodels); #remove redundant models - uniq is subroutine at end of script #Now check that models are valid raxml models NOT DONE YET checkraxmlmodel(); #INPUT ALL DATA open(INFILE1, $infile1); foreach $line(<INFILE1>) { chop($line); my $getline = $line; my @column = split(/\t/, $getline); my $species = $column[0]; my $genefamily = $column[1]; my $genename = $column[2]; my $sequence = $column[3]; if (exists $HoAdatatable{$species}{$genefamily}) { print OUT "ERROR: $species $genefamily is duplicated\n"; die; }else{ $HoAdatatable{$species}{$genefamily} = $sequence; $HoGF{$genefamily}{$species}=$sequence; } } #If no models file selected, set every gene family to GTR model if($modelsfile eq 'None'){ foreach my $gfkey (sort keys %HoGF){ $ModelFor{$gfkey} = 'GTR'; } push(@uniquemodels, 'GTR'); } #First, keep all species with enough total partitions present foreach my $specieskey (sort keys %HoAdatatable) { #Count species with minimum gfs my $ngf_persp=0; foreach my $genefamilykey (sort keys %{$HoAdatatable{$specieskey}}) { $ngf_persp++; } unless($ngf_persp < $mingf_persp){ #too few genes for this species, delete the sp if($nsl){ #No species list supplied, push all species into list push(@specieslist,$specieskey); }else{ #See if current specieskey is in inputted species list my $nsp; foreach $nsp(@nsp){ if (index($nsp,$specieskey) ge 0){ push(@specieslist,$specieskey); } } } } } print OUT "\n"; unless (@specieslist){ print OUT "ERROR: No species with more than $mingf_persp genes\n"; die; } if(@specieslist < 4){ print OUT "ERROR: Less than 4 species with more than $mingf_persp genes\n"; die; } my $oldgenelen = 0; my $currentseqlen = 0; foreach my $gfkey (sort keys %HoGF) { $oldgenelen=0; #Count gfs with minimum species my $nsp_pergf=0; for(my $j=0; $j<@specieslist; $j++){ if(exists $HoGF{$gfkey}{$specieslist[$j]}){ $nsp_pergf++ ; ###if exists $HoGF{$gfkey}{$specieslist[$j]}; ##get length of gene and check it is consistent if(exists $lengthof{$gfkey}){ $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); # $lengthof{$gfkey} = $currentseqlen; if($currentseqlen == $oldgenelen){ #$oldgenelen = $lengthof{$gfkey}; #okay }else{ print OUT "ERROR: $specieslist[$j] $gfkey sequences ". "different lengths than previous. Sequences must be aligned. If ". "sequences are aligned, check that the line ". "does not have an extra data column before the ". "sequence.\n\n"; die "ERROR: $specieslist[$j] $gfkey sequences ". "different lengths than previous. Sequences must be aligned. If ". "sequences are aligned, check that the line ". "does not have an extra data column before the ". "sequence.\n\n"; } }else{ $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); if($currentseqlen == 0){ die "ERROR: Zero length sequence in file.\n" } $lengthof{$gfkey} = $currentseqlen; $oldgenelen = $lengthof{$gfkey}; } } } if($nsp_pergf < $minsp_pergf){ #too few species for this gene family }else{ if(exists $lengthof{$gfkey}){ if($lengthof{$gfkey}>$min_gf_len){ push(@genelist,$gfkey); } } } } if (@genelist==0){ print OUT "ERROR: No gene families/partitions meet the specified criteria\n"; die "ERROR: No gene families/partitions meet the specified criteria\n"; } #Now must delete species that lack any *retained* partitions, ie some species may be all missing foreach(@specieslist) { my $curspecies=$_; #Count species with minimum gfs my $ngf_persp=0; my $nonmissing=0; foreach (@genelist) { if (exists $HoAdatatable{$curspecies}{$_}){ my $cursequence = $HoAdatatable{$curspecies}{$_}; $cursequence =~ s/\?//g; $cursequence =~ s/\-//g; #Will not remove N's assumes those are not missing data. Revisit? my $curlen = length($cursequence); $nonmissing = $nonmissing + $curlen; } } if($nonmissing > 0) #must remove species as it contains none of the genes retained { print "$curspecies has $nonmissing Non-Missing characters\n"; push(@retainedspecies, $curspecies); } } @specieslist=@retainedspecies; #Remove blankspecies from #print phylip file #calculate n characters my $nchar=0; #total characters my $ncharPs =0; #start count for characters in current partition my $ncharPe =0; #end count for characters in current partition #First count total characters for first line for(my $k=0; $k<@genelist; $k++){ if (exists $lengthof{$genelist[$k]}){ #check that length was calculated $nchar = $nchar + $lengthof{$genelist[$k]}; }else{ die "ERROR: $genelist[$k] LENGTH MISSING\n"; } } print OUT @specieslist." ".$nchar."\n"; htmlheader(); #Need to determine gene list order, which will change due to partitioning #then write header line of gene names, hopefully in correct order #print TABLE "<td bgcolor=white></td>"; #Blank line in species column print TABLE "<td style'width:2pc'><font size='-3'>Partition:</td>"; for(my $part=0; $part < @uniquemodels; $part++){ for(my $k=0; $k<@genelist; $k++){ #First check if current gf matches current partition if(exists $ModelFor{$genelist[$k]}){ if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ print TABLE "<td style'width:2pc'><font size='-3'>$genelist[$k]</td>"; } } } } } print TABLE "</tr>"; #print TABLE "<td bgcolor=white></td>"; #Blank line in species column print TABLE "<td style'width:2pc'><font size='-3'>Model:</td>"; for(my $part=0; $part < @uniquemodels; $part++){ for(my $k=0; $k<@genelist; $k++){ #First check if current gf matches current partition if(exists $ModelFor{$genelist[$k]}){ if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ print TABLE "<td style'width:2pc'><font size='-3'>$uniquemodels[$part]</td>"; } } } } } #End of htmlheader printing for(my $j=0; $j<@specieslist; $j++){ print OUT "$specieslist[$j]\t"; print TABLE " <tr> <td>$specieslist[$j]</td>"; for(my $part=0; $part < @uniquemodels; $part++){ for(my $k=0; $k<@genelist; $k++){ #First check if current gf matches current partition if(exists $ModelFor{$genelist[$k]}){ if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ if (exists $HoAdatatable{$specieslist[$j]}{$genelist[$k]}){ print OUT $HoAdatatable{$specieslist[$j]}{$genelist[$k]}; print TABLE "<td bgcolor=black></td>"; $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; }else{ if (exists $lengthof{$genelist[$k]}){ for(my $gap=0; $gap<$lengthof{$genelist[$k]}; $gap++){ print OUT "?"; } $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; print TABLE "<td bgcolor=white></td>"; #print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$k]</td>"; }else{ die "ERROR: BUG!! $genelist[$k] LENGTH MISSING\n"; } } } }else{ die "ERROR: $genelist[$k] is not assigned a model. Check model LUT input.\n"; } } if($j==0){ #print partitions first time through gene lists if(($ncharPe==0) || ($ncharPs > $ncharPe)){ push(@nullmodels, $uniquemodels[$part]); #print "NOTE: no partitions under the model $uniquemodels[$part] made final dataset\n"; }else{ print PART "$uniquemodels[$part], $uniquemodels[$part] = $ncharPs - $ncharPe \n"; } $ncharPs=$ncharPe + 1; } } print TABLE "</tr>"; print OUT "\n"; } if($ncharPe/@genelist != $nchar){ # Have to account for multiple times through gene lists -- ncharPe is summed multiple times but printed once #die "ERROR: BUG!! Last partition number doesn't match total n characters\n"; } #print Statistics to screen can be redirected for Log File print "\nSPECIES:\n"; unless($speciesfile eq 'None'){ print "Used species file to select species. \n"; } print "Number of species with $mingf_persp or more genefamilies/partitions: ".@specieslist."\n"; print "Species list: @specieslist\n\n\n"; print "\nPARTITIONS/GENE FAMILIES:\n"; print "Number genefamilies/partitions longer than $min_gf_len characters and present in at least $minsp_pergf genefamilies/partitions: ".@genelist."\n"; for(my $i=0; $i < @genelist; $i++){ print "$genelist[$i] Length:$lengthof{$genelist[$i]}\n"; } #Printing Model stats print "\nMODELS:\n"; if($modelsfile eq 'None'){ print "All partitions set to GTR (no model LUT supplied)\n\n"; }else{ print "A LUT of models was supplied.\n"; print "The following models were present in the model LUT file:\n"; print join(" ", @uniquemodels), "\n"; print "\n"; for(my $i=0; $i < @nullmodels; $i++){ print "NOTE: no partitions under the model $nullmodels[$i] made final dataset\n"; } } close PART; close OUT; close SPFILE; close TABLE; sub uniq { return keys %{{ map { $_ => 1 } @_ }}; } sub checkraxmlmodel { my @raxmlmodels = ("DNA","BIN","MULTI","DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "GTR", "MTART", "MTZOA", "FLU","PMB", "HIVB","HIVW","JTTDCMUT"); return(1); #Not yet implemented } sub htmlheader { print TABLE '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>Dataset Presences and Absences</title> <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> </head> <hr><table border="1"> <tr> <th align="center">Species<br>'; for(my $i=1; $i < @genelist+1; $i++){ #Genes are printing in wrong order # print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$i]</td>"; print TABLE "<td style'width:2pc'><font size='-2'>$i</td>"; } print TABLE "</th> </tr>"; } sub htmltail { print TABLE ' </body> </html>'; }