diff gene_family_scaffold_updater.pl @ 0:2b0906489073 draft default tip

Uploaded
author greg
date Tue, 21 Aug 2018 13:00:21 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_family_scaffold_updater.pl	Tue Aug 21 13:00:21 2018 -0400
@@ -0,0 +1,880 @@
+#!/usr/bin/env perl
+# Author: Eric Wafula
+# Email: ekw10@psu.edu
+# Institution: Penn State University, Biology Dept, Claude dePamphilis Lab
+# Date: June 2018
+
+use strict;
+use warnings;
+use File::Spec;
+use File::Basename;
+use Getopt::Long qw(:config no_ignore_case);
+use FindBin;
+use DBI;
+
+my $home =  "$FindBin::Bin/..";
+
+my $usage = <<__EOUSAGE__;
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+#
+#                                  GENE FAMILY SCAFFOLD UPDATER
+#
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+#  Required Options:
+#
+#
+#  --database_connection_string <string>    : Postgres database connection string using format
+#                                             postgresql://<user>:<password>@<host>/<database name>
+#
+#  --proteins <string>                      : Amino acids (proteins) sequences fasta file (proteins.fasta)
+#                                             This can either be an absolute path or just the file name
+#
+#  --coding_sequences <string>              : Corresponding coding sequences (CDS) fasta file (cds.fasta)
+#
+#  --scaffold <string>                      : Orthogroups or gene families proteins scaffold.  This can either be an absolute
+#                                             path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)
+#                                             or just the scaffold (e.g., 22Gv1.1).  If the latter, ~home/data is prepended to
+#                                             the scaffold to create the absolute path.
+#                                             the scaffold to create the absolute path.
+#                                             If Monocots clusters (version 1.0): 12Gv1.0
+#                                             If Angiosperms clusters (version 1.0): 22Gv1.0
+#                                             If Angiosperms clusters (version 1.1): 22Gv1.1
+#                                             If Green plants clusters (version 1.0): 31Gv1.0
+#                                             If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold,
+#                                             and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold
+#                                               data on how data files and directories are named, formated, and organized.
+#
+#
+#  --species_name <string>                  : Name of the species
+#
+#  --species_code <string>                  : Code of the species
+#
+#  --species_family <string>                : Family of the species
+#
+#  --species_order <string>                 : Order of the species
+#
+#  --species_group <string>                 : Group of the species
+#
+#  --species_clade <string>                 : Clade of the species
+#
+#  --rooting_order_species_code <string>    : Species code after which the new species will be placed in the rooting order config file
+#
+# # # # # # # # # # # # # # # # # #
+#  Others Options:
+#
+#  --num_threads <int>                      : number of threads (CPUs) to used for HMMScan, BLASTP, and MAFFT
+#                                             Default: 1
+#
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+#  Example Usage:
+#
+#  GeneFamilyScaffoldUpdater --database_connection_string postgresql://<user>:<password>@<host>/<database name>
+                             --proteins proteins.fasta --coding_sequences cds.fasta --scaffold 22Gv1.1
+#                            --species_name Fake genome --species_family Brassicaceae --species_order Brassicales
+                             --species_group Rosids --species_clade Core Eudicots --rooting_order_species_code Phypa
+#
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+__EOUSAGE__
+    ;
+
+# Declare and initialize variables;
+my $database_connection_string;
+my $proteins;
+my $species_name;
+my $species_code;
+my $species_family;
+my $species_order;
+my $species_group;
+my $species_clade;
+my $coding_sequences;
+my $scaffold;
+my $rooting_order_species_code;
+my $num_threads;
+
+my $options = GetOptions ( 'database_connection_string=s' => \$database_connection_string,
+                           'proteins=s' => \$proteins,
+                           'species_name=s' => \$species_name,
+                           'species_code=s' => \$species_code,
+                           'species_family=s' => \$species_family,
+                           'species_order=s' => \$species_order,
+                           'species_group=s' => \$species_group,
+                           'species_clade=s' => \$species_clade,
+                           'coding_sequences=s' => \$coding_sequences,
+                           'scaffold=s' => \$scaffold,
+                           'rooting_order_species_code=s' => \$rooting_order_species_code,
+                           'num_threads=i' => \$num_threads,
+                         );
+
+# # # # # # # # # # # # # # # # # # # # # # # # #  validate options and set variables  # # # # # # # # # # # # # # # # # # # # # # # # # #
+# check if options are set
+unless ( $options ) { die $usage; }
+unless ( $database_connection_string and $proteins and $species_name and $species_code and $species_family and $species_order and $species_group and $species_clade and $coding_sequences and $scaffold and $rooting_order_species_code ) {
+    print "\nOne or more required options not set\n"; die $usage; }
+# get scaffold directory
+my $scaffold_dir;
+if (File::Spec->file_name_is_absolute($scaffold)) {
+    $scaffold_dir = $scaffold;
+    $scaffold = basename($scaffold);
+} else {
+    if ($scaffold) { $scaffold_dir = "$home/data/$scaffold"; }
+    else { print "\n --scaffold option is not set\n\n"; die $usage; }
+}
+
+# validate scaffold and update type options
+if ( $scaffold !~ /^\d+Gv\d+\.\d+$/) {
+    print "\nOrthogroups or gene families proteins scaffold name $scaffold is not in the required format";
+    print " i.e. XGvY.Z, where X is number species in the scaffold, and Y.Z version number such as 12Gv1.0.\n";
+    die $usage;
+}
+
+# Find out if the received rooting_order_species_code is in
+# the rooting order configuration file for the scaffold.  We
+# do this before anything else since it is the least resource
+# intensive and an invalid species code will force an error.
+validate_rooting_order_species_code($scaffold_dir, $rooting_order_species_code);
+
+# Get a database connection.
+my $dbh = get_database_connection($database_connection_string);
+
+# The gene_family_scaffold_loader tool must be executed before
+# this tool so that the information about the scaffold is
+# avaialble in the galaxy_plant_tribes database.  Check to make
+# sure the scaffold has been loaded into the databse before
+# continuing with the update.
+validate_scaffold($dbh, $scaffold);
+
+# get scaffold clustering methods
+my %methods;
+my $annotation_dir = "$scaffold_dir/annot";
+opendir (DIR, "$annotation_dir") or die "Can't open $annotation_dir directory\n";
+while (my $filename = readdir(DIR)) {
+    if ($filename =~ /(\w+)\.list$/) { $methods{$1} = $1; }
+}
+
+# set defaults
+if (!$num_threads) { $num_threads = 1; }
+
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # # #  sub-routine calls  # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+log_msg("Starting gene family scaffold updating.");
+
+# Create working directory.
+my $working_dir = "./geneFamilyScaffoldUpdate_dir";
+if (-d $working_dir) {
+    die "Exiting...!\nGene family scaffold update output directory ($working_dir) already exists!\n\n";
+}
+make_directory($working_dir);
+
+# Copy original scaffold data to a working directory.
+log_msg("Copying original scaffold data to working directory.");
+my $copy_scaffold_data = system "cp -r $scaffold_dir $working_dir";
+if ($copy_scaffold_data != 0) {
+    stop_err("Copying original scaffold data to working directory failed.");
+}
+
+# Update the scaffold config files in the working directory with the new genome.
+update_config_files ( $scaffold, $rooting_order_species_code, $species_name, $species_code, $species_family, $species_order, $species_group, $species_clade, $working_dir );
+
+# Update the scaffold files in the working directory with the new genome.
+foreach my $method (keys %methods) {
+    sort_sequences ( $proteins, $coding_sequences, $scaffold, $method, $num_threads, $species_name, $working_dir, $scaffold_dir );
+}
+
+# Move updated scaffold data to original directory.
+my $updated_scaffold_dir = "$working_dir/$scaffold";
+log_msg("Removing original scaffold data directory $scaffold_dir.");
+my $remove_scaffold_data = system "rm -rf $scaffold_dir";
+if ($remove_scaffold_data != 0) {
+    stop_err("Removing original scaffold data directory failed.");
+}
+log_msg("Moving updated scaffold data from\n$updated_scaffold_dir\nto original directory\n$scaffold_dir.");
+my $move_scaffold_data = system "mv $updated_scaffold_dir $scaffold_dir";
+if ($move_scaffold_data != 0) {
+    stop_err("Moving updated scaffold data to original directory failed.");
+}
+
+# Update the database tables with the new genome.
+update_database_tables ( $dbh, $proteins, $scaffold, \%methods, $species_name, $species_family, $species_order, $species_group, $species_clade, $scaffold_dir, $working_dir );
+
+log_msg("Completed gene family scaffold updating.");
+
+exit(0);
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+sub log_msg {
+    my ($msg) = @_;
+    print localtime()." - ".$msg."\n\n";
+}
+
+sub stop_err {
+    my ($error_msg) = @_;
+    print "\n-- ".localtime()." - ".$error_msg."\n\n";
+    exit(1);
+}
+
+sub validate_rooting_order_species_code {
+    my ($scaffold_dir, $rooting_order_species_code ) = @_;
+    my $rooting_order_config = "$scaffold_dir/$scaffold.rootingOrder.config";
+    open (IN, $rooting_order_config) or die "Can't open $rooting_order_config\n";
+    while (<IN>) {
+        chomp;
+        if (/^#/ || /^$/ || /^\[/) {
+            # Skip comments, blasnk lines and section headers.
+            next;
+        }
+        # Example line: Physcomitrella patens=Phypa
+        my @F = split(/=/, $_);
+        my $rooting_order_species_code_cmp = $F[1];
+        if (defined($rooting_order_species_code_cmp) && $rooting_order_species_code_cmp eq $rooting_order_species_code) {
+            return;
+        }
+    }
+    stop_err("Invalid rooting order species code $rooting_order_species_code is not found in $rooting_order_config");
+}
+
+sub validate_scaffold {
+    my ($dbh, $scaffold) = @_;
+    my ($stmt, $sth, $rv);
+    $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold';);
+    $sth = $dbh->prepare( $stmt );
+    $rv = $sth->execute() or die $DBI::errstr;
+    if ($rv < 0) { print $DBI::errstr; }
+    if ($sth->rows > 0) {
+        return;
+    }
+    stop_err("The scaffold $scaffold has not been loaded into the database - use the GeneFamilyScaffoldLoader tool to load the scaffold before attempting  to update the scaffold with this tool.");
+}
+
+sub make_directory {
+        my ( $new_dir ) = @_;
+        if (!-d $new_dir) {
+                mkdir($new_dir, 0755);
+        }
+}
+
+sub get_database_connection {
+    my ($database_connection_string) = @_;
+    # Database connection and variables, the format of database_connection_string is
+    # postgresql://<user>:<password>@<host>/<database name>
+    my @conn_part = split(/:\/\//, $database_connection_string);
+    my $conn_part_str = $conn_part[1];
+    my $driver = "Pg";
+    my @conn_part2 = split(/\//, $conn_part_str);
+    my $database = $conn_part2[1];
+    my $dsn = "DBI:$driver:dbname = $database;host = 127.0.0.1;port = 5432";
+    @conn_part2 = split(/:/, $conn_part_str);
+    my $userid = $conn_part2[0];
+    @conn_part2 = split(/@/, $conn_part_str);
+    my $conn_part2_str = $conn_part2[0];
+    my @conn_part3 = split(/:/, $conn_part2_str);
+    my $password = $conn_part3[1];
+    my $dbh = DBI->connect($dsn, $userid, $password, { RaiseError => 1 }) or die "$DBI::errstr\nError : Unable to open database galaxy_plant_tribes\n";
+    log_msg("Successfully connected to database $database.");
+    return $dbh;
+}
+
+sub update_config_files {
+    my ( $scaffold, $rooting_order_species_code, $species_name, $species_code, $species_family, $species_order, $species_group, $species_clade, $working_dir ) = @_;
+    log_msg("Updating scaffold config files in working directory.");
+    # Update  the rootingOrder config file.
+    my $rooting_order_config = "$working_dir/$scaffold/$scaffold.rootingOrder.config";
+    my $tmp_rooting_order_config = "$working_dir/$scaffold/$scaffold.rootingOrder.config.tmp";
+    open (IN, $rooting_order_config) or die "Can't open $rooting_order_config\n";
+    open (OUT, ">$tmp_rooting_order_config") or die "Can't open $tmp_rooting_order_config file\n";
+    my $inserted = 0;
+    while (<IN>) {
+        # Example line: Physcomitrella patens=Phypa
+        chomp;
+        print OUT "$_\n";
+        if (not $inserted) {
+            if (not /^#/ && not /^$/ && not /^\[/) {
+                my @F=split(/=/, $_);
+                my $cmp_species_code = $F[1];
+                if (defined($cmp_species_code) && $cmp_species_code eq $rooting_order_species_code) {
+                    print OUT "$species_name=$species_code\n";
+                    $inserted = 1;
+                }
+            }
+        }
+    }
+    close OUT;
+    close IN;
+    my $update_rooting_order = system "mv $tmp_rooting_order_config $rooting_order_config >/dev/null";
+    if ($update_rooting_order != 0) {
+        stop_err("Updating rooting order config in working directory failed.");
+    }
+    # Update  the taxaLineage config file.
+    my $taxa_lineage_config = "$working_dir/$scaffold/$scaffold.taxaLineage.config";
+    # Make sure the last line of the file ends with a newline character
+    # by rewriting the entire file.
+    my $tmp_taxa_lineage_config = "$working_dir/$scaffold/$scaffold.taxaLineage.config.tmp";
+    open (IN, $taxa_lineage_config) or die "Can't open $taxa_lineage_config\n";
+    open (OUT, ">$tmp_taxa_lineage_config") or die "Can't open $tmp_taxa_lineage_config file\n";
+    while (<IN>) {
+        chomp;
+        print OUT "$_\n";
+    }
+    close OUT;
+    close IN;
+    my $update_taxa = system "mv $tmp_taxa_lineage_config $taxa_lineage_config >/dev/null";
+    if ($update_taxa != 0) {
+        stop_err("Updating taxa lineage config in working directory failed.");
+    }
+    # Append the new species information to the file.
+    open (OUT, ">>$taxa_lineage_config") or die "Can't open $taxa_lineage_config file\n";
+    print OUT "$species_name\t$species_family\t$species_order\t$species_group\t$species_clade\n";
+    close OUT;
+}
+
+sub sort_sequences {
+    my ( $proteins, $coding_sequences, $scaffold, $method, $num_threads, $species_name, $working_dir, $scaffold_dir ) = @_;
+    my $method_dir = "$working_dir/$method";
+    make_directory($method_dir);
+    log_msg("Sorting working directory protein sequences in the $method clustering method.");
+    log_msg("Running BLASTP.");
+    my $blastp_call = system "blastp -outfmt 6 -evalue 1e-5 -num_threads $num_threads -query $proteins -db $scaffold_dir/db/blast/$method -out $method_dir/proteins.blastp  >/dev/null";
+    if ($blastp_call != 0) {
+        stop_err("Running BLASTP failed.");
+    }
+    log_msg("Getting best BLASTP hits.");
+    my $blast_results = "proteins.blastp";
+    get_best_blastp_orthos ( $blast_results, $scaffold, $method, $method_dir, $scaffold_dir );
+    log_msg("Running HMMScan.");
+    my $hmmscan_call = system "hmmscan -E 1e-5 --cpu $num_threads --noali --tblout $method_dir/proteins.hmmscan -o $method_dir/hmmscan.log $scaffold_dir/db/hmm/$method $proteins >/dev/null";
+    if ($hmmscan_call != 0) {
+        stop_err("Running HMMScan failed.");
+    }
+    log_msg("Getting best HMMScan hits.");
+    my $hmmscan_results = "proteins.hmmscan";
+    get_best_hmmscan_orthos ( $hmmscan_results, $scaffold, $method, $method_dir, $scaffold_dir );
+    get_blast_hmmscan_orthos ( $scaffold, $method, $method_dir, $scaffold_dir );
+    get_orthogroup_fasta ( $proteins, $coding_sequences, $method, $method_dir, $scaffold_dir );
+    update_scaffold_data( $proteins, $scaffold, $method, $num_threads, $method_dir, $species_name, $working_dir, $scaffold_dir );
+}
+
+sub  get_best_blastp_orthos {
+    my ( $blast_results, $scaffold, $method, $method_dir, $scaffold_dir ) = @_;
+    my (%best, %max, %list);
+    open (IN, "$method_dir/$blast_results") or die "Can't open $method_dir/$blast_results file\n";
+    while (<IN>) {
+        chomp;
+        my @F=split(/\t/, $_);
+        if ($F[0] eq $F[1]) { next; }
+        if (!$best{$F[0]}) {
+            $best{$F[0]} = $_;
+            $max{$F[0]} = $F[11];
+        }
+        else {
+            if ($F[11] > $max{$F[0]}) {
+                $best{$F[0]} = $_;
+                $max{$F[0]} = $F[11];
+            }
+        }
+    }
+    close IN;
+    open (IN, "$scaffold_dir/annot/$method.list") or die "Can't open $scaffold_dir/annot/$method.list file\n";
+    while (<IN>) {
+        chomp;
+        my @F=split(/\t/, $_);
+        $list{$F[1]} = $F[0];
+    }
+    close IN;
+    open (OUT, ">$method_dir/$blast_results.bestOrthos") or die "Can't open $method_dir/$blast_results.bestOrthos file\n";
+    print OUT "Gene ID\tOrthogroup ID\n";
+    foreach (keys %best) {
+        my @F = split(/\t/, $best{$_});
+        print OUT "$F[0]\t$list{$F[1]}\n";
+    }
+    close OUT;
+}
+
+sub  get_best_hmmscan_orthos {
+    my ( $hmmscan_results, $scaffold, $method, $method_dir, $scaffold_dir ) = @_;
+    my %hits;
+    open (IN, "$method_dir/$hmmscan_results") or die "Can't open $method_dir/$hmmscan_results file\n";
+    while (<IN>) {
+        if (/^#/){next;}
+        my @F = split(/\s+/, $_);
+        $hits{$F[2]}{$F[0]} = $F[5];
+    }
+    close IN;
+    open (OUT, ">$method_dir/$hmmscan_results.bestOrthos") or die "Can't open $method_dir/$hmmscan_results.bestOrthos file\n";
+    print OUT "Gene ID\tOrthogroup ID\n";
+    for my $hit (keys %hits) {
+        my $score = 0;
+        my $best_target;
+        for my $target (keys %{$hits{$hit}}) {
+            if ($hits{$hit}{$target} >= $score) {
+                $score = $hits{$hit}{$target};
+                $best_target = $target;
+            }
+        }
+        print OUT "$hit\t$best_target\n";
+     }
+    close OUT;
+}
+
+sub get_blast_hmmscan_orthos {
+    my ( $scaffold, $method, $method_dir, $scaffold_dir ) = @_;
+    my (%blastp, %hmmscan, %genes);
+    opendir (DIR, "$method_dir") or die "Can't open $method_dir directory\n";
+    while (my $filename = readdir(DIR)) {
+        if ($filename =~ /^proteins\.blastp\.bestOrthos$/){
+            open (IN, "$method_dir/$filename") or die "Can't open $method_dir/$filename file\n";
+            while (<IN>) {
+                chomp;
+                if (/^Gene/) {next;}
+                my @F = split(/\t/, $_);
+                $blastp{$F[0]} = $F[1];
+                $genes{$F[0]} = $F[0];
+            }
+            close IN;
+        }
+        if ($filename =~ /^proteins\.hmmscan\.bestOrthos$/){
+            open (IN, "$method_dir/$filename") or die "Can't open $method_dir/$filename file\n";
+            while (<IN>) {
+                chomp;
+                if (/^Gene/) {next;}
+                my @F = split(/\t/, $_);
+                $hmmscan{$F[0]} = $F[1];
+                $genes{$F[0]} = $F[0];
+            }
+            close IN;
+        }
+    }
+    closedir DIR;
+    open (OUT, ">$method_dir/proteins.both.bestOrthos") or die "Can't open $method_dir/protein.both.bestOrthos file\n";
+    print OUT "Gene ID\tOrthogroup ID\n";
+    foreach (sort keys %genes) {
+        if (!$blastp{$_} and $hmmscan{$_}) { print OUT "$_\t$hmmscan{$_}\n"; next; }
+        elsif ($blastp{$_} and !$hmmscan{$_}) { print OUT "$_\t$blastp{$_}\n"; next; }
+        elsif ($blastp{$_} == $hmmscan{$_}) { print OUT "$_\t$blastp{$_}\n"; next }
+        else { print OUT "$_\t$hmmscan{$_}\n"; }
+    }
+    close OUT;
+}
+
+sub get_orthogroup_fasta {
+    my ( $proteins, $coding_sequences, $method, $method_dir, $scaffold_dir ) = @_;
+    log_msg("Retrieving orthogroup fasta files.");
+    my (%orthos, %pep, %cds);
+    my $orthogroup_fasta = "$method_dir/orthogroups_fasta";
+    make_directory($orthogroup_fasta);
+    my $orthogroup_assignment = "proteins.both.bestOrthos";
+    open (IN, "$method_dir/$orthogroup_assignment") or die "Can't open $method_dir/$orthogroup_assignment file\n";
+    while (<IN>) {
+        chomp;
+        if ($_ =~ /^Gene/) { next; }
+        my @F = split(/\t/, $_);
+        $orthos{$F[1]}{$F[0]} = $F[0];
+    }
+    close IN;
+    %pep = get_sequences ($proteins);
+    if ($coding_sequences) { %cds = get_sequences ($coding_sequences); }
+    my ($ortho_id, $seq_id);
+    foreach $ortho_id (keys %orthos) {
+        open (PEP, ">$orthogroup_fasta/$ortho_id.faa") or die "Can't open $orthogroup_fasta/$ortho_id.faa file\n";
+        if ($coding_sequences) { open (CDS, ">$orthogroup_fasta/$ortho_id.fna") or die "Can't open $orthogroup_fasta/$ortho_id.fna file\n"; }
+        foreach $seq_id (sort keys %{$orthos{$ortho_id}}) {
+            $pep{$seq_id} =~ s/.{80}(?=.)/$&\n/g;
+            print PEP ">$seq_id\n$pep{$seq_id}\n";
+            if ($coding_sequences) {
+                $cds{$seq_id} =~ s/.{80}(?=.)/$&\n/g;
+                print CDS ">$seq_id\n$cds{$seq_id}\n";
+            }
+        }
+        close PEP;
+        close CDS;
+    }
+    my @files;
+    my $formated_fasta = "$orthogroup_fasta/formated_fasta";
+    make_directory($formated_fasta);
+    opendir(DIR, "$orthogroup_fasta") or die "Can't open $orthogroup_fasta directory\n";
+    while(my $filename = readdir(DIR)) {
+        if($filename =~ /\d+\.fna/ or $filename =~ /\d+\.faa/){
+            push (@files, $filename);
+        }
+    }
+    closedir(DIR);
+    foreach my $file (@files) {
+        open (IN, "$orthogroup_fasta/$file") or die "Can't open $orthogroup_fasta/$file file\n";
+        open (OUT, ">$formated_fasta/$file") or die "Can't open $formated_fasta/$file file\n";
+        while (<IN>) {
+            chomp;
+            if(/^>/){ s/\|/_/g; print OUT "$_\n"; }
+            else { print OUT "$_\n"; }
+        }
+        close IN;
+        close OUT;
+    }
+    my $integrated_orthogroup_fasta = "$method_dir/integrated_orthogroup_fasta";
+    make_directory($integrated_orthogroup_fasta);
+    integrate_orthogroup_fasta ( $formated_fasta, $method, $integrated_orthogroup_fasta, $scaffold_dir );
+}
+
+sub get_sequences {
+    my ( $file ) = @_;
+    my (%sequences, $id);
+    open (IN, "$file") or die "Can't open $file file\n";
+    while (<IN>) {
+        if ($_ =~ />(\S+)/){ $id = $1; }
+        else { s/\s+//g; $sequences{$id} .= $_; }
+    }
+    close IN;
+    return %sequences;
+}
+
+sub integrate_orthogroup_fasta {
+    my ( $formated_fasta, $method, $integrated_orthogroup_fasta, $scaffold_dir) = @_;
+    log_msg("Integrating orthogroup fasta files.");
+    my (%pep, %cds);
+    opendir (DIR, "$formated_fasta") or die "Can't open $formated_fasta directory\n";
+    while ( my $filename = readdir(DIR) ) {
+        if ($filename =~ /^(\d+)\.faa$/) { $pep{$1} = $1; }
+        if ($filename =~ /^(\d+)\.fna$/) { $cds{$1} = $1; }
+    }
+    closedir DIR;
+    if (keys(%cds) and (keys(%pep) != keys(%cds))) {
+        die "Exiting...!\nOrthogroup classification protein and CDS fasta files not equivalent in $formated_fasta directory\n\n";
+    }
+    foreach my $ortho_id (keys %pep) {
+        my $merging_call = system "cat $scaffold_dir/fasta/$method/$ortho_id.faa $formated_fasta/$ortho_id.faa > $integrated_orthogroup_fasta/$ortho_id.faa";
+        if ($merging_call != 0) {
+            stop_err("Merging orthogroup $ortho_id failed.");
+        }
+        if (keys(%cds) and $cds{$ortho_id}) {
+            my $merging_call = system "cat $scaffold_dir/fasta/$method/$ortho_id.fna $formated_fasta/$ortho_id.fna > $integrated_orthogroup_fasta/$ortho_id.fna";
+            if ($merging_call != 0) {
+                stop_err("Merging orthogroup $ortho_id failed.");
+            }
+        }
+    }
+}
+
+sub update_scaffold_data {
+    my ( $proteins, $scaffold, $method, $num_threads, $method_dir, $species_name, $working_dir, $scaffold_dir ) = @_;
+    log_msg("Updating scaffold data files.");
+    # update orthogroup annotation files
+    log_msg("Updating orthogroup annotation files.");
+    my %annot;
+    open (OUT, ">>$working_dir/$scaffold/annot/$method.list") or die "Can't open $working_dir/$scaffold/annot/$method.list file\n";
+    opendir (DIR, "$method_dir/orthogroups_fasta") or die "Can't open $method_dir/orthogroups_fasta directory\n";
+    while ( my $filename = readdir(DIR) ) {
+        my $seq_count = 0;
+        my $ortho_id;
+        if ($filename =~ /^(\d+)\.faa$/) {
+            $ortho_id = $1;
+            open (IN, "$method_dir/orthogroups_fasta/$filename") or die "Can't open $method_dir/orthogroups_fasta/$filename file\n";
+            while (<IN>) {
+                chomp;
+                if (/^>(\S+)/) {
+                    print OUT "$ortho_id\t$1\n";
+                    $seq_count++;
+                }
+            }
+            close IN;
+        }
+        else { next; }
+        $annot{$ortho_id} =  $seq_count;
+    }
+    closedir DIR;
+    close OUT;
+    my ($fields, %avg_summary, %min_summary);
+    if (File::Spec->file_name_is_absolute($proteins)) { $proteins = basename($proteins); }
+    open (IN, "$working_dir/$scaffold/annot/$method.avg_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.avg_evalue.summary file\n";
+    while (<IN>) {
+        chomp;
+        if (/^Orthogroup\s+ID\s+(.*)/) { $fields = "Orthogroup ID\t$species_name\t$1"; next; }
+        else { /(\d+)\s+(.*)/; $avg_summary{$1} = $2; }
+    }
+    close IN;
+    open (OUT, ">$working_dir/$scaffold/annot/$method.avg_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.avg_evalue.summary file\n";
+    print OUT "$fields\n";
+    foreach (sort {$a<=>$b} keys %avg_summary) {
+        if ($annot{$_}) { print OUT "$_\t$annot{$_}\t$avg_summary{$_}\n"; }
+        else { print OUT "$_\t0\t$avg_summary{$_}\n"; }
+    }
+    close OUT;
+    open (IN, "$working_dir/$scaffold/annot/$method.min_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.min_evalue.summary file\n";
+    while (<IN>) {
+        chomp;
+        if (/^Orthogroup\s+ID\s+(.*)/) { $fields = "Orthogroup ID\t$species_name\t$1"; next; }
+        else { /(\d+)\s+(.*)/; $min_summary{$1} = $2; }
+    }
+    close IN;
+    open (OUT, ">$working_dir/$scaffold/annot/$method.min_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.min_evalue.summary file\n";
+    print OUT "$fields\n";
+    foreach (sort {$a<=>$b} keys %min_summary) {
+        if ($annot{$_}) { print OUT "$_\t$annot{$_}\t$min_summary{$_}\n"; }
+        else { print OUT "$_\t0\t$min_summary{$_}\n"; }
+    }
+    close OUT;
+
+    # update orthogroup fasta files
+    log_msg("Updating orthogroup fasta files.");
+    opendir (DIR, "$method_dir/integrated_orthogroup_fasta") or die "Can't open $method_dir/integrated_orthogroup_fasta directory\n";
+    while ( my $filename = readdir(DIR) ) {
+        if (($filename =~ /^(\d+)\.faa$/) or ($filename =~ /^(\d+)\.fna$/)) {
+            my $update_orthogroup_fasta = system "cp $method_dir/integrated_orthogroup_fasta/$filename $working_dir/$scaffold/fasta/$method/$filename >/dev/null";
+            if ($update_orthogroup_fasta != 0) {
+                stop_err("Updating orthogroup fasta $filename failed.");
+            }
+        }
+    }
+    close IN;
+    closedir DIR;
+
+    # update orthogroup alignments
+    log_msg("Updating alignments.");
+    opendir (DIR, "$method_dir/orthogroups_fasta/formated_fasta") or die "Can't open $method_dir/orthogroups_fasta/formated_fasta directory\n";
+    while ( my $filename = readdir(DIR) ) {
+        if ($filename =~ /^(\d+)\.faa$/) {
+            my $ortho_id = $1;
+            my $align_fasta = system "mafft --thread $num_threads --add $method_dir/orthogroups_fasta/formated_fasta/$filename $working_dir/$scaffold/alns/$method/$ortho_id.aln > $method_dir/orthogroups_fasta/formated_fasta/$ortho_id.aln 2>/dev/null";
+            if ($align_fasta != 0) {
+                stop_err("Aligning orthogroup fasta file $filename failed.");
+            }
+            my $update_orthogroup_alignment = system "mv $method_dir/orthogroups_fasta/formated_fasta/$ortho_id.aln $working_dir/$scaffold/alns/$method/$ortho_id.aln >/dev/null";
+            if ($update_orthogroup_alignment != 0) {
+                stop_err(" - Updating orthogroup alignment $ortho_id.aln failed.");
+            }
+        }
+    }
+    closedir DIR;
+
+    # update orthogroup hmm profiles
+    log_msg("Updating hmm profiles.");
+    opendir (DIR, "$working_dir/$scaffold/alns/$method") or die "Can't open $working_dir/$scaffold/alns/$method directory\n";
+    while ( my $filename = readdir(DIR) ) {
+        if ($filename =~ /^(\d+)\.aln$/) {
+            my $ortho_id = $1;
+            my $update_orthogroup_hmm = system "hmmbuild -n $ortho_id --amino --cpu $num_threads $working_dir/$ortho_id.hmm $working_dir/$scaffold/alns/$method/$ortho_id.aln >/dev/null";
+            if ($update_orthogroup_hmm != 0) {
+                stop_err("Updating orthogroup hmm profile $ortho_id.hmm failed.");
+            }
+            my $convert_hmm_format = system "hmmconvert $working_dir/$ortho_id.hmm > $working_dir/$scaffold/hmms/$method/$ortho_id.hmm";
+            if ($convert_hmm_format != 0) {
+                stop_err("Converting orthogroup hmm profile $ortho_id.hmm format failed.");
+            }
+            my $remove_tmp_file = system "rm  $working_dir/$ortho_id.hmm >/dev/null";
+            if ($remove_tmp_file != 0) {
+                stop_err("Could not remove temporary hmm profile $ortho_id.hmm.");
+            }
+        }
+    }
+
+    # update orthogroup blast and hmm databases
+    log_msg("Updating blast and hmm databases.");
+    my $update_blast_database = system "find $method_dir/orthogroups_fasta/ -name \"*.faa\" -print0 | xargs -0 cat >> $working_dir/$scaffold/db/blast/$method";
+    if ($update_blast_database != 0) {
+        stop_err("Updating blast database failed.");
+    }
+    my $index_blast_database = system "makeblastdb -in $working_dir/$scaffold/db/blast/$method -dbtype prot >/dev/null";
+    if ($index_blast_database != 0) {
+        stop_err("Indexing blast database failed.");
+    }
+    my $update_hmm_database = system "find $working_dir/$scaffold/hmms/$method/ -name \"*.hmm\" -print0 | xargs -0 cat > $working_dir/$scaffold/db/hmm/$method";
+    if ($update_hmm_database != 0) {
+        stop_err("Updating hmm database failed.");
+    }
+    my $index_hmm_database = system "hmmpress -f $working_dir/$scaffold/db/hmm/$method >/dev/null";
+    if ($index_hmm_database != 0) {
+        stop_err("Indexing hmm database failed.");
+    }
+}
+
+sub update_database_tables {
+    my ( $dbh, $proteins, $scaffold, $methods, $species_name, $species_family, $species_order, $species_group, $species_clade, $scaffold_dir, $working_dir ) = @_;
+    log_msg("Updating for database tables.");
+    my ( $species_code, %method_genes, %gene_sequences, %dna, %aa );
+    my $gsot_association_prep_file = "$working_dir/gene_scaffold_orthogroup_taxon_association.tsv";
+    my $num_recs = 0;
+
+    # Output a prep file that stores information for updating
+    # the gene_scaffold_orthogroup_taxon_association table.
+    open (ASSOC, ">$gsot_association_prep_file") or die "Can't open $gsot_association_prep_file file\n";
+    print ASSOC "gene_id\tscaffold_id\tclustering_method\torthogroup_id\tspecies_name\n";
+    # get new species name and code
+    if (File::Spec->file_name_is_absolute($proteins)) { $proteins = basename($proteins); }
+    $species_name =~ s/\_/ /g;
+    my $rooting_order_config = "$scaffold_dir/$scaffold.rootingOrder.config";
+    open(IN, "$rooting_order_config") or die "Can't open $rooting_order_config file\n";
+    while (<IN>){
+        chomp;
+        if (/^\#/ or /^\s+/ or /^\[/){ next; }
+        if (/(\w+\s+\w+)\=(\w+)/) { if ($species_name eq $1) { $species_code = $2;} }
+    }
+    close IN;
+    foreach my $clustering_method (keys %$methods) {
+        # Updating orthogroup database table
+        log_msg("Updating $clustering_method records for the plant_tribes_orthogroup database table.");
+        my ( $stmt, $sth, $rv, $scaffold_id );
+        $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold' AND clustering_method = '$clustering_method';);
+        $sth = $dbh->prepare( $stmt );
+        $rv = $sth->execute() or die $DBI::errstr;
+        if ($rv < 0) { print $DBI::errstr; }
+        while (my @row = $sth->fetchrow_array()) {
+            $scaffold_id = $row[0];
+        }
+        my $scaffold_annotation_dir = "$scaffold_dir/annot";
+        opendir (DIR, $scaffold_annotation_dir) or die "Can't open $scaffold_annotation_dir directory\n";
+        $num_recs = 0;
+        while ( my $filename = readdir(DIR) ) {
+            if ($filename =~ /$clustering_method.min_evalue\.summary/) {
+                open (IN, "$scaffold_annotation_dir/$filename") or die "Can't open $scaffold_annotation_dir/$filename file\n";
+                while (<IN>){
+                    chomp;
+                    if (/^Orthogroup/){ next; }
+                    my @fields = split(/\t/, $_);
+                    my $num_species = 0;
+                    my $num_genes = 0;
+                    $scaffold =~ /(\d+)Gv\d+\.\d+/; # 22Gv1.1
+                    my $genomes = $1 + 1;
+                    for (1..$genomes){
+                        if ($fields[$_] > 0){ $num_species++; }
+                        $num_genes += $fields[$_];
+                    }
+                    $stmt = qq(UPDATE plant_tribes_orthogroup SET num_species = $num_species, num_genes = $num_genes WHERE orthogroup_id = $fields[0] AND scaffold_id = $scaffold_id;);
+                    $rv = $dbh->do($stmt) or die $DBI::errstr;
+                    if($rv < 0) {
+                        print $DBI::errstr;
+                    }
+                    $num_recs = $num_recs + 1;
+                }
+                close IN;
+            }
+            if ($filename =~ /$clustering_method\.list/) {
+                open (IN, "$scaffold_annotation_dir/$filename") or die "Can't open $scaffold_annotation_dir/$filename file\n";
+                while (<IN>){
+                    chomp;
+                     my @fields = split(/\t/, $_);
+                     my @gene_id = split(/\|/, $fields[1]);
+                     if ($gene_id[1] =~ /$species_code/) {
+                         $fields[1] =~ s/\|/_/g;
+                         $method_genes{$clustering_method}{$fields[0]}{$fields[1]} = $fields[1];
+                     }
+                }
+                close IN;
+            }
+        }
+        close DIR;
+        log_msg("$num_recs records for $scaffold $clustering_method were successfully updated in the plant_tribes_orthogroup table.");
+        # Updating taxon database table
+        log_msg("Inserting $clustering_method records into the plant_tribes_taxon database table.");
+        my $num_genes = 0;
+        foreach my $ortho_id (keys %{$method_genes{$clustering_method}}){
+            foreach (keys %{$method_genes{$clustering_method}{$ortho_id}}){
+                $num_genes++;
+            }
+        }
+        my $taxa_lineage_config = "$scaffold_dir/$scaffold.taxaLineage.config";
+        open(IN, "$taxa_lineage_config") or die "Can't open $taxa_lineage_config file\n";
+        $num_recs = 0;
+        while (<IN>){
+            chomp;
+            my @fields = split(/\t/, $_);
+            if ($fields[0] ne $species_name) { next; }
+            $stmt = qq(INSERT INTO plant_tribes_taxon (species_name, scaffold_id, num_genes, species_family, species_order, species_group, species_clade) VALUES ('$fields[0]', $scaffold_id, $num_genes, '$fields[1]', '$fields[2]', '$fields[3]', '$fields[4]'));
+            $rv = $dbh->do($stmt) or die $DBI::errstr;
+            $num_recs = $num_recs + 1;
+        }
+        close IN;
+        log_msg("$num_recs records for $species_name $scaffold $clustering_method were successfully inserted into the plant_tribes_taxon table.");
+        my ($dna_id, $aa_id);
+        my $orthogroups_fasta_dir = "$working_dir/$clustering_method/orthogroups_fasta/formated_fasta";
+        opendir (DIR, $orthogroups_fasta_dir) or die "Can't open $orthogroups_fasta_dir directory\n";
+        while ( my $filename = readdir(DIR) ) {
+            if ($filename =~ /^(\d+)\.fna$/) {
+                my $ortho_id = $1;
+                open(IN, "$orthogroups_fasta_dir/$filename") or die "Can't open $orthogroups_fasta_dir/$filename file\n";
+                while(<IN>){
+                    chomp;
+                    if (/^>(\S+)/){
+                        $dna_id = $1;
+                        print ASSOC "$dna_id\t$scaffold\t$clustering_method\t$ortho_id\t$species_name\n";
+                        next;
+                    }
+                    else { s/\s+//g; $dna{$dna_id} .= $_; }
+                }
+                close IN;
+            }
+            if ($filename =~ /^(\d+)\.faa$/) {
+                open(IN, "$orthogroups_fasta_dir/$filename") or die "Can't open $orthogroups_fasta_dir/$filename file\n";
+                while(<IN>){
+                    chomp;
+                    if (/^>(\S+)/){ $aa_id = $1; next; }
+                    else { s/\s+//g; $aa{$aa_id} .= $_; }
+                }
+                close IN;
+            }
+        }
+        close DIR;
+    }
+    close ASSOC;
+    # Updating gene database table
+    log_msg("Inserting records into the plant_tribes_gene database table.");
+    $num_recs = 0;
+    foreach my $gene_id (sort keys %dna) {
+        my $stmt = qq(INSERT INTO plant_tribes_gene (gene_id, dna_sequence, aa_sequence) VALUES ('$gene_id', '$dna{$gene_id}', '$aa{$gene_id}'));
+        my $rv = $dbh->do($stmt) or die $DBI::errstr;
+        $num_recs = $num_recs + 1;
+    }
+    log_msg("$num_recs records for $species_name $scaffold were successfully inserted into the plant_tribes_gene table.");
+    # Updaing gene-scaffold-orthogroup-taxon-association database table
+    log_msg("Inserting  records into the gene_scaffold_orthogroup_taxon_association database table.");
+    open(IN, "$gsot_association_prep_file") or die "Can't open $gsot_association_prep_file file\n";
+    $num_recs = 0;
+    my ( $stmt, $sth, $rv, $scaffold_id, $clustering_method, $orthogroup_id, $taxon_id, $gene_id );
+    my ( $gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db );
+    while(<IN>){
+        chomp;
+        if (/^gene_id/) {
+            # gene_id scaffold_id clustering_method orthogroup_id species_name
+            next;
+        }
+        my @fields = split(/\t/, $_);
+        # gnl_Fakge_v1.0_AT1G03390.1 22Gv1.1 orthomcl 3 Fake genome
+        $gene_id = $fields[0];
+        $scaffold_id = $fields[1];
+        $clustering_method = $fields[2];
+        $orthogroup_id = $fields[3];
+        $species_name = $fields[4];
+        $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold_id' AND clustering_method = '$clustering_method';);
+        $sth = $dbh->prepare( $stmt );
+        $rv = $sth->execute() or die $DBI::errstr;
+        if ($rv < 0) { print $DBI::errstr; }
+        while (my @row = $sth->fetchrow_array()) {
+            $scaffold_id_db = $row[0];
+        }
+        $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '$orthogroup_id' AND scaffold_id = '$scaffold_id_db';);
+        $sth = $dbh->prepare( $stmt );
+        $rv = $sth->execute() or die $DBI::errstr;
+        if ($rv < 0) { print $DBI::errstr; }
+        while (my @row = $sth->fetchrow_array()) {
+            $orthogroup_id_db = $row[0];
+        }
+        $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = '$species_name' AND scaffold_id = '$scaffold_id_db';);
+        $sth = $dbh->prepare( $stmt );
+        $rv = $sth->execute() or die $DBI::errstr;
+        if ($rv < 0) { print $DBI::errstr; }
+        while (my @row = $sth->fetchrow_array()) {
+            $taxon_id_db = $row[0];
+        }
+        $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = '$gene_id' );
+        $sth = $dbh->prepare( $stmt );
+        $rv = $sth->execute() or die $DBI::errstr;
+        if ($rv < 0) { print $DBI::errstr; }
+        while (my @row = $sth->fetchrow_array()) {
+            $gene_id_db = $row[0];
+        }
+        $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db));
+        $rv = $dbh->do($stmt) or die $DBI::errstr;
+        $num_recs = $num_recs + 1;
+    }
+    close IN;
+    log_msg("$num_recs records for $scaffold $clustering_method were successfully inserted into the gene_scaffold_orthogroup_taxon_association table.");
+    $dbh->disconnect();
+}