annotate gene_family_scaffold_updater.pl @ 0:2b0906489073 draft default tip

Uploaded
author greg
date Tue, 21 Aug 2018 13:00:21 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2b0906489073 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env perl
2b0906489073 Uploaded
greg
parents:
diff changeset
2 # Author: Eric Wafula
2b0906489073 Uploaded
greg
parents:
diff changeset
3 # Email: ekw10@psu.edu
2b0906489073 Uploaded
greg
parents:
diff changeset
4 # Institution: Penn State University, Biology Dept, Claude dePamphilis Lab
2b0906489073 Uploaded
greg
parents:
diff changeset
5 # Date: June 2018
2b0906489073 Uploaded
greg
parents:
diff changeset
6
2b0906489073 Uploaded
greg
parents:
diff changeset
7 use strict;
2b0906489073 Uploaded
greg
parents:
diff changeset
8 use warnings;
2b0906489073 Uploaded
greg
parents:
diff changeset
9 use File::Spec;
2b0906489073 Uploaded
greg
parents:
diff changeset
10 use File::Basename;
2b0906489073 Uploaded
greg
parents:
diff changeset
11 use Getopt::Long qw(:config no_ignore_case);
2b0906489073 Uploaded
greg
parents:
diff changeset
12 use FindBin;
2b0906489073 Uploaded
greg
parents:
diff changeset
13 use DBI;
2b0906489073 Uploaded
greg
parents:
diff changeset
14
2b0906489073 Uploaded
greg
parents:
diff changeset
15 my $home = "$FindBin::Bin/..";
2b0906489073 Uploaded
greg
parents:
diff changeset
16
2b0906489073 Uploaded
greg
parents:
diff changeset
17 my $usage = <<__EOUSAGE__;
2b0906489073 Uploaded
greg
parents:
diff changeset
18
2b0906489073 Uploaded
greg
parents:
diff changeset
19 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
20 #
2b0906489073 Uploaded
greg
parents:
diff changeset
21 # GENE FAMILY SCAFFOLD UPDATER
2b0906489073 Uploaded
greg
parents:
diff changeset
22 #
2b0906489073 Uploaded
greg
parents:
diff changeset
23 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
24 # Required Options:
2b0906489073 Uploaded
greg
parents:
diff changeset
25 #
2b0906489073 Uploaded
greg
parents:
diff changeset
26 #
2b0906489073 Uploaded
greg
parents:
diff changeset
27 # --database_connection_string <string> : Postgres database connection string using format
2b0906489073 Uploaded
greg
parents:
diff changeset
28 # postgresql://<user>:<password>@<host>/<database name>
2b0906489073 Uploaded
greg
parents:
diff changeset
29 #
2b0906489073 Uploaded
greg
parents:
diff changeset
30 # --proteins <string> : Amino acids (proteins) sequences fasta file (proteins.fasta)
2b0906489073 Uploaded
greg
parents:
diff changeset
31 # This can either be an absolute path or just the file name
2b0906489073 Uploaded
greg
parents:
diff changeset
32 #
2b0906489073 Uploaded
greg
parents:
diff changeset
33 # --coding_sequences <string> : Corresponding coding sequences (CDS) fasta file (cds.fasta)
2b0906489073 Uploaded
greg
parents:
diff changeset
34 #
2b0906489073 Uploaded
greg
parents:
diff changeset
35 # --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute
2b0906489073 Uploaded
greg
parents:
diff changeset
36 # path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)
2b0906489073 Uploaded
greg
parents:
diff changeset
37 # or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to
2b0906489073 Uploaded
greg
parents:
diff changeset
38 # the scaffold to create the absolute path.
2b0906489073 Uploaded
greg
parents:
diff changeset
39 # the scaffold to create the absolute path.
2b0906489073 Uploaded
greg
parents:
diff changeset
40 # If Monocots clusters (version 1.0): 12Gv1.0
2b0906489073 Uploaded
greg
parents:
diff changeset
41 # If Angiosperms clusters (version 1.0): 22Gv1.0
2b0906489073 Uploaded
greg
parents:
diff changeset
42 # If Angiosperms clusters (version 1.1): 22Gv1.1
2b0906489073 Uploaded
greg
parents:
diff changeset
43 # If Green plants clusters (version 1.0): 31Gv1.0
2b0906489073 Uploaded
greg
parents:
diff changeset
44 # If Other non PlantTribes clusters: XGvY.Z, where "X" is the number species in the scaffold,
2b0906489073 Uploaded
greg
parents:
diff changeset
45 # and "Y.Z" version number such as 12Gv1.0. Please look at one of the PlantTribes scaffold
2b0906489073 Uploaded
greg
parents:
diff changeset
46 # data on how data files and directories are named, formated, and organized.
2b0906489073 Uploaded
greg
parents:
diff changeset
47 #
2b0906489073 Uploaded
greg
parents:
diff changeset
48 #
2b0906489073 Uploaded
greg
parents:
diff changeset
49 # --species_name <string> : Name of the species
2b0906489073 Uploaded
greg
parents:
diff changeset
50 #
2b0906489073 Uploaded
greg
parents:
diff changeset
51 # --species_code <string> : Code of the species
2b0906489073 Uploaded
greg
parents:
diff changeset
52 #
2b0906489073 Uploaded
greg
parents:
diff changeset
53 # --species_family <string> : Family of the species
2b0906489073 Uploaded
greg
parents:
diff changeset
54 #
2b0906489073 Uploaded
greg
parents:
diff changeset
55 # --species_order <string> : Order of the species
2b0906489073 Uploaded
greg
parents:
diff changeset
56 #
2b0906489073 Uploaded
greg
parents:
diff changeset
57 # --species_group <string> : Group of the species
2b0906489073 Uploaded
greg
parents:
diff changeset
58 #
2b0906489073 Uploaded
greg
parents:
diff changeset
59 # --species_clade <string> : Clade of the species
2b0906489073 Uploaded
greg
parents:
diff changeset
60 #
2b0906489073 Uploaded
greg
parents:
diff changeset
61 # --rooting_order_species_code <string> : Species code after which the new species will be placed in the rooting order config file
2b0906489073 Uploaded
greg
parents:
diff changeset
62 #
2b0906489073 Uploaded
greg
parents:
diff changeset
63 # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
64 # Others Options:
2b0906489073 Uploaded
greg
parents:
diff changeset
65 #
2b0906489073 Uploaded
greg
parents:
diff changeset
66 # --num_threads <int> : number of threads (CPUs) to used for HMMScan, BLASTP, and MAFFT
2b0906489073 Uploaded
greg
parents:
diff changeset
67 # Default: 1
2b0906489073 Uploaded
greg
parents:
diff changeset
68 #
2b0906489073 Uploaded
greg
parents:
diff changeset
69 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
70 # Example Usage:
2b0906489073 Uploaded
greg
parents:
diff changeset
71 #
2b0906489073 Uploaded
greg
parents:
diff changeset
72 # GeneFamilyScaffoldUpdater --database_connection_string postgresql://<user>:<password>@<host>/<database name>
2b0906489073 Uploaded
greg
parents:
diff changeset
73 --proteins proteins.fasta --coding_sequences cds.fasta --scaffold 22Gv1.1
2b0906489073 Uploaded
greg
parents:
diff changeset
74 # --species_name Fake genome --species_family Brassicaceae --species_order Brassicales
2b0906489073 Uploaded
greg
parents:
diff changeset
75 --species_group Rosids --species_clade Core Eudicots --rooting_order_species_code Phypa
2b0906489073 Uploaded
greg
parents:
diff changeset
76 #
2b0906489073 Uploaded
greg
parents:
diff changeset
77 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
78
2b0906489073 Uploaded
greg
parents:
diff changeset
79 __EOUSAGE__
2b0906489073 Uploaded
greg
parents:
diff changeset
80 ;
2b0906489073 Uploaded
greg
parents:
diff changeset
81
2b0906489073 Uploaded
greg
parents:
diff changeset
82 # Declare and initialize variables;
2b0906489073 Uploaded
greg
parents:
diff changeset
83 my $database_connection_string;
2b0906489073 Uploaded
greg
parents:
diff changeset
84 my $proteins;
2b0906489073 Uploaded
greg
parents:
diff changeset
85 my $species_name;
2b0906489073 Uploaded
greg
parents:
diff changeset
86 my $species_code;
2b0906489073 Uploaded
greg
parents:
diff changeset
87 my $species_family;
2b0906489073 Uploaded
greg
parents:
diff changeset
88 my $species_order;
2b0906489073 Uploaded
greg
parents:
diff changeset
89 my $species_group;
2b0906489073 Uploaded
greg
parents:
diff changeset
90 my $species_clade;
2b0906489073 Uploaded
greg
parents:
diff changeset
91 my $coding_sequences;
2b0906489073 Uploaded
greg
parents:
diff changeset
92 my $scaffold;
2b0906489073 Uploaded
greg
parents:
diff changeset
93 my $rooting_order_species_code;
2b0906489073 Uploaded
greg
parents:
diff changeset
94 my $num_threads;
2b0906489073 Uploaded
greg
parents:
diff changeset
95
2b0906489073 Uploaded
greg
parents:
diff changeset
96 my $options = GetOptions ( 'database_connection_string=s' => \$database_connection_string,
2b0906489073 Uploaded
greg
parents:
diff changeset
97 'proteins=s' => \$proteins,
2b0906489073 Uploaded
greg
parents:
diff changeset
98 'species_name=s' => \$species_name,
2b0906489073 Uploaded
greg
parents:
diff changeset
99 'species_code=s' => \$species_code,
2b0906489073 Uploaded
greg
parents:
diff changeset
100 'species_family=s' => \$species_family,
2b0906489073 Uploaded
greg
parents:
diff changeset
101 'species_order=s' => \$species_order,
2b0906489073 Uploaded
greg
parents:
diff changeset
102 'species_group=s' => \$species_group,
2b0906489073 Uploaded
greg
parents:
diff changeset
103 'species_clade=s' => \$species_clade,
2b0906489073 Uploaded
greg
parents:
diff changeset
104 'coding_sequences=s' => \$coding_sequences,
2b0906489073 Uploaded
greg
parents:
diff changeset
105 'scaffold=s' => \$scaffold,
2b0906489073 Uploaded
greg
parents:
diff changeset
106 'rooting_order_species_code=s' => \$rooting_order_species_code,
2b0906489073 Uploaded
greg
parents:
diff changeset
107 'num_threads=i' => \$num_threads,
2b0906489073 Uploaded
greg
parents:
diff changeset
108 );
2b0906489073 Uploaded
greg
parents:
diff changeset
109
2b0906489073 Uploaded
greg
parents:
diff changeset
110 # # # # # # # # # # # # # # # # # # # # # # # # # validate options and set variables # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
111 # check if options are set
2b0906489073 Uploaded
greg
parents:
diff changeset
112 unless ( $options ) { die $usage; }
2b0906489073 Uploaded
greg
parents:
diff changeset
113 unless ( $database_connection_string and $proteins and $species_name and $species_code and $species_family and $species_order and $species_group and $species_clade and $coding_sequences and $scaffold and $rooting_order_species_code ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
114 print "\nOne or more required options not set\n"; die $usage; }
2b0906489073 Uploaded
greg
parents:
diff changeset
115 # get scaffold directory
2b0906489073 Uploaded
greg
parents:
diff changeset
116 my $scaffold_dir;
2b0906489073 Uploaded
greg
parents:
diff changeset
117 if (File::Spec->file_name_is_absolute($scaffold)) {
2b0906489073 Uploaded
greg
parents:
diff changeset
118 $scaffold_dir = $scaffold;
2b0906489073 Uploaded
greg
parents:
diff changeset
119 $scaffold = basename($scaffold);
2b0906489073 Uploaded
greg
parents:
diff changeset
120 } else {
2b0906489073 Uploaded
greg
parents:
diff changeset
121 if ($scaffold) { $scaffold_dir = "$home/data/$scaffold"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
122 else { print "\n --scaffold option is not set\n\n"; die $usage; }
2b0906489073 Uploaded
greg
parents:
diff changeset
123 }
2b0906489073 Uploaded
greg
parents:
diff changeset
124
2b0906489073 Uploaded
greg
parents:
diff changeset
125 # validate scaffold and update type options
2b0906489073 Uploaded
greg
parents:
diff changeset
126 if ( $scaffold !~ /^\d+Gv\d+\.\d+$/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
127 print "\nOrthogroups or gene families proteins scaffold name $scaffold is not in the required format";
2b0906489073 Uploaded
greg
parents:
diff changeset
128 print " i.e. XGvY.Z, where X is number species in the scaffold, and Y.Z version number such as 12Gv1.0.\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
129 die $usage;
2b0906489073 Uploaded
greg
parents:
diff changeset
130 }
2b0906489073 Uploaded
greg
parents:
diff changeset
131
2b0906489073 Uploaded
greg
parents:
diff changeset
132 # Find out if the received rooting_order_species_code is in
2b0906489073 Uploaded
greg
parents:
diff changeset
133 # the rooting order configuration file for the scaffold. We
2b0906489073 Uploaded
greg
parents:
diff changeset
134 # do this before anything else since it is the least resource
2b0906489073 Uploaded
greg
parents:
diff changeset
135 # intensive and an invalid species code will force an error.
2b0906489073 Uploaded
greg
parents:
diff changeset
136 validate_rooting_order_species_code($scaffold_dir, $rooting_order_species_code);
2b0906489073 Uploaded
greg
parents:
diff changeset
137
2b0906489073 Uploaded
greg
parents:
diff changeset
138 # Get a database connection.
2b0906489073 Uploaded
greg
parents:
diff changeset
139 my $dbh = get_database_connection($database_connection_string);
2b0906489073 Uploaded
greg
parents:
diff changeset
140
2b0906489073 Uploaded
greg
parents:
diff changeset
141 # The gene_family_scaffold_loader tool must be executed before
2b0906489073 Uploaded
greg
parents:
diff changeset
142 # this tool so that the information about the scaffold is
2b0906489073 Uploaded
greg
parents:
diff changeset
143 # avaialble in the galaxy_plant_tribes database. Check to make
2b0906489073 Uploaded
greg
parents:
diff changeset
144 # sure the scaffold has been loaded into the databse before
2b0906489073 Uploaded
greg
parents:
diff changeset
145 # continuing with the update.
2b0906489073 Uploaded
greg
parents:
diff changeset
146 validate_scaffold($dbh, $scaffold);
2b0906489073 Uploaded
greg
parents:
diff changeset
147
2b0906489073 Uploaded
greg
parents:
diff changeset
148 # get scaffold clustering methods
2b0906489073 Uploaded
greg
parents:
diff changeset
149 my %methods;
2b0906489073 Uploaded
greg
parents:
diff changeset
150 my $annotation_dir = "$scaffold_dir/annot";
2b0906489073 Uploaded
greg
parents:
diff changeset
151 opendir (DIR, "$annotation_dir") or die "Can't open $annotation_dir directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
152 while (my $filename = readdir(DIR)) {
2b0906489073 Uploaded
greg
parents:
diff changeset
153 if ($filename =~ /(\w+)\.list$/) { $methods{$1} = $1; }
2b0906489073 Uploaded
greg
parents:
diff changeset
154 }
2b0906489073 Uploaded
greg
parents:
diff changeset
155
2b0906489073 Uploaded
greg
parents:
diff changeset
156 # set defaults
2b0906489073 Uploaded
greg
parents:
diff changeset
157 if (!$num_threads) { $num_threads = 1; }
2b0906489073 Uploaded
greg
parents:
diff changeset
158
2b0906489073 Uploaded
greg
parents:
diff changeset
159 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routine calls # # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
160
2b0906489073 Uploaded
greg
parents:
diff changeset
161 log_msg("Starting gene family scaffold updating.");
2b0906489073 Uploaded
greg
parents:
diff changeset
162
2b0906489073 Uploaded
greg
parents:
diff changeset
163 # Create working directory.
2b0906489073 Uploaded
greg
parents:
diff changeset
164 my $working_dir = "./geneFamilyScaffoldUpdate_dir";
2b0906489073 Uploaded
greg
parents:
diff changeset
165 if (-d $working_dir) {
2b0906489073 Uploaded
greg
parents:
diff changeset
166 die "Exiting...!\nGene family scaffold update output directory ($working_dir) already exists!\n\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
167 }
2b0906489073 Uploaded
greg
parents:
diff changeset
168 make_directory($working_dir);
2b0906489073 Uploaded
greg
parents:
diff changeset
169
2b0906489073 Uploaded
greg
parents:
diff changeset
170 # Copy original scaffold data to a working directory.
2b0906489073 Uploaded
greg
parents:
diff changeset
171 log_msg("Copying original scaffold data to working directory.");
2b0906489073 Uploaded
greg
parents:
diff changeset
172 my $copy_scaffold_data = system "cp -r $scaffold_dir $working_dir";
2b0906489073 Uploaded
greg
parents:
diff changeset
173 if ($copy_scaffold_data != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
174 stop_err("Copying original scaffold data to working directory failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
175 }
2b0906489073 Uploaded
greg
parents:
diff changeset
176
2b0906489073 Uploaded
greg
parents:
diff changeset
177 # Update the scaffold config files in the working directory with the new genome.
2b0906489073 Uploaded
greg
parents:
diff changeset
178 update_config_files ( $scaffold, $rooting_order_species_code, $species_name, $species_code, $species_family, $species_order, $species_group, $species_clade, $working_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
179
2b0906489073 Uploaded
greg
parents:
diff changeset
180 # Update the scaffold files in the working directory with the new genome.
2b0906489073 Uploaded
greg
parents:
diff changeset
181 foreach my $method (keys %methods) {
2b0906489073 Uploaded
greg
parents:
diff changeset
182 sort_sequences ( $proteins, $coding_sequences, $scaffold, $method, $num_threads, $species_name, $working_dir, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
183 }
2b0906489073 Uploaded
greg
parents:
diff changeset
184
2b0906489073 Uploaded
greg
parents:
diff changeset
185 # Move updated scaffold data to original directory.
2b0906489073 Uploaded
greg
parents:
diff changeset
186 my $updated_scaffold_dir = "$working_dir/$scaffold";
2b0906489073 Uploaded
greg
parents:
diff changeset
187 log_msg("Removing original scaffold data directory $scaffold_dir.");
2b0906489073 Uploaded
greg
parents:
diff changeset
188 my $remove_scaffold_data = system "rm -rf $scaffold_dir";
2b0906489073 Uploaded
greg
parents:
diff changeset
189 if ($remove_scaffold_data != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
190 stop_err("Removing original scaffold data directory failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
191 }
2b0906489073 Uploaded
greg
parents:
diff changeset
192 log_msg("Moving updated scaffold data from\n$updated_scaffold_dir\nto original directory\n$scaffold_dir.");
2b0906489073 Uploaded
greg
parents:
diff changeset
193 my $move_scaffold_data = system "mv $updated_scaffold_dir $scaffold_dir";
2b0906489073 Uploaded
greg
parents:
diff changeset
194 if ($move_scaffold_data != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
195 stop_err("Moving updated scaffold data to original directory failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
196 }
2b0906489073 Uploaded
greg
parents:
diff changeset
197
2b0906489073 Uploaded
greg
parents:
diff changeset
198 # Update the database tables with the new genome.
2b0906489073 Uploaded
greg
parents:
diff changeset
199 update_database_tables ( $dbh, $proteins, $scaffold, \%methods, $species_name, $species_family, $species_order, $species_group, $species_clade, $scaffold_dir, $working_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
200
2b0906489073 Uploaded
greg
parents:
diff changeset
201 log_msg("Completed gene family scaffold updating.");
2b0906489073 Uploaded
greg
parents:
diff changeset
202
2b0906489073 Uploaded
greg
parents:
diff changeset
203 exit(0);
2b0906489073 Uploaded
greg
parents:
diff changeset
204
2b0906489073 Uploaded
greg
parents:
diff changeset
205 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # #
2b0906489073 Uploaded
greg
parents:
diff changeset
206
2b0906489073 Uploaded
greg
parents:
diff changeset
207 sub log_msg {
2b0906489073 Uploaded
greg
parents:
diff changeset
208 my ($msg) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
209 print localtime()." - ".$msg."\n\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
210 }
2b0906489073 Uploaded
greg
parents:
diff changeset
211
2b0906489073 Uploaded
greg
parents:
diff changeset
212 sub stop_err {
2b0906489073 Uploaded
greg
parents:
diff changeset
213 my ($error_msg) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
214 print "\n-- ".localtime()." - ".$error_msg."\n\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
215 exit(1);
2b0906489073 Uploaded
greg
parents:
diff changeset
216 }
2b0906489073 Uploaded
greg
parents:
diff changeset
217
2b0906489073 Uploaded
greg
parents:
diff changeset
218 sub validate_rooting_order_species_code {
2b0906489073 Uploaded
greg
parents:
diff changeset
219 my ($scaffold_dir, $rooting_order_species_code ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
220 my $rooting_order_config = "$scaffold_dir/$scaffold.rootingOrder.config";
2b0906489073 Uploaded
greg
parents:
diff changeset
221 open (IN, $rooting_order_config) or die "Can't open $rooting_order_config\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
222 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
223 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
224 if (/^#/ || /^$/ || /^\[/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
225 # Skip comments, blasnk lines and section headers.
2b0906489073 Uploaded
greg
parents:
diff changeset
226 next;
2b0906489073 Uploaded
greg
parents:
diff changeset
227 }
2b0906489073 Uploaded
greg
parents:
diff changeset
228 # Example line: Physcomitrella patens=Phypa
2b0906489073 Uploaded
greg
parents:
diff changeset
229 my @F = split(/=/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
230 my $rooting_order_species_code_cmp = $F[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
231 if (defined($rooting_order_species_code_cmp) && $rooting_order_species_code_cmp eq $rooting_order_species_code) {
2b0906489073 Uploaded
greg
parents:
diff changeset
232 return;
2b0906489073 Uploaded
greg
parents:
diff changeset
233 }
2b0906489073 Uploaded
greg
parents:
diff changeset
234 }
2b0906489073 Uploaded
greg
parents:
diff changeset
235 stop_err("Invalid rooting order species code $rooting_order_species_code is not found in $rooting_order_config");
2b0906489073 Uploaded
greg
parents:
diff changeset
236 }
2b0906489073 Uploaded
greg
parents:
diff changeset
237
2b0906489073 Uploaded
greg
parents:
diff changeset
238 sub validate_scaffold {
2b0906489073 Uploaded
greg
parents:
diff changeset
239 my ($dbh, $scaffold) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
240 my ($stmt, $sth, $rv);
2b0906489073 Uploaded
greg
parents:
diff changeset
241 $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold';);
2b0906489073 Uploaded
greg
parents:
diff changeset
242 $sth = $dbh->prepare( $stmt );
2b0906489073 Uploaded
greg
parents:
diff changeset
243 $rv = $sth->execute() or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
244 if ($rv < 0) { print $DBI::errstr; }
2b0906489073 Uploaded
greg
parents:
diff changeset
245 if ($sth->rows > 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
246 return;
2b0906489073 Uploaded
greg
parents:
diff changeset
247 }
2b0906489073 Uploaded
greg
parents:
diff changeset
248 stop_err("The scaffold $scaffold has not been loaded into the database - use the GeneFamilyScaffoldLoader tool to load the scaffold before attempting to update the scaffold with this tool.");
2b0906489073 Uploaded
greg
parents:
diff changeset
249 }
2b0906489073 Uploaded
greg
parents:
diff changeset
250
2b0906489073 Uploaded
greg
parents:
diff changeset
251 sub make_directory {
2b0906489073 Uploaded
greg
parents:
diff changeset
252 my ( $new_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
253 if (!-d $new_dir) {
2b0906489073 Uploaded
greg
parents:
diff changeset
254 mkdir($new_dir, 0755);
2b0906489073 Uploaded
greg
parents:
diff changeset
255 }
2b0906489073 Uploaded
greg
parents:
diff changeset
256 }
2b0906489073 Uploaded
greg
parents:
diff changeset
257
2b0906489073 Uploaded
greg
parents:
diff changeset
258 sub get_database_connection {
2b0906489073 Uploaded
greg
parents:
diff changeset
259 my ($database_connection_string) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
260 # Database connection and variables, the format of database_connection_string is
2b0906489073 Uploaded
greg
parents:
diff changeset
261 # postgresql://<user>:<password>@<host>/<database name>
2b0906489073 Uploaded
greg
parents:
diff changeset
262 my @conn_part = split(/:\/\//, $database_connection_string);
2b0906489073 Uploaded
greg
parents:
diff changeset
263 my $conn_part_str = $conn_part[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
264 my $driver = "Pg";
2b0906489073 Uploaded
greg
parents:
diff changeset
265 my @conn_part2 = split(/\//, $conn_part_str);
2b0906489073 Uploaded
greg
parents:
diff changeset
266 my $database = $conn_part2[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
267 my $dsn = "DBI:$driver:dbname = $database;host = 127.0.0.1;port = 5432";
2b0906489073 Uploaded
greg
parents:
diff changeset
268 @conn_part2 = split(/:/, $conn_part_str);
2b0906489073 Uploaded
greg
parents:
diff changeset
269 my $userid = $conn_part2[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
270 @conn_part2 = split(/@/, $conn_part_str);
2b0906489073 Uploaded
greg
parents:
diff changeset
271 my $conn_part2_str = $conn_part2[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
272 my @conn_part3 = split(/:/, $conn_part2_str);
2b0906489073 Uploaded
greg
parents:
diff changeset
273 my $password = $conn_part3[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
274 my $dbh = DBI->connect($dsn, $userid, $password, { RaiseError => 1 }) or die "$DBI::errstr\nError : Unable to open database galaxy_plant_tribes\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
275 log_msg("Successfully connected to database $database.");
2b0906489073 Uploaded
greg
parents:
diff changeset
276 return $dbh;
2b0906489073 Uploaded
greg
parents:
diff changeset
277 }
2b0906489073 Uploaded
greg
parents:
diff changeset
278
2b0906489073 Uploaded
greg
parents:
diff changeset
279 sub update_config_files {
2b0906489073 Uploaded
greg
parents:
diff changeset
280 my ( $scaffold, $rooting_order_species_code, $species_name, $species_code, $species_family, $species_order, $species_group, $species_clade, $working_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
281 log_msg("Updating scaffold config files in working directory.");
2b0906489073 Uploaded
greg
parents:
diff changeset
282 # Update the rootingOrder config file.
2b0906489073 Uploaded
greg
parents:
diff changeset
283 my $rooting_order_config = "$working_dir/$scaffold/$scaffold.rootingOrder.config";
2b0906489073 Uploaded
greg
parents:
diff changeset
284 my $tmp_rooting_order_config = "$working_dir/$scaffold/$scaffold.rootingOrder.config.tmp";
2b0906489073 Uploaded
greg
parents:
diff changeset
285 open (IN, $rooting_order_config) or die "Can't open $rooting_order_config\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
286 open (OUT, ">$tmp_rooting_order_config") or die "Can't open $tmp_rooting_order_config file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
287 my $inserted = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
288 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
289 # Example line: Physcomitrella patens=Phypa
2b0906489073 Uploaded
greg
parents:
diff changeset
290 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
291 print OUT "$_\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
292 if (not $inserted) {
2b0906489073 Uploaded
greg
parents:
diff changeset
293 if (not /^#/ && not /^$/ && not /^\[/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
294 my @F=split(/=/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
295 my $cmp_species_code = $F[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
296 if (defined($cmp_species_code) && $cmp_species_code eq $rooting_order_species_code) {
2b0906489073 Uploaded
greg
parents:
diff changeset
297 print OUT "$species_name=$species_code\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
298 $inserted = 1;
2b0906489073 Uploaded
greg
parents:
diff changeset
299 }
2b0906489073 Uploaded
greg
parents:
diff changeset
300 }
2b0906489073 Uploaded
greg
parents:
diff changeset
301 }
2b0906489073 Uploaded
greg
parents:
diff changeset
302 }
2b0906489073 Uploaded
greg
parents:
diff changeset
303 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
304 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
305 my $update_rooting_order = system "mv $tmp_rooting_order_config $rooting_order_config >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
306 if ($update_rooting_order != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
307 stop_err("Updating rooting order config in working directory failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
308 }
2b0906489073 Uploaded
greg
parents:
diff changeset
309 # Update the taxaLineage config file.
2b0906489073 Uploaded
greg
parents:
diff changeset
310 my $taxa_lineage_config = "$working_dir/$scaffold/$scaffold.taxaLineage.config";
2b0906489073 Uploaded
greg
parents:
diff changeset
311 # Make sure the last line of the file ends with a newline character
2b0906489073 Uploaded
greg
parents:
diff changeset
312 # by rewriting the entire file.
2b0906489073 Uploaded
greg
parents:
diff changeset
313 my $tmp_taxa_lineage_config = "$working_dir/$scaffold/$scaffold.taxaLineage.config.tmp";
2b0906489073 Uploaded
greg
parents:
diff changeset
314 open (IN, $taxa_lineage_config) or die "Can't open $taxa_lineage_config\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
315 open (OUT, ">$tmp_taxa_lineage_config") or die "Can't open $tmp_taxa_lineage_config file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
316 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
317 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
318 print OUT "$_\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
319 }
2b0906489073 Uploaded
greg
parents:
diff changeset
320 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
321 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
322 my $update_taxa = system "mv $tmp_taxa_lineage_config $taxa_lineage_config >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
323 if ($update_taxa != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
324 stop_err("Updating taxa lineage config in working directory failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
325 }
2b0906489073 Uploaded
greg
parents:
diff changeset
326 # Append the new species information to the file.
2b0906489073 Uploaded
greg
parents:
diff changeset
327 open (OUT, ">>$taxa_lineage_config") or die "Can't open $taxa_lineage_config file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
328 print OUT "$species_name\t$species_family\t$species_order\t$species_group\t$species_clade\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
329 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
330 }
2b0906489073 Uploaded
greg
parents:
diff changeset
331
2b0906489073 Uploaded
greg
parents:
diff changeset
332 sub sort_sequences {
2b0906489073 Uploaded
greg
parents:
diff changeset
333 my ( $proteins, $coding_sequences, $scaffold, $method, $num_threads, $species_name, $working_dir, $scaffold_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
334 my $method_dir = "$working_dir/$method";
2b0906489073 Uploaded
greg
parents:
diff changeset
335 make_directory($method_dir);
2b0906489073 Uploaded
greg
parents:
diff changeset
336 log_msg("Sorting working directory protein sequences in the $method clustering method.");
2b0906489073 Uploaded
greg
parents:
diff changeset
337 log_msg("Running BLASTP.");
2b0906489073 Uploaded
greg
parents:
diff changeset
338 my $blastp_call = system "blastp -outfmt 6 -evalue 1e-5 -num_threads $num_threads -query $proteins -db $scaffold_dir/db/blast/$method -out $method_dir/proteins.blastp >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
339 if ($blastp_call != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
340 stop_err("Running BLASTP failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
341 }
2b0906489073 Uploaded
greg
parents:
diff changeset
342 log_msg("Getting best BLASTP hits.");
2b0906489073 Uploaded
greg
parents:
diff changeset
343 my $blast_results = "proteins.blastp";
2b0906489073 Uploaded
greg
parents:
diff changeset
344 get_best_blastp_orthos ( $blast_results, $scaffold, $method, $method_dir, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
345 log_msg("Running HMMScan.");
2b0906489073 Uploaded
greg
parents:
diff changeset
346 my $hmmscan_call = system "hmmscan -E 1e-5 --cpu $num_threads --noali --tblout $method_dir/proteins.hmmscan -o $method_dir/hmmscan.log $scaffold_dir/db/hmm/$method $proteins >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
347 if ($hmmscan_call != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
348 stop_err("Running HMMScan failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
349 }
2b0906489073 Uploaded
greg
parents:
diff changeset
350 log_msg("Getting best HMMScan hits.");
2b0906489073 Uploaded
greg
parents:
diff changeset
351 my $hmmscan_results = "proteins.hmmscan";
2b0906489073 Uploaded
greg
parents:
diff changeset
352 get_best_hmmscan_orthos ( $hmmscan_results, $scaffold, $method, $method_dir, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
353 get_blast_hmmscan_orthos ( $scaffold, $method, $method_dir, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
354 get_orthogroup_fasta ( $proteins, $coding_sequences, $method, $method_dir, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
355 update_scaffold_data( $proteins, $scaffold, $method, $num_threads, $method_dir, $species_name, $working_dir, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
356 }
2b0906489073 Uploaded
greg
parents:
diff changeset
357
2b0906489073 Uploaded
greg
parents:
diff changeset
358 sub get_best_blastp_orthos {
2b0906489073 Uploaded
greg
parents:
diff changeset
359 my ( $blast_results, $scaffold, $method, $method_dir, $scaffold_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
360 my (%best, %max, %list);
2b0906489073 Uploaded
greg
parents:
diff changeset
361 open (IN, "$method_dir/$blast_results") or die "Can't open $method_dir/$blast_results file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
362 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
363 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
364 my @F=split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
365 if ($F[0] eq $F[1]) { next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
366 if (!$best{$F[0]}) {
2b0906489073 Uploaded
greg
parents:
diff changeset
367 $best{$F[0]} = $_;
2b0906489073 Uploaded
greg
parents:
diff changeset
368 $max{$F[0]} = $F[11];
2b0906489073 Uploaded
greg
parents:
diff changeset
369 }
2b0906489073 Uploaded
greg
parents:
diff changeset
370 else {
2b0906489073 Uploaded
greg
parents:
diff changeset
371 if ($F[11] > $max{$F[0]}) {
2b0906489073 Uploaded
greg
parents:
diff changeset
372 $best{$F[0]} = $_;
2b0906489073 Uploaded
greg
parents:
diff changeset
373 $max{$F[0]} = $F[11];
2b0906489073 Uploaded
greg
parents:
diff changeset
374 }
2b0906489073 Uploaded
greg
parents:
diff changeset
375 }
2b0906489073 Uploaded
greg
parents:
diff changeset
376 }
2b0906489073 Uploaded
greg
parents:
diff changeset
377 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
378 open (IN, "$scaffold_dir/annot/$method.list") or die "Can't open $scaffold_dir/annot/$method.list file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
379 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
380 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
381 my @F=split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
382 $list{$F[1]} = $F[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
383 }
2b0906489073 Uploaded
greg
parents:
diff changeset
384 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
385 open (OUT, ">$method_dir/$blast_results.bestOrthos") or die "Can't open $method_dir/$blast_results.bestOrthos file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
386 print OUT "Gene ID\tOrthogroup ID\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
387 foreach (keys %best) {
2b0906489073 Uploaded
greg
parents:
diff changeset
388 my @F = split(/\t/, $best{$_});
2b0906489073 Uploaded
greg
parents:
diff changeset
389 print OUT "$F[0]\t$list{$F[1]}\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
390 }
2b0906489073 Uploaded
greg
parents:
diff changeset
391 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
392 }
2b0906489073 Uploaded
greg
parents:
diff changeset
393
2b0906489073 Uploaded
greg
parents:
diff changeset
394 sub get_best_hmmscan_orthos {
2b0906489073 Uploaded
greg
parents:
diff changeset
395 my ( $hmmscan_results, $scaffold, $method, $method_dir, $scaffold_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
396 my %hits;
2b0906489073 Uploaded
greg
parents:
diff changeset
397 open (IN, "$method_dir/$hmmscan_results") or die "Can't open $method_dir/$hmmscan_results file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
398 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
399 if (/^#/){next;}
2b0906489073 Uploaded
greg
parents:
diff changeset
400 my @F = split(/\s+/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
401 $hits{$F[2]}{$F[0]} = $F[5];
2b0906489073 Uploaded
greg
parents:
diff changeset
402 }
2b0906489073 Uploaded
greg
parents:
diff changeset
403 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
404 open (OUT, ">$method_dir/$hmmscan_results.bestOrthos") or die "Can't open $method_dir/$hmmscan_results.bestOrthos file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
405 print OUT "Gene ID\tOrthogroup ID\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
406 for my $hit (keys %hits) {
2b0906489073 Uploaded
greg
parents:
diff changeset
407 my $score = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
408 my $best_target;
2b0906489073 Uploaded
greg
parents:
diff changeset
409 for my $target (keys %{$hits{$hit}}) {
2b0906489073 Uploaded
greg
parents:
diff changeset
410 if ($hits{$hit}{$target} >= $score) {
2b0906489073 Uploaded
greg
parents:
diff changeset
411 $score = $hits{$hit}{$target};
2b0906489073 Uploaded
greg
parents:
diff changeset
412 $best_target = $target;
2b0906489073 Uploaded
greg
parents:
diff changeset
413 }
2b0906489073 Uploaded
greg
parents:
diff changeset
414 }
2b0906489073 Uploaded
greg
parents:
diff changeset
415 print OUT "$hit\t$best_target\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
416 }
2b0906489073 Uploaded
greg
parents:
diff changeset
417 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
418 }
2b0906489073 Uploaded
greg
parents:
diff changeset
419
2b0906489073 Uploaded
greg
parents:
diff changeset
420 sub get_blast_hmmscan_orthos {
2b0906489073 Uploaded
greg
parents:
diff changeset
421 my ( $scaffold, $method, $method_dir, $scaffold_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
422 my (%blastp, %hmmscan, %genes);
2b0906489073 Uploaded
greg
parents:
diff changeset
423 opendir (DIR, "$method_dir") or die "Can't open $method_dir directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
424 while (my $filename = readdir(DIR)) {
2b0906489073 Uploaded
greg
parents:
diff changeset
425 if ($filename =~ /^proteins\.blastp\.bestOrthos$/){
2b0906489073 Uploaded
greg
parents:
diff changeset
426 open (IN, "$method_dir/$filename") or die "Can't open $method_dir/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
427 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
428 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
429 if (/^Gene/) {next;}
2b0906489073 Uploaded
greg
parents:
diff changeset
430 my @F = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
431 $blastp{$F[0]} = $F[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
432 $genes{$F[0]} = $F[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
433 }
2b0906489073 Uploaded
greg
parents:
diff changeset
434 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
435 }
2b0906489073 Uploaded
greg
parents:
diff changeset
436 if ($filename =~ /^proteins\.hmmscan\.bestOrthos$/){
2b0906489073 Uploaded
greg
parents:
diff changeset
437 open (IN, "$method_dir/$filename") or die "Can't open $method_dir/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
438 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
439 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
440 if (/^Gene/) {next;}
2b0906489073 Uploaded
greg
parents:
diff changeset
441 my @F = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
442 $hmmscan{$F[0]} = $F[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
443 $genes{$F[0]} = $F[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
444 }
2b0906489073 Uploaded
greg
parents:
diff changeset
445 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
446 }
2b0906489073 Uploaded
greg
parents:
diff changeset
447 }
2b0906489073 Uploaded
greg
parents:
diff changeset
448 closedir DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
449 open (OUT, ">$method_dir/proteins.both.bestOrthos") or die "Can't open $method_dir/protein.both.bestOrthos file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
450 print OUT "Gene ID\tOrthogroup ID\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
451 foreach (sort keys %genes) {
2b0906489073 Uploaded
greg
parents:
diff changeset
452 if (!$blastp{$_} and $hmmscan{$_}) { print OUT "$_\t$hmmscan{$_}\n"; next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
453 elsif ($blastp{$_} and !$hmmscan{$_}) { print OUT "$_\t$blastp{$_}\n"; next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
454 elsif ($blastp{$_} == $hmmscan{$_}) { print OUT "$_\t$blastp{$_}\n"; next }
2b0906489073 Uploaded
greg
parents:
diff changeset
455 else { print OUT "$_\t$hmmscan{$_}\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
456 }
2b0906489073 Uploaded
greg
parents:
diff changeset
457 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
458 }
2b0906489073 Uploaded
greg
parents:
diff changeset
459
2b0906489073 Uploaded
greg
parents:
diff changeset
460 sub get_orthogroup_fasta {
2b0906489073 Uploaded
greg
parents:
diff changeset
461 my ( $proteins, $coding_sequences, $method, $method_dir, $scaffold_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
462 log_msg("Retrieving orthogroup fasta files.");
2b0906489073 Uploaded
greg
parents:
diff changeset
463 my (%orthos, %pep, %cds);
2b0906489073 Uploaded
greg
parents:
diff changeset
464 my $orthogroup_fasta = "$method_dir/orthogroups_fasta";
2b0906489073 Uploaded
greg
parents:
diff changeset
465 make_directory($orthogroup_fasta);
2b0906489073 Uploaded
greg
parents:
diff changeset
466 my $orthogroup_assignment = "proteins.both.bestOrthos";
2b0906489073 Uploaded
greg
parents:
diff changeset
467 open (IN, "$method_dir/$orthogroup_assignment") or die "Can't open $method_dir/$orthogroup_assignment file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
468 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
469 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
470 if ($_ =~ /^Gene/) { next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
471 my @F = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
472 $orthos{$F[1]}{$F[0]} = $F[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
473 }
2b0906489073 Uploaded
greg
parents:
diff changeset
474 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
475 %pep = get_sequences ($proteins);
2b0906489073 Uploaded
greg
parents:
diff changeset
476 if ($coding_sequences) { %cds = get_sequences ($coding_sequences); }
2b0906489073 Uploaded
greg
parents:
diff changeset
477 my ($ortho_id, $seq_id);
2b0906489073 Uploaded
greg
parents:
diff changeset
478 foreach $ortho_id (keys %orthos) {
2b0906489073 Uploaded
greg
parents:
diff changeset
479 open (PEP, ">$orthogroup_fasta/$ortho_id.faa") or die "Can't open $orthogroup_fasta/$ortho_id.faa file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
480 if ($coding_sequences) { open (CDS, ">$orthogroup_fasta/$ortho_id.fna") or die "Can't open $orthogroup_fasta/$ortho_id.fna file\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
481 foreach $seq_id (sort keys %{$orthos{$ortho_id}}) {
2b0906489073 Uploaded
greg
parents:
diff changeset
482 $pep{$seq_id} =~ s/.{80}(?=.)/$&\n/g;
2b0906489073 Uploaded
greg
parents:
diff changeset
483 print PEP ">$seq_id\n$pep{$seq_id}\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
484 if ($coding_sequences) {
2b0906489073 Uploaded
greg
parents:
diff changeset
485 $cds{$seq_id} =~ s/.{80}(?=.)/$&\n/g;
2b0906489073 Uploaded
greg
parents:
diff changeset
486 print CDS ">$seq_id\n$cds{$seq_id}\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
487 }
2b0906489073 Uploaded
greg
parents:
diff changeset
488 }
2b0906489073 Uploaded
greg
parents:
diff changeset
489 close PEP;
2b0906489073 Uploaded
greg
parents:
diff changeset
490 close CDS;
2b0906489073 Uploaded
greg
parents:
diff changeset
491 }
2b0906489073 Uploaded
greg
parents:
diff changeset
492 my @files;
2b0906489073 Uploaded
greg
parents:
diff changeset
493 my $formated_fasta = "$orthogroup_fasta/formated_fasta";
2b0906489073 Uploaded
greg
parents:
diff changeset
494 make_directory($formated_fasta);
2b0906489073 Uploaded
greg
parents:
diff changeset
495 opendir(DIR, "$orthogroup_fasta") or die "Can't open $orthogroup_fasta directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
496 while(my $filename = readdir(DIR)) {
2b0906489073 Uploaded
greg
parents:
diff changeset
497 if($filename =~ /\d+\.fna/ or $filename =~ /\d+\.faa/){
2b0906489073 Uploaded
greg
parents:
diff changeset
498 push (@files, $filename);
2b0906489073 Uploaded
greg
parents:
diff changeset
499 }
2b0906489073 Uploaded
greg
parents:
diff changeset
500 }
2b0906489073 Uploaded
greg
parents:
diff changeset
501 closedir(DIR);
2b0906489073 Uploaded
greg
parents:
diff changeset
502 foreach my $file (@files) {
2b0906489073 Uploaded
greg
parents:
diff changeset
503 open (IN, "$orthogroup_fasta/$file") or die "Can't open $orthogroup_fasta/$file file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
504 open (OUT, ">$formated_fasta/$file") or die "Can't open $formated_fasta/$file file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
505 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
506 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
507 if(/^>/){ s/\|/_/g; print OUT "$_\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
508 else { print OUT "$_\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
509 }
2b0906489073 Uploaded
greg
parents:
diff changeset
510 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
511 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
512 }
2b0906489073 Uploaded
greg
parents:
diff changeset
513 my $integrated_orthogroup_fasta = "$method_dir/integrated_orthogroup_fasta";
2b0906489073 Uploaded
greg
parents:
diff changeset
514 make_directory($integrated_orthogroup_fasta);
2b0906489073 Uploaded
greg
parents:
diff changeset
515 integrate_orthogroup_fasta ( $formated_fasta, $method, $integrated_orthogroup_fasta, $scaffold_dir );
2b0906489073 Uploaded
greg
parents:
diff changeset
516 }
2b0906489073 Uploaded
greg
parents:
diff changeset
517
2b0906489073 Uploaded
greg
parents:
diff changeset
518 sub get_sequences {
2b0906489073 Uploaded
greg
parents:
diff changeset
519 my ( $file ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
520 my (%sequences, $id);
2b0906489073 Uploaded
greg
parents:
diff changeset
521 open (IN, "$file") or die "Can't open $file file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
522 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
523 if ($_ =~ />(\S+)/){ $id = $1; }
2b0906489073 Uploaded
greg
parents:
diff changeset
524 else { s/\s+//g; $sequences{$id} .= $_; }
2b0906489073 Uploaded
greg
parents:
diff changeset
525 }
2b0906489073 Uploaded
greg
parents:
diff changeset
526 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
527 return %sequences;
2b0906489073 Uploaded
greg
parents:
diff changeset
528 }
2b0906489073 Uploaded
greg
parents:
diff changeset
529
2b0906489073 Uploaded
greg
parents:
diff changeset
530 sub integrate_orthogroup_fasta {
2b0906489073 Uploaded
greg
parents:
diff changeset
531 my ( $formated_fasta, $method, $integrated_orthogroup_fasta, $scaffold_dir) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
532 log_msg("Integrating orthogroup fasta files.");
2b0906489073 Uploaded
greg
parents:
diff changeset
533 my (%pep, %cds);
2b0906489073 Uploaded
greg
parents:
diff changeset
534 opendir (DIR, "$formated_fasta") or die "Can't open $formated_fasta directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
535 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
536 if ($filename =~ /^(\d+)\.faa$/) { $pep{$1} = $1; }
2b0906489073 Uploaded
greg
parents:
diff changeset
537 if ($filename =~ /^(\d+)\.fna$/) { $cds{$1} = $1; }
2b0906489073 Uploaded
greg
parents:
diff changeset
538 }
2b0906489073 Uploaded
greg
parents:
diff changeset
539 closedir DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
540 if (keys(%cds) and (keys(%pep) != keys(%cds))) {
2b0906489073 Uploaded
greg
parents:
diff changeset
541 die "Exiting...!\nOrthogroup classification protein and CDS fasta files not equivalent in $formated_fasta directory\n\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
542 }
2b0906489073 Uploaded
greg
parents:
diff changeset
543 foreach my $ortho_id (keys %pep) {
2b0906489073 Uploaded
greg
parents:
diff changeset
544 my $merging_call = system "cat $scaffold_dir/fasta/$method/$ortho_id.faa $formated_fasta/$ortho_id.faa > $integrated_orthogroup_fasta/$ortho_id.faa";
2b0906489073 Uploaded
greg
parents:
diff changeset
545 if ($merging_call != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
546 stop_err("Merging orthogroup $ortho_id failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
547 }
2b0906489073 Uploaded
greg
parents:
diff changeset
548 if (keys(%cds) and $cds{$ortho_id}) {
2b0906489073 Uploaded
greg
parents:
diff changeset
549 my $merging_call = system "cat $scaffold_dir/fasta/$method/$ortho_id.fna $formated_fasta/$ortho_id.fna > $integrated_orthogroup_fasta/$ortho_id.fna";
2b0906489073 Uploaded
greg
parents:
diff changeset
550 if ($merging_call != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
551 stop_err("Merging orthogroup $ortho_id failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
552 }
2b0906489073 Uploaded
greg
parents:
diff changeset
553 }
2b0906489073 Uploaded
greg
parents:
diff changeset
554 }
2b0906489073 Uploaded
greg
parents:
diff changeset
555 }
2b0906489073 Uploaded
greg
parents:
diff changeset
556
2b0906489073 Uploaded
greg
parents:
diff changeset
557 sub update_scaffold_data {
2b0906489073 Uploaded
greg
parents:
diff changeset
558 my ( $proteins, $scaffold, $method, $num_threads, $method_dir, $species_name, $working_dir, $scaffold_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
559 log_msg("Updating scaffold data files.");
2b0906489073 Uploaded
greg
parents:
diff changeset
560 # update orthogroup annotation files
2b0906489073 Uploaded
greg
parents:
diff changeset
561 log_msg("Updating orthogroup annotation files.");
2b0906489073 Uploaded
greg
parents:
diff changeset
562 my %annot;
2b0906489073 Uploaded
greg
parents:
diff changeset
563 open (OUT, ">>$working_dir/$scaffold/annot/$method.list") or die "Can't open $working_dir/$scaffold/annot/$method.list file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
564 opendir (DIR, "$method_dir/orthogroups_fasta") or die "Can't open $method_dir/orthogroups_fasta directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
565 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
566 my $seq_count = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
567 my $ortho_id;
2b0906489073 Uploaded
greg
parents:
diff changeset
568 if ($filename =~ /^(\d+)\.faa$/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
569 $ortho_id = $1;
2b0906489073 Uploaded
greg
parents:
diff changeset
570 open (IN, "$method_dir/orthogroups_fasta/$filename") or die "Can't open $method_dir/orthogroups_fasta/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
571 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
572 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
573 if (/^>(\S+)/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
574 print OUT "$ortho_id\t$1\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
575 $seq_count++;
2b0906489073 Uploaded
greg
parents:
diff changeset
576 }
2b0906489073 Uploaded
greg
parents:
diff changeset
577 }
2b0906489073 Uploaded
greg
parents:
diff changeset
578 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
579 }
2b0906489073 Uploaded
greg
parents:
diff changeset
580 else { next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
581 $annot{$ortho_id} = $seq_count;
2b0906489073 Uploaded
greg
parents:
diff changeset
582 }
2b0906489073 Uploaded
greg
parents:
diff changeset
583 closedir DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
584 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
585 my ($fields, %avg_summary, %min_summary);
2b0906489073 Uploaded
greg
parents:
diff changeset
586 if (File::Spec->file_name_is_absolute($proteins)) { $proteins = basename($proteins); }
2b0906489073 Uploaded
greg
parents:
diff changeset
587 open (IN, "$working_dir/$scaffold/annot/$method.avg_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.avg_evalue.summary file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
588 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
589 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
590 if (/^Orthogroup\s+ID\s+(.*)/) { $fields = "Orthogroup ID\t$species_name\t$1"; next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
591 else { /(\d+)\s+(.*)/; $avg_summary{$1} = $2; }
2b0906489073 Uploaded
greg
parents:
diff changeset
592 }
2b0906489073 Uploaded
greg
parents:
diff changeset
593 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
594 open (OUT, ">$working_dir/$scaffold/annot/$method.avg_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.avg_evalue.summary file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
595 print OUT "$fields\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
596 foreach (sort {$a<=>$b} keys %avg_summary) {
2b0906489073 Uploaded
greg
parents:
diff changeset
597 if ($annot{$_}) { print OUT "$_\t$annot{$_}\t$avg_summary{$_}\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
598 else { print OUT "$_\t0\t$avg_summary{$_}\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
599 }
2b0906489073 Uploaded
greg
parents:
diff changeset
600 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
601 open (IN, "$working_dir/$scaffold/annot/$method.min_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.min_evalue.summary file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
602 while (<IN>) {
2b0906489073 Uploaded
greg
parents:
diff changeset
603 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
604 if (/^Orthogroup\s+ID\s+(.*)/) { $fields = "Orthogroup ID\t$species_name\t$1"; next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
605 else { /(\d+)\s+(.*)/; $min_summary{$1} = $2; }
2b0906489073 Uploaded
greg
parents:
diff changeset
606 }
2b0906489073 Uploaded
greg
parents:
diff changeset
607 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
608 open (OUT, ">$working_dir/$scaffold/annot/$method.min_evalue.summary") or die "Can't open $working_dir/$scaffold/annot/$method.min_evalue.summary file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
609 print OUT "$fields\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
610 foreach (sort {$a<=>$b} keys %min_summary) {
2b0906489073 Uploaded
greg
parents:
diff changeset
611 if ($annot{$_}) { print OUT "$_\t$annot{$_}\t$min_summary{$_}\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
612 else { print OUT "$_\t0\t$min_summary{$_}\n"; }
2b0906489073 Uploaded
greg
parents:
diff changeset
613 }
2b0906489073 Uploaded
greg
parents:
diff changeset
614 close OUT;
2b0906489073 Uploaded
greg
parents:
diff changeset
615
2b0906489073 Uploaded
greg
parents:
diff changeset
616 # update orthogroup fasta files
2b0906489073 Uploaded
greg
parents:
diff changeset
617 log_msg("Updating orthogroup fasta files.");
2b0906489073 Uploaded
greg
parents:
diff changeset
618 opendir (DIR, "$method_dir/integrated_orthogroup_fasta") or die "Can't open $method_dir/integrated_orthogroup_fasta directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
619 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
620 if (($filename =~ /^(\d+)\.faa$/) or ($filename =~ /^(\d+)\.fna$/)) {
2b0906489073 Uploaded
greg
parents:
diff changeset
621 my $update_orthogroup_fasta = system "cp $method_dir/integrated_orthogroup_fasta/$filename $working_dir/$scaffold/fasta/$method/$filename >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
622 if ($update_orthogroup_fasta != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
623 stop_err("Updating orthogroup fasta $filename failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
624 }
2b0906489073 Uploaded
greg
parents:
diff changeset
625 }
2b0906489073 Uploaded
greg
parents:
diff changeset
626 }
2b0906489073 Uploaded
greg
parents:
diff changeset
627 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
628 closedir DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
629
2b0906489073 Uploaded
greg
parents:
diff changeset
630 # update orthogroup alignments
2b0906489073 Uploaded
greg
parents:
diff changeset
631 log_msg("Updating alignments.");
2b0906489073 Uploaded
greg
parents:
diff changeset
632 opendir (DIR, "$method_dir/orthogroups_fasta/formated_fasta") or die "Can't open $method_dir/orthogroups_fasta/formated_fasta directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
633 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
634 if ($filename =~ /^(\d+)\.faa$/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
635 my $ortho_id = $1;
2b0906489073 Uploaded
greg
parents:
diff changeset
636 my $align_fasta = system "mafft --thread $num_threads --add $method_dir/orthogroups_fasta/formated_fasta/$filename $working_dir/$scaffold/alns/$method/$ortho_id.aln > $method_dir/orthogroups_fasta/formated_fasta/$ortho_id.aln 2>/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
637 if ($align_fasta != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
638 stop_err("Aligning orthogroup fasta file $filename failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
639 }
2b0906489073 Uploaded
greg
parents:
diff changeset
640 my $update_orthogroup_alignment = system "mv $method_dir/orthogroups_fasta/formated_fasta/$ortho_id.aln $working_dir/$scaffold/alns/$method/$ortho_id.aln >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
641 if ($update_orthogroup_alignment != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
642 stop_err(" - Updating orthogroup alignment $ortho_id.aln failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
643 }
2b0906489073 Uploaded
greg
parents:
diff changeset
644 }
2b0906489073 Uploaded
greg
parents:
diff changeset
645 }
2b0906489073 Uploaded
greg
parents:
diff changeset
646 closedir DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
647
2b0906489073 Uploaded
greg
parents:
diff changeset
648 # update orthogroup hmm profiles
2b0906489073 Uploaded
greg
parents:
diff changeset
649 log_msg("Updating hmm profiles.");
2b0906489073 Uploaded
greg
parents:
diff changeset
650 opendir (DIR, "$working_dir/$scaffold/alns/$method") or die "Can't open $working_dir/$scaffold/alns/$method directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
651 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
652 if ($filename =~ /^(\d+)\.aln$/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
653 my $ortho_id = $1;
2b0906489073 Uploaded
greg
parents:
diff changeset
654 my $update_orthogroup_hmm = system "hmmbuild -n $ortho_id --amino --cpu $num_threads $working_dir/$ortho_id.hmm $working_dir/$scaffold/alns/$method/$ortho_id.aln >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
655 if ($update_orthogroup_hmm != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
656 stop_err("Updating orthogroup hmm profile $ortho_id.hmm failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
657 }
2b0906489073 Uploaded
greg
parents:
diff changeset
658 my $convert_hmm_format = system "hmmconvert $working_dir/$ortho_id.hmm > $working_dir/$scaffold/hmms/$method/$ortho_id.hmm";
2b0906489073 Uploaded
greg
parents:
diff changeset
659 if ($convert_hmm_format != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
660 stop_err("Converting orthogroup hmm profile $ortho_id.hmm format failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
661 }
2b0906489073 Uploaded
greg
parents:
diff changeset
662 my $remove_tmp_file = system "rm $working_dir/$ortho_id.hmm >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
663 if ($remove_tmp_file != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
664 stop_err("Could not remove temporary hmm profile $ortho_id.hmm.");
2b0906489073 Uploaded
greg
parents:
diff changeset
665 }
2b0906489073 Uploaded
greg
parents:
diff changeset
666 }
2b0906489073 Uploaded
greg
parents:
diff changeset
667 }
2b0906489073 Uploaded
greg
parents:
diff changeset
668
2b0906489073 Uploaded
greg
parents:
diff changeset
669 # update orthogroup blast and hmm databases
2b0906489073 Uploaded
greg
parents:
diff changeset
670 log_msg("Updating blast and hmm databases.");
2b0906489073 Uploaded
greg
parents:
diff changeset
671 my $update_blast_database = system "find $method_dir/orthogroups_fasta/ -name \"*.faa\" -print0 | xargs -0 cat >> $working_dir/$scaffold/db/blast/$method";
2b0906489073 Uploaded
greg
parents:
diff changeset
672 if ($update_blast_database != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
673 stop_err("Updating blast database failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
674 }
2b0906489073 Uploaded
greg
parents:
diff changeset
675 my $index_blast_database = system "makeblastdb -in $working_dir/$scaffold/db/blast/$method -dbtype prot >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
676 if ($index_blast_database != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
677 stop_err("Indexing blast database failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
678 }
2b0906489073 Uploaded
greg
parents:
diff changeset
679 my $update_hmm_database = system "find $working_dir/$scaffold/hmms/$method/ -name \"*.hmm\" -print0 | xargs -0 cat > $working_dir/$scaffold/db/hmm/$method";
2b0906489073 Uploaded
greg
parents:
diff changeset
680 if ($update_hmm_database != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
681 stop_err("Updating hmm database failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
682 }
2b0906489073 Uploaded
greg
parents:
diff changeset
683 my $index_hmm_database = system "hmmpress -f $working_dir/$scaffold/db/hmm/$method >/dev/null";
2b0906489073 Uploaded
greg
parents:
diff changeset
684 if ($index_hmm_database != 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
685 stop_err("Indexing hmm database failed.");
2b0906489073 Uploaded
greg
parents:
diff changeset
686 }
2b0906489073 Uploaded
greg
parents:
diff changeset
687 }
2b0906489073 Uploaded
greg
parents:
diff changeset
688
2b0906489073 Uploaded
greg
parents:
diff changeset
689 sub update_database_tables {
2b0906489073 Uploaded
greg
parents:
diff changeset
690 my ( $dbh, $proteins, $scaffold, $methods, $species_name, $species_family, $species_order, $species_group, $species_clade, $scaffold_dir, $working_dir ) = @_;
2b0906489073 Uploaded
greg
parents:
diff changeset
691 log_msg("Updating for database tables.");
2b0906489073 Uploaded
greg
parents:
diff changeset
692 my ( $species_code, %method_genes, %gene_sequences, %dna, %aa );
2b0906489073 Uploaded
greg
parents:
diff changeset
693 my $gsot_association_prep_file = "$working_dir/gene_scaffold_orthogroup_taxon_association.tsv";
2b0906489073 Uploaded
greg
parents:
diff changeset
694 my $num_recs = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
695
2b0906489073 Uploaded
greg
parents:
diff changeset
696 # Output a prep file that stores information for updating
2b0906489073 Uploaded
greg
parents:
diff changeset
697 # the gene_scaffold_orthogroup_taxon_association table.
2b0906489073 Uploaded
greg
parents:
diff changeset
698 open (ASSOC, ">$gsot_association_prep_file") or die "Can't open $gsot_association_prep_file file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
699 print ASSOC "gene_id\tscaffold_id\tclustering_method\torthogroup_id\tspecies_name\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
700 # get new species name and code
2b0906489073 Uploaded
greg
parents:
diff changeset
701 if (File::Spec->file_name_is_absolute($proteins)) { $proteins = basename($proteins); }
2b0906489073 Uploaded
greg
parents:
diff changeset
702 $species_name =~ s/\_/ /g;
2b0906489073 Uploaded
greg
parents:
diff changeset
703 my $rooting_order_config = "$scaffold_dir/$scaffold.rootingOrder.config";
2b0906489073 Uploaded
greg
parents:
diff changeset
704 open(IN, "$rooting_order_config") or die "Can't open $rooting_order_config file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
705 while (<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
706 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
707 if (/^\#/ or /^\s+/ or /^\[/){ next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
708 if (/(\w+\s+\w+)\=(\w+)/) { if ($species_name eq $1) { $species_code = $2;} }
2b0906489073 Uploaded
greg
parents:
diff changeset
709 }
2b0906489073 Uploaded
greg
parents:
diff changeset
710 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
711 foreach my $clustering_method (keys %$methods) {
2b0906489073 Uploaded
greg
parents:
diff changeset
712 # Updating orthogroup database table
2b0906489073 Uploaded
greg
parents:
diff changeset
713 log_msg("Updating $clustering_method records for the plant_tribes_orthogroup database table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
714 my ( $stmt, $sth, $rv, $scaffold_id );
2b0906489073 Uploaded
greg
parents:
diff changeset
715 $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold' AND clustering_method = '$clustering_method';);
2b0906489073 Uploaded
greg
parents:
diff changeset
716 $sth = $dbh->prepare( $stmt );
2b0906489073 Uploaded
greg
parents:
diff changeset
717 $rv = $sth->execute() or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
718 if ($rv < 0) { print $DBI::errstr; }
2b0906489073 Uploaded
greg
parents:
diff changeset
719 while (my @row = $sth->fetchrow_array()) {
2b0906489073 Uploaded
greg
parents:
diff changeset
720 $scaffold_id = $row[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
721 }
2b0906489073 Uploaded
greg
parents:
diff changeset
722 my $scaffold_annotation_dir = "$scaffold_dir/annot";
2b0906489073 Uploaded
greg
parents:
diff changeset
723 opendir (DIR, $scaffold_annotation_dir) or die "Can't open $scaffold_annotation_dir directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
724 $num_recs = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
725 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
726 if ($filename =~ /$clustering_method.min_evalue\.summary/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
727 open (IN, "$scaffold_annotation_dir/$filename") or die "Can't open $scaffold_annotation_dir/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
728 while (<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
729 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
730 if (/^Orthogroup/){ next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
731 my @fields = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
732 my $num_species = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
733 my $num_genes = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
734 $scaffold =~ /(\d+)Gv\d+\.\d+/; # 22Gv1.1
2b0906489073 Uploaded
greg
parents:
diff changeset
735 my $genomes = $1 + 1;
2b0906489073 Uploaded
greg
parents:
diff changeset
736 for (1..$genomes){
2b0906489073 Uploaded
greg
parents:
diff changeset
737 if ($fields[$_] > 0){ $num_species++; }
2b0906489073 Uploaded
greg
parents:
diff changeset
738 $num_genes += $fields[$_];
2b0906489073 Uploaded
greg
parents:
diff changeset
739 }
2b0906489073 Uploaded
greg
parents:
diff changeset
740 $stmt = qq(UPDATE plant_tribes_orthogroup SET num_species = $num_species, num_genes = $num_genes WHERE orthogroup_id = $fields[0] AND scaffold_id = $scaffold_id;);
2b0906489073 Uploaded
greg
parents:
diff changeset
741 $rv = $dbh->do($stmt) or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
742 if($rv < 0) {
2b0906489073 Uploaded
greg
parents:
diff changeset
743 print $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
744 }
2b0906489073 Uploaded
greg
parents:
diff changeset
745 $num_recs = $num_recs + 1;
2b0906489073 Uploaded
greg
parents:
diff changeset
746 }
2b0906489073 Uploaded
greg
parents:
diff changeset
747 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
748 }
2b0906489073 Uploaded
greg
parents:
diff changeset
749 if ($filename =~ /$clustering_method\.list/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
750 open (IN, "$scaffold_annotation_dir/$filename") or die "Can't open $scaffold_annotation_dir/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
751 while (<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
752 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
753 my @fields = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
754 my @gene_id = split(/\|/, $fields[1]);
2b0906489073 Uploaded
greg
parents:
diff changeset
755 if ($gene_id[1] =~ /$species_code/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
756 $fields[1] =~ s/\|/_/g;
2b0906489073 Uploaded
greg
parents:
diff changeset
757 $method_genes{$clustering_method}{$fields[0]}{$fields[1]} = $fields[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
758 }
2b0906489073 Uploaded
greg
parents:
diff changeset
759 }
2b0906489073 Uploaded
greg
parents:
diff changeset
760 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
761 }
2b0906489073 Uploaded
greg
parents:
diff changeset
762 }
2b0906489073 Uploaded
greg
parents:
diff changeset
763 close DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
764 log_msg("$num_recs records for $scaffold $clustering_method were successfully updated in the plant_tribes_orthogroup table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
765 # Updating taxon database table
2b0906489073 Uploaded
greg
parents:
diff changeset
766 log_msg("Inserting $clustering_method records into the plant_tribes_taxon database table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
767 my $num_genes = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
768 foreach my $ortho_id (keys %{$method_genes{$clustering_method}}){
2b0906489073 Uploaded
greg
parents:
diff changeset
769 foreach (keys %{$method_genes{$clustering_method}{$ortho_id}}){
2b0906489073 Uploaded
greg
parents:
diff changeset
770 $num_genes++;
2b0906489073 Uploaded
greg
parents:
diff changeset
771 }
2b0906489073 Uploaded
greg
parents:
diff changeset
772 }
2b0906489073 Uploaded
greg
parents:
diff changeset
773 my $taxa_lineage_config = "$scaffold_dir/$scaffold.taxaLineage.config";
2b0906489073 Uploaded
greg
parents:
diff changeset
774 open(IN, "$taxa_lineage_config") or die "Can't open $taxa_lineage_config file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
775 $num_recs = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
776 while (<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
777 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
778 my @fields = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
779 if ($fields[0] ne $species_name) { next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
780 $stmt = qq(INSERT INTO plant_tribes_taxon (species_name, scaffold_id, num_genes, species_family, species_order, species_group, species_clade) VALUES ('$fields[0]', $scaffold_id, $num_genes, '$fields[1]', '$fields[2]', '$fields[3]', '$fields[4]'));
2b0906489073 Uploaded
greg
parents:
diff changeset
781 $rv = $dbh->do($stmt) or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
782 $num_recs = $num_recs + 1;
2b0906489073 Uploaded
greg
parents:
diff changeset
783 }
2b0906489073 Uploaded
greg
parents:
diff changeset
784 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
785 log_msg("$num_recs records for $species_name $scaffold $clustering_method were successfully inserted into the plant_tribes_taxon table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
786 my ($dna_id, $aa_id);
2b0906489073 Uploaded
greg
parents:
diff changeset
787 my $orthogroups_fasta_dir = "$working_dir/$clustering_method/orthogroups_fasta/formated_fasta";
2b0906489073 Uploaded
greg
parents:
diff changeset
788 opendir (DIR, $orthogroups_fasta_dir) or die "Can't open $orthogroups_fasta_dir directory\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
789 while ( my $filename = readdir(DIR) ) {
2b0906489073 Uploaded
greg
parents:
diff changeset
790 if ($filename =~ /^(\d+)\.fna$/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
791 my $ortho_id = $1;
2b0906489073 Uploaded
greg
parents:
diff changeset
792 open(IN, "$orthogroups_fasta_dir/$filename") or die "Can't open $orthogroups_fasta_dir/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
793 while(<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
794 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
795 if (/^>(\S+)/){
2b0906489073 Uploaded
greg
parents:
diff changeset
796 $dna_id = $1;
2b0906489073 Uploaded
greg
parents:
diff changeset
797 print ASSOC "$dna_id\t$scaffold\t$clustering_method\t$ortho_id\t$species_name\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
798 next;
2b0906489073 Uploaded
greg
parents:
diff changeset
799 }
2b0906489073 Uploaded
greg
parents:
diff changeset
800 else { s/\s+//g; $dna{$dna_id} .= $_; }
2b0906489073 Uploaded
greg
parents:
diff changeset
801 }
2b0906489073 Uploaded
greg
parents:
diff changeset
802 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
803 }
2b0906489073 Uploaded
greg
parents:
diff changeset
804 if ($filename =~ /^(\d+)\.faa$/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
805 open(IN, "$orthogroups_fasta_dir/$filename") or die "Can't open $orthogroups_fasta_dir/$filename file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
806 while(<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
807 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
808 if (/^>(\S+)/){ $aa_id = $1; next; }
2b0906489073 Uploaded
greg
parents:
diff changeset
809 else { s/\s+//g; $aa{$aa_id} .= $_; }
2b0906489073 Uploaded
greg
parents:
diff changeset
810 }
2b0906489073 Uploaded
greg
parents:
diff changeset
811 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
812 }
2b0906489073 Uploaded
greg
parents:
diff changeset
813 }
2b0906489073 Uploaded
greg
parents:
diff changeset
814 close DIR;
2b0906489073 Uploaded
greg
parents:
diff changeset
815 }
2b0906489073 Uploaded
greg
parents:
diff changeset
816 close ASSOC;
2b0906489073 Uploaded
greg
parents:
diff changeset
817 # Updating gene database table
2b0906489073 Uploaded
greg
parents:
diff changeset
818 log_msg("Inserting records into the plant_tribes_gene database table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
819 $num_recs = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
820 foreach my $gene_id (sort keys %dna) {
2b0906489073 Uploaded
greg
parents:
diff changeset
821 my $stmt = qq(INSERT INTO plant_tribes_gene (gene_id, dna_sequence, aa_sequence) VALUES ('$gene_id', '$dna{$gene_id}', '$aa{$gene_id}'));
2b0906489073 Uploaded
greg
parents:
diff changeset
822 my $rv = $dbh->do($stmt) or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
823 $num_recs = $num_recs + 1;
2b0906489073 Uploaded
greg
parents:
diff changeset
824 }
2b0906489073 Uploaded
greg
parents:
diff changeset
825 log_msg("$num_recs records for $species_name $scaffold were successfully inserted into the plant_tribes_gene table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
826 # Updaing gene-scaffold-orthogroup-taxon-association database table
2b0906489073 Uploaded
greg
parents:
diff changeset
827 log_msg("Inserting records into the gene_scaffold_orthogroup_taxon_association database table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
828 open(IN, "$gsot_association_prep_file") or die "Can't open $gsot_association_prep_file file\n";
2b0906489073 Uploaded
greg
parents:
diff changeset
829 $num_recs = 0;
2b0906489073 Uploaded
greg
parents:
diff changeset
830 my ( $stmt, $sth, $rv, $scaffold_id, $clustering_method, $orthogroup_id, $taxon_id, $gene_id );
2b0906489073 Uploaded
greg
parents:
diff changeset
831 my ( $gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db );
2b0906489073 Uploaded
greg
parents:
diff changeset
832 while(<IN>){
2b0906489073 Uploaded
greg
parents:
diff changeset
833 chomp;
2b0906489073 Uploaded
greg
parents:
diff changeset
834 if (/^gene_id/) {
2b0906489073 Uploaded
greg
parents:
diff changeset
835 # gene_id scaffold_id clustering_method orthogroup_id species_name
2b0906489073 Uploaded
greg
parents:
diff changeset
836 next;
2b0906489073 Uploaded
greg
parents:
diff changeset
837 }
2b0906489073 Uploaded
greg
parents:
diff changeset
838 my @fields = split(/\t/, $_);
2b0906489073 Uploaded
greg
parents:
diff changeset
839 # gnl_Fakge_v1.0_AT1G03390.1 22Gv1.1 orthomcl 3 Fake genome
2b0906489073 Uploaded
greg
parents:
diff changeset
840 $gene_id = $fields[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
841 $scaffold_id = $fields[1];
2b0906489073 Uploaded
greg
parents:
diff changeset
842 $clustering_method = $fields[2];
2b0906489073 Uploaded
greg
parents:
diff changeset
843 $orthogroup_id = $fields[3];
2b0906489073 Uploaded
greg
parents:
diff changeset
844 $species_name = $fields[4];
2b0906489073 Uploaded
greg
parents:
diff changeset
845 $stmt = qq(SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '$scaffold_id' AND clustering_method = '$clustering_method';);
2b0906489073 Uploaded
greg
parents:
diff changeset
846 $sth = $dbh->prepare( $stmt );
2b0906489073 Uploaded
greg
parents:
diff changeset
847 $rv = $sth->execute() or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
848 if ($rv < 0) { print $DBI::errstr; }
2b0906489073 Uploaded
greg
parents:
diff changeset
849 while (my @row = $sth->fetchrow_array()) {
2b0906489073 Uploaded
greg
parents:
diff changeset
850 $scaffold_id_db = $row[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
851 }
2b0906489073 Uploaded
greg
parents:
diff changeset
852 $stmt = qq(SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '$orthogroup_id' AND scaffold_id = '$scaffold_id_db';);
2b0906489073 Uploaded
greg
parents:
diff changeset
853 $sth = $dbh->prepare( $stmt );
2b0906489073 Uploaded
greg
parents:
diff changeset
854 $rv = $sth->execute() or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
855 if ($rv < 0) { print $DBI::errstr; }
2b0906489073 Uploaded
greg
parents:
diff changeset
856 while (my @row = $sth->fetchrow_array()) {
2b0906489073 Uploaded
greg
parents:
diff changeset
857 $orthogroup_id_db = $row[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
858 }
2b0906489073 Uploaded
greg
parents:
diff changeset
859 $stmt = qq(SELECT id FROM plant_tribes_taxon WHERE species_name = '$species_name' AND scaffold_id = '$scaffold_id_db';);
2b0906489073 Uploaded
greg
parents:
diff changeset
860 $sth = $dbh->prepare( $stmt );
2b0906489073 Uploaded
greg
parents:
diff changeset
861 $rv = $sth->execute() or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
862 if ($rv < 0) { print $DBI::errstr; }
2b0906489073 Uploaded
greg
parents:
diff changeset
863 while (my @row = $sth->fetchrow_array()) {
2b0906489073 Uploaded
greg
parents:
diff changeset
864 $taxon_id_db = $row[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
865 }
2b0906489073 Uploaded
greg
parents:
diff changeset
866 $stmt = qq(SELECT id FROM plant_tribes_gene WHERE gene_id = '$gene_id' );
2b0906489073 Uploaded
greg
parents:
diff changeset
867 $sth = $dbh->prepare( $stmt );
2b0906489073 Uploaded
greg
parents:
diff changeset
868 $rv = $sth->execute() or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
869 if ($rv < 0) { print $DBI::errstr; }
2b0906489073 Uploaded
greg
parents:
diff changeset
870 while (my @row = $sth->fetchrow_array()) {
2b0906489073 Uploaded
greg
parents:
diff changeset
871 $gene_id_db = $row[0];
2b0906489073 Uploaded
greg
parents:
diff changeset
872 }
2b0906489073 Uploaded
greg
parents:
diff changeset
873 $stmt = qq(INSERT INTO gene_scaffold_orthogroup_taxon_association (gene_id, scaffold_id, orthogroup_id, taxon_id) VALUES ($gene_id_db, $scaffold_id_db, $orthogroup_id_db, $taxon_id_db));
2b0906489073 Uploaded
greg
parents:
diff changeset
874 $rv = $dbh->do($stmt) or die $DBI::errstr;
2b0906489073 Uploaded
greg
parents:
diff changeset
875 $num_recs = $num_recs + 1;
2b0906489073 Uploaded
greg
parents:
diff changeset
876 }
2b0906489073 Uploaded
greg
parents:
diff changeset
877 close IN;
2b0906489073 Uploaded
greg
parents:
diff changeset
878 log_msg("$num_recs records for $scaffold $clustering_method were successfully inserted into the gene_scaffold_orthogroup_taxon_association table.");
2b0906489073 Uploaded
greg
parents:
diff changeset
879 $dbh->disconnect();
2b0906489073 Uploaded
greg
parents:
diff changeset
880 }