annotate cpt_psm_plotter/cpt_psm_2_gentable.pl @ 0:54c7a3ea81e2 draft

Uploaded
author cpt
date Tue, 05 Jul 2022 05:40:36 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env perl
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
2 use strict;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
3 use warnings;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
4 use Storable;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
5 use CPT::GalaxyGetOpt;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
6 use CPT::Bio::NW_MSA;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
7 use Data::Dumper;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
8 use CPT::Circos::Conf;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
9 use POSIX;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
10
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
11
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
12 my $ggo = CPT::GalaxyGetOpt->new();
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
13 my $options = $ggo->getOptions(
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
14 'options' => [
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
15 [ 'file', 'PSM2 Data File', { validate => 'File/Input', required => 1 } ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
16 [],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
17 ['Cutoffs'],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
18 ['evalue' , 'Evalue cutoff' , { validate => 'Float' , default => 1e-4 } ] ,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
19 ['dice' , 'Dice cutoff' , { validate => 'Float' , default => 50 } ] ,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
20 [],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
21 ['Alignment Options'],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
22 ['mismatch' , 'Mismatch Score' , { validate => 'Float' , default => -1} ] ,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
23 ['gap_penalty' , 'Gap Penalty' , { validate => 'Float' , default => '0.0' } ] ,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
24 ['match' , 'Match Score' , { validate => 'Float' , default => 5 } ] ,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
25 ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
26 'outputs' => [
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
27 [
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
28 'diff_table',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
29 'Output Comparison Table',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
30 {
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
31 validate => 'File/Output',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
32 required => 1,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
33 default => 'genome_comp',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
34 data_format => 'text/tabular',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
35 default_format => 'TSV_U',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
36 },
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
37 ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
38 [
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
39 'blastclust',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
40 'Output Blastclust Table',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
41 {
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
42 validate => 'File/Output',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
43 required => 1,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
44 default => 'blastclust',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
45 data_format => 'text/tabular',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
46 default_format => 'TSV_U',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
47 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
48 ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
49 ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
50 'defaults' => [
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
51 'appid' => 'PSM.Comp',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
52 'appname' => 'PSM Comparison Table',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
53 'appdesc' => 'aligns and lists data from PSM Prep',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
54 'appvers' => '1.94',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
55 ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
56 'tests' => [
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
57 ],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
58 );
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
59
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
60
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
61 my %data_file = %{retrieve($options->{file})};
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
62
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
63 print STDERR "Aliging genomes\n";
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
64 my $msa = CPT::Bio::NW_MSA->new(
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
65 gap_penalty => $options->{'gap_penalty'},
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
66 match_score => $options->{'match'},
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
67 mismatch_score => $options->{'mismatch'},
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
68 bidi => 1,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
69 );
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
70
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
71 my @hits = @{$data_file{hit_table}};
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
72 my @clusters;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
73
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
74 foreach my $hit(@hits){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
75 my ($from, $to, $evalue, $dice) = @{$hit};
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
76 if($evalue < $options->{evalue} && $dice > $options->{dice}){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
77 if($options->{verbose}){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
78 print "$from $to\n";
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
79 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
80
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
81 my $foundmatch = 0;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
82 foreach my $cluster(@clusters){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
83 if($from ~~ @{$cluster} || $to ~~ @{$cluster}){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
84 $foundmatch = 1;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
85 if(!($from ~~ @{$cluster})){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
86 push(@{$cluster}, $from);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
87 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
88 if(!($to ~~ @{$cluster})){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
89 push(@{$cluster}, $to);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
90 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
91 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
92 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
93 if($foundmatch == 0){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
94 push(@clusters, ["".($#clusters+2), $from, $to]);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
95 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
96 $msa->add_relationship($from, $to);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
97 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
98 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
99 my @fixed_clusters;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
100 foreach my $cluster (@clusters) {
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
101 my ($idx, @values) = @{$cluster};
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
102 push(@fixed_clusters, [$idx, join(',', @values)]);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
103 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
104
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
105 my @user_ordering = keys($data_file{gbk});
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
106
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
107 foreach my $genome(@user_ordering){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
108 print STDERR "\tAligning $genome\n";
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
109 my $gi_list_ref = $data_file{gbk}{$genome}{gi};#"GI" list
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
110 if(! defined $gi_list_ref){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
111 warn "Could not find $genome genome in the data file. Please be sure you have correctly specified the name of a genome from a genbank file. (See the LOCUS line for the name).";
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
112 }else{
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
113 $msa->align_list($gi_list_ref);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
114 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
115 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
116
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
117 my @aligned_results = $msa->merged_array();
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
118 # Remove CRC64 hashes from sequences
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
119 foreach my $row(@aligned_results){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
120 $row = [map { s/_[A-F0-9]{16}$//; $_ } @{$row}];
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
121 #my $key = ${$row}[0];
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
122 #foreach my $cluster(@clusters){
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
123
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
124 #}
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
125 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
126
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
127 my %table = (
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
128 'Sheet1' => {
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
129 header => \@user_ordering,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
130 data => \@aligned_results,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
131 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
132 );
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
133
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
134
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
135 use CPT::OutputFiles;
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
136 my $crr_output = CPT::OutputFiles->new(
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
137 name => 'diff_table',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
138 GGO => $ggo,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
139 );
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
140 $crr_output->CRR(data => \%table);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
141
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
142 my %table2 = (
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
143 'Sheet1' => {
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
144 header => ['Cluster ID', 'Contents'],
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
145 data => \@fixed_clusters,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
146 }
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
147 );
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
148 my $crr_output2 = CPT::OutputFiles->new(
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
149 name => 'blastclust',
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
150 GGO => $ggo,
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
151 );
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
152 $crr_output2->CRR(data => \%table2);
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
153
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
154 =head1 DESCRIPTION
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
155
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
156 Following the execution of the PSM Prep tool, this tool simply aligns the genomes and generates a table comparison the positions of all proteins. It can be very useful to figure out which genes are missing in which genomes.
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
157
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
158 =head2 IMPORTANT PARAMETERS
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
159
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
160 =over 4
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
161
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
162 =item C<mismatch>, C<gap_penalty>, C<match>
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
163
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
164 These parameters control the Needleman-Wunsch Multiple Sequence Alignment library's scoring scheme. Mismatch scores are generally negative and discourage unrelated proteins from being plotted in a line together. Match scores encourage related proteins to line up. Gap penalty is set at zero as we generally prefer gaps to mismatches in this tool; phage genomes are small and gaps are "cheap" to use, whereas mismatches can sometimes give an incorrect impression of relatedness. That said, how your plots look is completely up to you and we encourage experimentation!
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
165
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
166 =back
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
167
54c7a3ea81e2 Uploaded
cpt
parents:
diff changeset
168 =cut