comparison cpt_psm_2_gentable.pl @ 1:97ef96676b48 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:26 +0000
parents
children
comparison
equal deleted inserted replaced
0:b18e8268bf4e 1:97ef96676b48
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Storable;
5 use CPT::GalaxyGetOpt;
6 use CPT::Bio::NW_MSA;
7 use Data::Dumper;
8 use CPT::Circos::Conf;
9 use POSIX;
10
11
12 my $ggo = CPT::GalaxyGetOpt->new();
13 my $options = $ggo->getOptions(
14 'options' => [
15 [ 'file', 'PSM2 Data File', { validate => 'File/Input', required => 1 } ],
16 [],
17 ['Cutoffs'],
18 ['evalue' , 'Evalue cutoff' , { validate => 'Float' , default => 1e-4 } ] ,
19 ['dice' , 'Dice cutoff' , { validate => 'Float' , default => 50 } ] ,
20 [],
21 ['Alignment Options'],
22 ['mismatch' , 'Mismatch Score' , { validate => 'Float' , default => -1} ] ,
23 ['gap_penalty' , 'Gap Penalty' , { validate => 'Float' , default => '0.0' } ] ,
24 ['match' , 'Match Score' , { validate => 'Float' , default => 5 } ] ,
25 ],
26 'outputs' => [
27 [
28 'diff_table',
29 'Output Comparison Table',
30 {
31 validate => 'File/Output',
32 required => 1,
33 default => 'genome_comp',
34 data_format => 'text/tabular',
35 default_format => 'TSV_U',
36 },
37 ],
38 [
39 'blastclust',
40 'Output Blastclust Table',
41 {
42 validate => 'File/Output',
43 required => 1,
44 default => 'blastclust',
45 data_format => 'text/tabular',
46 default_format => 'TSV_U',
47 }
48 ],
49 ],
50 'defaults' => [
51 'appid' => 'PSM.Comp',
52 'appname' => 'PSM Comparison Table',
53 'appdesc' => 'aligns and lists data from PSM Prep',
54 'appvers' => '1.94',
55 ],
56 'tests' => [
57 ],
58 );
59
60
61 my %data_file = %{retrieve($options->{file})};
62
63 print STDERR "Aliging genomes\n";
64 my $msa = CPT::Bio::NW_MSA->new(
65 gap_penalty => $options->{'gap_penalty'},
66 match_score => $options->{'match'},
67 mismatch_score => $options->{'mismatch'},
68 bidi => 1,
69 );
70
71 my @hits = @{$data_file{hit_table}};
72 my @clusters;
73
74 foreach my $hit(@hits){
75 my ($from, $to, $evalue, $dice) = @{$hit};
76 if($evalue < $options->{evalue} && $dice > $options->{dice}){
77 if($options->{verbose}){
78 print "$from $to\n";
79 }
80
81 my $foundmatch = 0;
82 foreach my $cluster(@clusters){
83 if($from ~~ @{$cluster} || $to ~~ @{$cluster}){
84 $foundmatch = 1;
85 if(!($from ~~ @{$cluster})){
86 push(@{$cluster}, $from);
87 }
88 if(!($to ~~ @{$cluster})){
89 push(@{$cluster}, $to);
90 }
91 }
92 }
93 if($foundmatch == 0){
94 push(@clusters, ["".($#clusters+2), $from, $to]);
95 }
96 $msa->add_relationship($from, $to);
97 }
98 }
99 my @fixed_clusters;
100 foreach my $cluster (@clusters) {
101 my ($idx, @values) = @{$cluster};
102 push(@fixed_clusters, [$idx, join(',', @values)]);
103 }
104
105 my @user_ordering = keys($data_file{gbk});
106
107 foreach my $genome(@user_ordering){
108 print STDERR "\tAligning $genome\n";
109 my $gi_list_ref = $data_file{gbk}{$genome}{gi};#"GI" list
110 if(! defined $gi_list_ref){
111 warn "Could not find $genome genome in the data file. Please be sure you have correctly specified the name of a genome from a genbank file. (See the LOCUS line for the name).";
112 }else{
113 $msa->align_list($gi_list_ref);
114 }
115 }
116
117 my @aligned_results = $msa->merged_array();
118 # Remove CRC64 hashes from sequences
119 foreach my $row(@aligned_results){
120 $row = [map { s/_[A-F0-9]{16}$//; $_ } @{$row}];
121 #my $key = ${$row}[0];
122 #foreach my $cluster(@clusters){
123
124 #}
125 }
126
127 my %table = (
128 'Sheet1' => {
129 header => \@user_ordering,
130 data => \@aligned_results,
131 }
132 );
133
134
135 use CPT::OutputFiles;
136 my $crr_output = CPT::OutputFiles->new(
137 name => 'diff_table',
138 GGO => $ggo,
139 );
140 $crr_output->CRR(data => \%table);
141
142 my %table2 = (
143 'Sheet1' => {
144 header => ['Cluster ID', 'Contents'],
145 data => \@fixed_clusters,
146 }
147 );
148 my $crr_output2 = CPT::OutputFiles->new(
149 name => 'blastclust',
150 GGO => $ggo,
151 );
152 $crr_output2->CRR(data => \%table2);
153
154 =head1 DESCRIPTION
155
156 Following the execution of the PSM Prep tool, this tool simply aligns the genomes and generates a table comparison the positions of all proteins. It can be very useful to figure out which genes are missing in which genomes.
157
158 =head2 IMPORTANT PARAMETERS
159
160 =over 4
161
162 =item C<mismatch>, C<gap_penalty>, C<match>
163
164 These parameters control the Needleman-Wunsch Multiple Sequence Alignment library's scoring scheme. Mismatch scores are generally negative and discourage unrelated proteins from being plotted in a line together. Match scores encourage related proteins to line up. Gap penalty is set at zero as we generally prefer gaps to mismatches in this tool; phage genomes are small and gaps are "cheap" to use, whereas mismatches can sometimes give an incorrect impression of relatedness. That said, how your plots look is completely up to you and we encourage experimentation!
165
166 =back
167
168 =cut