0
|
1 #!/usr/bin/env perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Storable;
|
|
5 use Bio::SearchIO;
|
|
6 use Bio::SeqIO;
|
|
7 use Bio::Tools::CodonTable;
|
|
8 use Data::Dumper;
|
|
9 use File::Temp qw/tempfile tempdir/;
|
|
10
|
|
11
|
|
12 use CPT::GalaxyGetOpt;
|
|
13 my $ggo = CPT::GalaxyGetOpt->new();
|
|
14 my $options = $ggo->getOptions(
|
|
15 'options' => [
|
|
16 [ 'file', 'Input file', { validate => 'File/Input', multiple => 1, required => 1,
|
|
17 file_format => ['genbank', 'txt'],
|
|
18 } ],
|
|
19 ],
|
|
20 'outputs' => [
|
|
21 [
|
|
22 'cpt_psm_object',
|
|
23 'Output PSM Object',
|
|
24 {
|
|
25 validate => 'File/Output',
|
|
26 required => 1,
|
|
27 default => 'cpt_psm',
|
|
28 data_format => 'text/plain',
|
|
29 default_format => 'TXT'
|
|
30 }
|
|
31 ],
|
|
32 ],
|
|
33 'defaults' => [
|
|
34 'appid' => 'PSM.Prep',
|
|
35 'appname' => 'PSM Prep',
|
|
36 'appdesc' => 'prepares data for the PSM Plotter',
|
|
37 'appvers' => '1.94.2',
|
|
38 ],
|
|
39 'tests' => [],
|
|
40 );
|
|
41
|
|
42
|
|
43
|
|
44 use CPT::Bio;
|
|
45 my $bio = CPT::Bio->new();
|
|
46
|
|
47 my @genbank_files = @{$options->{file}};
|
|
48
|
|
49 my %data = (
|
|
50 file_list => [],
|
|
51 );
|
|
52
|
|
53 my $GLOBAL_UNLINK_VAR = 1;
|
|
54 my $tempdir = tempdir('cpt.psm2.XXXXXXX',CLEANUP => $GLOBAL_UNLINK_VAR);
|
|
55 use CPT::Util::CRC64;
|
|
56 my $crc = CPT::Util::CRC64->new();
|
|
57
|
|
58 foreach my $file(@genbank_files){
|
|
59 my $seqio_object = Bio::SeqIO->new(-file => $file,-format=>'genbank');
|
|
60 while(my $seqobj = $seqio_object->next_seq){
|
|
61 my ( $fh, $path ) = tempfile('cds_export.XXXXXXXXX', UNLINK => $GLOBAL_UNLINK_VAR, DIR => $tempdir, SUFFIX => '.fa');
|
|
62
|
|
63 my @gi_array;
|
|
64 foreach my $feat ( $seqobj->get_SeqFeatures ) {
|
|
65 if($feat->primary_tag eq 'CDS'){
|
|
66 my $header = $bio->_getIdentifier($feat);
|
|
67 # This ensures proteins have a file-specific ID appeneded to them.
|
|
68 my $seq = $bio->translate(
|
|
69 $bio->intelligent_get_seq($feat));
|
|
70
|
|
71 # Proteins come with translated stop codon
|
|
72 $seq =~ s/\*//g;
|
|
73 $seq =~ s/\+//g;
|
|
74 $seq =~ s/#//g;
|
|
75 $header .= "_" . $crc->crc64($seq);
|
|
76 push @gi_array, $header;
|
|
77 print $fh ">$header\n$seq\n";
|
|
78 }
|
|
79 }
|
|
80 $data{gbk}{$seqobj->display_id()}{'gi'} = \@gi_array;
|
|
81 $data{gbk}{$seqobj->display_id()}{'fasta_location'} = $path;
|
|
82 $data{gbk}{$seqobj->display_id()}{'gbk_location'} = $file;
|
|
83 push(@{$data{file_list}}, $seqobj->display_id());
|
|
84 close $fh;
|
|
85 }
|
|
86 }
|
|
87
|
|
88
|
|
89
|
|
90 use IPC::Run3;
|
|
91
|
|
92 # Concatenate Fasta Files
|
|
93 my @fasta_files;
|
|
94 foreach(@{$data{file_list}}){
|
|
95 push(@fasta_files, $data{gbk}{$_}{fasta_location});
|
|
96 }
|
|
97 my @command = ('cat', @fasta_files);
|
|
98 my ($merged_fa_fh, $merged_fa_path) = tempfile('merged.XXXXXXXXX', UNLINK => 1, DIR => $tempdir, SUFFIX => '.fa');
|
|
99 my ($in, $out, $err);
|
|
100 run3 \@command, \$in, \$out, \$err;
|
|
101 if($err){
|
|
102 print STDERR $err;
|
|
103 }
|
|
104 print $merged_fa_fh $out;
|
|
105 close($merged_fa_fh);
|
|
106
|
|
107
|
|
108 # Create Blast Database
|
|
109 my ($blastdb_fh, $blastdb_path) = tempfile('blastdb.XXXXXXXXX', UNLINK => 1, DIR => $tempdir);
|
|
110 @command = ('makeblastdb',
|
|
111 '-dbtype', 'prot',
|
|
112 '-in', $merged_fa_path,
|
|
113 '-out', $blastdb_path,
|
|
114 );
|
|
115 my ($makeblast_in,$makeblast_out,$makeblast_err);
|
|
116 run3 \@command, \$makeblast_in, \$makeblast_out, \$makeblast_err;
|
|
117
|
|
118 # Blast files
|
|
119 foreach(@{$data{file_list}}){
|
|
120 #push(@fasta_files, $data{gbk}{$_}{fasta_location});
|
|
121 my @blast_cmd = ('blastp',
|
|
122 '-query', $data{gbk}{$_}{fasta_location},
|
|
123 '-out', $data{gbk}{$_}{fasta_location} . ".xml",
|
|
124 '-outfmt', '5',
|
|
125 '-db', $blastdb_path,
|
|
126 );
|
|
127 my ($blast_in,$blast_out,$blast_err);
|
|
128 run3 \@blast_cmd, \$blast_in, \$blast_out, \$blast_err;
|
|
129 }
|
|
130
|
|
131 my $value;
|
|
132 my @data_tsv;
|
|
133
|
|
134 foreach(@{$data{file_list}}){
|
|
135 #push(@fasta_files, $data{gbk}{$_}{fasta_location});
|
|
136 my $file = $data{gbk}{$_}{fasta_location};
|
|
137
|
|
138 my $in = new Bio::SearchIO(
|
|
139 -format => 'blastxml',
|
|
140 -tempfile => 1,
|
|
141 -file => "$file.xml",
|
|
142 );
|
|
143 while( my $result = $in->next_result ) {
|
|
144 while( my $hit = $result->next_hit ) {
|
|
145 while( my $hsp = $hit->next_hsp ) {
|
|
146 my $Identity = $hsp->percent_identity/100 * $hsp->length('query');
|
|
147 my $IterationQueryLength = $result->query_length();
|
|
148 my $HitLength = $hit->length();
|
|
149 my $dice = (200*$Identity)/($HitLength + $IterationQueryLength);
|
|
150
|
|
151 my $c = $result->query_description();#genome_a_header
|
|
152 my $d = $hit->name();#genome_b_header
|
|
153 # Skip self-self links
|
|
154 next if($c eq $d);
|
|
155
|
|
156 push (@data_tsv,
|
|
157 [
|
|
158 $c,
|
|
159 $d,
|
|
160 $hsp->evalue(),
|
|
161 $dice,
|
|
162 ]
|
|
163 );
|
|
164
|
|
165 }#close hsp
|
|
166 }#close hit
|
|
167 }#close result
|
|
168 }#close for genomes
|
|
169
|
|
170 $data{hit_table} = \@data_tsv;
|
|
171
|
|
172 use CPT::OutputFiles;
|
|
173 my $psmout = CPT::OutputFiles->new(
|
|
174 name => 'cpt_psm_object',
|
|
175 GGO => $ggo,
|
|
176 );
|
|
177 my @val = $psmout->CRR(data => "none", extension => 'psm');
|
|
178 store \%data,$val[0];
|
|
179
|
|
180
|
|
181
|
|
182
|
|
183
|
|
184
|
|
185
|
|
186 =head1 NAME
|
|
187
|
|
188 PSM Prep
|
|
189
|
|
190 =head1 DESCRIPTION
|
|
191
|
|
192 This tool takes in 2 or more GenBank files, blasts, and prepares data structures for use in the companion tool: PSM Plotter. Select as many (multi)-gbk files as you I<might> want to plot. Once this tool is done, you can select any subset of those to plot then.
|
|
193
|
|
194 =cut
|