Mercurial > repos > cpt > cpt_psm_comparison_table
diff cpt_psm_0_prep.pl @ 1:f093e08f21f3 draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:47:24 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_psm_0_prep.pl Mon Jun 05 02:47:24 2023 +0000 @@ -0,0 +1,194 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Storable; +use Bio::SearchIO; +use Bio::SeqIO; +use Bio::Tools::CodonTable; +use Data::Dumper; +use File::Temp qw/tempfile tempdir/; + + +use CPT::GalaxyGetOpt; +my $ggo = CPT::GalaxyGetOpt->new(); +my $options = $ggo->getOptions( + 'options' => [ + [ 'file', 'Input file', { validate => 'File/Input', multiple => 1, required => 1, + file_format => ['genbank', 'txt'], + } ], + ], + 'outputs' => [ + [ + 'cpt_psm_object', + 'Output PSM Object', + { + validate => 'File/Output', + required => 1, + default => 'cpt_psm', + data_format => 'text/plain', + default_format => 'TXT' + } + ], + ], + 'defaults' => [ + 'appid' => 'PSM.Prep', + 'appname' => 'PSM Prep', + 'appdesc' => 'prepares data for the PSM Plotter', + 'appvers' => '1.94.2', + ], + 'tests' => [], +); + + + +use CPT::Bio; +my $bio = CPT::Bio->new(); + +my @genbank_files = @{$options->{file}}; + +my %data = ( + file_list => [], +); + +my $GLOBAL_UNLINK_VAR = 1; +my $tempdir = tempdir('cpt.psm2.XXXXXXX',CLEANUP => $GLOBAL_UNLINK_VAR); +use CPT::Util::CRC64; +my $crc = CPT::Util::CRC64->new(); + +foreach my $file(@genbank_files){ + my $seqio_object = Bio::SeqIO->new(-file => $file,-format=>'genbank'); + while(my $seqobj = $seqio_object->next_seq){ + my ( $fh, $path ) = tempfile('cds_export.XXXXXXXXX', UNLINK => $GLOBAL_UNLINK_VAR, DIR => $tempdir, SUFFIX => '.fa'); + + my @gi_array; + foreach my $feat ( $seqobj->get_SeqFeatures ) { + if($feat->primary_tag eq 'CDS'){ + my $header = $bio->_getIdentifier($feat); + # This ensures proteins have a file-specific ID appeneded to them. + my $seq = $bio->translate( + $bio->intelligent_get_seq($feat)); + + # Proteins come with translated stop codon + $seq =~ s/\*//g; + $seq =~ s/\+//g; + $seq =~ s/#//g; + $header .= "_" . $crc->crc64($seq); + push @gi_array, $header; + print $fh ">$header\n$seq\n"; + } + } + $data{gbk}{$seqobj->display_id()}{'gi'} = \@gi_array; + $data{gbk}{$seqobj->display_id()}{'fasta_location'} = $path; + $data{gbk}{$seqobj->display_id()}{'gbk_location'} = $file; + push(@{$data{file_list}}, $seqobj->display_id()); + close $fh; + } +} + + + +use IPC::Run3; + +# Concatenate Fasta Files +my @fasta_files; +foreach(@{$data{file_list}}){ + push(@fasta_files, $data{gbk}{$_}{fasta_location}); +} +my @command = ('cat', @fasta_files); +my ($merged_fa_fh, $merged_fa_path) = tempfile('merged.XXXXXXXXX', UNLINK => 1, DIR => $tempdir, SUFFIX => '.fa'); +my ($in, $out, $err); +run3 \@command, \$in, \$out, \$err; +if($err){ + print STDERR $err; +} +print $merged_fa_fh $out; +close($merged_fa_fh); + + +# Create Blast Database +my ($blastdb_fh, $blastdb_path) = tempfile('blastdb.XXXXXXXXX', UNLINK => 1, DIR => $tempdir); +@command = ('makeblastdb', + '-dbtype', 'prot', + '-in', $merged_fa_path, + '-out', $blastdb_path, +); +my ($makeblast_in,$makeblast_out,$makeblast_err); +run3 \@command, \$makeblast_in, \$makeblast_out, \$makeblast_err; + +# Blast files +foreach(@{$data{file_list}}){ + #push(@fasta_files, $data{gbk}{$_}{fasta_location}); + my @blast_cmd = ('blastp', + '-query', $data{gbk}{$_}{fasta_location}, + '-out', $data{gbk}{$_}{fasta_location} . ".xml", + '-outfmt', '5', + '-db', $blastdb_path, + ); + my ($blast_in,$blast_out,$blast_err); + run3 \@blast_cmd, \$blast_in, \$blast_out, \$blast_err; +} + +my $value; +my @data_tsv; + +foreach(@{$data{file_list}}){ + #push(@fasta_files, $data{gbk}{$_}{fasta_location}); + my $file = $data{gbk}{$_}{fasta_location}; + + my $in = new Bio::SearchIO( + -format => 'blastxml', + -tempfile => 1, + -file => "$file.xml", + ); + while( my $result = $in->next_result ) { + while( my $hit = $result->next_hit ) { + while( my $hsp = $hit->next_hsp ) { + my $Identity = $hsp->percent_identity/100 * $hsp->length('query'); + my $IterationQueryLength = $result->query_length(); + my $HitLength = $hit->length(); + my $dice = (200*$Identity)/($HitLength + $IterationQueryLength); + + my $c = $result->query_description();#genome_a_header + my $d = $hit->name();#genome_b_header + # Skip self-self links + next if($c eq $d); + + push (@data_tsv, + [ + $c, + $d, + $hsp->evalue(), + $dice, + ] + ); + + }#close hsp + }#close hit + }#close result +}#close for genomes + +$data{hit_table} = \@data_tsv; + +use CPT::OutputFiles; +my $psmout = CPT::OutputFiles->new( + name => 'cpt_psm_object', + GGO => $ggo, +); +my @val = $psmout->CRR(data => "none", extension => 'psm'); +store \%data,$val[0]; + + + + + + + +=head1 NAME + +PSM Prep + +=head1 DESCRIPTION + +This tool takes in 2 or more GenBank files, blasts, and prepares data structures for use in the companion tool: PSM Plotter. Select as many (multi)-gbk files as you I<might> want to plot. Once this tool is done, you can select any subset of those to plot then. + +=cut