diff smart-domain.pl @ 0:a3b26189fee3 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/smart_domains commit 266d7c45a443e893f15eab4b1485ca7c1c406a14
author earlhaminst
date Thu, 15 Jun 2017 07:52:09 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart-domain.pl	Thu Jun 15 07:52:09 2017 -0400
@@ -0,0 +1,302 @@
+#!/usr/bin/env perl
+use strict;
+use HTTP::Request::Common;
+use LWP::UserAgent;
+use Pod::Usage;
+use Getopt::Long;
+use Bio::SeqIO;
+use JSON;
+
+my $json = JSON->new->allow_nonref;
+#run this script with --help to see the options
+
+=pod
+
+=head1 NAME
+
+SMART_batch  -  submit sequences from a FASTA file to SMART
+
+=head1 SYNOPSIS
+
+B<SMART_batch.pl> I<options>
+
+=head1 DESCRIPTION
+
+Use B<SMART_batch.pl> to submit multiple protein sequences from a FASTA file into the SMART analysis queue. Results are saved into plain text files.
+
+=head1 GENERAL OPTIONS
+
+
+=over 4
+
+=item B<--help>
+
+display this message
+
+=item B<--inputFile>
+
+FASTA file with sequences to submit
+
+=item B<--outputDirectory>
+
+Directory which will be used to store the results. Will be created if it doesn't exist. Defaults to 'SMART_results'.
+
+=item B<--outputFormat>
+
+Choose the preferred output format: txt, json or tabular.
+Default is txt.
+
+=back
+
+=head1 ANALYSIS OPTIONS
+
+=over 4
+
+=item B<--includePfam>
+
+Include Pfam domains in the search. (http://pfam.sanger.ac.uk/)
+
+=item B<--includeSignalP>
+
+Include signal peptide predictions. (http://www.cbs.dtu.dk/services/SignalP/)
+
+=item B<--includeRepeats>
+
+Include internal repeat predictions. (http://www.well.ox.ac.uk/rmott/ARIADNE/)
+
+=item B<--includeDISEMBL>
+
+Include predictions of internal protein disorder. (http://dis.embl.de/)
+
+=item B<--includeSchnipsel>
+
+Include predictions of outlier homologues and homologues of known structures. (http://smart.embl.de/help/smart_glossary.shtml#outlier)
+
+=back
+
+
+=head1 SEE ALSO
+
+ SMART Home page : http://smart.embl.de
+ SMART FAQ       : http://smart.embl.de/help/FAQ.shtml
+
+=head1 AUTHORS
+
+ Written by Ivica Letunic <ivica@letunic.com>
+
+ Modified by Anil Thanki <Anil.Thanki@earlham.ac.uk> to parse output in JSON and tabular format to adapt in Galaxy
+
+=cut
+
+my $submit_url = "http://smart.embl.de/smart/show_motifs.pl";
+my $job_status_url = "http://smart.embl.de/smart/job_status.pl";
+my $output_format = "txt";
+my ($show_help, $input_file, $output_directory, $do_pfam, $do_signalp, $do_rep, $do_disembl, $do_schnipsel);
+my $op_r = GetOptions(
+    "help"              => \$show_help,
+    "inputFile=s"       => \$input_file,
+    "outputDirectory=s" => \$output_directory,
+    "includePfam"       => \$do_pfam,
+    "includeSignalP"    => \$do_signalp,
+    "includeRepeats"    => \$do_rep,
+    "includeDISEMBL"    => \$do_disembl,
+    "includeSchnipsel"  => \$do_schnipsel,
+    "outputFormat=s"    => \$output_format,
+);
+
+# Show the full help on request or without an input file; otherwise reject unknown options and
+# unsupported output formats here, before any sequence is submitted.
+pod2usage(VERBOSE => 2) if ( $show_help || !$input_file );
+pod2usage(-message => "Invalid option or unsupported --outputFormat (expected txt, json or tabular).", -verbose => 1, -exitval => 2) unless ( $op_r && $output_format =~ /\A(?:txt|json|tabular)\z/ );
+
+# HTTP client used for every request to SMART.
+my $ua  = LWP::UserAgent->new();
+$ua->agent("SMARTbatch1.0");
+my $result = "";    # file-scoped; stays declared because code further down the file may refer to it
+
+print "\nSMART batch analysis\n======================\n";
+
+# Check the input before creating anything on disk; every failure ends with a non-zero exit status.
+unless (-e $input_file) { die "Input file does not exist.\n"; }
+unless (defined $output_directory) { $output_directory = 'SMART_results'; }
+unless (-d $output_directory) { mkdir $output_directory or die "Cannot create directory '$output_directory': $!\n"; }
+my $io = Bio::SeqIO->new(-format=> 'fasta', -file=> $input_file);
+
+# Process sequences one by one. ALWAYS wait for the results before submitting the next sequence.
+#
+# For every FASTA record the loop:
+#   1. derives the per-sequence results file and skips records that already have non-empty results;
+#   2. POSTs the sequence, plus the requested optional analyses, to $submit_url;
+#   3. saves the results at once when SMART answers with them directly, otherwise extracts the
+#      job ID from the reply and polls $job_status_url until the results are available;
+#   4. pauses before the next submission.
+# An HTTP failure, or a reply that holds neither results nor a job ID, aborts the whole run with a
+# non-zero exit status.
+
+# File extension for each supported output format.
+my %extension_for = (txt => 'txt', tabular => 'tabular', json => 'json');
+my $extension = $extension_for{$output_format}
+  or die "Unsupported output format '$output_format' (expected txt, json or tabular).\n";
+
+# True when a SMART reply carries the result marker on its first or second line.
+my $has_result = sub {
+  my ($content) = @_;
+  my @lines = split(/\n/, $content, 3);
+  return scalar grep { defined $_ && $_ =~ /^--\ SMART\ RESULT/ } @lines[0, 1];
+};
+
+while (my $seq = $io->next_seq) {
+  my $seq_id = $seq->display_id;
+  my $output_file = "$output_directory/${seq_id}_SMART_results.$extension";
+
+  # Non-empty results mean the sequence was handled by an earlier run; an empty file is redone.
+  if (-e $output_file) {
+    if (-s $output_file) {
+      print "Skipping sequence $seq_id because the results file already exists.\n";
+      next;
+    }
+    print "Removing empty results file $output_file.\n";
+    unlink $output_file;
+  }
+
+  print "Submitting sequence $seq_id...\n";
+
+  # Prepare the POST data; an optional analysis is requested by sending its field name as the value.
+  my %post_content;
+  $post_content{'SEQUENCE'} = $seq->seq;
+  $post_content{'TEXTONLY'} = 1;
+  $post_content{'DO_PFAM'}         = 'DO_PFAM'         if $do_pfam;
+  $post_content{'INCLUDE_SIGNALP'} = 'INCLUDE_SIGNALP' if $do_signalp;
+  $post_content{'DO_PROSPERO'}     = 'DO_PROSPERO'     if $do_rep;
+  $post_content{'DO_DISEMBL'}      = 'DO_DISEMBL'      if $do_disembl;
+  $post_content{'INCLUDE_BLAST'}   = 'INCLUDE_BLAST'   if $do_schnipsel;
+
+  my $req = POST $submit_url, Content_Type => 'form-data', Content => [ %post_content ];
+  my $response = $ua->request($req);
+  unless ($response->is_success()) {
+    die "SMART returned a web server error. Full message follows:\n\n" . $response->as_string;
+  }
+
+  if ($has_result->($response->content)) {
+    # Precomputed results were returned directly.
+    response_parser($output_file, $response, $output_format);
+  } else {
+    # We're in the queue, or there was an error.
+    my ($job_id) = $response->content =~ /job_status\.pl\?jobid=(\d+.+?)'/;
+    unless (defined $job_id) {
+      # There is no job ID, so an error occurred: keep the page for inspection and stop.
+      my $error_file = "$output_directory/$seq_id\_SMART_error.html";
+      open(my $err_fh, '>', $error_file) or die "Cannot write to $error_file: $!";
+      print {$err_fh} $response->content;
+      close($err_fh) or die "Cannot close $error_file: $!";
+      print "SMART returned an error page, which was saved into '$error_file'.\nPlease check the file for details. Aborting further submissions.\n";
+      exit 1;
+    }
+
+    # We have a job ID: wait 5 seconds, then check every 10 seconds until we get the results.
+    print "Job entered the queue with ID $job_id. Waiting for results.\n";
+    my $job_status_req = GET "$job_status_url?jobid=$job_id";
+    sleep 5;
+    while (1) {
+      my $job_status_response = $ua->request($job_status_req);
+      unless ($job_status_response->is_success) {
+        # Report the failed status request itself, not the earlier submission response.
+        die "SMART returned a web server error. Full message follows:\n\n"
+          . $job_status_response->as_string;
+      }
+      if ($has_result->($job_status_response->content)) {
+        response_parser($output_file, $job_status_response, $output_format);
+        last;
+      }
+      # Still in the queue.
+      sleep 10;
+    }
+  }
+
+  # Be nice to other users.
+  sleep 5;
+}
+
+# Parse SMART's plain-text result into one JSON string per domain record.
+#
+# $text holds "KEY=value" lines grouped into records that are separated by empty lines.
+# Returns a list of JSON-encoded objects, one for each record that has a DOMAIN key;
+# records without that key are dropped.
+sub toJSON{
+  my ($text) = @_;
+  my (%record, @hashes);
+
+  # Encode the finished record when it describes a domain, then start a new one.
+  my $flush = sub {
+    push @hashes, encode_json({%record}) if exists $record{"DOMAIN"};
+    %record = ();
+  };
+
+  foreach my $line (split("\n", $text)) {
+    if (index($line, "=") > 0) {
+      # Split on the first "=" only, so a value that contains "=" is kept whole.
+      my ($key, $value) = split(/=/, $line, 2);
+      $record{$key} = length($value) ? $value : undef;    # an empty value becomes JSON null
+    } elsif (length($line) == 0) {
+      $flush->();
+    }
+  }
+  $flush->();    # the last record may not be followed by an empty line
+  return @hashes;
+}
+
+# Write a SMART response to $output_file in the requested format.
+#
+# Arguments (positional):
+#   $output_file         - path of the file to create or overwrite
+#   $job_status_response - response object; its ->content is the text that is written or parsed
+#   $output_format       - "txt":     the content exactly as received
+#                          "tabular": a header row plus one tab-separated row per domain record
+#                          "json":    a JSON array with one object per domain record
+#                          Any other value leaves the file empty.
+#
+# Dies when the file cannot be opened or closed. Prints a confirmation line on success.
+sub response_parser{
+  my ($output_file, $job_status_response, $output_format) = @_;
+  my $content = $job_status_response->content;
+
+  # Lexical handle and three-argument open: the file name can never be read as an open mode.
+  open(my $out, '>', $output_file) or die "Cannot write to $output_file: $!";
+
+  if ($output_format eq "txt") {
+    print {$out} $content;
+  } elsif ($output_format eq "tabular") {
+    # toJSON() returns JSON strings; decode them back into hashes to build the table.
+    _print_tabular($out, map { decode_json($_) } toJSON($content));
+  } elsif ($output_format eq "json") {
+    # toJSON() already returns encoded objects, so they only need joining into an array.
+    print {$out} "[", join(",", toJSON($content)), "]";
+  }
+
+  # Buffered write errors only surface when the handle is closed.
+  close($out) or die "Cannot write to $output_file: $!";
+  print "Results saved to '$output_file'\n";
+}
+
+# Print domain records (hash references) to the filehandle $out as a tab-separated table.
+#
+# The header row lists the sorted union of the keys of all records, so a key that is missing
+# from the first record still gets a column; a record without a given key gets an empty cell.
+# Nothing is printed when there are no records, because there is no header to derive.
+sub _print_tabular{
+  my ($out, @rows) = @_;
+  return unless @rows;
+
+  # Collect every key that occurs in any record.
+  my %seen;
+  foreach my $row (@rows) {
+    $seen{$_} = 1 for keys %$row;
+  }
+  my @keys = sort keys %seen;
+
+  print {$out} join("\t", @keys), "\n";
+  foreach my $row (@rows) {
+    my @cells = map { defined $row->{$_} ? $row->{$_} : '' } @keys;
+    print {$out} join("\t", @cells), "\n";
+  }
+  return;
+}