Mercurial > repos > earlhaminst > smart_domains
comparison smart-domain.pl @ 0:a3b26189fee3 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/smart_domains commit 266d7c45a443e893f15eab4b1485ca7c1c406a14
author | earlhaminst |
---|---|
date | Thu, 15 Jun 2017 07:52:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a3b26189fee3 |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use HTTP::Request::Common; | |
4 use LWP::UserAgent; | |
5 use Pod::Usage; | |
6 use Getopt::Long; | |
7 use Bio::SeqIO; | |
8 use JSON; | |
9 | |
10 my $json = JSON->new->allow_nonref; | |
11 #run this script with --help to see the options | |
12 | |
13 =pod | |
14 | |
15 =head1 NAME | |
16 | |
17 SMART_batch - submit sequences from a FASTA file to SMART | |
18 | |
19 =head1 SYNOPSIS | |
20 | |
21 B<SMART_batch.pl> I<options> | |
22 | |
23 =head1 DESCRIPTION | |
24 | |
25 Use B<SMART_batch.pl> to submit multiple protein sequences from a FASTA file into the SMART analysis queue. Results are saved into plain text files. | |
26 | |
27 =head1 GENERAL OPTIONS | |
28 | |
29 | |
30 =over 4 | |
31 | |
32 =item B<--help> | |
33 | |
34 display this message | |
35 | |
36 =item B<--inputFile> | |
37 | |
38 FASTA file with sequences to submit | |
39 | |
40 =item B<--outputDirectory> | |
41 | |
42 Directory which will be used to store the results. Will be created if it doesn't exist. Defaults to 'SMART_results'. | |
43 | |
44 =item B<--outputFormat> | |
45 | |
46 Choose preferred output format from txt, json or tabular. | |
47 Default is txt. | |
48 | |
49 =back | |
50 | |
51 =head1 ANALYSIS OPTIONS | |
52 | |
53 =over 4 | |
54 | |
55 =item B<--includePfam> | |
56 | |
57 Include Pfam domains in the search. (http://pfam.sanger.ac.uk/) | |
58 | |
59 =item B<--includeSignalP> | |
60 | |
61 Include signal peptide predictions. (http://www.cbs.dtu.dk/services/SignalP/) | |
62 | |
63 =item B<--includeRepeats> | |
64 | |
65 Include internal repeat predictions. (http://www.well.ox.ac.uk/rmott/ARIADNE/) | |
66 | |
67 =item B<--includeDISEMBL> | |
68 | |
69 Include predictions of internal protein disorder. (http://dis.embl.de/) | |
70 | |
71 =item B<--includeSchnipsel> | |
72 | |
73 Include predictions of outlier homologues and homologues of known structures. (http://smart.embl.de/help/smart_glossary.shtml#outlier) | |
74 | |
75 =back | |
76 | |
77 | |
78 =head1 SEE ALSO | |
79 | |
80 SMART Home page : http://smart.embl.de | |
81 SMART FAQ : http://smart.embl.de/help/FAQ.shtml | |
82 | |
83 =head1 AUTHORS | |
84 | |
85 Written by Ivica Letunic <ivica@letunic.com> | |
86 | |
87 Modified by Anil Thanki <Anil.Thanki@earlham.ac.uk> to produce JSON and tabular output for use in Galaxy | |
88 | |
89 =cut | |
90 | |
# Main script: parse the command line, then submit every sequence of the
# input FASTA file to SMART, one at a time, and store each result in the
# output directory via response_parser().

# Endpoints of the SMART web service.
my $submit_url = "http://smart.embl.de/smart/show_motifs.pl";
my $job_status_url = "http://smart.embl.de/smart/job_status.pl";
my $output_format = "txt";
my ($show_help, $input_file, $output_directory, $do_pfam, $do_signalp, $do_rep, $do_disembl, $do_schnipsel);
my $op_r = GetOptions(
    "help"              => \$show_help,
    "inputFile=s"       => \$input_file,
    "outputDirectory=s" => \$output_directory,
    "includePfam"       => \$do_pfam,
    "includeSignalP"    => \$do_signalp,
    "includeRepeats"    => \$do_rep,
    "includeDISEMBL"    => \$do_disembl,
    "includeSchnipsel"  => \$do_schnipsel,
    "outputFormat=s"    => \$output_format,
);

# An input file is mandatory; without one the manual is shown instead.
unless ($input_file) { $show_help = 1; }

pod2usage(VERBOSE => 2) if ( $show_help );

# Reject unknown formats before contacting SMART: the result file name and
# response_parser() only understand these three values.
unless ($output_format =~ /\A(?:txt|tabular|json)\z/) {
    die "Unknown output format '$output_format'. Choose txt, json or tabular.\n";
}

my $ua = LWP::UserAgent->new();
my $result = "";
$ua->agent("SMARTbatch1.0");


print "\nSMART batch analysis\n======================\n";

unless (defined $output_directory) { $output_directory = 'SMART_results'; }
unless (-d $output_directory) {
    mkdir $output_directory or die "Cannot create directory $output_directory: $!";
}
# Exit with a non-zero status so callers can tell that nothing was processed.
unless (-e $input_file) { print STDERR "Input file does not exist."; exit 1; }

my $io = Bio::SeqIO->new(-format => 'fasta', -file => $input_file);

#process sequences one by one. ALWAYS wait for the results before submitting the next sequence.

while (my $seq = $io->next_seq) {
    my $seq_id = $seq->display_id;
    # The file extension is the (already validated) format name itself.
    my $output_file = $output_directory . "/" . $seq_id . "_SMART_results." . $output_format;
    if (-e $output_file) {
        if (-z $output_file) {
            # A zero-size file holds no results, so the sequence is resubmitted.
            print "Removing empty results file $output_file.\n";
            unlink $output_file;
        } else {
            print "Skipping sequence $seq_id because the results file already exists.\n";
            next;
        }
    }
    print "Submitting sequence $seq_id...\n";
    #prepare the basic POST data
    my %post_content;
    $post_content{'SEQUENCE'} = $seq->seq;
    $post_content{'TEXTONLY'} = 1;
    if ($do_pfam)      { $post_content{'DO_PFAM'} = 'DO_PFAM'; }
    if ($do_signalp)   { $post_content{'INCLUDE_SIGNALP'} = 'INCLUDE_SIGNALP'; }
    if ($do_rep)       { $post_content{'DO_PROSPERO'} = 'DO_PROSPERO'; }
    if ($do_disembl)   { $post_content{'DO_DISEMBL'} = 'DO_DISEMBL'; }
    if ($do_schnipsel) { $post_content{'INCLUDE_BLAST'} = 'INCLUDE_BLAST'; }
    my $req = POST($submit_url, Content_Type => 'form-data', Content => [ %post_content ]);
    my $response = $ua->request($req);
    if ($response->is_success()) {
        my @res = split(/\n/, $response->content);
        #check if we got the results directly (precomputed results)
        #the marker may be on the first or on the second line of the body
        shift @res if (defined $res[1] && $res[1] =~ /^--\ SMART\ RESULT/);
        if (defined $res[0] && $res[0] =~ /^--\ SMART\ RESULT/) {
            response_parser($output_file, $response, $output_format);
        } else {
            #we're in the queue, or there was an error
            my $job_id;
            foreach my $res_line (@res) {
                if ($res_line =~ /job_status\.pl\?jobid=(\d+.+?)'/) {
                    $job_id = $1;
                    last;
                }
            }
            unless (defined $job_id && length $job_id) {
                #there is no job ID, so an error occurred
                my $error_file = "$output_directory/$seq_id\_SMART_error.html";
                open(my $err_fh, '>', $error_file) or die "Cannot write to $error_file: $!";
                print {$err_fh} $response->content;
                close($err_fh) or die "Cannot write to $error_file: $!";
                print "SMART returned an error page, which was saved into '$error_file'.\nPlease check the file for details. Aborting further submissions.\n";
                #the run was aborted, so report failure to the caller
                exit 1;
            } else {
                #we have a jobID, check every 10 seconds until we get the results
                print "Job entered the queue with ID $job_id. Waiting for results.\n";
                my $job_status_req = GET("$job_status_url?jobid=$job_id");
                sleep 5;
                while (1) {
                    my $job_status_response = $ua->request($job_status_req);
                    if ($job_status_response->is_success) {
                        #check if we got the results
                        my @job_status_res = split(/\n/, $job_status_response->content);
                        shift @job_status_res if (defined $job_status_res[1] && $job_status_res[1] =~ /^--\ SMART\ RESULT/);
                        if (defined $job_status_res[0] && $job_status_res[0] =~ /^--\ SMART\ RESULT/) {
                            response_parser($output_file, $job_status_response, $output_format);
                            last;
                        } else {
                            #still in queue
                            sleep 10;
                        }
                    } else {
                        print "SMART returned a web server error. Full message follows:\n\n";
                        #report the failed status request, not the earlier (successful) submission
                        print $job_status_response->as_string;
                        die;
                    }
                }
            }
        }

    } else {
        print "SMART returned a web server error. Full message follows:\n\n";
        print $response->as_string;
        die;
    }
    #be nice to other users
    sleep 5;
}
218 | |
# Convert a SMART plain-text result into a list of JSON-encoded records.
#
# The text is read line by line. A line with "=" after its first character
# is split at the FIRST "=" into a key and a value, so the value may itself
# contain "="; an empty value is stored as an empty string. An empty line
# ends the current record, and the end of the text ends the last one, so a
# final record is kept even when no blank line follows it. Only records
# holding a DOMAIN key are returned; anything else is discarded. Lines that
# are neither key/value pairs nor empty are ignored. A carriage return before
# the line feed is treated as part of the line ending.
#
# Parameters:
#   $text - the result text as one string (undef is treated as empty).
#
# Returns:
#   A list of JSON object strings, one per DOMAIN record, in input order.
sub toJSON {
    my ($text) = @_;

    my @records;
    my %fields;

    # Store the record collected so far if it describes a domain, then reset.
    my $flush_record = sub {
        push @records, encode_json(\%fields) if exists $fields{"DOMAIN"};
        %fields = ();
    };

    return @records unless defined $text;

    foreach my $line (split /\r?\n/, $text) {
        if (index($line, "=") > 0) {
            # Limit of 2 keeps every "=" after the first inside the value.
            my ($key, $value) = split /=/, $line, 2;
            $fields{$key} = $value;
        } elsif (length($line) == 0) {
            $flush_record->();
        }
    }

    # split drops trailing empty fields, so the last record has to be
    # flushed explicitly once the input is exhausted.
    $flush_record->();

    return @records;
}
246 | |
# Write the body of a SMART response to $output_file in the requested format.
#
# Parameters (positional):
#   $output_file         - path of the file to create (truncated if present).
#   $job_status_response - response object; only its content() method is used.
#   $output_format       - "txt", "tabular" or "json".
#
# Output per format:
#   txt     - the response body, unchanged.
#   tabular - a header row with the sorted keys of the first record, then one
#             tab-separated row per record returned by toJSON(); every row
#             uses the header's keys, missing values are written as empty
#             fields. When there are no records the file is left empty.
#   json    - the records returned by toJSON(), joined into one JSON array.
# Any other format leaves the file empty.
#
# Prints a confirmation line to standard output once the file is written.
# Dies if the file cannot be opened or closed.
sub response_parser {
    my ($output_file, $job_status_response, $output_format) = @_;

    open(my $out_fh, '>', $output_file) or die "Cannot write to $output_file: $!";
    my $content = $job_status_response->content;

    if ($output_format eq "txt") {
        print {$out_fh} $content;
    } elsif ($output_format eq "tabular") {
        my @rows = map { decode_json($_) } toJSON($content);

        # Without any record there are no column names to print, so the
        # file stays empty instead of failing on a missing first row.
        if (@rows) {
            my @keys = sort keys %{ $rows[0] };
            print {$out_fh} join("\t", @keys), "\n";

            foreach my $row (@rows) {
                my @cells = map { defined $row->{$_} ? $row->{$_} : '' } @keys;
                print {$out_fh} join("\t", @cells), "\n";
            }
        }
    } elsif ($output_format eq "json") {
        print {$out_fh} "[", join(",", toJSON($content)), "]";
    }

    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or die "Cannot write to $output_file: $!";
    print "Results saved to '$output_file'\n";
}