Mercurial > repos > nml > srst2
annotate srst2.pl @ 1:599a4dc309aa draft
planemo upload commit 1ea98fb88a93a571beda5bbd56449c946860a258
author | nml |
---|---|
date | Wed, 01 Mar 2017 12:39:11 -0500 |
parents | 6f870ed59b6e |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env perl |
2 | |
3 | |
4 use strict; | |
5 use warnings; | |
6 use Cwd; | |
7 use File::Copy; | |
8 | |
1
599a4dc309aa
planemo upload commit 1ea98fb88a93a571beda5bbd56449c946860a258
nml
parents:
0
diff
changeset
|
9 #The first 3 arguments should be in this format: |
599a4dc309aa
planemo upload commit 1ea98fb88a93a571beda5bbd56449c946860a258
nml
parents:
0
diff
changeset
|
10 # bam_output scores_output pileup_output ... |
0 | 11 |
12 | |
13 my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases); | |
14 my ($allele_results,$allele_type); | |
15 | |
16 | |
17 $bam_results = $ARGV[0]; | |
18 shift(@ARGV); | |
19 $scores = $ARGV[0]; | |
20 shift(@ARGV); | |
21 $pileup = $ARGV[0]; | |
22 shift(@ARGV); | |
23 | |
24 #Now that we shifted the first 4 arguments, get the rest depending on the job type. There are three: | |
25 #I pass a letter to tell us which job type was selected. | |
26 $job_type = $ARGV[0]; | |
27 shift(@ARGV); | |
28 | |
29 #If m, mlst only: we only have one mlst output file | |
30 if($job_type eq "m") | |
31 { | |
32 $txt_results = $ARGV[0]; | |
33 shift(@ARGV); | |
34 $allele_results = $ARGV[0]; | |
35 shift(@ARGV); | |
36 $allele_type = $ARGV[0]; | |
37 shift(@ARGV); | |
38 } | |
39 #If g, genedb only: we have two outputs: genes and fullgenes. We also get the name of the database | |
40 elsif($job_type eq "g") | |
41 { | |
42 $genes_results = $ARGV[0]; | |
43 shift(@ARGV); | |
44 $fullgenes_results = $ARGV[0]; | |
45 shift(@ARGV); | |
46 $databases = $ARGV[0]; | |
47 shift (@ARGV); | |
48 } | |
49 #If b, both mlst and genedb: We will have three output files and the database name. | |
50 else | |
51 { | |
52 $txt_results = $ARGV[0]; | |
53 shift(@ARGV); | |
54 $genes_results = $ARGV[0]; | |
55 shift(@ARGV); | |
56 $fullgenes_results = $ARGV[0]; | |
57 shift(@ARGV); | |
58 $databases = $ARGV[0]; | |
59 shift(@ARGV); | |
60 } | |
61 | |
62 #After we get the output files/database name, now we get the name of the reads | |
63 #This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name | |
64 my $filename = $ARGV[0]; | |
65 shift(@ARGV); | |
66 | |
67 #This index offset is used to determine where the 'genedb' option is located. | |
68 #If only a single-end input, the genedb will be 3 positions into the arguments: | |
69 # -input_se sample.fastq --genedb database.fasta | |
70 my $index_offset = 3; | |
71 | |
72 print @ARGV; | |
73 #change the file extensions of the input reads so srst can use them | |
74 | |
75 #Usually the file name looks like this: sample_S1_L001_R1_001.fastq | |
76 #If we use this file name, it confuses srst2 a lot. So we just extract | |
77 #the sample name to use as input file name. | |
78 my @file_name = split /_/, $filename; | |
79 $name = "temp_file_name"; | |
80 | |
81 my ($for_read, $rev_read, $sing_read, $database); | |
82 if ($ARGV[0] eq "--input_pe") | |
83 { | |
84 #Increment index offset if we paired-end inputs | |
85 $index_offset++; | |
86 | |
87 $for_read = $name."_1.dat"; | |
88 $rev_read = $name."_2.dat"; | |
89 | |
90 symlink($ARGV[1], $for_read); | |
91 symlink($ARGV[2], $rev_read); | |
92 | |
93 $ARGV[1] = $for_read; | |
94 $ARGV[2] = $rev_read; | |
95 } | |
96 else | |
97 { | |
98 $sing_read = $name.".dat"; | |
99 | |
100 symlink($ARGV[1], $sing_read); | |
101 | |
102 $ARGV[1] = $sing_read; | |
103 | |
104 } | |
105 #If we are running a job to include genedb, use the database name for input file name | |
106 if ($job_type eq 'g' | $job_type eq 'b') | |
107 { | |
108 my @db_names = split /,/, $databases; | |
109 my $num_db = @db_names; | |
110 my %names_hash = (); | |
111 # loop through dbs to replace spaces with _ and check for duplicates | |
112 for (my $i = 0; $i < $num_db; $i++){ | |
113 $db_names[$i]=~s/ /_/g; | |
114 if( exists($names_hash{$db_names[$i]}) ) { | |
115 print STDERR "More than one database with the same name"; | |
116 exit(1); | |
117 }else{ | |
118 $names_hash{$db_names[$i]}=$db_names[$i]; | |
119 } | |
120 } | |
121 | |
122 | |
123 foreach my $db_name (@db_names){ | |
124 (my $base = $db_name) =~ s/\.[^.]+$//; | |
125 $database = $base.".dat"; | |
126 | |
127 symlink($ARGV[$index_offset], $database); | |
128 $ARGV[$index_offset] = $database; | |
129 | |
130 $index_offset++; | |
131 } | |
132 } | |
133 | |
134 for (my $i =0; $i< @ARGV; $i++){ | |
135 if (index($ARGV[$i], "maxins") != -1){ | |
136 my ($maxins, $minins); | |
137 my @b2args = split(' ', $ARGV[$i]); | |
138 for (my $j = 0; $j < @b2args; $j++){ | |
139 if (index($b2args[$j], "maxins") != -1){ | |
140 $maxins = $b2args[$j+1]; | |
141 } | |
142 if (index($b2args[$j], "minins") != -1){ | |
143 $minins = $b2args[$j+1]; | |
144 } | |
145 } | |
146 if ($maxins - $minins < 0){ | |
147 print STDERR "--minins cannot be greater than --maxins"; | |
148 exit(1); | |
149 } | |
150 } | |
151 } | |
152 | |
153 | |
154 | |
1
599a4dc309aa
planemo upload commit 1ea98fb88a93a571beda5bbd56449c946860a258
nml
parents:
0
diff
changeset
|
155 my $command = "srst2 @ARGV"; |
0 | 156 |
157 my $exit_code = system($command); | |
158 | |
159 my $cur_dir = getcwd(); | |
160 # make arrays for using multiple custom databases (creates multiple output files - need to be concatenated) | |
161 my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles); | |
162 | |
163 # go through files in the output directory to move/concatenate them as required. | |
164 | |
165 foreach my $file (<$cur_dir/*>) | |
166 { | |
167 print $file, "\n"; | |
168 #Will cause problems if any files have 'mlst' in them | |
169 if ($file =~ /mlst/) | |
170 { | |
171 move($file, $txt_results); | |
172 } | |
173 elsif ($file =~ /\.bam$/) | |
174 { | |
175 push @bamfiles, $file; | |
176 } | |
177 elsif ($file =~ /\.scores$/) | |
178 { | |
179 push @scoresfiles, $file; | |
180 } | |
181 elsif ($file =~ /\.pileup$/) | |
182 { | |
183 push @pileupfiles, $file; | |
184 } | |
185 elsif ($file =~ /__fullgenes__/) | |
186 { | |
187 push @fullgenefiles, $file; | |
188 } | |
189 elsif ($file =~ /__genes__/) | |
190 { | |
191 push @genefiles, $file; | |
192 } | |
193 elsif ($file =~ /all_consensus_alleles.fasta$/ && $allele_type eq 'all') { | |
194 move($file,$allele_results); | |
195 } | |
196 elsif ($file =~ /new_consensus_alleles.fasta$/ && $allele_type eq 'new'){ | |
197 move($file,$allele_results); | |
198 } | |
199 } | |
200 | |
201 | |
202 my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers ); | |
203 | |
204 # create new concatenated bam file with all bam files. | |
205 if (@bamfiles > 1){ | |
206 my $counter = 0; | |
207 $cat_header = "cat_header"; | |
208 while ($counter < @bamfiles) { | |
209 $headers[$counter] = "bam_header".$counter; | |
210 # make a header file for each bam results file | |
211 my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]"; | |
212 system($cmd); | |
213 if ($counter >= 1){ | |
214 # only keep the @hd and @pg from first file because the final concatenated file can only have one of each (doesn't matter location) | |
215 $temp_head="cut_head".$counter; | |
216 # cut off first row and last row of each file (the @HD and @PG) | |
217 $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head"; | |
218 system($cmd); | |
219 unlink $headers[$counter]; | |
220 # replace the old header with the new cut header in the array | |
221 $headers[$counter] = $temp_head; | |
222 } | |
223 $counter++; | |
224 } | |
225 # combine all header files | |
226 $cmd = "cat ".join(" ",@headers)." > $cat_header"; | |
227 system($cmd); | |
228 | |
229 $final_bam = "final_bam"; | |
230 # concatenate all the bam files *must include the concatenated header file created above | |
231 $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ",@bamfiles)." "; | |
232 system($cmd); | |
233 | |
234 # sort the bam file so it can be indexed | |
235 $cmd = "samtools sort $final_bam 'last_bam'"; | |
236 system($cmd); | |
237 | |
238 # move bam file to where Galaxy expects it. | |
239 $cmd = "mv 'last_bam.bam' $bam_results"; | |
240 system($cmd); | |
241 } else { | |
242 # only one bam file, don't need to concatenate | |
243 move($bamfiles[0], $bam_results); | |
244 } | |
245 | |
246 # concatenate all pileup files | |
247 if (@pileupfiles > 1){ | |
248 $cmd = "cat ".join(" ",@pileupfiles)." > $pileup"; | |
249 system($cmd); | |
250 } else { | |
251 move($pileupfiles[0], $pileup); | |
252 } | |
253 | |
254 # perform find-replace to restore original user-specified file names | |
255 foreach my $gene (@genefiles){ | |
256 my $data = read_file($gene); | |
257 $data =~ s/temp_file_name/$file_name[0]/g; | |
258 write_file($gene, $data); | |
259 } | |
260 | |
261 foreach my $gene (@fullgenefiles){ | |
262 my $data = read_file($gene); | |
263 $data =~ s/temp_file_name/$file_name[0]/g; | |
264 write_file($gene, $data); | |
265 } | |
266 | |
267 # concatenate gene files with a space separating each file | |
268 if (@genefiles > 1){ | |
269 my $join = join(" <(echo) ", @genefiles); | |
270 my @args = ("bash", "-c", "cat $join > $genes_results"); | |
271 system(@args); | |
272 } else { | |
273 # only one gene results file | |
274 move($genefiles[0], $genes_results); | |
275 } | |
276 | |
277 # concatenate full gene results files but only keep header in first file. | |
278 if (@fullgenefiles >1){ | |
279 for (my $i= 1; $i < @fullgenefiles; $i++){ | |
280 # go through all files but the first one to remove headers | |
281 # create a temp file to save the file after header is removed | |
282 $temp_full = "temp_full".$i; | |
283 $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full"; | |
284 system($cmd); | |
285 unlink $fullgenefiles[$i]; | |
286 $fullgenefiles[$i] = $temp_full; | |
287 } | |
288 $cmd = "cat ". join(" ",@fullgenefiles)." > $fullgenes_results"; | |
289 system($cmd); | |
290 } else{ | |
291 # only one full gene results file | |
292 move($fullgenefiles[0], $fullgenes_results); | |
293 } | |
294 | |
295 # concatenate full gene results files but only keep header in first file. | |
296 if (@scoresfiles >1){ | |
297 for (my $i= 1; $i < @scoresfiles; $i++){ | |
298 # go through all files but the first one to remove headers | |
299 # create a temp file to save the file after header is removed | |
300 $temp_full = "temp_full".$i; | |
301 $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full"; | |
302 system($cmd); | |
303 unlink $scoresfiles[$i]; | |
304 $scoresfiles[$i] = $temp_full; | |
305 } | |
306 $cmd = "cat ". join(" ",@scoresfiles)." > $scores"; | |
307 system($cmd); | |
308 } else{ | |
309 # only one scores file | |
310 move($scoresfiles[0], $scores); | |
311 } | |
312 | |
313 # cleanup srst2 output and temp files | |
314 foreach my $file (@fullgenefiles){ | |
315 unlink $file; | |
316 } | |
317 foreach my $file (@genefiles){ | |
318 unlink $file; | |
319 } | |
320 foreach my $file (@pileupfiles){ | |
321 unlink $file; | |
322 } | |
323 foreach my $file (@bamfiles){ | |
324 unlink $file; | |
325 } | |
326 foreach my $file (@headers){ | |
327 unlink $file; | |
328 } | |
329 foreach my $file (@scoresfiles){ | |
330 unlink $file; | |
331 } | |
332 unlink $temp_head; | |
333 unlink $temp_full; | |
334 unlink $cat_header; | |
335 unlink $final_bam; | |
336 | |
337 #get rid of symlinks | |
338 if ($for_read) | |
339 { | |
340 unlink $for_read; | |
341 } | |
342 if ($rev_read) | |
343 { | |
344 unlink $rev_read; | |
345 } | |
346 if ($sing_read) | |
347 { | |
348 unlink $sing_read; | |
349 } | |
350 if ($database) | |
351 { | |
352 unlink $database; | |
353 } | |
354 $exit_code = $exit_code >> 8; | |
355 exit $exit_code; | |
356 | |
357 sub read_file { | |
358 my ($filename) = @_; | |
359 | |
360 open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading $!"; | |
361 local $/ = undef; | |
362 my $all = <$in>; | |
363 close $in; | |
364 | |
365 return $all; | |
366 } | |
367 | |
368 sub write_file { | |
369 my ($filename, $content) = @_; | |
370 | |
371 open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing $!";; | |
372 print $out $content; | |
373 close $out; | |
374 | |
375 return; | |
1
599a4dc309aa
planemo upload commit 1ea98fb88a93a571beda5bbd56449c946860a258
nml
parents:
0
diff
changeset
|
376 } |