#!/usr/bin/env perl


use strict;
use warnings;
use Cwd;
use File::Copy;

# The first 4 arguments should be in this format:
#   /path/to/srst2.py bam_output scores_output pileup_output ...
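# A hypothetical example invocation of this wrapper (all names are illustrative),
# here for a gene-db job ("g") with paired-end reads, in the order parsed below:
#   wrapper.pl /path/to/srst2.py out.bam out.scores out.pileup g \
#       genes.tsv fullgenes.tsv "resistance" sample_S1_L001_R1_001.fastq \
#       --input_pe reads_1.fastq reads_2.fastq --genedb resistance.fasta
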
my $binary = $ARGV[0];
shift(@ARGV);

my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases);
my ($allele_results, $allele_type);

$bam_results = $ARGV[0];
shift(@ARGV);
$scores = $ARGV[0];
shift(@ARGV);
$pileup = $ARGV[0];
shift(@ARGV);

# Now that the first 4 arguments have been shifted off, get the rest depending on the job type.
# A single letter tells us which of the three job types was selected.
$job_type = $ARGV[0];
shift(@ARGV);

# If m, MLST only: one MLST output file, plus the consensus-allele output and type.
if ($job_type eq "m")
{
    $txt_results = $ARGV[0];
    shift(@ARGV);
    $allele_results = $ARGV[0];
    shift(@ARGV);
    $allele_type = $ARGV[0];
    shift(@ARGV);
}
# If g, gene db only: two outputs, genes and fullgenes. We also get the database name(s).
elsif ($job_type eq "g")
{
    $genes_results = $ARGV[0];
    shift(@ARGV);
    $fullgenes_results = $ARGV[0];
    shift(@ARGV);
    $databases = $ARGV[0];
    shift(@ARGV);
}
# If b, both MLST and gene db: three output files plus the database name(s).
else
{
    $txt_results = $ARGV[0];
    shift(@ARGV);
    $genes_results = $ARGV[0];
    shift(@ARGV);
    $fullgenes_results = $ARGV[0];
    shift(@ARGV);
    $databases = $ARGV[0];
    shift(@ARGV);
}

# After the output files/database name(s), the next argument is the name of the reads.
# This lets SRST2 print a meaningful sample name instead of just 'dataset_xxx'.
my $filename = $ARGV[0];
shift(@ARGV);

# This index offset is used to locate the value of the '--genedb' option.
# With single-end input the genedb file is 3 positions into the remaining arguments:
#   --input_se sample.fastq --genedb database.fasta
my $index_offset = 3;

print @ARGV;

# Change the file extensions of the input reads so srst2 can use them.
# The read file name usually looks like: sample_S1_L001_R1_001.fastq
# Passing that name straight through confuses srst2, so we extract the sample
# name and use a fixed temporary name for the input files instead.
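# (e.g. "sample_S1_L001_R1_001.fastq" splits on '_' so $file_name[0] becomes "sample")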
my @file_name = split /_/, $filename;
$name = "temp_file_name";

my ($for_read, $rev_read, $sing_read, $database);
if ($ARGV[0] eq "--input_pe")
{
    # Increment the index offset because we have paired-end inputs.
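    # With paired-end reads the remaining arguments look like (hypothetical names):
    #   --input_pe sample_1.fastq sample_2.fastq --genedb database.fasta
    # so the --genedb value sits one position further along.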
    $index_offset++;

    $for_read = $name."_1.dat";
    $rev_read = $name."_2.dat";

    symlink($ARGV[1], $for_read);
    symlink($ARGV[2], $rev_read);

    $ARGV[1] = $for_read;
    $ARGV[2] = $rev_read;
}
else
{
    $sing_read = $name.".dat";

    symlink($ARGV[1], $sing_read);

    $ARGV[1] = $sing_read;
}
# If we are running a job that includes a gene db, use the database name(s) for the input file name(s).
if ($job_type eq 'g' || $job_type eq 'b')
{
    my @db_names = split /,/, $databases;
    my $num_db = @db_names;
    my %names_hash = ();
    # Loop through the db names to replace spaces with '_' and check for duplicates.
    for (my $i = 0; $i < $num_db; $i++){
        $db_names[$i] =~ s/ /_/g;
        if ( exists($names_hash{$db_names[$i]}) ) {
            print STDERR "More than one database with the same name\n";
            exit(1);
        } else {
            $names_hash{$db_names[$i]} = $db_names[$i];
        }
    }

    # Create a .dat symlink for each database file and point srst2 at the symlinks.
    foreach my $db_name (@db_names){
        (my $base = $db_name) =~ s/\.[^.]+$//;
        $database = $base.".dat";

        symlink($ARGV[$index_offset], $database);
        $ARGV[$index_offset] = $database;

        $index_offset++;
    }
}

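# Scan the pass-through arguments for bowtie2 insert-size settings and make sure
# --minins is not greater than --maxins before handing everything to srst2.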
for (my $i = 0; $i < @ARGV; $i++){
    if (index($ARGV[$i], "maxins") != -1){
        my ($maxins, $minins);
        my @b2args = split(' ', $ARGV[$i]);
        for (my $j = 0; $j < @b2args; $j++){
            if (index($b2args[$j], "maxins") != -1){
                $maxins = $b2args[$j+1];
            }
            if (index($b2args[$j], "minins") != -1){
                $minins = $b2args[$j+1];
            }
        }
        if ($maxins - $minins < 0){
            print STDERR "--minins cannot be greater than --maxins\n";
            exit(1);
        }
    }
}

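# Run srst2 itself, passing along the rewritten argument list.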
my $command = "python $binary @ARGV";

my $exit_code = system($command);

my $cur_dir = getcwd();
# Arrays for collecting output when multiple custom databases are used
# (each database produces its own output files, which need to be concatenated).
my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles);

# Go through the files in the output directory and move or collect them as required.
foreach my $file (<$cur_dir/*>)
{
    print $file, "\n";
    # Note: this will misfile any output whose name happens to contain 'mlst'.
    if ($file =~ /mlst/)
    {
        move($file, $txt_results);
    }
    elsif ($file =~ /\.bam$/)
    {
        push @bamfiles, $file;
    }
    elsif ($file =~ /\.scores$/)
    {
        push @scoresfiles, $file;
    }
    elsif ($file =~ /\.pileup$/)
    {
        push @pileupfiles, $file;
    }
    elsif ($file =~ /__fullgenes__/)
    {
        push @fullgenefiles, $file;
    }
    elsif ($file =~ /__genes__/)
    {
        push @genefiles, $file;
    }
    elsif ($file =~ /all_consensus_alleles\.fasta$/ && defined($allele_type) && $allele_type eq 'all')
    {
        move($file, $allele_results);
    }
    elsif ($file =~ /new_consensus_alleles\.fasta$/ && defined($allele_type) && $allele_type eq 'new')
    {
        move($file, $allele_results);
    }
}


my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers);

# Create one concatenated BAM file from all of the per-database BAM files.
if (@bamfiles > 1){
    my $counter = 0;
    $cat_header = "cat_header";
    while ($counter < @bamfiles) {
        $headers[$counter] = "bam_header".$counter;
        # Make a header file for each BAM results file.
        my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]";
        system($cmd);
        if ($counter >= 1){
            # Only keep the @HD and @PG lines from the first file, because the final
            # concatenated header may contain only one of each.
            $temp_head = "cut_head".$counter;
            # Cut off the first and last rows of each header (the @HD and @PG lines).
            $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head";
            system($cmd);
            unlink $headers[$counter];
            # Replace the old header with the trimmed header in the array.
            $headers[$counter] = $temp_head;
        }
        $counter++;
    }
    # Combine all header files.
    $cmd = "cat ".join(" ", @headers)." > $cat_header";
    system($cmd);

    $final_bam = "final_bam";
    # Concatenate all the BAM files; the merged header created above must be supplied.
    $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ", @bamfiles);
    system($cmd);

    # Sort the BAM file so it can be indexed ('last_bam' is the output prefix, giving last_bam.bam).
    $cmd = "samtools sort $final_bam 'last_bam'";
    system($cmd);

    # Move the BAM file to where Galaxy expects it.
    $cmd = "mv 'last_bam.bam' $bam_results";
    system($cmd);
} else {
    # Only one BAM file, so no concatenation is needed.
    move($bamfiles[0], $bam_results);
}

# Concatenate all pileup files.
if (@pileupfiles > 1){
    $cmd = "cat ".join(" ", @pileupfiles)." > $pileup";
    system($cmd);
} else {
    move($pileupfiles[0], $pileup);
}

# Find-and-replace to restore the original user-specified sample name in the reports.
foreach my $gene (@genefiles){
    my $data = read_file($gene);
    $data =~ s/temp_file_name/$file_name[0]/g;
    write_file($gene, $data);
}

foreach my $gene (@fullgenefiles){
    my $data = read_file($gene);
    $data =~ s/temp_file_name/$file_name[0]/g;
    write_file($gene, $data);
}

# Concatenate the gene report files, inserting a blank line between files.
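# ('<(echo)' is a bash process substitution that supplies a single newline between
# the concatenated files, which is why the command is run through "bash -c".)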
if (@genefiles > 1){
    my $join = join(" <(echo) ", @genefiles);
    my @args = ("bash", "-c", "cat $join > $genes_results");
    system(@args);
} else {
    # Only one gene results file.
    move($genefiles[0], $genes_results);
}

# Concatenate the fullgenes results files, keeping the header from the first file only.
if (@fullgenefiles > 1){
    for (my $i = 1; $i < @fullgenefiles; $i++){
        # Go through all files but the first one to remove their headers,
        # saving each trimmed file under a temp name.
        $temp_full = "temp_full".$i;
        $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full";
        system($cmd);
        unlink $fullgenefiles[$i];
        $fullgenefiles[$i] = $temp_full;
    }
    $cmd = "cat ".join(" ", @fullgenefiles)." > $fullgenes_results";
    system($cmd);
} else {
    # Only one fullgenes results file.
    move($fullgenefiles[0], $fullgenes_results);
}

# Concatenate the scores files, keeping the header from the first file only.
if (@scoresfiles > 1){
    for (my $i = 1; $i < @scoresfiles; $i++){
        # Go through all files but the first one to remove their headers,
        # saving each trimmed file under a temp name.
        $temp_full = "temp_full".$i;
        $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full";
        system($cmd);
        unlink $scoresfiles[$i];
        $scoresfiles[$i] = $temp_full;
    }
    $cmd = "cat ".join(" ", @scoresfiles)." > $scores";
    system($cmd);
} else {
    # Only one scores file.
    move($scoresfiles[0], $scores);
}

# Clean up srst2 output and temp files.
foreach my $file (@fullgenefiles){
    unlink $file;
}
foreach my $file (@genefiles){
    unlink $file;
}
foreach my $file (@pileupfiles){
    unlink $file;
}
foreach my $file (@bamfiles){
    unlink $file;
}
foreach my $file (@headers){
    unlink $file;
}
foreach my $file (@scoresfiles){
    unlink $file;
}
unlink $temp_head if defined $temp_head;
unlink $temp_full if defined $temp_full;
unlink $cat_header if defined $cat_header;
unlink $final_bam if defined $final_bam;

# Get rid of the input symlinks.
if ($for_read)
{
    unlink $for_read;
}
if ($rev_read)
{
    unlink $rev_read;
}
if ($sing_read)
{
    unlink $sing_read;
}
if ($database)
{
    unlink $database;
}

# system() returns the raw wait status; shift right by 8 to recover srst2's exit code.
$exit_code = $exit_code >> 8;
exit $exit_code;

# Slurp an entire file into a single string.
sub read_file {
    my ($filename) = @_;

    open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading: $!";
    local $/ = undef;
    my $all = <$in>;
    close $in;

    return $all;
}

# Write a string out as the complete contents of a file.
sub write_file {
    my ($filename, $content) = @_;

    open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing: $!";
    print $out $content;
    close $out;

    return;
}