comparison srst2.pl @ 0:6f870ed59b6e draft

Uploaded
author nml
date Mon, 06 Feb 2017 12:31:04 -0500
parents
children 599a4dc309aa
comparison
equal deleted inserted replaced
-1:000000000000 0:6f870ed59b6e
1 #!/usr/bin/env perl
2
3
4 use strict;
5 use warnings;
6 use Cwd;
7 use File::Copy;
8
9 #The first 4 arguments should be in this format:
10 # /path/to/srst2.py bam_output scores_output pileup_output ...
11
12 my $binary = $ARGV[0];
13 shift;
14
15 my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases);
16 my ($allele_results,$allele_type);
17
18
19 $bam_results = $ARGV[0];
20 shift(@ARGV);
21 $scores = $ARGV[0];
22 shift(@ARGV);
23 $pileup = $ARGV[0];
24 shift(@ARGV);
25
26 #Now that we shifted the first 4 arguments, get the rest depending on the job type. There are three:
27 #I pass a letter to tell us which job type was selected.
28 $job_type = $ARGV[0];
29 shift(@ARGV);
30
31 #If m, mlst only: we only have one mlst output file
32 if($job_type eq "m")
33 {
34 $txt_results = $ARGV[0];
35 shift(@ARGV);
36 $allele_results = $ARGV[0];
37 shift(@ARGV);
38 $allele_type = $ARGV[0];
39 shift(@ARGV);
40 }
41 #If g, genedb only: we have two outputs: genes and fullgenes. We also get the name of the database
42 elsif($job_type eq "g")
43 {
44 $genes_results = $ARGV[0];
45 shift(@ARGV);
46 $fullgenes_results = $ARGV[0];
47 shift(@ARGV);
48 $databases = $ARGV[0];
49 shift (@ARGV);
50 }
51 #If b, both mlst and genedb: We will have three output files and the database name.
52 else
53 {
54 $txt_results = $ARGV[0];
55 shift(@ARGV);
56 $genes_results = $ARGV[0];
57 shift(@ARGV);
58 $fullgenes_results = $ARGV[0];
59 shift(@ARGV);
60 $databases = $ARGV[0];
61 shift(@ARGV);
62 }
63
64 #After we get the output files/database name, now we get the name of the reads
65 #This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name
66 my $filename = $ARGV[0];
67 shift(@ARGV);
68
69 #This index offset is used to determine where the 'genedb' option is located.
70 #If only a single-end input, the genedb will be 3 positions into the arguments:
71 # -input_se sample.fastq --genedb database.fasta
72 my $index_offset = 3;
73
74 print @ARGV;
75 #change the file extensions of the input reads so srst can use them
76
77 #Usually the file name looks like this: sample_S1_L001_R1_001.fastq
78 #If we use this file name, it confuses srst2 a lot. So we just extract
79 #the sample name to use as input file name.
80 my @file_name = split /_/, $filename;
81 $name = "temp_file_name";
82
83 my ($for_read, $rev_read, $sing_read, $database);
84 if ($ARGV[0] eq "--input_pe")
85 {
86 #Increment index offset if we paired-end inputs
87 $index_offset++;
88
89 $for_read = $name."_1.dat";
90 $rev_read = $name."_2.dat";
91
92 symlink($ARGV[1], $for_read);
93 symlink($ARGV[2], $rev_read);
94
95 $ARGV[1] = $for_read;
96 $ARGV[2] = $rev_read;
97 }
98 else
99 {
100 $sing_read = $name.".dat";
101
102 symlink($ARGV[1], $sing_read);
103
104 $ARGV[1] = $sing_read;
105
106 }
107 #If we are running a job to include genedb, use the database name for input file name
108 if ($job_type eq 'g' | $job_type eq 'b')
109 {
110 my @db_names = split /,/, $databases;
111 my $num_db = @db_names;
112 my %names_hash = ();
113 # loop through dbs to replace spaces with _ and check for duplicates
114 for (my $i = 0; $i < $num_db; $i++){
115 $db_names[$i]=~s/ /_/g;
116 if( exists($names_hash{$db_names[$i]}) ) {
117 print STDERR "More than one database with the same name";
118 exit(1);
119 }else{
120 $names_hash{$db_names[$i]}=$db_names[$i];
121 }
122 }
123
124
125 foreach my $db_name (@db_names){
126 (my $base = $db_name) =~ s/\.[^.]+$//;
127 $database = $base.".dat";
128
129 symlink($ARGV[$index_offset], $database);
130 $ARGV[$index_offset] = $database;
131
132 $index_offset++;
133 }
134 }
135
136 for (my $i =0; $i< @ARGV; $i++){
137 if (index($ARGV[$i], "maxins") != -1){
138 my ($maxins, $minins);
139 my @b2args = split(' ', $ARGV[$i]);
140 for (my $j = 0; $j < @b2args; $j++){
141 if (index($b2args[$j], "maxins") != -1){
142 $maxins = $b2args[$j+1];
143 }
144 if (index($b2args[$j], "minins") != -1){
145 $minins = $b2args[$j+1];
146 }
147 }
148 if ($maxins - $minins < 0){
149 print STDERR "--minins cannot be greater than --maxins";
150 exit(1);
151 }
152 }
153 }
154
155
156
157 my $command = "python $binary @ARGV";
158
159 my $exit_code = system($command);
160
161 my $cur_dir = getcwd();
162 # make arrays for using multiple custom databases (creates multiple output files - need to be concatenated)
163 my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles);
164
165 # go through files in the output directory to move/concatenate them as required.
166
167 foreach my $file (<$cur_dir/*>)
168 {
169 print $file, "\n";
170 #Will cause problems if any files have 'mlst' in them
171 if ($file =~ /mlst/)
172 {
173 move($file, $txt_results);
174 }
175 elsif ($file =~ /\.bam$/)
176 {
177 push @bamfiles, $file;
178 }
179 elsif ($file =~ /\.scores$/)
180 {
181 push @scoresfiles, $file;
182 }
183 elsif ($file =~ /\.pileup$/)
184 {
185 push @pileupfiles, $file;
186 }
187 elsif ($file =~ /__fullgenes__/)
188 {
189 push @fullgenefiles, $file;
190 }
191 elsif ($file =~ /__genes__/)
192 {
193 push @genefiles, $file;
194 }
195 elsif ($file =~ /all_consensus_alleles.fasta$/ && $allele_type eq 'all') {
196 move($file,$allele_results);
197 }
198 elsif ($file =~ /new_consensus_alleles.fasta$/ && $allele_type eq 'new'){
199 move($file,$allele_results);
200 }
201 }
202
203
204 my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers );
205
206 # create new concatenated bam file with all bam files.
207 if (@bamfiles > 1){
208 my $counter = 0;
209 $cat_header = "cat_header";
210 while ($counter < @bamfiles) {
211 $headers[$counter] = "bam_header".$counter;
212 # make a header file for each bam results file
213 my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]";
214 system($cmd);
215 if ($counter >= 1){
216 # only keep the @hd and @pg from first file because the final concatenated file can only have one of each (doesn't matter location)
217 $temp_head="cut_head".$counter;
218 # cut off first row and last row of each file (the @HD and @PG)
219 $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head";
220 system($cmd);
221 unlink $headers[$counter];
222 # replace the old header with the new cut header in the array
223 $headers[$counter] = $temp_head;
224 }
225 $counter++;
226 }
227 # combine all header files
228 $cmd = "cat ".join(" ",@headers)." > $cat_header";
229 system($cmd);
230
231 $final_bam = "final_bam";
232 # concatenate all the bam files *must include the concatenated header file created above
233 $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ",@bamfiles)." ";
234 system($cmd);
235
236 # sort the bam file so it can be indexed
237 $cmd = "samtools sort $final_bam 'last_bam'";
238 system($cmd);
239
240 # move bam file to where Galaxy expects it.
241 $cmd = "mv 'last_bam.bam' $bam_results";
242 system($cmd);
243 } else {
244 # only one bam file, don't need to concatenate
245 move($bamfiles[0], $bam_results);
246 }
247
248 # concatenate all pileup files
249 if (@pileupfiles > 1){
250 $cmd = "cat ".join(" ",@pileupfiles)." > $pileup";
251 system($cmd);
252 } else {
253 move($pileupfiles[0], $pileup);
254 }
255
256 # perform find-replace to restore original user-specified file names
257 foreach my $gene (@genefiles){
258 my $data = read_file($gene);
259 $data =~ s/temp_file_name/$file_name[0]/g;
260 write_file($gene, $data);
261 }
262
263 foreach my $gene (@fullgenefiles){
264 my $data = read_file($gene);
265 $data =~ s/temp_file_name/$file_name[0]/g;
266 write_file($gene, $data);
267 }
268
269 # concatenate gene files with a space separating each file
270 if (@genefiles > 1){
271 my $join = join(" <(echo) ", @genefiles);
272 my @args = ("bash", "-c", "cat $join > $genes_results");
273 system(@args);
274 } else {
275 # only one gene results file
276 move($genefiles[0], $genes_results);
277 }
278
279 # concatenate full gene results files but only keep header in first file.
280 if (@fullgenefiles >1){
281 for (my $i= 1; $i < @fullgenefiles; $i++){
282 # go through all files but the first one to remove headers
283 # create a temp file to save the file after header is removed
284 $temp_full = "temp_full".$i;
285 $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full";
286 system($cmd);
287 unlink $fullgenefiles[$i];
288 $fullgenefiles[$i] = $temp_full;
289 }
290 $cmd = "cat ". join(" ",@fullgenefiles)." > $fullgenes_results";
291 system($cmd);
292 } else{
293 # only one full gene results file
294 move($fullgenefiles[0], $fullgenes_results);
295 }
296
297 # concatenate full gene results files but only keep header in first file.
298 if (@scoresfiles >1){
299 for (my $i= 1; $i < @scoresfiles; $i++){
300 # go through all files but the first one to remove headers
301 # create a temp file to save the file after header is removed
302 $temp_full = "temp_full".$i;
303 $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full";
304 system($cmd);
305 unlink $scoresfiles[$i];
306 $scoresfiles[$i] = $temp_full;
307 }
308 $cmd = "cat ". join(" ",@scoresfiles)." > $scores";
309 system($cmd);
310 } else{
311 # only one scores file
312 move($scoresfiles[0], $scores);
313 }
314
315 # cleanup srst2 output and temp files
316 foreach my $file (@fullgenefiles){
317 unlink $file;
318 }
319 foreach my $file (@genefiles){
320 unlink $file;
321 }
322 foreach my $file (@pileupfiles){
323 unlink $file;
324 }
325 foreach my $file (@bamfiles){
326 unlink $file;
327 }
328 foreach my $file (@headers){
329 unlink $file;
330 }
331 foreach my $file (@scoresfiles){
332 unlink $file;
333 }
334 unlink $temp_head;
335 unlink $temp_full;
336 unlink $cat_header;
337 unlink $final_bam;
338
339 #get rid of symlinks
340 if ($for_read)
341 {
342 unlink $for_read;
343 }
344 if ($rev_read)
345 {
346 unlink $rev_read;
347 }
348 if ($sing_read)
349 {
350 unlink $sing_read;
351 }
352 if ($database)
353 {
354 unlink $database;
355 }
356 $exit_code = $exit_code >> 8;
357 exit $exit_code;
358
359 sub read_file {
360 my ($filename) = @_;
361
362 open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading $!";
363 local $/ = undef;
364 my $all = <$in>;
365 close $in;
366
367 return $all;
368 }
369
370 sub write_file {
371 my ($filename, $content) = @_;
372
373 open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing $!";;
374 print $out $content;
375 close $out;
376
377 return;
378 }