srst2.pl @ 0:6f870ed59b6e (draft)
commit message: Uploaded

author | nml
---|---
date | Mon, 06 Feb 2017 12:31:04 -0500
parents | (none)
children | 599a4dc309aa
#!/usr/bin/env perl

use strict;
use warnings;
use Cwd;
use File::Copy;

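# Galaxy wrapper around srst2.py: it reads the expected output paths and the job
# type from the leading arguments, symlinks the input reads (and any gene
# databases) to names srst2 will accept, runs srst2, then moves/concatenates the
# results to the locations Galaxy expects.
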
#The first 4 arguments should be in this format:
# /path/to/srst2.py bam_output scores_output pileup_output ...

my $binary = $ARGV[0];
shift;

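# Illustrative invocation for a genedb-only ("g") job -- the file names and flags
# shown here are hypothetical, only the argument order matters (it mirrors the
# parsing below):
#   srst2.pl /path/to/srst2.py out.bam out.scores out.pileup g \
#            out.genes out.fullgenes mydb sample_S1_L001_R1_001.fastq \
#            --input_se reads.fastq --gene_db mydb.fasta
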
my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases);
my ($allele_results, $allele_type);

$bam_results = $ARGV[0];
shift(@ARGV);
$scores = $ARGV[0];
shift(@ARGV);
$pileup = $ARGV[0];
shift(@ARGV);

#Now that the first 4 arguments have been shifted off, get the rest depending on the job type.
#There are three job types; a single letter tells us which one was selected.
$job_type = $ARGV[0];
shift(@ARGV);

#If m, mlst only: read the mlst results file, the consensus alleles output file,
#and which alleles to report ('all' or 'new')
if($job_type eq "m")
{
    $txt_results = $ARGV[0];
    shift(@ARGV);
    $allele_results = $ARGV[0];
    shift(@ARGV);
    $allele_type = $ARGV[0];
    shift(@ARGV);
}
#If g, genedb only: we have two outputs, genes and fullgenes. We also get the database name(s)
elsif($job_type eq "g")
{
    $genes_results = $ARGV[0];
    shift(@ARGV);
    $fullgenes_results = $ARGV[0];
    shift(@ARGV);
    $databases = $ARGV[0];
    shift(@ARGV);
}
#If b, both mlst and genedb: we will have three output files and the database name(s).
else
{
    $txt_results = $ARGV[0];
    shift(@ARGV);
    $genes_results = $ARGV[0];
    shift(@ARGV);
    $fullgenes_results = $ARGV[0];
    shift(@ARGV);
    $databases = $ARGV[0];
    shift(@ARGV);
}

#After the output files/database name(s), the next argument is the name of the reads.
#This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name
my $filename = $ARGV[0];
shift(@ARGV);

#This index offset is used to determine where the gene database file sits in the
#remaining arguments. For a single-end input it will be 3 positions in:
# --input_se sample.fastq --gene_db database.fasta
my $index_offset = 3;

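# For genedb jobs the wrapper expects the remaining @ARGV to look like:
#   single-end: [0] --input_se  [1] reads             [2] --gene_db  [3] database(s)...
#   paired-end: [0] --input_pe  [1] read1  [2] read2  [3] --gene_db  [4] database(s)...
# which is why the offset is bumped by one in the paired-end branch below.
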
print @ARGV; # debug: show the arguments that will be passed through to srst2

#Change the file names/extensions of the input reads so srst2 can use them.

#Usually the file name looks like this: sample_S1_L001_R1_001.fastq
#Using that name directly confuses srst2, so the reads are symlinked to a fixed
#temporary name below and the real sample name (the part before the first '_')
#is substituted back into the outputs later.
my @file_name = split /_/, $filename;
$name = "temp_file_name";

my ($for_read, $rev_read, $sing_read, $database);
if ($ARGV[0] eq "--input_pe")
{
    #Increment the index offset if we have paired-end inputs (two read files
    #precede the gene database instead of one)
    $index_offset++;

    $for_read = $name."_1.dat";
    $rev_read = $name."_2.dat";

    symlink($ARGV[1], $for_read);
    symlink($ARGV[2], $rev_read);

    $ARGV[1] = $for_read;
    $ARGV[2] = $rev_read;
}
else
{
    $sing_read = $name.".dat";

    symlink($ARGV[1], $sing_read);

    $ARGV[1] = $sing_read;
}
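
# srst2 now sees temp_file_name_1.dat/temp_file_name_2.dat (or temp_file_name.dat
# for single-end data); the user's sample name is restored in the gene reports
# after srst2 has run (see the find-replace further down).
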
#If we are running a job that includes a genedb, use the user-supplied database
#name(s) for the symlinked database file name(s)
if ($job_type eq 'g' || $job_type eq 'b')
{
    my @db_names = split /,/, $databases;
    my $num_db = @db_names;
    my %names_hash = ();
    # loop through dbs to replace spaces with _ and check for duplicates
    for (my $i = 0; $i < $num_db; $i++){
        $db_names[$i] =~ s/ /_/g;
        if( exists($names_hash{$db_names[$i]}) ) {
            print STDERR "More than one database with the same name\n";
            exit(1);
        }else{
            $names_hash{$db_names[$i]} = $db_names[$i];
        }
    }

    foreach my $db_name (@db_names){
        (my $base = $db_name) =~ s/\.[^.]+$//;
        $database = $base.".dat";

        symlink($ARGV[$index_offset], $database);
        $ARGV[$index_offset] = $database;

        $index_offset++;
    }
}
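
# Each database file ends up symlinked as <user-supplied name>.dat (spaces replaced
# with '_', any extension stripped), and @ARGV is rewritten to point srst2 at the
# symlinks instead of the original files.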

# sanity-check any bowtie2-style insert-size options passed through to srst2:
# --minins must not be greater than --maxins
for (my $i = 0; $i < @ARGV; $i++){
    if (index($ARGV[$i], "maxins") != -1){
        my ($maxins, $minins);
        my @b2args = split(' ', $ARGV[$i]);
        for (my $j = 0; $j < @b2args; $j++){
            if (index($b2args[$j], "maxins") != -1){
                $maxins = $b2args[$j+1];
            }
            if (index($b2args[$j], "minins") != -1){
                $minins = $b2args[$j+1];
            }
        }
        if (defined $maxins && defined $minins && $maxins < $minins){
            print STDERR "--minins cannot be greater than --maxins\n";
            exit(1);
        }
    }
}
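
# For example (hypothetical values), an @ARGV element such as
#   "--minins 300 --maxins 1000"
# is split on whitespace above and the two values are compared.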

my $command = "python $binary @ARGV";

my $exit_code = system($command);

my $cur_dir = getcwd();
# make arrays for using multiple custom databases (creates multiple output files - need to be concatenated)
my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles);

# go through files in the output directory to move/concatenate them as required.
foreach my $file (<$cur_dir/*>)
{
    print $file, "\n";
    #Will cause problems if any files have 'mlst' in them
    if ($file =~ /mlst/)
    {
        move($file, $txt_results);
    }
    elsif ($file =~ /\.bam$/)
    {
        push @bamfiles, $file;
    }
    elsif ($file =~ /\.scores$/)
    {
        push @scoresfiles, $file;
    }
    elsif ($file =~ /\.pileup$/)
    {
        push @pileupfiles, $file;
    }
    elsif ($file =~ /__fullgenes__/)
    {
        push @fullgenefiles, $file;
    }
    elsif ($file =~ /__genes__/)
    {
        push @genefiles, $file;
    }
    elsif ($file =~ /all_consensus_alleles.fasta$/ && defined $allele_type && $allele_type eq 'all') {
        move($file, $allele_results);
    }
    elsif ($file =~ /new_consensus_alleles.fasta$/ && defined $allele_type && $allele_type eq 'new'){
        move($file, $allele_results);
    }
}
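
# With multiple custom databases srst2 writes one bam/scores/pileup/genes/fullgenes
# file per database, so everything collected above is merged below into the single
# output files Galaxy expects.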

my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers);

# create a new concatenated bam file from all the bam files.
if (@bamfiles > 1){
    my $counter = 0;
    $cat_header = "cat_header";
    while ($counter < @bamfiles) {
        $headers[$counter] = "bam_header".$counter;
        # make a header file for each bam results file
        my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]";
        system($cmd);
        if ($counter >= 1){
            # only keep the @HD and @PG lines from the first file because the final
            # concatenated header can only have one of each (location doesn't matter)
            $temp_head = "cut_head".$counter;
            # cut off the first and last row of each header (the @HD and @PG lines)
            $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head";
            system($cmd);
            unlink $headers[$counter];
            # replace the old header with the new cut header in the array
            $headers[$counter] = $temp_head;
        }
        $counter++;
    }
    # combine all header files
    $cmd = "cat ".join(" ", @headers)." > $cat_header";
    system($cmd);

    $final_bam = "final_bam";
    # concatenate all the bam files (must include the combined header file created above)
    $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ", @bamfiles);
    system($cmd);

    # sort the bam file so it can be indexed
    $cmd = "samtools sort $final_bam 'last_bam'";
    system($cmd);

    # move the bam file to where Galaxy expects it.
    $cmd = "mv 'last_bam.bam' $bam_results";
    system($cmd);
} else {
    # only one bam file, no need to concatenate
    move($bamfiles[0], $bam_results);
}
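
# Note: "samtools sort $final_bam 'last_bam'" relies on the older samtools sort
# interface where the second argument is an output prefix (producing last_bam.bam);
# with newer samtools (1.3+) the equivalent would be "samtools sort -o last_bam.bam final_bam".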

# concatenate all pileup files
if (@pileupfiles > 1){
    $cmd = "cat ".join(" ", @pileupfiles)." > $pileup";
    system($cmd);
} else {
    move($pileupfiles[0], $pileup);
}

# find-replace to restore the original user-specified sample name (replacing the
# temp_file_name placeholder) in the genes and fullgenes reports
# (read_file/write_file are the small slurp/rewrite helpers defined at the bottom of this script)
foreach my $gene (@genefiles){
    my $data = read_file($gene);
    $data =~ s/temp_file_name/$file_name[0]/g;
    write_file($gene, $data);
}

foreach my $gene (@fullgenefiles){
    my $data = read_file($gene);
    $data =~ s/temp_file_name/$file_name[0]/g;
    write_file($gene, $data);
}

# concatenate gene files, with <(echo) inserting a newline between each file
if (@genefiles > 1){
    my $join = join(" <(echo) ", @genefiles);
    my @args = ("bash", "-c", "cat $join > $genes_results");
    system(@args);
} else {
    # only one gene results file
    move($genefiles[0], $genes_results);
}

# concatenate full gene results files but only keep the header from the first file.
if (@fullgenefiles > 1){
    for (my $i = 1; $i < @fullgenefiles; $i++){
        # go through all files but the first one to remove headers
        # create a temp file to save the file after the header is removed
        $temp_full = "temp_full".$i;
        $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full";
        system($cmd);
        unlink $fullgenefiles[$i];
        $fullgenefiles[$i] = $temp_full;
    }
    $cmd = "cat ".join(" ", @fullgenefiles)." > $fullgenes_results";
    system($cmd);
} else {
    # only one full gene results file
    move($fullgenefiles[0], $fullgenes_results);
}

# concatenate scores files but only keep the header from the first file.
if (@scoresfiles > 1){
    for (my $i = 1; $i < @scoresfiles; $i++){
        # go through all files but the first one to remove headers
        # create a temp file to save the file after the header is removed
        $temp_full = "temp_full".$i;
        $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full";
        system($cmd);
        unlink $scoresfiles[$i];
        $scoresfiles[$i] = $temp_full;
    }
    $cmd = "cat ".join(" ", @scoresfiles)." > $scores";
    system($cmd);
} else {
    # only one scores file
    move($scoresfiles[0], $scores);
}

# cleanup srst2 output and temp files
foreach my $file (@fullgenefiles){
    unlink $file;
}
foreach my $file (@genefiles){
    unlink $file;
}
foreach my $file (@pileupfiles){
    unlink $file;
}
foreach my $file (@bamfiles){
    unlink $file;
}
foreach my $file (@headers){
    unlink $file;
}
foreach my $file (@scoresfiles){
    unlink $file;
}
# these temp files only exist when multiple output files had to be merged
unlink $temp_head if defined $temp_head;
unlink $temp_full if defined $temp_full;
unlink $cat_header if defined $cat_header;
unlink $final_bam if defined $final_bam;

#get rid of symlinks (note: $database only tracks the most recently created database symlink)
if ($for_read)
{
    unlink $for_read;
}
if ($rev_read)
{
    unlink $rev_read;
}
if ($sing_read)
{
    unlink $sing_read;
}
if ($database)
{
    unlink $database;
}
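
# system() returned the raw wait status, so shift out the high byte to recover
# srst2's actual exit code before propagating it.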
$exit_code = $exit_code >> 8;
exit $exit_code;

# slurp an entire file into a single string
sub read_file {
    my ($filename) = @_;

    open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading: $!";
    local $/ = undef;
    my $all = <$in>;
    close $in;

    return $all;
}

# overwrite a file with the given content
sub write_file {
    my ($filename, $content) = @_;

    open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing: $!";
    print $out $content;
    close $out;

    return;
}