Mercurial > repos > nml > srst2
view srst2.pl @ 0:6f870ed59b6e draft
Uploaded
author | nml |
---|---|
date | Mon, 06 Feb 2017 12:31:04 -0500 |
parents | |
children | 599a4dc309aa |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; use Cwd; use File::Copy; #The first 4 arguments should be in this format: # /path/to/srst2.py bam_output scores_output pileup_output ... my $binary = $ARGV[0]; shift; my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases); my ($allele_results,$allele_type); $bam_results = $ARGV[0]; shift(@ARGV); $scores = $ARGV[0]; shift(@ARGV); $pileup = $ARGV[0]; shift(@ARGV); #Now that we shifted the first 4 arguments, get the rest depending on the job type. There are three: #I pass a letter to tell us which job type was selected. $job_type = $ARGV[0]; shift(@ARGV); #If m, mlst only: we only have one mlst output file if($job_type eq "m") { $txt_results = $ARGV[0]; shift(@ARGV); $allele_results = $ARGV[0]; shift(@ARGV); $allele_type = $ARGV[0]; shift(@ARGV); } #If g, genedb only: we have two outputs: genes and fullgenes. We also get the name of the database elsif($job_type eq "g") { $genes_results = $ARGV[0]; shift(@ARGV); $fullgenes_results = $ARGV[0]; shift(@ARGV); $databases = $ARGV[0]; shift (@ARGV); } #If b, both mlst and genedb: We will have three output files and the database name. else { $txt_results = $ARGV[0]; shift(@ARGV); $genes_results = $ARGV[0]; shift(@ARGV); $fullgenes_results = $ARGV[0]; shift(@ARGV); $databases = $ARGV[0]; shift(@ARGV); } #After we get the output files/database name, now we get the name of the reads #This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name my $filename = $ARGV[0]; shift(@ARGV); #This index offset is used to determine where the 'genedb' option is located. #If only a single-end input, the genedb will be 3 positions into the arguments: # -input_se sample.fastq --genedb database.fasta my $index_offset = 3; print @ARGV; #change the file extensions of the input reads so srst can use them #Usually the file name looks like this: sample_S1_L001_R1_001.fastq #If we use this file name, it confuses srst2 a lot. So we just extract #the sample name to use as input file name. my @file_name = split /_/, $filename; $name = "temp_file_name"; my ($for_read, $rev_read, $sing_read, $database); if ($ARGV[0] eq "--input_pe") { #Increment index offset if we paired-end inputs $index_offset++; $for_read = $name."_1.dat"; $rev_read = $name."_2.dat"; symlink($ARGV[1], $for_read); symlink($ARGV[2], $rev_read); $ARGV[1] = $for_read; $ARGV[2] = $rev_read; } else { $sing_read = $name.".dat"; symlink($ARGV[1], $sing_read); $ARGV[1] = $sing_read; } #If we are running a job to include genedb, use the database name for input file name if ($job_type eq 'g' | $job_type eq 'b') { my @db_names = split /,/, $databases; my $num_db = @db_names; my %names_hash = (); # loop through dbs to replace spaces with _ and check for duplicates for (my $i = 0; $i < $num_db; $i++){ $db_names[$i]=~s/ /_/g; if( exists($names_hash{$db_names[$i]}) ) { print STDERR "More than one database with the same name"; exit(1); }else{ $names_hash{$db_names[$i]}=$db_names[$i]; } } foreach my $db_name (@db_names){ (my $base = $db_name) =~ s/\.[^.]+$//; $database = $base.".dat"; symlink($ARGV[$index_offset], $database); $ARGV[$index_offset] = $database; $index_offset++; } } for (my $i =0; $i< @ARGV; $i++){ if (index($ARGV[$i], "maxins") != -1){ my ($maxins, $minins); my @b2args = split(' ', $ARGV[$i]); for (my $j = 0; $j < @b2args; $j++){ if (index($b2args[$j], "maxins") != -1){ $maxins = $b2args[$j+1]; } if (index($b2args[$j], "minins") != -1){ $minins = $b2args[$j+1]; } } if ($maxins - $minins < 0){ print STDERR "--minins cannot be greater than --maxins"; exit(1); } } } my $command = "python $binary @ARGV"; my $exit_code = system($command); my $cur_dir = getcwd(); # make arrays for using multiple custom databases (creates multiple output files - need to be concatenated) my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles); # go through files in the output directory to move/concatenate them as required. foreach my $file (<$cur_dir/*>) { print $file, "\n"; #Will cause problems if any files have 'mlst' in them if ($file =~ /mlst/) { move($file, $txt_results); } elsif ($file =~ /\.bam$/) { push @bamfiles, $file; } elsif ($file =~ /\.scores$/) { push @scoresfiles, $file; } elsif ($file =~ /\.pileup$/) { push @pileupfiles, $file; } elsif ($file =~ /__fullgenes__/) { push @fullgenefiles, $file; } elsif ($file =~ /__genes__/) { push @genefiles, $file; } elsif ($file =~ /all_consensus_alleles.fasta$/ && $allele_type eq 'all') { move($file,$allele_results); } elsif ($file =~ /new_consensus_alleles.fasta$/ && $allele_type eq 'new'){ move($file,$allele_results); } } my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers ); # create new concatenated bam file with all bam files. if (@bamfiles > 1){ my $counter = 0; $cat_header = "cat_header"; while ($counter < @bamfiles) { $headers[$counter] = "bam_header".$counter; # make a header file for each bam results file my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]"; system($cmd); if ($counter >= 1){ # only keep the @hd and @pg from first file because the final concatenated file can only have one of each (doesn't matter location) $temp_head="cut_head".$counter; # cut off first row and last row of each file (the @HD and @PG) $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head"; system($cmd); unlink $headers[$counter]; # replace the old header with the new cut header in the array $headers[$counter] = $temp_head; } $counter++; } # combine all header files $cmd = "cat ".join(" ",@headers)." > $cat_header"; system($cmd); $final_bam = "final_bam"; # concatenate all the bam files *must include the concatenated header file created above $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ",@bamfiles)." "; system($cmd); # sort the bam file so it can be indexed $cmd = "samtools sort $final_bam 'last_bam'"; system($cmd); # move bam file to where Galaxy expects it. $cmd = "mv 'last_bam.bam' $bam_results"; system($cmd); } else { # only one bam file, don't need to concatenate move($bamfiles[0], $bam_results); } # concatenate all pileup files if (@pileupfiles > 1){ $cmd = "cat ".join(" ",@pileupfiles)." > $pileup"; system($cmd); } else { move($pileupfiles[0], $pileup); } # perform find-replace to restore original user-specified file names foreach my $gene (@genefiles){ my $data = read_file($gene); $data =~ s/temp_file_name/$file_name[0]/g; write_file($gene, $data); } foreach my $gene (@fullgenefiles){ my $data = read_file($gene); $data =~ s/temp_file_name/$file_name[0]/g; write_file($gene, $data); } # concatenate gene files with a space separating each file if (@genefiles > 1){ my $join = join(" <(echo) ", @genefiles); my @args = ("bash", "-c", "cat $join > $genes_results"); system(@args); } else { # only one gene results file move($genefiles[0], $genes_results); } # concatenate full gene results files but only keep header in first file. if (@fullgenefiles >1){ for (my $i= 1; $i < @fullgenefiles; $i++){ # go through all files but the first one to remove headers # create a temp file to save the file after header is removed $temp_full = "temp_full".$i; $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full"; system($cmd); unlink $fullgenefiles[$i]; $fullgenefiles[$i] = $temp_full; } $cmd = "cat ". join(" ",@fullgenefiles)." > $fullgenes_results"; system($cmd); } else{ # only one full gene results file move($fullgenefiles[0], $fullgenes_results); } # concatenate full gene results files but only keep header in first file. if (@scoresfiles >1){ for (my $i= 1; $i < @scoresfiles; $i++){ # go through all files but the first one to remove headers # create a temp file to save the file after header is removed $temp_full = "temp_full".$i; $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full"; system($cmd); unlink $scoresfiles[$i]; $scoresfiles[$i] = $temp_full; } $cmd = "cat ". join(" ",@scoresfiles)." > $scores"; system($cmd); } else{ # only one scores file move($scoresfiles[0], $scores); } # cleanup srst2 output and temp files foreach my $file (@fullgenefiles){ unlink $file; } foreach my $file (@genefiles){ unlink $file; } foreach my $file (@pileupfiles){ unlink $file; } foreach my $file (@bamfiles){ unlink $file; } foreach my $file (@headers){ unlink $file; } foreach my $file (@scoresfiles){ unlink $file; } unlink $temp_head; unlink $temp_full; unlink $cat_header; unlink $final_bam; #get rid of symlinks if ($for_read) { unlink $for_read; } if ($rev_read) { unlink $rev_read; } if ($sing_read) { unlink $sing_read; } if ($database) { unlink $database; } $exit_code = $exit_code >> 8; exit $exit_code; sub read_file { my ($filename) = @_; open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading $!"; local $/ = undef; my $all = <$in>; close $in; return $all; } sub write_file { my ($filename, $content) = @_; open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing $!";; print $out $content; close $out; return; }