Mercurial > repos > nml > srst2
diff srst2.pl @ 0:6f870ed59b6e draft
Uploaded
author | nml |
---|---|
date | Mon, 06 Feb 2017 12:31:04 -0500 |
parents | |
children | 599a4dc309aa |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srst2.pl Mon Feb 06 12:31:04 2017 -0500 @@ -0,0 +1,378 @@ +#!/usr/bin/env perl + + +use strict; +use warnings; +use Cwd; +use File::Copy; + +#The first 4 arguments should be in this format: +# /path/to/srst2.py bam_output scores_output pileup_output ... + +my $binary = $ARGV[0]; +shift; + +my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases); +my ($allele_results,$allele_type); + + +$bam_results = $ARGV[0]; +shift(@ARGV); +$scores = $ARGV[0]; +shift(@ARGV); +$pileup = $ARGV[0]; +shift(@ARGV); + +#Now that we shifted the first 4 arguments, get the rest depending on the job type. There are three: +#I pass a letter to tell us which job type was selected. +$job_type = $ARGV[0]; +shift(@ARGV); + +#If m, mlst only: we only have one mlst output file +if($job_type eq "m") +{ + $txt_results = $ARGV[0]; + shift(@ARGV); + $allele_results = $ARGV[0]; + shift(@ARGV); + $allele_type = $ARGV[0]; + shift(@ARGV); +} +#If g, genedb only: we have two outputs: genes and fullgenes. We also get the name of the database +elsif($job_type eq "g") +{ + $genes_results = $ARGV[0]; + shift(@ARGV); + $fullgenes_results = $ARGV[0]; + shift(@ARGV); + $databases = $ARGV[0]; + shift (@ARGV); +} +#If b, both mlst and genedb: We will have three output files and the database name. +else +{ + $txt_results = $ARGV[0]; + shift(@ARGV); + $genes_results = $ARGV[0]; + shift(@ARGV); + $fullgenes_results = $ARGV[0]; + shift(@ARGV); + $databases = $ARGV[0]; + shift(@ARGV); +} + +#After we get the output files/database name, now we get the name of the reads +#This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name +my $filename = $ARGV[0]; +shift(@ARGV); + +#This index offset is used to determine where the 'genedb' option is located. +#If only a single-end input, the genedb will be 3 positions into the arguments: +# -input_se sample.fastq --genedb database.fasta +my $index_offset = 3; + +print @ARGV; +#change the file extensions of the input reads so srst can use them + +#Usually the file name looks like this: sample_S1_L001_R1_001.fastq +#If we use this file name, it confuses srst2 a lot. So we just extract +#the sample name to use as input file name. +my @file_name = split /_/, $filename; +$name = "temp_file_name"; + +my ($for_read, $rev_read, $sing_read, $database); +if ($ARGV[0] eq "--input_pe") +{ + #Increment index offset if we paired-end inputs + $index_offset++; + + $for_read = $name."_1.dat"; + $rev_read = $name."_2.dat"; + + symlink($ARGV[1], $for_read); + symlink($ARGV[2], $rev_read); + + $ARGV[1] = $for_read; + $ARGV[2] = $rev_read; +} +else +{ + $sing_read = $name.".dat"; + + symlink($ARGV[1], $sing_read); + + $ARGV[1] = $sing_read; + +} +#If we are running a job to include genedb, use the database name for input file name +if ($job_type eq 'g' | $job_type eq 'b') +{ + my @db_names = split /,/, $databases; + my $num_db = @db_names; + my %names_hash = (); + # loop through dbs to replace spaces with _ and check for duplicates + for (my $i = 0; $i < $num_db; $i++){ + $db_names[$i]=~s/ /_/g; + if( exists($names_hash{$db_names[$i]}) ) { + print STDERR "More than one database with the same name"; + exit(1); + }else{ + $names_hash{$db_names[$i]}=$db_names[$i]; + } + } + + + foreach my $db_name (@db_names){ + (my $base = $db_name) =~ s/\.[^.]+$//; + $database = $base.".dat"; + + symlink($ARGV[$index_offset], $database); + $ARGV[$index_offset] = $database; + + $index_offset++; + } +} + +for (my $i =0; $i< @ARGV; $i++){ + if (index($ARGV[$i], "maxins") != -1){ + my ($maxins, $minins); + my @b2args = split(' ', $ARGV[$i]); + for (my $j = 0; $j < @b2args; $j++){ + if (index($b2args[$j], "maxins") != -1){ + $maxins = $b2args[$j+1]; + } + if (index($b2args[$j], "minins") != -1){ + $minins = $b2args[$j+1]; + } + } + if ($maxins - $minins < 0){ + print STDERR "--minins cannot be greater than --maxins"; + exit(1); + } + } +} + + + +my $command = "python $binary @ARGV"; + +my $exit_code = system($command); + +my $cur_dir = getcwd(); +# make arrays for using multiple custom databases (creates multiple output files - need to be concatenated) +my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles); + +# go through files in the output directory to move/concatenate them as required. + +foreach my $file (<$cur_dir/*>) +{ + print $file, "\n"; + #Will cause problems if any files have 'mlst' in them + if ($file =~ /mlst/) + { + move($file, $txt_results); + } + elsif ($file =~ /\.bam$/) + { + push @bamfiles, $file; + } + elsif ($file =~ /\.scores$/) + { + push @scoresfiles, $file; + } + elsif ($file =~ /\.pileup$/) + { + push @pileupfiles, $file; + } + elsif ($file =~ /__fullgenes__/) + { + push @fullgenefiles, $file; + } + elsif ($file =~ /__genes__/) + { + push @genefiles, $file; + } + elsif ($file =~ /all_consensus_alleles.fasta$/ && $allele_type eq 'all') { + move($file,$allele_results); + } + elsif ($file =~ /new_consensus_alleles.fasta$/ && $allele_type eq 'new'){ + move($file,$allele_results); + } +} + + +my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers ); + +# create new concatenated bam file with all bam files. +if (@bamfiles > 1){ + my $counter = 0; + $cat_header = "cat_header"; + while ($counter < @bamfiles) { + $headers[$counter] = "bam_header".$counter; + # make a header file for each bam results file + my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]"; + system($cmd); + if ($counter >= 1){ + # only keep the @hd and @pg from first file because the final concatenated file can only have one of each (doesn't matter location) + $temp_head="cut_head".$counter; + # cut off first row and last row of each file (the @HD and @PG) + $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head"; + system($cmd); + unlink $headers[$counter]; + # replace the old header with the new cut header in the array + $headers[$counter] = $temp_head; + } + $counter++; + } + # combine all header files + $cmd = "cat ".join(" ",@headers)." > $cat_header"; + system($cmd); + + $final_bam = "final_bam"; + # concatenate all the bam files *must include the concatenated header file created above + $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ",@bamfiles)." "; + system($cmd); + + # sort the bam file so it can be indexed + $cmd = "samtools sort $final_bam 'last_bam'"; + system($cmd); + + # move bam file to where Galaxy expects it. + $cmd = "mv 'last_bam.bam' $bam_results"; + system($cmd); +} else { + # only one bam file, don't need to concatenate + move($bamfiles[0], $bam_results); +} + +# concatenate all pileup files +if (@pileupfiles > 1){ + $cmd = "cat ".join(" ",@pileupfiles)." > $pileup"; + system($cmd); +} else { + move($pileupfiles[0], $pileup); +} + +# perform find-replace to restore original user-specified file names +foreach my $gene (@genefiles){ + my $data = read_file($gene); + $data =~ s/temp_file_name/$file_name[0]/g; + write_file($gene, $data); +} + +foreach my $gene (@fullgenefiles){ + my $data = read_file($gene); + $data =~ s/temp_file_name/$file_name[0]/g; + write_file($gene, $data); +} + +# concatenate gene files with a space separating each file +if (@genefiles > 1){ + my $join = join(" <(echo) ", @genefiles); + my @args = ("bash", "-c", "cat $join > $genes_results"); + system(@args); +} else { + # only one gene results file + move($genefiles[0], $genes_results); +} + +# concatenate full gene results files but only keep header in first file. +if (@fullgenefiles >1){ + for (my $i= 1; $i < @fullgenefiles; $i++){ + # go through all files but the first one to remove headers + # create a temp file to save the file after header is removed + $temp_full = "temp_full".$i; + $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full"; + system($cmd); + unlink $fullgenefiles[$i]; + $fullgenefiles[$i] = $temp_full; + } + $cmd = "cat ". join(" ",@fullgenefiles)." > $fullgenes_results"; + system($cmd); +} else{ + # only one full gene results file + move($fullgenefiles[0], $fullgenes_results); +} + +# concatenate full gene results files but only keep header in first file. +if (@scoresfiles >1){ + for (my $i= 1; $i < @scoresfiles; $i++){ + # go through all files but the first one to remove headers + # create a temp file to save the file after header is removed + $temp_full = "temp_full".$i; + $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full"; + system($cmd); + unlink $scoresfiles[$i]; + $scoresfiles[$i] = $temp_full; + } + $cmd = "cat ". join(" ",@scoresfiles)." > $scores"; + system($cmd); +} else{ + # only one scores file + move($scoresfiles[0], $scores); +} + +# cleanup srst2 output and temp files +foreach my $file (@fullgenefiles){ + unlink $file; +} +foreach my $file (@genefiles){ + unlink $file; +} +foreach my $file (@pileupfiles){ + unlink $file; +} +foreach my $file (@bamfiles){ + unlink $file; +} +foreach my $file (@headers){ + unlink $file; +} +foreach my $file (@scoresfiles){ + unlink $file; +} +unlink $temp_head; +unlink $temp_full; +unlink $cat_header; +unlink $final_bam; + +#get rid of symlinks +if ($for_read) +{ + unlink $for_read; +} +if ($rev_read) +{ + unlink $rev_read; +} +if ($sing_read) +{ + unlink $sing_read; +} +if ($database) +{ + unlink $database; +} +$exit_code = $exit_code >> 8; +exit $exit_code; + +sub read_file { + my ($filename) = @_; + + open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading $!"; + local $/ = undef; + my $all = <$in>; + close $in; + + return $all; +} + +sub write_file { + my ($filename, $content) = @_; + + open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing $!";; + print $out $content; + close $out; + + return; +} \ No newline at end of file