Mercurial > repos > nml > srst2

diff srst2.pl @ 0:6f870ed59b6e draft
Uploaded
author: nml
date: Mon, 06 Feb 2017 12:31:04 -0500
children: 599a4dc309aa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srst2.pl	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,378 @@
+#!/usr/bin/env perl
+
+
+use strict;
+use warnings;
+use Cwd;
+use File::Copy;
+
+#The first 4 arguments should be in this format:
+#  /path/to/srst2.py bam_output scores_output pileup_output ...
+
+my $binary = $ARGV[0];
+shift;
+
+my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases);
+my ($allele_results,$allele_type);
+
+
+$bam_results = $ARGV[0];
+shift(@ARGV);
+$scores = $ARGV[0];
+shift(@ARGV);
+$pileup = $ARGV[0];
+shift(@ARGV);
+
+#Now that we shifted the first 4 arguments, get the rest depending on the job type. There are three:
+#I pass a letter to tell us which job type was selected.
+$job_type = $ARGV[0];
+shift(@ARGV);
+
+#If m, mlst only:  we only have one mlst output file
+if($job_type eq "m")
+{
+   $txt_results = $ARGV[0];
+   shift(@ARGV);
+   $allele_results = $ARGV[0];
+   shift(@ARGV);
+   $allele_type = $ARGV[0];
+   shift(@ARGV);
+}
+#If g, genedb only: we have two outputs: genes and fullgenes. We also get the name of the database
+elsif($job_type eq "g")
+{
+   $genes_results = $ARGV[0];
+   shift(@ARGV);
+   $fullgenes_results = $ARGV[0];
+   shift(@ARGV);
+   $databases = $ARGV[0];
+   shift (@ARGV);
+}
+#If b, both mlst and genedb: We will have three output files and the database name.
+else
+{
+   $txt_results = $ARGV[0];
+   shift(@ARGV);
+   $genes_results = $ARGV[0];
+   shift(@ARGV);
+   $fullgenes_results = $ARGV[0];
+   shift(@ARGV);
+   $databases = $ARGV[0];
+   shift(@ARGV);
+}
+
+#After we get the output files/database name, now we get the name of the reads
+#This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name
+my $filename = $ARGV[0];
+shift(@ARGV);
+
+#This index offset is used to determine where the 'genedb' option is located.
+#If only a single-end input, the genedb will be 3 positions into the arguments:
+#  -input_se sample.fastq --genedb database.fasta
+my $index_offset = 3;
+
+print @ARGV;
+#change the file extensions of the input reads so srst can use them
+
+#Usually the file name looks like this: sample_S1_L001_R1_001.fastq
+#If we use this file name, it confuses srst2 a lot. So we just extract
+#the sample name to use as input file name.
+my @file_name = split /_/, $filename;
+$name = "temp_file_name";
+
+my ($for_read, $rev_read, $sing_read, $database);
+if ($ARGV[0] eq "--input_pe")
+{
+        #Increment index offset if we paired-end inputs
+        $index_offset++;
+
+	$for_read = $name."_1.dat";
+	$rev_read = $name."_2.dat";
+
+	symlink($ARGV[1], $for_read);
+	symlink($ARGV[2], $rev_read);
+
+	$ARGV[1] = $for_read;
+	$ARGV[2] = $rev_read;
+}
+else
+{
+        $sing_read = $name.".dat";
+
+        symlink($ARGV[1], $sing_read);
+
+        $ARGV[1] = $sing_read;
+
+}
+#If we are running a job to include genedb, use the database name for input file name
+if ($job_type eq 'g' | $job_type eq 'b')
+{
+  my @db_names = split /,/, $databases;
+  my $num_db = @db_names;
+  my %names_hash = ();
+  # loop through dbs to replace spaces with _ and check for duplicates
+  for (my $i = 0; $i < $num_db; $i++){
+    $db_names[$i]=~s/ /_/g;
+    if( exists($names_hash{$db_names[$i]}) ) {
+      print STDERR  "More than one database with the same name";
+      exit(1);
+    }else{
+      $names_hash{$db_names[$i]}=$db_names[$i];
+    }
+  }
+
+
+  foreach my $db_name (@db_names){
+    (my $base = $db_name) =~ s/\.[^.]+$//;
+    $database = $base.".dat";
+
+    symlink($ARGV[$index_offset], $database);
+    $ARGV[$index_offset] = $database;
+
+    $index_offset++;
+  }
+}
+
+for (my $i  =0; $i< @ARGV; $i++){
+  if (index($ARGV[$i], "maxins") != -1){
+    my ($maxins, $minins);
+    my @b2args = split(' ', $ARGV[$i]);
+    for (my $j = 0; $j < @b2args; $j++){
+      if (index($b2args[$j], "maxins") != -1){
+        $maxins = $b2args[$j+1];
+      }
+      if (index($b2args[$j], "minins") != -1){
+        $minins = $b2args[$j+1];
+      }
+    }
+    if ($maxins - $minins < 0){
+      print STDERR  "--minins cannot be greater than --maxins";
+      exit(1);
+    }
+  }
+}
+
+
+
+my $command = "python $binary @ARGV";
+
+my $exit_code = system($command);
+
+my $cur_dir = getcwd();
+# make arrays for using multiple custom databases (creates multiple output files - need to be concatenated)
+my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles);
+
+# go through files in the output directory to move/concatenate them as required.
+
+foreach my $file (<$cur_dir/*>)
+{
+    print $file, "\n";
+    #Will cause problems if any files have 'mlst' in them
+  if ($file =~ /mlst/)
+  {
+    move($file, $txt_results);
+  }
+  elsif ($file =~ /\.bam$/)
+  {
+    push @bamfiles, $file;
+  }
+  elsif ($file =~ /\.scores$/)
+  {
+    push @scoresfiles, $file;
+  }
+  elsif ($file =~ /\.pileup$/)
+  {
+    push @pileupfiles, $file;
+  }
+  elsif ($file =~ /__fullgenes__/)
+  {
+    push @fullgenefiles, $file;
+  }
+  elsif ($file =~ /__genes__/)
+  {
+    push @genefiles, $file;
+  }
+  elsif ($file =~ /all_consensus_alleles.fasta$/ && $allele_type eq 'all') {
+    move($file,$allele_results);
+  }
+  elsif ($file =~ /new_consensus_alleles.fasta$/ && $allele_type eq 'new'){
+    move($file,$allele_results);
+  }
+}
+
+
+my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers );
+
+# create new concatenated bam file with all bam files.
+if (@bamfiles > 1){
+  my $counter = 0;
+  $cat_header = "cat_header";
+  while ($counter < @bamfiles) {
+    $headers[$counter] = "bam_header".$counter;
+    # make a header file for each bam results file
+    my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]";
+    system($cmd);
+    if ($counter >= 1){
+      # only keep the @hd and @pg from first file because the final concatenated file can only have one of each (doesn't matter location)
+      $temp_head="cut_head".$counter;
+      # cut off first row and last row of each file (the @HD and @PG)
+      $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head";
+      system($cmd);
+      unlink $headers[$counter];
+      # replace the old header with the new cut header in the array
+      $headers[$counter] = $temp_head;
+    }
+    $counter++;
+  }
+  # combine all header files
+  $cmd = "cat ".join(" ",@headers)." > $cat_header";
+  system($cmd);
+
+  $final_bam = "final_bam";
+  # concatenate all the bam files *must include the concatenated header file created above
+  $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ",@bamfiles)." ";
+  system($cmd);
+
+  # sort the bam file so it can be indexed
+  $cmd = "samtools sort $final_bam 'last_bam'";
+  system($cmd);
+
+  # move bam file to where Galaxy expects it.
+  $cmd = "mv 'last_bam.bam' $bam_results";
+  system($cmd);
+} else {
+  # only one bam file, don't need to concatenate
+  move($bamfiles[0], $bam_results);
+}
+
+# concatenate all pileup files
+if (@pileupfiles > 1){
+  $cmd = "cat ".join(" ",@pileupfiles)." > $pileup";
+  system($cmd);
+} else {
+  move($pileupfiles[0], $pileup);
+}
+
+# perform find-replace to restore original user-specified file names
+foreach my $gene (@genefiles){
+  my $data = read_file($gene);
+  $data =~ s/temp_file_name/$file_name[0]/g;
+  write_file($gene, $data);
+}
+
+foreach my $gene (@fullgenefiles){
+  my $data = read_file($gene);
+  $data =~ s/temp_file_name/$file_name[0]/g;
+  write_file($gene, $data);
+}
+
+# concatenate gene files with a space separating each file
+if (@genefiles > 1){
+  my $join = join(" <(echo) ", @genefiles);
+  my @args = ("bash", "-c", "cat $join > $genes_results");
+  system(@args);
+} else {
+  # only one gene results file
+  move($genefiles[0], $genes_results);
+}
+
+# concatenate full gene results files but only keep header in first file.
+if (@fullgenefiles >1){
+  for (my $i= 1; $i < @fullgenefiles; $i++){
+    # go through all files but the first one to remove headers
+    # create a temp file to save the file after header is removed
+    $temp_full = "temp_full".$i;
+    $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full";
+    system($cmd);
+    unlink $fullgenefiles[$i];
+    $fullgenefiles[$i] = $temp_full;
+  }
+  $cmd = "cat ". join(" ",@fullgenefiles)." > $fullgenes_results";
+  system($cmd);
+} else{
+  # only one full gene results file
+  move($fullgenefiles[0], $fullgenes_results);
+}
+
+# concatenate full gene results files but only keep header in first file.
+if (@scoresfiles >1){
+  for (my $i= 1; $i < @scoresfiles; $i++){
+    # go through all files but the first one to remove headers
+    # create a temp file to save the file after header is removed
+    $temp_full = "temp_full".$i;
+    $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full";
+    system($cmd);
+    unlink $scoresfiles[$i];
+    $scoresfiles[$i] = $temp_full;
+  }
+  $cmd = "cat ". join(" ",@scoresfiles)." > $scores";
+  system($cmd);
+} else{
+  # only one scores file
+  move($scoresfiles[0], $scores);
+}
+
+# cleanup srst2 output and temp files
+foreach my $file (@fullgenefiles){
+  unlink $file;
+}
+foreach my $file (@genefiles){
+  unlink $file;
+}
+foreach my $file (@pileupfiles){
+  unlink $file;
+}
+foreach my $file (@bamfiles){
+  unlink $file;
+}
+foreach my $file (@headers){
+  unlink $file;
+}
+foreach my $file (@scoresfiles){
+  unlink $file;
+}
+unlink $temp_head;
+unlink $temp_full;
+unlink $cat_header;
+unlink $final_bam;
+
+#get rid of symlinks
+if ($for_read)
+{
+	unlink $for_read;
+}
+if ($rev_read)
+{
+	unlink $rev_read;
+}
+if ($sing_read)
+{
+         unlink $sing_read;
+}
+if ($database)
+{
+         unlink $database;
+}
+$exit_code = $exit_code >> 8;
+exit $exit_code;
+
+sub read_file {
+    my ($filename) = @_;
+
+    open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading $!";
+    local $/ = undef;
+    my $all = <$in>;
+    close $in;
+
+    return $all;
+}
+
+sub write_file {
+    my ($filename, $content) = @_;
+
+    open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing $!";;
+    print $out $content;
+    close $out;
+
+    return;
+}
\ No newline at end of file
author	nml
date	Mon, 06 Feb 2017 12:31:04 -0500
parents
children	599a4dc309aa