diff call_miseq_variants @ 0:1a23ea467feb default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Thu, 26 Mar 2015 09:36:17 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/call_miseq_variants	Thu Mar 26 09:36:17 2015 -0600
@@ -0,0 +1,122 @@
+#!/usr/bin/env perl
+
+# Locked down procedure for variant calling
+use strict;
+use warnings;
+use File::Basename;
+#@ARGV == 7 or die "Usage: $0 <sample name> <output dir> <deduped_realigned_input.bam> <target_regions.bed> <gene_regions.bed> <reference.fasta> <max num processes>\n";
+
+# parse configuration file
+my $dirname= dirname(__FILE__);
+my $tool_data = shift @ARGV;
+my %config;
+if( not -e "$tool_data/miseq_bam_variants.loc" ){
+  system("cp $dirname/tool-data/miseq_bam_variants.loc $tool_data/miseq_bam_variants.loc");
+}
+open CONFIG, '<',"$tool_data/miseq_bam_variants.loc";
+while(<CONFIG>){
+  (my $key, my $value) = split(/\s+/, $_ );
+  $config{$key} = $value;
+}
+my $dbs_dir = $config{"reference_dbs"};
+my $name_dir = $config{"named_regions_dir"};
+my $capt_dir = $config{"capture_kits_dir"};
+close CONFIG;
+
+my $sample_name = shift @ARGV;
+my $outdir = shift @ARGV;
+my $bam = shift @ARGV;
+my $bed = $capt_dir . (shift @ARGV);
+my $gene_regions = $name_dir . (shift @ARGV);
+my $fasta = $dbs_dir . (shift @ARGV);
+my $max_processes = shift @ARGV;
+
+my $log = "$outdir/$sample_name.call_variants.log.txt";
+my $gatk = "$outdir/$sample_name.gatk_haplotypecaller";
+my $freeBayes = "$outdir/$sample_name.freeBayes";
+my $atlas2_indel = "$outdir/$sample_name.atlas2_indel";
+my $phase = "$outdir/$sample_name.phase";
+my $cover = "$outdir/$sample_name.read_coverage";
+my $flagstat = "$outdir/$sample_name.flagstat";
+
+my @cmds;
+@cmds = (   "$dirname/gatk_haplotypecaller_parallel $bam ".($max_processes > 5 ? int($max_processes*2/3+0.5) : 2)." $gatk.vcf -R $fasta --emitRefConfidence GVCF -variant_index_type LINEAR -variant_index_parameter 128000 &>> $log",
+            "$dirname/freebayes_parallel $bam ".($max_processes > 5 ? int(($max_processes-1)/3+0.5) : 2)." $freeBayes.vcf -j -C 2 -i -f $fasta -X &>> $log",
+            "samtools phase -F $bam > $phase.txt 2>> $log");
+&run_cmds($max_processes, @cmds);
+
+@cmds = ("$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $gatk.vcf -o $gatk.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt",
+         "$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $freeBayes.vcf -o $freeBayes.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt");
+&run_cmds($max_processes, @cmds);
+
+my $combined = "$outdir/$sample_name.combined";
+system "$dirname/combine_hgvs_tables -q true $combined.hgvs.txt GATKHaplotypeCaller $gatk.hgvs.txt FreeBayes $freeBayes.hgvs.txt >> $log";
+system "$dirname/hgvs_collapse_transcripts $combined.hgvs.txt $combined.collapsed.hgvs.txt 1 >> $log";
+system "$dirname/split_hgvs_by_confidence $combined.collapsed.hgvs.txt $combined.collapsed.confident.hgvs.txt $combined.collapsed.marginal.hgvs.txt 2 &>> $log";
+
+# Check the logs for anything untowards
+# e.g. FreeBayes: jumping is disabled
+#  or any line saying "error"
+my @errors;
+open(LOG, $log)
+  or die "Could not check log file for error message, therefore the validity of the genotypes cannot be guaranteed\n";
+while(<LOG>){
+  if(/error/i or /jumping is disabled/){
+    push @errors, $_;
+  }
+}
+close(LOG);
+
+if(@errors){
+  print "Errors were detected in the genotyping log:\n", @errors; 
+  exit 1; # failure
+}
+exit 0; #success
+
+sub run_cmds{
+
+  my ($max_cmds, @cmd) = @_;
+  
+  my ($num_children, $pid); 
+  
+        for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
+        # initialize the number of child processes at 1, and increment it by one
+        #while it is less than $max_cmds
+
+                my $cmd = shift (@cmds);
+                if($pid = fork) {
+                        # do nothing if parent
+                } elsif(defined $pid) { # $pid is zero here if defined
+                        #print STDERR $cmd, "\n";
+                        system $cmd;
+                        exit;
+                } else {
+                        #weird fork error
+                        die "Can't fork: $!\n";
+                }
+        }
+
+        while(@cmds) {
+                undef $pid;
+                FORK: {
+                        my $cmd = shift (@cmds);
+                        if($pid = fork) {
+                                # parent here
+                                $num_children++;
+                                wait;
+                                $num_children--;
+                                next;
+
+                        } elsif(defined $pid) { # $pid is zero here if defined
+                                #print STDERR $cmd, "\n";
+                                system $cmd;
+                                exit;
+
+                        } else {
+                                #weird fork error
+                                die "Can't fork: $!\n";
+                        }
+                }
+        }
+        wait while $num_children--;
+}