\n\n\n' )
+
+def index_bam_files( bam_filenames, tmp_dir ):
+ for bam_filename in bam_filenames:
+ bam_index_filename = "%s.bai" % bam_filename
+ if not os.path.exists( bam_index_filename ):
+ #need to index this bam file
+ stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name
+ command = 'samtools index %s %s' % ( bam_filename, bam_index_filename )
+ proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
+ return_code = proc.wait()
+ if return_code:
+ for line in open( stderr_name ):
+ print >> sys.stderr, line
+ os.unlink( stderr_name ) #clean up
+ cleanup_before_exit( tmp_dir )
+ raise Exception( "Error indexing BAM file" )
+ os.unlink( stderr_name ) #clean up
+
+def __main__():
+ #Parse Command Line
+ parser = optparse.OptionParser()
+ parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' )
+ parser.add_option( '-o', '--pass_through_options', dest='pass_through_options_encoded', action='append', type="string", help='These options are passed through directly to GATK, with decoding from binascii.unhexlify.' )
+ parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' )
+ parser.add_option( '', '--max_jvm_heap', dest='max_jvm_heap', action='store', type="string", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value.' )
+ parser.add_option( '', '--max_jvm_heap_fraction', dest='max_jvm_heap_fraction', action='store', type="int", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value as a fraction of total physical memory.' )
+ parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' )
+ parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
+ parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"')
+ (options, args) = parser.parse_args()
+
+ tmp_dir = tempfile.mkdtemp( prefix='tmp-gatk-' )
+ if options.pass_through_options:
+ cmd = ' '.join( options.pass_through_options )
+ else:
+ cmd = ''
+ if options.pass_through_options_encoded:
+ cmd = '%s %s' % ( cmd, ' '.join( map( unhexlify, options.pass_through_options_encoded ) ) )
+ if options.max_jvm_heap is not None:
+ cmd = cmd.replace( 'java ', 'java -Xmx%s ' % ( options.max_jvm_heap ), 1 )
+ elif options.max_jvm_heap_fraction is not None:
+ cmd = cmd.replace( 'java ', 'java -XX:DefaultMaxRAMFraction=%s -XX:+UseParallelGC ' % ( options.max_jvm_heap_fraction ), 1 )
+ bam_filenames = []
+ if options.datasets:
+ for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets:
+ gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix )
+ if dataset_arg:
+ cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename )
+ if galaxy_ext == "bam":
+ bam_filenames.append( gatk_filename )
+ index_bam_files( bam_filenames, tmp_dir )
+ #set up stdout and stderr output options
+ stdout = open_file_from_option( options.stdout, mode = 'wb' )
+ stderr = open_file_from_option( options.stderr, mode = 'wb' )
+ #if no stderr file is specified, we'll use our own
+ if stderr is None:
+ stderr = tempfile.NamedTemporaryFile( prefix="gatk-stderr-", dir=tmp_dir )
+
+ proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
+ return_code = proc.wait()
+
+ if return_code:
+ stderr_target = sys.stderr
+ else:
+ stderr_target = sys.stdout
+ stderr.flush()
+ stderr.seek(0)
+ while True:
+ chunk = stderr.read( CHUNK_SIZE )
+ if chunk:
+ stderr_target.write( chunk )
+ else:
+ break
+ stderr.close()
+ #generate html reports
+ if options.html_report_from_directory:
+ for ( html_filename, html_dir ) in options.html_report_from_directory:
+ html_report_from_directory( open( html_filename, 'wb' ), html_dir )
+
+ cleanup_before_exit( tmp_dir )
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 53dd1bfced54 test-data/a.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/a.tab Tue Apr 01 10:49:41 2014 -0400
@@ -0,0 +1,15 @@
+CHR SNP BP A1 TEST NMISS BETA STAT P
+1 rs1181876 3671541 T DOMDEV 958 -1.415 -3.326 0.0009161
+1 rs10492923 5092886 C ADD 1007 5.105 4.368 1.382e-05
+1 rs10492923 5092886 C DOMDEV 1007 -5.612 -4.249 2.35e-05
+1 rs10492923 5092886 C GENO_2DF 1007 NA 19.9 4.775e-05
+1 rs1801133 11778965 T ADD 1022 1.23 3.97 7.682e-05
+1 rs1801133 11778965 T GENO_2DF 1022 NA 16.07 0.0003233
+1 rs1361912 12663121 A ADD 1021 12.69 4.093 4.596e-05
+1 rs1361912 12663121 A DOMDEV 1021 -12.37 -3.945 8.533e-05
+1 rs1361912 12663121 A GENO_2DF 1021 NA 17.05 0.0001982
+1 rs1009806 19373138 G ADD 1021 -1.334 -3.756 0.0001826
+1 rs1009806 19373138 G GENO_2DF 1021 NA 19.36 6.244e-05
+1 rs873654 29550948 A DOMDEV 1012 1.526 3.6 0.0003339
+1 rs10489527 36800027 C ADD 1016 12.67 4.114 4.211e-05
+1 rs10489527 36800027 C DOMDEV 1016 -13.05 -4.02 6.249e-05
diff -r 000000000000 -r 53dd1bfced54 tool-data/gatk_annotations.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gatk_annotations.txt.sample Tue Apr 01 10:49:41 2014 -0400
@@ -0,0 +1,30 @@
+#unique_id name gatk_value tools_valid_for
+AlleleBalance AlleleBalance AlleleBalance UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+AlleleBalanceBySample AlleleBalanceBySample AlleleBalanceBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+BaseCounts BaseCounts BaseCounts UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+BaseQualityRankSumTest BaseQualityRankSumTest BaseQualityRankSumTest UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+ChromosomeCounts ChromosomeCounts ChromosomeCounts UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+DepthOfCoverage DepthOfCoverage DepthOfCoverage UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+DepthPerAlleleBySample DepthPerAlleleBySample DepthPerAlleleBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+FisherStrand FisherStrand FisherStrand UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+GCContent GCContent GCContent UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+HaplotypeScore HaplotypeScore HaplotypeScore UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+HardyWeinberg HardyWeinberg HardyWeinberg UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+HomopolymerRun HomopolymerRun HomopolymerRun UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+InbreedingCoeff InbreedingCoeff InbreedingCoeff UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+IndelType IndelType IndelType UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+LowMQ LowMQ LowMQ UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+MVLikelihoodRatio MVLikelihoodRatio MVLikelihoodRatio UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+MappingQualityRankSumTest MappingQualityRankSumTest MappingQualityRankSumTest UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+MappingQualityZero MappingQualityZero MappingQualityZero UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+MappingQualityZeroBySample MappingQualityZeroBySample MappingQualityZeroBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+MappingQualityZeroFraction MappingQualityZeroFraction MappingQualityZeroFraction UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+NBaseCount NBaseCount NBaseCount UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+QualByDepth QualByDepth QualByDepth UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+RMSMappingQuality RMSMappingQuality RMSMappingQuality UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+ReadDepthAndAllelicFractionBySample ReadDepthAndAllelicFractionBySample ReadDepthAndAllelicFractionBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+ReadPosRankSumTest ReadPosRankSumTest ReadPosRankSumTest UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+SampleList SampleList SampleList UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+SnpEff SnpEff SnpEff VariantAnnotator,VariantRecalibrator
+SpanningDeletions SpanningDeletions SpanningDeletions UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
+TechnologyComposition TechnologyComposition TechnologyComposition UnifiedGenotyper,VariantAnnotator,VariantRecalibrator
diff -r 000000000000 -r 53dd1bfced54 tool-data/gatk_sorted_picard_index.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gatk_sorted_picard_index.loc.sample Tue Apr 01 10:49:41 2014 -0400
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Picard dict and associated files. You will need
+#to create these data files and then create a picard_index.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The picard_index.loc
+#file has this format (longer white space is the TAB character):
+#
+#
+#
+#So, for example, if you had hg18 indexed and stored in
+#/depot/data2/galaxy/srma/hg18/,
+#then the srma_index.loc entry would look like this:
+#
+#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa
+#
+#and your /depot/data2/galaxy/srma/hg18/ directory
+#would contain the following three files:
+#hg18.fa
+#hg18.dict
+#hg18.fa.fai
+#
+#The dictionary file for each reference (ex. hg18.dict) must be
+#created via Picard (http://picard.sourceforge.net). Note that
+#the dict file does not have the .fa extension although the
+#path list in the loc file does include it.
+#
diff -r 000000000000 -r 53dd1bfced54 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Tue Apr 01 10:49:41 2014 -0400
@@ -0,0 +1,13 @@
+
+
+
+
+ value, dbkey, name, path
+
+
+
+
+ value, name, gatk_value, tools_valid_for
+
+
+
diff -r 000000000000 -r 53dd1bfced54 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Tue Apr 01 10:49:41 2014 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff -r 000000000000 -r 53dd1bfced54 variant_recalibrator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_recalibrator.xml Tue Apr 01 10:49:41 2014 -0400
@@ -0,0 +1,431 @@
+
+
+
+ gatk
+
+
+ gatk_macros.xml
+
+ gatk_wrapper.py
+ --max_jvm_heap_fraction "1"
+ --stdout "${output_log}"
+ #for $var_count, $variant in enumerate( $reference_source.variants ):
+ -d "--input:input_${var_count},%(file_type)s" "${variant.input_variants}" "${variant.input_variants.ext}" "input_variants_${var_count}"
+ #end for
+ -p 'java
+ -jar "\$JAVA_JAR_PATH/GenomeAnalysisTK.jar"
+ -T "VariantRecalibrator"
+ --num_threads \${GALAXY_SLOTS:-4}
+ -et "NO_ET" ##ET no phone home
+ ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ --recal_file "${output_recal}"
+ --tranches_file "${output_tranches}"
+ --rscript_file "${output_rscript}"
+ '
+
+ #set $rod_binding_names = dict()
+ #for $rod_binding in $rod_bind:
+ #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+ #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+ #elif str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'comp':
+ #set $rod_bind_name = "comp" + $rod_binding.rod_bind_type.custom_rod_name
+ #else
+ #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+ #end if
+ #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+ #if $rod_binding.rod_bind_type.rod_training_type.rod_training_type_selector == "not_training_truth_known":
+ -d "--resource:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+ #else:
+ -d "--resource:${rod_bind_name},%(file_type)s,known=${rod_binding.rod_bind_type.rod_training_type.known},training=${rod_binding.rod_bind_type.rod_training_type.training},truth=${rod_binding.rod_bind_type.rod_training_type.truth},bad=${rod_binding.rod_bind_type.rod_training_type.bad},prior=${rod_binding.rod_bind_type.rod_training_type.prior}" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+ #end if
+ #end for
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ -p '
+ #if str( $annotations ) != "None":
+ #for $annotation in str( $annotations.fields.gatk_value ).split( ',' ):
+ --use_annotation "${annotation}"
+ #end for
+ #end if
+ #for $additional_annotation in $additional_annotations:
+ --use_annotation "${additional_annotation.additional_annotation_name}"
+ #end for
+ --mode "${mode}"
+ '
+
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ --maxGaussians "${analysis_param_type.max_gaussians}"
+ --maxIterations "${analysis_param_type.max_iterations}"
+ --numKMeans "${analysis_param_type.num_k_means}"
+ --stdThreshold "${analysis_param_type.std_threshold}"
+ --qualThreshold "${analysis_param_type.qual_threshold}"
+ --shrinkage "${analysis_param_type.shrinkage}"
+ --dirichlet "${analysis_param_type.dirichlet}"
+ --priorCounts "${analysis_param_type.prior_counts}"
+ #if str( $analysis_param_type.bad_variant_selector.bad_variant_selector_type ) == 'percent':
+ --percentBadVariants "${analysis_param_type.bad_variant_selector.percent_bad_variants}"
+ #else:
+ --minNumBadVariants "${analysis_param_type.bad_variant_selector.min_num_bad_variants}"
+ #end if
+ --target_titv "${analysis_param_type.target_titv}"
+ #for $tranche in [ $tranche.strip() for $tranche in str( $analysis_param_type.ts_tranche ).split( ',' ) if $tranche.strip() ]
+ --TStranche "${tranche}"
+ #end for
+ #for $ignore_filter in $analysis_param_type.ignore_filters:
+ #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector )
+ #if $ignore_filter_name == "custom":
+ #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name )
+ #end if
+ --ignore_filter "${ignore_filter_name}"
+ #end for
+ --ts_filter_level "${analysis_param_type.ts_filter_level}"
+ '
+ #end if
+
+
+ &&
+ mv "${output_rscript}.pdf" "${output_tranches_pdf}"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score
+
+For more information on using the VariantRecalibrator module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: VariantRecalibrator accepts a variant input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+
+Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ tranches_file The output tranches file used by ApplyRecalibration
+ use_annotation The names of the annotations which should used for calculations
+ mode Recalibration mode to employ: 1.) SNP for recalibrating only snps (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both snps and indels simultaneously. (SNP|INDEL|BOTH)
+ maxGaussians The maximum number of Gaussians to try during variational Bayes algorithm
+ maxIterations The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected.
+ numKMeans The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.
+ stdThreshold If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.
+ qualThreshold If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.
+ shrinkage The shrinkage parameter in variational Bayes algorithm.
+ dirichlet The dirichlet parameter in variational Bayes algorithm.
+ priorCounts The number of prior counts to use in variational Bayes algorithm.
+ percentBadVariants What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.
+ minNumBadVariants The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.
+ recal_file The output recal file used by ApplyRecalibration
+ target_titv The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!
+ TStranche The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)
+ ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file
+ path_to_Rscript The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript
+ rscript_file The output rscript file generated by the VQSR to aid in visualization of the input data and learned model
+ path_to_resources Path to resources folder holding the Sting R scripts.
+ ts_filter_level The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots
+
+@CITATION_SECTION@
+
+