\n\n\n' )
+
+
+def index_bam_files( bam_filenames ):  # For each BAM path lacking a "<path>.bai" file, run `samtools index` to create it.
+ for bam_filename in bam_filenames:
+ bam_index_filename = "%s.bai" % bam_filename  # index path is the BAM path with ".bai" appended
+ if not os.path.exists( bam_index_filename ):
+ # need to index this bam file
+ stderr_name = tempfile.NamedTemporaryFile( prefix="bam_index_stderr" ).name  # only the name is kept; NOTE(review): the file object is dropped immediately, so the temp file may already be gone - confirm
+ command = 'samtools index %s %s' % ( bam_filename, bam_index_filename )  # NOTE(review): paths are interpolated unquoted into a shell command string
+ try:
+ subprocess.check_call( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )  # samtools stderr goes to the file named stderr_name; the handle is never explicitly closed
+ except:  # NOTE(review): bare except - catches every exception, not only a failed samtools call
+ for line in open( stderr_name ):
+ print >> sys.stderr, line  # Python 2 print statement: replay the captured stderr
+ raise Exception( "Error indexing BAM file" )  # original exception is replaced by this generic one
+ finally:
+ os.unlink( stderr_name )  # remove the stderr capture file
+
+
+def __main__():  # Entry point: parse options, assemble the command string, stage datasets, run the command, relay its stderr, write HTML reports.
+ # Parse Command Line
+ parser = optparse.OptionParser()
+ parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' )  # repeatable; values are space-joined into cmd below
+ parser.add_option( '-o', '--pass_through_options', dest='pass_through_options_encoded', action='append', type="string", help='These options are passed through directly to GATK, with decoding from binascii.unhexlify.' )  # repeatable; each value is unhexlify-decoded before joining
+ parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' )  # repeatable 4-tuple, unpacked in the dataset loop below
+ parser.add_option( '', '--max_jvm_heap', dest='max_jvm_heap', action='store', type="string", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value.' )
+ parser.add_option( '', '--max_jvm_heap_fraction', dest='max_jvm_heap_fraction', action='store', type="int", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value as a fraction of total physical memory.' )  # parsed as int; passed to -XX:DefaultMaxRAMFraction below
+ parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' )
+ parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
+ parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"')
+ parser.add_option( '-e', '--phone_home', dest='phone_home', action='store', type="string", default='STANDARD', help='What kind of GATK run report should we generate(NO_ET|STANDARD|STDOUT)' )  # NOTE(review): options.phone_home is not read anywhere in the visible body
+ parser.add_option( '-K', '--gatk_key', dest='gatk_key', action='store', type="string", default=None, help='What kind of GATK run report should we generate(NO_ET|STANDARD|STDOUT)' )  # NOTE(review): help text duplicates --phone_home; options.gatk_key is not read in the visible body
+ (options, args) = parser.parse_args()
+
+ if options.pass_through_options:
+ cmd = ' '.join( options.pass_through_options )  # start the command from the verbatim pass-through fragments
+ else:
+ cmd = ''
+ if options.pass_through_options_encoded:
+ cmd = '%s %s' % ( cmd, ' '.join( map( unhexlify, options.pass_through_options_encoded ) ) )  # append the hex-decoded fragments
+ if options.max_jvm_heap is not None:
+ cmd = cmd.replace( 'java ', 'java -Xmx%s ' % ( options.max_jvm_heap ), 1 )  # only the first 'java ' occurrence is rewritten
+ elif options.max_jvm_heap_fraction is not None:
+ cmd = cmd.replace( 'java ', 'java -XX:DefaultMaxRAMFraction=%s -XX:+UseParallelGC ' % ( options.max_jvm_heap_fraction ), 1 )  # used only when --max_jvm_heap is absent
+ bam_filenames = []  # staged BAM paths collected for indexing
+ tmp_dir = tempfile.mkdtemp( prefix='tmp-gatk-' )  # scratch directory; handed to cleanup_before_exit in the finally clause
+ try:
+ if options.datasets:
+ for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets:
+ gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir=tmp_dir, prefix=prefix )  # helper defined outside this view; presumably stages the dataset under tmp_dir - confirm
+ if dataset_arg:
+ cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename )  # an empty dataset_arg adds nothing to cmd
+ if galaxy_ext == "bam":
+ bam_filenames.append( gatk_filename )
+ if galaxy_ext == 'fasta':
+ subprocess.check_call( 'samtools faidx "%s"' % gatk_filename, shell=True )  # run samtools faidx on the staged FASTA
+ subprocess.check_call( 'java -jar %s R=%s O=%s QUIET=true' % ( os.path.join(os.environ['JAVA_JAR_PATH'], 'CreateSequenceDictionary.jar'), gatk_filename, os.path.splitext(gatk_filename)[0] + '.dict' ), shell=True )  # output is the FASTA path with its extension replaced by .dict; JAVA_JAR_PATH must be set (KeyError otherwise)
+ index_bam_files( bam_filenames )  # create missing .bai indexes for the collected BAM paths
+ # set up stdout and stderr output options
+ stdout = open_file_from_option( options.stdout, mode='wb' )  # helper defined outside this view; this handle is not explicitly closed in the visible body
+ stderr = open_file_from_option( options.stderr, mode='wb' )  # the None check below implies the helper can return None
+ # if no stderr file is specified, we'll use our own
+ if stderr is None:
+ stderr = tempfile.NamedTemporaryFile( prefix="gatk-stderr-", dir=tmp_dir )
+
+ proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )  # run the assembled command through the shell with tmp_dir as working directory
+ return_code = proc.wait()
+
+ if return_code:
+ stderr_target = sys.stderr  # non-zero exit: relay captured stderr to our stderr
+ else:
+ stderr_target = sys.stdout  # zero exit: relay captured stderr to our stdout instead
+ stderr.flush()
+ stderr.seek(0)  # rewind to read back what the child wrote; NOTE(review): with --stderr the handle was requested with mode='wb' - confirm it is readable
+ while True:
+ chunk = stderr.read( CHUNK_SIZE )  # CHUNK_SIZE is defined outside this view
+ if chunk:
+ stderr_target.write( chunk )
+ else:
+ break
+ stderr.close()
+ finally:
+ cleanup_before_exit( tmp_dir )  # always runs; helper defined outside this view
+
+ # generate html reports
+ if options.html_report_from_directory:
+ for ( html_filename, html_dir ) in options.html_report_from_directory:
+ html_report_from_directory( open( html_filename, 'wb' ), html_dir )  # NOTE(review): the opened report handle is not explicitly closed here
+
+
+if __name__ == "__main__":
+ __main__()
diff -r 68426930d59c -r 01ff8dd37d4d haplotype_caller.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/haplotype_caller.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,313 @@
+
+ Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ @BAM_INPUTS@
+ -p '
+ @JAR_PATH@
+ -T "HaplotypeCaller"
+ -o "${output_vcf}"
+
+ \$GATK2_SITE_OPTIONS
+
+ --num_cpu_threads_per_data_thread \${GALAXY_SLOTS:-4}
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ #if str($input_recal) != 'None':
+ --BQSR "${input_recal}"
+ #end if
+ '
+ @DBSNP_OPTIONS@
+ $allow_n_cigar_reads
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ #if $analysis_param_type.heterozygosity.__str__.strip() != '':
+ --heterozygosity $analysis_param_type.heterozygosity
+ #end if
+ --genotyping_mode "${analysis_param_type.genotyping_mode_type.genotyping_mode}"
+ #if str( $analysis_param_type.genotyping_mode_type.genotyping_mode ) == 'GENOTYPE_GIVEN_ALLELES':
+ --alleles "${analysis_param_type.genotyping_mode_type.input_alleles_rod}"
+ #end if
+ #if not $analysis_param_type.emitRefConfidence is None:
+ --emitRefConfidence $analysis_param_type.emitRefConfidence
+ #end if
+
+ ## files
+ #if str($analysis_param_type.activeRegionIn) != 'None':
+ --activeRegionIn "$analysis_param_type.activeRegionIn"
+ #end if
+ #if str($analysis_param_type.comp) != 'None':
+ --comp "$analysis_param_type.comp"
+ #end if
+ ##
+ #if str( $analysis_param_type.annotation ) != "None":
+ #for $annotation in str( $analysis_param_type.annotation.fields.gatk_value ).split( ','):
+ --annotation "${annotation}"
+ #end for
+ #end if
+ #for $additional_annotation in $analysis_param_type.additional_annotations:
+ --annotation "${additional_annotation.additional_annotation_name}"
+ #end for
+ #if str( $analysis_param_type.group ) != "None":
+ #for $group in str( $analysis_param_type.group ).split( ','):
+ --group "${group}"
+ #end for
+ #end if
+ #if str( $analysis_param_type.exclude_annotations ) != "None":
+ #for $annotation in str( $analysis_param_type.exclude_annotations.fields.gatk_value ).split( ','):
+ --excludeAnnotation "${annotation}"
+ #end for
+ #end if
+
+ ## value settings
+ #if $analysis_param_type.contamination_fraction_to_filter.__str__.strip() != '':
+ --contamination_fraction_to_filter $analysis_param_type.contamination_fraction_to_filter
+ #end if
+ #if $analysis_param_type.minPruning.__str__.strip() != '':
+ --minPruning $analysis_param_type.minPruning
+ #end if
+ #if $analysis_param_type.standard_min_confidence_threshold_for_calling.__str__.strip() != '':
+ --standard_min_confidence_threshold_for_calling $analysis_param_type.standard_min_confidence_threshold_for_calling
+ #end if
+ #if $analysis_param_type.standard_min_confidence_threshold_for_emitting.__str__.strip() != '':
+ --standard_min_confidence_threshold_for_emitting $analysis_param_type.standard_min_confidence_threshold_for_emitting
+ #end if
+ #if $analysis_param_type.gcpHMM.__str__.strip() != '':
+ --gcpHMM $analysis_param_type.gcpHMM
+ #end if
+ #if $analysis_param_type.max_alternate_alleles.__str__.strip() != '':
+ --max_alternate_alleles $analysis_param_type.max_alternate_alleles
+ #end if
+ ## mode selections
+
+ #if $analysis_param_type.pair_hmm_implementation.__str__ != "None" and len($analysis_param_type.pair_hmm_implementation.__str__) > 0:
+ --pair_hmm_implementation $analysis_param_type.pair_hmm_implementation
+ #end if
+ ## optional outputs
+ #if $analysis_param_type.activeRegionOut:
+ --activeRegionOut $active_region_out
+ #end if
+ #if $analysis_param_type.graphOutput:
+ --graphOutput $graph_out
+ #end if
+ ## flags
+ $analysis_param_type.useAllelesTrigger
+ $analysis_param_type.fullHaplotype
+ $analysis_param_type.genotypeFullActiveRegion
+ $analysis_param_type.debug
+ '
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ analysis_param_type['analysis_param_type_selector'] == "advanced" and analysis_param_type['graphOutput'] == True
+
+
+ analysis_param_type['analysis_param_type_selector'] == "advanced" and analysis_param_type['activeRegionOut'] == True
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+**HaplotypeCaller**
+calls SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region.
+Haplotypes are evaluated using an affine gap penalty Pair HMM.
+
+For more information on calling variants via local haplotype assembly using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_haplotypecaller_HaplotypeCaller.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: HaplotypeCaller accepts aligned BAM files.
+
+
+**Outputs**
+
+The output is a VCF file with raw, unrecalibrated SNP and indel calls.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ activeRegionIn Use this interval list file as the active regions to process
+ activeRegionOut Output the active region to this interval list file
+ alleles The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES
+ annotation One or more specific annotations to apply to variant calls
+ comp comparison VCF file
+ contamination Fraction of contamination in sequencing data (for all samples) to aggressively remove
+ dbsnp dbSNP file
+ debug If specified, print out very verbose debug information about each triggering active region
+ excludeAnnotation One or more specific annotations to exclude
+ genotyping_mode Specifies how to determine the alternate alleles to use for genotyping
+ graphOutput File to which debug assembly graph information should be written
+ group One or more classes/groups of annotations to apply to variant calls
+ heterozygosity Heterozygosity value used to compute prior likelihoods for any locus
+ minPruning The minimum allowed pruning factor in assembly graph. Paths with less than or equal supporting kmers are pruned from the graph
+ pair_hmm_implementation The PairHMM implementation to use for genotype likelihood calculations
+ stand_call_conf The minimum phred-scaled confidence threshold at which variants should be called
+ stand_emit_conf The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)
+ useAllelesTrigger If specified, use additional trigger on variants found in an external alleles file
+ fullHaplotype If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference
+ gcpHMM Flat gap continuation penalty for use in the Pair HMM
+ genotypeFullActiveRegion If specified, alternate alleles are considered to be the full active region for the purposes of genotyping
+ max_alternate_alleles Maximum number of alternate alleles to genotype
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d indel_realigner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/indel_realigner.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,213 @@
+
+ - perform local realignment
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+ #if str( $reference_source.input_bam.metadata.bam_index ) != "None":
+ -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+ #end if
+ -p '
+ @JAR_PATH@
+ -T "IndelRealigner"
+ -o "${output_bam}"
+
+ \$GATK2_SITE_OPTIONS
+
+ ## according to http://www.broadinstitute.org/gatk/guide/article?id=1975
+ --num_cpu_threads_per_data_thread 1
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ -LOD "${lod_threshold}"
+ ${knowns_only}
+ '
+
+ #set $rod_binding_names = dict()
+ #for $rod_binding in $rod_bind:
+ #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+ #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+ #else
+ #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+ #end if
+ #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+ -d "-known:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+ #end for
+
+ $allow_n_cigar_reads
+ #include source=$standard_gatk_options#
+ ##start analysis specific options
+ -d "-targetIntervals" "${target_intervals}" "${target_intervals.ext}" "gatk_target_intervals"
+ -p '
+ --disable_bam_indexing
+ '
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ --entropyThreshold "${analysis_param_type.entropy_threshold}"
+ ${analysis_param_type.simplify_bam}
+ --consensusDeterminationModel "${analysis_param_type.consensus_determination_model}"
+ --maxIsizeForMovement "${analysis_param_type.max_insert_size_for_movement}"
+ --maxPositionalMoveAllowed "${analysis_param_type.max_positional_move_allowed}"
+ --maxConsensuses "${analysis_param_type.max_consensuses}"
+ --maxReadsForConsensuses "${analysis_param_type.max_reads_for_consensuses}"
+ --maxReadsForRealignment "${analysis_param_type.max_reads_for_realignment}"
+ ${analysis_param_type.no_original_alignment_tags}
+ '
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Performs local realignment of reads based on misalignments due to the presence of indels. Unlike most mappers, this walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly.
+
+For more information on local realignment around indels using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_indels_IndelRealigner.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: IndelRealigner accepts an aligned BAM and a list of intervals to realign as input files.
+
+
+**Outputs**
+
+The output is in the BAM format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ targetIntervals intervals file output from RealignerTargetCreator
+ LODThresholdForCleaning LOD threshold above which the cleaner will clean
+ entropyThreshold percentage of mismatches at a locus to be considered having high entropy
+ out Output bam
+ bam_compression Compression level to use for writing BAM files
+ disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier
+ useOnlyKnownIndels Don't run 'Smith-Waterman' to generate alternate consensuses; use only known indels provided as RODs for constructing the alternate references.
+ maxReadsInMemory max reads allowed to be kept in memory at a time by the SAMFileWriter. Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).
+ maxIsizeForMovement maximum insert size of read pairs that we attempt to realign
+ maxPositionalMoveAllowed maximum positional move in basepairs that a read can be adjusted during realignment
+ maxConsensuses max alternate consensuses to try (necessary to improve performance in deep coverage)
+ maxReadsForConsensuses max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)
+ maxReadsForRealignment max reads allowed at an interval for realignment; if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is
+ noOriginalAlignmentTags Don't output the original cigar or alignment start tags for each realigned read in the output bam.
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d print_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/print_reads.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,245 @@
+
+ on BAM files
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+ #if str( $reference_source.input_bam.metadata.bam_index ) != "None":
+ -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+ #end if
+ -p '
+ @JAR_PATH@
+ -T "PrintReads"
+ -o "${output_bam}"
+ \$GATK2_SITE_OPTIONS
+
+ ## according to http://www.broadinstitute.org/gatk/guide/article?id=1975
+ --num_cpu_threads_per_data_thread \${GALAXY_SLOTS:-6}
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ #if str($input_recal) != 'None':
+ --BQSR "${input_recal}"
+ #end if
+ --disable_bam_indexing
+ '
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set":
+ --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}"
+ #end if
+ #if str( $analysis_param_type.default_platform ) != "default":
+ --default_platform "${analysis_param_type.default_platform}"
+ #end if
+ #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set":
+ --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}"
+ #end if
+ #if str( $analysis_param_type.force_platform ) != "default":
+ --force_platform "${analysis_param_type.force_platform}"
+ #end if
+ ${analysis_param_type.exception_if_no_tile}
+ #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set":
+ #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default":
+ --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}"
+ #end if
+ #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default":
+ --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}"
+ #end if
+ #end if
+ ${analysis_param_type.simplify_bam}
+ --preserve_qscores_less_than "${analysis_param_type.preserve_qscores_less_than}"
+ --smoothing "${analysis_param_type.smoothing}"
+ --max_quality_score "${analysis_param_type.max_quality_score}"
+ --window_size_nqs "${analysis_param_type.window_size_nqs}"
+ --homopolymer_nback "${analysis_param_type.homopolymer_nback}"
+ ${analysis_param_type.do_not_write_original_quals}
+ '
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each base in each read, this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc). Using these values as a key in a large hashmap, the walker calculates an empirical base quality score and overwrites the quality score currently in the read. This walker then outputs a new BAM file with these updated (recalibrated) reads. Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
+
+For more information on base quality score recalibration using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_readutils_PrintReads.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: PrintReads accepts an aligned BAM file and a recalibration (gatk_report) file as input.
+
+
+**Outputs**
+
+The output is in BAM format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ default_read_group If a read has no read group then default to the provided String.
+ default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.
+ force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.
+ force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.
+ window_size_nqs The window size used by MinimumNQSCovariate for its calculation
+ homopolymer_nback The number of previous bases to look at in HomopolymerCovariate
+ exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1
+ solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS)
+ solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ)
+ recal_file Filename for the input covariates table recalibration .gatk_report file
+ out The output BAM file
+ bam_compression Compression level to use for writing BAM files
+ disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier
+ preserve_qscores_less_than Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases
+ smoothing Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1
+ max_quality_score The integer value at which to cap the quality scores, default=50
+ doNotWriteOriginalQuals If true, we will not write the original quality (OQ) tag for each read
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d readme.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,98 @@
+Galaxy wrapper for GATK2
+========================
+
+This wrapper is copyright 2013 by Björn Grüning, Jim Johnson & the Galaxy Team.
+
+The Genome Analysis Toolkit or GATK is a software package developed at the
+Broad Institute to analyse next-generation resequencing data. The toolkit offers
+a wide variety of tools, with a primary focus on variant discovery and
+genotyping as well as strong emphasis on data quality assurance. Its robust
+architecture, powerful processing engine and high-performance computing features
+make it capable of taking on projects of any size.
+
+http://www.broadinstitute.org/gatk
+http://www.broadinstitute.org/gatk/about/citing-gatk
+
+GATK is free for academic use, but a fee applies for commercial use. Please study the GATK licensing website:
+http://www.broadinstitute.org/gatk/about/#licensing
+
+
+Installation
+============
+
+The recommended installation is by means of the toolshed_.
+
+.. _toolshed: http://toolshed.g2.bx.psu.edu/view/iuc/gatk2
+
+Galaxy should be able to install samtools dependencies automatically
+for you. GATK2, and its new licence model, does not allow us to distribute the GATK binaries.
+As a consequence, you need to install GATK2 on your own; please see the GATK website for more information:
+
+http://www.broadinstitute.org/gatk/download
+
+Once you have installed GATK2, you need to edit the env.sh files that are installed together with the wrappers.
+You must edit the GATK2_PATH environment variable in the file:
+
+/environment_settings/GATK2_PATH/iuc/gatk2//env.sh
+
+to point to the folder where you have installed GATK2.
+
+Optionally, you may also want to edit the GATK2_SITE_OPTIONS environment variable in the file:
+
+/environment_settings/GATK2_SITE_OPTIONS/iuc/gatk2//env.sh
+
+to deactivate the 'call home feature' of GATK with something like:
+
+GATK2_SITE_OPTIONS='-et NO_ET -K /data/gatk2_key_file'
+
+GATK2_SITE_OPTIONS can also be used to insert other specific options into every GATK2 wrapper
+at runtime, without changing the actual wrapper.
+
+Read more about the "Phone Home" problem at:
+http://www.broadinstitute.org/gatk/guide/article?id=1250
+
+Optionally, you may also want to add some commands to be executed before GATK (e.g. to load modules) to the file:
+::
+ /gatk2/default/env.sh
+
+Note that due to the manual nature of the GATK2 installation you will be getting the
+following warnings in the Galaxy log (unless you specified the env.sh in the previous paragraph):
+::
+ Failed to resolve dependency on 'gatk2', ignoring.
+
+This is because the
+::
+ gatk2
+is specified but never resolved in the tool_dependencies.xml. It is safe to ignore.
+
+Finally, you should fill in additional information about your genomes and
+annotations in the gatk2_picard_index.loc and gatk2_annotations.txt.
+You can find them in the tool-data/ Galaxy directory.
+
+History
+=======
+
+* v0.1 - Initial public release
+* v2.8.0 - Bugfix release, increase version number to reflect the underlying GATK version
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff -r 68426930d59c -r 01ff8dd37d4d realigner_target_creator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/realigner_target_creator.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,174 @@
+
+ for use in local realignment
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+ #if str( $reference_source.input_bam.metadata.bam_index ) != "None":
+ -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+ #end if
+ -p '
+ @JAR_PATH@
+ -T "RealignerTargetCreator"
+ -o "${output_interval}"
+
+ \$GATK2_SITE_OPTIONS
+
+ ## according to http://www.broadinstitute.org/gatk/guide/article?id=1975
+ --num_cpu_threads_per_data_thread 1
+
+ @THREADS@
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ '
+ #set $rod_binding_names = dict()
+ #for $rod_binding in $rod_bind:
+ #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+ #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+ #else
+ #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+ #end if
+ #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+ -d "-known:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+ #end for
+
+ $allow_n_cigar_reads
+ #include source=$standard_gatk_options#
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ --minReadsAtLocus "${analysis_param_type.minReadsAtLocus}"
+ --windowSize "${analysis_param_type.windowSize}"
+ --mismatchFraction "${analysis_param_type.mismatchFraction}"
+ --maxIntervalSize "${analysis_param_type.maxIntervalSize}"
+ '
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string.
+
+For more information on local realignment around indels using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_indels_RealignerTargetCreator.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: RealignerTargetCreator accepts an aligned BAM input file.
+
+
+**Outputs**
+
+The output is in GATK Interval format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ windowSize window size for calculating entropy or SNP clusters
+ mismatchFraction fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1
+ minReadsAtLocus minimum reads at a locus to enable using the entropy calculation
+ maxIntervalSize maximum interval size
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d reduce_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/reduce_reads.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,228 @@
+
+ in BAM files
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+ #if str( $reference_source.input_bam.metadata.bam_index ) != "None":
+ -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+ #end if
+ -p '
+ @JAR_PATH@
+ -T "ReduceReads"
+ -o "${output_bam}"
+
+ \$GATK2_SITE_OPTIONS
+
+ ## according to http://www.broadinstitute.org/gatk/guide/article?id=1975
+ --num_cpu_threads_per_data_thread 1
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ #if str($input_recal) != 'None':
+ --BQSR "${input_recal}"
+ #end if
+ --disable_bam_indexing
+ '
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ #if $analysis_param_type.context_size.__str__.strip() != '':
+ --context_size $analysis_param_type.context_size
+ #end if
+ #if $analysis_param_type.downsample_coverage.__str__.strip() != '':
+ --downsample_coverage $analysis_param_type.downsample_coverage
+ #end if
+ #if $analysis_param_type.minimum_del_proportion_to_trigger_variant.__str__.strip() != '':
+ --minimum_del_proportion_to_trigger_variant $analysis_param_type.minimum_del_proportion_to_trigger_variant
+ #end if
+ #if $analysis_param_type.minimum_mapping_quality.__str__.strip() != '':
+ --minimum_mapping_quality $analysis_param_type.minimum_mapping_quality
+ #end if
+ #if $analysis_param_type.minimum_tail_qualities.__str__.strip() != '':
+ --minimum_tail_qualities $analysis_param_type.minimum_tail_qualities
+ #end if
+ #if $analysis_param_type.minimum_base_quality_to_consider.__str__.strip() != '':
+ --minimum_base_quality_to_consider $analysis_param_type.minimum_base_quality_to_consider
+ #end if
+ #if $analysis_param_type.minimum_alt_proportion_to_trigger_variant.__str__.strip() != '':
+ --minimum_alt_proportion_to_trigger_variant $analysis_param_type.minimum_alt_proportion_to_trigger_variant
+ #end if
+ $analysis_param_type.allow_polyploid_reduction
+ $analysis_param_type.dont_compress_read_names
+ $analysis_param_type.dont_hardclip_low_qual_tails
+ $analysis_param_type.dont_simplify_reads
+ $analysis_param_type.dont_use_softclipped_bases
+ $analysis_param_type.hard_clip_to_interval
+ $analysis_param_type.dont_hardclip_adaptor_sequences
+ '
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+ReduceReads
+Reduces the BAM file using read based compression that keeps only essential information for variant calling
+
+This walker will generate reduced versions of the BAM files that still follow the BAM spec and contain all the information necessary for the GSA variant calling pipeline. Some options allow you to tune how much compression you want to achieve. The default values have been shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the savings in file size and performance of the downstream tools.
+
+.. For more information on using read based compression in the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_compression_reducereads_ReduceReads.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: ReduceReads accepts an aligned BAM input file and, optionally, a recalibration file.
+
+
+**Outputs**
+
+The output is in BAM format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ --allow_polyploid_reduction / -polyploid ( boolean with default value false )
+ Allow the experimental polyploid-based reduction capabilities of this tool
+
+ --context_size / -cs ( int with default value 10 )
+ The number of bases to keep around mismatches (potential variation)
+
+ --dont_compress_read_names / -nocmp_names ( boolean with default value false )
+ Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
+
+ --dont_hardclip_low_qual_tails / -noclip_tail ( boolean with default value false )
+ Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail quality.
+
+ --dont_simplify_reads / -nosimplify ( boolean with default value false )
+ Do not simplify read (strip away all extra information of the read -- anything other than bases, quals and read group).
+
+ --dont_use_softclipped_bases / -no_soft ( boolean with default value false )
+ Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped base left by the aligner and use the high quality soft clipped bases in its traversal algorithm to identify variant regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
+
+ --downsample_coverage / -ds ( int with default value 250 )
+ Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). A value of 0 turns downsampling off.
+
+ --hard_clip_to_interval / -clip_int ( boolean with default value false )
+ Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval border.
+
+ -mindel / --minimum_del_proportion_to_trigger_variant ( double with default value 0.05 )
+ Minimum proportion of indels in a site to trigger a variant region. Anything below this will be considered consensus.
+
+ --minimum_mapping_quality / -minmap ( int with default value 20 )
+ The minimum mapping quality to be considered for the consensus synthetic read. Reads that have mapping quality below this threshold will not be counted towards consensus, but are still counted towards variable regions.
+
+ --minimum_tail_qualities / -mintail ( byte with default value 2 )
+ Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
+
+ -minqual / --minimum_base_quality_to_consider ( byte with default value 20 )
+ The minimum base quality to be considered for the consensus synthetic read. Reads that have base quality below this threshold will not be counted towards consensus, but are still counted towards variable regions.
+
+ -minvar / --minimum_alt_proportion_to_trigger_variant ( double with default value 0.05 )
+ Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be considered consensus.
+
+ -noclip_ad / --dont_hardclip_adaptor_sequences ( boolean with default value false )
+ Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired. The program will behave correctly in those cases.
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d test-data/gatk/fake_phiX_reads_1.bam
Binary file test-data/gatk/fake_phiX_reads_1.bam has changed
diff -r 68426930d59c -r 01ff8dd37d4d test-data/gatk/fake_phiX_variant_locations.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gatk/fake_phiX_variant_locations.bed Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,2 @@
+phiX174 1442 1443
+phiX174 1445 1446
diff -r 68426930d59c -r 01ff8dd37d4d test-data/gatk/fake_phiX_variant_locations.vcf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gatk/fake_phiX_variant_locations.vcf Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,70 @@
+##fileformat=VCFv4.1
+##samtoolsVersion=0.1.18 (r982:295)
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A Fake phiX Sample
+phiX174 1411 . A . 28.2 . DP=1;;AC1=2;FQ=-30 PL 0
+phiX174 1412 . G . 28.2 . DP=3;;AC1=2;FQ=-30 PL 0
+phiX174 1413 . C . 28.2 . DP=5;;AC1=2;FQ=-30 PL 0
+phiX174 1414 . G . 28.2 . DP=6;;AC1=2;FQ=-30 PL 0
+phiX174 1415 . C . 28.2 . DP=7;;AC1=2;FQ=-30 PL 0
+phiX174 1416 . C . 28.2 . DP=8;;AC1=2;FQ=-30 PL 0
+phiX174 1417 . G . 28.2 . DP=9;;AC1=2;FQ=-30 PL 0
+phiX174 1418 . T . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1419 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1420 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1421 . A . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1422 . T . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1423 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1424 . C . 28.2 . DP=10;VDB=0.0005;;AC1=2;FQ=-30 PL 0
+phiX174 1425 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1426 . T . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1427 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1428 . A . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1429 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1430 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1431 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1432 . T . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1433 . A . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1434 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1435 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1436 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1437 . A . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1438 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1439 . G . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1440 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1441 . T . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1442 . A . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1443 . A . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1444 . C . 28.2 . DP=7;;AC1=2;FQ=-30 PL 0
+phiX174 1445 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1446 . C . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1447 . T . 28.2 . DP=10;;AC1=2;FQ=-30 PL 0
+phiX174 1448 . A . 28.2 . DP=8;;AC1=2;FQ=-30 PL 0
+phiX174 1449 . A . 28.2 . DP=6;;AC1=2;FQ=-30 PL 0
+phiX174 1450 . T . 28.2 . DP=4;;AC1=2;FQ=-30 PL 0
+phiX174 1451 . G . 28.2 . DP=3;;AC1=2;FQ=-30 PL 0
+phiX174 1452 . A . 28.2 . DP=2;;AC1=2;FQ=-30 PL 0
+phiX174 1453 . G . 28.2 . DP=1;;AC1=2;FQ=-30 PL 0
diff -r 68426930d59c -r 01ff8dd37d4d test-data/gatk/gatk_analyze_covariates/gatk_analyze_covariates_out_1.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gatk/gatk_analyze_covariates/gatk_analyze_covariates_out_1.html Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,34 @@
+
+
+Galaxy - GATK Output
+
+
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,26 @@
+
+
+
+ /please set the path to your GATK2 dir in the corresponding env.sh file/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d unified_genotyper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/unified_genotyper.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,289 @@
+
+ SNP and indel caller
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ @BAM_INPUTS@
+ -p '
+ @JAR_PATH@
+ -T "UnifiedGenotyper"
+ @THREADS@
+ --out "${output_vcf}"
+ --metrics_file "${output_metrics}"
+ \$GATK2_SITE_OPTIONS
+
+ ## according to http://www.broadinstitute.org/gatk/guide/article?id=1975
+ --num_cpu_threads_per_data_thread 1
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ --genotype_likelihoods_model "${genotype_likelihoods_model}"
+ --standard_min_confidence_threshold_for_calling "${standard_min_confidence_threshold_for_calling}"
+ --standard_min_confidence_threshold_for_emitting "${standard_min_confidence_threshold_for_emitting}"
+ '
+ @DBSNP_OPTIONS@
+ $allow_n_cigar_reads
+ #include source=$standard_gatk_options#
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ --heterozygosity "${analysis_param_type.heterozygosity}"
+ --pcr_error_rate "${analysis_param_type.pcr_error_rate}"
+ --genotyping_mode "${analysis_param_type.genotyping_mode_type.genotyping_mode}"
+ #if str( $analysis_param_type.genotyping_mode_type.genotyping_mode ) == 'GENOTYPE_GIVEN_ALLELES':
+ --alleles "${analysis_param_type.genotyping_mode_type.input_alleles_rod}"
+ #end if
+ --output_mode "${analysis_param_type.output_mode}"
+ ${analysis_param_type.compute_SLOD}
+ --min_base_quality_score "${analysis_param_type.min_base_quality_score}"
+ --max_deletion_fraction "${analysis_param_type.max_deletion_fraction}"
+ --max_alternate_alleles "${analysis_param_type.max_alternate_alleles}"
+ --min_indel_count_for_genotyping "${analysis_param_type.min_indel_count_for_genotyping}"
+ --indel_heterozygosity "${analysis_param_type.indel_heterozygosity}"
+ --indelGapContinuationPenalty "${analysis_param_type.indelGapContinuationPenalty}"
+ --indelGapOpenPenalty "${analysis_param_type.indelGapOpenPenalty}"
+ --indelHaplotypeSize "${analysis_param_type.indelHaplotypeSize}"
+ ${analysis_param_type.doContextDependentGapPenalties}
+ #if str( $analysis_param_type.annotation ) != "None":
+ #for $annotation in str( $analysis_param_type.annotation.fields.gatk_value ).split( ','):
+ --annotation "${annotation}"
+ #end for
+ #end if
+ #for $additional_annotation in $analysis_param_type.additional_annotations:
+ --annotation "${additional_annotation.additional_annotation_name}"
+ #end for
+ #if str( $analysis_param_type.group ) != "None":
+ #for $group in str( $analysis_param_type.group ).split( ','):
+ --group "${group}"
+ #end for
+ #end if
+ #if str( $analysis_param_type.exclude_annotations ) != "None":
+ #for $annotation in str( $analysis_param_type.exclude_annotations.fields.gatk_value ).split( ','):
+ --excludeAnnotation "${annotation}"
+ #end for
+ #end if
+ #if str( $analysis_param_type.sample_ploidy ) != '':
+ --sample_ploidy "$analysis_param_type.sample_ploidy"
+ #end if
+ '
+## #if str( $analysis_param_type.snpEff_rod_bind_type.snpEff_rod_bind_type_selector ) == 'set_snpEff':
+## -p '--annotation "SnpEff"'
+## -d "--snpEffFile:${analysis_param_type.snpEff_rod_bind_type.snpEff_rod_name},%(file_type)s" "${analysis_param_type.snpEff_rod_bind_type.snpEff_input_rod}" "${analysis_param_type.snpEff_rod_bind_type.snpEff_input_rod.ext}" "input_snpEff_${analysis_param_type.snpEff_rod_bind_type.snpEff_rod_name}"
+## #else:
+## -p '--excludeAnnotation "SnpEff"'
+## #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+A variant caller which unifies the approaches of several disparate callers. Works for single-sample and multi-sample data. The user can choose from several different incorporated calculation models.
+
+For more information on the GATK Unified Genotyper, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_genotyper_UnifiedGenotyper.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: UnifiedGenotyper accepts an aligned BAM input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ genotype_likelihoods_model Genotype likelihoods calculation model to employ -- BOTH is the default option, while INDEL is also available for calling indels and SNP is available for calling SNPs only (SNP|INDEL|BOTH)
+ heterozygosity Heterozygosity value used to compute prior likelihoods for any locus
+ pcr_error_rate The PCR error rate to be used for computing fragment-based likelihoods
+ genotyping_mode Specifies how to determine the alternate alleles to use for genotyping (DISCOVERY|GENOTYPE_GIVEN_ALLELES)
+ output_mode Should we output confident genotypes (i.e. including ref calls) or just the variants? (EMIT_VARIANTS_ONLY|EMIT_ALL_CONFIDENT_SITES|EMIT_ALL_SITES)
+ standard_min_confidence_threshold_for_calling The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called
+ standard_min_confidence_threshold_for_emitting The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)
+ noSLOD If provided, we will not calculate the SLOD
+ min_base_quality_score Minimum base quality required to consider a base for calling
+ max_deletion_fraction Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]
+ min_indel_count_for_genotyping Minimum number of consensus indels required to trigger genotyping run
+ indel_heterozygosity Heterozygosity for indel calling
+ indelGapContinuationPenalty Indel gap continuation penalty
+ indelGapOpenPenalty Indel gap open penalty
+ indelHaplotypeSize Indel haplotype size
+ doContextDependentGapPenalties Vary gap penalties by context
+ indel_recal_file Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE
+ indelDebug Output indel debug info
+ out File to which variants should be written
+ annotation One or more specific annotations to apply to variant calls
+ group One or more classes/groups of annotations to apply to variant calls
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_annotator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_annotator.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,249 @@
+
+
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ #if str( $reference_source.input_bam ) != "None":
+ -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+ #if str( $reference_source.input_bam.metadata.bam_index ) != "None":
+ -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+ #end if
+ #end if
+ -d "--variant" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant"
+ -p '
+ @JAR_PATH@
+ -T "VariantAnnotator"
+ \$GATK2_SITE_OPTIONS
+
+ @THREADS@
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ -o "${output_vcf}"
+ #if str( $annotations_type.annotations_type_selector ) == "use_all_annotations":
+ --useAllAnnotations
+ #else:
+ #if $annotations_type.annotations:
+ #for $annotation in str( $annotations_type.annotations.fields.gatk_value ).split( ',' ):
+ --annotation "${annotation}"
+ #end for
+ #end if
+ #end if
+ #if $exclude_annotations:
+ #for $annotation in str( $exclude_annotations.fields.gatk_value ).split( ',' ):
+ --excludeAnnotation "${annotation}"
+ #end for
+ #end if
+ #for $additional_annotation in $additional_annotations:
+ --annotation "${additional_annotation.additional_annotation_name}"
+ #end for
+ '
+ #if $reference_source.input_variant_bti:
+ -d "--intervals" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant_bti"
+ #end if
+
+ #for $rod_binding in $comp_rod_bind:
+ -d "--comp:${rod_binding.comp_rod_name},%(file_type)s" "${rod_binding.comp_input_rod}" "${rod_binding.comp_input_rod.ext}" "input_comp_${rod_binding.comp_rod_name}"
+ #end for
+
+ @DBSNP_OPTIONS@
+
+ #for $rod_binding in $resource_rod_bind:
+ -d "--resource:${rod_binding.resource_rod_name},%(file_type)s" "${rod_binding.resource_input_rod}" "${rod_binding.resource_input_rod.ext}" "input_resource_${rod_binding.resource_rod_name}"
+ #end for
+
+ #if str( $snpEff_rod_bind_type.snpEff_rod_bind_type_selector ) == 'set_snpEff':
+ -p '--annotation "SnpEff"'
+ -d "--snpEffFile:${snpEff_rod_bind_type.snpEff_rod_name},%(file_type)s" "${snpEff_rod_bind_type.snpEff_input_rod}" "${snpEff_rod_bind_type.snpEff_input_rod.ext}" "input_snpEff_${snpEff_rod_bind_type.snpEff_rod_name}"
+ #else:
+ -p '--excludeAnnotation "SnpEff"'
+ #end if
+
+ #for $expression in $expressions:
+ -p '--expression "${expression.expression}"'
+ #end for
+
+ #include source=$standard_gatk_options#
+
+ -p '
+ #if str( $annotation_group ) != "None":
+ #for $group in str( $annotation_group ).split( ',' ):
+ --group "${group}"
+ #end for
+ #end if
+ #if str( $family_string ) != "":
+ --family_string "${family_string}"
+ #end if
+ --MendelViolationGenotypeQualityThreshold "${mendel_violation_genotype_quality_threshold}"
+ '
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Annotates variant calls with context information. Users can specify which of the available annotations to use.
+
+For more information on using the VariantAnnotator, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_annotator_VariantAnnotator.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+
+**Inputs**
+
+GenomeAnalysisTK: VariantAnnotator accepts a variant input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ sampleName The sample (NA-ID) corresponding to the variant input (for non-VCF input only)
+ annotation One or more specific annotations to apply to variant calls
+ group One or more classes/groups of annotations to apply to variant calls
+ expression One or more specific expressions to apply to variant calls; see documentation for more details
+ useAllAnnotations Use all possible annotations (not for the faint of heart)
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_apply_recalibration.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_apply_recalibration.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,136 @@
+
+
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ #for $var_count, $variant in enumerate( $reference_source.input_variants ):
+ -d "--input:input_${var_count},%(file_type)s" "${variant}" "${variant.ext}" "input_variants_${var_count}"
+ #end for
+ -p '
+ @JAR_PATH@
+ -T "ApplyRecalibration"
+ \$GATK2_SITE_OPTIONS
+
+ @THREADS@
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ --recal_file "${reference_source.input_recal}"
+ --tranches_file "${reference_source.input_tranches}"
+ --out "${output_variants}"
+ '
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ -p '
+ --mode "${mode}"
+
+ #for $ignore_filter in $ignore_filters:
+ #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector )
+ #if $ignore_filter_name == "custom":
+ #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name )
+ #end if
+ --ignore_filter "${ignore_filter_name}"
+ #end for
+ --ts_filter_level "${ts_filter_level}"
+ '
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
+
+For more information on using the ApplyRecalibration module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantrecalibration_ApplyRecalibration.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: ApplyRecalibration accepts a variant input file, a recalibration file and a tranches file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ recal_file The input recal file used by ApplyRecalibration
+ tranches_file The input tranches file describing where to cut the data
+ out The output filtered, recalibrated VCF file
+ ts_filter_level The truth sensitivity level at which to start filtering
+ ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file
+ mode Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously. (SNP|INDEL|BOTH)
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_combine.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_combine.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,168 @@
+
+
+
+ gatk2_macros.xml
+
+
+
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+
+ #set $priority_order = []
+ #for $input_variant in $reference_source.input_variants:
+ -d "--variant:${input_variant.input_variant_name},%(file_type)s" "${input_variant.input_variant}" "${input_variant.input_variant.ext}" "input_variant_${input_variant.input_variant_name}"
+ #set $input_variant_name = str( $input_variant.input_variant_name )
+ #assert $input_variant_name not in $priority_order, "Variant Names must be unique" ##this should be handled by a validator
+ #silent $priority_order.append( $input_variant_name )
+ #end for
+ -p '
+ @JAR_PATH@
+ -T "CombineVariants"
+ --out "${output_variants}"
+ \$GATK2_SITE_OPTIONS
+
+ @THREADS@
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ --genotypemergeoption "${genotype_merge_option}"
+ --rod_priority_list "${ ','.join( $priority_order ) }"
+ '
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ --filteredrecordsmergetype "${analysis_param_type.filtered_records_merge_type}"
+ ${analysis_param_type.print_complex_merges}
+ ${analysis_param_type.filtered_are_uncalled}
+ ${analysis_param_type.minimal_vcf}
+ ${analysis_param_type.assume_identical_samples}
+
+ #if str( $analysis_param_type.set_key ):
+ --setKey "${analysis_param_type.set_key}"
+ #end if
+
+ --minimumN "${analysis_param_type.minimum_n}"
+ '
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Combines VCF records from different sources; supports both full merges and set unions. Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. Union: assumes each rod represents the same set of samples (although this is not enforced); using the priority list (if provided), emits a single record instance at every position represented in the rods.
+
+For more information on using the CombineVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_CombineVariants.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: CombineVariants accepts variant files as input.
+
+------
+
+**Outputs**
+
+The output is a combined vcf file.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ out File to which variants should be written
+ genotypemergeoption How should we merge genotype records for samples shared across the ROD files? (UNIQUIFY|PRIORITIZE|UNSORTED|REQUIRE_UNIQUE)
+ filteredrecordsmergetype How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered (KEEP_IF_ANY_UNFILTERED|KEEP_IF_ALL_UNFILTERED)
+ rod_priority_list When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided
+ printComplexMerges Print out interesting sites requiring complex compatibility merging
+ filteredAreUncalled If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF
+ minimalVCF If true, then the output VCF will contain no INFO or genotype INFO field
+ setKey Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.
+ assumeIdenticalSamples If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime.
+ minimumN Combine variants and output site only if variant is present in at least N input files.
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_eval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_eval.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,276 @@
+
+
+
+ gatk2_macros.xml
+
+
+
+
+ ## Command line for gatk2_wrapper.py running GATK VariantEval.
+ ## Wrapper flags: -d takes four values (GATK argument, original filename, Galaxy filetype, name prefix);
+ ## -p text is passed through to GATK without modification; -o text is hex-encoded here and decoded by the wrapper with binascii.unhexlify.
+ ## NOTE(review): %(file_type)s is emitted literally by this template; presumably the wrapper substitutes it -- confirm.
+ #from binascii import hexlify
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ ## One --eval binding per input variant file, named input_INDEX with a matching input_variants_INDEX prefix.
+ #for $var_count, $variant in enumerate( $reference_source.input_variants ):
+ -d "--eval:input_${var_count},%(file_type)s" "${variant}" "${variant.ext}" "input_variants_${var_count}"
+ #end for
+ ## Core GATK options; -R is only written here when the reference selector is not history.
+ -p '
+ @JAR_PATH@
+ -T "VariantEval"
+ --out "${output_report}"
+ \$GATK2_SITE_OPTIONS
+
+ @THREADS@
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ '
+
+ ## Comparison tracks: each is bound as --comp:NAME; when str( comp_known_names ) is non-empty the same NAME is also passed as --known_names.
+ #for $rod_binding in $comp_rod_bind:
+ -d "--comp:${rod_binding.comp_rod_name},%(file_type)s" "${rod_binding.comp_input_rod}" "${rod_binding.comp_input_rod.ext}" "input_comp_${rod_binding.comp_rod_name}"
+ #if str( $rod_binding.comp_known_names ):
+ -p '--known_names "${rod_binding.comp_rod_name}"'
+ #end if
+ #end for
+
+ ## Optional dbSNP track, bound as --dbsnp:NAME and optionally flagged as known in the same way.
+ #if $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector == 'set_dbsnp'
+ -d "--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}"
+ #if $dbsnp_rod_bind_type.dbsnp_known_names
+ -p '--known_names "${dbsnp_rod_bind_type.dbsnp_rod_name}"'
+ #end if
+ #end if
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ #for $stratification in $analysis_param_type.stratifications:
+ ## Each select_exps/select_name pair is hex-encoded and sent with -o (presumably so the single quotes around the expression survive shell quoting -- confirm).
+ #set $select_string = "--select_exps '%s' --select_names '%s'" % ( str( $stratification.select_exps ), str( $stratification.select_name ) )
+ -o '${ hexlify( $select_string ) }'
+ #end for
+ -p '
+
+ #for $sample in $analysis_param_type.samples:
+ --sample "${sample.sample}"
+ #end for
+
+ #if str( $analysis_param_type.stratification_modules ) != "None":
+ #for $stratification_module in str( $analysis_param_type.stratification_modules).split( ',' ):
+ --stratificationModule "${stratification_module}"
+ #end for
+ #end if
+
+ ${analysis_param_type.do_not_use_all_standard_stratifications}
+
+ #for $variant_type in $analysis_param_type.only_variants_of_type:
+ --onlyVariantsOfType "${variant_type.variant_type}"
+ #end for
+
+ #if str( $analysis_param_type.eval_modules ) != "None":
+ #for $eval_module in str( $analysis_param_type.eval_modules).split( ',' ):
+ --evalModule "${eval_module}"
+ #end for
+ #end if
+
+ ${analysis_param_type.do_not_use_all_standard_modules}
+
+ #if str( $analysis_param_type.num_samples ) != "0":
+ --numSamples "${analysis_param_type.num_samples}"
+ #end if
+
+ --minPhaseQuality "${analysis_param_type.min_phase_quality}"
+
+ --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}"
+
+ #if str( $analysis_param_type.ancestral_alignments ) != "None":
+ --ancestralAlignments "${analysis_param_type.ancestral_alignments}"
+ #end if
+ '
+ ## Optional datasets are only registered when the string value of the parameter is not None.
+ #if str( $analysis_param_type.known_cnvs ) != "None":
+ -d "--knownCNVs" "${analysis_param_type.known_cnvs}" "${analysis_param_type.known_cnvs.ext}" "input_known_cnvs"
+ #end if
+
+ #if str( $analysis_param_type.strat_intervals ) != "None":
+ -d "--stratIntervals" "${analysis_param_type.strat_intervals}" "${analysis_param_type.strat_intervals.ext}" "input_strat_intervals"
+ #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ^\w+$
+
+
+
+
+
+
+
+
+
+
+
+
+ ^\w+$
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
+
+For more information on using the VariantEval module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_varianteval_VariantEval.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: VariantEval accepts variant files as input.
+
+
+**Outputs**
+
+The output is a table of variant evaluation.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ out An output file presented to the walker. Will overwrite contents if file exists.
+ list List the available eval modules and exit
+ select_exps One or more stratifications to use when evaluating the data
+ select_names Names to use for the list of stratifications (must be a 1-to-1 mapping)
+ sample Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context
+ known_names Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets
+ stratificationModule One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)
+ doNotUseAllStandardStratifications Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)
+ onlyVariantsOfType If provided, only variants of these types will be considered during the evaluation
+ evalModule One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)
+ doNotUseAllStandardModules Do not use the standard modules by default (instead, only those that are specified with the -E option)
+ numSamples Number of samples (used if no samples are available in the VCF file)
+ minPhaseQuality Minimum phasing quality
+ mendelianViolationQualThreshold Minimum genotype QUAL score for each trio member required to accept a site as a violation
+ ancestralAlignments Fasta file with ancestral alleles
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_filtration.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_filtration.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,178 @@
+
+ on VCF files
+
+ gatk2_macros.xml
+
+
+
+
+ ## Command line for gatk2_wrapper.py running GATK VariantFiltration.
+ ## Wrapper flags: -d takes four values (GATK argument, original filename, Galaxy filetype, name prefix);
+ ## -p text is passed through to GATK without modification; -o text is hex-encoded here and decoded by the wrapper with binascii.unhexlify.
+ ## NOTE(review): %(file_type)s is emitted literally by this template; presumably the wrapper substitutes it -- confirm.
+ #from binascii import hexlify
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant"
+ ## The -o inside the quoted -p text below belongs to GATK (output VCF); it is not the hex-encoded -o flag of the wrapper.
+ -p '
+ @JAR_PATH@
+ -T "VariantFiltration"
+ \$GATK2_SITE_OPTIONS
+
+ -o "${output_vcf}"
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ '
+ #for $variant_filter in $variant_filters:
+ ## is_genotype_filter supplies the option-name prefix for both the Expression and the Name option.
+ ## The loop variable is rebound to the formatted string, which is then hex-encoded for -o.
+ #set $variant_filter = "--%sExpression '%s' --%sName '%s'" % ( str( $variant_filter.is_genotype_filter ), str( $variant_filter.filter_expression ), str( $variant_filter.is_genotype_filter ), str( $variant_filter.filter_name ) )
+ -o '${ hexlify( $variant_filter ) }'
+ #end for
+
+ ## Optional mask track: bound as --mask:NAME, with NAME reused as the --maskName value.
+ #if str( $mask_rod_bind_type.mask_rod_bind_type_selector ) == 'set_mask':
+ -d "--mask:${mask_rod_bind_type.mask_rod_name},%(file_type)s" "${mask_rod_bind_type.input_mask_rod}" "${mask_rod_bind_type.input_mask_rod.ext}" "input_mask_${mask_rod_bind_type.mask_rod_name}"
+ -p '
+ --maskExtension "${mask_rod_bind_type.mask_extension}"
+ --maskName "${mask_rod_bind_type.mask_rod_name}"
+ '
+ #end if
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ #if $cluster_snp_type.cluster_snp_type_selector == "cluster_snp":
+ -p '
+ --clusterSize "${cluster_snp_type.cluster_size}"
+ --clusterWindowSize "${cluster_snp_type.cluster_window_size}"
+ '
+ #end if
+ ## Always emitted; the parameter value itself is the text handed to GATK (presumably a flag or an empty string -- confirm against the parameter definition).
+ -p '${missing_values_in_expressions_should_evaluate_as_failing}'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Filters variant calls using a number of user-selectable, parameterizable criteria.
+
+For more information on using the VariantFiltration module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_filters_VariantFiltration.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: VariantFiltration accepts a VCF input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ filterExpression One or more expressions used with INFO fields to filter (see wiki docs for more info)
+ filterName Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered
+ genotypeFilterExpression One or more expressions used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)
+ genotypeFilterName Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered
+ clusterSize The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]
+ clusterWindowSize The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]
+ maskName The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']
+ missingValuesInExpressionsShouldEvaluateAsFailing When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_recalibrator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_recalibrator.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,411 @@
+
+
+
+ gatk2_macros.xml
+
+
+ ggplot2
+
+
+
+ ## Command line for gatk2_wrapper.py running GATK VariantRecalibrator, followed by a move of the generated PDF.
+ ## Wrapper flags: -d takes four values (GATK argument, original filename, Galaxy filetype, name prefix);
+ ## -p text is passed through to GATK without modification.
+ ## NOTE(review): %(file_type)s is emitted literally by this template; presumably the wrapper substitutes it -- confirm.
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ ## One --input binding per input variant file, named input_INDEX with a matching input_variants_INDEX prefix.
+ #for $var_count, $variant in enumerate( $reference_source.input_variants ):
+ -d "--input:input_${var_count},%(file_type)s" "${variant}" "${variant.ext}" "input_variants_${var_count}"
+ #end for
+ -p '
+ @JAR_PATH@
+ -T "VariantRecalibrator"
+ \$GATK2_SITE_OPTIONS
+
+ @THREADS@
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ --recal_file "${output_recal}"
+ --tranches_file "${output_tranches}"
+ --rscript_file "${output_rscript}"
+ '
+
+ ## rod_binding_names maps each binding name to how many times it has been seen so far (0 for the first occurrence).
+ ## That count is appended to the name prefix of the -d argument, so repeated binding names still get distinct prefixes.
+ #set $rod_binding_names = dict()
+ #for $rod_binding in $rod_bind:
+ ## Binding name: the custom name, the custom name prefixed with comp, or otherwise the selector value itself.
+ #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+ #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+ #elif str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'comp':
+ #set $rod_bind_name = "comp" + $rod_binding.rod_bind_type.custom_rod_name
+ #else
+ #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+ #end if
+ #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+ ## Resources that are not marked as not_training_truth_known carry known/training/truth/bad/prior tags in the binding.
+ #if $rod_binding.rod_bind_type.rod_training_type.rod_training_type_selector == "not_training_truth_known":
+ -d "--resource:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+ #else:
+ -d "--resource:${rod_bind_name},%(file_type)s,known=${rod_binding.rod_bind_type.rod_training_type.known},training=${rod_binding.rod_bind_type.rod_training_type.training},truth=${rod_binding.rod_bind_type.rod_training_type.truth},bad=${rod_binding.rod_bind_type.rod_training_type.bad},prior=${rod_binding.rod_bind_type.rod_training_type.prior}" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+ #end if
+ #end for
+
+ #include source=$standard_gatk_options#
+
+ ##start analysis specific options
+ -p '
+ #if str( $annotations ) != "None":
+ #for $annotation in str( $annotations.fields.gatk_value ).split( ',' ):
+ --use_annotation "${annotation}"
+ #end for
+ #end if
+ #for $additional_annotation in $additional_annotations:
+ --use_annotation "${additional_annotation.additional_annotation_name}"
+ #end for
+ --mode "${mode}"
+ '
+
+ ## Advanced options. ts_tranche is split on commas, entries are stripped and empty entries dropped, giving one --TStranche per value.
+ ## For ignore filters, the selector value is used as the filter name unless it is custom, in which case filter_name is used.
+ #if $analysis_param_type.analysis_param_type_selector == "advanced":
+ -p '
+ --maxGaussians "${analysis_param_type.max_gaussians}"
+ --maxIterations "${analysis_param_type.max_iterations}"
+ --numKMeans "${analysis_param_type.num_k_means}"
+ --stdThreshold "${analysis_param_type.std_threshold}"
+ --shrinkage "${analysis_param_type.shrinkage}"
+ --dirichlet "${analysis_param_type.dirichlet}"
+ --priorCounts "${analysis_param_type.prior_counts}"
+
+ --minNumBadVariants "${analysis_param_type.min_num_bad_variants}"
+
+ --target_titv "${analysis_param_type.target_titv}"
+ #for $tranche in [ $tranche.strip() for $tranche in str( $analysis_param_type.ts_tranche ).split( ',' ) if $tranche.strip() ]
+ --TStranche "${tranche}"
+ #end for
+ #for $ignore_filter in $analysis_param_type.ignore_filters:
+ #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector )
+ #if $ignore_filter_name == "custom":
+ #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name )
+ #end if
+ --ignore_filter "${ignore_filter_name}"
+ #end for
+ '
+ #end if
+
+ ## Only when the wrapper exits successfully, the file named like the rscript output plus .pdf is moved to the tranches PDF output.
+ ## NOTE(review): presumably GATK writes that PDF next to the rscript file -- confirm.
+ &&
+ mv "${output_rscript}.pdf" "${output_tranches_pdf}"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+ Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates each variant -- assigning it an informative LOD score.
+
+For more information on using the VariantRecalibrator module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantrecalibration_VariantRecalibrator.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: VariantRecalibrator accepts a variant input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ tranches_file The output tranches file used by ApplyRecalibration
+ use_annotation The names of the annotations which should be used for calculations
+ mode Recalibration mode to employ: 1.) SNP for recalibrating only snps (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both snps and indels simultaneously. (SNP|INDEL|BOTH)
+ maxGaussians The maximum number of Gaussians to try during variational Bayes algorithm
+ maxIterations The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected.
+ numKMeans The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.
+ stdThreshold If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.
+ shrinkage The shrinkage parameter in variational Bayes algorithm.
+ dirichlet The dirichlet parameter in variational Bayes algorithm.
+ priorCounts The number of prior counts to use in variational Bayes algorithm.
+ minNumBadVariants The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants.
+ recal_file The output recal file used by ApplyRecalibration
+ target_titv The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!
+ TStranche The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)
+ ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file
+ path_to_Rscript The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript
+ rscript_file The output rscript file generated by the VQSR to aid in visualization of the input data and learned model
+ path_to_resources Path to resources folder holding the Sting R scripts.
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_select.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_select.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,283 @@
+
+ from VCF files
+
+ gatk2_macros.xml
+
+
+
+
+ ## Command line for gatk2_wrapper.py running GATK SelectVariants.
+ ## Wrapper flags: -d takes four values (GATK argument, original filename, Galaxy filetype, name prefix);
+ ## -p text is passed through to GATK without modification; -o text is hex-encoded here and decoded by the wrapper with binascii.unhexlify.
+ ## NOTE(review): %(file_type)s is emitted literally by this template; presumably the wrapper substitutes it -- confirm.
+ #from binascii import hexlify
+
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant"
+ ## The -o inside the quoted -p text below belongs to GATK (output VCF); it is not the hex-encoded -o flag of the wrapper.
+ -p '
+ @JAR_PATH@
+ -T "SelectVariants"
+ \$GATK2_SITE_OPTIONS
+
+ @THREADS@
+ -o "${output_vcf}"
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ '
+ ## Basic selection options: concordance/discordance tracks are passed by path inside -p rather than registered with -d.
+ ## exclude_filtered is expanded as-is (presumably a flag string or empty, depending on the parameter definition -- confirm).
+ -p '
+ #if $input_concordance:
+ --concordance "${input_concordance}"
+ #end if
+ #if $input_discordance:
+ --discordance "${input_discordance}"
+ #end if
+
+ #for $exclude_sample_name in $exclude_sample_name_repeat:
+ --exclude_sample_name "${exclude_sample_name.exclude_sample_name}"
+ #end for
+
+ ${exclude_filtered}
+
+ #for $sample_name in $sample_name_repeat:
+ --sample_name "${sample_name.sample_name}"
+ #end for
+ '
+
+ #for $select_expressions in $select_expressions_repeat:
+ ## Each expression is wrapped in single quotes and hex-encoded for -o (presumably so the quoting survives the shell -- confirm).
+ #set $select_expression = "--select_expressions '%s'" % ( str( $select_expressions.select_expressions ) )
+ -o '${ hexlify( $select_expression ) }'
+ #end for
+
+ ##start tool specific options
+ #if str( $analysis_param_type.analysis_param_type_selector ) == 'advanced':
+ -p '
+ #for $esf in $analysis_param_type.exclude_sample_file:
+ --exclude_sample_file "${esf}"
+ #end for
+
+ #for $sf in $analysis_param_type.sample_file:
+ --sample_file "${sf}"
+ #end for
+
+ #if $analysis_param_type.input_keep_ids:
+ --keepIDs "${analysis_param_type.input_keep_ids}"
+ #end if
+
+ ${analysis_param_type.keep_original_AC}
+
+ ${analysis_param_type.mendelian_violation}
+
+ --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}"
+
+ --remove_fraction_genotypes "${analysis_param_type.remove_fraction_genotypes}"
+
+ --restrictAllelesTo "${analysis_param_type.restrict_alleles_to}"
+
+ #if str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_fraction':
+ --select_random_fraction "${analysis_param_type.select_random_type.select_random_fraction}"
+ #elif str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_number':
+ --select_random_number "${analysis_param_type.select_random_type.select_random_number}"
+ #end if
+
+ #if $analysis_param_type.select_type_to_include:
+ #for $type_to_include in str( $analysis_param_type.select_type_to_include ).split( ',' ):
+ --selectTypeToInclude "${type_to_include}"
+ #end for
+ #end if
+
+ ${analysis_param_type.exclude_non_variants}
+ '
+
+ #for $sample_expressions in $analysis_param_type.sample_expressions_repeat:
+ ## Sample regular expressions are hex-encoded for -o in the same way as the select expressions.
+ #set $sample_expression = "--sample_expressions '%s'" % ( str( $sample_expressions.sample_expressions ) )
+ -o '${ hexlify( $sample_expression ) }'
+ #end for
+
+ #end if
+ ##end tool specific options
+
+ #include source=$standard_gatk_options#
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are documented in the `Using JEXL expressions section <http://gatkforums.broadinstitute.org/discussion/1255/what-are-jexl-expressions-and-how-can-i-use-them-with-the-gatk>`_. One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
+
+For more information on using the SelectVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_SelectVariants.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: SelectVariants accepts a VCF input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ out VCFWriter stdout File to which variants should be written
+ variant RodBinding[VariantContext] NA Input VCF file
+ concordance RodBinding[VariantContext] none Output variants that were also called in this comparison track
+ discordance RodBinding[VariantContext] none Output variants that were not called in this comparison track
+ exclude_sample_file Set[File] [] File containing a list of samples (one per line) to exclude. Can be specified multiple times
+ exclude_sample_name Set[String] [] Exclude genotypes from this sample. Can be specified multiple times
+ excludeFiltered boolean false Don't include filtered loci in the analysis
+ excludeNonVariants boolean false Don't include loci found to be non-variant after the subsetting procedure
+ keepIDs File NA Only emit sites whose ID is found in this file (one ID per line)
+ keepOriginalAC boolean false Don't update the AC, AF, or AN values in the INFO field after selecting
+ mendelianViolation Boolean false output mendelian violation sites only
+ mvq double 0.0 Minimum genotype QUAL score for each trio member required to accept a site as a violation
+ remove_fraction_genotypes double 0.0 Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall
+ restrictAllelesTo NumberAlleleRestriction ALL Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC
+ sample_expressions Set[String] NA Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times
+ sample_file Set[File] NA File containing a list of samples (one per line) to include. Can be specified multiple times
+ sample_name Set[String] [] Include genotypes from this sample. Can be specified multiple times
+ select_expressions ArrayList[String] [] One or more criteria to use when selecting the data
+ select_random_fraction double 0.0 Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track
+ select_random_number int 0 Selects a number of variants at random from the variant track
+ selectTypeToInclude List[Type] [] Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times
+
+@CITATION_SECTION@
+
+
+
diff -r 68426930d59c -r 01ff8dd37d4d variant_validate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_validate.xml Sat Jun 01 07:20:41 2019 -0400
@@ -0,0 +1,106 @@
+
+
+
+ gatk2_macros.xml
+
+
+
+
+ ## Command line for gatk2_wrapper.py running GATK ValidateVariants.
+ ## Wrapper flags: -d takes four values (GATK argument, original filename, Galaxy filetype, name prefix);
+ ## -p text is passed through to GATK without modification.
+ ## NOTE(review): %(file_type)s is emitted literally by this template; presumably the wrapper substitutes it -- confirm.
+ gatk2_wrapper.py
+ --stdout "${output_log}"
+ -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant"
+ ## warn_on_errors and do_not_validate_filtered_records are expanded as-is inside the -p text
+ ## (presumably each yields a GATK flag or an empty string, depending on the parameter definition -- confirm).
+ -p '
+ @JAR_PATH@
+ -T "ValidateVariants"
+
+ \$GATK2_SITE_OPTIONS
+
+ #if $reference_source.reference_source_selector != "history":
+ -R "${reference_source.ref_file.fields.path}"
+ #end if
+ ${warn_on_errors}
+ ${do_not_validate_filtered_records}
+ '
+ ## The dbSNP handling comes from a macro token whose expansion is not visible in this file (presumably defined in gatk2_macros.xml).
+ @DBSNP_OPTIONS@
+
+ #include source=$standard_gatk_options#
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Validates a variants file.
+
+For more information on using the ValidateVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_ValidateVariants.html>`_.
+
+To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_.
+
+If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: ValidateVariants accepts variant files as input.
+
+
+**Outputs**
+
+The output is a log of variant validation.
+
+
+Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ doNotValidateFilteredRecords should we skip validation on filtered records?
+ warnOnErrors should we just emit warnings on errors instead of terminating the run?
+
+@CITATION_SECTION@
+
+
+