Mercurial > repos > lz_hust > gatktools
diff variant_select.xml @ 15:01ff8dd37d4d draft default tip
Uploaded
author | lz_hust |
---|---|
date | Sat, 01 Jun 2019 07:20:41 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_select.xml Sat Jun 01 07:20:41 2019 -0400 @@ -0,0 +1,283 @@ +<tool id="gatk2_variant_select" name="Select Variants" version="@VERSION@.2"> + <description>from VCF files</description> + <macros> + <import>gatk2_macros.xml</import> + </macros> + <expand macro="requirements" /> + <expand macro="version_command" /> + <command interpreter="python"> + #from binascii import hexlify + + gatk2_wrapper.py + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p ' + @JAR_PATH@ + -T "SelectVariants" + \$GATK2_SITE_OPTIONS + + @THREADS@ + -o "${output_vcf}" + + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + -p ' + #if $input_concordance: + --concordance "${input_concordance}" + #end if + #if $input_discordance: + --discordance "${input_discordance}" + #end if + + #for $exclude_sample_name in $exclude_sample_name_repeat: + --exclude_sample_name "${exclude_sample_name.exclude_sample_name}" + #end for + + ${exclude_filtered} + + #for $sample_name in $sample_name_repeat: + --sample_name "${sample_name.sample_name}" + #end for + ' + + #for $select_expressions in $select_expressions_repeat: + #set $select_expression = "--select_expressions '%s'" % ( str( $select_expressions.select_expressions ) ) + -o '${ hexlify( $select_expression ) }' + #end for + + ##start tool specific options + #if str( $analysis_param_type.analysis_param_type_selector ) == 'advanced': + -p ' + #for $esf in $analysis_param_type.exclude_sample_file: + --exclude_sample_file "${esf}" + #end for + + #for $sf in $analysis_param_type.sample_file: + --sample_file "${sf}" + #end for + + #if $analysis_param_type.input_keep_ids: + --keepIDs "${analysis_param_type.input_keep_ids}" + #end if + + ${analysis_param_type.keep_original_AC} + + ${analysis_param_type.mendelian_violation} + + --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}" + + --remove_fraction_genotypes "${analysis_param_type.remove_fraction_genotypes}" + + --restrictAllelesTo "${analysis_param_type.restrict_alleles_to}" + + #if str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_fraction': + --select_random_fraction "${analysis_param_type.select_random_type.select_random_fraction}" + #elif str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_number': + --select_random_number "${analysis_param_type.select_random_type.select_random_number}" + #end if + + #if $analysis_param_type.select_type_to_include: + #for $type_to_include in str( $analysis_param_type.select_type_to_include ).split( ',' ): + --selectTypeToInclude "${type_to_include}" + #end for + #end if + + ${analysis_param_type.exclude_non_variants} + ' + + #for $sample_expressions in $analysis_param_type.sample_expressions_repeat: + #set $sample_expression = "--sample_expressions '%s'" % ( str( $sample_expressions.sample_expressions ) ) + -o '${ hexlify( $sample_expression ) }' + #end for + + #end if + ##end tool specific options + + #include source=$standard_gatk_options# + </command> + <inputs> + <conditional name="reference_source"> + <expand macro="reference_source_selector_param" /> + <when value="cached"> + <param name="input_variant" type="data" format="vcf" label="Variant file to select" help="-V,--variant &lt;variant&gt;" /> + <param name="ref_file" type="select" label="Using reference genome" help="-R,--reference_sequence &lt;reference_sequence&gt;"> + <options from_data_table="gatk2_picard_indexes"> + <filter type="data_meta" key="dbkey" ref="input_variant" column="dbkey"/> + </options> + <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> + </param> + </when> + <when value="history"> <!-- FIX ME!!!! --> + <param name="input_variant" type="data" format="vcf" label="Variant file to select" help="-V,--variant &lt;variant&gt;" /> + <param name="ref_file" type="data" format="fasta" label="Using reference file" help="-R,--reference_sequence &lt;reference_sequence&gt;" /> + </when> + </conditional> + + <repeat name="select_expressions_repeat" title="Criteria to use when selecting the data" help="-select,--select_expressions &lt;select_expressions&gt;"> + <param name="select_expressions" type="text" label="JEXL expression"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"/> + </sanitizer> + </param> + </repeat> + + <param name="input_concordance" type="data" format="vcf" label="Output variants that were also called in this comparison track" optional="True" help="-conc,--concordance &lt;concordance&gt;"/> + <param name="input_discordance" type="data" format="vcf" label="Output variants that were not called in this comparison track" optional="True" help="-disc,--discordance &lt;discordance&gt;"/> + + <repeat name="sample_name_repeat" title="Include Samples by name" help="-sn,--sample_name &lt;sample_name&gt;"> + <param name="sample_name" type="text" label="Include genotypes from this sample"/> + </repeat> + + <repeat name="exclude_sample_name_repeat" title="Exclude Samples by name" help="-xl_sn,--exclude_sample_name &lt;exclude_sample_name&gt;"> + <param name="exclude_sample_name" type="text" label="Exclude genotypes from this sample"/> + </repeat> + + <param name="exclude_filtered" type="boolean" truevalue="--excludeFiltered" falsevalue="" label="Don't include filtered loci in the analysis" help="-ef,--excludeFiltered" /> + + <expand macro="gatk_param_type_conditional" /> + + <expand macro="analysis_type_conditional"> + + <param name="exclude_sample_file" type="data" format="txt" multiple="True" label="Exclude Samples by file" help="File containing a list of samples (one per line) to exclude (-xl_sf,--exclude_sample_file &lt;exclude_sample_file&gt;)"/> + + <param name="sample_file" type="data" format="txt" multiple="True" label="Samples by file" help="File containing a list of samples (one per line) to include (-sf,--sample_file &lt;sample_file&gt;)"/> + + <param name="input_keep_ids" type="data" format="txt" label="Only emit sites whose ID is found in this file" optional="True" help="-IDs,--keepIDs &lt;keepIDs&gt;"/> + + <param name="keep_original_AC" type="boolean" truevalue="--keepOriginalAC" falsevalue="" label="Don't update the AC, AF, or AN values in the INFO field after selecting" help="-keepOriginalAC,--keepOriginalAC" /> + + <param name="mendelian_violation" type="boolean" truevalue="--mendelianViolation" falsevalue="" label="output mendelian violation sites only" help="-mv,--mendelianViolation" /> + + <param name="mendelian_violation_qual_threshold" type="float" label="Minimum genotype QUAL score for each trio member required to accept a site as a mendelian violation" value="0" help="-mvq,--mendelianViolationQualThreshold &lt;mendelianViolationQualThreshold&gt;" /> + + <param name="remove_fraction_genotypes" type="float" label="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall" value="0" min="0" max="1" help="-fractionGenotypes,--remove_fraction_genotypes &lt;remove_fraction_genotypes&gt;" /> + + <param name="restrict_alleles_to" type="select" label="Select only variants of a particular allelicity" help="-restrictAllelesTo,--restrictAllelesTo &lt;restrictAllelesTo&gt;"> + <option value="ALL" selected="True">ALL</option> + <option value="MULTIALLELIC">MULTIALLELIC</option> + <option value="BIALLELIC">BIALLELIC</option> + </param> + + <repeat name="sample_expressions_repeat" title="Regular expression to select many samples from the ROD tracks provided" help="-se,--sample_expressions &lt;sample_expressions&gt;"> + <param name="sample_expressions" type="text" label="Regular expression"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"/> + </sanitizer> + </param> + </repeat> + + <conditional name="select_random_type"> + <param name="select_random_type_selector" type="select" label="Select a random subset of variants"> + <option value="select_all" selected="True">Use all variants</option> + <option value="select_random_fraction">Select random fraction</option> + <option value="select_random_number">Select random number</option> + </param> + <when value="select_all"> + <!-- Do nothing here --> + </when> + <when value="select_random_fraction"> + <param name="select_random_fraction" type="float" value="0" label="Fraction" min="0" max="1" help="-fraction,--select_random_fraction &lt;select_random_fraction&gt;"/> + </when> + <when value="select_random_number"> + <param name="select_random_number" type="integer" value="0" label="Count" help="-number,--select_random_number &lt;select_random_number&gt;" /> + </when> + </conditional> + + <param name="exclude_non_variants" type="boolean" truevalue="--excludeNonVariants" falsevalue="" label="Don't include loci found to be non-variant after the subsetting procedure" help="-env,--excludeNonVariants" /> + + <param name="select_type_to_include" type="select" label="Select only a certain type of variants from the input file" multiple="True" display="checkboxes" help="-selectType,--selectTypeToInclude &lt;selectTypeToInclude&gt;"> + <option value="INDEL">INDEL</option> + <option value="SNP">SNP</option> + <option value="MIXED">MIXED</option> + <option value="MNP">MNP</option> + <option value="SYMBOLIC">SYMBOLIC</option> + <option value="NO_VARIATION">NO_VARIATION</option> + </param> + </expand> + + </inputs> + <outputs> + <data format="vcf" name="output_vcf" label="${tool.name} on ${on_string} (Variant File)" /> + <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" /> + </outputs> + <tests> + <test> + <param name="reference_source_selector" value="history" /> + <param name="ref_file" value="phiX.fasta" ftype="fasta" /> + <param name="input_variant" value="gatk/gatk_variant_annotator/gatk_variant_annotator_out_1.vcf" ftype="vcf" /> + <param name="select_expressions_repeat" value="0" /> + <param name="input_concordance" /> + <param name="input_discordance" /> + <param name="exclude_sample_name_repeat" value="0" /> + <param name="exclude_filtered" /> + <param name="sample_name_repeat" value="0" /> + <param name="gatk_param_type_selector" value="basic" /> + <param name="analysis_param_type_selector" value="basic" /> + <output name="output_vcf" file="gatk/gatk_variant_select/gatk_variant_select_out_1.vcf" lines_diff="4" /> + <output name="output_log" file="gatk/gatk_variant_select/gatk_variant_select_out_1.log.contains" compare="contains" /> + </test> + </tests> + <help> +**What it does** + +Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are documented in the `Using JEXL expressions section <http://gatkforums.broadinstitute.org/discussion/1255/what-are-jexl-expressions-and-how-can-i-use-them-with-the-gatk>`_. One can optionally include concordance or discordance tracks for use in selecting overlapping variants. + +For more information on using the SelectVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_SelectVariants.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: SelectVariants accepts a VCF input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + out VCFWriter stdout File to which variants should be written + variant RodBinding[VariantContext] NA Input VCF file + concordance RodBinding[VariantContext] none Output variants that were also called in this comparison track + discordance RodBinding[VariantContext] none Output variants that were not called in this comparison track + exclude_sample_file Set[File] [] File containing a list of samples (one per line) to exclude. Can be specified multiple times + exclude_sample_name Set[String] [] Exclude genotypes from this sample. Can be specified multiple times + excludeFiltered boolean false Don't include filtered loci in the analysis + excludeNonVariants boolean false Don't include loci found to be non-variant after the subsetting procedure + keepIDs File NA Only emit sites whose ID is found in this file (one ID per line) + keepOriginalAC boolean false Don't update the AC, AF, or AN values in the INFO field after selecting + mendelianViolation Boolean false output mendelian violation sites only + mvq double 0.0 Minimum genotype QUAL score for each trio member required to accept a site as a violation + remove_fraction_genotypes double 0.0 Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall + restrictAllelesTo NumberAlleleRestriction ALL Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC + sample_expressions Set[String] NA Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times + sample_file Set[File] NA File containing a list of samples (one per line) to include. Can be specified multiple times + sample_name Set[String] [] Include genotypes from this sample. Can be specified multiple times + select_expressions ArrayList[String] [] One or more criteria to use when selecting the data + select_random_fraction double 0.0 Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track + select_random_number int 0 Selects a number of variants at random from the variant track + selectTypeToInclude List[Type] [] Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times + +@CITATION_SECTION@ + </help> + <expand macro="citations" /> +</tool>