diff variant_select.xml @ 0:340633249b3d draft

Uploaded
author bgruening
date Mon, 02 Dec 2013 06:18:36 -0500
parents
children f244b8209eb8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_select.xml	Mon Dec 02 06:18:36 2013 -0500
@@ -0,0 +1,288 @@
+<tool id="gatk2_variant_select" name="Select Variants" version="0.0.7">
+  <description>from VCF files</description>
+  <expand macro="requirements" />
+  <macros>
+    <import>gatk2_macros.xml</import>
+  </macros>
+  <command interpreter="python">
+    #from binascii import hexlify
+
+    gatk2_wrapper.py
+   --stdout "${output_log}"
+   -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant"
+   -p '
+   @JAR_PATH@
+    -T "SelectVariants"
+    \$GATK2_SITE_OPTIONS
+
+    @THREADS@
+    -o "${output_vcf}"
+    
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+    '
+    -p '
+    #if $input_concordance:
+        --concordance "${input_concordance}"
+    #end if
+    #if $input_discordance:
+        --discordance "${input_discordance}"
+    #end if
+    
+    #for $exclude_sample_name in $exclude_sample_name_repeat:
+        --exclude_sample_name "${exclude_sample_name.exclude_sample_name}"
+    #end for
+    
+    ${exclude_filtered}
+    
+    #for $sample_name in $sample_name_repeat:
+        --sample_name "${sample_name.sample_name}"
+    #end for
+    '
+    
+    #for $select_expressions in $select_expressions_repeat:
+        #set $select_expression = "--select_expressions '%s'" % ( str( $select_expressions.select_expressions ) )
+        -o '${ hexlify( $select_expression ) }'
+    #end for
+    
+    ##start tool specific options
+    #if str( $analysis_param_type.analysis_param_type_selector ) == 'advanced':
+        -p '
+          #for $exclude_sample_file in $analysis_param_type.exclude_sample_file_repeat:
+              --exclude_sample_file "${exclude_sample_file.exclude_sample_file}"
+          #end for
+          
+          #for $sample_file in $analysis_param_type.sample_file_repeat:
+              --sample_file "${ample_file.sample_file}"
+          #end for
+          
+          #if $analysis_param_type.input_keep_ids:
+              --keepIDs "${analysis_param_type.input_keep_ids}"
+          #end if
+          
+          ${analysis_param_type.keep_original_AC}
+          
+          ${analysis_param_type.mendelian_violation}
+          
+          --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}"
+          
+          --remove_fraction_genotypes "${analysis_param_type.remove_fraction_genotypes}"
+          
+          --restrictAllelesTo "${analysis_param_type.restrict_alleles_to}"
+          
+          #if str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_fraction':
+              --select_random_fraction "${analysis_param_type.select_random_type.select_random_fraction}"
+          #elif str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_number':
+              --select_random_number "${analysis_param_type.select_random_type.select_random_number}"
+          #end if
+          
+          #if $analysis_param_type.select_type_to_include:
+              #for $type_to_include in str( $analysis_param_type.select_type_to_include ).split( ',' ):
+                  --selectTypeToInclude "${type_to_include}"
+              #end for
+          #end if
+          
+          ${analysis_param_type.exclude_non_variants}
+        '
+        
+        #for $sample_expressions in $analysis_param_type.sample_expressions_repeat:
+            #set $sample_expression = "--sample_expressions '%s'" % ( str( $sample_expressions.sample_expressions ) )
+            -o '${ hexlify( $sample_expression ) }'
+        #end for
+        
+    #end if
+    ##end tool specific options
+    
+    #include source=$standard_gatk_options#
+    
+    
+  </command>
+  <inputs>
+    <conditional name="reference_source">
+      <expand macro="reference_source_selector_param" />
+      <when value="cached">
+        <param name="input_variant" type="data" format="vcf" label="Variant file to select" help="-V,--variant &amp;lt;variant&amp;gt;" />
+        <param name="ref_file" type="select" label="Using reference genome" help="-R,--reference_sequence &amp;lt;reference_sequence&amp;gt;">
+          <options from_data_table="gatk2_picard_indexes">
+            <filter type="data_meta" key="dbkey" ref="input_variant" column="dbkey"/>
+          </options>
+          <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+        </param>
+      </when>
+      <when value="history"> <!-- FIX ME!!!! -->
+        <param name="input_variant" type="data" format="vcf" label="Variant file to select" help="-V,--variant &amp;lt;variant&amp;gt;" />
+        <param name="ref_file" type="data" format="fasta" label="Using reference file" help="-R,--reference_sequence &amp;lt;reference_sequence&amp;gt;" />
+      </when>
+    </conditional>
+    
+    <repeat name="select_expressions_repeat" title="Criteria to use when selecting the data" help="-select,--select_expressions &amp;lt;select_expressions&amp;gt;">
+        <param name="select_expressions" type="text" label="JEXL expression">
+            <sanitizer>
+              <valid initial="string.printable">
+               <remove value="&apos;"/>
+             </valid>
+              <mapping initial="none"/>
+            </sanitizer>
+        </param>
+    </repeat>
+    
+    <param name="input_concordance" type="data" format="vcf" label="Output variants that were also called in this comparison track" optional="True" help="-conc,--concordance &amp;lt;concordance&amp;gt;"/>
+    <param name="input_discordance" type="data" format="vcf" label="Output variants that were not called in this comparison track" optional="True" help="-disc,--discordance &amp;lt;discordance&amp;gt;"/>
+    
+    <repeat name="sample_name_repeat" title="Include Samples by name" help="-sn,--sample_name &amp;lt;sample_name&amp;gt;">
+        <param name="sample_name" type="text" label="Include genotypes from this sample"/>
+    </repeat>
+    
+    <repeat name="exclude_sample_name_repeat" title="Exclude Samples by name" help="-xl_sn,--exclude_sample_name &amp;lt;exclude_sample_name&amp;gt;">
+        <param name="exclude_sample_name" type="text" label="Exclude genotypes from this sample"/>
+    </repeat>
+    
+    <param name="exclude_filtered" type="boolean" truevalue="--excludeFiltered" falsevalue="" label="Don't include filtered loci in the analysis" help="-ef,--excludeFiltered" />
+    
+    <expand macro="gatk_param_type_conditional" />
+    
+    
+    <expand macro="analysis_type_conditional">
+        
+        <repeat name="exclude_sample_file_repeat" title="Exclude Samples by file" help="-xl_sf,--exclude_sample_file &amp;lt;exclude_sample_file&amp;gt;">
+            <param name="exclude_sample_file" type="data" format="txt" label="File containing a list of samples (one per line) to exclude"/>
+        </repeat>
+        
+        <repeat name="sample_file_repeat" title="Samples by file" help="-sf,--sample_file &amp;lt;sample_file&amp;gt;">
+            <param name="sample_file" type="data" format="txt" label="File containing a list of samples (one per line) to include" />
+        </repeat>
+        
+        <param name="input_keep_ids" type="data" format="text" label="Only emit sites whose ID is found in this file" optional="True" help="-IDs,--keepIDs &amp;lt;keepIDs&amp;gt;"/>
+        
+        <param name="keep_original_AC" type="boolean" truevalue="--keepOriginalAC" falsevalue="" label="Don't update the AC, AF, or AN values in the INFO field after selecting" help="-keepOriginalAC,--keepOriginalAC" />
+        
+        <param name="mendelian_violation" type="boolean" truevalue="--mendelianViolation" falsevalue="" label="output mendelian violation sites only" help="-mv,--mendelianViolation" />
+        
+        <param name="mendelian_violation_qual_threshold" type="float" label="Minimum genotype QUAL score for each trio member required to accept a site as a mendelian violation" value="0" help="-mvq,--mendelianViolationQualThreshold &amp;lt;mendelianViolationQualThreshold&amp;gt;" />
+        
+        <param name="remove_fraction_genotypes" type="float" label="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall" value="0" min="0" max="1" help="-fractionGenotypes,--remove_fraction_genotypes &amp;lt;remove_fraction_genotypes&amp;gt;" />
+        
+        <param name="restrict_alleles_to" type="select" label="Select only variants of a particular allelicity" help="-restrictAllelesTo,--restrictAllelesTo &amp;lt;restrictAllelesTo&amp;gt;">
+            <option value="ALL" selected="True">ALL</option>
+            <option value="MULTIALLELIC">MULTIALLELIC</option>
+            <option value="BIALLELIC">BIALLELIC</option>
+        </param>
+        
+        <repeat name="sample_expressions_repeat" title="Regular expression to select many samples from the ROD tracks provided" help="-se,--sample_expressions &amp;lt;sample_expressions&amp;gt;">
+            <param name="sample_expressions" type="text" label="Regular expression">
+                <sanitizer>
+                  <valid initial="string.printable">
+                   <remove value="&apos;"/>
+                 </valid>
+                  <mapping initial="none"/>
+                </sanitizer>
+            </param>
+        </repeat>
+        
+        <conditional name="select_random_type">
+          <param name="select_random_type_selector" type="select" label="Select a random subset of variants">
+            <option value="select_all" selected="True">Use all variants</option>
+            <option value="select_random_fraction">Select random fraction</option>
+            <option value="select_random_number">Select random number</option>
+          </param>
+          <when value="select_all">
+            <!-- Do nothing here -->
+          </when>
+          <when value="select_random_fraction">
+            <param name="select_random_fraction" type="float" value="0" label="Fraction" min="0" max="1" help="-fraction,--select_random_fraction &amp;lt;select_random_fraction&amp;gt;"/>
+          </when>
+          <when value="select_random_number">
+            <param name="select_random_number" type="integer" value="0" label="Count" help="-number,--select_random_number &amp;lt;select_random_number&amp;gt;" />
+          </when>
+        </conditional>
+        
+        <param name="exclude_non_variants" type="boolean" truevalue="--excludeNonVariants" falsevalue="" label="Don't include loci found to be non-variant after the subsetting procedure" help="-env,--excludeNonVariants" />
+        
+        <param name="select_type_to_include" type="select" label="Select only a certain type of variants from the input file" multiple="True" display="checkboxes" help="-selectType,--selectTypeToInclude &amp;lt;selectTypeToInclude&amp;gt;">
+            <option value="INDEL">INDEL</option>
+            <option value="SNP">SNP</option>
+            <option value="MIXED">MIXED</option>
+            <option value="MNP">MNP</option>
+            <option value="SYMBOLIC">SYMBOLIC</option>
+            <option value="NO_VARIATION">NO_VARIATION</option>
+        </param>
+    </expand>
+    
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output_vcf" label="${tool.name} on ${on_string} (Variant File)" />
+    <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
+  </outputs>
+  <tests>
+      <test>
+          <param name="reference_source_selector" value="history" />
+          <param name="ref_file" value="phiX.fasta" ftype="fasta" />
+          <param name="input_variant" value="gatk/gatk_variant_annotator/gatk_variant_annotator_out_1.vcf" ftype="vcf" />
+          <param name="select_expressions_repeat" value="0" />
+          <param name="input_concordance" />
+          <param name="input_discordance" />
+          <param name="exclude_sample_name_repeat" value="0" />
+          <param name="exclude_filtered" />
+          <param name="sample_name_repeat" value="0" />
+          <param name="gatk_param_type_selector" value="basic" />
+          <param name="analysis_param_type_selector" value="basic" />
+          <output name="output_vcf" file="gatk/gatk_variant_select/gatk_variant_select_out_1.vcf" lines_diff="4" /> 
+          <output name="output_log" file="gatk/gatk_variant_select/gatk_variant_select_out_1.log.contains" compare="contains" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP &gt; 1000" (depth of coverage greater than 1000x), "AF &lt; 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are documented in the `Using JEXL expressions section &lt;http://gatkforums.broadinstitute.org/discussion/1255/what-are-jexl-expressions-and-how-can-i-use-them-with-the-gatk&gt;`_. One can optionally include concordance or discordance tracks for use in selecting overlapping variants. 
+
+For more information on using the SelectVariants module, see this `tool specific page &lt;http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_SelectVariants.html&gt;`_.
+
+To learn about best practices for variant detection using GATK, see this `overview &lt;http://www.broadinstitute.org/gatk/guide/topic?name=best-practices&gt;`_.
+
+If you encounter errors, please view the `GATK FAQ &lt;http://www.broadinstitute.org/gatk/guide/topic?name=faqs&gt;`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: SelectVariants accepts a VCF input file.
+
+
+**Outputs**
+
+The output is in VCF format.
+
+
+Go `here &lt;http://www.broadinstitute.org/gatk/guide/topic?name=intro&gt;`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+
+ out                         VCFWriter  stdout  File to which variants should be written
+ variant                     RodBinding[VariantContext]  NA  Input VCF file
+ concordance                 RodBinding[VariantContext]  none  Output variants that were also called in this comparison track
+ discordance                 RodBinding[VariantContext]  none  Output variants that were not called in this comparison track
+ exclude_sample_file         Set[File]  []  File containing a list of samples (one per line) to exclude. Can be specified multiple times
+ exclude_sample_name         Set[String]  []  Exclude genotypes from this sample. Can be specified multiple times
+ excludeFiltered             boolean  false  Don't include filtered loci in the analysis
+ excludeNonVariants          boolean  false  Don't include loci found to be non-variant after the subsetting procedure
+ keepIDs                     File  NA  Only emit sites whose ID is found in this file (one ID per line)
+ keepOriginalAC              boolean  false  Don't update the AC, AF, or AN values in the INFO field after selecting
+ mendelianViolation          Boolean  false  output mendelian violation sites only
+ mvq                         double  0.0  Minimum genotype QUAL score for each trio member required to accept a site as a violation
+ remove_fraction_genotypes   double  0.0  Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall
+ restrictAllelesTo           NumberAlleleRestriction  ALL  Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC
+ sample_expressions          Set[String]  NA  Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times
+ sample_file                 Set[File]  NA  File containing a list of samples (one per line) to include. Can be specified multiple times
+ sample_name                 Set[String]  []  Include genotypes from this sample. Can be specified multiple times
+ select_expressions          ArrayList[String]  []  One or more criteria to use when selecting the data
+ select_random_fraction      double  0.0  Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track
+ select_random_number        int  0  Selects a number of variants at random from the variant track
+ selectTypeToInclude         List[Type]  []  Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times
+
+@CITATION_SECTION@
+  </help>
+</tool>