diff variant_combine.xml @ 0:1a6e16391727 draft default tip

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 10:50:25 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_combine.xml	Tue Apr 01 10:50:25 2014 -0400
@@ -0,0 +1,171 @@
+<tool id="gatk_variant_combine" name="Combine Variants" version="0.0.4">
+  <description></description>
+  <requirements>
+      <requirement type="package" version="1.4">gatk</requirement>
+  </requirements>
+  <macros>
+    <import>gatk_macros.xml</import>
+  </macros>
+  <command interpreter="python">gatk_wrapper.py
+   --max_jvm_heap_fraction "1"
+   --stdout "${output_log}"
+   
+   #set $priority_order = []
+   #for $input_variant in $reference_source.input_variants:
+       -d "--variant:${input_variant.input_variant_name},%(file_type)s" "${input_variant.input_variant}" "${input_variant.input_variant.ext}" "input_variant_${input_variant.input_variant_name}"
+       #set $input_variant_name = str( $input_variant.input_variant_name )
+       #assert $input_variant_name not in $priority_order, "Variant Names must be unique" ##this should be handled by a validator
+       #silent $priority_order.append( $input_variant_name )
+   #end for
+   -p 'java 
+    -jar "\$JAVA_JAR_PATH/GenomeAnalysisTK.jar"
+    -T "CombineVariants"
+    --out "${output_variants}"
+    ##--num_threads 4 ##hard coded, for now
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+   --genotypemergeoption "${genotype_merge_option}"
+   --rod_priority_list "${ ','.join( $priority_order ) }"
+   '
+   
+    #include source=$standard_gatk_options#
+    
+    
+    ##start analysis specific options
+    #if $analysis_param_type.analysis_param_type_selector == "advanced":
+        -p '
+        --filteredrecordsmergetype "${analysis_param_type.filtered_records_merge_type}"
+        ${analysis_param_type.print_complex_merges}
+        ${analysis_param_type.filtered_are_uncalled}
+        ${analysis_param_type.minimal_vcf}
+        ${analysis_param_type.assume_identical_samples}
+        
+        #if str( $analysis_param_type.set_key ):
+            --setKey "${analysis_param_type.set_key}"
+        #end if
+        
+        --minimumN "${analysis_param_type.minimum_n}"
+        '
+    #end if
+  </command>
+  <inputs>
+    
+    <conditional name="reference_source">
+      <expand macro="reference_source_selector_param" />
+      <when value="cached">
+        <repeat min="1" name="input_variants" title="Variants to Merge" help="Records will be prioritized in the order that you list them here (-V,--variant &amp;lt;variant&amp;gt;)">
+          <param name="input_variant" type="data" format="vcf" label="Input variant file" />
+          <param name="input_variant_name" type="text" value="" label="Variant name" help="Names must be unique">
+            <validator type="length" min="1" message="You must provide a unique name for this set of variants" />
+          </param>
+        </repeat>
+        <param name="ref_file" type="select" label="Using reference genome" help="-R,--reference_sequence &amp;lt;reference_sequence&amp;gt;">
+          <options from_data_table="gatk_picard_indexes">
+            <!-- <filter type="data_meta" key="dbkey" ref="input_variants.input_variant" column="dbkey"/> -->
+          </options>
+          <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+        </param>
+      </when>
+      <when value="history"> <!-- FIX ME!!!! -->
+        <repeat min="1" name="input_variants" title="Variants to Merge" help="Records will be prioritized in the order that you list them here (-V,--variant &amp;lt;variant&amp;gt;)">
+          <param name="input_variant" type="data" format="vcf" label="Input variant file" />
+          <param name="input_variant_name" type="text" value="" label="Variant name" help="Names must be unique">
+            <validator type="length" min="1" message="You must provide a unique name for this set of variants" />
+          </param>
+        </repeat>
+        <param name="ref_file" type="data" format="fasta" label="Using reference file" help="-R,--reference_sequence &amp;lt;reference_sequence&amp;gt;" />
+      </when>
+    </conditional>
+    
+    <param name="genotype_merge_option" type="select" label="How should we merge genotype records across records for samples shared across the ROD files" help="-genotypeMergeOptions,--genotypemergeoption &amp;lt;genotypemergeoption&amp;gt;" >
+      <option value="UNIQUIFY" />
+      <option value="PRIORITIZE" selected="true"/>
+      <option value="UNSORTED" />
+      <option value="REQUIRE_UNIQUE" />
+    </param>
+    
+    <expand macro="gatk_param_type_conditional" />
+    
+    
+    <expand macro="analysis_type_conditional">
+        <param name="filtered_records_merge_type" type="select" label="How should we deal with records seen at the same site in the VCF, but with different FILTER fields?" help="-filteredRecordsMergeType,--filteredrecordsmergetype &amp;lt;filteredrecordsmergetype&amp;gt;" >
+          <option value="KEEP_IF_ANY_UNFILTERED" selected="true"/>
+          <option value="KEEP_IF_ALL_UNFILTERED" />
+        </param>
+        
+        <param name="print_complex_merges" checked="false" type="boolean" truevalue="--printComplexMerges" falsevalue="" label="Print out interesting sites requiring complex compatibility merging" help="-printComplexMerges,--printComplexMerges" />
+        <param name="filtered_are_uncalled" checked="false" type="boolean" truevalue="--filteredAreUncalled" falsevalue="" label="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF" help="-filteredAreUncalled,--filteredAreUncalled" />
+        <param name="minimal_vcf" checked="false" type="boolean" truevalue="--minimalVCF" falsevalue="" label="If true, then the output VCF will contain no INFO or genotype INFO field" help="-minimalVCF,--minimalVCF" />
+        
+        <param name="set_key" type="text" value="" label="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from." help="-setKey,--setKey &amp;lt;setKey&amp;gt;"/>
+        <param name="assume_identical_samples" checked="false" type="boolean" truevalue="--assumeIdenticalSamples" falsevalue="" label="If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime." help="-assumeIdenticalSamples,--assumeIdenticalSamples" />
+        <param name="minimum_n" type="integer" value="1" label="Combine variants and output site only if variant is present in at least N input files." help="-minN,--minimumN &amp;lt;minimumN&amp;gt;"/>
+        
+    </expand>
+    
+    
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output_variants" label="${tool.name} on ${on_string} (variants)" />
+    <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
+  </outputs>
+  <tests>
+      <test>
+          <param name="reference_source_selector" value="history" />
+          <param name="ref_file" value="phiX.fasta" ftype="fasta" />
+          <param name="input_variant" value="gatk/gatk_variant_annotator/gatk_variant_annotator_out_1.vcf" ftype="vcf" />
+          <param name="input_variant_name" value="from_variant_annotator" />
+          <param name="genotype_merge_option" value="PRIORITIZE" />
+          <param name="gatk_param_type_selector" value="basic" />
+          <param name="analysis_param_type_selector" value="basic" />
+          <output name="output_variants" file="gatk/gatk_variant_combine/gatk_variant_combine_out_1.vcf" lines_diff="4" />
+          <output name="output_log" file="gatk/gatk_variant_combine/gatk_variant_combine_out_1.log.contains" compare="contains" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+Combines VCF records from different sources; supports both full merges and set unions. Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. Union: assumes each rod represents the same set of samples (although this is not enforced); using the priority list (if provided), emits a single record instance at every position represented in the rods.
+
+For more information on using the CombineVariants module, see this `tool specific page &lt;http://www.broadinstitute.org/gsa/wiki/index.php/CombineVariants&gt;`_.
+
+To learn about best practices for variant detection using GATK, see this `overview &lt;http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3&gt;`_.
+
+If you encounter errors, please view the `GATK FAQ &lt;http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions&gt;`_.
+
+------
+
+**Inputs**
+
+GenomeAnalysisTK: CombineVariants accepts variant files as input.
+
+------
+
+**Outputs**
+
+The output is a combined vcf file.
+
+
+Go `here &lt;http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK&gt;`_ for details on GATK file formats.
+
+-------
+
+**Settings**::
+
+ out                         File to which variants should be written
+ genotypemergeoption         How should we merge genotype records for samples shared across the ROD files? (UNIQUIFY|PRIORITIZE|UNSORTED|REQUIRE_UNIQUE)
+ filteredrecordsmergetype    How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered (KEEP_IF_ANY_UNFILTERED|KEEP_IF_ALL_UNFILTERED)
+ rod_priority_list           When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided
+ printComplexMerges          Print out interesting sites requiring complex compatibility merging
+ filteredAreUncalled         If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF
+ minimalVCF                  If true, then the output VCF will contain no INFO or genotype INFO field
+ setKey                      Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from.  Set to null if you don't want the set field emitted.
+ assumeIdenticalSamples      If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime.
+ minimumN                    Combine variants and output site only if variant is present in at least N input files.
+
+@CITATION_SECTION@
+  </help>
+</tool>