diff snpSift_extractFields.xml @ 5:09d6806c609e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit 70ff70918368ff0deeb596c2190a770abe9e1c9b
author iuc
date Wed, 18 Apr 2018 07:28:51 -0400
parents 20c7d583fec1
children
line wrap: on
line diff
--- a/snpSift_extractFields.xml	Tue Oct 24 07:28:17 2017 -0400
+++ b/snpSift_extractFields.xml	Wed Apr 18 07:28:51 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0">
+<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.galaxy0">
     <options sanitize="False" />
     <description>from a VCF file into a tabular file</description>
     <macros>
@@ -9,27 +9,27 @@
     <expand macro="version_command" />
     <command><![CDATA[
 @CONDA_SNPSIFT_JAR_PATH@ &&
-cat '$input'
+cat '${input}'
 #if $one_effect_per_line:
-    | "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl"
+    | perl "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl"
 #end if
 | SnpSift -Xmx6G extractFields
 #if $separator:
-    -s '$separator'
+    -s '${separator}'
 #end if
 #if $empty_text:
-    -e '$empty_text'
+    -e '${empty_text}'
 #end if
 -
 #echo ' '.join(['"%s"' % x for x in $extract.split()])
-> '$output'
+> '${output}'
     ]]></command>
     <inputs>
         <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/>
-        <param name="extract" type="text" label="Extract" help="Need help? See below a few examples." />
+        <param name="extract" type="text" label="Fields to extract" value="CHROM POS ID REF ALT FILTER" help="Separated by spaces. See help below for an explanation" />
         <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across mutiple lines" />
-        <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" />
-        <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" />
+        <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" argument="-s" />
+        <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" argument="-e"/>
     </inputs>
     <outputs>
         <data name="output" format="tabular" />
@@ -40,156 +40,171 @@
             <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
             <output name="output">
                 <assert_contents>
-                <has_text text="INTRAGENIC" />
-                <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
+                    <has_text text="INTRAGENIC" />
+                    <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
                 </assert_contents>
             </output>
         </test>
-
         <test>
             <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
             <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
             <param name="separator" value=","/>
             <output name="output">
                 <assert_contents>
-		<has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
+                    <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
                 </assert_contents>
             </output>
         </test>
+        <test>
+            <param name="input" ftype="vcf" value="extFields_test3_in.vcf"/>
+            <param name="extract" value="CHROM POS ID REF ALT FILTER ANN[*].EFFECT"/>
+            <param name="one_effect_per_line" value="true"/>
+            <output name="output" value="extFields_test3_out.vcf"/>
+        </test>
     </tests>
     <help><![CDATA[
-**SnpSift Extract Fields**
+**What is does**
+
+`SnpSift Extract Fields <http://snpeff.sourceforge.net/SnpSift.html#Extract>`_ selects columns from a VCF dataset into a Tab-delimited format.
+
+------
 
-Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc.
+.. class:: infomark
+
+**How to know which fields to extract?**
+
+A VCF dataset contains mandatory fields as well as optional fields. Mandatory fields are required by `VCF specifications <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`_ and present in any valid VCF dataset. The **Fields to extract** input box of the tool above is already pre-filled with names of mandatory fields.
 
-http://snpeff.sourceforge.net/SnpSift.html#Extract
+To know what other fields are available in a given VCF file simply look at its header. `INFO` and `FORMAT` lines will contain description of existing fields. For example, if you see a line:
+
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
+
+you can use *NS* as the field name.
 
-You can also use sub-fields and genotype fields / sub-fields such as::
+------
+
+**Dealing with field generated with SnpEff**
+
+The current version of `SnpEff <http://snpeff.sourceforge.net/SnpEff_manual.html>`_ produces so called *ANN* fields::
 
-    Standard VCF fields:
-        CHROM
-        POS
-        ID
-        REF
-        ALT
-        FILTER
-    INFO fields:
-        AF
-        AC
-        DP
-        MQ
-        etc. (any info field available)
-    SnpEff 'ANN' fields:
-        "ANN[*].ALLELE" (alias GENOTYPE)
-        "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
-        "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
-        "ANN[*].GENE" Gene name (e.g. 'PSD3')
-        "ANN[*].GENEID" Gene ID
-        "ANN[*].FEATURE"
-        "ANN[*].FEATUREID" (alias TRID: Transcript ID)
-        "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
-        "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
-        "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
-        "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
-        "ANN[*].CDNA_POS" (alias POS_CDNA)
-        "ANN[*].CDNA_LEN" (alias LEN_CDNA)
-        "ANN[*].CDS_POS" (alias POS_CDS)
-        "ANN[*].CDS_LEN" (alias LEN_CDS)
-        "ANN[*].AA_POS" (alias POS_AA)
-        "ANN[*].AA_LEN" (alias LEN_AA)
-        "ANN[*].DISTANCE"
-        "ANN[*].ERRORS" (alias WARNING, INFOS)
-    SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions, new version use 'ANN' field):
-        "EFF[*].EFFECT"
-        "EFF[*].IMPACT"
-        "EFF[*].FUNCLASS"
-        "EFF[*].CODON"
-        "EFF[*].AA"
-        "EFF[*].AA_LEN"
-        "EFF[*].GENE"
-        "EFF[*].BIOTYPE"
-        "EFF[*].CODING"
-        "EFF[*].TRID"
-        "EFF[*].RANK"
-    SnpEff 'LOF' fields:
-        "LOF[*].GENE"
-        "LOF[*].GENEID"
-        "LOF[*].NUMTR"
-        "LOF[*].PERC"
-    SnpEff' NMD' fields:
-        "NMD[*].GENE"
-        "NMD[*].GENEID"
-        "NMD[*].NUMTR"
-        "NMD[*].PERC"
+    "ANN[*].ALLELE" (alias GENOTYPE)
+    "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
+    "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
+    "ANN[*].GENE" Gene name (e.g. 'PSD3')
+    "ANN[*].GENEID" Gene ID
+    "ANN[*].FEATURE"
+    "ANN[*].FEATUREID" (alias TRID: Transcript ID)
+    "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
+    "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
+    "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
+    "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
+    "ANN[*].CDNA_POS" (alias POS_CDNA)
+    "ANN[*].CDNA_LEN" (alias LEN_CDNA)
+    "ANN[*].CDS_POS" (alias POS_CDS)
+    "ANN[*].CDS_LEN" (alias LEN_CDS)
+    "ANN[*].AA_POS" (alias POS_AA)
+    "ANN[*].AA_LEN" (alias LEN_AA)
+    "ANN[*].DISTANCE"
+    "ANN[*].ERRORS" (alias WARNING, INFOS)
+
+Older versions produced *EFF* fields::
 
-Some examples:
+    "EFF[*].EFFECT"
+    "EFF[*].IMPACT"
+    "EFF[*].FUNCLASS"
+    "EFF[*].CODON"
+    "EFF[*].AA"
+    "EFF[*].AA_LEN"
+    "EFF[*].GENE"
+    "EFF[*].BIOTYPE"
+    "EFF[*].CODING"
+    "EFF[*].TRID"
+    "EFF[*].RANK"
+
+In addition there are *LOF* and *NMD* fields::
+
+    "LOF[*].GENE"
+    "LOF[*].GENEID"
+    "LOF[*].NUMTR"
+    "LOF[*].PERC"
+
+    "NMD[*].GENE"
+    "NMD[*].GENEID"
+    "NMD[*].NUMTR"
+    "NMD[*].PERC"
 
-- *Extracting chromosome, position, ID and allele frequency from a VCF file*:
+To find our whether your VCF contains *ANN* or *EFF* annotations simply look at its header.
+
+-----
 
-  **CHROM POS ID AF**
+**Usage examples**
 
-  The result will look something like::
+*Extracting chromosome, position, ID and allele frequency from a VCF file*:
 
-      #CHROM        POS        ID            AF
-      1             69134                    0.086
-      1             69496      rs150690004   0.001
+**CHROM POS ID AF**
+
+The result will look something like::
 
-- *Extracting genotype fields*:
+    #CHROM        POS        ID            AF
+    1             69134                    0.086
+    1             69496      rs150690004   0.001
 
-  **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
-
-  This means to extract:
+*Extracting genotype fields*:
 
-  - CHROM POS ID: regular fields (as in the previous example)
-  - THETA : This one is from INFO
-  - GEN[0].GL[1] : Second likelihood from first genotype
-  - GEN[1].GL : The whole GL fiels (all entries without separating them)
-  - GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one).
-  - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
+**CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
+
+This means to extract:
 
-  The result will look something like::
+- CHROM POS ID: regular fields (as in the previous example)
+- THETA : This one is from INFO
+- GEN[0].GL[1] : Second likelihood from first genotype
+- GEN[1].GL : The whole GL field (all entries without separating them)
+- GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one).
+- GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
 
-      #CHROM  POS     ID              THETA   GEN[0].GL[1]    GEN[1].GL               GEN[3].GL[*]            GEN[*].GT
-      1       10583   rs58108140      0.0046  -0.47           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|0     0|0     0|1     0|0     0|1     0|0     0|0     0|1
-      1       10611   rs189107123     0.0077  -0.48           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     0|0     0|0     0|0     0|0
-      1       13302   rs180734498     0.0048  -0.58           -2.45,-0.00,-5.00       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     1|0     0|0     0|1     0|0
+The result will look something like::
 
-- *Extracting fields with multiple values*:
-  (notice that there are multiple effect columns per line because there are mutiple effects per variant)
+    #CHROM  POS     ID              THETA   GEN[0].GL[1]    GEN[1].GL               GEN[3].GL[*]            GEN[*].GT
+    1       10583   rs58108140      0.0046  -0.47           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|0     0|0     0|1     0|0     0|1     0|0     0|0     0|1
+    1       10611   rs189107123     0.0077  -0.48           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     0|0     0|0     0|0     0|0
+    1       13302   rs180734498     0.0048  -0.58           -2.45,-0.00,-5.00       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     1|0     0|0     0|1     0|0
 
-  **CHROM POS REF ALT ANN[*].EFFECT**
+*Extracting fields with multiple values*:
+    (notice that there are multiple effect columns per line because there are multiple effects per variant)
 
-  The result will look something like::
+**CHROM POS REF ALT ANN[*].EFFECT**
+
+The result will look something like::
 
-      #CHROM	POS	REF	ALT	ANN[*].EFFECT
-      22	17071756	T	C	3_prime_UTR_variant	downstream_gene_variant
-      22	17072035	C	T	missense_variant	downstream_gene_variant
-      22	17072258	C	A	missense_variant	downstream_gene_variant
+    #CHROM  POS REF ALT ANN[*].EFFECT
+    22  17071756    T   C   3_prime_UTR_variant downstream_gene_variant
+    22  17072035    C   T   missense_variant    downstream_gene_variant
+    22  17072258    C   A   missense_variant    downstream_gene_variant
 
-- *Extracting fields with multiple values using a comma as a multipe field separator:*
+*Extracting fields with multiple values using a comma as a multiple field separator:*
 
-  **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
+**CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
 
-  The result will look something like::
+The result will look something like::
 
-      #CHROM	POS	REF	ALT	ANN[*].EFFECT	ANN[*].HGVS_P
-      22	17071756	T	C	3_prime_UTR_variant,downstream_gene_variant	.,.
-      22	17072035	C	T	missense_variant,downstream_gene_variant	p.Gly469Glu,.
-      22	17072258	C	A	missense_variant,downstream_gene_variant	p.Gly395Cys,.
+    #CHROM  POS REF ALT ANN[*].EFFECT   ANN[*].HGVS_P
+    22  17071756    T   C   3_prime_UTR_variant,downstream_gene_variant .,.
+    22  17072035    C   T   missense_variant,downstream_gene_variant    p.Gly469Glu,.
+    22  17072258    C   A   missense_variant,downstream_gene_variant    p.Gly395Cys,.
 
-- *Extracting fields with multiple values, one effect per line:*
+*Extracting fields with multiple values, one effect per line:*
 
-  **CHROM POS REF ALT ANN[*].EFFECT**
+**CHROM POS REF ALT ANN[*].EFFECT**
 
-  The result will look something like::
+The result will look something like::
 
-      #CHROM	POS	REF	ALT	ANN[*].EFFECT
-      22	17071756	T	C	3_prime_UTR_variant
-      22	17071756	T	C	downstream_gene_variant
-      22	17072035	C	T	missense_variant
-      22	17072035	C	T	downstream_gene_variant
-      22	17072258	C	A	missense_variant
-      22	17072258	C	A	downstream_gene_variant
+    #CHROM  POS REF ALT ANN[*].EFFECT
+    22  17071756    T   C   3_prime_UTR_variant
+    22  17071756    T   C   downstream_gene_variant
+    22  17072035    C   T   missense_variant
+    22  17072035    C   T   downstream_gene_variant
+    22  17072258    C   A   missense_variant
+    22  17072258    C   A   downstream_gene_variant
 
 @EXTERNAL_DOCUMENTATION@
 - http://snpeff.sourceforge.net/SnpSift.html#Extract