snpsift: snpSift_extractFields.xml comparison

comparison snpSift_extractFields.xml @ 5:09d6806c609e draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit 70ff70918368ff0deeb596c2190a770abe9e1c9b

author	iuc
date	Wed, 18 Apr 2018 07:28:51 -0400
parents	20c7d583fec1
children

comparison

equal deleted inserted replaced

-:b04635ebfab0
+:09d6806c609e
-<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0">
+<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.galaxy0">
 <options sanitize="False" />
 <description>from a VCF file into a tabular file</description>
 <macros>
 <import>snpSift_macros.xml</import>
 </macros>
 <expand macro="requirements" />
 <expand macro="stdio" />
 <expand macro="version_command" />
 <command><![CDATA[
 @CONDA_SNPSIFT_JAR_PATH@ &&
-cat '$input'
+cat '${input}'
 #if $one_effect_per_line:
-| "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl"
+| perl "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl"
 #end if
 | SnpSift -Xmx6G extractFields
 #if $separator:
--s '$separator'
+-s '${separator}'
 #end if
 #if $empty_text:
--e '$empty_text'
+-e '${empty_text}'
 #end if
 -
 #echo ' '.join(['"%s"' % x for x in $extract.split()])
-> '$output'
+> '${output}'
 ]]></command>
 <inputs>
 <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/>
-<param name="extract" type="text" label="Extract" help="Need help? See below a few examples." />
+<param name="extract" type="text" label="Fields to extract" value="CHROM POS ID REF ALT FILTER" help="Separated by spaces. See help below for an explanation" />
 <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across mutiple lines" />
-<param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" />
+<param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" argument="-s" />
-<param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" />
+<param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" argument="-e"/>
 </inputs>
 <outputs>
 <data name="output" format="tabular" />
 </outputs>
 <tests>
 <test>
 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
 <output name="output">
 <assert_contents>
 <has_text text="INTRAGENIC" />
 <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
 </assert_contents>
 </output>
 </test>
 <test>
 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
 <param name="separator" value=","/>
 <output name="output">
 <assert_contents>
-		<has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
+<has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
 </assert_contents>
 </output>
 </test>
+<test>
+<param name="input" ftype="vcf" value="extFields_test3_in.vcf"/>
+<param name="extract" value="CHROM POS ID REF ALT FILTER ANN[*].EFFECT"/>
+<param name="one_effect_per_line" value="true"/>
+<output name="output" value="extFields_test3_out.vcf"/>
+</test>
 </tests>
 <help><![CDATA[
-**SnpSift Extract Fields**
+**What it does**
-Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc.
+`SnpSift Extract Fields <http://snpeff.sourceforge.net/SnpSift.html#Extract>`_ selects columns from a VCF dataset into a Tab-delimited format.
-http://snpeff.sourceforge.net/SnpSift.html#Extract
+------
-You can also use sub-fields and genotype fields / sub-fields such as::
+.. class:: infomark
-Standard VCF fields:
+**How to know which fields to extract?**
-CHROM
-POS
+A VCF dataset contains mandatory fields as well as optional fields. Mandatory fields are required by `VCF specifications <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`_ and present in any valid VCF dataset. The **Fields to extract** input box of the tool above is already pre-filled with names of mandatory fields.
-ID
-REF
+To know what other fields are available in a given VCF file, simply look at its header. `INFO` and `FORMAT` lines will contain descriptions of existing fields. For example, if you see a line:
-ALT
-FILTER
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
-INFO fields:
-AF
+you can use *NS* as the field name.
-AC
-DP
+------
-MQ
-etc. (any info field available)
+**Dealing with fields generated by SnpEff**
-SnpEff 'ANN' fields:
-"ANN[*].ALLELE" (alias GENOTYPE)
+The current version of `SnpEff <http://snpeff.sourceforge.net/SnpEff_manual.html>`_ produces so called *ANN* fields::
-"ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
-"ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
+"ANN[*].ALLELE" (alias GENOTYPE)
-"ANN[*].GENE" Gene name (e.g. 'PSD3')
+"ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
-"ANN[*].GENEID" Gene ID
+"ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
-"ANN[*].FEATURE"
+"ANN[*].GENE" Gene name (e.g. 'PSD3')
-"ANN[*].FEATUREID" (alias TRID: Transcript ID)
+"ANN[*].GENEID" Gene ID
-"ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
+"ANN[*].FEATURE"
-"ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
+"ANN[*].FEATUREID" (alias TRID: Transcript ID)
-"ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
+"ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
-"ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
+"ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
-"ANN[*].CDNA_POS" (alias POS_CDNA)
+"ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
-"ANN[*].CDNA_LEN" (alias LEN_CDNA)
+"ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
-"ANN[*].CDS_POS" (alias POS_CDS)
+"ANN[*].CDNA_POS" (alias POS_CDNA)
-"ANN[*].CDS_LEN" (alias LEN_CDS)
+"ANN[*].CDNA_LEN" (alias LEN_CDNA)
-"ANN[*].AA_POS" (alias POS_AA)
+"ANN[*].CDS_POS" (alias POS_CDS)
-"ANN[*].AA_LEN" (alias LEN_AA)
+"ANN[*].CDS_LEN" (alias LEN_CDS)
-"ANN[*].DISTANCE"
+"ANN[*].AA_POS" (alias POS_AA)
-"ANN[*].ERRORS" (alias WARNING, INFOS)
+"ANN[*].AA_LEN" (alias LEN_AA)
-SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions, new version use 'ANN' field):
+"ANN[*].DISTANCE"
-"EFF[*].EFFECT"
+"ANN[*].ERRORS" (alias WARNING, INFOS)
-"EFF[*].IMPACT"
-"EFF[*].FUNCLASS"
+Older versions produced *EFF* fields::
-"EFF[*].CODON"
-"EFF[*].AA"
+"EFF[*].EFFECT"
-"EFF[*].AA_LEN"
+"EFF[*].IMPACT"
-"EFF[*].GENE"
+"EFF[*].FUNCLASS"
-"EFF[*].BIOTYPE"
+"EFF[*].CODON"
-"EFF[*].CODING"
+"EFF[*].AA"
-"EFF[*].TRID"
+"EFF[*].AA_LEN"
-"EFF[*].RANK"
+"EFF[*].GENE"
-SnpEff 'LOF' fields:
+"EFF[*].BIOTYPE"
-"LOF[*].GENE"
+"EFF[*].CODING"
-"LOF[*].GENEID"
+"EFF[*].TRID"
-"LOF[*].NUMTR"
+"EFF[*].RANK"
-"LOF[*].PERC"
-SnpEff' NMD' fields:
+In addition there are *LOF* and *NMD* fields::
-"NMD[*].GENE"
-"NMD[*].GENEID"
+"LOF[*].GENE"
-"NMD[*].NUMTR"
+"LOF[*].GENEID"
-"NMD[*].PERC"
+"LOF[*].NUMTR"
+"LOF[*].PERC"
-Some examples:
+"NMD[*].GENE"
-- *Extracting chromosome, position, ID and allele frequency from a VCF file*:
+"NMD[*].GENEID"
+"NMD[*].NUMTR"
-**CHROM POS ID AF**
+"NMD[*].PERC"
-The result will look something like::
+To find out whether your VCF contains *ANN* or *EFF* annotations, simply look at its header.
-#CHROM        POS        ID            AF
+-----
-1             69134                    0.086
-1             69496      rs150690004   0.001
+**Usage examples**
-- *Extracting genotype fields*:
+*Extracting chromosome, position, ID and allele frequency from a VCF file*:
-**CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
+**CHROM POS ID AF**
-This means to extract:
+The result will look something like::
-- CHROM POS ID: regular fields (as in the previous example)
+#CHROM        POS        ID            AF
-- THETA : This one is from INFO
+1             69134                    0.086
-- GEN[0].GL[1] : Second likelihood from first genotype
+1             69496      rs150690004   0.001
-- GEN[1].GL : The whole GL fiels (all entries without separating them)
-- GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one).
+*Extracting genotype fields*:
-- GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
+**CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
-The result will look something like::
+This means to extract:
-#CHROM  POS     ID              THETA   GEN[0].GL[1]    GEN[1].GL               GEN[3].GL[*]            GEN[*].GT
-1       10583   rs58108140      0.0046  -0.47           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|0     0|0     0|1     0|0     0|1     0|0     0|0     0|1
+- CHROM POS ID: regular fields (as in the previous example)
-1       10611   rs189107123     0.0077  -0.48           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     0|0     0|0     0|0     0|0
+- THETA : This one is from INFO
-1       13302   rs180734498     0.0048  -0.58           -2.45,-0.00,-5.00       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     1|0     0|0     0|1     0|0
+- GEN[0].GL[1] : Second likelihood from first genotype
+- GEN[1].GL : The whole GL field (all entries without separating them)
-- *Extracting fields with multiple values*:
+- GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one).
-(notice that there are multiple effect columns per line because there are mutiple effects per variant)
+- GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
-**CHROM POS REF ALT ANN[*].EFFECT**
+The result will look something like::
-The result will look something like::
+#CHROM  POS     ID              THETA   GEN[0].GL[1]    GEN[1].GL               GEN[3].GL[*]            GEN[*].GT
+1       10583   rs58108140      0.0046  -0.47           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|0     0|0     0|1     0|0     0|1     0|0     0|0     0|1
-#CHROM	POS	REF	ALT	ANN[*].EFFECT
+1       10611   rs189107123     0.0077  -0.48           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     0|0     0|0     0|0     0|0
-22	17071756	T	C	3_prime_UTR_variant	downstream_gene_variant
+1       13302   rs180734498     0.0048  -0.58           -2.45,-0.00,-5.00       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     1|0     0|0     0|1     0|0
-22	17072035	C	T	missense_variant	downstream_gene_variant
-22	17072258	C	A	missense_variant	downstream_gene_variant
+*Extracting fields with multiple values*:
+(notice that there are multiple effect columns per line because there are multiple effects per variant)
-- *Extracting fields with multiple values using a comma as a multipe field separator:*
+**CHROM POS REF ALT ANN[*].EFFECT**
-**CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
+The result will look something like::
-The result will look something like::
+#CHROM  POS REF ALT ANN[*].EFFECT
-#CHROM	POS	REF	ALT	ANN[*].EFFECT	ANN[*].HGVS_P
+22  17071756    T   C   3_prime_UTR_variant downstream_gene_variant
-22	17071756	T	C	3_prime_UTR_variant,downstream_gene_variant	.,.
+22  17072035    C   T   missense_variant    downstream_gene_variant
-22	17072035	C	T	missense_variant,downstream_gene_variant	p.Gly469Glu,.
+22  17072258    C   A   missense_variant    downstream_gene_variant
-22	17072258	C	A	missense_variant,downstream_gene_variant	p.Gly395Cys,.
+*Extracting fields with multiple values using a comma as a multiple field separator:*
-- *Extracting fields with multiple values, one effect per line:*
+**CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
-**CHROM POS REF ALT ANN[*].EFFECT**
+The result will look something like::
-The result will look something like::
+#CHROM  POS REF ALT ANN[*].EFFECT   ANN[*].HGVS_P
-#CHROM	POS	REF	ALT	ANN[*].EFFECT
+22  17071756    T   C   3_prime_UTR_variant,downstream_gene_variant .,.
-22	17071756	T	C	3_prime_UTR_variant
+22  17072035    C   T   missense_variant,downstream_gene_variant    p.Gly469Glu,.
-22	17071756	T	C	downstream_gene_variant
+22  17072258    C   A   missense_variant,downstream_gene_variant    p.Gly395Cys,.
-22	17072035	C	T	missense_variant
-22	17072035	C	T	downstream_gene_variant
+*Extracting fields with multiple values, one effect per line:*
-22	17072258	C	A	missense_variant
-22	17072258	C	A	downstream_gene_variant
+**CHROM POS REF ALT ANN[*].EFFECT**
+The result will look something like::
+#CHROM  POS REF ALT ANN[*].EFFECT
+22  17071756    T   C   3_prime_UTR_variant
+22  17071756    T   C   downstream_gene_variant
+22  17072035    C   T   missense_variant
+22  17072035    C   T   downstream_gene_variant
+22  17072258    C   A   missense_variant
+22  17072258    C   A   downstream_gene_variant
 @EXTERNAL_DOCUMENTATION@
 - http://snpeff.sourceforge.net/SnpSift.html#Extract
 ]]></help>
 <expand macro="citations" />

Mercurial > repos > iuc > snpsift

comparison snpSift_extractFields.xml @ 5:09d6806c609e draft