Mercurial > repos > iuc > snpsift
diff snpSift_extractFields.xml @ 5:09d6806c609e draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit 70ff70918368ff0deeb596c2190a770abe9e1c9b
author | iuc |
---|---|
date | Wed, 18 Apr 2018 07:28:51 -0400 |
parents | 20c7d583fec1 |
children |
line wrap: on
line diff
--- a/snpSift_extractFields.xml Tue Oct 24 07:28:17 2017 -0400 +++ b/snpSift_extractFields.xml Wed Apr 18 07:28:51 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0"> +<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.galaxy0"> <options sanitize="False" /> <description>from a VCF file into a tabular file</description> <macros> @@ -9,27 +9,27 @@ <expand macro="version_command" /> <command><![CDATA[ @CONDA_SNPSIFT_JAR_PATH@ && -cat '$input' +cat '${input}' #if $one_effect_per_line: - | "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl" + | perl "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl" #end if | SnpSift -Xmx6G extractFields #if $separator: - -s '$separator' + -s '${separator}' #end if #if $empty_text: - -e '$empty_text' + -e '${empty_text}' #end if - #echo ' '.join(['"%s"' % x for x in $extract.split()]) -> '$output' +> '${output}' ]]></command> <inputs> <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/> - <param name="extract" type="text" label="Extract" help="Need help? See below a few examples." /> + <param name="extract" type="text" label="Fields to extract" value="CHROM POS ID REF ALT FILTER" help="Separated by spaces. See help below for an explanation" /> <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across mutiple lines" /> - <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. 
a comma, rather than a column for each of the multiple values" /> - <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" /> + <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" argument="-s" /> + <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" argument="-e"/> </inputs> <outputs> <data name="output" format="tabular" /> @@ -40,156 +40,171 @@ <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> <output name="output"> <assert_contents> - <has_text text="INTRAGENIC" /> - <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> + <has_text text="INTRAGENIC" /> + <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> </assert_contents> </output> </test> - <test> <param name="input" ftype="vcf" value="test_rmInfo.vcf"/> <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/> <param name="separator" value=","/> <output name="output"> <assert_contents> - <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> + <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" /> </assert_contents> </output> </test> + <test> + <param name="input" ftype="vcf" value="extFields_test3_in.vcf"/> + <param name="extract" value="CHROM POS ID REF ALT FILTER ANN[*].EFFECT"/> + <param name="one_effect_per_line" value="true"/> + <output name="output" value="extFields_test3_out.vcf"/> + </test> </tests> <help><![CDATA[ -**SnpSift Extract Fields** +**What it does** + +`SnpSift Extract Fields <http://snpeff.sourceforge.net/SnpSift.html#Extract>`_ selects columns from a VCF dataset into a Tab-delimited format. 
+ +------ -Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc. +.. class:: infomark + +**How to know which fields to extract?** + +A VCF dataset contains mandatory fields as well as optional fields. Mandatory fields are required by `VCF specifications <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`_ and present in any valid VCF dataset. The **Fields to extract** input box of the tool above is already pre-filled with names of mandatory fields. -http://snpeff.sourceforge.net/SnpSift.html#Extract +To know what other fields are available in a given VCF file simply look at its header. `INFO` and `FORMAT` lines will contain descriptions of existing fields. For example, if you see a line: + +##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data"> + +you can use *NS* as the field name. -You can also use sub-fields and genotype fields / sub-fields such as:: +------ + +**Dealing with fields generated by SnpEff** + +The current version of `SnpEff <http://snpeff.sourceforge.net/SnpEff_manual.html>`_ produces so called *ANN* fields:: - Standard VCF fields: - CHROM - POS - ID - REF - ALT - FILTER - INFO fields: - AF - AC - DP - MQ - etc. (any info field available) - SnpEff 'ANN' fields: - "ANN[*].ALLELE" (alias GENOTYPE) - "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.) - "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER } - "ANN[*].GENE" Gene name (e.g. 'PSD3') - "ANN[*].GENEID" Gene ID - "ANN[*].FEATURE" - "ANN[*].FEATUREID" (alias TRID: Transcript ID) - "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding') - "ANN[*].RANK" Exon or Intron rank (i.e. 
exon number in a transcript) - "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation - "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation - "ANN[*].CDNA_POS" (alias POS_CDNA) - "ANN[*].CDNA_LEN" (alias LEN_CDNA) - "ANN[*].CDS_POS" (alias POS_CDS) - "ANN[*].CDS_LEN" (alias LEN_CDS) - "ANN[*].AA_POS" (alias POS_AA) - "ANN[*].AA_LEN" (alias LEN_AA) - "ANN[*].DISTANCE" - "ANN[*].ERRORS" (alias WARNING, INFOS) - SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions, new version use 'ANN' field): - "EFF[*].EFFECT" - "EFF[*].IMPACT" - "EFF[*].FUNCLASS" - "EFF[*].CODON" - "EFF[*].AA" - "EFF[*].AA_LEN" - "EFF[*].GENE" - "EFF[*].BIOTYPE" - "EFF[*].CODING" - "EFF[*].TRID" - "EFF[*].RANK" - SnpEff 'LOF' fields: - "LOF[*].GENE" - "LOF[*].GENEID" - "LOF[*].NUMTR" - "LOF[*].PERC" - SnpEff' NMD' fields: - "NMD[*].GENE" - "NMD[*].GENEID" - "NMD[*].NUMTR" - "NMD[*].PERC" + "ANN[*].ALLELE" (alias GENOTYPE) + "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.) + "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER } + "ANN[*].GENE" Gene name (e.g. 'PSD3') + "ANN[*].GENEID" Gene ID + "ANN[*].FEATURE" + "ANN[*].FEATUREID" (alias TRID: Transcript ID) + "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding') + "ANN[*].RANK" Exon or Intron rank (i.e. 
exon number in a transcript) + "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation + "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation + "ANN[*].CDNA_POS" (alias POS_CDNA) + "ANN[*].CDNA_LEN" (alias LEN_CDNA) + "ANN[*].CDS_POS" (alias POS_CDS) + "ANN[*].CDS_LEN" (alias LEN_CDS) + "ANN[*].AA_POS" (alias POS_AA) + "ANN[*].AA_LEN" (alias LEN_AA) + "ANN[*].DISTANCE" + "ANN[*].ERRORS" (alias WARNING, INFOS) + +Older versions produced *EFF* fields:: -Some examples: + "EFF[*].EFFECT" + "EFF[*].IMPACT" + "EFF[*].FUNCLASS" + "EFF[*].CODON" + "EFF[*].AA" + "EFF[*].AA_LEN" + "EFF[*].GENE" + "EFF[*].BIOTYPE" + "EFF[*].CODING" + "EFF[*].TRID" + "EFF[*].RANK" + +In addition there are *LOF* and *NMD* fields:: + + "LOF[*].GENE" + "LOF[*].GENEID" + "LOF[*].NUMTR" + "LOF[*].PERC" + + "NMD[*].GENE" + "NMD[*].GENEID" + "NMD[*].NUMTR" + "NMD[*].PERC" -- *Extracting chromosome, position, ID and allele frequency from a VCF file*: +To find out whether your VCF contains *ANN* or *EFF* annotations simply look at its header. + +----- - **CHROM POS ID AF** +**Usage examples** - The result will look something like:: +*Extracting chromosome, position, ID and allele frequency from a VCF file*: - #CHROM POS ID AF - 1 69134 0.086 - 1 69496 rs150690004 0.001 +**CHROM POS ID AF** + +The result will look something like:: -- *Extracting genotype fields*: + #CHROM POS ID AF + 1 69134 0.086 + 1 69496 rs150690004 0.001 - **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT** - - This means to extract: +*Extracting genotype fields*: - - CHROM POS ID: regular fields (as in the previous example) - - THETA : This one is from INFO - - GEN[0].GL[1] : Second likelihood from first genotype - - GEN[1].GL : The whole GL fiels (all entries without separating them) - - GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one). 
- - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated). +**CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT** + +This means to extract: - The result will look something like:: +- CHROM POS ID: regular fields (as in the previous example) +- THETA : This one is from INFO +- GEN[0].GL[1] : Second likelihood from first genotype +- GEN[1].GL : The whole GL field (all entries without separating them) +- GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one). +- GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated). - #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT - 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1 - 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 - 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0 +The result will look something like:: -- *Extracting fields with multiple values*: - (notice that there are multiple effect columns per line because there are mutiple effects per variant) + #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT + 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1 + 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 + 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0 - **CHROM POS REF ALT ANN[*].EFFECT** +*Extracting fields with multiple values*: + (notice that there are multiple effect columns per line because there are multiple effects per variant) - The result will look something like:: +**CHROM POS REF ALT ANN[*].EFFECT** + +The result will look something like:: - #CHROM POS REF ALT ANN[*].EFFECT - 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant - 22 
17072035 C T missense_variant downstream_gene_variant - 22 17072258 C A missense_variant downstream_gene_variant + #CHROM POS REF ALT ANN[*].EFFECT + 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant + 22 17072035 C T missense_variant downstream_gene_variant + 22 17072258 C A missense_variant downstream_gene_variant -- *Extracting fields with multiple values using a comma as a multipe field separator:* +*Extracting fields with multiple values using a comma as a multiple field separator:* - **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P** +**CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P** - The result will look something like:: +The result will look something like:: - #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P - 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,. - 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,. - 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,. + #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P + 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,. + 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,. + 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,. 
-- *Extracting fields with multiple values, one effect per line:* +*Extracting fields with multiple values, one effect per line:* - **CHROM POS REF ALT ANN[*].EFFECT** +**CHROM POS REF ALT ANN[*].EFFECT** - The result will look something like:: +The result will look something like:: - #CHROM POS REF ALT ANN[*].EFFECT - 22 17071756 T C 3_prime_UTR_variant - 22 17071756 T C downstream_gene_variant - 22 17072035 C T missense_variant - 22 17072035 C T downstream_gene_variant - 22 17072258 C A missense_variant - 22 17072258 C A downstream_gene_variant + #CHROM POS REF ALT ANN[*].EFFECT + 22 17071756 T C 3_prime_UTR_variant + 22 17071756 T C downstream_gene_variant + 22 17072035 C T missense_variant + 22 17072035 C T downstream_gene_variant + 22 17072258 C A missense_variant + 22 17072258 C A downstream_gene_variant @EXTERNAL_DOCUMENTATION@ - http://snpeff.sourceforge.net/SnpSift.html#Extract