comparison gstf_preparation.xml @ 11:dbe37a658cd2 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
author earlhaminst
date Sun, 27 Sep 2020 18:54:31 +0000
parents e8e75a79de59
children 99bae410128c
comparison
equal deleted inserted replaced
10:e8e75a79de59 11:dbe37a658cd2
1 <tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1"> 1 <tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2">
2 <description>converts data for the workflow</description> 2 <description>converts data for the workflow</description>
3 <requirements>
4 <requirement type="package" version="3.7">python</requirement>
5 </requirements>
3 <command detect_errors="exit_code"><![CDATA[ 6 <command detect_errors="exit_code"><![CDATA[
4 python '$__tool_directory__/gstf_preparation.py' 7 python '$__tool_directory__/gstf_preparation.py'
5 #for $q in $queries 8 #for $q in $queries
6 --gff3 '${q.genome}:${q.gff3_input}' 9 --gff3 '${q.genome}:${q.gff3_input}'
7 #end for 10 #end for
12 #end if 15 #end if
13 #for $fasta_input in $fasta_inputs 16 #for $fasta_input in $fasta_inputs
14 --fasta '${fasta_input}' 17 --fasta '${fasta_input}'
15 #end for 18 #end for
16 #if $headers 19 #if $headers
17 --headers 20 --headers $headers
18 #end if 21 #end if
19 #if $longestCDS 22 #if $filter
20 -l 23 --filter $filter
21 #end if 24 #end if
22 #if $regions 25 #if $regions
23 --regions '$regions' 26 --regions '$regions'
24 --ff '$filtered_fasta' 27 --ff '$filtered_fasta'
25 #end if 28 #end if
34 <validator type="empty_field" /> 37 <validator type="empty_field" />
35 </param> 38 </param>
36 </repeat> 39 </repeat>
37 <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" /> 40 <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
38 <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" /> 41 <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" />
39 <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" /> 42 <param name="filter" type="select" display="radio" label="Which transcripts to keep">
40 <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the &gt;TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" /> 43 <option value="canonical" selected="true">Only canonical transcripts (or longest CDS per gene)</option>
44 <option value="coding">Only protein-coding transcripts</option>
45 <option value="">All transcripts</option>
46 </param>
47
48 <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow. TranscriptId_species is the only format accepted by the Aequatus visualisation">
49 <option value="TranscriptId_species" selected="true">TranscriptId_species</option>
50 <option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option>
51 <option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option>
52 <option value="">Don't change</option>
53 </param>
41 <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" /> 54 <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />
42 </inputs> 55 </inputs>
43 56
44 <outputs> 57 <outputs>
45 <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" /> 58 <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" />
49 </data> 62 </data>
50 </outputs> 63 </outputs>
51 64
52 <tests> 65 <tests>
53 <test expect_num_outputs="2"> 66 <test expect_num_outputs="2">
67 <repeat name="queries">
68 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
69 <param name="genome" value="caenorhabditis_elegans" />
70 </repeat>
54 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> 71 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
55 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> 72 <param name="filter" value="coding" />
56 <param name="genome" value="caenorhabditis_elegans" /> 73 <param name="headers" value="TranscriptId_species" />
57 <param name="longestCDS" value="false" />
58 <param name="headers" value="true" />
59 74
60 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> 75 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
61 <output name="output_fasta" file="test1.fasta" /> 76 <output name="output_fasta" file="test1.fasta" />
62 </test> 77 </test>
63 <test expect_num_outputs="2"> 78 <test expect_num_outputs="2">
79 <repeat name="queries">
80 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
81 <param name="genome" value="caenorhabditis_elegans" />
82 </repeat>
64 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> 83 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
65 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> 84 <param name="filter" value="canonical" />
66 <param name="genome" value="caenorhabditis_elegans" /> 85 <param name="headers" value="TranscriptId_species" />
67 <param name="longestCDS" value="true" />
68 <param name="headers" value="true" />
69 86
70 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> 87 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />
71 <output name="output_fasta" file="test1_longest.fasta" /> 88 <output name="output_fasta" file="test1_longest.fasta" />
72 </test> 89 </test>
73 <test expect_num_outputs="2"> 90 <test expect_num_outputs="2">
74 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> 91 <param name="json" ftype="json" value="gene.json" />
75 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> 92 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
76 <param name="genome" value="caenorhabditis_elegans" /> 93 <param name="filter" value="" />
77 <param name="longestCDS" value="false" /> 94 <param name="headers" value="" />
78 <param name="headers" value="false" />
79 95
80 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> 96 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
81 <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> 97 <output name="output_fasta" file="CDS.fasta" />
82 </test> 98 </test>
83 <test expect_num_outputs="2"> 99 <test expect_num_outputs="2">
100 <param name="json" ftype="json" value="gene.json" />
84 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> 101 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
85 <param name="json" ftype="json" value="gene.json" /> 102 <param name="filter" value="coding" />
86 <param name="longestCDS" value="false" /> 103 <param name="headers" value="TranscriptId_species" />
87 <param name="headers" value="true" />
88 104
89 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> 105 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
90 <output name="output_fasta" file="test4.fasta" /> 106 <output name="output_fasta" file="test4.fasta" />
91 </test> 107 </test>
92 <test> 108 <test>
109 <param name="json" ftype="json" value="gene.json" />
93 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> 110 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
94 <param name="json" ftype="json" value="gene.json" /> 111 <param name="filter" value="coding" />
95 <param name="longestCDS" value="false" /> 112 <param name="headers" value="TranscriptId_species" />
96 <param name="headers" value="true" />
97 <param name="regions" value="X" /> 113 <param name="regions" value="X" />
98 114
99 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> 115 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />
100 <output name="output_fasta" file="test5_filtered.fasta" /> 116 <output name="output_fasta" file="test5_filtered.fasta" />
101 <output name="filtered_fasta" file="test5.ns.fasta" /> 117 <output name="filtered_fasta" file="test5.ns.fasta" />
102 </test> 118 </test>
103 <test expect_num_outputs="2"> 119 <test expect_num_outputs="2">
120 <repeat name="queries">
121 <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />
122 <param name="genome" value="mus_pahari" />
123 </repeat>
104 <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" /> 124 <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" />
105 <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" /> 125 <param name="filter" value="canonical" />
106 <param name="genome" value="mus_pahari" /> 126 <param name="headers" value="TranscriptId_species" />
107 <param name="longestCDS" value="true" />
108 <param name="headers" value="true" />
109 127
110 <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" /> 128 <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" />
111 <output name="output_fasta" file="test6.fasta" /> 129 <output name="output_fasta" file="test6.fasta" />
112 </test> 130 </test>
113 </tests> 131 </tests>
114 <help><![CDATA[ 132 <help><![CDATA[
115 **What it does** 133 **What it does**
116 134
117 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format. 135 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
118 136
119 It also filters the CDS FASTA datasets to: 137 It also filters the CDS FASTA datasets to keep only the transcripts present in the gene feature information.
120 138
121 - remove coding sequences whose length is not a multiple of 3 139 Optionally it can also:
122 - keep only the transcripts present in the gene feature information. 140 - keep only canonical transcripts (or the longest CDS per gene, if the canonical attribute is not provided)
123 141 - remove sequences which are annotated as non protein-coding or whose length is not a multiple of 3
124 Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). 142 - change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
125 143
126 Example GFF3 file:: 144 Example GFF3 file::
127 145
128 scaffold_0 MYZPE13164_Clone_G006_v1.0 gene 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030;biotype=protein_coding 146 scaffold_0 MYZPE13164_Clone_G006_v1.0 gene 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030;biotype=protein_coding
129 scaffold_0 MYZPE13164_Clone_G006_v1.0 mRNA 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030.1;Parent=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030.1;biotype=protein_coding;_AED=0.31 147 scaffold_0 MYZPE13164_Clone_G006_v1.0 mRNA 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030.1;Parent=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030.1;biotype=protein_coding;_AED=0.31