Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.xml @ 11:dbe37a658cd2 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
author | earlhaminst |
---|---|
date | Sun, 27 Sep 2020 18:54:31 +0000 |
parents | e8e75a79de59 |
children | 99bae410128c |
comparison
equal
deleted
inserted
replaced
10:e8e75a79de59 | 11:dbe37a658cd2 |
---|---|
1 <tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1"> | 1 <tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2"> |
2 <description>converts data for the workflow</description> | 2 <description>converts data for the workflow</description> |
3 <requirements> | |
4 <requirement type="package" version="3.7">python</requirement> | |
5 </requirements> | |
3 <command detect_errors="exit_code"><![CDATA[ | 6 <command detect_errors="exit_code"><![CDATA[ |
4 python '$__tool_directory__/gstf_preparation.py' | 7 python '$__tool_directory__/gstf_preparation.py' |
5 #for $q in $queries | 8 #for $q in $queries |
6 --gff3 '${q.genome}:${q.gff3_input}' | 9 --gff3 '${q.genome}:${q.gff3_input}' |
7 #end for | 10 #end for |
12 #end if | 15 #end if |
13 #for $fasta_input in $fasta_inputs | 16 #for $fasta_input in $fasta_inputs |
14 --fasta '${fasta_input}' | 17 --fasta '${fasta_input}' |
15 #end for | 18 #end for |
16 #if $headers | 19 #if $headers |
17 --headers | 20 --headers $headers |
18 #end if | 21 #end if |
19 #if $longestCDS | 22 #if $filter |
20 -l | 23 --filter $filter |
21 #end if | 24 #end if |
22 #if $regions | 25 #if $regions |
23 --regions '$regions' | 26 --regions '$regions' |
24 --ff '$filtered_fasta' | 27 --ff '$filtered_fasta' |
25 #end if | 28 #end if |
34 <validator type="empty_field" /> | 37 <validator type="empty_field" /> |
35 </param> | 38 </param> |
36 </repeat> | 39 </repeat> |
37 <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" /> | 40 <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" /> |
38 <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" /> | 41 <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" /> |
39 <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" /> | 42 <param name="filter" type="select" display="radio" label="Which transcripts to keep"> |
40 <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" /> | 43 <option value="canonical" selected="true">Only canonical transcripts (or longest CDS per gene)</option> |
44 <option value="coding">Only protein-coding transcripts</option> | |
45 <option value="">All transcripts</option> | |
46 </param> | |
47 | |
48 <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow. Only the TranscriptId_species format is accepted by the Aequatus visualisation"> | |
49 <option value="TranscriptId_species" selected="true">TranscriptId_species</option> | |
50 <option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option> | |
51 <option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option> | |
52 <option value="">Don't change</option> | |
53 </param> | |
41 <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" /> | 54 <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" /> |
42 </inputs> | 55 </inputs> |
43 | 56 |
44 <outputs> | 57 <outputs> |
45 <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" /> | 58 <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" /> |
49 </data> | 62 </data> |
50 </outputs> | 63 </outputs> |
51 | 64 |
52 <tests> | 65 <tests> |
53 <test expect_num_outputs="2"> | 66 <test expect_num_outputs="2"> |
67 <repeat name="queries"> | |
68 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> | |
69 <param name="genome" value="caenorhabditis_elegans" /> | |
70 </repeat> | |
54 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> | 71 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> |
55 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> | 72 <param name="filter" value="coding" /> |
56 <param name="genome" value="caenorhabditis_elegans" /> | 73 <param name="headers" value="TranscriptId_species" /> |
57 <param name="longestCDS" value="false" /> | |
58 <param name="headers" value="true" /> | |
59 | 74 |
60 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> | 75 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> |
61 <output name="output_fasta" file="test1.fasta" /> | 76 <output name="output_fasta" file="test1.fasta" /> |
62 </test> | 77 </test> |
63 <test expect_num_outputs="2"> | 78 <test expect_num_outputs="2"> |
79 <repeat name="queries"> | |
80 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> | |
81 <param name="genome" value="caenorhabditis_elegans" /> | |
82 </repeat> | |
64 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> | 83 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> |
65 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> | 84 <param name="filter" value="canonical" /> |
66 <param name="genome" value="caenorhabditis_elegans" /> | 85 <param name="headers" value="TranscriptId_species" /> |
67 <param name="longestCDS" value="true" /> | |
68 <param name="headers" value="true" /> | |
69 | 86 |
70 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> | 87 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> |
71 <output name="output_fasta" file="test1_longest.fasta" /> | 88 <output name="output_fasta" file="test1_longest.fasta" /> |
72 </test> | 89 </test> |
73 <test expect_num_outputs="2"> | 90 <test expect_num_outputs="2"> |
74 <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> | 91 <param name="json" ftype="json" value="gene.json" /> |
75 <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> | 92 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> |
76 <param name="genome" value="caenorhabditis_elegans" /> | 93 <param name="filter" value="" /> |
77 <param name="longestCDS" value="false" /> | 94 <param name="headers" value="" /> |
78 <param name="headers" value="false" /> | |
79 | 95 |
80 <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> | 96 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> |
81 <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> | 97 <output name="output_fasta" file="CDS.fasta" /> |
82 </test> | 98 </test> |
83 <test expect_num_outputs="2"> | 99 <test expect_num_outputs="2"> |
100 <param name="json" ftype="json" value="gene.json" /> | |
84 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> | 101 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> |
85 <param name="json" ftype="json" value="gene.json" /> | 102 <param name="filter" value="coding" /> |
86 <param name="longestCDS" value="false" /> | 103 <param name="headers" value="TranscriptId_species" /> |
87 <param name="headers" value="true" /> | |
88 | 104 |
89 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> | 105 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> |
90 <output name="output_fasta" file="test4.fasta" /> | 106 <output name="output_fasta" file="test4.fasta" /> |
91 </test> | 107 </test> |
92 <test> | 108 <test> |
109 <param name="json" ftype="json" value="gene.json" /> | |
93 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> | 110 <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> |
94 <param name="json" ftype="json" value="gene.json" /> | 111 <param name="filter" value="coding" /> |
95 <param name="longestCDS" value="false" /> | 112 <param name="headers" value="TranscriptId_species" /> |
96 <param name="headers" value="true" /> | |
97 <param name="regions" value="X" /> | 113 <param name="regions" value="X" /> |
98 | 114 |
99 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> | 115 <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> |
100 <output name="output_fasta" file="test5_filtered.fasta" /> | 116 <output name="output_fasta" file="test5_filtered.fasta" /> |
101 <output name="filtered_fasta" file="test5.ns.fasta" /> | 117 <output name="filtered_fasta" file="test5.ns.fasta" /> |
102 </test> | 118 </test> |
103 <test expect_num_outputs="2"> | 119 <test expect_num_outputs="2"> |
120 <repeat name="queries"> | |
121 <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" /> | |
122 <param name="genome" value="mus_pahari" /> | |
123 </repeat> | |
104 <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" /> | 124 <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" /> |
105 <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" /> | 125 <param name="filter" value="canonical" /> |
106 <param name="genome" value="mus_pahari" /> | 126 <param name="headers" value="TranscriptId_species" /> |
107 <param name="longestCDS" value="true" /> | |
108 <param name="headers" value="true" /> | |
109 | 127 |
110 <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" /> | 128 <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" /> |
111 <output name="output_fasta" file="test6.fasta" /> | 129 <output name="output_fasta" file="test6.fasta" /> |
112 </test> | 130 </test> |
113 </tests> | 131 </tests> |
114 <help><![CDATA[ | 132 <help><![CDATA[ |
115 **What it does** | 133 **What it does** |
116 | 134 |
117 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format. | 135 This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format. |
118 | 136 |
119 It also filters the CDS FASTA datasets to: | 137 It also filters the CDS FASTA datasets to keep only the transcripts present in the gene feature information. |
120 | 138 |
121 - remove coding sequences whose length is not a multiple of 3 | 139 Optionally it can also: |
122 - keep only the transcripts present in the gene feature information. | 140 - keep only canonical transcripts (or the longest CDS per gene, if the canonical attribute is not provided) |
123 | 141 - remove sequences which are annotated as non protein-coding or whose length is not a multiple of 3 |
124 Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). | 142 - change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). |
125 | 143 |
126 Example GFF3 file:: | 144 Example GFF3 file:: |
127 | 145 |
128 scaffold_0 MYZPE13164_Clone_G006_v1.0 gene 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030;biotype=protein_coding | 146 scaffold_0 MYZPE13164_Clone_G006_v1.0 gene 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030;biotype=protein_coding |
129 scaffold_0 MYZPE13164_Clone_G006_v1.0 mRNA 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030.1;Parent=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030.1;biotype=protein_coding;_AED=0.31 | 147 scaffold_0 MYZPE13164_Clone_G006_v1.0 mRNA 44968 69413 . - . ID=MYZPE13164_G006_v1.0_000000030.1;Parent=MYZPE13164_G006_v1.0_000000030;Name=MYZPE13164_G006_v1.0_000000030.1;biotype=protein_coding;_AED=0.31 |