Mercurial > repos > galaxyp > translate_bed
comparison translate_bed.xml @ 0:038ecf54cbec draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
author | galaxyp |
---|---|
date | Mon, 22 Jan 2018 13:59:27 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:038ecf54cbec |
---|---|
1 <tool id="translate_bed" name="Translate BED transcripts" version="0.1.0"> | |
2 <description>cDNA in 3frames or CDS</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <requirements> | |
7 <expand macro="ensembl_requirements" /> | |
8 <expand macro="bedutil_requirements" /> | |
9 <expand macro="twobit_requirements" /> | |
10 </requirements> | |
11 <stdio> | |
12 <exit_code range="1:" /> | |
13 </stdio> | |
14 <!-- Assemble the translate_bed.py invocation. Values taken from user text fields, dataset paths or dataset metadata are single-quoted so each reaches the script as exactly one shell argument. --> <command detect_errors="aggressive"><![CDATA[ | |
15 python '$__tool_directory__/translate_bed.py' | |
16 #if $ref.ref_source == 'cached': | |
17 --twobit='$ref.ref_loc.fields.path' | |
18 #elif $ref.ref_source == 'history': | |
19 --twobit='$ref.ref_file' | |
20 #elif $ref.ref_source == 'last_column': | |
21 --column '-1' | |
22 #elif $ref.ref_source == 'select_column': | |
23 --column ${int(str($ref.seq_column)) - 1} | |
24 #elif $ref.ref_source == 'ensembl_rest': | |
25 --ensembl | |
26 #end if | |
27 #if $translations.translate == 'CDS': | |
28 --cds | |
29 #elif $translations.translate == 'cDNA': | |
30 --all | |
31 #end if | |
32 $translations.start_codon | |
33 #if $bed_filters.biotypes: | |
34 --biotypes '$bed_filters.biotypes' | |
35 --ensembl | |
36 #end if | |
37 #if $bed_filters.regions: | |
38 --regions '$bed_filters.regions' | |
39 #end if | |
40 --min_length $translations.min_length | |
41 #if $translations.enzyme: | |
42 --enzyme '$translations.enzyme' | |
43 #end if | |
44 #if $fa_id.fa_db: | |
45 --fa_db='$fa_id.fa_db' | |
46 #end if | |
47 #if $fa_id.fa_sep: | |
48 --fa_sep='$fa_id.fa_sep' | |
49 #end if | |
50 #if $fa_id.reference: | |
51 --reference '$fa_id.reference' | |
52 #else: | |
53 --reference '${input.metadata.dbkey}' | |
54 #end if | |
55 #if $fa_id.id_prefix: | |
56 --id_prefix '$fa_id.id_prefix' | |
57 #end if | |
58 --bed '$translation_bed' | |
59 --fasta '$translation_fasta' | |
60 -v | |
61 '$input' | |
62 ]]></command> | |
63 <inputs> | |
64 <param name="input" type="data" format="bed" label="A BED file with 12 columns" | |
65 help="thickStart and thickEnd define protein coding region, blocks define exon regions"/> | |
66 <conditional name="ref"> | |
67 <param name="ref_source" type="select" label="Source for Genomic Sequence Data"> | |
68 <option value="cached">Locally cached twobit</option> | |
69 <option value="history">History dataset twobit</option> | |
70 <option value="last_column">Last Column in the BED file</option> | |
71 <option value="select_column">Select Column in the BED file</option> | |
72 <option value="ensembl_rest">Retrieve sequences from Ensembl (Slow and only for Ensembl Transcripts)</option> | |
73 </param> | |
74 <when value="cached"> | |
75 <param name="ref_loc" type="select" label="Select reference 2bit file"> | |
76 <options from_data_table="twobit" /> | |
77 </param> | |
78 </when> | |
79 <when value="history"> | |
80 <param name="ref_file" type="data" format="twobit" label="reference 2bit file" /> | |
81 </when> | |
82 <when value="last_column"/> | |
83 <when value="select_column"> <!-- the command template passes this column number minus 1 as the column option --> | |
84 <param name="seq_column" type="data_column" data_ref="input" label="BED column containing the genomic sequence" | |
85 help="unspliced genomic sequence from chromStart to chromEnd (Extract Genomic DNA)"/> | |
86 </when> | |
87 <when value="ensembl_rest"/> | |
88 </conditional> | |
89 <section name="bed_filters" expanded="false" title="BED Filtering Options"> | |
90 <param name="regions" type="text" value="" optional="true" label="Restrict to features overlapping a comma-separated list of regions" > | |
91 <help>Each region is specified as: chr or chr:pos or chr:from-to</help> | |
92 <!-- empty value is accepted; otherwise one or more comma-separated chr, chr:pos or chr:from-to entries --> <validator type="regex" message="Regions must be a comma-separated list of chr, chr:pos or chr:from-to entries">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator> | |
93 </param> | |
94 <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature translation to these biotypes" | |
95 help="For 20 column BED from Ensembl REST server"> | |
96 <expand macro="biotypes_help" /> | |
97 </param> | |
98 </section> | |
99 <section name="translations" expanded="false" title="Translation Options"> | |
100 <param name="translate" type="select" label="Feature translation"> | |
101 <option value="cDNA_minus_CDS">cDNA in 3 frames excluding known CDS</option> | |
102 <option value="cDNA">cDNA in 3 frames</option> | |
103 <option value="CDS">CDS proteins</option> | |
104 </param> | |
105 <param name="min_length" type="integer" value="10" min="1" label="Minimum length of protein translation to report"/> | |
106 <param name="start_codon" type="boolean" truevalue="--start_codon" falsevalue="" checked="false" | |
107 label="Require translations to start with Methionine, trim other leading Amino Acids"/> | |
108 <param name="enzyme" type="select" optional="true" label="Digest enzyme" | |
109 help="Split the protein into peptides according to enzyme digestion"> | |
110 <!-- the look-behind "(?<" in the displayed pattern must be written with an escaped less-than sign to keep the file well-formed --> <option value="trypsin">trypsin: ([KR](?=[^P]))|((?&lt;=W)K(?=P))|((?&lt;=M)R(?=P))</option> | |
111 </param> | |
112 </section> | |
113 <section name="fa_id" expanded="false" title="Fasta ID Options"> | |
114 <param name="reference" type="text" value="" optional="true" label="Genome reference name" | |
115 help="By default, the input bed dataset metadata will be used."/> | |
116 <param name="fa_db" type="text" value="" optional="true" label="fasta ID source, e.g. generic" | |
117 help="Any Compomics application such as PeptideShaker, requires a source for non reference proteins of 'generic' e.g.: generic|pep1|peptide description"> | |
118 </param> | |
119 <param name="fa_sep" type="text" value="" optional="true" label="fasta ID line separator character" | |
120 help="defaults to the pipe character, Ensembl FASTA files usually use a space character"> | |
121 </param> | |
122 <param name="id_prefix" type="text" value="" optional="true" label="ID prefix for generated IDs" | |
123 help="Can be used to distinguish samples"> | |
124 <!-- the hyphen is placed last in the character class so it is a literal hyphen rather than a range operator --> <validator type="regex" message="Allowed chars:a-z A-Z 0-9 _ - |">^[a-zA-Z0-9_|-]*$</validator> | |
125 </param> | |
126 </section> | |
127 </inputs> | |
128 <outputs> | |
129 <!-- TODO: update translation_bed format to "probed" when that datatype is available; until then it is declared as plain bed and the action below sets its 25 column names (12 BED columns followed by 13 further columns) as metadata --> | |
130 <data name="translation_bed" format="bed" label="Translate ${translations.translate} on ${on_string} ${input.name}.proBed"> | |
131 <actions> | |
132 <action name="column_names" type="metadata" | |
133 default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,proteinAccession,peptideSequence,uniqueness,genomeReferenceVersion,psmScore,fdr,modifications,charge,expMassToCharge,calcMassToCharge,psmRank,datasetID,uri"/> | |
134 </actions> | |
135 </data> | |
136 <data name="translation_fasta" format="fasta" label="Translate ${translations.translate} on ${on_string} ${input.name}.fasta"/> | |
137 </outputs> | |
138 <tests> | |
139 <test> <!-- ensembl_rest sequence source with cDNA_minus_CDS: transcript ENST00000641515 must appear in both the BED and the FASTA output --> | |
140 <param name="input" value="human_transcripts.bed" ftype="bed12"/> | |
141 <param name="ref_source" value="ensembl_rest"/> | |
142 <param name="translate" value="cDNA_minus_CDS"/> | |
143 <output name="translation_bed"> | |
144 <assert_contents> | |
145 <has_text text="ENST00000641515" /> | |
146 </assert_contents> | |
147 </output> | |
148 <output name="translation_fasta"> | |
149 <assert_contents> | |
150 <has_text text=">ENST00000641515" /> | |
151 </assert_contents> | |
152 </output> | |
153 </test> | |
154 <test> <!-- last_column sequence source with cDNA_minus_CDS: expects transcript ENST00000488147 and one known peptide in both outputs --> | |
155 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/> | |
156 <param name="ref_source" value="last_column"/> | |
157 <param name="translate" value="cDNA_minus_CDS"/> | |
158 <output name="translation_bed"> | |
159 <assert_contents> | |
160 <has_text text="ENST00000488147" /> | |
161 <has_text text="FLLSSLLIGVPFCTSPHSCFSMFFGRSKAALTAKLTLMRV" /> | |
162 </assert_contents> | |
163 </output> | |
164 <output name="translation_fasta"> | |
165 <assert_contents> | |
166 <has_text text="ENST00000488147" /> | |
167 <has_text text="FLLSSLLIGVPFCTSPHSCFSMFFGRSKAALTAKLTLMRV" /> | |
168 </assert_contents> | |
169 </output> | |
170 </test> | |
171 <test> <!-- last_column sequence source with CDS translation: expects transcript ENST00000641515 and one known peptide in both outputs --> | |
172 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/> | |
173 <param name="ref_source" value="last_column"/> | |
174 <param name="translate" value="CDS"/> | |
175 <output name="translation_bed"> | |
176 <assert_contents> | |
177 <has_text text="ENST00000641515" /> | |
178 <has_text text="MVTEFIFLGLSDSQELQTFLFMLFFVFY" /> | |
179 </assert_contents> | |
180 </output> | |
181 <output name="translation_fasta"> | |
182 <assert_contents> | |
183 <has_text text="ENST00000641515" /> | |
184 <has_text text="MVTEFIFLGLSDSQELQTFLFMLFFVFY" /> | |
185 </assert_contents> | |
186 </output> | |
187 </test> | |
188 <test> <!-- biotype filter protein_coding, start_codon off, fa_db generic, id_prefix test_: expects prefixed IDs and the untrimmed peptide in both outputs --> | |
189 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/> | |
190 <param name="ref_source" value="last_column"/> | |
191 <param name="translate" value="cDNA_minus_CDS"/> | |
192 <param name="biotypes" value="protein_coding"/> | |
193 <param name="start_codon" value="False"/> | |
194 <param name="fa_db" value="generic"/> | |
195 <param name="id_prefix" value="test_"/> | |
196 <output name="translation_bed"> | |
197 <assert_contents> | |
198 <has_text text="test_ENST00000641515" /> | |
199 <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" /> | |
200 </assert_contents> | |
201 </output> | |
202 <output name="translation_fasta"> | |
203 <assert_contents> | |
204 <has_text text="generic|test_ENST00000641515" /> | |
205 <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" /> | |
206 </assert_contents> | |
207 </output> | |
208 </test> | |
209 <test> <!-- same inputs as the previous test but with start_codon on: expects the peptide MLSKYSFANS and asserts the longer untrimmed peptide is absent --> | |
210 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/> | |
211 <param name="ref_source" value="last_column"/> | |
212 <param name="translate" value="cDNA_minus_CDS"/> | |
213 <param name="biotypes" value="protein_coding"/> | |
214 <param name="start_codon" value="True"/> | |
215 <param name="fa_db" value="generic"/> | |
216 <param name="id_prefix" value="test_"/> | |
217 <output name="translation_bed"> | |
218 <assert_contents> | |
219 <has_text text="test_ENST00000641515" /> | |
220 <has_text text="MLSKYSFANS" /> | |
221 <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" /> | |
222 </assert_contents> | |
223 </output> | |
224 <output name="translation_fasta"> | |
225 <assert_contents> | |
226 <has_text text="generic|test_ENST00000641515" /> | |
227 <has_text text="MLSKYSFANS" /> | |
228 <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" /> | |
229 </assert_contents> | |
230 </output> | |
231 </test> | |
232 <test> <!-- history twobit reference with region filter 1:0-30000 and start_codon on: expects ENST00000488147 and asserts ENST00000335137 is absent from both outputs --> | |
233 <param name="input" value="human_transcripts.bed" ftype="bed12"/> | |
234 <param name="ref_source" value="history"/> | |
235 <param name="ref_file" value="GRCh38.1.2bit" ftype="twobit"/> | |
236 <param name="translate" value="cDNA_minus_CDS"/> | |
237 <param name="regions" value="1:0-30000"/> | |
238 <param name="start_codon" value="True"/> | |
239 <param name="fa_db" value="generic"/> | |
240 <param name="id_prefix" value="test_"/> | |
241 <output name="translation_bed"> | |
242 <assert_contents> | |
243 <has_text text="test_ENST00000488147" /> | |
244 <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" /> | |
245 <not_has_text text="ENST00000335137" /> | |
246 </assert_contents> | |
247 </output> | |
248 <output name="translation_fasta"> | |
249 <assert_contents> | |
250 <has_text text="generic|test_ENST00000488147" /> | |
251 <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" /> | |
252 <not_has_text text="ENST00000335137" /> | |
253 </assert_contents> | |
254 </output> | |
255 </test> | |
256 </tests> | |
257 <help><![CDATA[ | |
258 Translate transcripts from the input BED file into protein sequences. | |
259 | |
260 The genomic sequence: | |
261 | |
262 - may be supplied in an extra column in the BED input file | |
263 - may be retrieved from a twobit genomic reference file | |
264 - may be retrieved from the Ensembl REST API for Ensembl transcripts | |
265 | |
266 | |
267 **INPUTS** | |
268 | |
269 - BED file with at least the standard 12 columns | |
270 - Genome reference in twobit format (optional) | |
271 | |
272 | |
273 **OUTPUTS** | |
274 | |
275 - FASTA of transcript translations | |
276 - BED with the genomic location of the translated protein. The added 13th column contains the protein sequence. | |
277 | |
278 | |
279 **OPTIONS** | |
280 | |
281 - Feature translation | |
282 | |
283 - cDNA - three frame translations of the cDNA sequences with an output for each sequence between STOP codons | |
284 - CDS - three frame translations of CDS (coding sequence defined by thickStart and thickEnd in the BED file) | |
285 | |
286 - Translation filtering | |
287 | |
288 - can be trimmed to a Methionine start codon | |
289 - can be split into peptides by an enzyme digestion | |
290 - must exceed specified minimum length | |
291 | |
292 | |
293 - BED Filtering | |
294 | |
295 - genomic regions | |
296 - Ensembl biotype, if the BED contains the 20 columns as retrieved from the Ensembl REST API | |
297 | |
298 | |
299 ]]></help> | |
300 <citations> | |
301 <citation type="doi">10.1093/bioinformatics/btu613</citation> | |
302 <citation type="doi">10.1093/nar/gku1010</citation> | |
303 </citations> | |
304 </tool> |