comparison translate_bed.xml @ 0:038ecf54cbec draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
author galaxyp
date Mon, 22 Jan 2018 13:59:27 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:038ecf54cbec
1 <tool id="translate_bed" name="Translate BED transcripts" version="0.1.0">
2 <description>cDNA in 3frames or CDS</description>
3 <macros> <!-- the expand elements used in this tool refer to macros imported from macros.xml -->
4 <import>macros.xml</import>
5 </macros>
6 <requirements> <!-- dependencies are supplied by three named macros; the concrete packages are not visible in this file -->
7 <expand macro="ensembl_requirements" />
8 <expand macro="bedutil_requirements" />
9 <expand macro="twobit_requirements" />
10 </requirements>
11 <stdio> <!-- matches every exit code from 1 upward. NOTE(review): the command element also sets detect_errors="aggressive", which may make this block redundant; confirm against the Galaxy schema -->
12 <exit_code range="1:" />
13 </stdio>
14 <command detect_errors="aggressive"><![CDATA[
15 python '$__tool_directory__/translate_bed.py'
16 #if $ref.ref_source == 'cached':
17 --twobit='$ref.ref_loc.fields.path'
18 #elif $ref.ref_source == 'history':
19 --twobit='$ref.ref_file'
20 #elif $ref.ref_source == 'last_column':
21 --column '-1'
22 #elif $ref.ref_source == 'select_column':
23 --column ${int(str($ref.seq_column)) - 1}
24 #elif $ref.ref_source == 'ensembl_rest':
25 --ensembl
26 #end if
27 #if $translations.translate == 'CDS':
28 --cds
29 #elif $translations.translate == 'cDNA':
30 --all
31 #end if
32 $translations.start_codon
33 #if $bed_filters.biotypes:
34 --biotypes '$bed_filters.biotypes'
35 --ensembl
36 #end if
37 #if $bed_filters.regions:
38 --regions '$bed_filters.regions'
39 #end if
40 --min_length $translations.min_length
41 #if $translations.enzyme:
42 --enzyme '$translations.enzyme'
43 #end if
44 #if $fa_id.fa_db:
45 --fa_db='$fa_id.fa_db'
46 #end if
47 #if $fa_id.fa_sep:
48 --fa_sep='$fa_id.fa_sep'
49 #end if
50 #if $fa_id.reference:
51 --reference '$fa_id.reference'
52 #else:
53 --reference '${input.metadata.dbkey}'
54 #end if
55 #if $fa_id.id_prefix:
56 --id_prefix '$fa_id.id_prefix'
57 #end if
58 --bed '$translation_bed'
59 --fasta '$translation_fasta'
60 -v
61 '$input'
62 ]]></command>
63 <inputs>
64 <param name="input" type="data" format="bed" label="A BED file with 12 columns"
65 help="thickStart and thickEnd define protein coding region, blocks define exon regions"/> <!-- appears as the final argument of the command template; its dbkey metadata is the fallback genome reference name -->
66 <conditional name="ref"> <!-- selects where the genomic sequence comes from; each ref_source value has a matching branch in the command template -->
67 <param name="ref_source" type="select" label="Source for Genomic Sequence Data">
68 <option value="cached">Locally cached twobit</option>
69 <option value="history">History dataset twobit</option>
70 <option value="last_column">Last Column in the BED file</option>
71 <option value="select_column">Select Column in the BED file</option>
72 <option value="ensembl_rest">Retrieve sequences from Ensembl (Slow and only for Ensembl Transcripts)</option>
73 </param>
74 <when value="cached">
75 <param name="ref_loc" type="select" label="Select reference 2bit file">
76 <options from_data_table="twobit" />
77 </param>
78 </when>
79 <when value="history">
80 <param name="ref_file" type="data" format="twobit" label="reference 2bit file" />
81 </when>
82 <when value="last_column"/>
83 <when value="select_column">
84 <param name="seq_column" type="data_column" data_ref="input" label="BED column containing the genomic sequence"
85 help="unspliced genomic sequence from chromStart to chromEnd (Extract Genomic DNA)"/> <!-- the command template subtracts 1 from this column number before passing it on -->
86 </when>
87 <when value="ensembl_rest"/>
88 </conditional>
89 <section name="bed_filters" expanded="false" title="BED Filtering Options"> <!-- both filters are optional; the command template only passes them when non-empty -->
90 <param name="regions" type="text" value="" optional="true" label="Restrict to features overlapping a comma-separated list of regions" >
91 <help>Each region is specified as: chr or chr:pos or chr:from-to</help>
92 <validator type="regex" message="Use a comma-separated list of regions, each written as chr, chr:pos or chr:from-to">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator>
93 </param>
94 <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature translation to these biotypes"
95 help="For 20 column BED from Ensembl REST server">
96 <expand macro="biotypes_help" />
97 </param> <!-- when biotypes is set, the command template also adds the ensembl flag -->
98 </section>
99 <section name="translations" expanded="false" title="Translation Options"> <!-- the command template maps cDNA and CDS to explicit flags; cDNA_minus_CDS adds no flag -->
100 <param name="translate" type="select" label="Feature translation">
101 <option value="cDNA_minus_CDS">cDNA in 3 frames excluding known CDS</option>
102 <option value="cDNA">cDNA in 3 frames</option>
103 <option value="CDS">CDS proteins</option>
104 </param>
105 <param name="min_length" type="integer" value="10" min="1" label="Minimum length of protein translation to report"/>
106 <param name="start_codon" type="boolean" truevalue="--start_codon" falsevalue="" checked="false"
107 label="Require translations to start with Methionine, trim other leading Amino Acids"/>
108 <param name="enzyme" type="select" optional="true" label="Digest enzyme"
109 help="Split the protein into peptides according to enzyme digestion">
110 <option value="trypsin">trypsin: ([KR](?=[^P]))|((?&lt;=W)K(?=P))|((?&lt;=M)R(?=P))</option>
111 </param>
112 </section>
113 <section name="fa_id" expanded="false" title="Fasta ID Options"> <!-- all four values are optional and only passed to the script when non-empty, except reference, which falls back to the input dbkey -->
114 <param name="reference" type="text" value="" optional="true" label="Genome reference name"
115 help="By default, the input bed dataset metadata will be used."/>
116 <param name="fa_db" type="text" value="" optional="true" label="fasta ID source, e.g. generic"
117 help="Any Compomics application such as PeptideShaker, requires a source for non reference proteins of 'generic' e.g.: generic|pep1|peptide description">
118 </param>
119 <param name="fa_sep" type="text" value="" optional="true" label="fasta ID line separator character"
120 help="defaults to the pipe character, Ensembl FASTA files usually use a space character">
121 </param>
122 <param name="id_prefix" type="text" value="" optional="true" label="ID prefix for generated IDs"
123 help="Can be used to distinguish samples">
124 <validator type="regex" message="Allowed chars:a-z A-Z 0-9 _ - |">^[a-zA-Z0-9_|-]*$</validator> <!-- hyphen is last in the class so it is a literal; written as _-| it formed a range from _ to | -->
125 </param>
126 </section>
127 </inputs>
128 <outputs> <!-- the command template hands both dataset paths to the script through its bed and fasta output options -->
129 <!-- update translation_bed format to "probed" when datatype is available -->
130 <data name="translation_bed" format="bed" label="Translate ${translations.translate} on ${on_string} ${input.name}.proBed">
131 <actions>
132 <action name="column_names" type="metadata"
133 default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,proteinAccession,peptideSequence,uniqueness,genomeReferenceVersion,psmScore,fdr,modifications,charge,expMassToCharge,calcMassToCharge,psmRank,datasetID,uri"/> <!-- 25 column names: the 12 standard BED fields followed by 13 further fields -->
134 </actions>
135 </data>
136 <data name="translation_fasta" format="fasta" label="Translate ${translations.translate} on ${on_string} ${input.name}.fasta"/>
137 </outputs>
138 <tests>
139 <test> <!-- sequence source ensembl_rest with cDNA_minus_CDS; expects ENST00000641515 in the BED and as a FASTA header -->
140 <param name="input" value="human_transcripts.bed" ftype="bed12"/>
141 <param name="ref_source" value="ensembl_rest"/>
142 <param name="translate" value="cDNA_minus_CDS"/>
143 <output name="translation_bed">
144 <assert_contents>
145 <has_text text="ENST00000641515" />
146 </assert_contents>
147 </output>
148 <output name="translation_fasta">
149 <assert_contents>
150 <has_text text=">ENST00000641515" />
151 </assert_contents>
152 </output>
153 </test>
154 <test> <!-- sequence taken from the last BED column with cDNA_minus_CDS; expects ENST00000488147 and one specific peptide in both outputs -->
155 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
156 <param name="ref_source" value="last_column"/>
157 <param name="translate" value="cDNA_minus_CDS"/>
158 <output name="translation_bed">
159 <assert_contents>
160 <has_text text="ENST00000488147" />
161 <has_text text="FLLSSLLIGVPFCTSPHSCFSMFFGRSKAALTAKLTLMRV" />
162 </assert_contents>
163 </output>
164 <output name="translation_fasta">
165 <assert_contents>
166 <has_text text="ENST00000488147" />
167 <has_text text="FLLSSLLIGVPFCTSPHSCFSMFFGRSKAALTAKLTLMRV" />
168 </assert_contents>
169 </output>
170 </test>
171 <test> <!-- sequence taken from the last BED column with CDS translation; expects ENST00000641515 and a peptide starting with M in both outputs -->
172 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
173 <param name="ref_source" value="last_column"/>
174 <param name="translate" value="CDS"/>
175 <output name="translation_bed">
176 <assert_contents>
177 <has_text text="ENST00000641515" />
178 <has_text text="MVTEFIFLGLSDSQELQTFLFMLFFVFY" />
179 </assert_contents>
180 </output>
181 <output name="translation_fasta">
182 <assert_contents>
183 <has_text text="ENST00000641515" />
184 <has_text text="MVTEFIFLGLSDSQELQTFLFMLFFVFY" />
185 </assert_contents>
186 </output>
187 </test>
188 <test> <!-- biotype filter, fa_db and id_prefix set, start_codon off; expects prefixed IDs and the untrimmed peptide in both outputs -->
189 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
190 <param name="ref_source" value="last_column"/>
191 <param name="translate" value="cDNA_minus_CDS"/>
192 <param name="biotypes" value="protein_coding"/>
193 <param name="start_codon" value="False"/>
194 <param name="fa_db" value="generic"/>
195 <param name="id_prefix" value="test_"/>
196 <output name="translation_bed">
197 <assert_contents>
198 <has_text text="test_ENST00000641515" />
199 <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
200 </assert_contents>
201 </output>
202 <output name="translation_fasta">
203 <assert_contents>
204 <has_text text="generic|test_ENST00000641515" />
205 <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
206 </assert_contents>
207 </output>
208 </test>
209 <test> <!-- same inputs as the previous test but with start_codon on; expects the peptide trimmed to MLSKYSFANS and the untrimmed form absent -->
210 <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
211 <param name="ref_source" value="last_column"/>
212 <param name="translate" value="cDNA_minus_CDS"/>
213 <param name="biotypes" value="protein_coding"/>
214 <param name="start_codon" value="True"/>
215 <param name="fa_db" value="generic"/>
216 <param name="id_prefix" value="test_"/>
217 <output name="translation_bed">
218 <assert_contents>
219 <has_text text="test_ENST00000641515" />
220 <has_text text="MLSKYSFANS" />
221 <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
222 </assert_contents>
223 </output>
224 <output name="translation_fasta">
225 <assert_contents>
226 <has_text text="generic|test_ENST00000641515" />
227 <has_text text="MLSKYSFANS" />
228 <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
229 </assert_contents>
230 </output>
231 </test>
232 <test> <!-- twobit file from history plus a region filter of 1:0-30000; expects ENST00000488147 present and ENST00000335137 absent -->
233 <param name="input" value="human_transcripts.bed" ftype="bed12"/>
234 <param name="ref_source" value="history"/>
235 <param name="ref_file" value="GRCh38.1.2bit" ftype="twobit"/>
236 <param name="translate" value="cDNA_minus_CDS"/>
237 <param name="regions" value="1:0-30000"/>
238 <param name="start_codon" value="True"/>
239 <param name="fa_db" value="generic"/>
240 <param name="id_prefix" value="test_"/>
241 <output name="translation_bed">
242 <assert_contents>
243 <has_text text="test_ENST00000488147" />
244 <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" />
245 <not_has_text text="ENST00000335137" />
246 </assert_contents>
247 </output>
248 <output name="translation_fasta">
249 <assert_contents>
250 <has_text text="generic|test_ENST00000488147" />
251 <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" />
252 <not_has_text text="ENST00000335137" />
253 </assert_contents>
254 </output>
255 </test>
256 </tests>
257 <help><![CDATA[
258 Translate transcripts from the input BED file into protein sequences.
259
260 The genomic sequence:
261
262 - may be supplied in an extra column in the BED input file
263 - may be retrieved from a twobit genomic reference file
264 - may be retrieved from the Ensembl REST API for Ensembl transcripts
265
266
267 **INPUTS**
268
269 - BED file with at least the standard 12 columns
270 - Genome reference in twobit format (optional)
271
272
273 **OUTPUTS**
274
275 - FASTA of transcript translations
276 - BED with the genomic location of the translated protein. The added 13th column contains the protein sequence.
277
278
279 **OPTIONS**
280
281 - Feature translation
282
283 - cDNA - three frame translations of the cDNA sequences with an output for each sequence between STOP codons
284 - CDS - three frame translations of CDS (coding sequence defined by thickStart and thickEnd in the BED file)
285
286 - Translation filtering
287
288 - can be trimmed to a Methionine start codon
289 - can be split into peptides by an enzyme digestion
290 - must exceed specified minimum length
291
292
293 - BED Filtering
294
295 - genomic regions
296 - ensembl biotype if the BED contains the 20 columns as retrieved from the Ensembl REST API
297
298
299 ]]></help>
300 <citations> <!-- NOTE(review): presumably the Ensembl REST API and Ensembl database publications; verify both DOIs resolve as intended -->
301 <citation type="doi">10.1093/bioinformatics/btu613</citation>
302 <citation type="doi">10.1093/nar/gku1010</citation>
303 </citations>
304 </tool>