Mercurial > repos > galaxyp > translate_bed
diff translate_bed.xml @ 0:038ecf54cbec draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
author | galaxyp |
---|---|
date | Mon, 22 Jan 2018 13:59:27 -0500 |
parents | |
children |
line wrap: on
line diff
<tool id="translate_bed" name="Translate BED transcripts" version="0.1.0">
    <!--
        Galaxy wrapper around translate_bed.py. It takes a BED dataset of
        transcripts plus a source of genomic sequence and produces a FASTA of
        protein translations and a BED dataset locating them.
    -->
    <description>cDNA in 3frames or CDS</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <expand macro="ensembl_requirements" />
        <expand macro="bedutil_requirements" />
        <expand macro="twobit_requirements" />
    </requirements>
    <stdio>
        <exit_code range="1:" />
    </stdio>
    <!--
        Command line assembled from the inputs declared below.
        * Every substituted path or free-text value is single-quoted so the
          shell passes it to the script literally (no globbing of a "?" dbkey,
          no word splitting).
        * The select_column branch passes the chosen column number minus one.
          NOTE(review): presumably the script wants a zero-based index; confirm
          against translate_bed.py.
        * The cDNA_minus_CDS choice adds no flag of its own.
          NOTE(review): presumably that is the script's default mode; confirm.
        * Setting a biotype filter also switches the ensembl flag on.
    -->
    <command detect_errors="aggressive"><![CDATA[
        python '$__tool_directory__/translate_bed.py'
        #if $ref.ref_source == 'cached':
            --twobit='$ref.ref_loc.fields.path'
        #elif $ref.ref_source == 'history':
            --twobit='$ref.ref_file'
        #elif $ref.ref_source == 'last_column':
            --column '-1'
        #elif $ref.ref_source == 'select_column':
            --column ${int(str($ref.seq_column)) - 1}
        #elif $ref.ref_source == 'ensembl_rest':
            --ensembl
        #end if
        #if $translations.translate == 'CDS':
            --cds
        #elif $translations.translate == 'cDNA':
            --all
        #end if
        $translations.start_codon
        #if $bed_filters.biotypes:
            --biotypes '$bed_filters.biotypes'
            --ensembl
        #end if
        #if $bed_filters.regions:
            --regions '$bed_filters.regions'
        #end if
        --min_length $translations.min_length
        #if $translations.enzyme:
            --enzyme '$translations.enzyme'
        #end if
        #if $fa_id.fa_db:
            --fa_db='$fa_id.fa_db'
        #end if
        #if $fa_id.fa_sep:
            --fa_sep='$fa_id.fa_sep'
        #end if
        #if $fa_id.reference:
            --reference '$fa_id.reference'
        #else:
            --reference '${input.metadata.dbkey}'
        #end if
        #if $fa_id.id_prefix:
            --id_prefix '$fa_id.id_prefix'
        #end if
        --bed '$translation_bed'
        --fasta '$translation_fasta'
        -v
        '$input'
    ]]></command>
    <inputs>
        <param name="input" type="data" format="bed" label="A BED file with 12 columns"
               help="thickStart and thickEnd define protein coding region, blocks define exon regions"/>
        <!-- Where the genomic sequence for each BED feature comes from. -->
        <conditional name="ref">
            <param name="ref_source" type="select" label="Source for Genomic Sequence Data">
                <option value="cached">Locally cached twobit</option>
                <option value="history">History dataset twobit</option>
                <option value="last_column">Last Column in the BED file</option>
                <option value="select_column">Select Column in the BED file</option>
                <option value="ensembl_rest">Retrieve sequences from Ensembl (Slow and only for Ensembl Transcripts)</option>
            </param>
            <when value="cached">
                <param name="ref_loc" type="select" label="Select reference 2bit file">
                    <options from_data_table="twobit" />
                </param>
            </when>
            <when value="history">
                <param name="ref_file" type="data" format="twobit" label="reference 2bit file" />
            </when>
            <when value="last_column"/>
            <when value="select_column">
                <param name="seq_column" type="data_column" data_ref="input"
                       label="BED column containing the genomic sequence"
                       help="unspliced genomic sequence from chromStart to chromEnd (Extract Genomic DNA)"/>
            </when>
            <when value="ensembl_rest"/>
        </conditional>
        <section name="bed_filters" expanded="false" title="BED Filtering Options">
            <param name="regions" type="text" value="" optional="true"
                   label="Restrict to features overlapping a comma-separated list of regions">
                <help>Each region is specified as: chr or chr:pos or chr:from-to</help>
                <!-- Accepts an empty value or a comma-separated list of chr, chr:pos or chr:from-to. -->
                <validator type="regex" message="Regions must be a comma-separated list of: chr or chr:pos or chr:from-to">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator>
            </param>
            <param name="biotypes" type="text" value="" optional="true"
                   label="Restrict Feature translation to these biotypes"
                   help="For 20 column BED from Ensembl REST server">
                <expand macro="biotypes_help" />
            </param>
        </section>
        <section name="translations" expanded="false" title="Translation Options">
            <param name="translate" type="select" label="Feature translation">
                <option value="cDNA_minus_CDS">cDNA in 3 frames excluding known CDS</option>
                <option value="cDNA">cDNA in 3 frames</option>
                <option value="CDS">CDS proteins</option>
            </param>
            <param name="min_length" type="integer" value="10" min="1"
                   label="Minimum length of protein translation to report"/>
            <param name="start_codon" type="boolean" truevalue="--start_codon" falsevalue="" checked="false"
                   label="Require translations to start with Methionine, trim other leading Amino Acids"/>
            <param name="enzyme" type="select" optional="true" label="Digest enzyme"
                   help="Split the protein into peptides according to enzyme digestion">
                <!-- The look-behind groups in this label need their opening angle bracket escaped to stay well-formed. -->
                <option value="trypsin">trypsin: ([KR](?=[^P]))|((?&lt;=W)K(?=P))|((?&lt;=M)R(?=P))</option>
            </param>
        </section>
        <section name="fa_id" expanded="false" title="Fasta ID Options">
            <param name="reference" type="text" value="" optional="true" label="Genome reference name"
                   help="By default, the input bed dataset metadata will be used."/>
            <param name="fa_db" type="text" value="" optional="true" label="fasta ID source, e.g. generic"
                   help="Any Compomics application such as PeptideShaker, requires a source for non reference proteins of 'generic' e.g.: generic|pep1|peptide description">
            </param>
            <param name="fa_sep" type="text" value="" optional="true" label="fasta ID line separator character"
                   help="defaults to the pipe character, Ensembl FASTA files usually use a space character">
            </param>
            <param name="id_prefix" type="text" value="" optional="true" label="ID prefix for generated IDs"
                   help="Can be used to distinguish samples">
                <!-- The hyphen is last in the character class so it is a literal hyphen rather than a range operator. -->
                <validator type="regex" message="Allowed chars:a-z A-Z 0-9 _ - |">^[a-zA-Z0-9_|-]*$</validator>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- update translation_bed format to "probed" when datatype is available -->
        <data name="translation_bed" format="bed" label="Translate ${translations.translate} on ${on_string} ${input.name}.proBed">
            <actions>
                <action name="column_names" type="metadata"
                        default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,proteinAccession,peptideSequence,uniqueness,genomeReferenceVersion,psmScore,fdr,modifications,charge,expMassToCharge,calcMassToCharge,psmRank,datasetID,uri"/>
            </actions>
        </data>
        <data name="translation_fasta" format="fasta" label="Translate ${translations.translate} on ${on_string} ${input.name}.fasta"/>
    </outputs>
    <tests>
        <!-- Sequences fetched from the Ensembl REST API. -->
        <test>
            <param name="input" value="human_transcripts.bed" ftype="bed12"/>
            <param name="ref_source" value="ensembl_rest"/>
            <param name="translate" value="cDNA_minus_CDS"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="ENST00000641515" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text="&gt;ENST00000641515" />
                </assert_contents>
            </output>
        </test>
        <!-- Sequences taken from the last BED column, cDNA excluding CDS. -->
        <test>
            <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
            <param name="ref_source" value="last_column"/>
            <param name="translate" value="cDNA_minus_CDS"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="ENST00000488147" />
                    <has_text text="FLLSSLLIGVPFCTSPHSCFSMFFGRSKAALTAKLTLMRV" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text="ENST00000488147" />
                    <has_text text="FLLSSLLIGVPFCTSPHSCFSMFFGRSKAALTAKLTLMRV" />
                </assert_contents>
            </output>
        </test>
        <!-- Sequences taken from the last BED column, CDS translation. -->
        <test>
            <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
            <param name="ref_source" value="last_column"/>
            <param name="translate" value="CDS"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="ENST00000641515" />
                    <has_text text="MVTEFIFLGLSDSQELQTFLFMLFFVFY" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text="ENST00000641515" />
                    <has_text text="MVTEFIFLGLSDSQELQTFLFMLFFVFY" />
                </assert_contents>
            </output>
        </test>
        <!-- Biotype filter with FASTA ID options, start codon not required. -->
        <test>
            <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
            <param name="ref_source" value="last_column"/>
            <param name="translate" value="cDNA_minus_CDS"/>
            <param name="biotypes" value="protein_coding"/>
            <param name="start_codon" value="False"/>
            <param name="fa_db" value="generic"/>
            <param name="id_prefix" value="test_"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="test_ENST00000641515" />
                    <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text="generic|test_ENST00000641515" />
                    <has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
                </assert_contents>
            </output>
        </test>
        <!-- Same as the previous test but translations must start with Methionine. -->
        <test>
            <param name="input" value="human_transcripts_seq.bed" ftype="bed12"/>
            <param name="ref_source" value="last_column"/>
            <param name="translate" value="cDNA_minus_CDS"/>
            <param name="biotypes" value="protein_coding"/>
            <param name="start_codon" value="True"/>
            <param name="fa_db" value="generic"/>
            <param name="id_prefix" value="test_"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="test_ENST00000641515" />
                    <has_text text="MLSKYSFANS" />
                    <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text="generic|test_ENST00000641515" />
                    <has_text text="MLSKYSFANS" />
                    <not_has_text text="ELPHTLPQFIFQQLVCYILEYRYKVIMLSKYSFANS" />
                </assert_contents>
            </output>
        </test>
        <!-- Sequences read from a twobit history dataset, restricted to a region. -->
        <test>
            <param name="input" value="human_transcripts.bed" ftype="bed12"/>
            <param name="ref_source" value="history"/>
            <param name="ref_file" value="GRCh38.1.2bit" ftype="twobit"/>
            <param name="translate" value="cDNA_minus_CDS"/>
            <param name="regions" value="1:0-30000"/>
            <param name="start_codon" value="True"/>
            <param name="fa_db" value="generic"/>
            <param name="id_prefix" value="test_"/>
            <output name="translation_bed">
                <assert_contents>
                    <has_text text="test_ENST00000488147" />
                    <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" />
                    <not_has_text text="ENST00000335137" />
                </assert_contents>
            </output>
            <output name="translation_fasta">
                <assert_contents>
                    <has_text text="generic|test_ENST00000488147" />
                    <has_text text="MAPSSRAPRTLACRDAPATGSRASTAPWTSGPCRRS" />
                    <not_has_text text="ENST00000335137" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Translate transcripts from the input BED file into protein sequences.

The genomic sequence:

  - may be supplied in an extra column in the BED input file
  - retrieved from a twobit genomic reference file
  - retrieved from the Ensembl REST API for Ensembl transcripts


**INPUTS**

  - BED file with at least the standard 12 columns
  - Genome reference in twobit format (optional)


**OUTPUTS**

  - FASTA of transcript translations
  - BED with the genomic location of the translated protein. The added 13th column contains the protein sequence.


**OPTIONS**

  - Feature translation

    - cDNA_minus_CDS - three frame translations of the cDNA sequences, excluding known CDS
    - cDNA - three frame translations of the cDNA sequences with an output for each sequence between STOP codons
    - CDS - three frame translations of CDS (coding sequence defined by thickStart and thickEnd in the BED file)

  - Translation filtering

    - can be trimmed to a Methionine start codon
    - can be split into peptides by an enzyme digestion
    - must exceed specified minimum length


  - BED Filtering

    - genomic regions
    - ensembl biotype if the BED contains the 20 columns as retrieved from the Ensembl REST API


    ]]></help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btu613</citation>
        <citation type="doi">10.1093/nar/gku1010</citation>
    </citations>
</tool>