Mercurial > repos > iuc > prot_scriber
view prot-scriber.xml @ 2:4d4df9779b7b draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/prot-scriber commit cbc2e768addc5b6697cdc4ed9b57bc7889a57fe9
author | iuc |
---|---|
date | Tue, 20 Sep 2022 09:45:46 +0000 |
parents | 1e9a43cbf524 |
children | 863ab6ebcafc |
line wrap: on
line source
<!--
    Galaxy wrapper for prot-scriber: assigns short human readable descriptions
    to query proteins (or sequence families) from tabular sequence similarity
    search results.

    Layout of the inputs:
      * input_config (conditional)
          basic    : one multi-select of similarity tables
          advanced : one repeat per similarity table with per-table options,
                     plus a section of expert options
      * seq_family (section) : optional sequence family annotation
      * exclude_not_annotated_queries : top level flag
-->
<tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05">
    <description>Protein annotation of short human readable descriptions</description>
    <macros>
        <token name="@TOOL_VERSION@">0.1.4</token>
        <!--
            Sanitizer shared by all free-text parameters that end up inside
            single quotes on the command line. Every printable character is
            accepted, except that a single quote is rewritten to '"'"' so the
            value can never terminate the surrounding shell quoting.
        -->
        <xml name="shell_safe_text">
            <sanitizer>
                <valid initial="default">
                    <add preset="string.printable" />
                    <remove value="&apos;" />
                </valid>
                <mapping initial="none">
                    <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;" />
                </mapping>
            </sanitizer>
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">prot-scriber</requirement>
    </requirements>
    <!--
        An explicit stdio block replaces Galaxy's default exit code handling,
        so the non-zero exit code rule has to be declared here next to the
        stderr pattern.
    -->
    <stdio>
        <exit_code range="1:" level="fatal" />
        <regex match="panicked" level="fatal" source="stderr" />
    </stdio>
    <command><![CDATA[
'prot-scriber'
#if str($input_config.input_config_selector) == "basic"
    ## Basic mode: only the similarity tables are passed, one -s per dataset.
    #for $sst in $input_config.seq_sim_table
        -s '$sst'
    #end for
#else if str($input_config.input_config_selector) == "advanced"
    ## Advanced mode: every per-table option is emitted for every table,
    ## falling back to the literal 'default' when the user left it empty.
    #for $ssr in $input_config.advanced_input_repeat
        -s '$ssr.seq_sim_table'
        #if $ssr.header
            -e '$ssr.header'
        #else
            -e 'default'
        #end if
        #if $ssr.field_separator
            -p '$ssr.field_separator'
        #else
            -p 'default'
        #end if
        #if $ssr.blacklist_regexs
            -b '$ssr.blacklist_regexs'
        #else
            -b 'default'
        #end if
        #if $ssr.capture_replace_pairs
            -c '$ssr.capture_replace_pairs'
        #else
            -c 'default'
        #end if
        #if $ssr.filter_regexs
            -l '$ssr.filter_regexs'
        #else
            -l 'default'
        #end if
    #end for
    #if $input_config.expert_options.non_informative_words_regexs
        -w '$input_config.expert_options.non_informative_words_regexs'
    #end if
    #if $input_config.expert_options.description_split_regex
        -r '$input_config.expert_options.description_split_regex'
    #end if
    ## Compare as string so that a quantile of 0 is still passed on.
    #if str($input_config.expert_options.center_inverse_word_information_content_at_quantile) != ''
        -q $input_config.expert_options.center_inverse_word_information_content_at_quantile
    #end if
    #if $input_config.expert_options.polish_capture_replace_pairs
        -d '$input_config.expert_options.polish_capture_replace_pairs'
    #end if
#end if
## Sequence family options live in the seq_family section and must be
## addressed through it.
#if $seq_family.seq_families
    -f '$seq_family.seq_families'
#end if
#if $seq_family.annotate_non_family_queries
    -a
#end if
#if $seq_family.seq_family_gene_ids_separator
    -g '$seq_family.seq_family_gene_ids_separator'
#end if
#if $seq_family.seq_family_id_genes_separator
    -i '$seq_family.seq_family_id_genes_separator'
#end if
#if $exclude_not_annotated_queries
    -x
#end if
-o '$output'
    ]]></command>
    <inputs>
        <conditional name="input_config">
            <param type="select" name="input_config_selector" label="Choose input configuration options">
                <option value="basic" selected="true">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="basic">
                <param type="data" multiple="true" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search results in tabular format (-s)" help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." />
            </when>
            <when value="advanced">
                <!-- One repeat instance per similarity table; its options apply to that table only. -->
                <repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1">
                    <param type="data" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search result in tabular format (-s)" help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." />
                    <param type="text" optional="true" name="field_separator" argument="-p" label="Field separator (-p)" help="Field-Separator of the (-s) sequence similarity table. The default value is the 'TAB' character. Set to 'default' to use the hard coded default">
                        <expand macro="shell_safe_text" />
                    </param>
                    <param type="text" optional="true" name="header" argument="-e" label="Header of the sequence similarity tables (-e)" help="Header of the (-s) sequence similarity table. Separated by space (' ') the names of the columns in order of appearance in the respective table. Required and default columns are 'qacc sacc stitle'. Set to 'default' to use the hard coded default" />
                    <param type="data" optional="true" name="blacklist_regexs" argument="-b" format="tabular" label="Blacklist Regexs (-b)" help="A file with regular expressions, one per line. Any match to any of these regular expressions causes sequence similarity search result descriptions ('stitle' in Blast terminology) to be discarded from the prot-scriber annotation process. Leave empty to use the hard coded default" />
                    <param type="data" optional="true" name="capture_replace_pairs" argument="-c" format="tabular" label="Capture replace pairs (-c)" help="A file with pairs of lines. Within each pair the first line is a regular expressions defining one or more capture groups. The second line of a pair is the string used to replace the match in the regular expression with. Leave empty to use the hard coded default" />
                    <param type="data" optional="true" name="filter_regexs" argument="-l" format="tabular" label="Filter regexs (-l)" help="A file with regular expressions, one per line. Any match to any of these regular expressions causes the matched sub-string to be deleted, i.e. filtered out. Leave empty to use the hard coded default" />
                </repeat>
                <section title="Expert options" name="expert_options">
                    <param type="data" optional="true" name="non_informative_words_regexs" argument="-w" format="tabular" label="Non informative words regexs (-w)" help="A file in which regular expressions (regexs) are stored, one per line. These regexs are used to recognize non-informative words, which will only receive a minimum score in the prot-scriber process that generates human readable description." />
                    <param type="text" optional="true" name="description_split_regex" argument="-r" label="Description split regex (-r)" help="A regular expression to be used to split descriptions (`stitle` in Blast terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.">
                        <expand macro="shell_safe_text" />
                    </param>
                    <!-- Float, not integer: the documented range is 0 to 1, with 50 as the special value for the mean. -->
                    <param type="float" optional="true" name="center_inverse_word_information_content_at_quantile" argument="-q" label="Center inverse word-information-content at quantile (-q)" help="The quantile (percentile) to be subtracted from calculated inverse word information content to center these values. Value between 0 and 1, or the literal 50 to center at the mean." />
                    <param type="data" optional="true" name="polish_capture_replace_pairs" argument="-d" format="txt" label="Polishing capture replace pairs (-d)" help="A file with pairs of lines. Defines pairs of regex / replace pairs for post polishing of annotation results. Provide an empty file to suppress polishing; leave empty to use the hard coded default." />
                </section>
            </when>
        </conditional>
        <section title="Sequence family annotation" name="seq_family">
            <param type="data" optional="true" name="seq_families" argument="-f" format="tabular" label="Families of biological sequences (-f)" help="A file in which families of biological sequences are stored, one family per line. Each line must have format 'fam_name TAB gene1,gene2,gene3'. Make sure no gene appears in more than one family." />
            <param type="boolean" optional="true" name="annotate_non_family_queries" argument="-a" label="Annotate non family query sequences (-a)" help="Set this to true to also annotate sequences that are not members of a sequence family." />
            <param type="text" optional="true" name="seq_family_gene_ids_separator" argument="-g" label="Sequence family file gene-id separator (-g)" help="A regular expression used to split the list of gene_identifiers in the argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.">
                <expand macro="shell_safe_text" />
            </param>
            <param type="text" optional="true" name="seq_family_id_genes_separator" argument="-i" label="Sequence family file family - gene-id separator (-i)" help="A string used as separator in the argument --seq-families (-f) gene families file. This string separates the gene_family_identifier (name) from the gene_identifier list that family comprises. Default is 'TAB'.">
                <expand macro="shell_safe_text" />
            </param>
        </section>
        <param type="boolean" optional="true" name="exclude_not_annotated_queries" argument="-x" label="Exclude not annotated query sequences (-x)" help="Use this option to exclude results from the output table that could not be annotated." />
    </inputs>
    <outputs>
        <data format="tabular" name="output" />
    </outputs>
    <tests>
        <!-- Basic mode with two similarity tables. -->
        <test>
            <param name="input_config_selector" value="basic" />
            <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt,8_Proteins_vs_Trembl_blastp.txt" />
            <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
        </test>
        <!-- Advanced mode with explicit per-table separator and header. -->
        <test>
            <param name="input_config_selector" value="advanced" />
            <repeat name="advanced_input_repeat">
                <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
                <param name="field_separator" value="default" />
                <param name="header" value="qacc sacc stitle" />
            </repeat>
            <repeat name="advanced_input_repeat">
                <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
                <param name="field_separator" value="default" />
                <param name="header" value="qacc sacc stitle" />
            </repeat>
            <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
        </test>
        <!--
            Advanced mode with blacklist files and expert options. The split
            regex contains a single quote; it is given verbatim because the
            sanitizer takes care of the shell quoting.
        -->
        <test>
            <param name="input_config_selector" value="advanced" />
            <repeat name="advanced_input_repeat">
                <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
                <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
            </repeat>
            <repeat name="advanced_input_repeat">
                <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
                <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
            </repeat>
            <param name="description_split_regex" value="([~_\-/|;,':.\s]+)" />
            <param name="center_inverse_word_information_content_at_quantile" value="50" />
            <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
        </test>
    </tests>
    <help><![CDATA[
**What it does**

prot-scriber_ assigns short human readable descriptions (HRD) to query biological sequences using reference candidate descriptions. In this, prot-scriber consumes sequence similarity search (Blast or Diamond or similar) results in tabular format. Customized lexical analysis is carried out on the descriptions of these Blast Hits and a resulting HRD is assigned to the query sequences.

For more information, examples and how to use the prot-scriber commandline tool refer to the prot-scriber README_ and MANUAL_.

.. _prot-scriber: http://github.com/usadellab/prot-scriber
.. _README: https://github.com/usadellab/prot-scriber/blob/master/README.md
.. _MANUAL: https://github.com/usadellab/prot-scriber/blob/master/README.md#manual

----

**Input**

The input file is one or multiple tabular output(s) of a sequence similarity search (Blast, Diamond or similar). Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond). The input is done via the -s parameter::

    -s, --seq-sim-table
        File in which to find sequence similarity search results in tabular
        format (SSST). Use e.g. Blast or Diamond to produce them. Required
        columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle'
        (Diamond). If the required columns, or more, appear in different order
        than shown here you must use the --header (-e) argument. If any of the
        input SSSTs uses a different field-separator than the '<TAB>'
        character, you must provide the --field-separator (-p) argument.

You can provide multiple SSSTs for your query proteins whose information will be combined and evaluated by the tool.

**Input parameters**

prot-scriber gives the user the opportunity to fine tune parameters for the provided input tables. To do so turn on the *input configuration* switch. Those are optional, as the tool also provides sensible defaults. In case you decide to customize your inputs using below parameters, be advised that prot-scriber expects the customized parameter for all input tables - the number of tables and e.g. *--header* parameters have to match. You can set the values to 'default' if you want to use the default value for a given input table::

    -e, --header
        Header of the --seq-sim-table (-s) arg. Separated by space (' ') the
        names of the columns in order of appearance in the respective table.
        Required and default columns are 'qacc sacc stitle'. Note that this
        option only understands Blast terminology, i.e. even if you ran
        Diamond, please provide 'qacc' instead of 'qseqid' and 'sacc' instead
        of 'sseqid'. Luckily 'stitle' is 'stitle' in Diamond, too. You can
        have additional columns that will be ignored, as long as the required
        columns appear in the correct order. Consider this example:
        'qacc sacc evalue bitscore stitle'. Set to 'default' to use the hard
        coded default.

    -p, --field-separator
        Field-Separator of the --seq-sim-table (-s) arg. The default value is
        the '<TAB>' character. Consider this example: ','. You can provide
        'default' to use the hard coded default (TAB).

    -b, --blacklist-regexs
        (Expert option) A file with regular expressions, one per line. Any
        match to any of these regular expressions causes sequence similarity
        search result descriptions ('stitle' in Blast terminology) to be
        discarded from the prot-scriber annotation process. Set to 'default'
        to use the hard coded default. An example file can be downloaded here:
        https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/blacklist_stitle_regexs.txt.

    -l, --filter-regexs
        (Expert option) A file with regular expressions, one per line. Any
        match to any of these regular expressions causes the matched
        sub-string to be deleted, i.e. filtered out. Filtering is used to
        process descriptions ('stitle' in Blast terminology) and prepare the
        descriptions for the prot-scriber annotation process. In case of
        UniProt sequence similarity search results (Blast result tables), this
        removes the Blast Hit identifier (`sacc`) from the description
        (`stitle`) and also removes the taxonomic information starting with
        e.g. 'OS=' at the end of the `stitle` strings. Set to 'default' to use
        hard coded default. An example file can be downloaded here:
        https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt.

    -c, --capture-replace-pairs
        (Expert option) A file with pairs of lines. Within each pair the first
        line is a regular expressions defining one or more capture groups. The
        second line of a pair is the string used to replace the match in the
        regular expression with. This means the second line contains the
        capture groups. These pairs are used to further filter the sequence
        similarity search result descriptions ('stitle' in Blast terminology).
        In contrast to the --filter-regex (-l) matches are not deleted, but
        replaced with the second line of the pair. Filtering is used to
        process descriptions ('stitle' in Blast terminology) and prepare the
        descriptions for the prot-scriber annotation process. Set to 'default'
        to use the hard coded default. An example file can be downloaded here:
        https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt.

In this Galaxy tool the file options (-b, -l, -c) are dataset selectors: leave them empty to use the hard coded default.

----

**Gene family annotation**

prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families::

    -f, --seq-families
        A file in which families of biological sequences are stored, one
        family per line. Each line must have format
        'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in more
        than one family.

    -g, --seq-family-gene-ids-separator
        A regular expression used to split the list of gene-identifiers in the
        argument --seq-families (-f) gene families file. Default is
        '(\s*,\s*|\s+)'.

    -a, --annotate-non-family-queries
        Use this option only in combination with --seq-families (-f), i.e.
        when prot-scriber is used to generate human readable descriptions for
        gene families. If in that context this flag is given, queries for
        which there are sequence similarity search (Blast) results but that
        are NOT member of a sequence family will receive an annotation (human
        readable description) in the output file, too. Default value of this
        setting is 'OFF' (false).

----

**Expert options**

Some additional optional configuration. Only use when you know what you are doing::

    -w, --non-informative-words-regexs
        A file in which regular expressions (regexs) are stored, one per line.
        These regexs are used to recognize non-informative words, which will
        only receive a minimum score in the prot-scriber process that
        generates human readable description. There is a default list
        hard-coded into prot-scriber. An example file can be downloaded here:
        https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt.

    -r, --description-split-regex
        A regular expression to be used to split descriptions (`stitle` in
        Blast terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.

    -q, --center-inverse-word-information-content-at-quantile
        The quantile (percentile) to be subtracted from calculated inverse
        word information content to center these values. Consequently, this
        must be a value between zero and one or literal 50, which is
        interpreted as mean instead of a quantile. Default is 50, implying
        centering at the mean.

    -d, --polish-capture-replace-pairs
        The last step of the process generating human readable descriptions
        (HRDs) for the queries (proteins or sequence families) is to 'polish'
        the selected HRDs. Polishing is done by iterative application of
        regular expressions (fancy-regex) and replace instructions
        (capture-replace-pairs). If you do not want to use the default
        polishing capture replace pairs specify a file in which pairs of lines
        are given. Of each pair the first line holds a regular expression
        (fancy-regex syntax) and the second the replacement instructions
        providing access to capture groups. Set to 'none' or provide an empty
        file, if you want to suppress polishing. If you want to have a
        template file for your custom polishing capture-replace-pairs please
        refer to
        https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/polish_capture_replace_pairs.txt

----

**Output**

prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line::

    Annotee-Identifier      Human-Readable-Description
    Soltu.DM.02G020600.1    arath strubbelig receptor family
    Soltu.DM.S001650.1      germin member
    Soltu.DM.03G011280.1    increased dna methylation
    ...

If you want to suppress results from the output table that could not be annotated, i.e. 'unknown protein' or 'unknown sequence family' respectively use the '-x' parameter::

    -x, --exclude-not-annotated-queries
        Exclude results from the output table that could not be annotated,
        i.e. 'unknown protein' or 'unknown sequence family', respectively.
    ]]></help>
</tool>