Mercurial > repos > iuc > prot_scriber

--- a/prot-scriber.xml	Wed Jun 15 08:37:03 2022 +0000
+++ b/prot-scriber.xml	Tue Sep 20 09:45:46 2022 +0000
@@ -1,7 +1,7 @@
 <tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05">
   <description>Protein annotation of short human readable descriptions</description>
   <macros>
-    <token name="@TOOL_VERSION@">0.1.3</token>
+    <token name="@TOOL_VERSION@">0.1.4</token>
   </macros>
   <requirements>
     <requirement type="package" version="@TOOL_VERSION@">prot-scriber</requirement>
@@ -20,29 +20,42 @@
         -s '$ssr.seq_sim_table'
         #if $ssr.header
           -e '$ssr.header'
+        #else
+          -e 'default'
         #end if
         #if $ssr.field_separator
           -p '$ssr.field_separator'
+        #else
+          -p 'default'
         #end if
         #if $ssr.blacklist_regexs
           -b '$ssr.blacklist_regexs'
+        #else
+          -b 'default'
         #end if
         #if $ssr.capture_replace_pairs
           -c '$ssr.capture_replace_pairs'
+        #else
+          -c 'default'
         #end if
         #if $ssr.filter_regexs
           -l '$ssr.filter_regexs'
+        #else
+          -l 'default'
         #end if
       #end for
       #if $input_config.expert_options.non_informative_words_regexs
         -w '$input_config.expert_options.non_informative_words_regexs'
       #end if
       #if $input_config.expert_options.description_split_regex
-        -r "$input_config.expert_options.description_split_regex"
+        -r '$input_config.expert_options.description_split_regex'
       #end if
       #if $input_config.expert_options.center_inverse_word_information_content_at_quantile
         -q $input_config.expert_options.center_inverse_word_information_content_at_quantile
       #end if
+      #if $input_config.expert_options.polish_capture_replace_pairs
+        -d '$input_config.expert_options.polish_capture_replace_pairs'
+      #end if
     #end if
     #if $seq_family.seq_families
       -f '$seq_families'
@@ -51,11 +64,14 @@
       -a
     #end if
     #if $seq_family.seq_family_gene_ids_separator
-      -g "$seq_family_gene_ids_separator"
+      -g '$seq_family_gene_ids_separator'
     #end if
     #if $seq_family.seq_family_id_genes_separator
       -i '$seq_family_id_genes_separator'
     #end if
+    #if $exclude_not_annotated_queries
+        -x
+    #end if
     -o '$output'
     ]]>
   </command>
@@ -102,6 +118,8 @@
           </param>
           <param type="integer" optional="true" name="center_inverse_word_information_content_at_quantile" argument="-q" label="Center inverse word-information-content at quantile (-q)" help="The quantile (percentile) to be subtracted from calculated inverse word information
             content to center these values. Value between 0 and 1." />
+          <param type="data" optional="true" name="polish_capture_replace_pairs" argument="-d" label="Polishing capture replace pairs (-d)" help="A file with pairs of lines. Defines regex / replace
+            pairs for post-polishing of annotation results. Set to 'none' or provide an empty file to suppress polishing."/>
         </section>
       </when>
     </conditional>
@@ -127,6 +145,7 @@
         </sanitizer>
       </param>
     </section>
+    <param type="boolean" optional="true" name="exclude_not_annotated_queries" argument="-x" label="Exclude not annotated query sequences (-x)" help="Use this option to exclude results from the output table that could not be annotated."/>
   </inputs>
   <outputs>
     <data format="tabular" name="output" />
@@ -161,7 +180,7 @@
         <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
         <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
       </repeat>
-      <param name="description_split_regex" value="([~_\-/|;,':.\s]+)" />
+      <param name="description_split_regex" value="([~_\-/|;,'\'':.\s]+)" />
       <param name="center_inverse_word_information_content_at_quantile" value="50" />
       <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
     </test>
@@ -291,6 +310,18 @@
             content to center these values. Consequently, this must be a value between zero and one
             or literal 50, which is interpreted as mean instead of a quantile. Default is 50,
             implying centering at the mean.
+
+    -d, --polish-capture-replace-pairs
+            The last step of the process generating human readable descriptions (HRDs) for the
+            queries (proteins or sequence families) is to 'polish' the selected HRDs. Polishing is
+            done by iterative application of regular expressions (fancy-regex) and replace
+            instructions (capture-replace-pairs). If you do not want to use the default polishing
+            capture replace pairs specify a file in which pairs of lines are given. Of each pair the
+            first line holds a regular expression (fancy-regex syntax) and the second the replacement
+            instructions providing access to capture groups. Set to 'none' or provide an empty file,
+            if you want to suppress polishing. If you want to have a template file for your custom
+            polishing capture-replace-pairs please refer to
+            https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/polish_capture_replace_pairs.txt

 ----

@@ -304,6 +335,12 @@
     Soltu.DM.03G011280.1	increased dna methylation
     ...

+If you want to suppress results from the output table that could not be annotated, i.e. 'unknown protein' or 'unknown sequence family' respectively, use the '-x' parameter::
+
+    -x, --exclude-not-annotated-queries
+            Exclude results from the output table that could not be annotated, i.e. 'unknown
+            protein' or 'unknown sequence family', respectively.
+
     ]]>
   </help>
-</tool>
\ No newline at end of file
+</tool>