Mercurial > repos > rnateam > graphclust_postprocessing_no_align

<tool id="graphclust_glob_report_no_align" name="Graphclust glob_report collect clusters" version="0.5" >
  <!-- Package dependencies of the wrapper; every version is pinned explicitly. -->
  <requirements>
    <requirement type="package" version="0.6.0">graphclust-wrappers</requirement>
    <requirement type="package" version="0.5">perl-array-utils</requirement>
    <requirement type="package" version="0.18.1">scikit-learn</requirement>
    <requirement type="package" version="1.8.10">locarna</requirement>
    <requirement type="package" version="2.1">rnaz</requirement>
    <requirement type="package" version="1.1.2">infernal</requirement>
    <requirement type="package" version="2.2.10">viennarna</requirement>
    <requirement type="package" version="1.3.30">graphicsmagick</requirement>
    <requirement type="package" version="0.6.1">rscape</requirement>
    <requirement type="package" version="6.0">unzip</requirement>
  </requirements>
  <!--
    Command template (Cheetah + shell). Steps, in order:
      1. unpack the FASTA archive into the working directory;
      2. symlink every cmsearch result into ./CMSEARCH and every model file into
         ./MODEL, using the element identifier with all characters outside
         [word chars, '-', '_', '.'] replaced by '_' as the link name;
      3. run glob_res.pl with the clustering parameters (alignment switched off);
         the iteration number and the three partition/CM datasets are appended
         only when "Multiple iterations" is enabled;
      4. run evaluation.py on FASTA/ and RESULTS/;
      5. run addCdhitseqs.py when a CD-HIT dataset was supplied.
    Every dataset path is single-quoted so that a path containing whitespace or
    shell metacharacters is still passed as exactly one argument.
  -->
  <command detect_errors="exit_code">
    <![CDATA[
        unzip '$FASTA'  &> /dev/null &&
        mkdir ./CMSEARCH &&
        mkdir ./MODEL &&
        #import re
        #for $cms_res in $cmsearch_results:
            #set $safename_cm = re.sub('[^\w\-_\.]', '_', $cms_res.element_identifier)
            ln -f -s  '$cms_res' './CMSEARCH/$safename_cm' &&
        #end for
        #for $mods in $model_tree_files:
            #set $safename_tr = re.sub('[^\w\-_\.]', '_', $mods.element_identifier)
            ln -f -s  '$mods' './MODEL/$safename_tr' &&
        #end for

        'glob_res.pl'
                $merge_cluster_ol
                $merge_overlap
                $min_cluster_size
                $cm_min_bitscore
                $cm_max_eval
                1 ## cm_bitscore_sig
                $partition_type ''
                $cut_type
                0 ## zero means do not align
        #if  $iteration_num.iteration_num_selector:
          $iteration_num.CI
          '$final_partition_soft'
          '$final_partition_used_cmsearch'
          '$combined_cm'

        #end if

        &&
        python '$__tool_directory__/evaluation.py' FASTA/ RESULTS/

        #if $cdhit:
        &&
          python '$__tool_directory__/addCdhitseqs.py' '$cdhit'
        #end if
]]>
  </command>
  <!--
    Tool inputs. The parameter names are referenced by the command template and
    by the tests, so only labels/help texts may be reworded freely.
  -->
  <inputs>
    <param type="data" name="FASTA" format="zip" help="FASTA.zip from pre-processing step"/>
    <param type="data" name="cmsearch_results" format="tabular" multiple="True"
      help="Tabular cmsearch results of the candidate clusters from the cmsearch step"/>
    <param type="data" name="model_tree_files" format="txt" multiple="True" label="model-tree-stk"
      help="model.tree.stk files from pgma_graphclust candidate clustering step"/>
    <!-- Both booleans emit 0 when checked and 1 when unchecked; the value is passed verbatim to glob_res.pl. -->
    <param name="partition_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Hard partition"
      help="Whether to do hard partitioning (no overlap) or soft mode (cluster elements may overlap)"/>
    <param name="cut_type" type="boolean" checked="True" truevalue="0" falsevalue="1" label="Use CM score for cutoff" help="otherwise use E-value"/>
    <param name="cm_min_bitscore" type="integer" value="20" size="5" label="cm_min_bitscore" help=""/>
    <param name="cm_max_eval" type="float" value="0.001" size="5" label="cm_max_E-val" help=""/>
    <param type="data" name="cdhit" format="txt" optional="true" label="CD-HIT output"
      help="Optional CD-HIT pre-clustering output to be combined into the final clustering output"/>
    <param name="merge_cluster_ol" type="float" value="0.66" size="5" label="merge_cluster_overlap"
      help="Overlapping ratio criteria to merge overlapping clusters or keep separate clusters (soft partitioning)"/>
    <param name="merge_overlap" type="float" value="0.51" size="5" label="merge_fraction_overlap"
      help="Overlapping ratio criteria to merge overlapping sequence fractions from same input sequence"/>
    <param name="min_cluster_size" type="integer" value="3" size="5" label="minimum cluster size"
      help="Minimum number of elements that can form a cluster. Higher values discard small clusters and may produce larger merged clusters"/>
    <!-- cm_bitscore_sig is not exposed; the command template passes the constant 1 instead. -->
    <!-- <param name="cm_bitscore_sig" type="integer" value="1" size="5" label="cm_bitscore_sig" help=""/> -->

    <!-- The extra parameters below are only shown (and only passed to glob_res.pl) when multiple iterations are selected. -->
    <conditional name="iteration_num">
      <param name="iteration_num_selector" type="boolean"  checked="false" label="Multiple iterations"  help="Select No for a single iteration, Yes for multiple iterations"/>
      <when value="true">
        <param name="CI" type="integer" value="2" size="5" label="Number of current iteration "/>
        <param type="data" name="final_partition_soft" format="txt" />
        <param type="data" name="final_partition_used_cmsearch" format="txt" />
        <param type="data" name="combined_cm" format="txt" />
      </when>
      <when value="false" ></when>
    </conditional>

  </inputs>
  <!--
    Tool outputs. Single datasets are picked up from fixed paths in the working
    directory; the three collections are discovered by file-name pattern, with
    the whole file name becoming the element name.
  -->
  <outputs>
    <data name="final_stats" format="txt" from_work_dir="RESULTS/cluster.final.stats" label="cluster.final.stats"  />
    <data name="tableForEval" format="tabular" from_work_dir="RESULTS/fullTab.tabular" label="tableForEval"  />
    <data name="final_soft" format="txt" from_work_dir="RESULTS/partitions/final_partition.soft" label="soft_part"   />
    <data name="final_used_cmsearch" format="txt" from_work_dir="RESULTS/partitions/final_partition.used_cmsearch" label="final_partition_used_cmsearch"   />
    <data name="evaluation" format="txt" from_work_dir="RESULTS/evaluation.txt" label="evaluation_of_clusters"  />
    <data name="combined_cm_out" format="txt" from_work_dir="combined_cm_out" label="combined_cmsearch_output"  />
    <!-- Files in RESULTS whose name ends in ".all". -->
    <collection name="clusters" type="list" label="CLUSTERS-cmsearch">
      <discover_datasets format="txt" pattern="(?P&lt;name&gt;^.*\.all$)" directory="RESULTS"  />
    </collection>
    <!-- Files in RESULTS whose name ends in ".sorted.fa"; both dots are escaped so they match literally. -->
    <collection name="allFastaSorted" type="list" label="cluster-sequences-sorted">
      <discover_datasets format="fasta" pattern="(?P&lt;name&gt;^.*\.sorted\.fa$)" directory="RESULTS"  />
    </collection>

    <!-- Every file found in RESULTS/partitions. -->
    <collection name="partitions" type="list" label="Partitions">
      <discover_datasets pattern="(?P&lt;name&gt;^.*$)" directory="RESULTS/partitions" />
    </collection>
    <data name="RESULTS_zip" format="zip" from_work_dir="RESULTS.zip" label="RESULTS.zip"  />
  </outputs>
  <!--
    One functional test in single-iteration mode (iteration_num_selector is
    false, so no iteration datasets are supplied and no cdhit input is given).
    It checks the statistics, combined cmsearch and evaluation outputs, two
    elements of the "clusters" collection, six elements of the "partitions"
    collection, and the size of RESULTS.zip.
  -->
  <tests>
    <test>
      <!-- NOTE(review): the FASTA input declares format="zip" but the test uploads it as searchgui_archive; confirm this is intentional. -->
      <param name="FASTA" value="FASTA.zip" ftype="searchgui_archive"/>
      <param name="cmsearch_results" value="1.1.tree,1.2.tree"/>
      <param name="model_tree_files" value="1.1.model.tree.fa,1.2.model.tree.fa"/>
      <!-- NOTE(review): both params are booleans whose truevalue is "0"; verify how the test framework interprets value="0" here. -->
      <param name="partition_type" value="0"/>
      <param name="cut_type" value="0"/>
      <conditional name="iteration_num">
        <param name="iteration_num_selector" value="false"/>
      </conditional>
      <!-- The numeric values below equal the defaults declared in the inputs section. -->
      <param name="merge_cluster_ol" value="0.66"/>
      <param name="merge_overlap" value="0.51"/>
      <param name="min_cluster_size" value="3"/>
      <param name="cm_min_bitscore" value="20"/>
      <param name="cm_max_eval" value="0.001"/>
      <!-- <param name="cm_bitscore_sig" value="0"/> -->
      <output name="final_stats" file="RESULTS/cluster.final.stats" />
      <output name="combined_cm_out" file="combined_cm_out"/>
      <output name="evaluation" file="evaluation1.txt"/>
      <!-- Cluster files are compared with "contains" rather than exact equality. -->
      <output_collection name="clusters" type="list">
        <element name="1.cluster.all" file="RESULTS/1.cluster.all" compare="contains"/>
        <element name="2.cluster.all" file="RESULTS/2.cluster.all" compare="contains"/>

      </output_collection>
      <output_collection name="partitions">
        <element name="final_overlap.map" file="RESULTS/partitions/final_overlap.map" compare="contains">
          <assert_contents>
            <has_text text="1.1  1.1 " />
            <has_text text="1.2  1.2" />
          </assert_contents>
        </element>
        <element name="final_overlap.matrix" file="RESULTS/partitions/final_overlap.matrix" compare="contains">
          <assert_contents>
            <has_text text="MODEL CLASS 0 0" />
            <!-- The two assertions below are disabled. -->
            <!--has_text text="1.2" />
            <has_text text="1.1" /-->
          </assert_contents>
        </element>
        <element name="final_partition.hard.best" file="RESULTS/partitions/final_partition.hard.best" />
        <element name="final_partition.hard.merged" file="RESULTS/partitions/final_partition.hard.merged" />
        <element name="final_partition.soft" file="RESULTS/partitions/final_partition.soft" />
        <element name="final_partition.used_cmsearch" file="RESULTS/partitions/final_partition.used_cmsearch" compare="contains"/>
      </output_collection>

      <!-- The archive is only compared by size, with a tolerance of 20000. -->
      <output name="RESULTS_zip" file="RESULTS.zip" ftype="zip" compare="sim_size" delta="20000"/>

    </test>
  </tests>
  <help>
    <![CDATA[

**What it does**

This tool performs the post-processing step: redundant clusters are merged and instances that
belong to multiple clusters are assigned unambiguously. For every pair of clusters, the relative
overlap (i.e. the fraction of instances that occur in both clusters) is computed, and the two
clusters are merged if the overlap exceeds 50%.
Finally, cluster members are ranked by their CM bitscore.

    ]]>
  </help>
  <!-- Reference to cite for this tool, given as a DOI. -->
  <citations>
    <citation type="doi">10.5281/zenodo.597695</citation>
  </citations>
</tool>
author	rnateam
date	Sat, 27 Oct 2018 13:49:00 -0400
parents
children