changeset 0:429feea2cf6d draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/craq commit fe19727db664bcad91b66546149cbf34a6a012e7
author iuc
date Wed, 18 Mar 2026 13:17:11 +0000
parents
children
files craq.xml macros.xml test-data/ids.txt
diffstat 3 files changed, 323 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/craq.xml	Wed Mar 18 13:17:11 2026 +0000
@@ -0,0 +1,300 @@
+<tool id="craq" name="CRAQ" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>assess the accuracy of assembled genomic sequences</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Stage inputs under fixed names, then run craq. Craq expects sorted and indexed bam files: BAMs are copied and indexed here, never sorted.
+        ## Stop early with a readable message when neither SMS nor NGS reads were selected.
+        #if not $sms_input and not $ngs_input:
+            echo 'At least one of SMS or NGS input must be provided' >&2 && exit 1 &&
+        #end if
+        cp '$genome' genome.fa &&
+
+        ## Prepare SMS input: when the first dataset is BAM only that dataset is used, otherwise every file is staged as sms_N.EXT
+        #if $sms_input:
+            #if $sms_input[0].is_of_type('bam'):
+                cp '${sms_input[0]}' 'sms_sorted.bam' &&
+                samtools index 'sms_sorted.bam' &&
+            #else:
+                #for $i, $f in enumerate($sms_input):
+                    cp '$f' 'sms_${i}.${f.ext}' &&
+                #end for
+            #end if
+        #end if
+
+        ## Prepare NGS input: same scheme as above with the ngs_ prefix
+        #if $ngs_input:
+            #if $ngs_input[0].is_of_type('bam'):
+                cp '${ngs_input[0]}' 'ngs_sorted.bam' &&
+                samtools index 'ngs_sorted.bam' &&
+            #else:
+                #for $i, $f in enumerate($ngs_input):
+                    cp '$f' 'ngs_${i}.${f.ext}' &&
+                #end for
+            #end if
+        #end if
+
+        ## Build tool command line
+        craq
+        -g genome.fa
+        ## SMS input: the staged BAM, or the comma-joined list of staged read files
+        #if $sms_input:
+            -sms
+            #if $sms_input[0].is_of_type('bam'):
+                'sms_sorted.bam'
+            #else:
+                #set $sms_files = ','.join(['sms_%d.%s' % ($i, $f.ext) for $i, $f in enumerate($sms_input)])
+                $sms_files
+            #end if
+        #end if
+        ## NGS input: the staged BAM, or the comma-joined list of staged read files
+        #if $ngs_input:
+            -ngs
+            #if $ngs_input[0].is_of_type('bam'):
+                'ngs_sorted.bam'
+            #else:
+                #set $ngs_files = ','.join(['ngs_%d.%s' % ($i, $f.ext) for $i, $f in enumerate($ngs_input)])
+                $ngs_files
+            #end if
+        #end if
+
+        ## Filter parameters
+        -sn $filter_params.sn
+        -sf $filter_params.sf
+        -ln $filter_params.ln
+        -lf $filter_params.lf
+        -hmin $filter_params.hmin
+        -hmax $filter_params.hmax
+        -mgs $filter_params.mgs
+        --sms_coverage $filter_params.sms_coverage
+        --ngs_coverage $filter_params.ngs_coverage
+
+        ## Other parameters (boolean params expand to their flag text or to nothing; the optional plot_ids path is quoted like the other dataset paths)
+        $other_params.ser
+        $other_params.snv
+        --gapmodel $other_params.gapmodel
+        $other_params.break
+        --map $other_params.map
+        --mapq $other_params.mapq
+        --norm_window $other_params.norm_window
+        --regional_window $other_params.regional_window
+        $other_params.plot
+        #if $other_params.plot_ids:
+            --plot_ids '$other_params.plot_ids'
+        #end if
+        --thread "\${GALAXY_SLOTS:-8}"
+        -D outputs
+    ]]></command>
+    <inputs>
+        <param name="genome" type="data" label="Assembly sequence file" format="fasta" help="The genome assembly to be evaluated in FASTA format"/>
+        <param name="sms_input" type="data" optional="true" multiple="true" label="SMS long-read alignment or sequences" format="bam,fastq,fastq.gz" help="Provide either a single BAM file OR multiple FASTQ files (uncompressed or gzipped). At least one of SMS or NGS input must be provided"/>
+        <param name="ngs_input" type="data" optional="true" multiple="true" label="NGS short-read alignment or sequences" format="bam,fastq,fastq.gz" help="Provide either a single BAM file OR multiple FASTQ files (uncompressed or gzipped). At least one of SMS or NGS input must be provided"/>
+        <section name="filter_params" title="Filter Parameters" expanded="False"> <!-- thresholds forwarded to craq under the flag given in "argument" -->
+            <param argument="-sn" type="integer" min="0" value="2" label="Minimum number of NGS clipped-reads" help="Minimum number of NGS reads that must show clipping to flag potential errors"/>
+            <param argument="-sf" type="float" min="0.0" max="1.0" value="0.75" label="Minimum proportion of NGS clipped-reads" help="Minimum proportion of NGS reads that must show clipping relative to total coverage"/>
+            <param argument="-ln" type="integer" min="0" value="2" label="Minimum number of SMS clipped-reads" help="Minimum number of SMS long reads that must show clipping to flag potential errors"/>
+            <param argument="-lf" type="float" min="0.0" max="1.0" value="0.75" label="Minimum proportion of SMS clipped-reads" help="Minimum proportion of SMS reads that must show clipping relative to total coverage"/>
+            <param argument="-hmin" type="float" min="0.0" max="1.0" value="0.4" label="Lower clipping rate for heterozygous allele" help="Lower clipping rate threshold to identify heterozygous variants (CRHs)"/>
+            <param argument="-hmax" type="float" min="0.0" max="1.0" value="0.6" label="Upper clipping rate for heterozygous allele" help="Upper clipping rate threshold to identify heterozygous variants (CRHs)"/>
+            <param argument="-mgs" type="integer" min="1" value="10" label="Minimum gap size (bp)" help="Gap[N] sequences longer than this threshold will be treated as breakage"/>
+            <param argument="--sms_coverage" type="integer" min="0" value="100" label="Average SMS coverage" help="Expected average SMS long-read coverage depth for normalization"/>
+            <param argument="--ngs_coverage" type="integer" min="0" value="100" label="Average NGS coverage" help="Expected average NGS short-read coverage depth for normalization"/>
+        </section>
+        <section name="other_params" title="Other Parameters" expanded="False"> <!-- switches, presets and plotting options; boolean params carry the full flag text in truevalue/falsevalue -->
+            <param argument="-ser" type="boolean" checked="true" truevalue="-ser T" falsevalue="-ser F" label="Search error regions near breakpoints" help="Search noisy error regions near CRE/CSE breakpoints"/> <!-- F is sent explicitly: omitting the flag would leave craq on its own default (T per the CRAQ usage text), so unchecking would do nothing -->
+            <param argument="-snv" type="boolean" checked="false" truevalue="-snv T" falsevalue="" label="Report SNV/heterozygous variants" help="Report tiny indel errors or heterozygous variants under 40bp.(Resource intensive)"/>
+            <param argument="--gapmodel" type="select" label="Gap model" help="Gap[N] treatment">
+                <option value="1" selected="true">CRE (regional error)</option>
+                <option value="2">CSE (structural error)</option>
+            </param>
+            <param argument="--break" type="boolean" checked="false" truevalue="--break T" falsevalue="" label="Break chimeric fragments" help="Detect and break chimeric contigs at conflict breakpoints"/>
+            <param argument="--map" type="select" label="Mapping preset" help="Ignored if .bam provided">
+                <option value="map-hifi" selected="true">PacBio HiFi</option>
+                <option value="map-pb">PacBio CLR</option>
+                <option value="map-ont">Nanopore</option>
+            </param>
+            <param argument="--mapq" type="integer" min="0" max="60" value="20" label="Minimum mapping quality" help="Minimum read mapping quality threshold"/>
+            <param argument="--norm_window" type="float" min="0.0" max="1.0" value="0.0001" label="Normalization window fraction" help="Fraction of the total assembly length used as the window size for normalizing error counts"/>
+            <param argument="--regional_window" type="integer" min="1" value="500000" label="Regional quality window size (bp)" help="Window size in base pairs for regional quality benchmarking across the assembly"/>
+            <param argument="--plot" type="boolean" checked="false" truevalue="--plot T" falsevalue="" label="Generate plots" help="Create CRAQ visualization plots"/>
+            <param argument="--plot_ids" type="data" format="tabular,txt" optional="true" label="Selected assembly IDs for plotting" help="File listing specific assembly IDs to plot (default: all IDs)"/>
+            <param name="advanced_output" type="boolean" checked="false" label="Output advanced error region files" help="Output detailed CRE/CRH and CSE/CSH BED files for regional and structural error regions"/> <!-- never passed to craq; it only gates the regional_errors and structural_errors output collections -->
+        </section>
+    </inputs>
+    <outputs> <!-- each collection is discovered from a sub-directory of the "outputs" folder given to craq with -D; optional ones are filtered on the matching input or on advanced_output -->
+        <collection type="list" name="runAQI_out" label="${tool.name} on ${on_string}: AQI Results">
+            <discover_datasets directory="outputs/runAQI_out" pattern="__designation_and_ext__"/>
+        </collection>
+        <collection type="list" name="sr_out" label="${tool.name} on ${on_string}: Short Reads outputs">
+            <filter>ngs_input</filter>
+            <discover_datasets directory="outputs/SRout" pattern="__designation_and_ext__"/>
+        </collection>
+        <collection type="list" name="lr_out" label="${tool.name} on ${on_string}: Long Reads outputs">
+            <filter>sms_input</filter>
+            <discover_datasets directory="outputs/LRout" pattern="__designation_and_ext__"/>
+        </collection>
+        <collection type="list" name="regional_errors" label="${tool.name} on ${on_string}: Regional Error Regions">
+            <filter>other_params['advanced_output']</filter>
+            <discover_datasets directory="outputs/runAQI_out/locER_out" pattern="__designation_and_ext__"/>
+        </collection>
+        <collection type="list" name="structural_errors" label="${tool.name} on ${on_string}: Structural Error Regions">
+            <filter>other_params['advanced_output']</filter>
+            <discover_datasets directory="outputs/runAQI_out/strER_out" pattern="__designation_and_ext__"/>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Test 1: assembly plus a long-read (SMS) BAM only; checks the AQI and long-read collections -->
+        <test expect_num_outputs="2">
+            <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
+            <param name="sms_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
+            <output_collection type="list" name="runAQI_out" count="3"/>
+            <output_collection type="list" name="lr_out" count="10"/>
+        </test>
+        <!-- Test 2: assembly plus a short-read (NGS) BAM only, with plotting switched on -->
+        <test expect_num_outputs="2">
+            <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
+            <param name="ngs_input" location="https://zenodo.org/records/19091739/files/NGS_sort.bam"/>
+            <section name="other_params">
+                <param name="plot" value="true"/>
+            </section>
+            <output_collection type="list" name="runAQI_out" count="4"/>
+            <output_collection type="list" name="sr_out" count="11"/>
+        </test>
+        <!-- Test 3: SMS BAM plus NGS FASTQ pair, with break, snv and MAPQ 30; lr_out is produced (third output) but not asserted -->
+        <test expect_num_outputs="3">
+            <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
+            <param name="sms_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
+            <param name="ngs_input" location="https://zenodo.org/records/19091739/files/NGS_R1.fq.gz,https://zenodo.org/records/19091739/files/NGS_R2.fq.gz"/>
+            <section name="other_params">
+                <param name="mapq" value="30"/>
+                <param name="snv" value="true"/>
+                <param name="break" value="true"/>
+            </section>
+            <output_collection type="list" name="runAQI_out" count="4"/>
+            <output_collection type="list" name="sr_out" count="9"/>
+        </test>
+        <!-- Test 4: SMS BAM plus NGS FASTQ pair with advanced outputs, so all five collections are expected -->
+        <test expect_num_outputs="5">
+            <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
+            <param name="sms_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
+            <param name="ngs_input" location="https://zenodo.org/records/19091739/files/NGS_R1.fq.gz,https://zenodo.org/records/19091739/files/NGS_R2.fq.gz"/>
+            <section name="other_params">
+                <param name="advanced_output" value="true"/>
+            </section>
+            <output_collection type="list" name="runAQI_out" count="3"/>
+            <output_collection type="list" name="sr_out" count="9"/>
+            <output_collection type="list" name="lr_out" count="10"/>
+            <output_collection type="list" name="regional_errors" count="5"/>
+            <output_collection type="list" name="structural_errors" count="6"/>
+        </test>
+        <!-- Test 5: plotting limited to the IDs listed in ids.txt. NOTE(review): ngs_input is given SMS_sort.bam here; confirm this is intended and not a slip for NGS_sort.bam -->
+        <test expect_num_outputs="2">
+            <param name="genome" location="https://zenodo.org/records/19091739/files/genome.fa"/>
+            <param name="ngs_input" location="https://zenodo.org/records/19091739/files/SMS_sort.bam"/>
+            <section name="other_params">
+                <param name="plot_ids" value="ids.txt"/>
+                <param name="plot" value="true"/>
+            </section>
+            <output_collection type="list" name="runAQI_out" count="4"/>
+            <output_collection type="list" name="sr_out" count="11"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+CRAQ (Clipping Reveals Assembly Quality) is a reference-free genome assembly quality evaluator.
+It identifies potential errors in assembled sequences by analysing how reads clip (fail to align continuously)
+at specific positions - without requiring a reference genome.
+
+CRAQ produces two key quality scores:
+
+- **R-AQI** (Regional Assembly Quality Index): captures small-scale errors such as indels and local misassemblies detected by short reads
+- **S-AQI** (Structural Assembly Quality Index): captures large-scale structural errors such as chimeric joins and inversions detected by long reads
+
+-----
+
+**Inputs**
+
++---------------------------+----------+----------------------------------------------------------+
+| Input                     | Required | Description                                              |
++===========================+==========+==========================================================+
+| Assembly FASTA            | Yes      | Genome assembly to evaluate in FASTA format              |
++---------------------------+----------+----------------------------------------------------------+
+| SMS long-read data        | No*      | PacBio or Nanopore data as BAM (coordinate-sorted)       |
+|                           |          | OR one or more FASTQ/FASTQ.GZ sequence files             |
++---------------------------+----------+----------------------------------------------------------+
+| NGS short-read data       | No*      | Illumina data as BAM (coordinate-sorted)                 |
+|                           |          | OR one or more FASTQ/FASTQ.GZ sequence files             |
++---------------------------+----------+----------------------------------------------------------+
+
+\* At least one of SMS or NGS input must be provided. Using both together gives the most complete assessment.
+
+.. class:: warningmark
+
+If providing sequence files (FASTQ) rather than alignments, CRAQ will perform the mapping internally
+using minimap2 for SMS and BWA for NGS. Ensure the correct mapping preset is selected under
+**Other Parameters** when using raw reads.
+
+-----
+
+**Outputs**
+
+**1) AQI Results** *(always produced)*
+
+The primary output collection containing:
+
+- ``AQI_summary.txt`` - final R-AQI and S-AQI scores summarising overall assembly quality
+- ``regional_statistics.txt`` - per-region breakdown of error counts and coverage
+- ``circos_plot.pdf`` - visualisation of quality metrics across the assembly *(only if plotting is enabled)*
+
+**2) Long Read Outputs** *(produced when SMS input is provided)*
+
+- Filtered long-read alignment in BAM format with index
+- Putative structural error (CSE) breakpoint coordinates
+- Heterozygous variant (CSH) breakpoint coordinates flagged by long reads
+
+**3) Short Read Outputs** *(produced when NGS input is provided)*
+
+- Filtered short-read alignment in BAM format with index
+- Putative regional error (CRE) coordinates flagged by short reads
+- Heterozygous variant (CRH) coordinates from short-read clipping patterns
+
+**4) Regional Error Regions** *(advanced output, optional)*
+
+BED files with precise coordinates of:
+
+- CRE (Clipping-based Regional Errors): local assembly errors detected by short reads
+- CRH (Clipping-based Regional Heterozygous variants): heterozygous positions in regional context
+
+**5) Structural Error Regions** *(advanced output, optional)*
+
+BED files with precise coordinates of:
+
+- CSE (Clipping-based Structural Errors): large-scale misassemblies detected by long reads
+- CSH (Clipping-based Structural Heterozygous variants): heterozygous structural positions
+- Low-coverage regions and ambiguous breakpoints
+
+-----
+
+**Interpreting AQI Scores**
+
+Both R-AQI and S-AQI are scored from 0 to 100, where higher is better:
+
++------------+-------------------+------------------------------------------+
+| Score      | Quality           | Interpretation                           |
++============+===================+==========================================+
+| 90 – 100   | Excellent         | Very few errors, high-confidence assembly|
++------------+-------------------+------------------------------------------+
+| 70 – 89    | Good              | Minor errors, suitable for most analyses |
++------------+-------------------+------------------------------------------+
+| 50 – 69    | Moderate          | Noticeable errors, use with caution      |
++------------+-------------------+------------------------------------------+
+| < 50       | Poor              | Significant errors, reassembly advised   |
++------------+-------------------+------------------------------------------+
+
+    ]]></help>
+    <expand macro="citations"/>
+    <expand macro="creators"/>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Mar 18 13:17:11 2026 +0000
@@ -0,0 +1,22 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.10</token> <!-- craq package version; also the leading part of the tool version string -->
+    <token name="@VERSION_SUFFIX@">0</token> <!-- wrapper revision appended after "+galaxy" -->
+    <token name="@PROFILE@">25.0</token> <!-- value of the tool "profile" attribute -->
+    <xml name="requirements">
+        <requirements>
+            <requirement version="@TOOL_VERSION@" type="package">craq</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1038/s41467-023-42336-w</citation>
+        </citations>
+    </xml>
+    <xml name="creators">
+        <creator>
+            <person familyName="Mahagna" givenName="Ahmad" url="https://github.com/Smkingsize"/>
+            <person familyName="Momin" givenName="Saim" url="https://github.com/SaimMomin12"/>
+            <organization name="Galaxy Europe"/>
+        </creator>
+    </xml>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ids.txt	Wed Mar 18 13:17:11 2026 +0000
@@ -0,0 +1,1 @@
+Chr1
\ No newline at end of file