changeset 6:6756b34312cd draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/yahs commit ab918ac1eab72932e78c6e45e46d745543eac810
author iuc
date Wed, 17 Sep 2025 06:28:08 +0000
parents ff4031bfaa22
children
files test-data/test2.unsorted.bam test-data/test3.qname_sorted.bam yahs.xml
diffstat 3 files changed, 80 insertions(+), 53 deletions(-) [+]
line wrap: on
line diff
Binary file test-data/test2.unsorted.bam has changed
Binary file test-data/test3.qname_sorted.bam has changed
--- a/yahs.xml	Thu Aug 01 11:41:41 2024 +0000
+++ b/yahs.xml	Wed Sep 17 06:28:08 2025 +0000
@@ -2,7 +2,7 @@
     <description>yet another HI-C scaffolding tool</description>
     <macros>
         <token name="@VERSION@">1.2a.2</token>
-        <token name="@VERSION_SUFFIX@">2</token>
+        <token name="@VERSION_SUFFIX@">3</token>
     </macros>
     <requirements>
         <requirement type="package" version="@VERSION@">yahs</requirement>
@@ -12,22 +12,13 @@
     <command detect_errors="exit_code"><![CDATA[
         #if $function.function_select == "yahs":
             ln -s '$function.fasta' input.fasta &&
-            #if $function.bfile.ext == "bam":
-                ln -s '$function.bfile' input.bam &&
-            #else if $function.bfile.ext == "bed":
-                ln -s '$function.bfile' input.bed &&
-            #end if
+            ln -s '$function.bfile' input.$function.bfile.ext &&
             #if $function.agp:
                 ln -s '$function.agp' input.agp &&
             #end if
             samtools faidx input.fasta &&
             mkdir initial_break agp_out agp_break final_outs &&
-            yahs --no-mem-check input.fasta 
-            #if $function.bfile.ext == "bam":
-                input.bam
-            #else if $function.bfile.ext == "bed":
-                input.bed
-            #end if
+            yahs --no-mem-check input.fasta  input.$function.bfile.ext
             #if $agp:
                 -a input.agp
             #end if
@@ -83,16 +74,16 @@
             </param>
             <when value="yahs">
                 <param name="fasta" type="data" format="fasta" label="Input contig sequences"/>
-                <param name="bfile" type="data" format="bam,bed" label="Alignment file of Hi-C reads to contigs"/>
-                <param name="agp" argument="-a" type="data" format="agp" optional="true" label="Input AGP file (for rescaffolding)"
-                    help="You can specify a AGP format file to ask YaHS to do scaffolding with the scaffolds in the AGP file as the start point"/>
-                <param name="res" argument="-r" type="text" label="Resolutions" optional="true" 
-                    help="Comma separated, ascending list of range of resolutions with no spaces. Ex. 50000,100000,200000,500000,1000000,2000000,5000000. By default and the upper limit is automatically adjusted with the genome size"/>
+                <param name="bfile" type="data" format="bam,bed,qname_sorted.bam,unsorted.bam" label="Alignment file of Hi-C reads to contigs. NOTE: The input BAM can either be sorted by read names (qname_sorted.bam) or not. The behaviours of the program are slightly different, which might lead to slightly different scaffolding results. For a BAM input sorted by read names, for each mapped read pair, a Hi-C link is counted between the middle positions of the read alignments; while for a BAM input sorted by coordinates or unsorted, Hi-C links are counted between the start positions of the read alignments. Also, for a BAM input not sorted by read names, the mapping quality filtering is suppressed (-q option). If a BED file is provided: the BAM file used to generate the BED file needs to have unmapped reads, supplementary/secondary alignment records, and PCR/optical duplicates filtered out, and to be sorted by read names (otherwise the resulting BED file needs to be sorted by the read name column)."/>
+                <param name="agp" argument="-a" type="data" format="agp" optional="true" label="Input AGP file (for rescaffolding)" help="You can specify an AGP format file to ask YaHS to do scaffolding with the scaffolds in the AGP file as the starting point"/>
+                <param name="res" argument="-r" type="text" label="Resolutions" optional="true" help="Comma-separated, ascending list of resolutions with no spaces, e.g. 50000,100000,200000,500000,1000000,2000000,5000000. By default, the upper limit is automatically adjusted with the genome size.">
+                    <validator type="regex" message="Only numbers and commas can be used to define the list of resolutions.">^\d+(,\d+)*$</validator>
+                </param>
                 <conditional name="enzyme_conditional">
                     <param name="enzyme_options" type="select" label="Restriction enzyme used in Hi-C experiment" help="Hi-C experiments can use different restriction enzymes.
-                        The enzyme frequency in contigs is used to normalize the Hi-C interaction frequency. Note that you need to specify the actual 
-                        sequence of the cutting site for a restriction enzyme and not the enzyme name. You can also specify DNASE as an enzyme if you 
-                        use an enzyme-free prep, e.g. Omni-C.">
+                                             The enzyme frequency in contigs is used to normalize the Hi-C interaction frequency. Note that you need to specify the actual                          
+                                             sequence of the cutting site for a restriction enzyme and not the enzyme name. You can also specify DNASE as an enzyme if you                          
+                                             use an enzyme-free prep, e.g. Omni-C.">
                         <option value="not_specified">Not specified</option>
                         <option value="preconfigured">Preconfigured restriction enzymes</option>
                         <option value="specific">Enter a specific sequence</option>
@@ -107,9 +98,8 @@
                         </param>
                     </when>
                     <when value="specific">
-                        <param name="manual_enzyme" argument="-e" type="text" label="Restriction enzyme sequence(s)"
-                            help="Restriction enzyme sequence. If multiple were used, include all as a comma separated list without spaces (ex. 'GATC,AAGCTT').">
-                            <validator type="expression" message="Only alphabetical letters and the comma can be used in to define restriction enzym sequences.">value.replace(',', '').isalpha()</validator>
+                        <param name="manual_enzyme" argument="-e" type="text" label="Restriction enzyme sequence(s)" help="Restriction enzyme sequence. If multiple were used, include all as a comma separated list without spaces (ex. 'GATC,AAGCTT').">
+                            <validator type="expression" message="Only alphabetical letters and commas can be used to define restriction enzyme sequences.">value.replace(',', '').isalpha()</validator>
                         </param>
                     </when>
                 </conditional>
@@ -121,7 +111,7 @@
             <when value="agp_to_fasta">
                 <param name="agp" type="data" format="agp" label="Input AGP file"/>
                 <param name="fasta" type="data" format="fasta" label="Contig fasta file"/>
-                <param name="length" type='integer' label="Output fasta line length" value="60" min="1"/>
+                <param name="length" type="integer" label="Output fasta line length" value="60" min="1"/>
             </when>
         </conditional>
         <param name="log_out" type="boolean" label="Output log file?" truevalue="yes" falsevalue="no"/>
@@ -159,9 +149,9 @@
         <test expect_num_outputs="5">
             <conditional name="function">
                 <param name="function_select" value="yahs"/>
-                <param name="fasta" value="test.fasta"/>
-                <param name="bfile" value="test.bed"/>
-                <param name="agp" value="test.agp"/>
+                <param name="fasta" value="test.fasta" ftype="fasta"/>
+                <param name="bfile" value="test.bed" ftype="bed" />
+                <param name="agp" value="test.agp" ftype="agp"/>
                 <param name="res" value="50000,100000,150000,2000000,1000000"/>
             </conditional>
             <output name="final_agp_out" file="test_01_scaffolds_final.agp" ftype="agp"/>
@@ -172,10 +162,10 @@
         <test expect_num_outputs="5">
             <conditional name="function">
                 <param name="function_select" value="yahs"/>
-                <param name="fasta" value="test.fasta"/>
-                <param name="bfile" value="test.bed"/>
-                <param name="no_contig_ec" value="--no-contig-ec"/>
-                <param name="no_scaffold_ec" value="--no-scaffold-ec"/>
+                <param name="fasta" value="test.fasta" ftype="fasta"/>
+                <param name="bfile" value="test.bed" ftype="bed"/>
+                <param name="no_contig_ec" value="true"/>
+                <param name="no_scaffold_ec" value="true"/>
             </conditional>
             <output name="final_agp_out" file="test_02_scaffolds_final.agp" ftype="agp"/>
             <output name="final_fasta_out" file="test_02_scaffolds_final.fa" ftype="fasta"/>
@@ -188,28 +178,28 @@
         <test expect_num_outputs="6">
             <conditional name="function">
                 <param name="function_select" value="yahs"/>
-                <param name="fasta" value="test2.fasta"/>
-                <param name="bfile" value="test2.bam"/>
+                <param name="fasta" value="test2.fasta" ftype="fasta"/>
+                <param name="bfile" value="test2.bam" ftype="bam"/>
                 <param name="res" value="1000,2000,5000,10000,20000,50000,100000,200000,500000"/>
                 <conditional name="enzyme_conditional">
                     <param name="enzyme_options" value="not_specified"/>
                 </conditional>
             </conditional>
             <param name="log_out" value="yes"/>
-	    <output name="log_file" ftype="txt">
-            <assert_contents>
-                <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6297 intra links + 102 inter links" />
-            </assert_contents>
-	    </output>
+            <output name="log_file" ftype="txt">
+                <assert_contents>
+                    <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6297 intra links + 102 inter links"/>
+                </assert_contents>
+            </output>
             <!-- COMMAND:  yahs test.fasta test.bam -r 1000,2000,5000,10000,20000,50000,100000,200000,500000 -o test_3 -->
         </test>
         <!-- TEST 4 -->
         <test expect_num_outputs="5">
             <conditional name="function">
                 <param name="function_select" value="yahs"/>
-                <param name="fasta" value="test2.fasta"/>
-                <param name="bfile" value="test2.bed"/>
-                <param name="qual" value="10"/>
+                <param name="fasta" value="test2.fasta" ftype="fasta"/>
+                <param name="bfile" value="test2.bed" ftype="bed"/>
+                <param name="quality" value="10"/>
                 <param name="length" value="20"/>
                 <conditional name="enzyme_conditional">
                     <param name="enzyme_options" value="not_specified"/>
@@ -225,8 +215,8 @@
         <test expect_num_outputs="1">
             <conditional name="function">
                 <param name="function_select" value="agp_to_fasta"/>
-                <param name="fasta" value="test.fasta"/>
-                <param name="agp" value="test.agp"/>
+                <param name="fasta" value="test.fasta" ftype="fasta"/>
+                <param name="agp" value="test.agp" ftype="agp"/>
                 <param name="length" value="20"/>
             </conditional>
             <output name="fasta_from_agp" file="test_05.fasta" ftype="fasta"/>
@@ -236,9 +226,9 @@
         <test expect_num_outputs="6">
             <conditional name="function">
                 <param name="function_select" value="yahs"/>
-                <param name="fasta" value="test.fasta"/>
-                <param name="bfile" value="test.bed"/>
-                <param name="agp" value="test.agp"/>
+                <param name="fasta" value="test.fasta" ftype="fasta"/>
+                <param name="bfile" value="test.bed" ftype="bed"/>
+                <param name="agp" value="test.agp" ftype="agp"/>
                 <param name="res" value="50000,100000,150000,2000000,1000000"/>
             </conditional>
             <param name="log_out" value="yes"/>
@@ -255,12 +245,12 @@
         <test expect_num_outputs="6">
             <conditional name="function">
                 <param name="function_select" value="yahs"/>
-                <param name="fasta" value="test.fasta"/>
-                <param name="bfile" value="test.bed"/>
-            </conditional>
-            <conditional name="enzyme_conditional">
-                <param name="enzyme_options" value="preconfigured"/>
-                <param name="preconfigured_enzymes" value="omnic"/>
+                <param name="fasta" value="test.fasta" ftype="fasta"/>
+                <param name="bfile" value="test.bed" ftype="bed"/>
+                <conditional name="enzyme_conditional">
+                    <param name="enzyme_options" value="preconfigured"/>
+                    <param name="preconfigured_enzymes" value="omnic"/>
+                </conditional>
             </conditional>
             <param name="log_out" value="yes"/>
             <output name="log_file" ftype="txt">
@@ -269,9 +259,46 @@
                 </assert_contents>
             </output>
         </test>
+        <!-- TEST 8: qname_sorted-->
+        <test expect_num_outputs="6">
+            <conditional name="function">
+                <param name="function_select" value="yahs"/>
+                <param name="fasta" value="test2.fasta" ftype="fasta"/>
+                <param name="bfile" value="test3.qname_sorted.bam" ftype="qname_sorted.bam"/>
+                <param name="res" value="1000,2000,5000,10000,20000,50000,100000,200000,500000"/>
+                <conditional name="enzyme_conditional">
+                    <param name="enzyme_options" value="not_specified"/>
+                </conditional>
+            </conditional>
+            <param name="log_out" value="yes"/>
+            <output name="log_file" ftype="txt">
+                <assert_contents>
+                    <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6399 intra links + 0 inter links"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- TEST 9: unsorted-->
+        <test expect_num_outputs="6">
+            <conditional name="function">
+                <param name="function_select" value="yahs"/>
+                <param name="fasta" value="test2.fasta" ftype="fasta"/>
+                <param name="bfile" value="test2.unsorted.bam" ftype="bam"/>
+                <param name="res" value="1000,2000,5000,10000,20000,50000,100000,200000,500000"/>
+                <conditional name="enzyme_conditional">
+                    <param name="enzyme_options" value="not_specified"/>
+                </conditional>
+            </conditional>
+            <param name="log_out" value="yes"/>
+            <output name="log_file" ftype="txt">
+                <assert_contents>
+                    <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6297 intra links + 102 inter links"/>
+                </assert_contents>
+            </output>
+            <!-- COMMAND:  yahs test.fasta test.bam -r 1000,2000,5000,10000,20000,50000,100000,200000,500000 -o test_3 -->
+        </test>
     </tests>
     <help><![CDATA[
-        YaHS is scaffolding tool using Hi-C data. It relies on a new algothrim for contig joining detection which considers the topological distribution of Hi-C signals aiming to distingush real interaction signals from mapping nosies. YaHS has been tested in a wide range of genome assemblies. Compared to other Hi-C scaffolding tools, it usually generates more contiguous scaffolds - especially with a higher N90 and L90 statistics. It is also super fast - takes less than 5 minutes to reconstruct the human genome from an assembly of 5,483 contigs with ~45X Hi-C data.
+        YaHS is a scaffolding tool using Hi-C data. It relies on a new algorithm for contig joining detection which considers the topological distribution of Hi-C signals, aiming to distinguish real interaction signals from mapping noise. YaHS has been tested in a wide range of genome assemblies. Compared to other Hi-C scaffolding tools, it usually generates more contiguous scaffolds - especially with higher N90 and L90 statistics. It is also super fast - it takes less than 5 minutes to reconstruct the human genome from an assembly of 5,483 contigs with ~45X Hi-C data.
     ]]></help>
     <citations>
         <citation type="doi">10.5281/zenodo.5848772</citation>