Mercurial > repos > iuc > yahs
changeset 6:6756b34312cd draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/yahs commit ab918ac1eab72932e78c6e45e46d745543eac810
author | iuc |
---|---|
date | Wed, 17 Sep 2025 06:28:08 +0000 |
parents | ff4031bfaa22 |
children | |
files | test-data/test2.unsorted.bam test-data/test3.qname_sorted.bam yahs.xml |
diffstat | 3 files changed, 80 insertions(+), 53 deletions(-) [+] |
line wrap: on
line diff
--- a/yahs.xml Thu Aug 01 11:41:41 2024 +0000 +++ b/yahs.xml Wed Sep 17 06:28:08 2025 +0000 @@ -2,7 +2,7 @@ <description>yet another HI-C scaffolding tool</description> <macros> <token name="@VERSION@">1.2a.2</token> - <token name="@VERSION_SUFFIX@">2</token> + <token name="@VERSION_SUFFIX@">3</token> </macros> <requirements> <requirement type="package" version="@VERSION@">yahs</requirement> @@ -12,22 +12,13 @@ <command detect_errors="exit_code"><![CDATA[ #if $function.function_select == "yahs": ln -s '$function.fasta' input.fasta && - #if $function.bfile.ext == "bam": - ln -s '$function.bfile' input.bam && - #else if $function.bfile.ext == "bed": - ln -s '$function.bfile' input.bed && - #end if + ln -s '$function.bfile' input.$function.bfile.ext && #if $function.agp: ln -s '$function.agp' input.agp && #end if samtools faidx input.fasta && mkdir initial_break agp_out agp_break final_outs && - yahs --no-mem-check input.fasta - #if $function.bfile.ext == "bam": - input.bam - #else if $function.bfile.ext == "bed": - input.bed - #end if + yahs --no-mem-check input.fasta input.$function.bfile.ext #if $agp: -a input.agp #end if @@ -83,16 +74,16 @@ </param> <when value="yahs"> <param name="fasta" type="data" format="fasta" label="Input contig sequences"/> - <param name="bfile" type="data" format="bam,bed" label="Alignment file of Hi-C reads to contigs"/> - <param name="agp" argument="-a" type="data" format="agp" optional="true" label="Input AGP file (for rescaffolding)" - help="You can specify a AGP format file to ask YaHS to do scaffolding with the scaffolds in the AGP file as the start point"/> - <param name="res" argument="-r" type="text" label="Resolutions" optional="true" - help="Comma separated, ascending list of range of resolutions with no spaces. Ex. 50000,100000,200000,500000,1000000,2000000,5000000. By default and the upper limit is automatically adjusted with the genome size"/> + <param name="bfile" type="data" format="bam,bed,qname_sorted.bam,unsorted.bam" label="Alignment file of Hi-C reads to contigs. NOTE: The input BAM could either be sorted by read names (qname_sorted.bam) or not. The behaviours of the program are slightly different, which might lead to slightly different scaffolding results. For a BAM input sorted by read names, with each mapped read pair, a Hi-C link is counted between the middle positions of the read alignments; while for a BAM input sorted by coordinates or unsorted, Hi-C links are counted between the start positions of the read alignments. Also, for a BAM input not sorted by read names, the mapping quality filtering is suppressed (-q option). If a bed file is provided: the BAM file used to genereate BED file need to be filtered out unmapped reads, supplementary/secondary alignment records, and PCR/optical duplicates, and sorted by read names (otherwise the resulted BED file need to be sorted by the read name column)."/> + <param name="agp" argument="-a" type="data" format="agp" optional="true" label="Input AGP file (for rescaffolding)" help="You can specify a AGP format file to ask YaHS to do scaffolding with the scaffolds in the AGP file as the start point"/> + <param name="res" argument="-r" type="text" label="Resolutions" optional="true" help="Comma separated, ascending list of range of resolutions with no spaces. Ex. 50000,100000,200000,500000,1000000,2000000,5000000. By default and the upper limit is automatically adjusted with the genome size"> + <validator type="regex" message="Only Numbers and commas can be used in to define the list of range of resolutions.">^\d+(,\d+)*$</validator> + </param> <conditional name="enzyme_conditional"> <param name="enzyme_options" type="select" label="Restriction enzyme used in Hi-C experiment" help="Hi-C experiments can use different restriction enzymes. - The enzyme frequency in contigs is used to normalize the Hi-C interaction frequency. Note that you need to specify the actual - sequence of the cutting site for a restriction enzyme and not the enzyme name. You can also specify DNASE as an enzyme if you - use an enzyme-free prep, e.g. Omni-C."> + The enzyme frequency in contigs is used to normalize the Hi-C interaction frequency. Note that you need to specify the actual + sequence of the cutting site for a restriction enzyme and not the enzyme name. You can also specify DNASE as an enzyme if you + use an enzyme-free prep, e.g. Omni-C."> <option value="not_specified">Not specified</option> <option value="preconfigured">Preconfigured restriction enzymes</option> <option value="specific">Enter a specific sequence</option> @@ -107,9 +98,8 @@ </param> </when> <when value="specific"> - <param name="manual_enzyme" argument="-e" type="text" label="Restriction enzyme sequence(s)" - help="Restriction enzyme sequence. If multiple were used, include all as a comma separated list without spaces (ex. 'GATC,AAGCTT')."> - <validator type="expression" message="Only alphabetical letters and the comma can be used in to define restriction enzym sequences.">value.replace(',', '').isalpha()</validator> + <param name="manual_enzyme" argument="-e" type="text" label="Restriction enzyme sequence(s)" help="Restriction enzyme sequence. If multiple were used, include all as a comma separated list without spaces (ex. 'GATC,AAGCTT')."> + <validator type="expression" message="Only alphabetical letters and the comma can be used in to define restriction enzyme sequences.">value.replace(',', '').isalpha()</validator> </param> </when> </conditional> @@ -121,7 +111,7 @@ <when value="agp_to_fasta"> <param name="agp" type="data" format="agp" label="Input AGP file"/> <param name="fasta" type="data" format="fasta" label="Contig fasta file"/> - <param name="length" type='integer' label="Output fasta line length" value="60" min="1"/> + <param name="length" type="integer" label="Output fasta line length" value="60" min="1"/> </when> </conditional> <param name="log_out" type="boolean" label="Output log file?" truevalue="yes" falsevalue="no"/> @@ -159,9 +149,9 @@ <test expect_num_outputs="5"> <conditional name="function"> <param name="function_select" value="yahs"/> - <param name="fasta" value="test.fasta"/> - <param name="bfile" value="test.bed"/> - <param name="agp" value="test.agp"/> + <param name="fasta" value="test.fasta" ftype="fasta"/> + <param name="bfile" value="test.bed" ftype="bed" /> + <param name="agp" value="test.agp" ftype="agp"/> <param name="res" value="50000,100000,150000,2000000,1000000"/> </conditional> <output name="final_agp_out" file="test_01_scaffolds_final.agp" ftype="agp"/> @@ -172,10 +162,10 @@ <test expect_num_outputs="5"> <conditional name="function"> <param name="function_select" value="yahs"/> - <param name="fasta" value="test.fasta"/> - <param name="bfile" value="test.bed"/> - <param name="no_contig_ec" value="--no-contig-ec"/> - <param name="no_scaffold_ec" value="--no-scaffold-ec"/> + <param name="fasta" value="test.fasta" ftype="fasta"/> + <param name="bfile" value="test.bed" ftype="bed"/> + <param name="no_contig_ec" value="true"/> + <param name="no_scaffold_ec" value="true"/> </conditional> <output name="final_agp_out" file="test_02_scaffolds_final.agp" ftype="agp"/> <output name="final_fasta_out" file="test_02_scaffolds_final.fa" ftype="fasta"/> @@ -188,28 +178,28 @@ <test expect_num_outputs="6"> <conditional name="function"> <param name="function_select" value="yahs"/> - <param name="fasta" value="test2.fasta"/> - <param name="bfile" value="test2.bam"/> + <param name="fasta" value="test2.fasta" ftype="fasta"/> + <param name="bfile" value="test2.bam" ftype="bam"/> <param name="res" value="1000,2000,5000,10000,20000,50000,100000,200000,500000"/> <conditional name="enzyme_conditional"> <param name="enzyme_options" value="not_specified"/> </conditional> </conditional> <param name="log_out" value="yes"/> - <output name="log_file" ftype="txt"> - <assert_contents> - <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6297 intra links + 102 inter links" /> - </assert_contents> - </output> + <output name="log_file" ftype="txt"> + <assert_contents> + <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6297 intra links + 102 inter links"/> + </assert_contents> + </output> <!-- COMMAND: yahs test.fasta test.bam -r 1000,2000,5000,10000,20000,50000,100000,200000,500000 -o test_3 --> </test> <!-- TEST 4 --> <test expect_num_outputs="5"> <conditional name="function"> <param name="function_select" value="yahs"/> - <param name="fasta" value="test2.fasta"/> - <param name="bfile" value="test2.bed"/> - <param name="qual" value="10"/> + <param name="fasta" value="test2.fasta" ftype="fasta"/> + <param name="bfile" value="test2.bed" ftype="bed"/> + <param name="quality" value="10"/> <param name="length" value="20"/> <conditional name="enzyme_conditional"> <param name="enzyme_options" value="not_specified"/> @@ -225,8 +215,8 @@ <test expect_num_outputs="1"> <conditional name="function"> <param name="function_select" value="agp_to_fasta"/> - <param name="fasta" value="test.fasta"/> - <param name="agp" value="test.agp"/> + <param name="fasta" value="test.fasta" ftype="fasta"/> + <param name="agp" value="test.agp" ftype="agp"/> <param name="length" value="20"/> </conditional> <output name="fasta_from_agp" file="test_05.fasta" ftype="fasta"/> @@ -236,9 +226,9 @@ <test expect_num_outputs="6"> <conditional name="function"> <param name="function_select" value="yahs"/> - <param name="fasta" value="test.fasta"/> - <param name="bfile" value="test.bed"/> - <param name="agp" value="test.agp"/> + <param name="fasta" value="test.fasta" ftype="fasta"/> + <param name="bfile" value="test.bed" ftype="bed"/> + <param name="agp" value="test.agp" ftype="agp"/> <param name="res" value="50000,100000,150000,2000000,1000000"/> </conditional> <param name="log_out" value="yes"/> @@ -255,12 +245,12 @@ <test expect_num_outputs="6"> <conditional name="function"> <param name="function_select" value="yahs"/> - <param name="fasta" value="test.fasta"/> - <param name="bfile" value="test.bed"/> - </conditional> - <conditional name="enzyme_conditional"> - <param name="enzyme_options" value="preconfigured"/> - <param name="preconfigured_enzymes" value="omnic"/> + <param name="fasta" value="test.fasta" ftype="fasta"/> + <param name="bfile" value="test.bed" ftype="bed"/> + <conditional name="enzyme_conditional"> + <param name="enzyme_options" value="preconfigured"/> + <param name="preconfigured_enzymes" value="omnic"/> + </conditional> </conditional> <param name="log_out" value="yes"/> <output name="log_file" ftype="txt"> @@ -269,9 +259,46 @@ </assert_contents> </output> </test> + <!-- TEST 8: qname_sorted--> + <test expect_num_outputs="6"> + <conditional name="function"> + <param name="function_select" value="yahs"/> + <param name="fasta" value="test2.fasta" ftype="fasta"/> + <param name="bfile" value="test3.qname_sorted.bam" ftype="qname_sorted.bam"/> + <param name="res" value="1000,2000,5000,10000,20000,50000,100000,200000,500000"/> + <conditional name="enzyme_conditional"> + <param name="enzyme_options" value="not_specified"/> + </conditional> + </conditional> + <param name="log_out" value="yes"/> + <output name="log_file" ftype="txt"> + <assert_contents> + <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6399 intra links + 0 inter links"/> + </assert_contents> + </output> + </test> + <!-- TEST 9: unsorted--> + <test expect_num_outputs="6"> + <conditional name="function"> + <param name="function_select" value="yahs"/> + <param name="fasta" value="test2.fasta" ftype="fasta"/> + <param name="bfile" value="test2.unsorted.bam" ftype="bam"/> + <param name="res" value="1000,2000,5000,10000,20000,50000,100000,200000,500000"/> + <conditional name="enzyme_conditional"> + <param name="enzyme_options" value="not_specified"/> + </conditional> + </conditional> + <param name="log_out" value="yes"/> + <output name="log_file" ftype="txt"> + <assert_contents> + <has_text text="[I::dump_links_from_bam_file] dumped 6399 read pairs from 17675 records: 6297 intra links + 102 inter links"/> + </assert_contents> + </output> + <!-- COMMAND: yahs test.fasta test.bam -r 1000,2000,5000,10000,20000,50000,100000,200000,500000 -o test_3 --> + </test> </tests> <help><![CDATA[ - YaHS is scaffolding tool using Hi-C data. It relies on a new algothrim for contig joining detection which considers the topological distribution of Hi-C signals aiming to distingush real interaction signals from mapping nosies. YaHS has been tested in a wide range of genome assemblies. Compared to other Hi-C scaffolding tools, it usually generates more contiguous scaffolds - especially with a higher N90 and L90 statistics. It is also super fast - takes less than 5 minutes to reconstruct the human genome from an assembly of 5,483 contigs with ~45X Hi-C data. + YaHS is scaffolding tool using Hi-C data. It relies on a new algorithm for contig joining detection which considers the topological distribution of Hi-C signals aiming to distingush real interaction signals from mapping nosies. YaHS has been tested in a wide range of genome assemblies. Compared to other Hi-C scaffolding tools, it usually generates more contiguous scaffolds - especially with a higher N90 and L90 statistics. It is also super fast - takes less than 5 minutes to reconstruct the human genome from an assembly of 5,483 contigs with ~45X Hi-C data. ]]></help> <citations> <citation type="doi">10.5281/zenodo.5848772</citation>