Mercurial > repos > bgruening > hifiasm
view hifiasm.xml @ 2:f3c89da3af16 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/hifiasm commit 5a03c44d52d545bd23e651bfba1cc73050eec9a0"
author | bgruening |
---|---|
date | Sun, 20 Jun 2021 21:18:55 +0000 |
parents | 6505bd37670d |
children | 9ef6920c3089 |
line wrap: on
line source
<tool id="hifiasm" name="Hifiasm" version="@VERSION@+galaxy0">
    <description>haplotype-resolved de novo assembler for PacBio Hifi reads</description>
    <!-- Shared tokens and the "reads" parameter reused by both assembly modes. -->
    <macros>
        <token name="@VERSION@">0.15.3</token>
        <token name="@FORMATS@">fasta,fasta.gz,fastq,fastq.gz</token>
        <xml name="reads">
            <param name="reads" type="data" format="@FORMATS@" multiple="true" label="Input reads" />
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="@VERSION@">hifiasm</requirement>
        <requirement type="package" version="0.1">yak</requirement>
    </requirements>
    <version_command>hifiasm --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        ## Link every input dataset into the working directory under a name that
        ## carries the datatype extension, and remember the link names.
        #set $input_files = list()
        #set $hap1_inputs = list()
        #set $hap2_inputs = list()
        #for idx, read in enumerate($mode.reads):
            #set $inputfile = 'input_%d.%s' % ($idx, $read.dataset.extension)
            ln -s '$read' $inputfile &&
            #silent $input_files.append($inputfile)
        #end for
        #set $input_filenames = ' '.join($input_files)

        ## Trio mode: link the parental reads and count their k-mers with yak;
        ## the resulting hap1.yak / hap2.yak files are passed to hifiasm below.
        #if str($mode.mode_selector) == 'trio':
            #for idx, read in enumerate($mode.hap1_reads):
                #set $inputfile = 'hap1_input_%d.%s' % ($idx, $read.dataset.extension)
                ln -s '$read' $inputfile &&
                #silent $hap1_inputs.append($inputfile)
            #end for
            #for idx, read in enumerate($mode.hap2_reads):
                #set $inputfile = 'hap2_input_%d.%s' % ($idx, $read.dataset.extension)
                ln -s '$read' $inputfile &&
                #silent $hap2_inputs.append($inputfile)
            #end for
            #set $hap1_filenames = ' '.join($hap1_inputs)
            #set $hap2_filenames = ' '.join($hap2_inputs)
            yak count -k$mode.yak_kmer_length -b$filter_bits -t\${GALAXY_SLOTS:-1} -o hap1.yak $hap1_filenames &&
            yak count -k$mode.yak_kmer_length -b$filter_bits -t\${GALAXY_SLOTS:-1} -o hap2.yak $hap2_filenames &&
        #end if

        hifiasm
        -t \${GALAXY_SLOTS:-1}
        -o output
        -f $filter_bits

        ## Optional numeric parameters are tested with str(...) != '' so that an
        ## explicitly entered 0 is still passed on to hifiasm.
        #if str($advanced_options.advanced_selector) == 'set':
            -k $advanced_options.hifiasm_kmer_length
            -w $advanced_options.window_size
            -D $advanced_options.drop_kmers
            -N $advanced_options.max_overlaps
            -r $advanced_options.correction_rounds
            #if str($advanced_options.min_hist_cnt) != '':
                --min-hist-cnt $advanced_options.min_hist_cnt
            #end if
        #end if

        #if str($assembly_options.assembly_selector) == 'set':
            -a $assembly_options.cleaning_rounds
            -z $assembly_options.adapter_length
            -m $assembly_options.pop_contigs
            -p $assembly_options.pop_unitigs
            -n $assembly_options.remove_tips
            -x $assembly_options.max_overlap
            -y $assembly_options.min_overlap
            $assembly_options.disable_post_join
            $assembly_options.ignore_error_corrected
        #end if

        #if str($mode.mode_selector) == 'trio':
            -1 hap1.yak
            -2 hap2.yak
            -c $mode.max_kmers
            -d $mode.min_kmers
        #end if

        #if str($purge_options.purge_selector) == 'set':
            -l $purge_options.purge_level
            -s $purge_options.similarity_threshold
            -O $purge_options.minimum_overlap
            #if str($purge_options.purge_cov) != '':
                --purge-cov $purge_options.purge_cov
            #end if
            #if str($purge_options.n_hap) != '':
                --n-hap $purge_options.n_hap
            #end if
        #end if

        #if str($hic_partition.hic_partition_selector) == 'set':
            ## hifiasm takes the Hi-C files of each mate as ONE comma-separated
            ## argument (r1_1.fq,r1_2.fq,...), so join with ',' rather than ' '.
            #set $hic_r1_files = ','.join([str($f) for $f in $hic_partition.h1])
            #set $hic_r2_files = ','.join([str($f) for $f in $hic_partition.h2])
            --h1 '$hic_r1_files'
            --h2 '$hic_r2_files'
            #if str($hic_partition.seed) != '':
                --seed $hic_partition.seed
            #end if
            #if str($hic_partition.n_weight) != '':
                --n-weight $hic_partition.n_weight
            #end if
            #if str($hic_partition.n_perturb) != '':
                --n-perturb $hic_partition.n_perturb
            #end if
            #if str($hic_partition.f_perturb) != '':
                --f-perturb $hic_partition.f_perturb
            #end if
        #end if

        ## By default hifiasm writes a primary assembly and two balanced haplotypes.
        ## '--primary' switches it to primary + alternate assemblies, which is what
        ## the standard-mode outputs below collect.
        --primary
        $input_filenames
    ]]></command>
    <inputs>
        <!-- Assembly mode: standard, or trio binning with parental reads. -->
        <conditional name="mode">
            <param name="mode_selector" type="select" label="Assembly mode">
                <option value="standard">Standard</option>
                <option value="trio">Trio mode</option>
            </param>
            <when value="standard">
                <expand macro="reads" />
            </when>
            <when value="trio">
                <expand macro="reads" />
                <!-- Parental reads are only handed to "yak count"; the same formats as the main reads are accepted. -->
                <param name="hap1_reads" type="data" format="@FORMATS@" multiple="true" label="Haplotype 1 reads" />
                <param name="hap2_reads" type="data" format="@FORMATS@" multiple="true" label="Haplotype 2 reads" />
                <!-- NOTE: despite their names, max_kmers maps to -c (lower bound) and min_kmers to -d (upper bound);
                     the names are kept for compatibility with existing workflows. -->
                <param name="max_kmers" argument="-c" type="integer" value="2" label="Lower bound of the binned k-mer's frequency" />
                <param name="min_kmers" argument="-d" type="integer" value="5" label="Upper bound of the binned k-mer's frequency" />
                <param name="yak_kmer_length" type="integer" min="0" max="64" value="31" label="Yak counter k-mer length" />
            </when>
        </conditional>
        <!-- Used for hifiasm -f and, in trio mode, for yak count -b. -->
        <param name="filter_bits" argument="-f" type="integer" min="0" value="37" label="Bits for bloom filter" help="A value of 0 disables the bloom filter" />
        <conditional name="advanced_options">
            <param name="advanced_selector" type="select" label="Advanced options">
                <option value="blank">Leave default</option>
                <option value="set">Specify</option>
            </param>
            <when value="blank" />
            <when value="set">
                <param name="hifiasm_kmer_length" argument="-k" type="integer" min="0" max="64" value="51" label="HiFiasm k-mer length" />
                <param name="window_size" argument="-w" type="integer" min="0" value="51" label="Minimizer window size" />
                <param name="drop_kmers" argument="-D" type="float" value="5.0" label="Drop k-mers"
                       help="K-mers that occur more than this value multiplied by the coverage will be discarded" />
                <param name="max_overlaps" argument="-N" type="integer" value="100" label="Maximum overlaps to consider"
                       help="The software selects the larger of this value and the k-mer count multiplied by coverage" />
                <param name="correction_rounds" argument="-r" type="integer" value="3" label="Correction rounds" />
                <param argument="--min-hist-cnt" type="integer" min="0" value="" optional="true" label="Minimum count threshold"
                       help="When analyzing the k-mer spectrum, ignore counts below this value" />
            </when>
        </conditional>
        <conditional name="assembly_options">
            <param name="assembly_selector" type="select" label="Assembly options">
                <option value="blank">Leave default</option>
                <option value="set">Specify</option>
            </param>
            <when value="blank" />
            <when value="set">
                <param name="cleaning_rounds" argument="-a" type="integer" value="4" label="Cleaning rounds" />
                <param name="adapter_length" argument="-z" type="integer" min="0" value="0" label="Length of adapters to be removed" />
                <param name="pop_contigs" argument="-m" type="integer" value="10000000" label="Minimum contig bubble size"
                       help="Pop contig graph bubbles smaller than this value" />
                <param name="pop_unitigs" argument="-p" type="integer" value="100000" label="Minimum unitig bubble size"
                       help="Pop unitig graph bubbles smaller than this value" />
                <param name="remove_tips" argument="-n" type="integer" value="3" label="Tip unitigs"
                       help="Keep only tip unitigs with a number of reads greater than or equal to this value" />
                <param name="max_overlap" argument="-x" type="float" min="0" max="1" value="0.8" label="Maximum overlap drop ratio"
                       help="This option is used with -r. Given a node N in the assembly graph, let max(N) be the length of the largest overlap of N. Hifiasm iteratively drops overlaps of N if their length/max(N) are below a threshold controlled by -x. Hifiasm applies -r rounds of short overlap removal with an increasing threshold between -x and -y" />
                <param name="min_overlap" argument="-y" type="float" min="0" max="1" value="0.2" label="Minimum overlap drop ratio"
                       help="This option is used with -r. Given a node N in the assembly graph, let max(N) be the length of the largest overlap of N. Hifiasm iteratively drops overlaps of N if their length/max(N) are over a threshold controlled by -y. Hifiasm applies -r rounds of short overlap removal with an increasing threshold between -x and -y" />
                <param name="disable_post_join" argument="-u" type="boolean" truevalue="-u" falsevalue="" label="Skip post join contigs step" help="May improve N50" />
                <param name="ignore_error_corrected" argument="-i" type="boolean" truevalue="-i" falsevalue="" checked="false" label="Ignore error corrected reads and overlaps"
                       help="Ignore error corrected reads and overlaps saved in prefix.*.bin files. Apart from assembly graphs, hifiasm also outputs three binary files that save all overlap information during assembly step. With these files, hifiasm can avoid the time-consuming all-to-all overlap calculation step, and do the assembly directly and quickly. This might be helpful when users want to get an optimized assembly by multiple rounds of experiments with different parameters." />
            </when>
        </conditional>
        <conditional name="purge_options">
            <param name="purge_selector" type="select" label="Options for purging duplicates">
                <option value="blank">Leave default</option>
                <option value="set">Specify</option>
            </param>
            <when value="blank" />
            <when value="set">
                <param name="purge_level" argument="-l" type="select" label="Purge level">
                    <option value="0" selected="true">None (0)</option>
                    <option value="1">Light (1)</option>
                    <option value="2">Aggressive (2)</option>
                    <option value="3">Aggressive - high heterozygosity rate (3)</option>
                </param>
                <param name="similarity_threshold" argument="-s" type="float" min="0" max="1" value="0.75" label="Similarity threshold for duplicate haplotigs" />
                <param name="minimum_overlap" argument="-O" type="integer" value="1" label="Minimum overlapped reads for duplicate haplotigs" />
                <param argument="--purge-cov" type="integer" optional="true" label="Coverage upper bound" help="If not set, this will be determined automatically" />
                <param argument="--n-hap" type="integer" min="0" value="" optional="true" label="Assumption of haplotype number"
                       help="A haplotype is defined as the combination of alleles for different polymorphisms that occur on the same chromosome." />
            </when>
        </conditional>
        <conditional name="hic_partition">
            <param name="hic_partition_selector" type="select" label="Options for Hi-C-partition">
                <option value="blank">Leave default</option>
                <option value="set">Specify</option>
            </param>
            <when value="blank" />
            <when value="set">
                <param argument="--h1" type="data" format="fastq,fastq.gz" multiple="true" label="Hi-C R1 reads" />
                <param argument="--h2" type="data" format="fastq,fastq.gz" multiple="true" label="Hi-C R2 reads" />
                <param argument="--seed" type="integer" min="1" value="" optional="true" label="RNG seed" />
                <param argument="--n-weight" type="integer" min="1" value="" optional="true" label="Rounds of reweighting Hi-C links" />
                <param argument="--n-perturb" type="integer" min="1" value="" optional="true" label="Rounds of perturbation" />
                <param argument="--f-perturb" type="float" min="0" max="1" value="" optional="true" label="Fraction to flip for perturbation" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Standard-mode and trio-mode outputs are mutually exclusive, selected by the assembly mode. -->
        <data name="raw_unitigs" format="gfa1" from_work_dir="output.r_utg.gfa" label="${tool.name} on ${on_string}, haplotype-resolved raw unitig graph">
            <filter>mode['mode_selector'] == 'standard'</filter>
        </data>
        <data name="raw_unitigs_trio" format="gfa1" from_work_dir="output.dip.r_utg.gfa" label="${tool.name} on ${on_string}, haplotype-resolved raw unitig graph">
            <filter>mode['mode_selector'] == 'trio'</filter>
        </data>
        <data name="processed_unitigs" format="gfa1" from_work_dir="output.p_utg.gfa" label="${tool.name} on ${on_string}, processed unitig graph">
            <filter>mode['mode_selector'] == 'standard'</filter>
        </data>
        <data name="primary_contig_graph" format="gfa1" from_work_dir="output.p_ctg.gfa" label="${tool.name} on ${on_string}, primary assembly contig graph">
            <filter>mode['mode_selector'] == 'standard'</filter>
        </data>
        <data name="alternate_contig_graph" format="gfa1" from_work_dir="output.a_ctg.gfa" label="${tool.name} on ${on_string}, alternate assembly contig graph">
            <filter>mode['mode_selector'] == 'standard'</filter>
        </data>
        <data name="hap1_contigs" format="gfa1" from_work_dir="output.hap1.p_ctg.gfa" label="${tool.name} on ${on_string}, hap1.p_ctg contig graph">
            <filter>mode['mode_selector'] == 'trio'</filter>
        </data>
        <data name="hap2_contigs" format="gfa1" from_work_dir="output.hap2.p_ctg.gfa" label="${tool.name} on ${on_string}, hap2.p_ctg contig graph">
            <filter>mode['mode_selector'] == 'trio'</filter>
        </data>
        <!-- Hi-C partition outputs -->
        <data name="hic_contig_graph" format="gfa1" from_work_dir="output.hic.p_ctg.gfa" label="${tool.name} on ${on_string}, Hi-C contig graph">
            <filter>hic_partition['hic_partition_selector'] == 'set'</filter>
        </data>
        <data name="hic_balanced_contig_hap1_graph" format="gfa1" from_work_dir="output.bp.hap1.p_ctg.gfa" label="${tool.name} on ${on_string}, Hi-C hap1 balanced contig graph">
            <filter>hic_partition['hic_partition_selector'] == 'set'</filter>
        </data>
        <data name="hic_balanced_contig_hap2_graph" format="gfa1" from_work_dir="output.bp.hap2.p_ctg.gfa" label="${tool.name} on ${on_string}, Hi-C hap2 balanced contig graph">
            <filter>hic_partition['hic_partition_selector'] == 'set'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test standard mode with a single input -->
        <test>
            <param name="reads" value="hifiasm-in1.fa.gz" ftype="fasta.gz" />
            <param name="filter_bits" value="0" />
            <param name="mode_selector" value="standard" />
            <output name="raw_unitigs" file="hifiasm-out1-raw.gfa" ftype="gfa1" />
            <output name="processed_unitigs" file="hifiasm-out1-processed.gfa" ftype="gfa1" />
            <output name="primary_contig_graph" file="hifiasm-out1-primary.gfa" ftype="gfa1" />
            <output name="alternate_contig_graph" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test standard mode with multiple inputs -->
        <test>
            <param name="reads" value="hifiasm-in2-0.fa.gz,hifiasm-in2-1.fa.gz,hifiasm-in2-2.fa.gz,hifiasm-in2-3.fa.gz,hifiasm-in2-4.fa.gz" ftype="fasta.gz" />
            <param name="filter_bits" value="0" />
            <param name="mode_selector" value="standard" />
            <output name="raw_unitigs" file="hifiasm-out2-raw.gfa" ftype="gfa1" />
            <output name="processed_unitigs" file="hifiasm-out2-processed.gfa" ftype="gfa1" />
            <output name="primary_contig_graph" file="hifiasm-out2-primary.gfa" ftype="gfa1" />
            <output name="alternate_contig_graph" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test Hi-C
        <test>
            <param name="reads" value="hifiasm-in1.fa.gz" ftype="fasta.gz" />
            <param name="filter_bits" value="0" />
            <param name="mode_selector" value="standard" />
            <conditional name="hic_partition">
                <param name="hic_partition_selector" value="set"/>
                <param name="h1" value="r1_1.fq"/>
                <param name="h2" value="r2_1.fq"/>
            </conditional>
            <output name="raw_unitigs" file="hifiasm-out3-raw.gfa" ftype="gfa1" />
            <output name="processed_unitigs" file="hifiasm-out3-processed.gfa" ftype="gfa1" />
            <output name="primary_contig_graph" file="hifiasm-out3-primary.gfa" ftype="gfa1" />
            <output name="alternate_contig_graph" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        -->
        <!-- Test trio mode -->
        <test>
            <param name="filter_bits" value="0"/>
            <conditional name="mode">
                <param name="mode_selector" value="trio"/>
                <param name="reads" value="child.fasta.gz"/>
                <param name="hap1_reads" value="paternal.fasta.gz"/>
                <param name="hap2_reads" value="maternal.fasta.gz"/>
                <param name="max_kmers" value="2"/>
                <param name="min_kmers" value="5"/>
            </conditional>
            <output name="raw_unitigs_trio" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
            <output name="hap1_contigs">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
            <output name="hap2_contigs">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test ignore-error-corrected option -->
        <test>
            <param name="reads" value="hifiasm-in1.fa.gz" ftype="fasta.gz" />
            <param name="filter_bits" value="0" />
            <param name="mode_selector" value="standard" />
            <conditional name="assembly_options">
                <param name="assembly_selector" value="set"/>
                <param name="ignore_error_corrected" value="True"/>
            </conditional>
            <output name="raw_unitigs" file="hifiasm-out3-raw.gfa" ftype="gfa1" />
            <output name="processed_unitigs" file="hifiasm-out3-processed.gfa" ftype="gfa1" />
            <output name="primary_contig_graph" file="hifiasm-out3-primary.gfa" ftype="gfa1" />
            <output name="alternate_contig_graph" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test expected haplotype number -->
        <test>
            <param name="reads" value="hifiasm-in1.fa.gz" ftype="fasta.gz" />
            <param name="filter_bits" value="0" />
            <param name="mode_selector" value="standard" />
            <conditional name="purge_options">
                <param name="purge_selector" value="set"/>
                <param name="n_hap" value="1"/>
            </conditional>
            <output name="raw_unitigs" file="hifiasm-out4-raw.gfa" ftype="gfa1" />
            <output name="processed_unitigs" file="hifiasm-out4-processed.gfa" ftype="gfa1" />
            <output name="primary_contig_graph" file="hifiasm-out4-primary.gfa" ftype="gfa1" />
            <output name="alternate_contig_graph" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test min_hist_cnt option -->
        <test>
            <param name="reads" value="hifiasm-in1.fa.gz" ftype="fasta.gz" />
            <param name="filter_bits" value="0" />
            <param name="mode_selector" value="standard" />
            <conditional name="advanced_options">
                <param name="advanced_selector" value="set"/>
                <param name="min_hist_cnt" value="1"/>
            </conditional>
            <output name="raw_unitigs" file="hifiasm-out5-raw.gfa" ftype="gfa1" />
            <output name="processed_unitigs" file="hifiasm-out5-processed.gfa" ftype="gfa1" />
            <output name="primary_contig_graph" file="hifiasm-out5-primary.gfa" ftype="gfa1" />
            <output name="alternate_contig_graph" ftype="gfa1">
                <assert_contents>
                    <has_size value="0"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
***********************************
HiFiASM - a fast de novo assembler
***********************************

Hifiasm is a fast haplotype-resolved de novo assembler for PacBio Hifi reads. It can assemble a human genome in several hours and works with the California redwood genome, one of the most complex genomes sequenced so far. Hifiasm can produce primary/alternate assemblies of quality competitive with the best assemblers. It also introduces a new graph binning algorithm and achieves the best haplotype-resolved assembly given trio data.

#### Assembly mode

- *Standard*
- *Trio* When parental short reads are available, hifiasm can generate a pair of haplotype-resolved assemblies with trio binning.

#### Trio Options

- *Haplotype 1 reads* : hap1/paternal reads; their k-mers are counted with yak
- *Haplotype 2 reads* : hap2/maternal reads; their k-mers are counted with yak
- *Lower bound of the binned k-mer's frequency*
- *Upper bound of the binned k-mer's frequency*

*Bits for bloom filter* (-f) - A value of 0 disables the bloom filter for small genomes. For genomes much larger than human, applying -f 38 or even -f 39 is preferred to save memory on k-mer counting.

#### Advanced options

- *K-mer length* (must be <64)
- *Minimizer window size*
- *Drop K-mers* K-mers that occur more than this value multiplied by the coverage will be discarded
- *Maximum overlaps to consider* consider up to max(-D*coverage,-N) overlaps for each oriented read
- *Correction rounds* round of correction

#### Assembly options

- *Cleaning rounds* round of assembly cleaning
- *Length of adapters to be removed* Old HiFi reads may contain short adapter sequences at the ends of reads. You can specify 20 to trim both ends of reads by 20bp.
- *Minimum contig bubble size* Pop contig graph bubbles smaller than this value
- *Minimum unitig bubble size* Pop unitig graph bubbles smaller than this value
- *Tip unitigs* Keep only tip unitigs with a number of reads greater than or equal to this value
- *Maximum overlap drop ratio*
- *Minimum overlap drop ratio*
- *Skip post join contigs step* disable post join contigs step which may improve N50

#### Options for purging duplicates

- *Purge level* 0: no purging; 1: light; 2: aggressive; 3: aggressive, for a high heterozygosity rate [0 for trio; 2 for unzip]
- *Similarity threshold for duplicate haplotigs*
- *Minimum overlapped reads for duplicate haplotigs*
- *Coverage upper bound* If not set, this will be determined automatically
- *Experimental high-heterozygosity mode* enable this mode for high heterozygosity sample NB: May be unstable

#### Hi-C-partition options

- *RNG seed*
- *Rounds of reweighting Hi-C links* : increasing this may improve phasing results but takes longer.
- *Rounds of perturbation* : increasing this may improve phasing results but takes longer.
- *Fraction to flip for perturbation* : increasing this may improve phasing results but takes longer.

### Outputs

Non Trio assembly

- Haplotype-resolved raw unitig graph in GFA format. This graph keeps all haplotype information, including somatic mutations and recurrent sequencing errors.
- Haplotype-resolved processed unitig graph without small bubbles : Small bubbles might be caused by somatic mutations or noise in data, which are not the real haplotype information.
- Primary assembly contig graph : This graph collapses different haplotypes.
- Alternate assembly contig graph : This graph consists of all assemblies that are discarded in primary contig graph.

Trio assembly

- Haplotype-resolved raw unitig graph in GFA format. This graph keeps all haplotype information.
- Phased paternal/haplotype1 contig graph. This graph keeps the phased paternal/haplotype1 assembly.
- Phased maternal/haplotype2 contig graph. This graph keeps the phased maternal/haplotype2 assembly.
    ]]></help>
    <citations>
        <citation type="doi">10.1038/s41592-020-01056-5</citation>
    </citations>
</tool>