Mercurial > repos > iuc > concoct
diff concoct.xml @ 3:3842ef1b2f34 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 1a79c139165648b969d82530784cea3fc8f2d2c0"
author | iuc |
---|---|
date | Thu, 07 Jul 2022 08:33:35 +0000 |
parents | 7a145c72d375 |
children | 28e8d2bd6aba |
line wrap: on
line diff
--- a/concoct.xml Fri Jul 01 14:14:44 2022 +0000 +++ b/concoct.xml Thu Jul 07 08:33:35 2022 +0000 @@ -1,99 +1,140 @@ <tool id="concoct" name="CONCOCT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> - <description>metagenome binning</description> + <description>for metagenome binning</description> <macros> <import>macros.xml</import> </macros> <expand macro="requirements"/> <command detect_errors="exit_code"><![CDATA[ -#set pca_components_file_name = 'PCA_components_data_gt' + str($advanced.length_threshold) + '.csv' -#set pca_transformed_file_name = 'PCA_transformed_data_gt' + str($advanced.length_threshold) + '.csv' -#set clustering_file_name = 'clustering_gt' + str($advanced.length_threshold) + '.csv' - ## CONCOCT doesn't handle gzipped files. #if $composition_file.ext.endswith(".gz") - gunzip -c '$composition_file' > composition_file.fa && + gunzip -c '$composition_file' > 'composition_file.fa' && #else: - ln -s '$composition_file' composition_file.fa && + ln -s '$composition_file' 'composition_file.fa' && #end if mkdir outdir && concoct ---coverage_file '$coverage_file' ---composition_file composition_file.fa ---clusters $advanced.clusters ---kmer_length $advanced.kmer_length ---threads \${GALAXY_SLOTS:-4} ---length_threshold $advanced.length_threshold ---read_length $advanced.read_length ---total_percentage_pca $advanced.total_percentage_pca ---basename 'outdir/' ---seed $advanced.seed ---iterations $advanced.iterations ---epsilon $advanced.epsilon -$advanced.no_cov_normalization -$advanced.no_total_coverage ---no_original_data -$advanced.converge_out - -## Convert all CONCOCT .csv outputs to tabular. 
-&& sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$pca_components_file_name > '$output_pca_components' -&& sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$pca_transformed_file_name > '$output_pca_transformed' -&& sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$clustering_file_name > '$output_clustering' -#if str($advanced.output_process_log) == 'yes': - && mv outdir/log.txt '$process_log' -#end if + --coverage_file '$coverage_file' + --composition_file 'composition_file.fa' + --clusters $advanced.clusters + --kmer_length $advanced.kmer_length + --threads \${GALAXY_SLOTS:-4} + --length_threshold $advanced.length_threshold + --read_length $advanced.read_length + --total_percentage_pca $advanced.total_percentage_pca + --basename 'outdir/' + --seed $advanced.seed + --iterations $advanced.iterations + $advanced.no_cov_normalization + $output.no_total_coverage + --no_original_data + $output.converge_out ]]></command> <inputs> - <param argument="--coverage_file" type="data" format="tabular" label="Tabular coverage file" help="Columns correspond to samples and rows to contigs"/> - <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Fasta file" help="Used to calculate the kmer composition (the genomic signature) of each contig"/> + <param argument="--coverage_file" type="data" format="tabular" label="Coverage file" help="Table where each row corresponds to a contig, and each column corresponds to a sample. 
The values are the average coverage for this contig in that sample"/> + <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Composition file with sequences" help="It is named the composition file since it is used to calculate the kmer composition (the genomic signature) of each contig."/> <section name="advanced" title="Advanced options"> - <param argument="--clusters" type="integer" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model algorithm"/> - <param argument="--kmer_length" type="integer" value="4" label="Kmer length"/> - <param argument="--length_threshold" type="integer" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/> - <param argument="--read_length" type="integer" value="100" label="Read length for coverage"/> - <param argument="--total_percentage_pca" type="integer" value="100" label="Percentage of variance explained by the principal components for the combined data"/> - <param argument="--seed" type="integer" min="0" value="1" label="Integer to use as seed for clustering" help="Zero value will use random seed"/> - <param argument="--iterations" type="integer" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models"/> - <param argument="--epsilon" type="float" value="0.000001" label="Epsilon for the Variational Gaussian Mixture Model algorithm"/> - <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transorm of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed"/> - <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" 
help="By default, total coverage is included, independently of coverage normalization but previous to log transformation"/> - <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Output convergence information?"/> - <param name="output_process_log" type="select" label="Output process log file?"> - <option value="no" selected="true">No</option> - <option value="yes">Yes</option> - </param> + <param argument="--clusters" type="integer" min="0" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model (VGMM) algorithm"/> + <param argument="--kmer_length" type="integer" min="0" value="4" label="Kmer length"/> + <param argument="--length_threshold" type="integer" min="0" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/> + <param argument="--read_length" type="integer" min="0" value="100" label="Read length for coverage"/> + <param argument="--total_percentage_pca" type="integer" min="0" value="100" label="Percentage of variance explained by the principal components for the combined data"/> + <param argument="--seed" type="integer" min="0" value="1" label="Seed for clustering" help="Zero value will use random seed"/> + <param argument="--iterations" type="integer" min="0" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models (VBGMM)"/> + <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transform of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed. 
By setting this flag you skip the normalization and only do log transform of the coverage."/> + </section> + <section name="output" title="Output"> + <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" help="By default, total coverage is included, independently of coverage normalization but previous to log transformation. Use this flag to disable this behaviour."/> + <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Write convergence information to files?"/> + <param name="log" type="boolean" checked="false" label="Output process log file?"/> </section> </inputs> <outputs> - <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> - <filter>advanced['output_process_log'] == 'yes'</filter> + <data name="output_clustering" format="csv" from_work_dir="outdir/clustering_gt*" label="${tool.name} on ${on_string}: Clusters"/> + <data name="process_log" format="txt" from_work_dir="outdir/log.txt" label="${tool.name} on ${on_string}: Log"> + <filter>output['log']</filter> </data> - <data name="output_pca_components" format="tabular" label="${tool.name} on ${on_string} (PCA components)"/> - <data name="output_pca_transformed" format="tabular" label="${tool.name} on ${on_string} (PCA transformed)"/> - <data name="output_clustering" format="tabular" label="${tool.name} on ${on_string} (Clusters)"/> + <data name="output_pca_components" format="csv" from_work_dir="outdir/PCA_components_data_gt*" label="${tool.name} on ${on_string}: PCA components"/> + <data name="output_pca_transformed" format="csv" from_work_dir="outdir/PCA_transformed_data_gt*" label="${tool.name} on ${on_string}: PCA transformed clusters"/> </outputs> <tests> <test expect_num_outputs="4"> - <param name="coverage_file" value="input1.tabular" ftype="tabular"/> - <param 
name="composition_file" value="input1.fa.gz" ftype="fasta.gz"/> - <param name="output_process_log" value="yes"/> - <output name="process_log" file="process_log.txt" ftype="txt" compare="re_match"/> - <output name="output_pca_components" ftype="tabular"> + <param name="coverage_file" value="coverage" ftype="tabular"/> + <param name="composition_file" value="composition.fa" ftype="fasta"/> + <section name="advanced"> + <param name="clusters" value="400"/> + <param name="kmer_length" value="4"/> + <param name="length_threshold" value="1000"/> + <param name="read_length" value="100"/> + <param name="total_percentage_pca" value="100"/> + <param name="seed" value="1"/> + <param name="iterations" value="500"/> + <param name="no_cov_normalization" value=""/> + </section> + <section name="output"> + <param name="no_total_coverage" value=""/> + <param name="converge_out" value=""/> + <param name="log" value="true"/> + </section> + <output name="process_log" ftype="txt" compare="contains"> <assert_contents> - <has_size value="367636"/> - <has_text text="7377051e-02"/> + <has_size value="786"/> + <has_text text="CONCOCT Finished"/> + </assert_contents> + </output> + <output name="output_pca_components" ftype="csv"> + <assert_contents> + <has_size value="362924" delta="10"/> + <has_text text="-5.90697200e-02"/> + </assert_contents> + </output> + <output name="output_pca_transformed" ftype="csv"> + <assert_contents> + <has_size value="834200" delta="10"/> + <has_text text="contig-21000001"/> </assert_contents> </output> - <output name="output_pca_transformed" ftype="tabular"> + <output name="output_clustering" ftype="csv"> <assert_contents> - <has_size value="737926"/> - <has_text text="NODE_103_length_20202_cov_8.395357.0"/> + <has_size value="6923" delta="10"/> + <has_text text="contig-21000001,"/> </assert_contents> </output> - <output name="output_clustering" ftype="tabular"> + </test> + <test expect_num_outputs="3"> + <param name="coverage_file" value="coverage" 
ftype="tabular"/> + <param name="composition_file" value="composition.fa.gz" ftype="fasta.gz"/> + <section name="advanced"> + <param name="clusters" value="400"/> + <param name="kmer_length" value="4"/> + <param name="length_threshold" value="1000"/> + <param name="read_length" value="100"/> + <param name="total_percentage_pca" value="100"/> + <param name="seed" value="1"/> + <param name="iterations" value="500"/> + <param name="no_cov_normalization" value=""/> + </section> + <section name="output"> + <param name="no_total_coverage" value=""/> + <param name="converge_out" value=""/> + <param name="log" value="false"/> + </section> + <output name="output_pca_components" ftype="csv"> <assert_contents> - <has_size value="12167"/> - <has_text text="NODE_103_length_20202_cov_8.395357"/> + <has_size value="362924" delta="10"/> + <has_text text="-5.90697200e-02"/> + </assert_contents> + </output> + <output name="output_pca_transformed" ftype="csv"> + <assert_contents> + <has_size value="834200" delta="10"/> + <has_text text="contig-21000001"/> + </assert_contents> + </output> + <output name="output_clustering" ftype="csv"> + <assert_contents> + <has_size value="6923" delta="10"/> + <has_text text="contig-21000001,"/> </assert_contents> </output> </test> @@ -103,7 +144,7 @@ CONCOCT (Clustering cONtigs with COverage and ComposiTion) performs unsupervised binning of metagenomic contigs by using nucleotide composition - kmer frequencies - and coverage data for multiple samples. CONCOCT can accurately -(up to species level) bin metagenomic contigs. +(up to species level) bin metagenomic contigs. The tool accepts 2 inputs; a tabular file where each row corresponds to a contig and each column corresponds to a sample (the values are the average coverage for this contig in that sample) and a file containing sequences in