Mercurial > repos > iuc > scanpy_remove_confounders

diff remove_confounders.xml @ 17:b70219da4a96 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 91121b1e72696f17478dae383badaa71e9f96dbb
author: iuc
date: Sat, 14 Sep 2024 12:43:38 +0000
parents: c65fce500915
--- a/remove_confounders.xml	Tue Aug 20 09:51:42 2024 +0000
+++ b/remove_confounders.xml	Sat Sep 14 12:43:38 2024 +0000
@@ -1,67 +1,164 @@
-<tool id="scanpy_remove_confounders" name="Remove confounders" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
+<tool id="scanpy_remove_confounders" name="Scanpy remove confounders" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>with scanpy</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="bio_tools"/>
-    <expand macro="requirements"/>
+    <expand macro="requirements">
+        <requirement type="package" version="1.6.0">bbknn</requirement>
+        <requirement type="package" version="0.0.10">harmonypy</requirement>
+        <requirement type="package" version="1.7.4">scanorama</requirement>
+        <requirement type="package" version="0.5.13">pynndescent</requirement>
+    </expand>
     <command detect_errors="exit_code"><![CDATA[
 @CMD@
       ]]></command>
     <configfiles>
         <configfile name="script_file"><![CDATA[
-@CMD_imports@
-@CMD_read_inputs@
+@CMD_IMPORTS@
+@CMD_READ_INPUTS@
 
-#if $method.method == "pp.regress_out"
+#if str($method.method) == 'pp.regress_out':
+print("stats before regress_out:", "min=", adata.X.min(), "max=", adata.X.max(), "mean=", adata.X.mean())
+
+import os
 sc.pp.regress_out(
    adata=adata,
+   #if str($method.layer) != '':
+   layer='$method.layer',
+   #end if
    #set $keys = [str(x.strip()) for x in str($method.keys).split(',')]
    keys=$keys,
+   n_jobs = int(os.getenv("GALAXY_SLOTS", 4)),
    copy=False)
 
-#else if $method.method == "pp.mnn_correct"
-    #for i, filepath in enumerate($methods.extra_adata)
-adata_$i = ad.read('$filepath')
-    #end for
+print("stats after regress_out:", "min=", adata.X.min(), "max=", adata.X.max(), "mean=", adata.X.mean())
 
-sc.pp.mnn_correct(
-    adata,
-    #for i, filepath in enumerate($methods.extra_adata)
-    adata_$i,
-    #end for
-    #if $methods.var_subset
-    #set $var_subset=([x.strip() for x in str($method.var_subset).split(',')])
-    var_subset=$var_subset,
-    #end if
-    batch_key='$method.batch_key',
-    index_unique='$method.index_unique'
-    #if $methods.batch_categories
-    #set $batch_categories=([x.strip() for x in str($method.batch_categories).split(',')])
-    batch_categories=$batch_categories,
-    #end if
-    k=$method.k,
-    sigma=$method.sigma,
-    cos_norm_in=$method.cos_norm_in,
-    cos_norm_out=$method.cos_norm_out,
-    svd_dim=$method.svd_dim,
-    var_adj=$method.var_adj,
-    compute_angle=$method.compute_angle,
-    mnn_order='$method.mnn_order',
-    svd_mode='$method.svd_mode',
-    do_concatenate=True,
-    save_raw=True,
-    n_jobs=\${GALAXY_SLOTS:-4})
+## This function is commented out because the conda package is not working. Please add this if there is user demand and the conda package is fixed. If not please remove in the next update.
+## #else if str($method.method) == 'external.pp.mnn_correct':
+##     #if $method.extra_adata:
+##     #for i, filepath in enumerate($method.extra_adata)
+## adata_$i = sc.read_h5ad('$filepath')
+##     #end for
+##     #end if
+## import os
+## corrected = sc.external.pp.mnn_correct(
+##                 adata,
+##                 #if $method.extra_adata:
+##                 #for i, filepath in enumerate($method.extra_adata)
+##                 adata_$i,
+##                 #end for
+##                 #end if
+##                 #if str($method.var_subset) != '':
+##                 #set $var_subset=([x.strip() for x in str($method.var_subset).split(',')])
+##                 var_subset=$var_subset,
+##                 #end if
+##                 batch_key='$method.batch_key',
+##                 index_unique='$method.index_unique',
+##                 #if str($method.batch_categories) != '':
+##                 #set $batch_categories=([x.strip() for x in str($method.batch_categories).split(',')])
+##                 batch_categories=$batch_categories,
+##                 #end if
+##                 k=$method.k,
+##                 sigma=$method.sigma,
+##                 cos_norm_in=$method.cos_norm_in,
+##                 cos_norm_out=$method.cos_norm_out,
+##                 #if str($method.svd_dim) != '':
+##                 svd_dim=$method.svd_dim,
+##                 #end if
+##                 var_adj=$method.var_adj,
+##                 compute_angle=$method.compute_angle,
+##                 #if str($method.mnn_order) != '':
+##                 mnn_order='$method.mnn_order',
+##                 #end if
+##                 #if str($method.svd_mode) != '':
+##                 svd_mode='$method.svd_mode',
+##                 #end if
+##                 do_concatenate=True,
+##                 save_raw=True,
+##                 n_jobs = int(os.getenv("GALAXY_SLOTS", 4)))
 
-#else if $method.method == "pp.combat"
+## adata = corrected[0]
+
+
+#else if str($method.method) == 'pp.combat':
+print("stats before combat:", "min=", adata.X.min(), "max=", adata.X.max(), "mean=", adata.X.mean())
+
 sc.pp.combat(
     adata,
     key='$method.key',
+    #if str($method.covariates) != '':
+        #set $covariates = [str(x.strip()) for x in str($method.covariates).split(',')]
+        covariates=$covariates,
+    #end if
     inplace=True)
 
+print("stats after combat:", "min=", adata.X.min(), "max=", adata.X.max(), "mean=", adata.X.mean())
+
+#else if str($method.method) == 'external.pp.bbknn':
+sc.external.pp.bbknn(
+    adata,
+    batch_key='$method.batch_key',
+    use_rep='$method.use_rep',
+    #if str($method.approx.approx_method) == 'no':
+    approx=False,
+    #else if str($method.approx.approx_method) == 'annoy':
+    approx=True,
+    use_annoy=True,
+    annoy_n_trees=$method.approx.annoy_n_trees,
+    #else if str($method.approx.approx_method) == 'pyNNDescent':
+    approx=True,
+    use_annoy=False,
+    pynndescent_n_neighbors=$method.approx.pynndescent_n_neighbors,
+    pynndescent_random_state=$method.approx.pynndescent_random_state,
+    #end if
+    metric='$method.metric',
+    neighbors_within_batch=$method.neighbors_within_batch,
+    n_pcs=$method.n_pcs,
+    #if str($method.trim) != '':
+    trim=$method.trim,
+    #end if
+    set_op_mix_ratio=$method.set_op_mix_ratio,
+    local_connectivity=$method.local_connectivity,
+    copy=False)
+
+#else if str($method.method) == 'external.pp.harmony_integrate':
+sc.external.pp.harmony_integrate(
+    adata,
+    key='$method.key',
+    basis='$method.basis',
+    adjusted_basis='$method.adjusted_basis',
+    #if str($method.theta) != '':
+    theta=$method.theta,
+    #end if
+    #if str($method.lamb) != '':
+    lamb=$method.lamb,
+    #end if
+    sigma=$method.sigma,
+    #if str($method.nclust) != '':
+    nclust=$method.nclust,
+    #end if
+    tau=$method.tau,
+    block_size=$method.block_size,
+    max_iter_harmony=$method.max_iter_harmony,
+    max_iter_kmeans=$method.max_iter_kmeans,
+    epsilon_cluster=$method.epsilon_cluster,
+    epsilon_harmony=$method.epsilon_harmony)
+
+#else if str($method.method) == 'external.pp.scanorama_integrate':
+sc.external.pp.scanorama_integrate(
+    adata,
+    key='$method.key',
+    basis='$method.basis',
+    adjusted_basis='$method.adjusted_basis',
+    knn=$method.knn,
+    sigma=$method.sigma,
+    approx=$method.approx,
+    alpha=$method.alpha,
+    batch_size=$method.batch_size)
 #end if
 
-@CMD_anndata_write_outputs@
+@CMD_ANNDATA_WRITE_OUTPUTS@
 ]]></configfile>
     </configfiles>
     <inputs>
@@ -69,21 +166,29 @@
         <conditional name="method">
             <param argument="method" type="select" label="Method used for plotting">
                 <option value="pp.regress_out">Regress out unwanted sources of variation, using 'pp.regress_out'</option>
-                <option value="pp.mnn_correct">Correct batch effects by matching mutual nearest neighbors, using 'pp.mnn_correct'</option>
+                <!-- This function is commented out because the conda package is not working. Please add this if there is user demand and the conda package is fixed. If not please remove in the next update. -->
+                <!-- <option value="external.pp.mnn_correct">Correct batch effects by matching mutual nearest neighbors, using 'pp.mnn_correct'</option> -->
                 <option value="pp.combat">Correct batch effects with ComBat function, using 'pp.combat'</option>
+                <option value="external.pp.bbknn">Batch effect removal with Batch balanced KNN (BBKNN), using 'external.pp.bbknn'</option>
+                <option value="external.pp.harmony_integrate">Integrate multiple single-cell experiments with Harmony, using 'external.pp.harmony_integrate'</option>
+                <option value="external.pp.scanorama_integrate">Integrate multiple single-cell experiments with Scanorama, using 'external.pp.scanorama_integrate'</option>
             </param>
             <when value="pp.regress_out">
+                <param argument="layer" type="text" value="" label="Which element of layers to regress on">
+                    <expand macro="sanitize_query"/>
+                </param>
                 <param argument="keys" type="text" value="" label="Keys for observation annotation on which to regress on" help="Keys separated by a comma">
-                    <expand macro="sanitize_query" />
+                    <expand macro="sanitize_query"/>
                 </param>
             </when>
-            <when value="pp.mnn_correct">
+            <!-- This function is commented out because the conda package is not working. Please add this if there is user demand and the conda package is fixed. If not please remove in the next update. -->
+            <!-- <when value="external.pp.mnn_correct">
                 <param name="extra_adata" type="data" multiple="true" optional="true" format="h5ad" label="Extra annotated data matrix" help="They should have same number of variables."/>
                 <param argument="var_subset" type="text" value="" optional="true" label="The subset of vars to be used when performing MNN correction" help="List of comma-separated key from '.var_names'. If not set, all vars are used">
-                    <expand macro="sanitize_query" />
+                    <expand macro="sanitize_query"/>
                 </param>
                 <param argument="batch_key" type="text" value="batch" label="Batch key for the concatenate">
-                    <expand macro="sanitize_query" />
+                    <expand macro="sanitize_query"/>
                 </param>
                 <param name="index_unique" type="select" label="Separator to join the existing index names with the batch category" help="Leave it empty to keep existing indices">
                     <option value="-">-</option>
@@ -92,7 +197,7 @@
                     <option value="/">/</option>
                 </param>
                 <param argument="batch_categories" type="text" value="" optional="true" label="Batch categories for the concatenate" help="List of comma-separated key">
-                    <expand macro="sanitize_query" />
+                    <expand macro="sanitize_query"/>
                 </param>
                 <param argument="k" type="integer" value="20" label="Number of mutual nearest neighbors"/>
                 <param argument="sigma" type="float" value="1" label="The bandwidth of the Gaussian smoothing kernel used to compute the correction vectors"/>
@@ -102,18 +207,89 @@
                 <param argument="var_adj" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Adjust variance of the correction vectors?" help="This step takes most computing time."/>
                 <param argument="compute_angle" type="boolean" truevalue="True" falsevalue="False" checked="false" label="compute the angle between each cell’s correction vector and the biological subspace of the reference batch?"/>
                 <param argument="mnn_order" type="text" value="" optional="true" label="The order in which batches are to be corrected" help="List of comma-separated key. If not set, datas are corrected sequentially">
-                    <expand macro="sanitize_query" />
+                    <expand macro="sanitize_query"/>
                 </param>
                 <param name="svd_mode" type="select" label="SVD mode">
                     <option value="svd">svd: SVD using a non-randomized SVD-via-ID algorithm</option>
                     <option value="rsvd" selected="true">rsvd: SVD using a randomized SVD-via-ID algorithm</option>
                     <option value="irlb">irlb: truncated SVD by implicitly restarted Lanczos bidiagonalization</option>
                 </param>
-            </when>
+            </when> -->
             <when value="pp.combat">
                 <param argument="key" type="text" value="batch" label="Key to a categorical annotation from adata.obs that will be used for batch effect removal">
-                    <expand macro="sanitize_query" />
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="covariates" type="text" value="" optional="true" label="Additional covariates besides the batch variable such as adjustment variables or biological condition.">
+                    <expand macro="sanitize_query"/>
+                </param>
+            </when>
+            <when value="external.pp.bbknn">
+                <param argument="batch_key" type="text" value="batch" label="Batch key for the concatenate">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="use_rep" type="text" value="X_pca" label="The dimensionality reduction in .obsm to use for neighbour detection">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <conditional name="approx">
+                    <param name="approx_method" type="select" label="Approximate neighbour finding">
+                        <option value="annoy" selected="true">Yes, using ANNOY algorithm</option>
+                        <option value="pyNNDescent">Yes, using pyNNDescent</option>
+                        <option value="no">Do not use approximate neighbor finding</option>
+                    </param>
+                    <when value="annoy">
+                        <param argument="annoy_n_trees" type="integer" value="10" label="The number of trees to construct in the annoy forest" help="More trees give higher precision when querying, at the cost of increased run time and resource intensity"/>
+                    </when>
+                    <when value="pyNNDescent">
+                        <param argument="pynndescent_n_neighbors" type="integer" value="30" label="The number of neighbours to include in the approximate neighbour graph" help="More neighbours give higher precision when querying, at the cost of increased run time and resource intensity"/>
+                        <param argument="pynndescent_random_state" type="integer" value="0" label="The RNG seed to use when creating the graph"/>
+                    </when>
+                    <when value="no"/>
+                </conditional>
+                <param name="metric" type="select" label="Distance metric to use for the data">
+                    <expand macro="distance_metric_options"/>
                 </param>
+                <param argument="neighbors_within_batch" type="integer" value="3" label="Number of top neighbours to report for each batch" help="total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number of batches. This then serves as the basis for the construction of a symmetrical matrix of connectivities"/>
+                <param argument="n_pcs" type="integer" value="50" label="Number of dimensions to use in the analysis" help="in case of PCA, principal components"/>
+                <param argument="trim" type="integer" value="" optional="true" label="Trim the neighbours of each cell to these many top connectivities" help=" The lower the value the more independent the individual populations, at the cost of more conserved batch effect. If not set, sets the parameter value automatically to 10 times neighbors_within_batch times the number of batches. Set to 0 to skip."/>
+                <param argument="set_op_mix_ratio" type="float" value="1.0" min="0" max="1" label="UMAP connectivity computation parameter" help="controlling the blend between a connectivity matrix formed exclusively from mutual nearest neighbour pairs (0) and a union of all observed neighbour relationships with the mutual pairs emphasised (1)"/>
+                <param argument="local_connectivity" type="integer" value="1" label="Number of nearest neighbors of each cell are assumed to be fully connected"/>
+            </when>
+            <when value="external.pp.harmony_integrate">
+                <param argument="key" type="text" value="batch" label="The name of the column in adata.obs that differentiates among experiments/batches">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="basis" type="text" value="X_pca" label="The name of the field in adata.obsm where the PCA table is stored">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="adjusted_basis" type="text" value="X_pca_harmony" label="The name of the field in adata.obsm where the adjusted PCA table will be stored after running this function">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="theta" type="integer" value="" optional="true" label="Diversity clustering penalty parameter" help="Default theta=2. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters."/>
+                <param argument="lamb" type="integer" value="" optional="true" min="1" label="Ridge regression penalty parameter" help="Default lamb=1. Lambda must be strictly positive. Smaller values result in more aggressive correction."/>
+                <param argument="sigma" type="float" value="0.1" label="Width of soft kmeans clusters" help="Sigma scales the distance from a cell to cluster centroids. Larger values of sigma result in cells assigned to more clusters. Smaller values of sigma make soft kmeans cluster approach hard clustering."/>
+                <param argument="nclust" type="integer" value="" optional="true" label="Number of clusters in model" help="nclust=1 equivalent to simple linear regression."/>
+                <param argument="tau" type="integer" value="0" label="Expected number of cells per cluster" help="Protection against overclustering small datasets with large ones"/>
+                <param argument="block_size" type="float" value="0.05" min="0" max="1" label="Proportion of cells to update during clustering" help="Larger values may be faster but less accurate"/>
+                <param argument="max_iter_harmony" type="integer" value="10" label="Maximum number of rounds to run Harmony" help="One round of Harmony involves one clustering and one correction step"/>
+                <param argument="max_iter_kmeans" type="integer" value="20" label="Maximum number of rounds to run clustering at each round of Harmony"/>
+                <param argument="epsilon_cluster" type="float" value="1e-5" label="Convergence tolerance for clustering round of Harmony"/>
+                <param argument="epsilon_harmony" type="float" value="1e-4" label="Convergence tolerance for Harmony"/>
+            </when>
+            <when value="external.pp.scanorama_integrate">
+                <param argument="key" type="text" value="batch" label="The name of the column in adata.obs that differentiates among experiments/batches">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="basis" type="text" value="X_pca" label="The name of the field in adata.obsm where the PCA table is stored">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="adjusted_basis" type="text" value="X_scanorama" label="The name of the field in adata.obsm where the adjusted PCA table will be stored after running this function">
+                    <expand macro="sanitize_query"/>
+                </param>
+                <param argument="knn" type="integer" value="20" label="Number of nearest neighbors to use for matching"/>
+                <param argument="sigma" type="integer" value="15" label="Correction smoothing parameter on Gaussian kernel"/>
+                <param argument="approx" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Use approximate nearest neighbors with Python annoy" help="greatly speeds up matching runtime"/>
+                <param argument="alpha" type="float" value="0.1" label="Alignment score minimum cutoff"/>
+                <param argument="batch_size" type="integer" value="5000" label="The batch size used in the alignment vector computation" help="Useful when integrating very large (>100k samples) datasets. Set to large value that runs within available memory."/>
             </when>
         </conditional>
         <expand macro="inputs_common_advanced"/>
@@ -122,76 +298,228 @@
         <expand macro="anndata_outputs"/>
     </outputs>
     <tests>
+        
+        <!-- test 1 -->
         <test expect_num_outputs="2">
-            <!-- test 0 -->
-            <param name="adata" value="krumsiek11.h5ad" />
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
             <conditional name="method">
                 <param name="method" value="pp.regress_out"/>
-                <param name="keys" value="cell_type"/>
+                <param name="keys" value="batch"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
                     <has_text_matching expression="sc.pp.regress_out"/>
-                    <has_text_matching expression="keys=\['cell_type'\]"/>
+                    <has_text_matching expression="keys=\['batch'\]"/>
+                    <has_text_matching expression="stats before regress_out: min= -0.0163 max= 1.0106 mean= 0.2864376"/>
+                    <has_text_matching expression="stats after regress_out: min= -0.7017021868145134 max= 0.7091581022301392 mean= -1.730938624756494e-16"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="varm/PCs"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="pp.regress_out.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <!--<test expect_num_outputs="2">
-            < test 2 >
-            <param name="adata" value="krumsiek11.h5ad" />
+
+        <!-- This function is commented out because the conda package is not working. Please add this if there is user demand and the conda package is fixed. If not please remove in the next update. -->
+        <!-- test 2 -->
+        <!-- <test expect_num_outputs="2">
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
+            <param name="extra_adata" value="pp.pca.krumsiek11.batch.h5ad"/>
             <conditional name="method">
-                <param name="method" value="pp.mnn_correct"/>
-                <param name="reg_keys" value="cell_type"/>
+                <param name="method" value="external.pp.mnn_correct"/>
+                <param name="batch_key" value="batch"/>
             </conditional>
+            <section name="advanced_common">
+                <param name="show_log" value="true"/>
+            </section>
             <assert_stdout>
-                <has_text_matching expression="sc.pp.mnn_correct"/>
-                <has_text_matching expression="keys='cell_type'"/>
+                <has_text_matching expression="I_have_to_check"/>
             </assert_stdout>
-            <output name="anndata_out" file="pp.mnn_correct.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
-        </test>-->
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="sc.external.pp.mnn_correct"/>
+                    <has_text_matching expression="batch_key='batch'"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" file="external.pp.mnn_correct.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+        </test> -->
+
+        <!-- test 3 -->
         <test expect_num_outputs="2">
-            <!-- test 1 -->
-            <param name="adata" value="blobs.h5ad" />
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
             <conditional name="method">
                 <param name="method" value="pp.combat"/>
-                <param name="key" value="blobs"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
                     <has_text_matching expression="sc.pp.combat"/>
-                    <has_text_matching expression="key='blobs'"/>
+                    <has_text_matching expression="key='batch'"/>
+                    <has_text_matching expression="stats before combat: min= -0.0163 max= 1.0106 mean= 0.2864376"/>
+                    <has_text_matching expression="stats after combat: min= -0.07474318799213325 max= 1.2280063438242503 mean= 0.2870530757430964"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obsm/X_pca"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- test 4 -->
+        <test expect_num_outputs="2">
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
+            <conditional name="method">
+                <param name="method" value="external.pp.bbknn"/>
+                <param name="n_pcs" value="10"/>
+                <param name="set_op_mix_ratio" value="0.5"/>
+            </conditional>
+            <section name="advanced_common">
+                <param name="show_log" value="true"/>
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="external.pp.bbknn"/>
+                    <has_text_matching expression="batch_key='batch'"/>
+                    <has_text_matching expression="use_rep='X_pca'"/>
+                    <has_text_matching expression="use_annoy=True"/>
+                    <has_text_matching expression="annoy_n_trees=10"/>
+                    <has_text_matching expression="neighbors_within_batch=3"/>
+                    <has_text_matching expression="n_pcs=10"/>
+                    <has_text_matching expression="set_op_mix_ratio=0.5"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="uns/neighbors"/>
+                    <has_h5_keys keys="obsp/distances,obsp/connectivities"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="pp.combat.blobs.h5ad" ftype="h5ad" compare="sim_size"/>
+        </test>
+
+        <!-- test 5 -->
+        <test expect_num_outputs="2">
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
+            <conditional name="method">
+                <param name="method" value="external.pp.bbknn"/>
+                <conditional name="approx">
+                    <param name="approx_method" value="pyNNDescent"/>
+                    <param name="pynndescent_n_neighbors" value="10"/>
+                </conditional>
+                <param name="n_pcs" value="10"/>
+                <param name="set_op_mix_ratio" value="0.5"/>
+            </conditional>
+            <section name="advanced_common">
+                <param name="show_log" value="true"/>
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="external.pp.bbknn"/>
+                    <has_text_matching expression="batch_key='batch'"/>
+                    <has_text_matching expression="use_rep='X_pca'"/>
+                    <has_text_matching expression="use_annoy=False"/>
+                    <has_text_matching expression="pynndescent_n_neighbors=10"/>
+                    <has_text_matching expression="pynndescent_random_state=0"/>
+                    <has_text_matching expression="neighbors_within_batch=3"/>
+                    <has_text_matching expression="n_pcs=10"/>
+                    <has_text_matching expression="set_op_mix_ratio=0.5"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="uns/neighbors"/>
+                    <has_h5_keys keys="obsp/distances,obsp/connectivities"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- test 6 -->
+        <test expect_num_outputs="2">
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
+            <conditional name="method">
+                <param name="method" value="external.pp.harmony_integrate"/>
+                <param name="theta" value="2"/>
+                <param name="lamb" value="1"/>
+                <param name="block_size" value="0.1"/>
+            </conditional>
+            <section name="advanced_common">
+                <param name="show_log" value="true"/>
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="external.pp.harmony_integrate"/>
+                    <has_text_matching expression="key='batch'"/>
+                    <has_text_matching expression="basis='X_pca'"/>
+                    <has_text_matching expression="adjusted_basis='X_pca_harmony'"/>
+                    <has_text_matching expression="theta=2"/>
+                    <has_text_matching expression="lamb=1"/>
+                    <has_text_matching expression="block_size=0.1"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obsm/X_pca_harmony"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- test 7 -->
+        <test expect_num_outputs="2">
+            <param name="adata" value="pp.pca.krumsiek11.batch.h5ad"/>
+            <conditional name="method">
+                <param name="method" value="external.pp.scanorama_integrate"/>
+                <param name="knn" value="2"/>
+                <param name="sigma" value="10"/>
+                <param name="batch_size" value="100"/>
+            </conditional>
+            <section name="advanced_common">
+                <param name="show_log" value="true"/>
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="external.pp.scanorama_integrate"/>
+                    <has_text_matching expression="key='batch'"/>
+                    <has_text_matching expression="basis='X_pca'"/>
+                    <has_text_matching expression="adjusted_basis='X_scanorama'"/>
+                    <has_text_matching expression="knn=2"/>
+                    <has_text_matching expression="sigma=1"/>
+                    <has_text_matching expression="batch_size=100"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obsm/X_scanorama"/>
+                </assert_contents>
+            </output>
         </test>
     </tests>
     <help><![CDATA[
 Regress out unwanted sources of variation, using `pp.regress_out`
 =================================================================
 
-Regress out unwanted sources of variation, using simple linear regression. This is 
+Regress out unwanted sources of variation, using simple linear regression. This is
 inspired by Seurat's `regressOut` function in R.
 
 More details on the `scanpy documentation
 <https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.regress_out.html>`__
 
-Correct batch effects by matching mutual nearest neighbors, using `pp.mnn_correct`
-==================================================================================
+.. This function is commented out because the conda package is not working. Please add this if there is user demand and the conda package is fixed. If not please remove in the next update.
+.. Correct batch effects by matching mutual nearest neighbors, using `external.pp.mnn_correct`
+.. ===========================================================================================
 
-This uses the implementation of mnnpy. Depending on do_concatenate, it returns AnnData objects in the 
-original order containing corrected expression values or a concatenated matrix or AnnData object.
+.. This uses the implementation of mnnpy. Depending on do_concatenate, it returns AnnData objects in the
+.. original order containing corrected expression values or a concatenated matrix or AnnData object.
 
-Be reminded that it is not advised to use the corrected data matrices for differential expression testing.
+.. Be reminded that it is not advised to use the corrected data matrices for differential expression testing.
 
-More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.mnn_correct.html>`__
+.. More details on the `scanpy documentation
+.. <https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.mnn_correct.html>`__
 
 
 Correct batch effects with ComBat function (`pp.combat`)
@@ -203,6 +531,33 @@
 <https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.combat.html>`__
 
 
+Correct batch effects with bbknn function (`external.pp.bbknn`)
+===============================================================
+
+Batch balanced kNN alters the kNN procedure to identify each cell’s top neighbours in each batch separately instead of the entire cell pool with no accounting for batch. The nearest neighbours for each batch are then merged to create a final list of neighbours for the cell. Aligns batches in a quick and lightweight manner.
+
+More details on the `scanpy documentation
+<https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.bbknn.html>`__
+
+
+Correct batch effects with harmony function (`external.pp.harmony_integrate`)
+=============================================================================
+
+Harmony is an algorithm for integrating single-cell data from multiple experiments.
+As Harmony works by adjusting the principal components, this function should be run after performing PCA but before computing the neighbor graph.
+
+More details on the `scanpy documentation
+<https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.harmony_integrate.html>`__
+
+
+Correct batch effects with scanprama function (`external.pp.scanorama_integrate`)
+=================================================================================
+
+Scanprama is an algorithm for integrating single-cell data from multiple experiments stored in an AnnData object. This function should be run after performing PCA but before computing the neighbor graph.
+
+More details on the `scanpy documentation
+<https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.scanorama_integrate.html>`__
+
     ]]></help>
     <expand macro="citations"/>
 </tool>
author	iuc
date	Sat, 14 Sep 2024 12:43:38 +0000
parents	c65fce500915
children