diff bioconductor_scp.xml @ 0:cd2f3a280463 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/bioconductor-scp commit a0a1a3de5dd24b2aabe96ec3d6f89acdcf5e462b
author recetox
date Wed, 22 Jan 2025 07:44:00 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bioconductor_scp.xml	Wed Jan 22 07:44:00 2025 +0000
@@ -0,0 +1,515 @@
+<tool id="bioconductor_scp" name="bioconductor-scp" version="@TOOL_VERSION@+galaxy0" profile="23.0">
+    <description>single cell proteomics data analysis workflow</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>help.xml</import>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">scp</xref>
+    </xrefs>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">bioconductor-scp</requirement>
+        <requirement type="package" version="3.54.0">bioconductor-sva</requirement>
+        <requirement type="package" version="1.80.0">bioconductor-impute</requirement>
+        <requirement type="package" version="1.34.0">bioconductor-scater</requirement>
+        <requirement type="package" version="1.16.0">bioconductor-qfeatures</requirement>
+        <requirement type="package" version="3.62.1">bioconductor-limma</requirement>
+        <requirement type="package" version="3.5.1">r-ggplot2</requirement>
+    </requirements>
+    <required_files>
+        <include path="utils.r" />
+    </required_files>
+    <expand macro="creator" />
+    <command detect_errors="exit_code"><![CDATA[
+    echo ${run_script} &&
+    Rscript  -e 'source("${__tool_directory__}/utils.r")' -e 'source("${run_script}")'
+    #if $data_export.export_R_script
+    && cat ${run_script} >> $script
+    #end if
+    ]]></command>
+    <configfiles>
+        <configfile name="run_script"><![CDATA[
+        data_input <- read.delim("$input_data", sep="\t")
+        metadata <- read.delim("$input_annotations", sep="\t")
+        runCol <- colnames(data_input)[$runcol]
+        fcol_aggregation_pep <- colnames(data_input)[${peptide_aggregation.column_aggregation_peptides}]
+        fcol_aggregation_prot <- colnames(data_input)[${protein_aggregation.column_aggregation_proteins}]
+
+        dir.create("plots")
+
+        scp <- scp::readSCP(
+            assayData = data_input,
+            colData = metadata,
+            runCol = runCol,
+            removeEmptyCols = $remove_empty_columns
+        )
+        number_of_assays <- length(scp)
+        scp <- QFeatures::zeroIsNA(scp, i = 1:number_of_assays)
+
+        #if $filtering_data.filter_reverse
+        scp <- QFeatures::filterFeatures(scp,
+                        ~ Reverse != "+")
+        #end if
+
+        #if $filtering_data.filter_contaminants
+        scp <- QFeatures::filterFeatures(scp,
+                        ~ Potential.contaminant != "+")
+        #end if
+
+        scp <- QFeatures::filterFeatures(scp,
+                      ~ !is.na(PIF) & PIF > ${filtering_data.PIF_threshold})
+
+        keepAssay <- QFeatures::dims(scp)[1, ] > ${filtering_data.minimum_features}
+        scp <- scp[, , keepAssay]
+        
+        number_of_assays <- length(scp)
+        single_cell_channels <- gsub(",", "|", "${filtering_data.single_cells}")
+
+        scp <- scp::computeSCR(scp,
+                  i = 1:number_of_assays,
+                  colvar = "SampleType",
+                  carrierPattern = "Carrier",
+                  samplePattern = single_cell_channels,
+                  sampleFUN = "mean",
+                  rowDataName = "MeanSCR")
+        
+        #if $generate_QC_plots
+        QC_plot_SCR <- QFeatures::rbindRowData(scp, i = 1:number_of_assays) |>
+                        data.frame() |>
+                        ggplot2::ggplot(ggplot2::aes(x = MeanSCR)) +
+                        ggplot2::geom_histogram() +
+                        ggplot2::geom_vline(xintercept = c(1/$count_cell_carrier, 0.1),
+                                                lty = c(2, 1)) +
+                        ggplot2::scale_x_log10()
+        ggplot2::ggsave(filename = file.path("plots", "QC_plot_SCR.pdf"), QC_plot_SCR)
+        
+        QC_plot_SCR_col <- QFeatures::rbindRowData(scp, i = 1:number_of_assays) |>
+                            data.frame() |>
+                            ggplot2::ggplot(ggplot2::aes(x = MeanSCR, color = runCol)) +
+                            ggplot2::geom_density() +
+                            ggplot2::geom_vline(xintercept = 0.02, lty = 2) +
+                            ggplot2::geom_vline(xintercept = 1, lty = 1)+
+                            ggplot2::scale_x_log10()        
+        ggplot2::ggsave(filename = file.path("plots", "QC_plot_SCR_col.pdf"), QC_plot_SCR_col)
+        #end if
+
+        scp <- QFeatures::filterFeatures(scp,
+                      ~ !is.na(MeanSCR) &
+                        MeanSCR < ${filtering_data.SCR_threshold})    
+
+        #if $filtering_data.qvalue_level == "PSM"
+        scp <- scp::pep2qvalue(scp,
+                    i = 1:number_of_assays,
+                    PEP = "dart_PEP",
+                    rowDataName = "qvalue")
+  
+        scp <- QFeatures::filterFeatures(scp,
+                        ~ qvalue < ${filtering_data.qvalue_threshold})
+        #else
+        scp <- scp::pep2qvalue(scp,
+                    i = 1:number_of_assays,
+                    PEP = "dart_PEP",
+                    groupBy = "$filtering_data.qvalue_level",
+                    rowDataName = "qvalue")
+ 
+        scp <- QFeatures::filterFeatures(scp,
+                        ~ qvalue < ${filtering_data.qvalue_threshold})
+        #end if
+
+        #if $filtering_data.divide_reference
+        scp <- scp::divideByReference(scp,
+                           i = 1:number_of_assays,
+                           colvar = "SampleType",
+                           samplePattern = ".",
+                           refPattern = "Reference")
+        #end if
+
+        scp <- scp::aggregateFeaturesOverAssays(
+            scp,
+            i = 1:number_of_assays,
+            fcol = fcol_aggregation_pep,
+            name = paste0("peptide_", names(scp)),
+            fun = ${peptide_aggregation.aggregation_peptides},
+            na.rm = TRUE
+        )
+
+        scp <- QFeatures::joinAssays(scp,
+                  i = grep("peptide", names(scp)),
+                  name = "peptides")
+   
+        keep_samples <- unlist(strsplit("${peptide_filtering.samples_to_keep}", split=","))
+        scp <- scp[,SummarizedExperiment::colData(scp)[["SampleType"]] %in% keep_samples, ]
+
+        #if $peptide_filtering.filter_median_intensity.cut_median_intensity == "yes"
+            medians <- colMedians(SummarizedExperiment::assay(scp[["peptides"]]), na.rm = TRUE)
+            SummarizedExperiment::colData(scp)[["MedianRI"]] <- medians
+
+            #if $generate_QC_plots
+            QC_medianRI <- SummarizedExperimentcolData(scp) |>
+                            data.frame() |>
+                            ggplot2::ggplot() +
+                            ggplot2::aes(x = MedianRI,
+                                y = SampleType,
+                                fill = SampleType) +
+                            ggplot2::geom_boxplot() +
+                            ggplot2::scale_x_log10()
+            ggplot2::ggsave(filename = file.path("plots", "QC_medianRI.pdf"), QC_medianRI)
+            #end if
+
+            scp <- scp[, !is.na(SummarizedExperiment::colData(scp)[["MedianRI"]]) & SummarizedExperiment::colData(scp)[["MedianRI"]] < ${peptide_filtering.filter_median_intensity.median_intensity_threshold}, ]
+        #end if
+
+        #if $peptide_filtering.filter_median_CV.cut_median_CV == "yes"
+        number_of_observations <- ${peptide_filtering.filter_median_CV.minimum_peptides_CV}
+        CV_threshold <- ${peptide_filtering.filter_median_CV.median_CV_threshold}
+        scp <- scp::medianCVperCell(scp,
+                         i = 1:number_of_assays,
+                         groupBy = "Leading.razor.protein",
+                         nobs = number_of_observations,
+                         norm = "div.median",
+                         na.rm = TRUE,
+                         colDataName = "MedianCV")
+
+            #if $generate_QC_plots
+            QC_medianCV <- MultiAssayExperiment::getWithColData(scp, "peptides") |>
+                            SummarizedExperiment::colData() |>
+                            data.frame() |>
+                            ggplot2::ggplot(ggplot2::aes(x = MedianCV,
+                                fill = SampleType)) +
+                            ggplot2::geom_boxplot() +
+                            ggplot2::geom_vline(xintercept = CV_threshold)
+            ggplot2::ggsave(filename = file.path("plots", "QC_medianCV.pdf"), QC_medianCV)
+            #end if
+
+        scp <- scp[, !is.na(SummarizedExperiment::colData(scp)[["MedianCV"]]) & SummarizedExperiment::colData(scp)[["MedianCV"]] < CV_threshold, ]
+        #end if
+        
+        #if $peptide_filtering.remove_blank
+        scp <- scp[, SummarizedExperiment::colData(scp)[["SampleType"]] != "Blank", ]
+        #end if
+
+        #if $peptide_processing.normalization_method.choose_normalization == "simple"
+        scp <- QFeatures::normalize(
+            scp,
+            i = "peptides",
+            name = "peptides_norm",
+            method = "${peptide_processing.normalization_method.normalize_simple_method}"
+        )
+        #else
+        scp <- QFeatures::sweep(
+            scp,
+            i = "peptides", 
+            MARGIN = 2,
+            FUN = "/",
+            STATS = ${peptide_processing.normalization_method.normalize_columns}(SummarizedExperiment::assay(scp[["peptides"]]), na.rm = TRUE),
+            name = "peptides_norm_col"
+        )
+
+        scp <- QFeatures::sweep(
+            scp,
+            i = "peptides_norm_col",
+            MARGIN = 1,
+            FUN = "/",
+            STATS = ${peptide_processing.normalization_method.normalize_rows}(SummarizedExperiment::assay(scp[["peptides_norm_col"]]),  na.rm = TRUE),
+            name =  "peptides_norm"
+        ) 
+        #end if
+
+        scp <- QFeatures::logTransform(
+            scp,
+            base = ${peptide_processing.base},
+            i = "peptides_norm",
+            name = "peptides_log"
+        )
+
+        #if $generate_QC_plots
+        QC_boxplot_peptide <- create_boxplots(scp, "peptides", FALSE, "Peptides not normalized")
+        QC_boxplot_peptide_norm <- create_boxplots(scp, "peptides_log", TRUE, "Peptides normalized")
+
+        ggplot2::ggsave(filename = file.path("plots", "QC_boxplot_peptide.pdf"), QC_boxplot_peptide)
+        ggplot2::ggsave(filename = file.path("plots", "QC_boxplot_peptide_norm.pdf"), QC_boxplot_peptide_norm)
+        #end if
+
+        #if $peptide_processing.remove_missing_peptides.remove_peptides == "yes"
+        pNA <-  ${peptide_processing.remove_missing_peptides.pNA_peptides} / 100
+        scp <- QFeatures::filterNA(scp, i = "peptides_log", pNA = pNA)
+        #end if
+
+        scp <- scp::aggregateFeaturesOverAssays(
+            scp,
+            i = "peptides_log",
+            fcol = fcol_aggregation_prot,
+            name = "proteins",
+            fun = ${protein_aggregation.aggregation_proteins},
+            na.rm = TRUE
+        )
+
+        #if $protein_processing.normalization_method_protein.choose_normalization_protein == "simple_prot"
+        scp <- QFeatures::normalize(
+            scp,
+            i = "proteins",
+            name = "proteins_norm",
+            method = "${protein_processing.normalization_method_protein.normalize_simple_method_prot}"
+        )
+        #else
+        scp <- QFeatures::sweep(
+            scp,
+            i = "proteins", 
+            MARGIN = 2,
+            FUN = "/",
+            STATS = ${protein_processing.normalization_method_protein.normalize_columns_prot}(SummarizedExperiment::assay(scp[["proteins"]]), na.rm = TRUE),
+            name = "proteins_norm_col"
+        )
+
+        scp <- QFeatures::sweep(
+            scp,
+            i = "proteins_norm_col",
+            MARGIN = 1,
+            FUN = "/",
+            STATS = ${protein_processing.normalization_method_protein.normalize_rows_prot}(SummarizedExperiment::assay(scp[["proteins_norm_col"]]),  na.rm = TRUE),
+            name =  "proteins_norm"
+        ) 
+        #end if
+
+        #if $generate_QC_plots
+        QC_boxplot_protein <- create_boxplots(scp, "proteins", TRUE, "Proteins not normalized")
+        QC_boxplot_protein_norm <- create_boxplots(scp, "proteins_norm", TRUE, "Proteins normalized")
+        
+        ggplot2::ggsave(filename = file.path("plots", "QC_boxplot_protein.pdf"), QC_boxplot_protein)
+        ggplot2::ggsave(filename = file.path("plots", "QC_boxplot_protein_norm.pdf"), QC_boxplot_protein_norm)
+        
+        pdf(file = file.path("plots", "QC_heatmap_proteins.pdf"))
+        plot_heatmap(scp, "proteins_norm")
+        dev.off()
+        #end if
+
+        scp <- QFeatures::impute(
+            scp,
+            i = "proteins_norm",
+            name = "proteins_imptd",
+            method = "knn",
+            k = ${protein_processing.impute_k},
+            rowmax = 1,
+            colmax= 1,
+            maxp = Inf,
+            rng.seed = 1234
+        )
+
+        batch_colname <- colnames(metadata)[${batch_correction.select_batch_correction.batch_col}]
+        #if $batch_correction.select_batch_correction.batch_correction_method == "combat"
+        sce <- MultiAssayExperiment::getWithColData(scp, "proteins_imptd")
+        batch <- SummarizedExperiment::colData(scp)[[batch_colname]]
+        model <- stats::model.matrix(~ SampleType, data = SummarizedExperiment::colData(sce))
+  
+        SummarizedExperiment::assay(sce) <- sva::ComBat(
+            dat = SummarizedExperiment::assay(sce),
+            batch = batch,
+            mod = model
+        )
+
+        scp <- QFeatures::addAssay(
+            scp,
+            y = sce,
+            name = "proteins_batchC"
+        )
+
+        scp <- QFeatures::addAssayLinkOneToOne(
+            scp,
+            from = "proteins_imptd",
+            to = "proteins_batchC"
+        )
+        #else
+        sce <- MultiAssayExperiment::getWithColData(scp, "proteins_imptd")
+        preserve_colname <- colnames(metadata)[${batch_correction.select_batch_correction.preserve_col}]
+
+        SummarizedExperiment::assay(sce) <- limma::removeBatchEffect(
+            SummarizedExperiment::assay(sce),
+            group = sce[[preserve_colname]],
+            batch = sce[[batch_colname]]
+        )
+        scp <- QFeatures::addAssay(scp,
+                  y = sce,
+                  name = "proteins_batchC")
+        scp <- QFeatures::addAssayLinkOneToOne(scp,
+                              from = "proteins_imptd",
+                              to = "proteins_batchC")
+        #end if 
+
+        #if $dimensionality_reduction.PCA_computation.run_PCA == "yes"
+        PCA_color <- colnames(metadata)[$dimensionality_reduction.PCA_computation.pca_coloring]
+        scp[["proteins_batchC"]] <- scater::runPCA(
+            scp[["proteins_batchC"]],
+            ncomponents = ${dimensionality_reduction.PCA_computation.ncomponents_PCA},
+            ntop = Inf,
+            scale = TRUE,
+            exprs_values = 1,
+            name = "PCA"
+        )
+  
+        pca <- scater::plotReducedDim(
+            scp[["proteins_batchC"]],
+            dimred = "PCA",
+            colour_by = PCA_color,
+            point_alpha = 1
+        )
+
+        ggplot2::ggsave(filename = file.path("plots", "PCA.pdf"), pca)
+
+            #if $dimensionality_reduction.PCA_computation.UMAP_computation.run_UMAP == "yes"
+            scp[["proteins_batchC"]] <- scater::runUMAP(
+                scp[["proteins_batchC"]],
+                ncomponents = ${dimensionality_reduction.PCA_computation.UMAP_computation.ncomponents_UMAP},
+                ntop = Inf,
+                scale = TRUE,
+                exprs_values = 1,
+                n_neighbors = 3,
+                dimred = "PCA",
+                name = "UMAP"
+            )
+  
+            umap <- scater::plotReducedDim(
+                scp[["proteins_batchC"]],
+                dimred = "UMAP",
+                colour_by = "SampleType",
+                point_alpha = 1
+            )
+            ggplot2::ggsave(filename = file.path("plots", "UMAP.pdf"), umap)
+            #end if
+        #end if
+
+        assay_df <- as.data.frame(SummarizedExperiment::assay(scp, "proteins_batchC"))
+        row_metadata <- as.data.frame(SummarizedExperiment::rowData(scp[["proteins_batchC"]]))
+
+        export_data <- cbind(row_metadata, as.data.frame(assay_df))
+        write.table(export_data, file = '$Processed_data', sep = "\t", quote = F)
+
+        #if $data_export.export_tables
+        export_all_assays(scp)
+        #end if
+
+        #if $data_export.export_RData
+        save(scp, file='$scp_object')
+        #end if    
+        ]]></configfile>
+    </configfiles>
+    <inputs>
+        <expand macro="scp_param"/>
+    </inputs>
+    <outputs>
+        <data name="Processed_data" format="tabular" label="Batch-corrected protein levels"/>
+        <collection name="intermediate_outputs" type="list" label="Intermediate outputs" format="tabular">
+            <discover_datasets pattern="__name_and_ext__" directory="outputs" />
+            <filter>data_export['export_tables']</filter>
+        </collection>
+        <data name="scp_object" format="rds" label="scp object as .rds">
+            <filter>data_export['export_RData']</filter>
+        </data>
+        <data name="script" format="txt" label="R script">
+            <filter>data_export['export_R_script']</filter>
+        </data>
+        <collection name="plots" type="list" label="Plots">
+            <discover_datasets pattern="__name_and_ext__" directory="plots" />
+                <filter>generate_QC_plots or dimensionality_reduction['PCA_computation']['run_PCA'] == 'yes' or dimensionality_reduction['PCA_computation']['run_PCA'] == 'yes'</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_num_outputs='2'>
+            <param name="input_data" value="evidence_subset.txt"/>
+            <param name="input_annotations" value="sampleAnnotation.txt"/>
+            <param name="runcol" value="19"/>
+            <param name="single_cells" value="Macrophage,Monocyte"/>
+            <param name="samples_to_keep" value="Macrophage,Monocyte,Blank"/>
+            <param name="batch_col" value="2"/>
+            <param name="column_aggregation_peptides" value="6"/>
+            <param name="column_aggregation_proteins" value="17"/>
+            <param name="pca_coloring" value="4"/>
+            <output name="Processed_data">
+                <assert_contents>
+                    <has_n_lines n="90"/>
+                    <has_text text="E9PAV3"/>
+                    <has_size size="625000" delta="20"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs='2'>
+            <param name="input_data" value="evidence_subset.txt" />
+            <param name="input_annotations" value="sampleAnnotation.txt"/>
+            <param name="runcol" value="19"/>
+            <param name="single_cells" value="Macrophage,Monocyte"/>
+            <param name="samples_to_keep" value="Macrophage,Monocyte,Blank"/>
+            <param name="batch_col" value="2"/>
+            <param name="column_aggregation_peptides" value="6"/>
+            <param name="column_aggregation_proteins" value="17"/>
+            <param name="pca_coloring" value="4"/>
+            <output_collection name="plots" type="list">
+                <element name="PCA" file="PCA.pdf" ftype="pdf" compare="sim_size" delta="60"/>
+                <element name="QC_boxplot_peptide" file="QC_boxplot_peptide.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_boxplot_peptide_norm" file="QC_boxplot_peptide_norm.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_boxplot_protein" file="QC_boxplot.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_boxplot_protein_norm" file="QC_boxplot_protein_norm.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_heatmap_proteins" file="QC_heatmap_proteins.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_medianCV" file="QC_medianCV.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_plot_SCR" file="QC_plot_SCR.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="QC_plot_SCR_col" file="QC_plot_SCR_col.pdf" ftype="pdf" compare="sim_size" delta="40"/>
+                <element name="UMAP" file="UMAP.pdf" ftype="pdf" compare="sim_size" delta="200"/>
+            </output_collection>
+            <output name="Processed_data">
+                <assert_contents>
+                    <has_size size="625000" delta="20"/>
+                    <has_n_lines n="90"/>
+                    <has_text text="E9PAV3"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs='3'>
+            <param name="input_data" value="evidence_subset.txt" />
+            <param name="input_annotations" value="sampleAnnotation.txt"/>
+            <param name="runcol" value="19"/>
+            <param name="single_cells" value="Macrophage,Monocyte"/>
+            <param name="samples_to_keep" value="Macrophage,Monocyte,Blank"/>
+            <param name="column_aggregation_peptides" value="6"/>
+            <param name="column_aggregation_proteins" value="17"/>
+            <param name="batch_col" value="2"/>
+            <param name="export_tables" value="true"/>
+            <param name="pca_coloring" value="4"/>
+            <output_collection name="intermediate_outputs" type="list" count="6"/>
+            <output name="Processed_data">
+                <assert_contents>
+                    <has_size size="625000" delta="20"/>
+                    <has_n_lines n="90"/>
+                    <has_text text="E9PAV3"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs='4'>
+            <param name="input_data" value="evidence_subset.txt" />
+            <param name="input_annotations" value="sampleAnnotation.txt"/>
+            <param name="runcol" value="19"/>
+            <param name="single_cells" value="Macrophage,Monocyte"/>
+            <param name="samples_to_keep" value="Macrophage,Monocyte,Blank"/>
+            <param name="batch_col" value="2"/>
+            <param name="export_RData" value="TRUE"/>
+            <param name="export_R_script" value="TRUE"/>
+            <param name="column_aggregation_peptides" value="6"/>
+            <param name="column_aggregation_proteins" value="17"/>
+            <param name="pca_coloring" value="4"/>
+            <output name="Processed_data">
+                <assert_contents>
+                    <has_size size="625000" delta="20"/>
+                    <has_n_lines n="90"/>
+                    <has_text text="E9PAV3"/>
+                </assert_contents>
+            </output>
+            <output name="script">
+                <assert_contents>
+                    <has_n_lines n="271"/>
+                    <has_text text='ggplot2::ggsave(filename = file.path("plots", "PCA.pdf"), pca)'/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+        @GENERAL_HELP@
+    ]]></help>
+    <expand macro="citations" />
+</tool>
\ No newline at end of file