Mercurial > repos > perssond > naivestates
changeset 0:1fb6181c2c64 draft
"planemo upload for repository https://github.com/ohsu-comp-bio/naivestates commit 392f57d212a7499bf1d3e421112a32a56635bc67-dirty"
author | perssond |
---|---|
date | Fri, 12 Mar 2021 00:20:13 +0000 |
parents | |
children | a62b0c62270e |
files | macros.xml main.R naivestates.xml |
diffstat | 3 files changed, 360 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<?xml version="1.0"?>
<!-- macros.xml: shared macros and tokens imported by naivestates.xml. -->
<macros>
    <!-- Software requirement for the tool. The package name is spelled the
         same way as the R library loaded by main.R (naivestates).
         NOTE(review): confirm a package of this name/version is resolvable
         in the target Galaxy dependency channels. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@VERSION@">naivestates</requirement>
        </requirements>
    </xml>

    <!-- Version command; use it with <expand macro="version_cmd"/>. -->
    <xml name="version_cmd">
        <version_command>echo @VERSION@</version_command>
    </xml>

    <!-- Citation block (currently empty). -->
    <xml name="citations">
        <citations>
        </citations>
    </xml>

    <!-- Version of the wrapped tool, substituted wherever @VERSION@ appears. -->
    <token name="@VERSION@">1.6.1</token>
    <!-- Start of the command line: the main.R script in the tool directory. -->
    <token name="@CMD_BEGIN@">${__tool_directory__}/main.R</token>
</macros>
#!/usr/bin/env Rscript

## main.R (added by this changeset as b/main.R)
##
## Command-line driver around the naivestates package. In order, it:
##   1. parses and validates the command-line options;
##   2. reads the input .csv and resolves the cell ID column;
##   3. selects marker columns with findMarkers();
##   4. optionally applies a log10(x+1) transform;
##   5. fits models with GMMfit() and writes <sample>-models.csv;
##   6. assigns states with callStates() (when a marker -> cell type map is
##      given and usable) or findDominant() and writes <sample>-states.csv;
##   7. optionally writes plots under <out>/plots.

suppressMessages( library(tidyverse) )
library( optparse )
library( naivestates )

## Identify directory of the script
## (the script path is taken from the --file=<path> interpreter argument)
wd <- commandArgs( trailingOnly=FALSE ) %>%
    keep( ~grepl("--file=", .x) ) %>%
    str_replace( "--file=", "" ) %>% dirname()
cat( "Running the script from", wd, "\n" )

## Parse command-line arguments
option_list <- list(
    make_option(c("-i", "--in"), type="character", help="Input file"),
    make_option(c("-o", "--out"), type="character", default="/data",
                help="Output directory"),
    make_option(c("-m", "--markers"), type="character", default="auto",
                help="Markers to model"),
    make_option(c("-p", "--plots"), type="character", default="off",
                help="Generate plots showing the fit"),
    make_option("--mct", type="character", default="",
                help="Marker -> cell type map in .csv format"),
    make_option("--id", type="character", default="CellID",
                help="Column containing cell IDs"),
    make_option("--log", type="character", default="auto",
                help="Whether to apply a log transform <yes|no|auto>"),
    make_option("--sfx", type="character", default="",
                help="Common suffix on marker columns (e.g., _cellMask)"),
    make_option("--umap", action="store_true", default=FALSE,
                help="Generate UMAP plots")
)
opt <- parse_args(OptionParser(option_list=option_list))

## Argument verification
## (-i has no default, so it is absent from opt when not supplied)
if( !("in" %in% names(opt)) )
    stop( "Please provide an input file name with -i" )
if( !(opt$log %in% c("yes","no","auto")) )
    stop( "--log must be one of <yes|no|auto>" )
if( !(opt$plots %in% c("off", "pdf", "png")) )
    stop( "--plots must be one of <off|pdf|png>" )

## Identify the sample name: everything before the first "." in the file name
sn <- basename( opt$`in` ) %>% str_split( "\\." ) %>%
    pluck( 1, 1 )
cat( "Inferred sample name:", sn, "\n" )

## Read the data matrix
X <- read_csv( opt$`in`, col_types=cols() )
cat( "Read", nrow(X), "entries\n" )

## Fix potential capitalization mismatch of --id
if( !(opt$id %in% colnames(X)) )
{
    ## Attempt to find a singular case-insensitive match.
    ## Whole column names are compared literally: a regex/substring search
    ##   would also accept partial hits (e.g., "CellID_old" for "cellid")
    ##   and would interpret regex metacharacters in the requested name.
    i <- which( tolower(colnames(X)) == tolower(opt$id) )
    if( length(i) == 1 )
    {
        warning( "  No such column ", opt$id,
                "; using ", colnames(X)[i], " instead" )
        opt$id <- colnames(X)[i]
    }
    else stop( "No such column ", opt$id,
              "; use --id to specify which column contains cell IDs" )
}

## Identify markers in the matrix
mrkv <- findMarkers(setdiff(colnames(X), opt$id), opt$markers,
                    opt$sfx, TRUE, TRUE)

## Handle log transformation of the data
## In "auto" mode the transform is applied when any marker value exceeds 1000
if( opt$log == "yes" ||
    (opt$log == "auto" && max(X[mrkv], na.rm=TRUE) > 1000) )
{
    cat( "Applying a log10 transform\n" )
    X <- X %>% mutate_at( unname(mrkv), ~log10(.x+1) )
}

## Make sure the output directory exists before anything is written to it
## (write_csv does not create missing directories)
dir.create( opt$out, recursive=TRUE, showWarnings=FALSE )

## Fit Gaussian mixture models
GMM <- GMMfit(X, opt$id, !!!mrkv)
fnMdl <- file.path( opt$out, str_c(sn, "-models.csv") )
cat( "Saving models to", fnMdl, "\n" )
GMMmodels(GMM) %>% write_csv( fnMdl )

## Reshape the matrix back to cells-by-marker format
Y <- GMMreshape(GMM)

cat( "------\n" )

## Find the default cell type map
if( opt$mct != "" ) {

    ## Load marker -> cell type associations
    cat( "Loading cell type map from", opt$mct, "\n" )
    mct <- read_csv( opt$mct, col_types=cols() )

    ## Both columns are used below; fail with a clear message if one is absent
    if( !all(c("Marker", "State") %in% colnames(mct)) )
        stop( "The --mct file must contain columns Marker and State" )

    ## Keep unique rows whose marker is present in the fitted matrix
    mct <- mct %>% distinct() %>% filter(Marker %in% colnames(Y))

    if( nrow(mct) == 0 ) {
        warning( "No usable marker -> cell type mappings detected" )
        Y <- findDominant(Y, opt$id)
    } else {
        cat( "Using the following marker -> cell type map:\n" )
        walk2( mct$Marker, mct$State, ~cat(.x, "->", .y, "\n") )
        Y <- callStates(Y, opt$id, mct)
    }
} else {
    cat( "No marker -> cell type mapping provided\n" )
    Y <- findDominant(Y, opt$id)
}

cat( "------\n" )

## Identify the output location(s)
fnOut <- file.path( opt$out, str_c(sn, "-states.csv") )
cat( "Saving probabilities and calls to", fnOut, "\n")
Y %>% write_csv( fnOut )

## Generate plots as necessary
if( opt$plots != "off" )
{
    ## Overview/summary plots go to <out>/plots;
    ##   per-marker plots go to a sample-specific subdirectory
    dirAll  <- file.path( opt$out, "plots" )
    dirPlot <- file.path( dirAll, sn )
    dir.create(dirPlot, recursive=TRUE, showWarnings=FALSE)

    ## Fit overview
    fn <- file.path( dirAll, str_c(sn, "-allfits.", opt$plots) )
    ggf <- plotFitOverview(GMM)
    suppressMessages(ggsave( fn, ggf, width=12, height=8 ))

    ## Compute a UMAP projection
    if( opt$umap ) {
        cat( "Computing a UMAP projection...\n" )
        U <- umap( Y, c(opt$id, "State", "Dominant") )

        ## Generate and write a summary plot
        gg <- plotSummary( U )
        fn <- file.path( dirAll, str_c(sn, "-summary.", opt$plots) )
        suppressMessages(ggsave( fn, gg, width=9, height=7 ))
        cat( "Plotted summary to", fn, "\n" )

        ## Generate and write faceted probabilities plot
        gg <- plotProbs( U, c(opt$id, "State", "Dominant") )
        fn <- file.path( dirAll, str_c(sn, "-probs.", opt$plots) )
        suppressMessages(ggsave( fn, gg, width=9, height=7 ))
        cat( "Plotted probabilities to", fn, "\n" )
    }

    ## Generate and write out plots for individual marker fits
    for( i in names(mrkv) )
    {
        gg <- plotMarker(GMM, i)
        fn <- file.path( dirPlot, str_c(i,".",opt$plots) )
        suppressMessages(ggsave( fn, gg ))
        cat( "Wrote", fn, "\n" )
    }
}
<!-- naivestates.xml: Galaxy tool wrapper that runs main.R on a quantified
     cell matrix and collects the states table and optional plots. -->
<tool id="naivestates" name="naivestates" version="@VERSION@.2" profile="17.09">
    <description> Inference of cell states using Naive Bayes</description>
    <macros>
        <import>macros.xml</import>
    </macros>

    <expand macro="requirements"/>
    <!-- version_cmd is an <xml> macro in macros.xml, so it has to be expanded;
         there is no @VERSION_CMD@ token. -->
    <expand macro="version_cmd"/>

    <!-- Parameter values are single-quoted so that paths or names containing
         spaces or shell metacharacters reach main.R as single arguments.
         The steps are chained with && so that any failure fails the job.
         main.R only writes the probs/summary plots when it is given the
         umap flag, so those files are only renamed in that case. -->
    <command detect_errors="exit_code"><![CDATA[

        @CMD_BEGIN@
        -i '$counts'

        #if $markers
        -m '$markers'
        #end if

        --mct '$mct'
        -p '$plots'

        #if $id
        --id '$id'
        #end if

        --log '$log'

        #if $sfx
        --sfx '$sfx'
        #end if

        #if $umap
        --umap
        #end if
        -o .

        &&

        mv *-states.csv states.csv

        #if str($plots) != "off"
        && mv plots/*-allfits.${plots} plots/allfits.${plots}
        #if $umap
        && mv plots/*-probs.${plots} plots/probs.${plots}
        && mv plots/*-summary.${plots} plots/summary.${plots}
        #end if
        #end if

    ]]></command>


    <inputs>
        <param name="counts" type="data" format="csv" label="Quantified Cell Matrix"/>
        <param name="markers" type="data" format="txt" optional="true" label="Markers to model"/>
        <param name="mct" type="data" format="csv" label="Marker-State Association Map"/>
        <param name="plots" type="select" label="Generate plots showing the fit">
            <option selected="true" value="png">png</option>
            <option value="pdf">pdf</option>
            <option value="off">off</option>
        </param>
        <param name="id" type="text" value="" label="Column name containing cell IDs"/>
        <param name="log" type="select" label="Log Transform" help="Whether to apply a log transform">
            <option selected="true" value="auto">auto</option>
            <option value="yes">yes</option>
            <option value="no">no</option>
        </param>
        <param name="sfx" type="text" value="_cellMask" optional="true" label="Common suffix" help="Common suffix on marker columns (e.g., _cellMask)"/>
        <param name="umap" type="boolean" checked="true" label="Generate UMAP plots"/>
    </inputs>

    <!-- The probs/summary outputs exist only when UMAP plots are requested;
         the allfits output exists whenever plotting is enabled. -->
    <outputs>
        <data format="csv" name="states" from_work_dir="states.csv" label="${tool.name} on ${on_string}: States CSV"/>
        <data format="png" name="probs-png" from_work_dir="plots/probs.png" label="${tool.name} on ${on_string}: Probabilities">
            <filter>plots == 'png' and umap</filter>
        </data>
        <data format="png" name="summary-png" from_work_dir="plots/summary.png" label="${tool.name} on ${on_string}: Summary">
            <filter>plots == 'png' and umap</filter>
        </data>
        <data format="png" name="allfits-png" from_work_dir="plots/allfits.png" label="${tool.name} on ${on_string}: AllFits">
            <filter>plots == 'png'</filter>
        </data>
        <data format="pdf" name="probs-pdf" from_work_dir="plots/probs.pdf" label="${tool.name} on ${on_string}: Probabilities">
            <filter>plots == 'pdf' and umap</filter>
        </data>
        <data format="pdf" name="summary-pdf" from_work_dir="plots/summary.pdf" label="${tool.name} on ${on_string}: Summary">
            <filter>plots == 'pdf' and umap</filter>
        </data>
        <data format="pdf" name="allfits-pdf" from_work_dir="plots/allfits.pdf" label="${tool.name} on ${on_string}: AllFits">
            <filter>plots == 'pdf'</filter>
        </data>
    </outputs>
    <help><![CDATA[
naivestates - Inference of cell states using Naive Bayes
This work is supported by the NIH Grant 1U54CA225088: Systems Pharmacology of Therapeutic and Adverse Responses to Immune Checkpoint and Small Molecule Drugs and by the NCI grant 1U2CCA233262: Pre-cancer atlases of cutaneous and hematologic origin (PATCH Center).

Introduction
naivestates is a label-free, cluster-free tool for inferring cell types from quantified marker expression data, based on known marker <-> cell type associations. The tool is designed to be run as a Docker container, but can also be installed in a Conda environment or as an R package. naivestates expects as input information about marker expression on a per-cell basis, provided in .csv format. One of the columns must contain cell IDs. An example input file may look as follows:

CellID,KERATIN,FOXP3,SMA
1,64.18060200668896,193.00334448160535,303.5016722408027
2,54.850202429149796,151.19433198380565,176.3846153846154
3,63.94712643678161,210.43218390804597,483.9448275862069
4,142.01320132013203,227.85808580858085,420.76897689768975
5,56.66379310344828,197.01896551724138,343.7810344827586
6,69.97454545454545,187.59636363636363,267.9709090909091
7,67.57754010695187,185.63368983957218,351.7914438502674
8,64.012,190.02,349.348
9,56.9622641509434,159.79245283018867,236.43867924528303
...
Installation
Download the container image
Pull the latest version with

docker pull labsyspharm/naivestates
Alternatively, you can pull a specific version, which is recommended to ensure reproducibility of your analyses. For example, v1.2.0 can be pulled with

docker pull labsyspharm/naivestates:1.2.0
Examine the tool usage instructions
docker run --rm labsyspharm/naivestates:1.2.0 /app/main.R -h
replacing 1.2.0 with the version you are working with. Omit :1.2.0 entirely if you pulled the latest version above. The flag --rm tells Docker to delete the container instance after it finishes displaying the help message.

Basic usage
At minimum, the tool requires an input file and the list of marker names:

docker run --rm -v /path/to/data/folder:/data labsyspharm/naivestates:1.2.0 \
    /app/main.R -i /data/myfile.csv -m aSMA,CD45,panCK
where we can make a distinction between Docker-level arguments:

--rm once again cleans up the container instance after it finishes running the code
-v /path/to/data/folder:/data maps the local folder containing your data to /data inside the container
:1.2.0 specifies the container version that we pulled above
and tool-level arguments:

-i /data/myfile.csv specifies which data file to process
-m aSMA,CD45,panCK specifies the markers of interest (NOTE: comma-delimited, no spaces)
If there is a large number of markers, place their names in a standalone file markers.txt with one marker per line. Ensure that the file lives in /path/to/data/folder/ and modify the Docker call to use the new file:

docker run --rm -v /path/to/data/folder:/data labsyspharm/naivestates:1.2.0 \
    /app/main.R -i /data/myfile.csv -m /data/markers.txt
Additional parameters
The following parameters are optional, but may be useful in certain scenarios:

--plots <off|pdf|png> - (default: off) Produces QC plots of individual marker fits and summary UMAP plots in .png or .pdf format.
--id - (default: CellID) Name of the column that contains cell IDs
--log <yes|no|auto> - (default: auto) When a log10 transformation should be applied prior to fitting the data. The tool will do this automatically if it detects large values. Use --log no to force the use of original, non-transformed values instead.
-o - (default: /data) Alternative output directory. (Note that any file written to a directory that wasn't mapped with docker -v will not persist when the container is destroyed.)
--mct - The tool has a basic marker -> cell type (mct) mapping in typemap.csv. More sophisticated mct mappings can be defined by creating a custom-map.csv file with two columns: Marker and State. Ensure that custom-map.csv is in /path/to/data/folder and point the tool at it with --mct (e.g., /app/main.R -i /data/myfile.csv --mct /data/custom-map.csv -m aSMA,CD45,panCK)
Alternative execution environments
Running in a Conda environment
If you are working in a computational environment that doesn't support Docker, the repository provides a Conda-based alternative. Ensure that conda is installed on your system, then 1) clone this repository, 2) instantiate the conda environment and 3) install the tool.

git clone https://github.com/labsyspharm/naivestates.git
cd naivestates
conda env create -f conda.yml
conda activate naivestates
R -s -e "devtools::install_github('labsyspharm/naivestates')"
The tool can now be used as above by running main.R:

./main.R -h
./main.R -i /path/to/datafile.csv -m aSMA,CD45,panCK
Running as an R package
The tool can also be installed as an R package directly from GitHub:

if( !require(devtools) ) install.packages("devtools")
devtools::install_github( "labsyspharm/naivestates" )
Example usage:

library( tidyverse )
library( naivestates )

# Load the original data
X <- read_csv( "datafile.csv" )

# Fit models to channels aSMA, CD45 and panCK
# Specify that cell IDs are in column CellID
GMM <- GMMfit( X, CellID, aSMA, CD45, panCK )

# Plot a fit to one of the markers
plotFit( GMM, "CD45" )

# Write out the results to results.csv
GMMreshape(GMM) %>% write_csv( "results.csv" )

OHSU Wrapper Repo: https://github.com/ohsu-comp-bio/naivestates
    ]]></help>
    <expand macro="citations" />
</tool>