Mercurial > repos > perssond > naivestates

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Mar 12 00:20:13 2021 +0000
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<!-- Shared macros and tokens imported by naivestates.xml -->
+<macros>
+    <!-- Package dependency, pinned to the wrapped tool version -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@VERSION@">naivestates</requirement>
+        </requirements>
+    </xml>
+
+    <!-- Reports the wrapped tool version; pull in with <expand macro="version_cmd"/> -->
+    <xml name="version_cmd">
+        <version_command>echo @VERSION@</version_command>
+    </xml>
+    <xml name="citations">
+        <citations>
+        </citations>
+    </xml>
+
+    <!-- Version of the wrapped naivestates release -->
+    <token name="@VERSION@">1.6.1</token>
+    <!-- Path to the R driver script that sits in the tool directory -->
+    <token name="@CMD_BEGIN@">${__tool_directory__}/main.R</token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/main.R	Fri Mar 12 00:20:13 2021 +0000
@@ -0,0 +1,182 @@
+#!/usr/bin/env Rscript
+
+## Command-line driver for the naivestates package.
+##
+## Reads a .csv of per-cell marker values (-i), fits models with GMMfit()
+## and writes the following to the output directory (-o):
+##   <sample>-models.csv            output of GMMmodels()
+##   <sample>-states.csv            reshaped fit with the state calls
+##   plots/<sample>-allfits.<ext>   when --plots is pdf or png
+##   plots/<sample>-summary.<ext>   when --plots is pdf or png and --umap is set
+##   plots/<sample>-probs.<ext>     same condition as the summary plot
+##   plots/<sample>/<marker>.<ext>  when --plots is pdf or png
+## <sample> is the input file name up to its first "."
+
+suppressMessages( library(tidyverse) )
+library( optparse )
+library( naivestates )
+
+## Identify directory of the script
+## (taken from the --file= argument that Rscript adds to the command line;
+##  the directory is only reported, nothing below depends on it)
+wd <- commandArgs( trailingOnly=FALSE ) %>%
+    keep( ~grepl("--file=", .x) ) %>%
+    str_replace( "--file=", "" ) %>% dirname()
+cat( "Running the script from", wd, "\n" )
+
+## Parse command-line arguments
+option_list <- list(
+    make_option(c("-i", "--in"), type="character", help="Input file"),
+    make_option(c("-o", "--out"), type="character", default="/data",
+                help="Output directory"),
+    make_option(c("-m", "--markers"), type="character", default="auto",
+                help="Markers to model"),
+    make_option(c("-p", "--plots"), type="character", default="off",
+                help="Generate plots showing the fit"),
+    make_option("--mct", type="character", default="",
+                help="Marker -> cell type map in .csv format"),
+    make_option("--id", type="character", default="CellID",
+                help="Column containing cell IDs"),
+    make_option("--log", type="character", default="auto",
+                help="Whether to apply a log transform <yes|no|auto>"),
+    make_option("--sfx", type="character", default="",
+                help="Common suffix on marker columns (e.g., _cellMask)"),
+    make_option("--umap", action="store_true", default=FALSE,
+                help="Generate UMAP plots")
+)
+opt <- parse_args(OptionParser(option_list=option_list))
+
+## Argument verification
+## (--in has no default, so it is missing from opt when not supplied)
+if( !("in" %in% names(opt)) )
+    stop( "Please provide an input file name with -i" )
+if( !(opt$log %in% c("yes","no","auto")) )
+    stop( "--log must be one of <yes|no|auto>" )
+if( !(opt$plots %in% c("off", "pdf", "png")) )
+    stop( "--plots must be one of <off|pdf|png>" )
+
+## Identify the sample name: the input file name up to its first "."
+sn <- basename( opt$`in` ) %>% str_split( "\\." ) %>%
+    pluck( 1, 1 )
+cat( "Inferred sample name:", sn, "\n" )
+
+## Read the data matrix
+X <- read_csv( opt$`in`, col_types=cols() )
+cat( "Read", nrow(X), "entries\n" )
+
+## Fix potential capitalization mismatch of --id
+if( !(opt$id %in% colnames(X)) )
+{
+    ## Attempt to find a singular case-insensitive match.
+    ## fixed=TRUE makes this a literal substring test, so characters such
+    ## as "." or "(" in --id are not treated as regular-expression syntax
+    i <- grep( tolower(opt$id), tolower(colnames(X)), fixed=TRUE )
+    if( length(i) == 1 )
+    {
+        warning( "  No such column ", opt$id,
+                "; using ", colnames(X)[i], " instead" )
+        opt$id <- colnames(X)[i]
+    }
+    else stop( "No such column ", opt$id,
+              "; use --id to specify which column contains cell IDs" )
+}
+
+## Identify markers in the matrix
+## NOTE(review): the trailing TRUE, TRUE are positional arguments of
+##   findMarkers(); confirm their meaning against the naivestates docs.
+## The result is used below as a named vector of column names in X.
+mrkv <- findMarkers(setdiff(colnames(X), opt$id), opt$markers,
+                    opt$sfx, TRUE, TRUE)
+
+## Handle log transformation of the data
+## ("auto" applies it when the largest marker value exceeds 1000)
+if( opt$log == "yes" ||
+    (opt$log == "auto" && max(X[mrkv], na.rm=TRUE) > 1000) )
+{
+    cat( "Applying a log10 transform\n" )
+    X <- X %>% mutate_at( unname(mrkv), ~log10(.x+1) )
+}
+
+## Make sure the output directory exists before anything is written to it
+dir.create( opt$out, recursive=TRUE, showWarnings=FALSE )
+
+## Fit Gaussian mixture models
+GMM <- GMMfit(X, opt$id, !!!mrkv)
+fnMdl <- file.path( opt$out, str_c(sn, "-models.csv") )
+cat( "Saving models to", fnMdl, "\n" )
+GMMmodels(GMM) %>% write_csv( fnMdl )
+
+## Reshape the matrix back to cells-by-marker format
+Y <- GMMreshape(GMM)
+
+cat( "------\n" )
+
+## Call cell states, using the marker -> cell type map when one is given
+if( opt$mct != "" ) {
+
+    ## Load marker -> cell type associations
+    ## (columns Marker and State are used; markers absent from Y are dropped)
+    cat( "Loading cell type map from", opt$mct, "\n" )
+    mct <- read_csv( opt$mct, col_types=cols() ) %>%
+        distinct() %>% filter(Marker %in% colnames(Y))
+
+    if( nrow(mct) == 0 ) {
+        warning( "No usable marker -> cell type mappings detected" )
+        Y <- findDominant(Y, opt$id)
+    } else {
+        cat( "Using the following marker -> cell type map:\n" )
+        walk2( mct$Marker, mct$State, ~cat(.x, "->", .y, "\n") )
+        Y <- callStates(Y, opt$id, mct)
+    }
+} else {
+    cat( "No marker -> cell type mapping provided\n" )
+    Y <- findDominant(Y, opt$id)
+}
+
+cat( "------\n" )
+
+## Identify the output location(s)
+fnOut <- file.path( opt$out, str_c(sn, "-states.csv") )
+cat( "Saving probabilities and calls to", fnOut, "\n")
+Y %>% write_csv( fnOut )
+
+## Generates plots as necessary
+if( opt$plots != "off" )
+{
+    ## Overview plots go to <out>/plots, per-marker plots to <out>/plots/<sn>
+    dirAll <- file.path( opt$out, "plots" )
+    dirPlot <- file.path( dirAll, sn )
+    dir.create(dirPlot, recursive=TRUE, showWarnings=FALSE)
+
+    ## Fit overview
+    fn <- file.path( dirAll, str_c(sn, "-allfits.", opt$plots) )
+    ggf <- plotFitOverview(GMM)
+    suppressMessages(ggsave( fn, ggf, width=12, height=8 ))
+
+    ## Compute a UMAP projection
+    if( opt$umap ) {
+        cat( "Computing a UMAP projection...\n" )
+        U <- umap( Y, c(opt$id, "State", "Dominant") )
+
+        ## Generate and write a summary plot
+        gg <- plotSummary( U )
+        fn <- file.path( dirAll, str_c(sn, "-summary.", opt$plots) )
+        suppressMessages(ggsave( fn, gg, width=9, height=7 ))
+        cat( "Plotted summary to", fn, "\n" )
+
+        ## Generate and write faceted probabilities plot
+        gg <- plotProbs( U, c(opt$id, "State", "Dominant") )
+        fn <- file.path( dirAll, str_c(sn, "-probs.", opt$plots) )
+        suppressMessages(ggsave( fn, gg, width=9, height=7 ))
+        cat( "Plotted probabilities to", fn, "\n" )
+    }
+
+    ## Generate and write out plots for individual marker fits
+    for( i in names(mrkv) )
+    {
+        gg <- plotMarker(GMM, i)
+        fn <- file.path( dirPlot, str_c(i,".",opt$plots) )
+        suppressMessages(ggsave( fn, gg ))
+        cat( "Wrote", fn, "\n" )
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/naivestates.xml	Fri Mar 12 00:20:13 2021 +0000
@@ -0,0 +1,189 @@
+<tool id="naivestates" name="naivestates" version="@VERSION@.2" profile="17.09">
+    <description> Inference of cell states using Naive Bayes</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <expand macro="requirements"/>
+    <expand macro="version_cmd"/>
+
+    <!-- Runs main.R in the job directory, then renames its sample-prefixed
+         outputs to the fixed names that the outputs section collects -->
+    <command detect_errors="exit_code"><![CDATA[
+
+        @CMD_BEGIN@
+        -i '$counts'
+
+        #if $markers
+        -m '$markers'
+        #end if
+
+        --mct '$mct'
+        -p '$plots'
+
+        #if $id
+        --id '$id'
+        #end if
+
+        --log '$log'
+
+        #if $sfx
+        --sfx '$sfx'
+        #end if
+
+        #if $umap
+        --umap
+        #end if
+        -o .
+
+        && mv *-states.csv states.csv
+
+        ## main.R writes the probs and summary plots only when --umap is given
+        #if str($plots) != "off"
+        && mv plots/*-allfits.${plots} plots/allfits.${plots}
+        #if $umap
+        && mv plots/*-probs.${plots} plots/probs.${plots}
+        && mv plots/*-summary.${plots} plots/summary.${plots}
+        #end if
+        #end if
+
+    ]]></command>
+
+
+    <inputs>
+        <param name="counts" type="data" format="csv" label="Quantified Cell Matrix"/>
+        <param name="markers" type="data" format="txt" optional="true" label="Markers to model"/>
+        <param name="mct" type="data" format="csv" label="Marker-State Association Map"/>
+        <param name="plots" type="select" label="Generate plots showing the fit">
+            <option selected="true" value="png">png</option>
+            <option value="pdf">pdf</option>
+            <option value="off">off</option>
+        </param>
+        <param name="id" type="text" value="" label="Column name containing cell IDs"/>
+        <param name="log" type="select" label="Log Transform" help="Whether to apply a log transform">
+            <option selected="true" value="auto">auto</option>
+            <option value="yes">yes</option>
+            <option value="no">no</option>
+        </param>
+        <param name="sfx" type="text" value="_cellMask" optional="true" label="Common suffix" help="Common suffix on marker columns (e.g., _cellMask)"/>
+        <param name="umap" type="boolean" checked="true" label="Generate UMAP plots"/>
+    </inputs>
+
+    <outputs>
+        <!-- The probs and summary plots exist only when UMAP plots are requested -->
+        <data format="csv" name="states" from_work_dir="states.csv" label="${tool.name} on ${on_string}: States CSV"/>
+        <data format="png" name="probs-png" from_work_dir="plots/probs.png" label="${tool.name} on ${on_string}: Probabilities">
+            <filter>plots == 'png' and umap</filter>
+        </data>
+        <data format="png" name="summary-png" from_work_dir="plots/summary.png" label="${tool.name} on ${on_string}: Summary">
+            <filter>plots == 'png' and umap</filter>
+        </data>
+        <data format="png" name="allfits-png" from_work_dir="plots/allfits.png" label="${tool.name} on ${on_string}: AllFits">
+            <filter>plots == 'png'</filter>
+        </data>
+        <data format="pdf" name="probs-pdf" from_work_dir="plots/probs.pdf" label="${tool.name} on ${on_string}: Probabilities">
+            <filter>plots == 'pdf' and umap</filter>
+        </data>
+        <data format="pdf" name="summary-pdf" from_work_dir="plots/summary.pdf" label="${tool.name} on ${on_string}: Summary">
+            <filter>plots == 'pdf' and umap</filter>
+        </data>
+        <data format="pdf" name="allfits-pdf" from_work_dir="plots/allfits.pdf" label="${tool.name} on ${on_string}: AllFits">
+            <filter>plots == 'pdf'</filter>
+        </data>
+    </outputs>
+    <help><![CDATA[
+naivestates - Inference of cell states using Naive Bayes
+This work is supported by the NIH Grant 1U54CA225088: Systems Pharmacology of Therapeutic and Adverse Responses to Immune Checkpoint and Small Molecule Drugs and by the NCI grant 1U2CCA233262: Pre-cancer atlases of cutaneous and hematologic origin (PATCH Center).
+
+Introduction
+naivestates is a label-free, cluster-free tool for inferring cell types from quantified marker expression data, based on known marker <-> cell type associations. The tool is designed to be run as a Docker container, but can also be installed in a Conda environment or as an R package. naivestates expects as input information about marker expression on a per-cell basis, provided in .csv format. One of the columns must contain cell IDs. An example input file may look as follows:
+
+CellID,KERATIN,FOXP3,SMA
+1,64.18060200668896,193.00334448160535,303.5016722408027
+2,54.850202429149796,151.19433198380565,176.3846153846154
+3,63.94712643678161,210.43218390804597,483.9448275862069
+4,142.01320132013203,227.85808580858085,420.76897689768975
+5,56.66379310344828,197.01896551724138,343.7810344827586
+6,69.97454545454545,187.59636363636363,267.9709090909091
+7,67.57754010695187,185.63368983957218,351.7914438502674
+8,64.012,190.02,349.348
+9,56.9622641509434,159.79245283018867,236.43867924528303
+...
+Installation
+Download the container image
+Pull the latest version with
+
+docker pull labsyspharm/naivestates
+Alternatively, you can pull a specific version, which is recommended to ensure reproducibility of your analyses. For example, v1.2.0 can be pulled with
+
+docker pull labsyspharm/naivestates:1.2.0
+Examine the tool usage instructions
+docker run --rm labsyspharm/naivestates:1.2.0 /app/main.R -h
+replacing 1.2.0 with the version you are working with. Omit :1.2.0 entirely if you pulled the latest version above. The flag --rm tells Docker to delete the container instance after it finishes displaying the help message.
+
+Basic usage
+At minimum, the tool requires an input file and the list of marker names:
+
+docker run --rm -v /path/to/data/folder:/data labsyspharm/naivestates:1.2.0 \
+  /app/main.R -i /data/myfile.csv -m aSMA,CD45,panCK
+where we can make a distinction between Docker-level arguments:
+
+--rm once again cleans up the container instance after it finishes running the code
+-v /path/to/data/folder:/data maps the local folder containing your data to /data inside the container
+:1.2.0 specifies the container version that we pulled above
+and tool-level arguments:
+
+-i /data/myfile.csv specifies which data file to process
+-m aSMA,CD45,panCK specifies the markers of interest (NOTE: comma-delimited, no spaces)
+If there is a large number of markers, place their names in a standalone file markers.txt with one marker per line. Ensure that the file lives in /path/to/data/folder/ and modify the Docker call to use the new file:
+
+docker run --rm -v /path/to/data/folder:/data labsyspharm/naivestates:1.2.0 \
+  /app/main.R -i /data/myfile.csv -m /data/markers.txt
+Additional parameters
+The following parameters are optional, but may be useful in certain scenarios:
+
+--plots <off|pdf|png> - (default: off) Produces QC plots of individual marker fits and summary UMAP plots in .png or .pdf format.
+--id - (default: CellID) Name of the column that contains cell IDs
+--log <yes|no|auto> - (default: auto) Whether a log10 transformation should be applied prior to fitting the data. The tool will do this automatically if it detects large values. Use --log no to force the use of original, non-transformed values instead.
+-o - (default: /data) Alternative output directory. (Note that any file written to a directory that wasn't mapped with docker -v will not persist when the container is destroyed.)
+--mct - The tool has a basic marker -> cell type (mct) mapping in typemap.csv. More sophisticated mct mappings can be defined by creating a custom-map.csv file with two columns: Marker and State. Ensure that custom-map.csv is in /path/to/data/folder and point the tool at it with --mct (e.g., /app/main.R -i /data/myfile.csv --mct /data/custom-map.csv -m aSMA,CD45,panCK)
+Alternative execution environments
+Running in a Conda environment
+If you are working in a computational environment that doesn't support Docker, the repository provides a Conda-based alternative. Ensure that conda is installed on your system, then 1) clone this repository, 2) instantiate the conda environment and 3) install the tool.
+
+git clone https://github.com/labsyspharm/naivestates.git
+cd naivestates
+conda env create -f conda.yml
+conda activate naivestates
+R -s -e "devtools::install_github('labsyspharm/naivestates')"
+The tool can now be used as above by running main.R:
+
+./main.R -h
+./main.R -i /path/to/datafile.csv -m aSMA,CD45,panCK
+Running as an R package
+The tool can also be installed as an R package directly from GitHub:
+
+if( !require(devtools) ) install.packages("devtools")
+devtools::install_github( "labsyspharm/naivestates" )
+Example usage:
+
+library( tidyverse )
+library( naivestates )
+
+# Load the original data
+X <- read_csv( "datafile.csv" )
+
+# Fit models to channels aSMA, CD45 and panCK
+# Specify that cell IDs are in column CellID
+GMM <- GMMfit( X, CellID, aSMA, CD45, panCK )
+
+# Plot a fit to one of the markers
+plotFit( GMM, "CD45" )
+
+# Write out the results to results.csv
+GMMreshape(GMM) %>% write_csv( "results.csv" )
+
+OHSU Wrapper Repo: https://github.com/ohsu-comp-bio/naivestates
+    ]]></help>
+    <expand macro="citations" />
+</tool>