view scpipe.R @ 3:7397e6badc11 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 8007f71281553ddfa45e6f8e1172952d956bb000"
author iuc
date Thu, 11 Jun 2020 07:18:37 -0400
parents 5c4bca9dd4a2
children
line wrap: on
line source

err_foo <- function() {
    cat(geterrmessage(), file = stderr());
    q("no", 1, F)
}
options(show.error.messages = F, error = err_foo)

# we need that to not crash galaxy with an UTF8 error on German LC settings.
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

suppressPackageStartupMessages({
    library(scPipe)
    library(SingleCellExperiment)
    library(optparse)
    library(readr)
    library(ggplot2)
    library(plotly)
    library(DT)
    library(scater)
    library(scran)
    library(scales)
    library(Rtsne)
})

option_list <- list(
    make_option(c("-bam", "--bam"), type = "character", help = "BAM file"),
    make_option(c("-fasta", "--fasta"), type = "character", help = "Genome fasta file"),
    make_option(c("-exons", "--exons"), type = "character", help = "Exon annotation gff3 file"),
    make_option(c("-organism", "--organism"), type = "character", help = "Organism e.g. hsapiens_gene_ensembl"),
    make_option(c("-barcodes", "--barcodes"), type = "character", help = "Cell barcodes csv file"),
    make_option(c("-read1", "--read1"), type = "character", help = "Read 1 fastq.gz"),
    make_option(c("-read2", "--read2"), type = "character", help = "Read 2 fastq.gz"),
    make_option(c("-samplename", "--samplename"), type = "character", help = "Name to use for sample"),
    make_option(c("-bs1", "--bs1"), type = "integer", help = "Barcode start in Read 1"),
    make_option(c("-bl1", "--bl1"), type = "integer", help = "Barcode length in Read 1"),
    make_option(c("-bs2", "--bs2"), type = "integer", help = "Barcode start in Read 2"),
    make_option(c("-bl2", "--bl2"), type = "integer", help = "Barcode length in Read 2"),
    make_option(c("-us", "--us"), type = "integer", help = "UMI start in Read 2"),
    make_option(c("-ul", "--ul"), type = "integer", help = "UMI length in Read 2"),
    make_option(c("-rmlow", "--rmlow"), type = "logical", help = "Remove reads with N in barcode or UMI"),
    make_option(c("-rmN", "--rmN"), type = "logical", help = "Remove reads with low quality"),
    make_option(c("-minq", "--minq"), type = "integer", help = "Minimum read quality"),
    make_option(c("-numbq", "--numbq"), type = "integer", help = "Maximum number of bases below minq"),
    make_option(c("-stnd", "--stnd"), type = "logical", help = "Perform strand-specific mapping"),
    make_option(c("-max_mis", "--max_mis"), type = "integer", help = "Maximum mismatch allowed in barcode"),
    make_option(c("-UMI_cor", "--UMI_cor"), type = "integer", help = "Correct UMI sequence error"),
    make_option(c("-gene_fl", "--gene_fl"), type = "logical", help = "Remove low abundant genes"),
    make_option(c("-max_reads", "--max_reads"), type = "integer", help = "Maximum reads processed"),
    make_option(c("-min_count", "--min_count"), type = "integer", help = "Minimum count to keep"),
    make_option(c("-metrics_matrix", "--metrics_matrix"), type = "logical", help = "QC metrics matrix"),
    make_option(c("-keep_outliers", "--keep_outliers"), type = "logical", help = "Keep outlier cells"),
    make_option(c("-report", "--report"), type = "logical", help = "HTML report of plots"),
    make_option(c("-rdata", "--rdata"), type = "logical", help = "Output RData file"),
    make_option(c("-nthreads", "--nthreads"), type = "integer", help = "Number of threads")
)

parser <- OptionParser(usage = "%prog [options] file", option_list = option_list)
args <- parse_args(parser)

bam <- args$bam
fa_fn <- args$fasta
anno_fn <- args$exons
fq_r1 <- args$read1
fq_r2 <- args$read2
read_structure <- list(bs1 = args$bs1,  # barcode start position in fq_r1, -1 indicates no barcode
                       bl1 = args$bl1,  # barcode length in fq_r1, 0 since no barcode present
                       bs2 = args$bs2,  # barcode start position in fq_r2
                       bl2 = args$bl2,  # barcode length in fq_r2
                       us = args$us,    # UMI start position in fq_r2
                       ul = args$ul     # UMI length
                      )

if (args$us == -1) {
  has_umi <- FALSE
} else {
  has_umi <- TRUE
}

filter_settings <- list(rmlow = args$rmlow,
                        rmN = args$rmN,
                        minq = args$minq,
                        numbq = args$numbq)

# Outputs
out_dir <- "."
mapped_bam <- file.path(out_dir, "aligned.mapped.bam")

# if input is fastqs
if (!is.null(fa_fn)) {
  fasta_index <- file.path(out_dir, paste0(fa_fn, ".fasta_index"))
  combined_fastq <- file.path(out_dir, "combined.fastq")
  aligned_bam <- file.path(out_dir, "aligned.bam")

  print("Trimming barcodes")
  sc_trim_barcode(combined_fastq,
                  fq_r1,
                  fq_r2,
                  read_structure = read_structure,
                  filter_settings = filter_settings)

  print("Building genome index")
  Rsubread::buildindex(basename = fasta_index, reference = fa_fn)

  print("Aligning reads to genome")
  Rsubread::align(index = fasta_index,
                  readfile1 = combined_fastq,
                  output_file = aligned_bam,
                  nthreads = args$nthreads)

  if (!is.null(args$barcodes)) {
    barcode_anno <- args$barcodes
  } else {
    print("Detecting barcodes")
    # detect 10X barcodes and generate sample_index.csv file
    barcode_anno <- "sample_index.csv"
    sc_detect_bc(infq = combined_fastq,
                 outcsv = barcode_anno,
                 bc_len = read_structure$bl2,
                 max_reads = args$max_reads,
                 min_count = args$min_count,
                 max_mismatch = args$max_mis
    )
  }
} else {
   aligned_bam <- file.path(out_dir, bam)
   barcode_anno <- args$barcodes
}

print("Assigning reads to exons")
sc_exon_mapping(aligned_bam, mapped_bam, anno_fn, bc_len = read_structure$bl2, UMI_len = read_structure$ul, stnd = args$stnd)

print("De-multiplexing data")
sc_demultiplex(mapped_bam, out_dir, barcode_anno, has_UMI = has_umi)

print("Counting genes")
sc_gene_counting(out_dir, barcode_anno, UMI_cor = args$UMI_cor, gene_fl = args$gene_fl)

print("Performing QC")
sce <- create_sce_by_dir(out_dir)
pdf("plots.pdf")
plot_demultiplex(sce)
if (has_umi) {
  p <- plot_UMI_dup(sce)
  print(p)
}
sce <- calculate_QC_metrics(sce)
sce <- detect_outlier(sce)
p <- plot_mapping(sce, percentage = TRUE, dataname = args$samplename)
print(p)
p <- plot_QC_pairs(sce)
print(p)
dev.off()

print("Removing outliers")
if (is.null(args$keep_outliers)) {
  sce <- remove_outliers(sce)
  gene_counts <- counts(sce)
  write.table(data.frame("gene_id" = rownames(gene_counts), gene_counts), file = "gene_count.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
}

if (!is.null(args$metrics_matrix)) {
  metrics <- colData(sce, internal = TRUE)
  write.table(data.frame("cell_id" = rownames(metrics), metrics), file = "metrics_matrix.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
}

if (!is.null(args$report) & (!is.null(fa_fn))) {
  print("Creating report")
  create_report(sample_name = args$samplename,
                outdir = out_dir,
                r1 = fq_r1,
                r2 = fq_r2,
                outfq = combined_fastq,
                read_structure = read_structure,
                filter_settings = filter_settings,
                align_bam = aligned_bam,
                genome_index = fasta_index,
                map_bam = mapped_bam,
                exon_anno = anno_fn,
                stnd = args$stnd,
                fix_chr = FALSE,
                barcode_anno = barcode_anno,
                max_mis = args$max_mis,
                UMI_cor = args$UMI_cor,
                gene_fl = args$gene_fl,
                organism = args$organism,
                gene_id_type = "ensembl_gene_id")
}

if (!is.null(args$rdata)) {
  save(sce, file = file.path(out_dir, "scPipe_analysis.RData"))
}

sessionInfo()