Mercurial > repos > iuc > scpipe
diff scpipe.R @ 3:7397e6badc11 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 8007f71281553ddfa45e6f8e1172952d956bb000"
author | iuc |
---|---|
date | Thu, 11 Jun 2020 07:18:37 -0400 |
parents | 5c4bca9dd4a2 |
children |
line wrap: on
line diff
--- a/scpipe.R Mon Jan 14 08:06:47 2019 -0500 +++ b/scpipe.R Thu Jun 11 07:18:37 2020 -0400 @@ -1,4 +1,8 @@ -options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +err_foo <- function() { + cat(geterrmessage(), file = stderr()); + q("no", 1, F) +} +options(show.error.messages = F, error = err_foo) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") @@ -18,170 +22,171 @@ }) option_list <- list( - make_option(c("-bam","--bam"), type="character", help="BAM file"), - make_option(c("-fasta","--fasta"), type="character", help="Genome fasta file"), - make_option(c("-exons","--exons"), type="character", help="Exon annotation gff3 file"), - make_option(c("-organism","--organism"), type="character", help="Organism e.g. hsapiens_gene_ensembl"), - make_option(c("-barcodes","--barcodes"), type="character", help="Cell barcodes csv file"), - make_option(c("-read1","--read1"), type="character", help="Read 1 fastq.gz"), - make_option(c("-read2","--read2"), type="character", help="Read 2 fastq.gz"), - make_option(c("-samplename","--samplename"), type="character", help="Name to use for sample"), - make_option(c("-bs1","--bs1"), type="integer", help="Barcode start in Read 1"), - make_option(c("-bl1","--bl1"), type="integer", help="Barcode length in Read 1"), - make_option(c("-bs2","--bs2"), type="integer", help="Barcode start in Read 2"), - make_option(c("-bl2","--bl2"), type="integer", help="Barcode length in Read 2"), - make_option(c("-us","--us"), type="integer", help="UMI start in Read 2"), - make_option(c("-ul","--ul"), type="integer", help="UMI length in Read 2"), - make_option(c("-rmlow","--rmlow"), type="logical", help="Remove reads with N in barcode or UMI"), - make_option(c("-rmN","--rmN"), type="logical", help="Remove reads with low quality"), - make_option(c("-minq","--minq"), type="integer", help="Minimum read quality"), - make_option(c("-numbq","--numbq"), type="integer", help="Maximum number of bases below minq"), - make_option(c("-stnd","--stnd"), type="logical", help="Perform strand-specific mapping"), - make_option(c("-max_mis","--max_mis"), type="integer", help="Maximum mismatch allowed in barcode"), - make_option(c("-UMI_cor","--UMI_cor"), type="integer", help="Correct UMI sequence error"), - make_option(c("-gene_fl","--gene_fl"), type="logical", help="Remove low abundant genes"), - make_option(c("-max_reads","--max_reads"), type="integer", help="Maximum reads processed"), - make_option(c("-min_count","--min_count"), type="integer", help="Minimum count to keep"), - make_option(c("-metrics_matrix","--metrics_matrix"), type="logical", help="QC metrics matrix"), - make_option(c("-keep_outliers","--keep_outliers"), type="logical", help="Keep outlier cells"), - make_option(c("-report","--report"), type="logical", help="HTML report of plots"), - make_option(c("-rdata","--rdata"), type="logical", help="Output RData file"), - make_option(c("-nthreads","--nthreads"), type="integer", help="Number of threads") - ) - -parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) -args = parse_args(parser) - -bam = args$bam -fa_fn = args$fasta -anno_fn = args$exons -fq_R1 = args$read1 -fq_R2 = args$read2 -read_structure = list( - bs1 = args$bs1, # barcode start position in fq_R1, -1 indicates no barcode - bl1 = args$bl1, # barcode length in fq_R1, 0 since no barcode present - bs2 = args$bs2, # barcode start position in fq_R2 - bl2 = args$bl2, # barcode length in fq_R2 - us = args$us, # UMI start position in fq_R2 - ul = args$ul # UMI length + make_option(c("-bam", "--bam"), type = "character", help = "BAM file"), + make_option(c("-fasta", "--fasta"), type = "character", help = "Genome fasta file"), + make_option(c("-exons", "--exons"), type = "character", help = "Exon annotation gff3 file"), + make_option(c("-organism", "--organism"), type = "character", help = "Organism e.g. hsapiens_gene_ensembl"), + make_option(c("-barcodes", "--barcodes"), type = "character", help = "Cell barcodes csv file"), + make_option(c("-read1", "--read1"), type = "character", help = "Read 1 fastq.gz"), + make_option(c("-read2", "--read2"), type = "character", help = "Read 2 fastq.gz"), + make_option(c("-samplename", "--samplename"), type = "character", help = "Name to use for sample"), + make_option(c("-bs1", "--bs1"), type = "integer", help = "Barcode start in Read 1"), + make_option(c("-bl1", "--bl1"), type = "integer", help = "Barcode length in Read 1"), + make_option(c("-bs2", "--bs2"), type = "integer", help = "Barcode start in Read 2"), + make_option(c("-bl2", "--bl2"), type = "integer", help = "Barcode length in Read 2"), + make_option(c("-us", "--us"), type = "integer", help = "UMI start in Read 2"), + make_option(c("-ul", "--ul"), type = "integer", help = "UMI length in Read 2"), + make_option(c("-rmlow", "--rmlow"), type = "logical", help = "Remove reads with N in barcode or UMI"), + make_option(c("-rmN", "--rmN"), type = "logical", help = "Remove reads with low quality"), + make_option(c("-minq", "--minq"), type = "integer", help = "Minimum read quality"), + make_option(c("-numbq", "--numbq"), type = "integer", help = "Maximum number of bases below minq"), + make_option(c("-stnd", "--stnd"), type = "logical", help = "Perform strand-specific mapping"), + make_option(c("-max_mis", "--max_mis"), type = "integer", help = "Maximum mismatch allowed in barcode"), + make_option(c("-UMI_cor", "--UMI_cor"), type = "integer", help = "Correct UMI sequence error"), + make_option(c("-gene_fl", "--gene_fl"), type = "logical", help = "Remove low abundant genes"), + make_option(c("-max_reads", "--max_reads"), type = "integer", help = "Maximum reads processed"), + make_option(c("-min_count", "--min_count"), type = "integer", help = "Minimum count to keep"), + make_option(c("-metrics_matrix", "--metrics_matrix"), type = "logical", help = "QC metrics matrix"), + make_option(c("-keep_outliers", "--keep_outliers"), type = "logical", help = "Keep outlier cells"), + make_option(c("-report", "--report"), type = "logical", help = "HTML report of plots"), + make_option(c("-rdata", "--rdata"), type = "logical", help = "Output RData file"), + make_option(c("-nthreads", "--nthreads"), type = "integer", help = "Number of threads") ) +parser <- OptionParser(usage = "%prog [options] file", option_list = option_list) +args <- parse_args(parser) + +bam <- args$bam +fa_fn <- args$fasta +anno_fn <- args$exons +fq_r1 <- args$read1 +fq_r2 <- args$read2 +read_structure <- list(bs1 = args$bs1, # barcode start position in fq_r1, -1 indicates no barcode + bl1 = args$bl1, # barcode length in fq_r1, 0 since no barcode present + bs2 = args$bs2, # barcode start position in fq_r2 + bl2 = args$bl2, # barcode length in fq_r2 + us = args$us, # UMI start position in fq_r2 + ul = args$ul # UMI length + ) + if (args$us == -1) { - has_umi = FALSE + has_umi <- FALSE } else { - has_umi = TRUE + has_umi <- TRUE } -filter_settings=list(rmlow=args$rmlow, rmN=args$rmN, minq=args$minq, numbq=args$numbq) +filter_settings <- list(rmlow = args$rmlow, + rmN = args$rmN, + minq = args$minq, + numbq = args$numbq) # Outputs -out_dir = "." -mapped_bam = file.path(out_dir, "aligned.mapped.bam") +out_dir <- "." +mapped_bam <- file.path(out_dir, "aligned.mapped.bam") # if input is fastqs if (!is.null(fa_fn)) { - fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index")) - combined_fastq = file.path(out_dir, "combined.fastq") - aligned_bam = file.path(out_dir, "aligned.bam") + fasta_index <- file.path(out_dir, paste0(fa_fn, ".fasta_index")) + combined_fastq <- file.path(out_dir, "combined.fastq") + aligned_bam <- file.path(out_dir, "aligned.bam") print("Trimming barcodes") sc_trim_barcode(combined_fastq, - fq_R1, - fq_R2, - read_structure=read_structure, - filter_settings=filter_settings) + fq_r1, + fq_r2, + read_structure = read_structure, + filter_settings = filter_settings) print("Building genome index") - Rsubread::buildindex(basename=fasta_index, reference=fa_fn) + Rsubread::buildindex(basename = fasta_index, reference = fa_fn) print("Aligning reads to genome") - Rsubread::align(index=fasta_index, - readfile1=combined_fastq, - output_file=aligned_bam, - nthreads=args$nthreads) + Rsubread::align(index = fasta_index, + readfile1 = combined_fastq, + output_file = aligned_bam, + nthreads = args$nthreads) if (!is.null(args$barcodes)) { - barcode_anno=args$barcodes + barcode_anno <- args$barcodes } else { print("Detecting barcodes") # detect 10X barcodes and generate sample_index.csv file - barcode_anno = "sample_index.csv" - sc_detect_bc( - infq=combined_fastq, - outcsv=barcode_anno, - bc_len=read_structure$bl2, - max_reads=args$max_reads, - min_count=args$min_count, - max_mismatch=args$max_mis + barcode_anno <- "sample_index.csv" + sc_detect_bc(infq = combined_fastq, + outcsv = barcode_anno, + bc_len = read_structure$bl2, + max_reads = args$max_reads, + min_count = args$min_count, + max_mismatch = args$max_mis ) } } else { - aligned_bam = file.path(out_dir, bam) - barcode_anno=args$barcodes + aligned_bam <- file.path(out_dir, bam) + barcode_anno <- args$barcodes } print("Assigning reads to exons") -sc_exon_mapping(aligned_bam, mapped_bam, anno_fn, bc_len=read_structure$bl2, UMI_len=read_structure$ul, stnd=args$stnd) +sc_exon_mapping(aligned_bam, mapped_bam, anno_fn, bc_len = read_structure$bl2, UMI_len = read_structure$ul, stnd = args$stnd) print("De-multiplexing data") -sc_demultiplex(mapped_bam, out_dir, barcode_anno, has_UMI=has_umi) +sc_demultiplex(mapped_bam, out_dir, barcode_anno, has_UMI = has_umi) print("Counting genes") -sc_gene_counting(out_dir, barcode_anno, UMI_cor=args$UMI_cor, gene_fl=args$gene_fl) +sc_gene_counting(out_dir, barcode_anno, UMI_cor = args$UMI_cor, gene_fl = args$gene_fl) print("Performing QC") sce <- create_sce_by_dir(out_dir) pdf("plots.pdf") plot_demultiplex(sce) if (has_umi) { - p = plot_UMI_dup(sce) + p <- plot_UMI_dup(sce) print(p) } -sce = calculate_QC_metrics(sce) -sce = detect_outlier(sce) -p = plot_mapping(sce, percentage=TRUE, dataname=args$samplename) +sce <- calculate_QC_metrics(sce) +sce <- detect_outlier(sce) +p <- plot_mapping(sce, percentage = TRUE, dataname = args$samplename) print(p) -p = plot_QC_pairs(sce) +p <- plot_QC_pairs(sce) print(p) dev.off() print("Removing outliers") if (is.null(args$keep_outliers)) { - sce = remove_outliers(sce) + sce <- remove_outliers(sce) gene_counts <- counts(sce) - write.table(data.frame("gene_id"=rownames(gene_counts), gene_counts), file="gene_count.tsv", sep="\t", quote=FALSE, row.names=FALSE) + write.table(data.frame("gene_id" = rownames(gene_counts), gene_counts), file = "gene_count.tsv", sep = "\t", quote = FALSE, row.names = FALSE) } if (!is.null(args$metrics_matrix)) { - metrics <- colData(sce, internal=TRUE) - write.table(data.frame("cell_id"=rownames(metrics), metrics), file="metrics_matrix.tsv", sep="\t", quote=FALSE, row.names=FALSE) + metrics <- colData(sce, internal = TRUE) + write.table(data.frame("cell_id" = rownames(metrics), metrics), file = "metrics_matrix.tsv", sep = "\t", quote = FALSE, row.names = FALSE) } if (!is.null(args$report) & (!is.null(fa_fn))) { print("Creating report") - create_report(sample_name=args$samplename, - outdir=out_dir, - r1=fq_R1, - r2=fq_R2, - outfq=combined_fastq, - read_structure=read_structure, - filter_settings=filter_settings, - align_bam=aligned_bam, - genome_index=fasta_index, - map_bam=mapped_bam, - exon_anno=anno_fn, - stnd=args$stnd, - fix_chr=FALSE, - barcode_anno=barcode_anno, - max_mis=args$max_mis, - UMI_cor=args$UMI_cor, - gene_fl=args$gene_fl, - organism=args$organism, - gene_id_type="ensembl_gene_id") + create_report(sample_name = args$samplename, + outdir = out_dir, + r1 = fq_r1, + r2 = fq_r2, + outfq = combined_fastq, + read_structure = read_structure, + filter_settings = filter_settings, + align_bam = aligned_bam, + genome_index = fasta_index, + map_bam = mapped_bam, + exon_anno = anno_fn, + stnd = args$stnd, + fix_chr = FALSE, + barcode_anno = barcode_anno, + max_mis = args$max_mis, + UMI_cor = args$UMI_cor, + gene_fl = args$gene_fl, + organism = args$organism, + gene_id_type = "ensembl_gene_id") } -if (!is.null(args$rdata) ) { - save(sce, file = file.path(out_dir,"scPipe_analysis.RData")) +if (!is.null(args$rdata)) { + save(sce, file = file.path(out_dir, "scPipe_analysis.RData")) } sessionInfo()