diff scpipe.R @ 2:5c4bca9dd4a2 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 60e2a9e9129a22924c55b11b218b39d913c7e686
author iuc
date Mon, 14 Jan 2019 08:06:47 -0500
parents 32e1bfc6b7b2
children 7397e6badc11
line wrap: on
line diff
--- a/scpipe.R	Fri Aug 17 08:22:49 2018 -0400
+++ b/scpipe.R	Mon Jan 14 08:06:47 2019 -0500
@@ -18,8 +18,10 @@
 })
 
 option_list <- list(
+    make_option(c("-bam","--bam"), type="character", help="BAM file"),
     make_option(c("-fasta","--fasta"), type="character", help="Genome fasta file"),
     make_option(c("-exons","--exons"), type="character", help="Exon annotation gff3 file"),
+    make_option(c("-organism","--organism"), type="character", help="Organism e.g. hsapiens_gene_ensembl"),
     make_option(c("-barcodes","--barcodes"), type="character", help="Cell barcodes csv file"),
     make_option(c("-read1","--read1"), type="character", help="Read 1 fastq.gz"),
     make_option(c("-read2","--read2"), type="character", help="Read 2 fastq.gz"),
@@ -40,6 +42,8 @@
     make_option(c("-gene_fl","--gene_fl"), type="logical", help="Remove low abundant genes"),
     make_option(c("-max_reads","--max_reads"), type="integer", help="Maximum reads processed"),
     make_option(c("-min_count","--min_count"), type="integer", help="Minimum count to keep"),
+    make_option(c("-metrics_matrix","--metrics_matrix"), type="logical", help="QC metrics matrix"),
+    make_option(c("-keep_outliers","--keep_outliers"), type="logical", help="Keep outlier cells"),
     make_option(c("-report","--report"), type="logical", help="HTML report of plots"),
     make_option(c("-rdata","--rdata"), type="logical", help="Output RData file"),
     make_option(c("-nthreads","--nthreads"), type="integer", help="Number of threads")
@@ -48,6 +52,7 @@
 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 args = parse_args(parser)
 
+bam = args$bam
 fa_fn = args$fasta
 anno_fn = args$exons
 fq_R1 = args$read1
@@ -71,42 +76,48 @@
 
 # Outputs
 out_dir = "."
-fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index"))
-combined_fastq = file.path(out_dir, "combined.fastq")
-aligned_bam = file.path(out_dir, "aligned.bam")
 mapped_bam = file.path(out_dir, "aligned.mapped.bam")
 
+# if input is fastqs
+if (!is.null(fa_fn)) {
+  fasta_index = file.path(out_dir, paste0(fa_fn, ".fasta_index"))
+  combined_fastq = file.path(out_dir, "combined.fastq")
+  aligned_bam = file.path(out_dir, "aligned.bam")
 
-print("Trimming barcodes")
-sc_trim_barcode(combined_fastq,
-                fq_R1,
-                fq_R2,
-                read_structure=read_structure,
-                filter_settings=filter_settings)
+  print("Trimming barcodes")
+  sc_trim_barcode(combined_fastq,
+                  fq_R1,
+                  fq_R2,
+                  read_structure=read_structure,
+                  filter_settings=filter_settings)
 
-print("Building genome index")
-Rsubread::buildindex(basename=fasta_index, reference=fa_fn)
+  print("Building genome index")
+  Rsubread::buildindex(basename=fasta_index, reference=fa_fn)
 
-print("Aligning reads to genome")
-Rsubread::align(index=fasta_index,
-    readfile1=combined_fastq,
-    output_file=aligned_bam,
-    nthreads=args$nthreads)
+  print("Aligning reads to genome")
+  Rsubread::align(index=fasta_index,
+      readfile1=combined_fastq,
+      output_file=aligned_bam,
+      nthreads=args$nthreads)
 
-if (!is.null(args$barcodes)) {
-  barcode_anno=args$barcodes
+  if (!is.null(args$barcodes)) {
+    barcode_anno=args$barcodes
+  } else {
+    print("Detecting barcodes")
+    # detect 10X barcodes and generate sample_index.csv file
+    barcode_anno = "sample_index.csv"
+    sc_detect_bc(
+        infq=combined_fastq,
+        outcsv=barcode_anno,
+        bc_len=read_structure$bl2,
+        max_reads=args$max_reads,
+        min_count=args$min_count,
+        max_mismatch=args$max_mis
+    )
+  }
 } else {
-  print("Detecting barcodes")
-  # detect 10X barcodes and generate sample_index.csv file
-  barcode_anno = "sample_index.csv"
-  sc_detect_bc(
-      infq=combined_fastq,
-      outcsv=barcode_anno,
-      bc_len=read_structure$bl2,
-      max_reads=args$max_reads,
-      min_count=args$min_count,
-      max_mismatch=args$max_mis
-  )
+   aligned_bam = file.path(out_dir, bam)
+   barcode_anno=args$barcodes
 }
 
 print("Assigning reads to exons")
@@ -118,32 +129,59 @@
 print("Counting genes")
 sc_gene_counting(out_dir, barcode_anno, UMI_cor=args$UMI_cor, gene_fl=args$gene_fl)
 
-print("Creating SingleCellExperiment object")
+print("Performing QC")
 sce <- create_sce_by_dir(out_dir)
+pdf("plots.pdf")
+plot_demultiplex(sce)
+if (has_umi) {
+  p = plot_UMI_dup(sce)
+  print(p)
+}
+sce = calculate_QC_metrics(sce)
+sce = detect_outlier(sce)
+p = plot_mapping(sce, percentage=TRUE, dataname=args$samplename)
+print(p)
+p = plot_QC_pairs(sce)
+print(p)
+dev.off()
+
+print("Removing outliers")
+if (is.null(args$keep_outliers)) {
+  sce = remove_outliers(sce)
+  gene_counts <- counts(sce)
+  write.table(data.frame("gene_id"=rownames(gene_counts), gene_counts), file="gene_count.tsv", sep="\t", quote=FALSE, row.names=FALSE)
+}
 
-if (!is.null(args$report)) {
-print("Creating report")
-create_report(sample_name=args$samplename,
-   outdir=out_dir,
-   r1=fq_R1,
-   r2=fq_R2,
-   outfq=combined_fastq,
-   read_structure=read_structure,
-   filter_settings=filter_settings,
-   align_bam=aligned_bam,
-   genome_index=fasta_index,
-   map_bam=mapped_bam,
-   exon_anno=anno_fn,
-   stnd=args$stnd,
-   fix_chr=FALSE,
-   barcode_anno=barcode_anno,
-   max_mis=args$max_mis,
-   UMI_cor=args$UMI_cor,
-   gene_fl=args$gene_fl)
+if (!is.null(args$metrics_matrix)) {
+  metrics <- colData(sce, internal=TRUE)
+  write.table(data.frame("cell_id"=rownames(metrics), metrics), file="metrics_matrix.tsv", sep="\t", quote=FALSE, row.names=FALSE)
+}
+
+if (!is.null(args$report) & (!is.null(fa_fn))) {
+  print("Creating report")
+  create_report(sample_name=args$samplename,
+     outdir=out_dir,
+     r1=fq_R1,
+     r2=fq_R2,
+     outfq=combined_fastq,
+     read_structure=read_structure,
+     filter_settings=filter_settings,
+     align_bam=aligned_bam,
+     genome_index=fasta_index,
+     map_bam=mapped_bam,
+     exon_anno=anno_fn,
+     stnd=args$stnd,
+     fix_chr=FALSE,
+     barcode_anno=barcode_anno,
+     max_mis=args$max_mis,
+     UMI_cor=args$UMI_cor,
+     gene_fl=args$gene_fl,
+     organism=args$organism,
+     gene_id_type="ensembl_gene_id")
 }
 
 if (!is.null(args$rdata) ) {
-    save(sce, file = file.path(out_dir,"scPipe_analysis.RData"))
+  save(sce, file = file.path(out_dir,"scPipe_analysis.RData"))
 }
 
 sessionInfo()