view scpipe.xml @ 2:5c4bca9dd4a2 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scpipe commit 60e2a9e9129a22924c55b11b218b39d913c7e686
author iuc
date Mon, 14 Jan 2019 08:06:47 -0500
parents 4ec6717872b1
children 3ffca09599ca
line wrap: on
line source

<tool id="scpipe" name="scPipe" version="1.0.0+galaxy1">
    <description>- preprocessing pipeline for single cell RNA-seq</description>
    <!-- Packages required by the tool, each pinned to an exact version.
         The package names indicate R / Bioconductor libraries. -->
    <requirements>
        <requirement type="package" version="1.0.0">bioconductor-scpipe</requirement>
        <requirement type="package" version="1.28.1">bioconductor-rsubread</requirement>
        <!-- rhtslib can be removed with a newer scpipe package -->
        <requirement type="package" version="1.10.0">bioconductor-rhtslib</requirement>
        <requirement type="package" version="1.20">r-knitr</requirement>
        <requirement type="package" version="1.10">r-rmarkdown</requirement>
        <requirement type="package" version="1.1.1">r-readr</requirement>
        <requirement type="package" version="4.7.1">r-plotly</requirement>
        <requirement type="package" version="0.4">r-dt</requirement>
        <requirement type="package" version="1.6.0">bioconductor-scater</requirement>
        <requirement type="package" version="1.6.2">bioconductor-scran</requirement>
        <requirement type="package" version="0.13">r-rtsne</requirement>
        <!-- Using older version of ggplot2 as getting error like this with 3.0.0:
        https://github.com/ggobi/ggally/issues/263  -->
        <requirement type="package" version="2.2.1">r-ggplot2</requirement>
        <requirement type="package" version="1.6.0">r-optparse</requirement>
    </requirements>
    <!-- Prints one line of version information: the R version line (with any line
         containing "GNU" dropped), followed by the version of each R package
         loaded below. For every package a separate R process loads the library and
         prints the Version field from sessionInfo(); stderr is discarded and
         lines containing "WARNING: " are filtered out of the output. -->
    <version_command><![CDATA[
echo $(R --version | grep version | grep -v GNU)", scPipe version" $(R --vanilla --slave -e "library(scPipe); cat(sessionInfo()\$otherPkgs\$scPipe\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", Rsubread version" $(R --vanilla --slave -e "library(Rsubread); cat(sessionInfo()\$otherPkgs\$Rsubread\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", knitr version" $(R --vanilla --slave -e "library(knitr); cat(sessionInfo()\$otherPkgs\$knitr\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rmarkdown version" $(R --vanilla --slave -e "library(rmarkdown); cat(sessionInfo()\$otherPkgs\$rmarkdown\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", readr version" $(R --vanilla --slave -e "library(readr); cat(sessionInfo()\$otherPkgs\$readr\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", plotly version" $(R --vanilla --slave -e "library(plotly); cat(sessionInfo()\$otherPkgs\$plotly\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", DT version" $(R --vanilla --slave -e "library(DT); cat(sessionInfo()\$otherPkgs\$DT\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scater version" $(R --vanilla --slave -e "library(scater); cat(sessionInfo()\$otherPkgs\$scater\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scran version" $(R --vanilla --slave -e "library(scran); cat(sessionInfo()\$otherPkgs\$scran\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rtsne version" $(R --vanilla --slave -e "library(Rtsne); cat(sessionInfo()\$otherPkgs\$Rtsne\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", ggplot2 version" $(R --vanilla --slave -e "library(ggplot2); cat(sessionInfo()\$otherPkgs\$ggplot2\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", optparse version" $(R --vanilla --slave -e "library(optparse); cat(sessionInfo()\$otherPkgs\$optparse\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
    ]]></version_command>
    <command detect_errors="exit_code"><![CDATA[
#import os
#import re

## Link input files into the working directory under sanitised names: every
## character that is not a word character, a hyphen or whitespace becomes "_".

#if $samples.format_select == "bam":
    #set $bam_name = re.sub('[^\w\-\s]', '_', str($samples.bam.element_identifier))
    ln -s '$samples.bam' '$bam_name' &&
    ## The BAM index is linked next to the BAM as <name>.bai
    ln -s '$samples.bam.metadata.bam_index' '${bam_name}.bai' &&
#else:

    ## FASTA ##

    #if $samples.ref_fasta.fasta_source == "history":
        #set $fasta_name = re.sub('[^\w\-\s]', '_', str($samples.ref_fasta.ref_fa_hist.element_identifier))
        ln -s '$samples.ref_fasta.ref_fa_hist' '$fasta_name' &&
    #else:
        ## Built-in genomes keep the file name recorded in the data table
        #set $fasta_name = os.path.basename(str($samples.ref_fasta.ref_fa_builtin.fields.path))
        ln -s '$samples.ref_fasta.ref_fa_builtin.fields.path' '$fasta_name' &&
    #end if

    ## Reads ##

    #if $samples.paired_format.paired_format_selector == 'paired_collection':
        #set $in1 = $samples.paired_format.paired_input.forward
        #set $in2 = $samples.paired_format.paired_input.reverse
        ## Both mates share the collection name; the reverse read gets an "_R2" suffix
        #set $in1_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.paired_input.name))
        #set $in2_name = re.sub('[^\w\-\s]', '_', str("%s_%s" % ($samples.paired_format.paired_input.name, "R2")))
        ln -s '$in1' '$in1_name' &&
        ln -s '$in2' '$in2_name' &&
    #elif $samples.paired_format.paired_format_selector == 'paired':
        #set $in1 = $samples.paired_format.in1
        #set $in2 = $samples.paired_format.in2
        #set $in1_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.in1.element_identifier))
        #set $in2_name = re.sub('[^\w\-\s]', '_', str($samples.paired_format.in2.element_identifier))
        ln -s '$in1' '$in1_name' &&
        ln -s '$in2' '$in2_name' &&
    #end if
#end if

## GFF3 ##

#set $anno_name = re.sub('[^\w\-\s]', '_', str($exons.element_identifier))
#set $anno_name = $anno_name + ".gff3"
ln -s '${exons}' '$anno_name' &&

## Optionally expose the R script itself as an output
#if $out.rscript:
    cp '$__tool_directory__/scpipe.R' '$out_rscript' &&
#end if

TAB=\$(printf '\t') &&

## Convert the tab-separated barcodes file to comma-separated. The converted
## copy is written to the working directory so that the input dataset itself
## is never modified in place.
#set $barcodes_csv = 'barcodes_comma_separated.csv'
#if $samples.barcodes:
    sed -e "s/\${TAB}/,/g" '$samples.barcodes' > '$barcodes_csv' &&
#end if

## Run scPipe

Rscript '$__tool_directory__/scpipe.R'

#if $samples.format_select == "bam":
    --bam '$bam_name'
    --samplename '$bam_name'
    --barcodes '$barcodes_csv'
#else:
    --fasta '$fasta_name'
    --read1 '$in1_name'
    --read2 '$in2_name'
    --samplename '$in1_name'
    ## The barcodes file is optional for FASTQ input
    #if $samples.barcodes:
        --barcodes '$barcodes_csv'
    #end if
#end if

--exons '$anno_name'
--organism '$organism'

--bs1 $bs1
--bl1 $bl1
--bs2 $bs2
--bl2 $bl2
--us $us
--ul $ul

#if $out.metrics_matrix:
    --metrics_matrix '$out.metrics_matrix'
#end if

#if $out.report:
    --report '$out.report'
#end if

#if $out.rdata:
    --rdata '$out.rdata'
#end if

--rmlow $adv.f.rmlow
--rmN $adv.f.rmN
--minq $adv.f.minq
--numbq $adv.f.numbq
--max_mis $adv.f.max_mis
--max_reads $adv.f.max_reads
--min_count $adv.f.min_count

--UMI_cor $adv.UMI_cor
--stnd $adv.stnd
--gene_fl $adv.gene_fl

--nthreads \${GALAXY_SLOTS:-2}

## When outliers are kept, gene_count.csv is converted to the tab-separated
## gene_count.tsv that is collected as the count matrix output.
#if $keep_outliers:
    --keep_outliers '$keep_outliers'
    && sed -e "s/,/\${TAB}/g" gene_count.csv > gene_count.tsv
#end if

]]></command>

    <inputs>
        <!-- Sample input: either paired FASTQ reads plus a reference FASTA, or a BAM file -->
        <conditional name="samples">
            <param name="format_select" type="select" label="FASTQs or BAM" help="Select the format of the input sample">
                <option value="fastq" selected="True">FASTQ</option>
                <option value="bam">BAM</option>
            </param>
            <when value="bam">
                <param name="bam" type="data" format="bam" label="BAM files"/>
                <param name="barcodes" type="data" format="tabular,tsv" label="Cell barcodes file" help="File of cell barcodes. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/>
            </when>
            <when value="fastq">
                <conditional name="ref_fasta">
                    <param name="fasta_source" type="select" label="Reference genome FASTA">
                        <option value="cached" selected="true">Use a built-in genome</option>
                        <option value="history">Use a FASTA from history</option>
                    </param>
                    <when value="cached">
                        <param name="ref_fa_builtin" type="select" label="Select a built-in FASTA" help="If your genome of interest is not listed, contact your Galaxy administrator">
                            <options from_data_table="all_fasta">
                                <filter type="sort_by" column="2" />
                                <validator type="no_options" message="No FASTA is available for the selected input dataset" />
                            </options>
                        </param>
                    </when>
                    <when value="history">
                        <param name="ref_fa_hist" type="data" format="fasta" label="Select a history FASTA" />
                    </when>
                </conditional>
                <conditional name="paired_format">
                    <param name="paired_format_selector" type="select" label="Paired reads or Paired collection">
                        <option value="paired">Paired</option>
                        <option value="paired_collection">Paired Collection</option>
                    </param>
                    <when value="paired">
                        <param name="in1" type="data" format="fastq.gz,fastq" label="Input Read 1" help="Read 1 should contain the transcripts in fastq.gz format"/>
                        <param name="in2" type="data" format="fastq.gz,fastq" label="Input Read 2" help="Read 2 should contain UMI and barcodes in fastq.gz format"/>
                    </when>
                    <when value="paired_collection">
                        <param name="paired_input" type="data_collection" collection_type="paired" format="fastq.gz,fastq" label="Select paired collection(s)"/>
                    </when>
                </conditional>
                <!-- Unlike the BAM branch, the barcodes file is optional here -->
                <param name="barcodes" type="data" format="tabular,tsv" optional="True" label="Cell barcodes file" help="Optional file of cell barcodes. If not provided the barcodes will be detected from the reads. Should contain at least two columns, where the first column has the cell id and the second column contains the barcode sequence."/>
            </when>
        </conditional>
        <param name="exons" type="data" format="gff3" label="Exon annotation GFF3 file" help="Current supported source is ENSEMBL"/>
        <param name="organism" type="text" label="Species gene id" help="This must be in biomaRt ENSEMBL listDatasets() format e.g. hsapiens_gene_ensembl. See the biomaRt user guide here: https://www.bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/biomaRt.html">
            <validator type="empty_field" />
            <!-- The message lists exactly what the pattern accepts: word characters and parentheses -->
            <validator type="regex" message="Only letters, digits, underscores and parentheses are allowed">^[\(\w\)]+$</validator>
        </param>
        <!-- Read structure: barcode and UMI positions (0-indexed; -1 start / 0 length mean "not present") -->
        <param argument="--bs1" type="integer" min="-1" value="-1" label="Barcode start Read 1" help="Barcode start position in Read 1. Positions are 0-indexed so the first base is considered base 0, -1 indicates no barcode. Default: -1" />
        <param argument="--bl1" type="integer" min="0" value="0" label="Barcode length Read 1" help="Barcode length in Read 1, 0 if no barcode present. Default: 0" />
        <param argument="--bs2" type="integer" min="-1" value="6" label="Barcode start Read 2" help="Barcode start position in Read 2. Positions are 0-indexed so the first base is considered base 0, -1 indicates no barcode.  Default: 6" />
        <param argument="--bl2" type="integer" min="0" value="8" label="Barcode length Read 2" help="Barcode length in Read 2, 0 if no barcode present. Default: 8" />
        <param argument="--us" type="integer" min="-1" value="0" label="UMI start Read 2" help="UMI start position in Read 2. Positions are 0-indexed so the first base is considered base 0, -1 indicates no UMI.  Default: 0" />
        <param argument="--ul" type="integer" min="0" value="6" label="UMI length Read 2" help="UMI length in Read 2, 0 if no UMI present. Default: 6" />
        <param name="keep_outliers" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Keep outliers?" help="If this option is set to Yes, outlier cells will not be removed from the gene count matrix. Default: No" />
        <section name="out" title="Output Options">
            <param name="plots" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="True" label="Output PDF with plots?" help="If this option is set to Yes, a PDF containing QC plots will be output. Default: Yes" />
            <param name="metrics_matrix" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output QC metrics matrix?" help="If this option is set a matrix of QC metrics will be output. Default: No" />
            <param name="report" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output HTML Report?" help="Only valid if FASTQs are input. If this option is set to Yes, a HTML report containing QC metrics will be output. Default: No" />
            <param name="rscript" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used to annotate the IDs will be provided as a text file in the output. Default: No" />
            <param name="rdata" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output RData file?"
                help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No">
            </param>
        </section>
        <section name="adv" title="Advanced Options">
            <section name="f" title="FASTQ input only">
                <!-- rmlow filters on read quality; rmN filters on N bases in the barcode or UMI -->
                <param argument="--rmlow" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with low quality" help="Default: Yes" />
                <param argument="--rmN" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Remove reads with N in barcode or UMI" help="Default: Yes" />
                <param argument="--minq" type="integer" min="0" value="20" label="Minimum read quality" help="Default: 20" />
                <param argument="--numbq" type="integer" min="0" value="2" label="Maximum number of bases below minq" help="Default: 2" />
                <param argument="--max_mis" type="integer" min="0" value="1" label="Maximum mismatch allowed in barcode" help="Default: 1" />

                <param argument="--max_reads" type="integer" min="0" value="1000000" label="Maximum reads processed" help="Maximum reads processed if detecting barcodes. Default: 1,000,000" />
                <param argument="--min_count" type="integer" min="0" value="10" label="Minimum count to keep" help="Minimum count to keep if detecting barcodes. Barcode will be discarded if it has lower count. This should be set according to --max_reads. Default: 10" />
            </section>
            <param argument="--UMI_cor" type="integer" min="0" value="1" label="Correct UMI sequence error" help="0 means no correction, 1 means simple correction and merge UMI with distance 1. Default: 1" />
            <param argument="--stnd" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Perform strand-specific mapping" help="Default: Yes" />
            <param argument="--gene_fl" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove low abundant genes" help="Low abundant is defined as only one copy of one UMI for this gene. Default: No" />
        </section>
    </inputs>

    <outputs>
        <!-- The count matrix has no filter and is always produced. Every other output
             is switched on by a boolean in the "out" section, so each filter must
             look the value up through that section (out['name']); a bare name
             is not defined when the filter is evaluated. -->
        <data name="out_matrix" format="tabular" from_work_dir="gene_count.tsv" label="${tool.name} on ${on_string}: Count Matrix" />
        <data name="out_plots" format="pdf" from_work_dir="plots.pdf" label="${tool.name} on ${on_string}: Plots">
            <filter>out['plots']</filter>
        </data>
        <data name="out_metrics_matrix" format="tabular" from_work_dir="metrics_matrix.tsv" label="${tool.name} on ${on_string}: QC metrics matrix">
            <filter>out['metrics_matrix']</filter>
        </data>
        <data name="out_report" format="html" from_work_dir="report.nb.html" label="${tool.name} on ${on_string}: HTML Report" >
            <filter>out['report']</filter>
        </data>
        <data name="out_rscript" format="txt" from_work_dir="out_rscript.txt" label="${tool.name} on ${on_string}: Rscript">
            <filter>out['rscript']</filter>
        </data>
        <data name="out_rdata" format="rdata" from_work_dir="scPipe_analysis.RData" label="${tool.name} on ${on_string}: RData file">
            <filter>out['rdata']</filter>
        </data>
    </outputs>

    <tests>
        <!-- Ensure outputs work: paired FASTQ input with a FASTA from the history,
             no barcodes file supplied, and the HTML report switched on -->
        <test>
            <param name="format_select" value="fastq" />
            <param name="fasta_source" value="history"/>
            <param name="ref_fa_hist" ftype="fasta" value="mm10_MT19.fa.gz"/>
            <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
            <param name="organism" value="mmusculus_gene_ensembl"/>
            <param name="paired_format_selector" value="paired" />
            <param name="in1" ftype="fastqsanger.gz" value="CB51_MT19_R1.gz"/>
            <param name="in2" ftype="fastqsanger.gz" value="CB51_MT19_R2.gz"/>
            <param name="us" value="-1"/>
            <param name="max_reads" value="5000000"/>
            <param name="min_count" value="100"/>
            <param name="report" value="True" />
            <output name="out_matrix" >
                <assert_contents>
                    <has_text text="ENSMUSG00000024940" />
                </assert_contents>
            </output>
            <output name="out_report" >
                <assert_contents>
                    <has_text text="scPipe report for sample" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure built-in fasta works: same reads, tagged with dbkey mm10,
             with the reference taken from the cached genomes -->
        <test>
            <param name="format_select" value="fastq" />
            <param name="fasta_source" value="cached"/>
            <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
            <param name="organism" value="mmusculus_gene_ensembl"/>
            <param name="paired_format_selector" value="paired" />
            <param name="in1" ftype="fastqsanger.gz" dbkey="mm10" value="CB51_MT19_R1.gz"/>
            <param name="in2" ftype="fastqsanger.gz" dbkey="mm10" value="CB51_MT19_R2.gz"/>
            <param name="us" value="-1"/>
            <param name="max_reads" value="5000000"/>
            <param name="min_count" value="100"/>
            <output name="out_matrix" >
                <assert_contents>
                    <has_text text="ENSMUSG00000064351" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure BAM input works: BAM file plus a barcodes file -->
        <test>
            <param name="format_select" value="bam" />
            <param name="bam" ftype="bam" value="aligned.mapped.bam"/>
            <param name="barcodes" ftype="tabular" value="barcode_anno.tsv"/>
            <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
            <param name="organism" value="mmusculus_gene_ensembl"/>
            <param name="us" value="-1"/>
            <output name="out_matrix" >
                <assert_contents>
                    <has_text text="ENSMUSG00000064351" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure BAM input with QC outputs works: checks the header line of the
             metrics matrix and compares the plots PDF by size only -->
        <test>
            <param name="format_select" value="bam" />
            <param name="bam" ftype="bam" value="aligned.mapped.bam"/>
            <param name="barcodes" ftype="tabular" value="barcode_anno.tsv"/>
            <param name="exons" ftype="gff3" value="mm10_MT19.gff3.gz"/>
            <param name="organism" value="mmusculus_gene_ensembl"/>
            <param name="us" value="-1"/>
            <param name="plots" value="True"/>
            <param name="metrics_matrix" value="True"/>
            <output name="out_matrix" >
                <assert_contents>
                    <has_text text="ENSMUSG00000064351" />
                </assert_contents>
            </output>
            <output name="out_metrics_matrix" >
                <assert_contents>
                    <has_line_matching expression="cell_id.*unaligned.*aligned_unmapped.*mapped_to_exon.*mapped_to_intron.*ambiguous_mapping.*mapped_to_ERCC.*mapped_to_MT.*number_of_genes.*total_count_per_cell.*non_mt_percent\toutliers" />
                </assert_contents>
            </output>
            <output name="out_plots" ftype="pdf" value="plots.pdf" compare="sim_size" />
        </test>

    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

scPipe_ is an `R/Bioconductor package`_ that integrates barcode demultiplexing, read alignment, UMI-aware gene-level quantification and quality control of raw sequencing data generated by multiple protocols that include CEL-seq, MARS-seq, Chromium 10X, Drop-seq and Smart-seq. scPipe produces a count matrix that is essential for downstream analysis along with QC metrics and a HTML report that summarises data quality. These results can be used as input for downstream analyses including normalization, visualization and statistical testing.
The scPipe workflow is described in this vignette_ and examples of the report output can be found here_. Note that outlier cells are detected and removed by default but they can be kept if "Keep outliers?" is selected.

-----

**Inputs**

Either
    * Reference genome in FASTA format
    * Paired-end FASTQ.GZ reads
    * Cell barcodes TAB-separated file (Optional)
OR
    * BAM file
    * Cell barcodes TAB-separated file
AND
    * Exon annotation in ENSEMBL GFF3 format

*Read Structure*

The default read structure represents CEL-seq
paired-end reads, with one cell barcode in Read 2 starting at
6bp and the UMI sequence in Read 2 starting at the first bp. So the
read structure will be : `bs1=-1, bl1=0, bs2=6, bl2=8, us=0,
ul=6`. `bs1=-1, bl1=0` means we don't have index in Read 1 so we
set a negative value to start position and zero to the length.
`bs2=6, bl2=8` means we have index in Read 2 which starts at 6bp
with 8bp length. `us=0, ul=6` means we have UMI from the
start of Read 2 and the length is 6bp. NOTE: the zero
based index is used so the index of the sequence starts from zero. For a
typical Drop-seq experiment the setting will be `bs1=-1,
bl1=0, bs2=0, bl2=12, us=12, ul=8`, which means Read 1 only
contains transcript and the first 12bp in Read 2 are index,
followed by 8bp UMIs.

-----

**Outputs**

    * Count matrix of genes in Tabular format

Optionally you can choose to output

    * PDF of QC Plots (default is Yes)
    * QC metrics matrix
    * HTML report (if FASTQs are input)
    * Rscript
    * RData

.. _scPipe: http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006361
.. _R/Bioconductor package: https://bioconductor.org/packages/release/bioc/html/scPipe.html
.. _vignette: https://bioconductor.org/packages/release/bioc/vignettes/scPipe/inst/doc/scPipe_tutorial.html
.. _here: http://bioinf.wehi.edu.au/scPipe/

]]></help>
    <!-- DOI of the scPipe publication (the same article linked from the help text) -->
    <citations>
        <citation type="doi">10.1371/journal.pcbi.1006361</citation>
    </citations>
</tool>