Mercurial > repos > artbio > rsem
diff rsem-bwt2.xml @ 0:e5e836936d60 draft
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit d84a0359354698a4b29df12ab581c2618bffcf80
author | artbio |
---|---|
date | Sat, 31 Mar 2018 21:30:07 -0400 |
parents | |
children | 49795544dac7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem-bwt2.xml Sat Mar 31 21:30:07 2018 -0400 @@ -0,0 +1,1427 @@ +<tool id="rsembowtie2" name="RSEM-Bowtie2" version="0.4.0"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <requirements> + <requirement type="package" version="1.3.0">rsem</requirement> + <requirement type="package" version="2.3.4">bowtie2</requirement> + </requirements> + <stdio> + <exit_code range="1:" level="warning" description="Tool exception" /> + </stdio> + <command detect_errors="exit_code"><![CDATA[ + #if $job.select_job == "index": + echo ${job.reference_name} " " | tee $reference_file && + mkdir $reference_file.files_path && + rsem-prepare-reference + #if $job.polya.polya_use == 'add': + #if $job.polya.polya_length: + --polyA-length $job.polya.polya_length + #end if + #elif $job.polya.polya_use == 'subset': + --no-polyA-subset $job.polya.no_polya_subset + #if $job.polya.polya_length: + --polyA-length $job.polya.polya_length + #end if + #elif $job.polya.polya_use == 'none': + --no-polyA + #end if + $job.ntog + #if $job.transcript_to_gene_map: + --transcript-to-gene-map $job.transcript_to_gene_map + #end if + --bowtie2 + #if $job.self_reference.ref_type == 'transcripts': + $job.self_reference.reference_fasta_file + #else: + --gtf $job.self_reference.gtf + $job.self_reference.reference_fasta_file + #end if + ${reference_file.files_path}/${job.reference_name} + > ${reference_file.files_path}/${job.reference_name}.log + #end if + + #if $job.select_job == "index" and $run_rsem.select == "Yes": + && + #end if + + #if $run_rsem.select == "Yes": + ## uncompress fastq.gz or fastqsanger.gz if needed (FASTQ input only: the 'fastq' conditional is not defined for FASTA or SAM/BAM input, so test the format first) + #if $run_rsem.input.format=="fastq" and $run_rsem.input.fastq.matepair=="single": + #if $run_rsem.input.fastq.singlefastq.is_of_type('fastq.gz') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger.gz'): + gunzip < '$run_rsem.input.fastq.singlefastq' > uncomp_single.fastq && + #elif $run_rsem.input.fastq.singlefastq.is_of_type('fastq') or 
$run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger'): + ln -f -s '$run_rsem.input.fastq.singlefastq' 'uncomp_single.fastq' && + #end if + #elif $run_rsem.input.format=="fastq" and $run_rsem.input.fastq.matepair=="paired": + #if $run_rsem.input.fastq.fastq1.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger.gz'): + gunzip < '$run_rsem.input.fastq.fastq1' > uncomp_pair1.fastq && + gunzip < '$run_rsem.input.fastq.fastq2' > uncomp_pair2.fastq && + #elif $run_rsem.input.fastq.fastq1.is_of_type('fastq') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger'): + ln -f -s '$run_rsem.input.fastq.fastq1' 'uncomp_pair1.fastq' && + ln -f -s '$run_rsem.input.fastq.fastq2' 'uncomp_pair2.fastq' && + #end if + #end if + rsem-calculate-expression + ## --tag string + #if $run_rsem.seedlength: + --seed-length $run_rsem.seedlength + #end if + --forward-prob $run_rsem.forward_prob + #if $run_rsem.rsem_options.fullparams == 'fullset': + ## Fragment info + #if $run_rsem.rsem_options.fragment_length_mean: + --fragment-length-mean $run_rsem.rsem_options.fragment_length_mean + #end if + #if $run_rsem.rsem_options.fragment_length_min: + --fragment-length-min $run_rsem.rsem_options.fragment_length_min + #end if + #if $run_rsem.rsem_options.fragment_length_sd: + --fragment-length-sd $run_rsem.rsem_options.fragment_length_sd + #end if + #if $run_rsem.rsem_options.fragment_length_max: + --fragment-length-max $run_rsem.rsem_options.fragment_length_max + #end if + ## RSPD + #if $run_rsem.rsem_options.rspd.estimate == 'yes': + --estimate-rspd + #if $run_rsem.rsem_options.rspd.num_rspd_bins: + --num-rspd-bins $run_rsem.rsem_options.rspd.num_rspd_bins + #end if + #end if + ## Calculate 95% credibility intervals and posterior mean estimates. 
+ #if $run_rsem.rsem_options.useci.ci == 'yes': + --calc-ci + #if $run_rsem.rsem_options.useci.cimem: + --ci-memory $run_rsem.rsem_options.useci.cimem + #end if + #end if + #end if + --num-threads \${GALAXY_SLOTS:-4} + --bowtie2 + #if $run_rsem.input.format != 'sam' and $run_rsem.input.bowtie2_options.fullparams == 'fullset': + ## Bowtie params (the bowtie2_options macro is expanded for both FASTQ and FASTA input; SAM/BAM input has none, hence the format test) + #if $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate: + --bowtie2-mismatch-rate $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate + #end if + #if $run_rsem.input.bowtie2_options.bowtie2_k: + --bowtie2-k $run_rsem.input.bowtie2_options.bowtie2_k + #end if + #if $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level: + --bowtie2-sensitivity-level $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level + #end if + #end if + ## Outputs (the sampling_for_bam macro is offered for both the 'default' and 'both' choices, so both branches emit it) + #if $run_rsem.rsem_outputs.result_bams == 'none': + --no-bam-output + #elif $run_rsem.rsem_outputs.result_bams == 'default': + --sort-bam-by-coordinate $run_rsem.rsem_outputs.sampling_for_bam + #else + --sort-bam-by-coordinate + --output-genome-bam + $run_rsem.rsem_outputs.sampling_for_bam + #end if + ## Input data + #if $run_rsem.input.format=="fastq" + $run_rsem.input.fastq_select + #if $run_rsem.input.fastq.matepair=="single": + uncomp_single.fastq + #elif $run_rsem.input.fastq.matepair=="paired": + --paired-end + uncomp_pair1.fastq + uncomp_pair2.fastq + #end if + #elif $run_rsem.input.format=="fasta" + --no-qualities + #if $run_rsem.input.fasta.matepair=="single": + $run_rsem.input.fasta.singlefasta + #elif $run_rsem.input.fasta.matepair=="paired": + --paired-end + $run_rsem.input.fasta.fasta1 + $run_rsem.input.fasta.fasta2 + #end if + #elif $run_rsem.input.format=="sam" + #if $run_rsem.input.matepair=="paired": + --paired-end + #end if + ## --alignments is the documented flag for SAM/BAM/CRAM input and + ## lets RSEM detect the exact format itself, so nothing depends on + ## the dataset extension (an rsem_sam dataset presumably reports + ## neither 'sam' nor 'bam' - TODO confirm against the datatype) + --alignments + $run_rsem.input.rsem_sam + #end if + ## RSEM reference + #if $run_rsem.reference.refSrc == 'history': + 
${run_rsem.reference.rsem_ref.extra_files_path}/${run_rsem.reference.rsem_ref.metadata.reference_name} + #elif $run_rsem.reference.refSrc == 'self': + ${reference_file.files_path}/${job.reference_name} + #end if + ## sample_name: use a hard coded name so we can pull out galaxy outputs + rsem_output + ## direct output into logfile + > $log + #end if + ]]></command> + + <inputs> + <conditional name="job"> + <param name="select_job" type="select" label="rsem reference"> + <option value="index">Build rsem reference</option> + <option value="no-index" selected="true">rsem reference available from history</option> + </param> + <when value="index"> + <conditional name="self_reference"> + <param name="ref_type" type="select" label="Reference transcript source"> + <option value="transcripts">transcript fasta</option> + <option value="genomic">reference genome and gtf</option> + </param> + <when value="transcripts"> + <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file" + help="The files should contain the sequences of transcripts."/> + </when> + <when value="genomic"> + <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file" + help="The file should contain the sequence of an entire genome."/> + <param name="gtf" type="data" format="gtf" label="gtf" + help="extract transcript reference sequences using the gene annotations specified in this GTF" /> + </when> + </conditional> + <param name="transcript_to_gene_map" type="data" format="tabular" optional="true" label="Map of gene ids to transcript (isoform) ids" > + <help> + Each line of should be of the form: gene_id transcript_id ( with the two fields separated by a tab character ) + The map can be obtained from the UCSC table browser + group: Genes and Gene Prediction Tracks + table: knownIsoforms + Without a map: + If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. 
+ Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. + </help> + </param> + <param name="reference_name" type="text" value="rsem_ref_name" label="reference name"> + <help>A one word name for this RSEM reference containing only letters, digits, and underscore characters</help> + <validator type="regex" message="Use only letters, digits, and underscore characters">^\w+$</validator> + </param> + <conditional name="polya"> + <param name="polya_use" type="select" label="PolyA "> + <option value="add" selected="true">Add poly(A) tails to all transcripts</option> + <option value="subset">Exclude poly(A) tails from selected transcripts</option> + <option value="none">Do not add poly(A) tails to any transcripts</option> + </param> + <when value="add"> + <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)"> + <validator type="in_range" message="must be positive " min="1"/> + </param> + </when> + <when value="subset"> + <param name="no_polya_subset" type="data" format="tabular" optional="true" label="List of transcript IDs (one per line) that should should not have polyA tails added."/> + <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. 
(Default: 125)"> + <validator type="in_range" message="must be positive " min="1"/> + </param> + </when> + <when value="none"/> + </conditional> + <param name="ntog" type="boolean" truevalue="--no-ntog" falsevalue="" checked="false" label="Disable the conversion of 'N' characters to 'G' characters in the reference sequences" help="Bowtie uses the automatic N to G conversion to align against all positions in the reference."/> + </when> + <when value="no-index"> + </when> + </conditional> + + <conditional name="run_rsem"> + <param name="select" type="select" label="calculate expression with rsem"> + <option value="No">Just build rsem reference for latter rsem profiling</option> + <option value="Yes" selected="true">profile expression with rsem</option> + </param> + <when value="Yes"> + <param name="sample" type="text" value="rsem_sample" label="Sample name" /> + <conditional name="reference"> + <param name="refSrc" type="select" label="RSEM Reference Source"> + <option value="history">From your history</option> + <option value="self">Prepare RSEM Reference with this tool</option> + </param> + <when value="history"> + <param name="rsem_ref" type="data" format="rsem_ref" label="RSEM reference" /> + </when> + <when value="self"> + </when> + </conditional> + <conditional name="input"> + <param name="format" type="select" label="RSEM Input file type"> + <option value="fastq">FASTQ</option> + <option value="fasta">FASTA</option> + <option value="sam">SAM/BAM</option> + </param> + <when value="fastq"> + <param name="fastq_select" size="15" type="select" label="FASTQ type" > + <option value="--phred33-quals" selected="true">phred33 qualities (default for sanger)</option> + <option value="--solexa-quals">solexa qualities</option> + <option value="--phred64-quals">phred64 qualities</option> + </param> + <conditional name="fastq"> + <param name="matepair" type="select" label="Library type"> + <option value="single">Single End Reads</option> + <option value="paired">Paired End 
Reads</option> + </param> + <when value="single"> + <param name="singlefastq" type="data" format="fastq,fastq.gz" label="FASTQ file" /> + </when> + <when value="paired"> + <param name="fastq1" type="data" format="fastq,fastq.gz" label="Read 1 fastq file" /> + <param name="fastq2" type="data" format="fastq,fastq.gz" label="Read 2 fastq file" /> + </when> + </conditional> + <expand macro="bowtie2_options"/> + </when> + <when value="fasta"> + <conditional name="fasta"> + <param name="matepair" type="select" label="Library Type"> + <option value="single">Single End Reads</option> + <option value="paired">Paired End Reads</option> + </param> + <when value="single"> + <param name="singlefasta" type="data" format="fasta" label="fasta file" /> + </when> + <when value="paired"> + <param name="fasta1" type="data" format="fasta" label="Read 1 fasta file" /> + <param name="fasta2" type="data" format="fasta" label="Read 2 fasta file" /> + </when> + </conditional> + <expand macro="bowtie2_options"/> + </when> + <when value="sam"> + <!-- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam --> + <param name="matepair" type="select" label="Library Type"> + <option value="single">Single End Reads</option> + <option value="paired">Paired End Reads</option> + </param> + <param name="rsem_sam" type="data" format="rsem_sam" label="RSEM formatted SAM file" /> + </when> + </conditional> + <expand macro="rsem_options"/> + <conditional name="rsem_outputs"> + <param name="result_bams" type="select" label="Create bam results files" + help="In addition to the transcript-coordinate-based BAM file output, also output a BAM file with the read alignments in genomic coordinates" > + <option value="none">No BAM results files</option> + <option value="default" selected="true">Transcript BAM results file</option> + <option value="both">Transcript and genome BAM results files</option> + </param> + <when value="none"/> + <when value="default"> + <expand macro="sampling_for_bam"/> + 
</when> + <when value="both"> + <expand macro="sampling_for_bam"/> + </when> + </conditional> + </when> + <when value="No"> + </when> + </conditional> + </inputs> + + <outputs> + <data format="rsem_ref" name="reference_file" label="RSEM ${job.reference_name} reference"> + <filter>job['select_job'] == "index"</filter> + </data> + <data format="tabular" name="gene_abundances" label="${run_rsem.sample}.gene_abundances" from_work_dir="rsem_output.genes.results"> + <filter>run_rsem['select'] == "Yes"</filter> + </data> + <data format="tabular" name="isoform_abundances" label="${run_rsem.sample}.isoform_abundances" from_work_dir="rsem_output.isoforms.results"> + <filter>run_rsem['select'] == "Yes"</filter> + </data> + <data format="bam" name="transcript_sorted_bam" label="${run_rsem.sample}.transcript.bam" from_work_dir="rsem_output.transcript.sorted.bam" > + <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] != "none"</filter> + </data> + <data format="bam" name="genome_sorted_bam" label="${run_rsem.sample}.genome.bam" from_work_dir="rsem_output.genome.sorted.bam"> + <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] == "both"</filter> + </data> + <data format="txt" name="log" label="${run_rsem.sample}.rsem_log"> + <filter>run_rsem['select'] == "Yes"</filter> + </data> + </outputs> + + <tests> + <test> + <param name="select_job" value="index"/> + <param name="ref_type" value="genomic"/> + <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> + <param name="gtf" value="ref.gtf" ftype="gtf"/> + <param name="reference_name" value="ref"/> + <param name="select" value="Yes"/> + <param name="sample" value="rsem_sample"/> + <param name="refSrc" value="self"/> + <param name="format" value="fastq"/> + <param name="matepair" value="single"/> + <param name="singlefastq" value="test.fq" ftype="fastqsanger"/> + <param name="result_bams" value="none"/> + <output name="reference_file"> + <assert_contents> + 
<has_text text="ref" /> + </assert_contents> + </output> + <output name="gene_abundances" value="gene_abundances.tab2"/> + <output name="isoform_abundances" value="isoform_abundances.tab2" /> + <output name="log"> + <assert_contents> + <has_text text="Expression Results are written" /> + </assert_contents> + </output> + </test> + <test> + <param name="select_job" value="index"/> + <param name="ref_type" value="genomic"/> + <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> + <param name="gtf" value="ref.gtf" ftype="gtf"/> + <param name="reference_name" value="ref"/> + <param name="select" value="Yes"/> + <param name="sample" value="rsem_sample"/> + <param name="refSrc" value="self"/> + <param name="format" value="fastq"/> + <param name="matepair" value="single"/> + <param name="singlefastq" value="test.fastq.gz" ftype="fastqsanger.gz"/> + <param name="result_bams" value="none"/> + <output name="reference_file"> + <assert_contents> + <has_text text="ref" /> + </assert_contents> + </output> + <output name="gene_abundances" value="gene_abundances.tab2"/> + <output name="isoform_abundances" value="isoform_abundances.tab2" /> + <output name="log"> + <assert_contents> + <has_text text="Expression Results are written" /> + </assert_contents> + </output> + </test> + <test> + <param name="select_job" value="index"/> + <param name="ref_type" value="genomic"/> + <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> + <param name="gtf" value="ref.gtf" ftype="gtf"/> + <param name="reference_name" value="ref"/> + <param name="select" value="No"/> + <output name="reference_file"> + <assert_contents> + <has_text text="ref" /> + </assert_contents> + </output> + </test> + <test> + <param name="select_job" value="index"/> + <param name="ref_type" value="genomic"/> + <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> + <param name="gtf" value="ref.gtf" ftype="gtf"/> + <param name="reference_name" value="ref"/> + <param name="select" 
value="No"/> + <output name="reference_file"> + <assert_contents> + <has_text text="ref" /> + </assert_contents> + </output> + </test> + </tests> + + <help> +.. class:: infomark + +RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/ + +NAME + rsem-prepare-reference + +SYNOPSIS + rsem-prepare-reference [options] reference_fasta_file(s) reference_name + +DESCRIPTION + The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie indices using default parameters. + This program is used in conjunction with the 'rsem-calculate-expression' program. + +INPUTS + A fasta file of transcripts + or + A genome sequence fasta file and a GTF gene annotation file. (When using UCSC data, include the related knownIsoforms.txt) + +--- + +NAME + rsem-calculate-expression - Estimate gene and isoform expression from + RNA-Seq data. + +SYNOPSIS + rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name + +ARGUMENTS + upstream_read_files(s) + Comma-separated list of files containing single-end reads or + upstream reads for paired-end data. By default, these files are + assumed to be in FASTQ format. If the --no-qualities option is + specified, then FASTA format is expected. + + downstream_read_file(s) + Comma-separated list of files containing downstream reads which are + paired with the upstream reads. By default, these files are assumed + to be in FASTQ format. If the --no-qualities option is specified, + then FASTA format is expected. + + input + SAM/BAM/CRAM formatted input file. If "-" is specified for the + filename, the input is instead assumed to come from standard input. + RSEM requires all alignments of the same read group together. 
For + paired-end reads, RSEM also requires the two mates of any alignment + be adjacent. In addition, RSEM does not allow the SEQ and QUAL + fields to be empty. See Description section for how to make input + file obey RSEM's requirements. + + reference_name + The name of the reference used. The user must have run + 'rsem-prepare-reference' with this reference_name before running + this program. + + sample_name + The name of the sample analyzed. All output files are prefixed by + this name (e.g., sample_name.genes.results) + +BASIC OPTIONS + --paired-end + Input reads are paired-end reads. (Default: off) + + --no-qualities + Input reads do not contain quality scores. (Default: off) + + --strandedness <none|forward|reverse> + This option defines the strandedness of the RNA-Seq reads. It + recognizes three values: 'none', 'forward', and 'reverse'. 'none' + refers to non-strand-specific protocols. 'forward' means all + (upstream) reads are derived from the forward strand. 'reverse' + means all (upstream) reads are derived from the reverse strand. If + 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2 + option will also be enabled to avoid aligning reads to the opposite + strand. For Illumina TruSeq Stranded protocols, please use + 'reverse'. (Default: 'none') + + -p/--num-threads <int> + Number of threads to use. Both Bowtie/Bowtie2, expression estimation + and 'samtools sort' will use this many threads. (Default: 1) + + --alignments + Input file contains alignments in SAM/BAM/CRAM format. The exact + file format will be determined automatically. (Default: off) + + --fai <file> + If the header section of input alignment file does not contain + reference sequence information, this option should be turned on. + <file> is a FAI format file containing each reference sequence's + name and length. Please refer to the SAM official website for the + details of FAI format. (Default: off) + + --bowtie2 + Use Bowtie 2 instead of Bowtie to align reads. 
Since currently RSEM + does not handle indel, local and discordant alignments, the Bowtie2 + parameters are set in a way to avoid those alignments. In + particular, we use options '--sensitive --dpad 0 --gbar 99999999 + --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter + of '--score-min', '-0.1', is the negative of maximum mismatch rate. + This rate can be set by option '--bowtie2-mismatch-rate'. If reads + are paired-end, we additionally use options '--no-mixed' and + '--no-discordant'. (Default: off) + + --star + Use STAR to align reads. Alignment parameters are from ENCODE3's + STAR-RSEM pipeline. To save computational time and memory resources, + STAR's Output BAM file is unsorted. It is stored in RSEM's temporary + directory with name as 'sample_name.bam'. Each STAR job will have + its own private copy of the genome in memory. (Default: off) + + --append-names + If gene_name/transcript_name is available, append it to the end of + gene_id/transcript_id (separated by '_') in files + 'sample_name.isoforms.results' and 'sample_name.genes.results'. + (Default: off) + + --seed <uint32> + Set the seed for the random number generators used in calculating + posterior mean estimates and credibility intervals. The seed must be + a non-negative 32 bit integer. (Default: off) + + --single-cell-prior + By default, RSEM uses Dirichlet(1) as the prior to calculate + posterior mean estimates and credibility intervals. However, much + less genes are expressed in single cell RNA-Seq data. Thus, if you + want to compute posterior mean estimates and/or credibility + intervals and you have single-cell RNA-Seq data, you are recommended + to turn on this option. Then RSEM will use Dirichlet(0.1) as the + prior which encourage the sparsity of the expression levels. + (Default: off) + + --calc-pme + Run RSEM's collapsed Gibbs sampler to calculate posterior mean + estimates. 
(Default: off) + + --calc-ci + Calculate 95% credibility intervals and posterior mean estimates. + The credibility level can be changed by setting + '--ci-credibility-level'. (Default: off) + + -q/--quiet + Suppress the output of logging information. (Default: off) + + -h/--help + Show help information. + + --version + Show version information. + +OUTPUT OPTIONS + --sort-bam-by-read-name + Sort BAM file aligned under transcript coordinate by read name. + Setting this option on will produce deterministic maximum likelihood + estimations from independent runs. Note that sorting will take long + time and lots of memory. (Default: off) + + --no-bam-output + Do not output any BAM file. (Default: off) + + --sampling-for-bam + When RSEM generates a BAM file, instead of outputting all alignments + a read has with their posterior probabilities, one alignment is + sampled according to the posterior probabilities. The sampling + procedure includes the alignment to the "noise" transcript, which + does not appear in the BAM file. Only the sampled alignment has a + weight of 1. All other alignments have weight 0. If the "noise" + transcript is sampled, all alignments appearing in the BAM file + should have weight 0. (Default: off) + + --output-genome-bam + Generate a BAM file, 'sample_name.genome.bam', with alignments + mapped to genomic coordinates and annotated with their posterior + probabilities. In addition, RSEM will call samtools (included in + RSEM package) to sort and index the bam file. + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' will be generated. (Default: + off) + + --sort-bam-by-coordinate + Sort RSEM generated transcript and genome BAM files by coordinates + and build associated indices. (Default: off) + + --sort-bam-memory-per-thread <string> + Set the maximum memory per thread that can be used by 'samtools + sort'. <string> represents the memory and accepts suffixes 'K/M/G'. 
+ RSEM will pass <string> to the '-m' option of 'samtools sort'. Note + that the default used here is different from the default used by + samtools. (Default: 1G) + +ALIGNER OPTIONS + --seed-length <int> + Seed length used by the read aligner. Providing the correct value is + important for RSEM. If RSEM runs Bowtie, it uses this value for + Bowtie's seed length parameter. Any read with its or at least one of + its mates' (for paired-end reads) length less than this value will + be ignored. If the references are not added poly(A) tails, the + minimum allowed value is 5, otherwise, the minimum allowed value is + 25. Note that this script will only check if the value >= 5 and give + a warning message if the value < 25 but >= 5. (Default: 25) + + --phred33-quals + Input quality scores are encoded as Phred+33. (Default: on) + + --phred64-quals + Input quality scores are encoded as Phred+64 (default for GA + Pipeline ver. >= 1.3). (Default: off) + + --solexa-quals + Input quality scores are solexa encoded (from GA Pipeline ver. < + 1.3). (Default: off) + + --bowtie-path <path> + The path to the Bowtie executables. (Default: the path to the Bowtie + executables is assumed to be in the user's PATH environment + variable) + + --bowtie-n <int> + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, + Default: 2) + + --bowtie-e <int> + (Bowtie parameter) max sum of mismatch quality scores across the + alignment. (Default: 99999999) + + --bowtie-m <int> + (Bowtie parameter) suppress all alignments for a read if > <int> + valid alignments exist. (Default: 200) + + --bowtie-chunkmbs <int> + (Bowtie parameter) memory allocated for best first alignment + calculation (Default: 0 - use Bowtie's default) + + --bowtie2-path <path> + (Bowtie 2 parameter) The path to the Bowtie 2 executables. 
(Default: + the path to the Bowtie 2 executables is assumed to be in the user's + PATH environment variable) + + --bowtie2-mismatch-rate <double> + (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: + 0.1) + + --bowtie2-k <int> + (Bowtie 2 parameter) Find up to <int> alignments per read. (Default: + 200) + + --bowtie2-sensitivity-level <string> + (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end + mode. This option controls how hard Bowtie 2 tries to find + alignments. <string> must be one of "very_fast", "fast", "sensitive" + and "very_sensitive". The four candidates correspond to Bowtie 2's + "--very-fast", "--fast", "--sensitive" and "--very-sensitive" + options. (Default: "sensitive" - use Bowtie 2's default) + + --star-path <path> + The path to STAR's executable. (Default: the path to STAR executable + is assumed to be in user's PATH environment variable) + + --star-gzipped-read-file + (STAR parameter) Input read file(s) is compressed by gzip. (Default: + off) + + --star-bzipped-read-file + (STAR parameter) Input read file(s) is compressed by bzip2. + (Default: off) + + --star-output-genome-bam + (STAR parameter) Save the BAM file from STAR alignment under genomic + coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted + by genomic coordinate. In this file, according to STAR's manual, + 'paired ends of an alignment are always adjacent, and multiple + alignments of a read are adjacent as well'. (Default: off) + +ADVANCED OPTIONS + --tag <string> + The name of the optional field used in the SAM input for identifying + a read with too many valid alignments. The field should have the + format <tagName>:i:<value>, where a <value> bigger than 0 indicates + a read with too many alignments. (Default: "") + + --fragment-length-min <int> + Minimum read/insert length allowed. This is also the value for the + Bowtie/Bowtie2 -I option. (Default: 1) + + --fragment-length-max <int> + Maximum read/insert length allowed. 
This is also the value for the + Bowtie/Bowtie 2 -X option. (Default: 1000) + + --fragment-length-mean <double> + (single-end data only) The mean of the fragment length distribution, + which is assumed to be a Gaussian. (Default: -1, which disables use + of the fragment length distribution) + + --fragment-length-sd <double> + (single-end data only) The standard deviation of the fragment length + distribution, which is assumed to be a Gaussian. (Default: 0, which + assumes that all fragments are of the same length, given by the + rounded value of --fragment-length-mean) + + --estimate-rspd + Set this option if you want to estimate the read start position + distribution (RSPD) from data. Otherwise, RSEM will use a uniform + RSPD. (Default: off) + + --num-rspd-bins <int> + Number of bins in the RSPD. Only relevant when '--estimate-rspd' is + specified. Use of the default setting is recommended. (Default: 20) + + --gibbs-burnin <int> + The number of burn-in rounds for RSEM's Gibbs sampler. Each round + passes over the entire data set once. If RSEM can use multiple + threads, multiple Gibbs samplers will start at the same time and all + samplers share the same burn-in number. (Default: 200) + + --gibbs-number-of-samples <int> + The total number of count vectors RSEM will collect from its Gibbs + samplers. (Default: 1000) + + --gibbs-sampling-gap <int> + The number of rounds between two successive count vectors RSEM + collects. If the count vector after round N is collected, the count + vector after round N + <int> will also be collected. (Default: 1) + + --ci-credibility-level <double> + The credibility level for credibility intervals. (Default: 0.95) + + --ci-memory <int> + Maximum size (in memory, MB) of the auxiliary buffer used for + computing credibility intervals (CI). (Default: 1024) + + --ci-number-of-samples-per-count-vector <int> + The number of read generating probability vectors sampled per + sampled count vector. 
The credibility intervals are calculated by + first sampling P(C | D) and then sampling P(Theta | C) for each + sampled count vector. This option controls how many Theta vectors + are sampled per sampled count vector. (Default: 50) + + --keep-intermediate-files + Keep temporary files generated by RSEM. RSEM creates a temporary + directory, 'sample_name.temp', into which it puts all intermediate + output files. If this directory already exists, RSEM overwrites all + files generated by previous RSEM runs inside of it. By default, + after RSEM finishes, the temporary directory is deleted. Set this + option to prevent the deletion of this directory and the + intermediate files inside of it. (Default: off) + + --temporary-folder <string> + Set where to put the temporary files generated by RSEM. If the + folder specified does not exist, RSEM will try to create it. + (Default: sample_name.temp) + + --time + Output time consumed by each step of RSEM to 'sample_name.time'. + (Default: off) + +PRIOR-ENHANCED RSEM OPTIONS + --run-pRSEM + Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. + isoform's initial pseudo-count for RSEM's Gibbs sampling, will be + learned from input RNA-seq data and an external data set. When pRSEM + needs and only needs ChIP-seq peak information to partition isoforms + (e.g. in pRSEM's default partition model), either ChIP-seq peak file + (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for + target and input and the path for Bowtie executables are required + (with the '--chipseq-target-read-files <string>', + '--chipseq-control-read-files <string>', and '--bowtie-path <path>' + options), otherwise, ChIP-seq FASTQ files for target and control and + the path to Bowtie executables are required. (Default: off) + + --chipseq-peak-file <string> + Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. + BED6+4, format. This file is used when running prior-enhanced RSEM + in the default two-partition model. 
It partitions isoforms by + whether they have ChIP-seq overlapping with their transcription + start site region or not. Each partition will have its own prior + parameter learned from a training set. This file can be either + gzipped or ungzipped. (Default: "") + + --chipseq-target-read-files <string> + Comma-separated full path of FASTQ read file(s) for ChIP-seq target. + This option is used when running prior-enhanced RSEM. It provides + information to calculate ChIP-seq peaks and signals. The file(s) can + be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The + options '--bowtie-path <path>' and '--chipseq-control-read-files + <string>' must be defined when this option is specified. (Default: + "") + + --chipseq-control-read-files <string> + Comma-separated full path of FASTQ read file(s) for ChIP-seq control. + This option is used when running prior-enhanced RSEM. It provides + information to call ChIP-seq peaks. The file(s) can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options + '--bowtie-path <path>' and '--chipseq-target-read-files <string>' + must be defined when this option is specified. (Default: "") + + --chipseq-read-files-multi-targets <string> + Comma-separated full path of FASTQ read files for multiple ChIP-seq + targets. This option is used when running prior-enhanced RSEM, where + prior is learned from multiple complementary data sets. It provides + information to calculate ChIP-seq signals. All files can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this + option is specified, the option '--bowtie-path <path>' must be + defined and the option '--partition-model <string>' will be set to + 'cmb_lgt' automatically. (Default: "") + + --chipseq-bed-files-multi-targets <string> + Comma-separated full path of BED files for multiple ChIP-seq + targets. This option is used when running prior-enhanced RSEM, where + prior is learned from multiple complementary data sets.
It provides + information of ChIP-seq signals and must have at least the first six + BED columns. All files can be either ungzipped or gzipped with a + suffix '.gz' or '.gzip'. When this option is specified, the option + '--partition-model <string>' will be set to 'cmb_lgt' automatically. + (Default: "") + + --cap-stacked-chipseq-reads + Keep a maximum number of ChIP-seq reads that aligned to the same + genomic interval. This option is used when running prior-enhanced + RSEM, where prior is learned from multiple complementary data sets. + This option is only in use when either + '--chipseq-read-files-multi-targets <string>' or + '--chipseq-bed-files-multi-targets <string>' is specified. (Default: + off) + + --n-max-stacked-chipseq-reads <int> + The maximum number of stacked ChIP-seq reads to keep. This option is + used when running prior-enhanced RSEM, where prior is learned from + multiple complementary data sets. This option is only in use when + the option '--cap-stacked-chipseq-reads' is set. (Default: 5) + + --partition-model <string> + A keyword to specify the partition model used by prior-enhanced + RSEM. It must be one of the following keywords: + + - pk + Partitioned by whether an isoform has a ChIP-seq peak overlapping + with its transcription start site (TSS) region. The TSS region is + defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer this + type of peak as 'TSS peak' when explaining other keywords. + + - pk_lgtnopk + First partitioned by TSS peak. Then, for isoforms in the 'no TSS + peak' set, a logistic model is employed to further classify them + into two partitions. + + - lm3, lm4, lm5, or lm6 + Based on their ChIP-seq signals, isoforms are classified into 3, + 4, 5, or 6 partitions by a linear regression model. + + - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk + First partitioned by TSS peak. 
Then, for isoforms in the 'with TSS + peak' set, a linear regression model is employed to further + classify them into 2, 3, 4, or 5 partitions. + + - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk + First partitioned by TSS peak. Then, for isoforms in the 'no TSS + peak' set, a linear regression model is employed to further + classify them into 2, 3, 4, or 5 partitions. + + - cmb_lgt + Using a logistic regression to combine TSS signals from multiple + complementary data sets and partition training set isoforms into + 'expressed' and 'not expressed'. This partition model is only in + use when either '--chipseq-read-files-multi-targets <string>' or + '--chipseq-bed-files-multi-targets <string>' is specified. + + Parameters for all the above models are learned from a training set. + For detailed explanations, please see prior-enhanced RSEM's paper. + (Default: 'pk') + +DEPRECATED OPTIONS + The options in this section are deprecated. They are here only for + compatibility reasons and may be removed in future releases. + + --sam + Inputs are alignments in SAM format. (Default: off) + + --bam + Inputs are alignments in BAM format. (Default: off) + + --strand-specific + Equivalent to '--strandedness forward'. (Default: off) + + --forward-prob <double> + Probability of generating a read from the forward strand of a + transcript. Set to 1 for a strand-specific protocol where all + (upstream) reads are derived from the forward strand, 0 for a + strand-specific protocol where all (upstream) reads are derived from + the reverse strand, or 0.5 for a non-strand-specific protocol. + (Default: off) + +DESCRIPTION + In its default mode, this program aligns input reads against a reference + transcriptome with Bowtie and calculates expression values using the + alignments. RSEM assumes the data are single-end reads with quality + scores, unless the '--paired-end' or '--no-qualities' options are + specified.
Alternatively, users can use STAR to align reads using the + '--star' option. RSEM has provided options in 'rsem-prepare-reference' + to prepare STAR's genome indices. Users may use an alternative aligner + by specifying '--alignments', and providing an alignment file in + SAM/BAM/CRAM format. However, users should make sure that they align + against the indices generated by 'rsem-prepare-reference' and the + alignment file satisfies the requirements mentioned in ARGUMENTS + section. + + One simple way to make the alignment file satisfying RSEM's requirements + is to use the 'convert-sam-for-rsem' script. This script accepts + SAM/BAM/CRAM files as input and outputs a BAM file. For example, type + the following command to convert a SAM file, 'input.sam', to a + ready-for-use BAM file, 'input_for_rsem.bam': + + convert-sam-for-rsem input.sam input_for_rsem + + For details, please refer to 'convert-sam-for-rsem's documentation page. + +NOTES + 1. Users must run 'rsem-prepare-reference' with the appropriate + reference before using this program. + + 2. For single-end data, it is strongly recommended that the user provide + the fragment length distribution parameters (--fragment-length-mean and + --fragment-length-sd). For paired-end data, RSEM will automatically + learn a fragment length distribution from the data. + + 3. Some aligner parameters have default values different from their + original settings. + + 4. With the '--calc-pme' option, posterior mean estimates will be + calculated in addition to maximum likelihood estimates. + + 5. With the '--calc-ci' option, 95% credibility intervals and posterior + mean estimates will be calculated in addition to maximum likelihood + estimates. + + 6. The temporary directory and all intermediate files will be removed + when RSEM finishes unless '--keep-intermediate-files' is specified. 
+ + With the '--run-pRSEM' option and associated options (see section + 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM + will be running. Prior parameters will be learned from supplied external + data set(s) and assigned as initial pseudo-counts for isoforms in the + corresponding partition for Gibbs sampling. + +OUTPUT + sample_name.isoforms.results + File containing isoform level expression estimates. The first line + contains column names separated by the tab character. The format of + each line in the rest of this file is: + + transcript_id gene_id length effective_length expected_count TPM + FPKM IsoPct [posterior_mean_count + posterior_standard_deviation_of_count pme_TPM pme_FPKM + IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'transcript_id' is the transcript name of this transcript. 'gene_id' + is the gene name of the gene which this transcript belongs to + (denote this gene as its parent gene). If no gene information is + provided, 'gene_id' and 'transcript_id' are the same. + + 'length' is this transcript's sequence length (poly(A) tail is not + counted). 'effective_length' counts only the positions that can + generate a valid fragment. If no poly(A) tail is added, + 'effective_length' is equal to transcript length - mean fragment + length + 1. If one transcript's effective length is less than 1, + this transcript's both effective length and abundance estimates are + set to 0. + + 'expected_count' is the sum of the posterior probability of each + read comes from this transcript over all reads. 
Because 1) each read + aligning to this transcript has a probability of being generated + from background noise; 2) RSEM may filter some alignable low quality + reads, the sum of expected counts for all transcripts are generally + less than the total number of reads aligned. + + 'TPM' stands for Transcripts Per Million. It is a relative measure + of transcript abundance. The sum of all transcripts' TPM is 1 + million. 'FPKM' stands for Fragments Per Kilobase of transcript per + Million mapped reads. It is another relative measure of transcript + abundance. If we define l_bar to be the mean transcript length in a + sample, which can be calculated as + + l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through + every transcript), + + the following equation holds: + + FPKM_i = 10^3 / l_bar * TPM_i. + + We can see that the sum of FPKM is not a constant across samples. + + 'IsoPct' stands for isoform percentage. It is the percentage of this + transcript's abundance over its parent gene's abundance. If its + parent gene has only one isoform or the gene information is not + provided, this field will be set to 100. + + 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean + estimates calculated by RSEM's Gibbs sampler. + 'posterior_standard_deviation_of_count' is the posterior standard + deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage + calculated from 'pme_TPM' values. + + 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' + and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% + credibility intervals for TPM and FPKM values. The bounds are + inclusive (i.e. [l, u]). + + 'TPM_coefficient_of_quartile_variation' and + 'FPKM_coefficient_of_quartile_variation' are coefficients of + quartile variation (CQV) for TPM and FPKM values. CQV is a robust + way of measuring the ratio between the standard deviation and the + mean.
It is defined as + + CQV := (Q3 - Q1) / (Q3 + Q1), + + where Q1 and Q3 are the first and third quartiles. + + sample_name.genes.results + File containing gene level expression estimates. The first line + contains column names separated by the tab character. The format of + each line in the rest of this file is: + + gene_id transcript_id(s) length effective_length expected_count TPM + FPKM [posterior_mean_count posterior_standard_deviation_of_count + pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'transcript_id(s)' is a comma-separated list of transcript_ids + belonging to this gene. If no gene information is provided, + 'gene_id' and 'transcript_id(s)' are identical (the + 'transcript_id'). + + A gene's 'length' and 'effective_length' are defined as the weighted + average of its transcripts' lengths and effective lengths (weighted + by 'IsoPct'). A gene's abundance estimates are just the sum of its + transcripts' abundance estimates. + + sample_name.alleles.results + Only generated when the RSEM references are built with + allele-specific transcripts. + + This file contains allele level expression estimates for + allele-specific expression calculation. The first line contains + column names separated by the tab character. 
The format of each line + in the rest of this file is: + + allele_id transcript_id gene_id length effective_length + expected_count TPM FPKM AlleleIsoPct AlleleGenePct + [posterior_mean_count posterior_standard_deviation_of_count pme_TPM + pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM + TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'allele_id' is the allele-specific name of this allele-specific + transcript. + + 'AlleleIsoPct' stands for allele-specific percentage on isoform + level. It is the percentage of this allele-specific transcript's + abundance over its parent transcript's abundance. If its parent + transcript has only one allele variant form, this field will be set + to 100. + + 'AlleleGenePct' stands for allele-specific percentage on gene level. + It is the percentage of this allele-specific transcript's abundance + over its parent gene's abundance. + + 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have + similar meanings. They are calculated based on posterior mean + estimates. + + Please note that if this file is present, the fields 'length' and + 'effective_length' in 'sample_name.isoforms.results' should be + interpreted similarly as the corresponding definitions in + 'sample_name.genes.results'. + + sample_name.transcript.bam + Only generated when --no-bam-output is not specified. + + 'sample_name.transcript.bam' is a BAM-formatted file of read + alignments in transcript coordinates. The MAPQ field of each + alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), + where w is the posterior probability of that alignment being the + true mapping of a read. 
In addition, RSEM pads a new tag ZW:f:value, + where value is a single precision floating number representing the + posterior probability. Because this file contains all alignment + lines produced by bowtie or user-specified aligners, it can also be + used as a replacement of the aligner generated BAM/SAM file. + + sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai + Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified. + + 'sample_name.transcript.sorted.bam' and + 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.genome.bam + Only generated when --no-bam-output is not specified and + --output-genome-bam is specified. + + 'sample_name.genome.bam' is a BAM-formatted file of read alignments + in genomic coordinates. Alignments of reads that have identical + genomic coordinates (i.e., alignments to different isoforms that + share the same genomic region) are collapsed into one alignment. The + MAPQ field of each alignment is set to min(100, floor(-10 * + log10(1.0 - w) + 0.5)), where w is the posterior probability of that + alignment being the true mapping of a read. In addition, RSEM pads a + new tag ZW:f:value, where value is a single precision floating + number representing the posterior probability. If an alignment is + spliced, a XS:A:value tag is also added, where value is either '+' + or '-' indicating the strand of the transcript it aligns to. + + sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai + Only generated when --no-bam-output is not specified, and + --sort-bam-by-coordinate and --output-genome-bam are specified. + + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.time + Only generated when --time is specified. 
+ + It contains time (in seconds) consumed by aligning reads, estimating + expression levels and calculating credibility intervals. + + sample_name.stat + This is a folder instead of a file. All model related statistics are + stored in this folder. 'rsem-plot-model' can generate plots + using this folder. + + 'sample_name.stat/sample_name.cnt' contains alignment statistics. + The format and meanings of each field are described in + 'cnt_file_description.txt' under RSEM directory. + + 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters + learned from the data. The format and meanings of each field of this + file are described in 'model_file_description.txt' under RSEM + directory. + + The following four output files will be generated only by + prior-enhanced RSEM + + - 'sample_name.stat/sample_name_prsem.all_tr_features' + It stores isoform features for deriving and assigning pRSEM prior. + The first line is a header and the rest is one isoform per line. + The description for each column is: + + * trid: transcript ID from input annotation + + * geneid: gene ID from input annotation + + * chrom: isoform's chromosome name + + * strand: isoform's strand name + + * start: isoform's end with the lowest genomic loci + + * end: isoform's end with the highest genomic loci + + * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where + TSS is isoform's transcription start site, i.e. 5'-end + + * body_mpp: average mappability of (TSS+500bp, TES-500bp), where + TES is isoform's transcription end site, i.e.
3'-end + + * tes_mpp: average mappability of [TES-500bp, TES+500bp] + + * pme_count: isoform's fragment or read count from RSEM's + posterior mean estimates + + * tss: isoform's TSS loci + + * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region + overlaps with a RNA Pol II peak; 0 otherwise + + * is_training: equal to 1 if isoform is in the training set where + Pol II prior is learned; 0 otherwise + + - 'sample_name.stat/sample_name_prsem.all_tr_prior' + It stores prior parameters for every isoform. This file does not + have a header. Each line contains a prior parameter and an + isoform's transcript ID delimited by " # ". + + - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results' + RSEM's posterior mean estimates on the isoform level with an + initial pseudo-count of one for every isoform. It is in the same + format as the 'sample_name.isoforms.results'. + + - 'sample_name.stat/sample_name_uniform_prior_1.genes.results' + RSEM's posterior mean estimates on the gene level with an initial + pseudo-count of one for every isoform. It is in the same format as + the 'sample_name.genes.results'. + + When learning prior from multiple external data sets in + prior-enhanced RSEM, two additional output files will be generated. + + - 'sample_name.stat/sample_name.pval_LL' + It stores a p-value and a log-likelihood. The p-value indicates + whether the combination of multiple complementary data sets is + informative for RNA-seq quantification. The log-likelihood shows + how well pRSEM's Dirichlet-multinomial model fits the read counts + of partitioned training set isoforms. + + - 'sample_name.stat/sample_name.lgt_mdl.RData' + It stores an R object named 'glmmdl', which is a logistic + regression model on the training set isoforms and multiple + external data sets. 
+ + In addition, extra columns will be added to + 'sample_name.stat/all_tr_features' + + * is_expr: equal to 1 if isoform has an abundance >= 1 TPM and a + non-zero read count from RSEM's posterior mean estimates; 0 + otherwise + + * "$external_data_set_basename": log10 of external data's signal at + [TSS-500, TSS+500]. Signal is the number of reads aligned within + that interval and normalized to RPKM by read depth and interval + length. It will be set to -4 if no read aligned to that interval. + + There are multiple columns like this one, where each represents an + external data set. + + * prd_expr_prob: predicted probability from logistic regression + model on whether this isoform is expressed or not. A probability + higher than 0.5 is considered as expressed + + * partition: group index, to which this isoform is partitioned + + * prior: prior parameter for this isoform + +EXAMPLES + Assume the path to the bowtie executables is in the user's PATH + environment variable. Reference files are under '/ref' with name + 'mouse_125'. + + 1) '/data/mmliver.fq', single-end reads with quality scores. Quality + scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 + threads and generate a genome BAM file. In addition, we want to append + gene/transcript names to the result files: + + rsem-calculate-expression --phred64-quals \ + -p 8 \ + --append-names \ + --output-genome-bam \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end + reads with quality scores. Suppose the library is prepared using TruSeq + Stranded Kit, which means the first mate should map to the reverse + strand. Quality scores are in SANGER format.
We want to use 8 threads + and do not generate a genome BAM file: + + rsem-calculate-expression -p 8 \ + --paired-end \ + --strandedness reverse \ + /data/mmliver_1.fq \ + /data/mmliver_2.fq \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 3) '/data/mmliver.fa', single-end reads without quality scores. We want + to use 8 threads: + + rsem-calculate-expression -p 8 \ + --no-qualities \ + /data/mmliver.fa \ + /ref/mouse_125 \ + mmliver_single_without_quals + + 4) Data are the same as 1). This time we assume the bowtie executables + are under '/sw/bowtie'. We want to take a fragment length distribution + into consideration. We set the fragment length mean to 150 and the + standard deviation to 35. In addition to a BAM file, we also want to + generate credibility intervals. We allow RSEM to use 1GB of memory for + CI calculation: + + rsem-calculate-expression --bowtie-path /sw/bowtie \ + --phred64-quals \ + --fragment-length-mean 150.0 \ + --fragment-length-sd 35.0 \ + -p 8 \ + --output-genome-bam \ + --calc-ci \ + --ci-memory 1024 \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for + paired-end reads with quality scores. We want to use 8 threads: + + rsem-calculate-expression --paired-end \ + --alignments \ + -p 8 \ + /data/mmliver_paired_end_quals.bam \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads + with quality scores and read files are compressed by gzip. We want to + use STAR to align reads and assume STAR executable is '/sw/STAR'.
+ Suppose we want to use 8 threads and do not generate a genome BAM file: + + rsem-calculate-expression --paired-end \ + --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 7) In the above example, suppose we want to run prior-enhanced RSEM + instead. Assuming we want to learn priors from a ChIP-seq peak file + '/data/mmliver.narrowPeak.gz': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --chipseq-peak-file /data/mmliver.narrowPeak.gz \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 8) Similar to the example in 7), suppose we want to use the partition + model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and + then partitioning 'no TSS peak' isoforms into two bins by a linear + regression model), and we want to partition isoforms by RNA Pol II's + ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and + '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files + '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables + are under '/sw/bowtie/': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \ + --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \ + --partition-model pk_lm2nopk \ + --bowtie-path /sw/bowtie \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 9) Similar to the example in 8), suppose we want to derive prior from + four histone modification ChIP-seq read data sets: + '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', + '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'.
Also, assuming + Bowtie's executables are under '/sw/bowtie/': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --partition-model cmb_lgt \ + --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \ + --bowtie-path /sw/bowtie \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + </help> + <citations> + <citation type="doi">10.1186/1471-2105-12-323</citation> + </citations> + +</tool>