comparison rsem.xml @ 0:e5e836936d60 draft

planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit d84a0359354698a4b29df12ab581c2618bffcf80
author artbio
date Sat, 31 Mar 2018 21:30:07 -0400
parents
children 49795544dac7
comparison
equal deleted inserted replaced
-1:000000000000 0:e5e836936d60
1 <tool id="rsembowtie" name="RSEM-Bowtie" version="0.4.0">
2 <description></description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <requirement type="package" version="1.3.0">rsem</requirement>
8 <requirement type="package" version="1.2.2=py27pl5.22.0_0">bowtie</requirement>
9 </requirements>
10 <stdio>
11 <exit_code range="1:" level="warning" description="Tool exception" />
12 </stdio>
13 <command detect_errors="exit_code"><![CDATA[
## Reference-building step: this whole section is only emitted when the user chose to build an RSEM reference.
14 #if $job.select_job == "index":
## Write the reference name into the primary output dataset, then create that dataset's extra-files directory.
15 echo ${job.reference_name} " " | tee $reference_file &&
16 mkdir $reference_file.files_path &&
17 rsem-prepare-reference
## Poly(A) handling, driven by the "polya" conditional of the tool form.
18 #if $job.polya.polya_use == 'add':
19 #if $job.polya.polya_length:
20 --polyA-length $job.polya.polya_length
21 #end if
22 #elif $job.polya.polya_use == 'subset':
## NOTE(review): no_polya_subset is declared optional in the inputs but is passed unconditionally here - confirm behaviour when no dataset is selected.
23 --no-polyA-subset $job.polya.no_polya_subset
24 #if $job.polya.polya_length:
25 --polyA-length $job.polya.polya_length
26 #end if
27 #elif $job.polya.polya_use == 'none':
28 --no-polyA
29 #end if
## Boolean parameter: renders as --no-ntog when checked, as an empty string otherwise.
30 $job.ntog
31 #if $job.transcript_to_gene_map:
32 --transcript-to-gene-map $job.transcript_to_gene_map
33 #end if
34 --bowtie
## Sequence input: a transcript FASTA alone, or a genome FASTA preceded by its GTF annotation.
35 #if $job.self_reference.ref_type == 'transcripts':
36 $job.self_reference.reference_fasta_file
37 #else:
38 --gtf $job.self_reference.gtf
39 $job.self_reference.reference_fasta_file
40 #end if
## Reference prefix inside the extra-files directory; stdout of the build is redirected to a .log file beside it.
41 ${reference_file.files_path}/${job.reference_name}
42 > ${reference_file.files_path}/${job.reference_name}.log
43 #end if
44
## When both a reference build and an expression run are requested, join the two shell commands with && so the second only starts if the first succeeded.
45 #if $job.select_job == "index" and $run_rsem.select == "Yes":
46 &&
47 #end if
48
49 #if $run_rsem.select == "Yes":
50 ## FASTQ input only: stage reads under fixed names in the working directory (gunzip fastq.gz / fastqsanger.gz, symlink uncompressed files). The format test comes first so the fastq conditional is never dereferenced for FASTA or SAM/BAM input.
51 #if $run_rsem.input.format=="fastq" and $run_rsem.input.fastq.matepair=="single":
52 #if $run_rsem.input.fastq.singlefastq.is_of_type('fastq.gz') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger.gz'):
53 gunzip < '$run_rsem.input.fastq.singlefastq' > uncomp_single.fastq &&
54 #elif $run_rsem.input.fastq.singlefastq.is_of_type('fastq') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger'):
55 ln -f -s '$run_rsem.input.fastq.singlefastq' 'uncomp_single.fastq' &&
56 #end if
57 #elif $run_rsem.input.format=="fastq" and $run_rsem.input.fastq.matepair=="paired":
58 #if $run_rsem.input.fastq.fastq1.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger.gz'):
59 gunzip < '$run_rsem.input.fastq.fastq1' > uncomp_pair1.fastq &&
60 gunzip < '$run_rsem.input.fastq.fastq2' > uncomp_pair2.fastq &&
61 #elif $run_rsem.input.fastq.fastq1.is_of_type('fastq') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger'):
62 ln -f -s '$run_rsem.input.fastq.fastq1' 'uncomp_pair1.fastq' &&
63 ln -f -s '$run_rsem.input.fastq.fastq2' 'uncomp_pair2.fastq' &&
64 #end if
65 #end if
66 rsem-calculate-expression
67 ## --tag string
68 #if $run_rsem.seedlength:
69 --seed-length $run_rsem.seedlength
70 #end if
71 --forward-prob $run_rsem.forward_prob
72 #if $run_rsem.rsem_options.fullparams == 'fullset':
73 ## Fragment info
74 #if $run_rsem.rsem_options.fragment_length_mean:
75 --fragment-length-mean $run_rsem.rsem_options.fragment_length_mean
76 #end if
77 #if $run_rsem.rsem_options.fragment_length_min:
78 --fragment-length-min $run_rsem.rsem_options.fragment_length_min
79 #end if
80 #if $run_rsem.rsem_options.fragment_length_sd:
81 --fragment-length-sd $run_rsem.rsem_options.fragment_length_sd
82 #end if
83 #if $run_rsem.rsem_options.fragment_length_max:
84 --fragment-length-max $run_rsem.rsem_options.fragment_length_max
85 #end if
86 ## RSPD
87 #if $run_rsem.rsem_options.rspd.estimate == 'yes':
88 --estimate-rspd
89 #if $run_rsem.rsem_options.rspd.num_rspd_bins:
90 --num-rspd-bins $run_rsem.rsem_options.rspd.num_rspd_bins
91 #end if
92 #end if
93 ## Calculate 95% credibility intervals and posterior mean estimates.
94 #if $run_rsem.rsem_options.useci.ci == 'yes':
95 --calc-ci
96 #if $run_rsem.rsem_options.useci.cimem:
97 --ci-memory $run_rsem.rsem_options.useci.cimem
98 #end if
99 #end if
100 #end if
101 --num-threads \${GALAXY_SLOTS:-4}
102 #if $run_rsem.input.format == 'fasta' and $run_rsem.input.bowtie_options.fullparams == 'fullset':
103 ## Bowtie params
104 #if $run_rsem.input.bowtie_options.bowtie_e:
105 --bowtie-e $run_rsem.input.bowtie_options.bowtie_e
106 #end if
107 #if $run_rsem.input.bowtie_options.bowtie_m:
108 --bowtie-m $run_rsem.input.bowtie_options.bowtie_m
109 #end if
110 --bowtie-n $run_rsem.input.bowtie_options.bowtie_n
111 #end if
112 #if $run_rsem.input.format == 'fastq' and $run_rsem.input.bowtie_options.fullparams == 'fullset':
113 ## Bowtie params
114 #if $run_rsem.input.bowtie_options.bowtie_e:
115 --bowtie-e $run_rsem.input.bowtie_options.bowtie_e
116 #end if
117 #if $run_rsem.input.bowtie_options.bowtie_m:
118 --bowtie-m $run_rsem.input.bowtie_options.bowtie_m
119 #end if
120 --bowtie-n $run_rsem.input.bowtie_options.bowtie_n
121 #end if
122 ## Outputs: BAM flags. The sampling option is passed whenever a BAM is produced, i.e. for both 'default' and 'both'.
123 #if $run_rsem.rsem_outputs.result_bams == 'none':
124 --no-bam-output
125 #elif $run_rsem.rsem_outputs.result_bams == 'default':
126 --sort-bam-by-coordinate $run_rsem.rsem_outputs.sampling_for_bam
127 #else
128 --sort-bam-by-coordinate
129 --output-genome-bam
130 $run_rsem.rsem_outputs.sampling_for_bam
131 #end if
132 ## Input data
133 #if $run_rsem.input.format=="fastq"
134 $run_rsem.input.fastq_select
135 #if $run_rsem.input.fastq.matepair=="single":
136 uncomp_single.fastq
137 #elif $run_rsem.input.fastq.matepair=="paired":
138 --paired-end
139 uncomp_pair1.fastq
140 uncomp_pair2.fastq
141 #end if
142 #elif $run_rsem.input.format=="fasta"
143 --no-qualities
144 #if $run_rsem.input.fasta.matepair=="single":
145 $run_rsem.input.fasta.singlefasta
146 #elif $run_rsem.input.fasta.matepair=="paired":
147 --paired-end
148 $run_rsem.input.fasta.fasta1
149 $run_rsem.input.fasta.fasta2
150 #end if
151 #elif $run_rsem.input.format=="sam"
152 #if $run_rsem.input.matepair=="paired":
153 --paired-end
154 #end if
155 #if $run_rsem.input.rsem_sam._extension == 'sam':
156 --sam
157 #elif $run_rsem.input.rsem_sam._extension == 'bam':
158 --bam
159 #end if
160 $run_rsem.input.rsem_sam
161 #end if
162 ## RSEM reference
163 #if $run_rsem.reference.refSrc == 'history':
164 ${run_rsem.reference.rsem_ref.extra_files_path}/${run_rsem.reference.rsem_ref.metadata.reference_name}
165 #elif $run_rsem.reference.refSrc == 'self':
166 ${reference_file.files_path}/${job.reference_name}
167 #end if
168 ## sample_name: use a hard coded name so we can pull out galaxy outputs
169 rsem_output
170 ## direct output into logfile
171 > $log
172 #end if
173 ]]></command>
174
175 <inputs>
176 <conditional name="job">
177 <param name="select_job" type="select" label="rsem reference">
178 <option value="index">Build rsem reference</option>
179 <option value="no-index" selected="true">rsem reference available from history</option>
180 </param>
181 <when value="index">
182 <conditional name="self_reference">
183 <param name="ref_type" type="select" label="Reference transcript source">
184 <option value="transcripts">transcript fasta</option>
185 <option value="genomic">reference genome and gtf</option>
186 </param>
187 <when value="transcripts">
188 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file"
189 help="The files should contain the sequences of transcripts."/>
190 </when>
191 <when value="genomic">
192 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file"
193 help="The file should contain the sequence of an entire genome."/>
194 <param name="gtf" type="data" format="gtf" label="gtf"
195 help="extract transcript reference sequences using the gene annotations specified in this GTF" />
196 </when>
197 </conditional>
198 <param name="transcript_to_gene_map" type="data" format="tabular" optional="true" label="Map of gene ids to transcript (isoform) ids" >
199 <help>
200 Each line should be of the form: gene_id transcript_id ( with the two fields separated by a tab character )
201 The map can be obtained from the UCSC table browser
202 group: Genes and Gene Prediction Tracks
203 table: knownIsoforms
204 Without a map:
205 If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file.
206 Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
207 </help>
208 </param>
209 <param name="reference_name" type="text" value="rsem_ref_name" label="reference name">
210 <help>A one word name for this RSEM reference containing only letters, digits, and underscore characters</help>
211 <validator type="regex" message="Use only letters, digits, and underscore characters">^\w+$</validator>
212 </param>
213 <conditional name="polya">
214 <param name="polya_use" type="select" label="PolyA ">
215 <option value="add" selected="true">Add poly(A) tails to all transcripts</option>
216 <option value="subset">Exclude poly(A) tails from selected transcripts</option>
217 <option value="none">Do not add poly(A) tails to any transcripts</option>
218 </param>
219 <when value="add">
220 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)">
221 <validator type="in_range" message="must be positive " min="1"/>
222 </param>
223 </when>
224 <when value="subset">
<!-- Optional tabular dataset consumed by the --no-polyA-subset option of the reference-building command. -->
225 <param name="no_polya_subset" type="data" format="tabular" optional="true" label="List of transcript IDs (one per line) that should not have polyA tails added."/>
226 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)">
227 <validator type="in_range" message="must be positive " min="1"/>
228 </param>
229 </when>
230 <when value="none"/>
231 </conditional>
232 <param name="ntog" type="boolean" truevalue="--no-ntog" falsevalue="" checked="false" label="Disable the conversion of 'N' characters to 'G' characters in the reference sequences" help="Bowtie uses the automatic N to G conversion to align against all positions in the reference."/>
233 </when>
234 <when value="no-index">
235 </when>
236 </conditional>
237
238 <conditional name="run_rsem">
<!-- "Yes" makes the command run rsem-calculate-expression; "No" limits the job to the reference build. -->
239 <param name="select" type="select" label="calculate expression with rsem">
240 <option value="No">Just build rsem reference for later rsem profiling</option>
241 <option value="Yes" selected="true">profile expression with rsem</option>
242 </param>
243 <when value="Yes">
244 <param name="sample" type="text" value="rsem_sample" label="Sample name" />
245 <conditional name="reference">
246 <param name="refSrc" type="select" label="RSEM Reference Source">
247 <option value="history">From your history</option>
248 <option value="self">Prepare RSEM Reference with this tool</option>
249 </param>
250 <when value="history">
251 <param name="rsem_ref" type="data" format="rsem_ref" label="RSEM reference" />
252 </when>
253 <when value="self">
254 </when>
255 </conditional>
256 <conditional name="input">
257 <param name="format" type="select" label="RSEM Input file type">
258 <option value="fastq">FASTQ</option>
259 <option value="fasta">FASTA</option>
260 <option value="sam">SAM/BAM</option>
261 </param>
262 <when value="fastq">
263 <param name="fastq_select" size="15" type="select" label="FASTQ type" >
264 <option value="--phred33-quals" selected="true">phred33 qualities (default for sanger)</option>
265 <option value="--solexa-quals">solexa qualities</option>
266 <option value="--phred64-quals">phred64 qualities</option>
267 </param>
268 <conditional name="fastq">
269 <param name="matepair" type="select" label="Library type">
270 <option value="single">Single End Reads</option>
271 <option value="paired">Paired End Reads</option>
272 </param>
273 <when value="single">
274 <param name="singlefastq" type="data" format="fastq,fastq.gz" label="FASTQ file" />
275 </when>
276 <when value="paired">
277 <param name="fastq1" type="data" format="fastq,fastq.gz" label="Read 1 fastq file" />
278 <param name="fastq2" type="data" format="fastq,fastq.gz" label="Read 2 fastq file" />
279 </when>
280 </conditional>
281 <expand macro="bowtie_options"/>
282 </when>
283 <when value="fasta">
284 <conditional name="fasta">
285 <param name="matepair" type="select" label="Library Type">
286 <option value="single">Single End Reads</option>
287 <option value="paired">Paired End Reads</option>
288 </param>
289 <when value="single">
290 <param name="singlefasta" type="data" format="fasta" label="fasta file" />
291 </when>
292 <when value="paired">
293 <param name="fasta1" type="data" format="fasta" label="Read 1 fasta file" />
294 <param name="fasta2" type="data" format="fasta" label="Read 2 fasta file" />
295 </when>
296 </conditional>
297 <expand macro="bowtie_options"/>
298 </when>
299 <when value="sam">
300 <!-- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam -->
301 <param name="matepair" type="select" label="Library Type">
302 <option value="single">Single End Reads</option>
303 <option value="paired">Paired End Reads</option>
304 </param>
305 <param name="rsem_sam" type="data" format="rsem_sam" label="RSEM formatted SAM file" />
306 </when>
307 </conditional>
308 <expand macro="rsem_options"/>
309 <conditional name="rsem_outputs">
310 <param name="result_bams" type="select" label="Create bam results files"
311 help="In addition to the transcript-coordinate-based BAM file output, also output a BAM file with the read alignments in genomic coordinates" >
312 <option value="none">No BAM results files</option>
313 <option value="default" selected="true">Transcript BAM results file</option>
314 <option value="both">Transcript and genome BAM results files</option>
315 </param>
316 <when value="none"/>
317 <when value="default">
318 <expand macro="sampling_for_bam"/>
319 </when>
320 <when value="both">
321 <expand macro="sampling_for_bam"/>
322 </when>
323 </conditional>
324 </when>
325 <when value="No">
326 </when>
327 </conditional>
328 </inputs>
329
<!-- Output datasets. Each one is filtered on the form selections so that only files the command actually produces are created.
     The from_work_dir names start with "rsem_output", the sample name hard-coded in the command. -->
330 <outputs>
<!-- RSEM reference: only when a reference is built in this job. -->
331 <data format="rsem_ref" name="reference_file" label="RSEM ${job.reference_name} reference">
332 <filter>job['select_job'] == "index"</filter>
333 </data>
<!-- Gene-level and isoform-level abundance tables: whenever expression is calculated. -->
334 <data format="tabular" name="gene_abundances" label="${run_rsem.sample}.gene_abundances" from_work_dir="rsem_output.genes.results">
335 <filter>run_rsem['select'] == "Yes"</filter>
336 </data>
337 <data format="tabular" name="isoform_abundances" label="${run_rsem.sample}.isoform_abundances" from_work_dir="rsem_output.isoforms.results">
338 <filter>run_rsem['select'] == "Yes"</filter>
339 </data>
<!-- Transcript BAM: for every BAM setting except "none". -->
340 <data format="bam" name="transcript_sorted_bam" label="${run_rsem.sample}.transcript.bam" from_work_dir="rsem_output.transcript.sorted.bam" >
341 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] != "none"</filter>
342 </data>
<!-- Genome BAM: only when both BAM files were requested. -->
343 <data format="bam" name="genome_sorted_bam" label="${run_rsem.sample}.genome.bam" from_work_dir="rsem_output.genome.sorted.bam">
344 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] == "both"</filter>
345 </data>
<!-- Log: receives the redirected stdout of rsem-calculate-expression. -->
346 <data format="txt" name="log" label="${run_rsem.sample}.rsem_log">
347 <filter>run_rsem['select'] == "Yes"</filter>
348 </data>
349 </outputs>
350
351 <tests>
<!-- Test 1: build a reference from a genome FASTA plus GTF, then quantify single-end uncompressed (fastqsanger) reads against it, without BAM output. -->
352 <test>
353 <param name="select_job" value="index"/>
354 <param name="ref_type" value="genomic"/>
355 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
356 <param name="gtf" value="ref.gtf" ftype="gtf"/>
357 <param name="reference_name" value="ref"/>
358 <param name="select" value="Yes"/>
359 <param name="sample" value="rsem_sample"/>
360 <param name="refSrc" value="self"/>
361 <param name="format" value="fastq"/>
362 <param name="matepair" value="single"/>
363 <param name="singlefastq" value="test.fq" ftype="fastqsanger"/>
364 <param name="result_bams" value="none"/>
365 <output name="reference_file">
366 <assert_contents>
367 <has_text text="ref" />
368 </assert_contents>
369 </output>
370 <output name="gene_abundances" value="gene_abundances.tab"/>
371 <output name="isoform_abundances" value="isoform_abundances.tab" />
372 <output name="log">
373 <assert_contents>
374 <has_text text="Expression Results are written" />
375 </assert_contents>
376 </output>
377 </test>
<!-- Test 2: same scenario as test 1, but the reads are supplied gzip-compressed (fastqsanger.gz); the same expected abundance files are used. -->
378 <test>
379 <param name="select_job" value="index"/>
380 <param name="ref_type" value="genomic"/>
381 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
382 <param name="gtf" value="ref.gtf" ftype="gtf"/>
383 <param name="reference_name" value="ref"/>
384 <param name="select" value="Yes"/>
385 <param name="sample" value="rsem_sample"/>
386 <param name="refSrc" value="self"/>
387 <param name="format" value="fastq"/>
388 <param name="matepair" value="single"/>
389 <param name="singlefastq" value="test.fastq.gz" ftype="fastqsanger.gz"/>
390 <param name="result_bams" value="none"/>
391 <output name="reference_file">
392 <assert_contents>
393 <has_text text="ref" />
394 </assert_contents>
395 </output>
396 <output name="gene_abundances" value="gene_abundances.tab"/>
397 <output name="isoform_abundances" value="isoform_abundances.tab" />
398 <output name="log">
399 <assert_contents>
400 <has_text text="Expression Results are written" />
401 </assert_contents>
402 </output>
403 </test>
<!-- Test 3: reference build only (select = "No"); only the reference output is checked. -->
404 <test>
405 <param name="select_job" value="index"/>
406 <param name="ref_type" value="genomic"/>
407 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
408 <param name="gtf" value="ref.gtf" ftype="gtf"/>
409 <param name="reference_name" value="ref"/>
410 <param name="select" value="No"/>
411 <output name="reference_file">
412 <assert_contents>
413 <has_text text="ref" />
414 </assert_contents>
415 </output>
416 </test>
417 </tests>
418
419 <help>
420 .. class:: infomark
421
422 RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/
423
424 NAME
425 rsem-prepare-reference
426
427 SYNOPSIS
428 rsem-prepare-reference [options] reference_fasta_file(s) reference_name
429
430 DESCRIPTION
431 The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie indices using default parameters.
432 This program is used in conjunction with the 'rsem-calculate-expression' program.
433
434 INPUTS
435 A fasta file of transcripts
436 or
437 A genome sequence fasta file and a GTF gene annotation file. (When using UCSC data, include the related knownIsoforms.txt)
438
439 ---
440
441 NAME
442 rsem-calculate-expression - Estimate gene and isoform expression from
443 RNA-Seq data.
444
445 SYNOPSIS
446 rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name
447 rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name
448 rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name
449
450 ARGUMENTS
451 upstream_read_files(s)
452 Comma-separated list of files containing single-end reads or
453 upstream reads for paired-end data. By default, these files are
454 assumed to be in FASTQ format. If the --no-qualities option is
455 specified, then FASTA format is expected.
456
457 downstream_read_file(s)
458 Comma-separated list of files containing downstream reads which are
459 paired with the upstream reads. By default, these files are assumed
460 to be in FASTQ format. If the --no-qualities option is specified,
461 then FASTA format is expected.
462
463 input
464 SAM/BAM/CRAM formatted input file. If "-" is specified for the
465 filename, the input is instead assumed to come from standard input.
466 RSEM requires all alignments of the same read group together. For
467 paired-end reads, RSEM also requires the two mates of any alignment
468 be adjacent. In addition, RSEM does not allow the SEQ and QUAL
469 fields to be empty. See Description section for how to make input
470 file obey RSEM's requirements.
471
472 reference_name
473 The name of the reference used. The user must have run
474 'rsem-prepare-reference' with this reference_name before running
475 this program.
476
477 sample_name
478 The name of the sample analyzed. All output files are prefixed by
479 this name (e.g., sample_name.genes.results)
480
481 BASIC OPTIONS
482 --paired-end
483 Input reads are paired-end reads. (Default: off)
484
485 --no-qualities
486 Input reads do not contain quality scores. (Default: off)
487
488 --strandedness &lt;none|forward|reverse&gt;
489 This option defines the strandedness of the RNA-Seq reads. It
490 recognizes three values: 'none', 'forward', and 'reverse'. 'none'
491 refers to non-strand-specific protocols. 'forward' means all
492 (upstream) reads are derived from the forward strand. 'reverse'
493 means all (upstream) reads are derived from the reverse strand. If
494 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2
495 option will also be enabled to avoid aligning reads to the opposite
496 strand. For Illumina TruSeq Stranded protocols, please use
497 'reverse'. (Default: 'none')
498
499 -p/--num-threads &lt;int&gt;
500 Number of threads to use. Both Bowtie/Bowtie2, expression estimation
501 and 'samtools sort' will use this many threads. (Default: 1)
502
503 --alignments
504 Input file contains alignments in SAM/BAM/CRAM format. The exact
505 file format will be determined automatically. (Default: off)
506
507 --fai &lt;file&gt;
508 If the header section of input alignment file does not contain
509 reference sequence information, this option should be turned on.
510 &lt;file&gt; is a FAI format file containing each reference sequence's
511 name and length. Please refer to the SAM official website for the
512 details of FAI format. (Default: off)
513
514 --bowtie2
515 Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM
516 does not handle indel, local and discordant alignments, the Bowtie2
517 parameters are set in a way to avoid those alignments. In
518 particular, we use options '--sensitive --dpad 0 --gbar 99999999
519 --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter
520 of '--score-min', '-0.1', is the negative of maximum mismatch rate.
521 This rate can be set by option '--bowtie2-mismatch-rate'. If reads
522 are paired-end, we additionally use options '--no-mixed' and
523 '--no-discordant'. (Default: off)
524
525 --star
526 Use STAR to align reads. Alignment parameters are from ENCODE3's
527 STAR-RSEM pipeline. To save computational time and memory resources,
528 STAR's Output BAM file is unsorted. It is stored in RSEM's temporary
529 directory with name as 'sample_name.bam'. Each STAR job will have
530 its own private copy of the genome in memory. (Default: off)
531
532 --append-names
533 If gene_name/transcript_name is available, append it to the end of
534 gene_id/transcript_id (separated by '_') in files
535 'sample_name.isoforms.results' and 'sample_name.genes.results'.
536 (Default: off)
537
538 --seed &lt;uint32&gt;
539 Set the seed for the random number generators used in calculating
540 posterior mean estimates and credibility intervals. The seed must be
541 a non-negative 32 bit integer. (Default: off)
542
543 --single-cell-prior
544 By default, RSEM uses Dirichlet(1) as the prior to calculate
545 posterior mean estimates and credibility intervals. However, far
546 fewer genes are expressed in single cell RNA-Seq data. Thus, if you
547 want to compute posterior mean estimates and/or credibility
548 intervals and you have single-cell RNA-Seq data, you are recommended
549 to turn on this option. Then RSEM will use Dirichlet(0.1) as the
550 prior which encourages the sparsity of the expression levels.
551 (Default: off)
552
553 --calc-pme
554 Run RSEM's collapsed Gibbs sampler to calculate posterior mean
555 estimates. (Default: off)
556
557 --calc-ci
558 Calculate 95% credibility intervals and posterior mean estimates.
559 The credibility level can be changed by setting
560 '--ci-credibility-level'. (Default: off)
561
562 -q/--quiet
563 Suppress the output of logging information. (Default: off)
564
565 -h/--help
566 Show help information.
567
568 --version
569 Show version information.
570
571 OUTPUT OPTIONS
572 --sort-bam-by-read-name
573 Sort BAM file aligned under transcript coordinate by read name.
574 Setting this option on will produce deterministic maximum likelihood
575 estimations from independent runs. Note that sorting will take long
576 time and lots of memory. (Default: off)
577
578 --no-bam-output
579 Do not output any BAM file. (Default: off)
580
581 --sampling-for-bam
582 When RSEM generates a BAM file, instead of outputting all alignments
583 a read has with their posterior probabilities, one alignment is
584 sampled according to the posterior probabilities. The sampling
585 procedure includes the alignment to the "noise" transcript, which
586 does not appear in the BAM file. Only the sampled alignment has a
587 weight of 1. All other alignments have weight 0. If the "noise"
588 transcript is sampled, all alignments appeared in the BAM file
589 should have weight 0. (Default: off)
590
591 --output-genome-bam
592 Generate a BAM file, 'sample_name.genome.bam', with alignments
593 mapped to genomic coordinates and annotated with their posterior
594 probabilities. In addition, RSEM will call samtools (included in
595 RSEM package) to sort and index the bam file.
596 'sample_name.genome.sorted.bam' and
597 'sample_name.genome.sorted.bam.bai' will be generated. (Default:
598 off)
599
600 --sort-bam-by-coordinate
601 Sort RSEM generated transcript and genome BAM files by coordinates
602 and build associated indices. (Default: off)
603
604 --sort-bam-memory-per-thread &lt;string&gt;
605 Set the maximum memory per thread that can be used by 'samtools
606 sort'. &lt;string&gt; represents the memory and accepts suffixes 'K/M/G'.
607 RSEM will pass &lt;string&gt; to the '-m' option of 'samtools sort'. Note
608 that the default used here is different from the default used by
609 samtools. (Default: 1G)
610
611 ALIGNER OPTIONS
612 --seed-length &lt;int&gt;
613 Seed length used by the read aligner. Providing the correct value is
614 important for RSEM. If RSEM runs Bowtie, it uses this value for
615 Bowtie's seed length parameter. Any read with its or at least one of
616 its mates' (for paired-end reads) length less than this value will
617 be ignored. If poly(A) tails are not added to the references, the
618 minimum allowed value is 5, otherwise, the minimum allowed value is
619 25. Note that this script will only check if the value &gt;= 5 and give
620 a warning message if the value &lt; 25 but &gt;= 5. (Default: 25)
621
622 --phred33-quals
623 Input quality scores are encoded as Phred+33. (Default: on)
624
625 --phred64-quals
626 Input quality scores are encoded as Phred+64 (default for GA
627 Pipeline ver. &gt;= 1.3). (Default: off)
628
629 --solexa-quals
630 Input quality scores are solexa encoded (from GA Pipeline ver. &lt;
631 1.3). (Default: off)
632
633 --bowtie-path &lt;path&gt;
634 The path to the Bowtie executables. (Default: the path to the Bowtie
635 executables is assumed to be in the user's PATH environment
636 variable)
637
638 --bowtie-n &lt;int&gt;
639 (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3,
640 Default: 2)
641
642 --bowtie-e &lt;int&gt;
643 (Bowtie parameter) max sum of mismatch quality scores across the
644 alignment. (Default: 99999999)
645
646 --bowtie-m &lt;int&gt;
647 (Bowtie parameter) suppress all alignments for a read if &gt; &lt;int&gt;
648 valid alignments exist. (Default: 200)
649
650 --bowtie-chunkmbs &lt;int&gt;
651 (Bowtie parameter) memory allocated for best first alignment
652 calculation (Default: 0 - use Bowtie's default)
653
654 --bowtie2-path &lt;path&gt;
655 (Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default:
656 the path to the Bowtie 2 executables is assumed to be in the user's
657 PATH environment variable)
658
659 --bowtie2-mismatch-rate &lt;double&gt;
660 (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default:
661 0.1)
662
663 --bowtie2-k &lt;int&gt;
664 (Bowtie 2 parameter) Find up to &lt;int&gt; alignments per read. (Default:
665 200)
666
667 --bowtie2-sensitivity-level &lt;string&gt;
668 (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end
669 mode. This option controls how hard Bowtie 2 tries to find
670 alignments. &lt;string&gt; must be one of "very_fast", "fast", "sensitive"
671 and "very_sensitive". The four candidates correspond to Bowtie 2's
672 "--very-fast", "--fast", "--sensitive" and "--very-sensitive"
673 options. (Default: "sensitive" - use Bowtie 2's default)
674
675 --star-path &lt;path&gt;
676 The path to STAR's executable. (Default: the path to STAR executable
677 is assumed to be in user's PATH environment variable)
678
679 --star-gzipped-read-file
680 (STAR parameter) Input read file(s) is compressed by gzip. (Default:
681 off)
682
683 --star-bzipped-read-file
684 (STAR parameter) Input read file(s) is compressed by bzip2.
685 (Default: off)
686
687 --star-output-genome-bam
688 (STAR parameter) Save the BAM file from STAR alignment under genomic
689 coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted
690 by genomic coordinate. In this file, according to STAR's manual,
691 'paired ends of an alignment are always adjacent, and multiple
692 alignments of a read are adjacent as well'. (Default: off)
693
694 ADVANCED OPTIONS
695 --tag &lt;string&gt;
696 The name of the optional field used in the SAM input for identifying
697 a read with too many valid alignments. The field should have the
698 format &lt;tagName&gt;:i:&lt;value&gt;, where a &lt;value&gt; bigger than 0 indicates
699 a read with too many alignments. (Default: "")
700
701 --fragment-length-min &lt;int&gt;
702 Minimum read/insert length allowed. This is also the value for the
703 Bowtie/Bowtie2 -I option. (Default: 1)
704
705 --fragment-length-max &lt;int&gt;
706 Maximum read/insert length allowed. This is also the value for the
707 Bowtie/Bowtie 2 -X option. (Default: 1000)
708
709 --fragment-length-mean &lt;double&gt;
710 (single-end data only) The mean of the fragment length distribution,
711 which is assumed to be a Gaussian. (Default: -1, which disables use
712 of the fragment length distribution)
713
714 --fragment-length-sd &lt;double&gt;
715 (single-end data only) The standard deviation of the fragment length
716 distribution, which is assumed to be a Gaussian. (Default: 0, which
717 assumes that all fragments are of the same length, given by the
718 rounded value of --fragment-length-mean)
719
720 --estimate-rspd
721 Set this option if you want to estimate the read start position
722 distribution (RSPD) from data. Otherwise, RSEM will use a uniform
723 RSPD. (Default: off)
724
725 --num-rspd-bins &lt;int&gt;
726 Number of bins in the RSPD. Only relevant when '--estimate-rspd' is
727 specified. Use of the default setting is recommended. (Default: 20)
728
729 --gibbs-burnin &lt;int&gt;
730 The number of burn-in rounds for RSEM's Gibbs sampler. Each round
731 passes over the entire data set once. If RSEM can use multiple
732 threads, multiple Gibbs samplers will start at the same time and all
733 samplers share the same burn-in number. (Default: 200)
734
735 --gibbs-number-of-samples &lt;int&gt;
736 The total number of count vectors RSEM will collect from its Gibbs
737 samplers. (Default: 1000)
738
739 --gibbs-sampling-gap &lt;int&gt;
740 The number of rounds between two successive count vectors RSEM
741 collects. If the count vector after round N is collected, the count
742 vector after round N + &lt;int&gt; will also be collected. (Default: 1)
743
744 --ci-credibility-level &lt;double&gt;
745 The credibility level for credibility intervals. (Default: 0.95)
746
747 --ci-memory &lt;int&gt;
748 Maximum size (in memory, MB) of the auxiliary buffer used for
749 computing credibility intervals (CI). (Default: 1024)
750
751 --ci-number-of-samples-per-count-vector &lt;int&gt;
752 The number of read generating probability vectors sampled per
753 sampled count vector. The credibility intervals are calculated by
754 first sampling P(C | D) and then sampling P(Theta | C) for each
755 sampled count vector. This option controls how many Theta vectors
756 are sampled per sampled count vector. (Default: 50)
757
758 --keep-intermediate-files
759 Keep temporary files generated by RSEM. RSEM creates a temporary
760 directory, 'sample_name.temp', into which it puts all intermediate
761 output files. If this directory already exists, RSEM overwrites all
762 files generated by previous RSEM runs inside of it. By default,
763 after RSEM finishes, the temporary directory is deleted. Set this
764 option to prevent the deletion of this directory and the
765 intermediate files inside of it. (Default: off)
766
767 --temporary-folder &lt;string&gt;
768 Set where to put the temporary files generated by RSEM. If the
769 folder specified does not exist, RSEM will try to create it.
770 (Default: sample_name.temp)
771
772 --time
773 Output time consumed by each step of RSEM to 'sample_name.time'.
774 (Default: off)
775
776 PRIOR-ENHANCED RSEM OPTIONS
777 --run-pRSEM
778 Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e.
779 isoform's initial pseudo-count for RSEM's Gibbs sampling, will be
780 learned from input RNA-seq data and an external data set. When pRSEM
781 needs and only needs ChIP-seq peak information to partition isoforms
782 (e.g. in pRSEM's default partition model), either ChIP-seq peak file
783 (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for
784 target and input and the path for Bowtie executables are required
785 (with the '--chipseq-target-read-files &lt;string&gt;',
786 '--chipseq-control-read-files &lt;string&gt;', and '--bowtie-path &lt;path&gt;'
787 options), otherwise, ChIP-seq FASTQ files for target and control and
788 the path to Bowtie executables are required. (Default: off)
789
790 --chipseq-peak-file &lt;string&gt;
791 Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e.
792 BED6+4, format. This file is used when running prior-enhanced RSEM
793 in the default two-partition model. It partitions isoforms by
794 whether they have ChIP-seq overlapping with their transcription
795 start site region or not. Each partition will have its own prior
796 parameter learned from a training set. This file can be either
797 gzipped or ungzipped. (Default: "")
798
799 --chipseq-target-read-files &lt;string&gt;
800 Comma-separated full path of FASTQ read file(s) for ChIP-seq target.
801 This option is used when running prior-enhanced RSEM. It provides
802 information to calculate ChIP-seq peaks and signals. The file(s) can
803 be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The
804 options '--bowtie-path &lt;path&gt;' and '--chipseq-control-read-files
805 &lt;string&gt;' must be defined when this option is specified. (Default:
806 "")
807
808 --chipseq-control-read-files &lt;string&gt;
809 Comma-separated full path of FASTQ read file(s) for ChIP-seq control.
810 This option is used when running prior-enhanced RSEM. It provides
811 information to call ChIP-seq peaks. The file(s) can be either
812 ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options
813 '--bowtie-path &lt;path&gt;' and '--chipseq-target-read-files &lt;string&gt;'
814 must be defined when this option is specified. (Default: "")
815
816 --chipseq-read-files-multi-targets &lt;string&gt;
817 Comma-separated full path of FASTQ read files for multiple ChIP-seq
818 targets. This option is used when running prior-enhanced RSEM, where
819 prior is learned from multiple complementary data sets. It provides
820 information to calculate ChIP-seq signals. All files can be either
821 ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this
822 option is specified, the option '--bowtie-path &lt;path&gt;' must be
823 defined and the option '--partition-model &lt;string&gt;' will be set to
824 'cmb_lgt' automatically. (Default: "")
825
826 --chipseq-bed-files-multi-targets &lt;string&gt;
827 Comma-separated full path of BED files for multiple ChIP-seq
828 targets. This option is used when running prior-enhanced RSEM, where
829 prior is learned from multiple complementary data sets. It provides
830 information of ChIP-seq signals and must have at least the first six
831 BED columns. All files can be either ungzipped or gzipped with a
832 suffix '.gz' or '.gzip'. When this option is specified, the option
833 '--partition-model &lt;string&gt;' will be set to 'cmb_lgt' automatically.
834 (Default: "")
835
836 --cap-stacked-chipseq-reads
837 Keep a maximum number of ChIP-seq reads that aligned to the same
838 genomic interval. This option is used when running prior-enhanced
839 RSEM, where prior is learned from multiple complementary data sets.
840 This option is only in use when either
841 '--chipseq-read-files-multi-targets &lt;string&gt;' or
842 '--chipseq-bed-files-multi-targets &lt;string&gt;' is specified. (Default:
843 off)
844
845 --n-max-stacked-chipseq-reads &lt;int&gt;
846 The maximum number of stacked ChIP-seq reads to keep. This option is
847 used when running prior-enhanced RSEM, where prior is learned from
848 multiple complementary data sets. This option is only in use when
849 the option '--cap-stacked-chipseq-reads' is set. (Default: 5)
850
851 --partition-model &lt;string&gt;
852 A keyword to specify the partition model used by prior-enhanced
853 RSEM. It must be one of the following keywords:
854
855 - pk
856 Partitioned by whether an isoform has a ChIP-seq peak overlapping
857 with its transcription start site (TSS) region. The TSS region is
858 defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer to this
859 type of peak as 'TSS peak' when explaining other keywords.
860
861 - pk_lgtnopk
862 First partitioned by TSS peak. Then, for isoforms in the 'no TSS
863 peak' set, a logistic model is employed to further classify them
864 into two partitions.
865
866 - lm3, lm4, lm5, or lm6
867 Based on their ChIP-seq signals, isoforms are classified into 3,
868 4, 5, or 6 partitions by a linear regression model.
869
870 - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk
871 First partitioned by TSS peak. Then, for isoforms in the 'with TSS
872 peak' set, a linear regression model is employed to further
873 classify them into 2, 3, 4, or 5 partitions.
874
875 - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk
876 First partitioned by TSS peak. Then, for isoforms in the 'no TSS
877 peak' set, a linear regression model is employed to further
878 classify them into 2, 3, 4, or 5 partitions.
879
880 - cmb_lgt
881 Using a logistic regression to combine TSS signals from multiple
882 complementary data sets and partition training set isoform into
883 'expressed' and 'not expressed'. This partition model is only in
884 use when either '--chipseq-read-files-multi-targets &lt;string&gt;' or
885 '--chipseq-bed-files-multi-targets &lt;string&gt;' is specified.
886
887 Parameters for all the above models are learned from a training set.
888 For detailed explanations, please see prior-enhanced RSEM's paper.
889 (Default: 'pk')
890
891 DEPRECATED OPTIONS
892 The options in this section are deprecated. They are here only for
893 compatibility reasons and may be removed in future releases.
894
895 --sam
896 Inputs are alignments in SAM format. (Default: off)
897
898 --bam
899 Inputs are alignments in BAM format. (Default: off)
900
901 --strand-specific
902 Equivalent to '--strandedness forward'. (Default: off)
903
904 --forward-prob &lt;double&gt;
905 Probability of generating a read from the forward strand of a
906 transcript. Set to 1 for a strand-specific protocol where all
907 (upstream) reads are derived from the forward strand, 0 for a
908 strand-specific protocol where all (upstream) reads are derived from
909 the reverse strand, or 0.5 for a non-strand-specific protocol.
910 (Default: off)
911
912 DESCRIPTION
913 In its default mode, this program aligns input reads against a reference
914 transcriptome with Bowtie and calculates expression values using the
915 alignments. RSEM assumes the data are single-end reads with quality
916 scores, unless the '--paired-end' or '--no-qualities' options are
917 specified. Alternatively, users can use STAR to align reads using the
918 '--star' option. RSEM has provided options in 'rsem-prepare-reference'
919 to prepare STAR's genome indices. Users may use an alternative aligner
920 by specifying '--alignments', and providing an alignment file in
921 SAM/BAM/CRAM format. However, users should make sure that they align
922 against the indices generated by 'rsem-prepare-reference' and the
923 alignment file satisfies the requirements mentioned in ARGUMENTS
924 section.
925
926 One simple way to make the alignment file satisfy RSEM's requirements
927 is to use the 'convert-sam-for-rsem' script. This script accepts
928 SAM/BAM/CRAM files as input and outputs a BAM file. For example, type
929 the following command to convert a SAM file, 'input.sam', to a
930 ready-for-use BAM file, 'input_for_rsem.bam':
931
932 convert-sam-for-rsem input.sam input_for_rsem
933
934 For details, please refer to 'convert-sam-for-rsem's documentation page.
935
936 NOTES
937 1. Users must run 'rsem-prepare-reference' with the appropriate
938 reference before using this program.
939
940 2. For single-end data, it is strongly recommended that the user provide
941 the fragment length distribution parameters (--fragment-length-mean and
942 --fragment-length-sd). For paired-end data, RSEM will automatically
943 learn a fragment length distribution from the data.
944
945 3. Some aligner parameters have default values different from their
946 original settings.
947
948 4. With the '--calc-pme' option, posterior mean estimates will be
949 calculated in addition to maximum likelihood estimates.
950
951 5. With the '--calc-ci' option, 95% credibility intervals and posterior
952 mean estimates will be calculated in addition to maximum likelihood
953 estimates.
954
955 6. The temporary directory and all intermediate files will be removed
956 when RSEM finishes unless '--keep-intermediate-files' is specified.
957
958 With the '--run-pRSEM' option and associated options (see section
959 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM
960 will be running. Prior parameters will be learned from supplied external
961 data set(s) and assigned as initial pseudo-counts for isoforms in the
962 corresponding partition for Gibbs sampling.
963
964 OUTPUT
965 sample_name.isoforms.results
966 File containing isoform level expression estimates. The first line
967 contains column names separated by the tab character. The format of
968 each line in the rest of this file is:
969
970 transcript_id gene_id length effective_length expected_count TPM
971 FPKM IsoPct [posterior_mean_count
972 posterior_standard_deviation_of_count pme_TPM pme_FPKM
973 IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound
974 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
975 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
976
977 Fields are separated by the tab character. Fields within "[]" are
978 optional. They will not be presented if neither '--calc-pme' nor
979 '--calc-ci' is set.
980
981 'transcript_id' is the transcript name of this transcript. 'gene_id'
982 is the gene name of the gene which this transcript belongs to
983 (denote this gene as its parent gene). If no gene information is
984 provided, 'gene_id' and 'transcript_id' are the same.
985
986 'length' is this transcript's sequence length (poly(A) tail is not
987 counted). 'effective_length' counts only the positions that can
988 generate a valid fragment. If no poly(A) tail is added,
989 'effective_length' is equal to transcript length - mean fragment
990 length + 1. If a transcript's effective length is less than 1,
991 both its effective length and its abundance estimates are
992 set to 0.
993
994 'expected_count' is the sum of the posterior probabilities that each
995 read comes from this transcript over all reads. Because 1) each read
996 aligning to this transcript has a probability of being generated
997 from background noise; 2) RSEM may filter some alignable low quality
998 reads, the sum of expected counts for all transcripts is generally
999 less than the total number of reads aligned.
1000
1001 'TPM' stands for Transcripts Per Million. It is a relative measure
1002 of transcript abundance. The sum of all transcripts' TPM is 1
1003 million. 'FPKM' stands for Fragments Per Kilobase of transcript per
1004 Million mapped reads. It is another relative measure of transcript
1005 abundance. If we define l_bar to be the mean transcript length in a
1006 sample, which can be calculated as
1007
1008 l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through
1009 every transcript),
1010
1011 the following equation holds:
1012
1013 FPKM_i = 10^3 / l_bar * TPM_i.
1014
1015 We can see that the sum of FPKM is not a constant across samples.
1016
1017 'IsoPct' stands for isoform percentage. It is the percentage of this
1018 transcript's abundance over its parent gene's abundance. If its
1019 parent gene has only one isoform or the gene information is not
1020 provided, this field will be set to 100.
1021
1022 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean
1023 estimates calculated by RSEM's Gibbs sampler.
1024 'posterior_standard_deviation_of_count' is the posterior standard
1025 deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage
1026 calculated from 'pme_TPM' values.
1027
1028 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound'
1029 and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
1030 credibility intervals for TPM and FPKM values. The bounds are
1031 inclusive (i.e. [l, u]).
1032
1033 'TPM_coefficient_of_quartile_variation' and
1034 'FPKM_coefficient_of_quartile_variation' are coefficients of
1035 quartile variation (CQV) for TPM and FPKM values. CQV is a robust
1036 way of measuring the ratio between the standard deviation and the
1037 mean. It is defined as
1038
1039 CQV := (Q3 - Q1) / (Q3 + Q1),
1040
1041 where Q1 and Q3 are the first and third quartiles.
1042
1043 sample_name.genes.results
1044 File containing gene level expression estimates. The first line
1045 contains column names separated by the tab character. The format of
1046 each line in the rest of this file is:
1047
1048 gene_id transcript_id(s) length effective_length expected_count TPM
1049 FPKM [posterior_mean_count posterior_standard_deviation_of_count
1050 pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound
1051 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
1052 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
1053
1054 Fields are separated by the tab character. Fields within "[]" are
1055 optional. They will not be presented if neither '--calc-pme' nor
1056 '--calc-ci' is set.
1057
1058 'transcript_id(s)' is a comma-separated list of transcript_ids
1059 belonging to this gene. If no gene information is provided,
1060 'gene_id' and 'transcript_id(s)' are identical (the
1061 'transcript_id').
1062
1063 A gene's 'length' and 'effective_length' are defined as the weighted
1064 average of its transcripts' lengths and effective lengths (weighted
1065 by 'IsoPct'). A gene's abundance estimates are just the sum of its
1066 transcripts' abundance estimates.
1067
1068 sample_name.alleles.results
1069 Only generated when the RSEM references are built with
1070 allele-specific transcripts.
1071
1072 This file contains allele level expression estimates for
1073 allele-specific expression calculation. The first line contains
1074 column names separated by the tab character. The format of each line
1075 in the rest of this file is:
1076
1077 allele_id transcript_id gene_id length effective_length
1078 expected_count TPM FPKM AlleleIsoPct AlleleGenePct
1079 [posterior_mean_count posterior_standard_deviation_of_count pme_TPM
1080 pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM
1081 TPM_ci_lower_bound TPM_ci_upper_bound
1082 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
1083 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
1084
1085 Fields are separated by the tab character. Fields within "[]" are
1086 optional. They will not be presented if neither '--calc-pme' nor
1087 '--calc-ci' is set.
1088
1089 'allele_id' is the allele-specific name of this allele-specific
1090 transcript.
1091
1092 'AlleleIsoPct' stands for allele-specific percentage on isoform
1093 level. It is the percentage of this allele-specific transcript's
1094 abundance over its parent transcript's abundance. If its parent
1095 transcript has only one allele variant form, this field will be set
1096 to 100.
1097
1098 'AlleleGenePct' stands for allele-specific percentage on gene level.
1099 It is the percentage of this allele-specific transcript's abundance
1100 over its parent gene's abundance.
1101
1102 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have
1103 similar meanings. They are calculated based on posterior mean
1104 estimates.
1105
1106 Please note that if this file is present, the fields 'length' and
1107 'effective_length' in 'sample_name.isoforms.results' should be
1108 interpreted similarly as the corresponding definitions in
1109 'sample_name.genes.results'.
1110
1111 sample_name.transcript.bam
1112 Only generated when --no-bam-output is not specified.
1113
1114 'sample_name.transcript.bam' is a BAM-formatted file of read
1115 alignments in transcript coordinates. The MAPQ field of each
1116 alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)),
1117 where w is the posterior probability of that alignment being the
1118 true mapping of a read. In addition, RSEM pads a new tag ZW:f:value,
1119 where value is a single precision floating number representing the
1120 posterior probability. Because this file contains all alignment
1121 lines produced by bowtie or user-specified aligners, it can also be
1122 used as a replacement of the aligner generated BAM/SAM file.
1123
1124 sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai
1125 Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified.
1126
1127 'sample_name.transcript.sorted.bam' and
1128 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
1129 indices generated by samtools (included in RSEM package).
1130
1131 sample_name.genome.bam
1132 Only generated when --no-bam-output is not specified and
1133 --output-genome-bam is specified.
1134
1135 'sample_name.genome.bam' is a BAM-formatted file of read alignments
1136 in genomic coordinates. Alignments of reads that have identical
1137 genomic coordinates (i.e., alignments to different isoforms that
1138 share the same genomic region) are collapsed into one alignment. The
1139 MAPQ field of each alignment is set to min(100, floor(-10 *
1140 log10(1.0 - w) + 0.5)), where w is the posterior probability of that
1141 alignment being the true mapping of a read. In addition, RSEM pads a
1142 new tag ZW:f:value, where value is a single precision floating
1143 number representing the posterior probability. If an alignment is
1144 spliced, a XS:A:value tag is also added, where value is either '+'
1145 or '-' indicating the strand of the transcript it aligns to.
1146
1147 sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai
1148 Only generated when --no-bam-output is not specified, and
1149 --sort-bam-by-coordinate and --output-genome-bam are specified.
1150
1151 'sample_name.genome.sorted.bam' and
1152 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and
1153 indices generated by samtools (included in RSEM package).
1154
1155 sample_name.time
1156 Only generated when --time is specified.
1157
1158 It contains time (in seconds) consumed by aligning reads, estimating
1159 expression levels and calculating credibility intervals.
1160
1161 sample_name.stat
1162 This is a folder instead of a file. All model related statistics are
1163 stored in this folder. 'rsem-plot-model' can generate plots
1164 using this folder.
1165
1166 'sample_name.stat/sample_name.cnt' contains alignment statistics.
1167 The format and meanings of each field are described in
1168 'cnt_file_description.txt' under RSEM directory.
1169
1170 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters
1171 learned from the data. The format and meanings of each field of this
1172 file are described in 'model_file_description.txt' under RSEM
1173 directory.
1174
1175 The following four output files will be generated only by
1176 prior-enhanced RSEM
1177
1178 - 'sample_name.stat/sample_name_prsem.all_tr_features'
1179 It stores isoform features for deriving and assigning pRSEM prior.
1180 The first line is a header and the rest is one isoform per line.
1181 The description for each column is:
1182
1183 * trid: transcript ID from input annotation
1184
1185 * geneid: gene ID from input annotation
1186
1187 * chrom: isoform's chromosome name
1188
1189 * strand: isoform's strand name
1190
1191 * start: isoform's end with the lowest genomic loci
1192
1193 * end: isoform's end with the highest genomic loci
1194
1195 * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where
1196 TSS is isoform's transcription start site, i.e. 5'-end
1197
1198 * body_mpp: average mappability of (TSS+500bp, TES-500bp), where
1199 TES is isoform's transcription end site, i.e. 3'-end
1200
1201 * tes_mpp: average mappability of [TES-500bp, TES+500bp]
1202
1203 * pme_count: isoform's fragment or read count from RSEM's
1204 posterior mean estimates
1205
1206 * tss: isoform's TSS loci
1207
1208 * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region
1209 overlaps with a RNA Pol II peak; 0 otherwise
1210
1211 * is_training: equal to 1 if isoform is in the training set where
1212 Pol II prior is learned; 0 otherwise
1213
1214 - 'sample_name.stat/sample_name_prsem.all_tr_prior'
1215 It stores prior parameters for every isoform. This file does not
1216 have a header. Each line contains a prior parameter and an
1217 isoform's transcript ID delimited by " # ".
1218
1219 - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results'
1220 RSEM's posterior mean estimates on the isoform level with an
1221 initial pseudo-count of one for every isoform. It is in the same
1222 format as the 'sample_name.isoforms.results'.
1223
1224 - 'sample_name.stat/sample_name_uniform_prior_1.genes.results'
1225 RSEM's posterior mean estimates on the gene level with an initial
1226 pseudo-count of one for every isoform. It is in the same format as
1227 the 'sample_name.genes.results'.
1228
1229 When learning prior from multiple external data sets in
1230 prior-enhanced RSEM, two additional output files will be generated.
1231
1232 - 'sample_name.stat/sample_name.pval_LL'
1233 It stores a p-value and a log-likelihood. The p-value indicates
1234 whether the combination of multiple complementary data sets is
1235 informative for RNA-seq quantification. The log-likelihood shows
1236 how well pRSEM's Dirichlet-multinomial model fits the read counts
1237 of partitioned training set isoforms.
1238
1239 - 'sample_name.stat/sample_name.lgt_mdl.RData'
1240 It stores an R object named 'glmmdl', which is a logistic
1241 regression model on the training set isoforms and multiple
1242 external data sets.
1243
1244 In addition, extra columns will be added to
1245 'sample_name.stat/sample_name_prsem.all_tr_features'
1246
1247 * is_expr: equal to 1 if isoform has an abundance &gt;= 1 TPM and a
1248 non-zero read count from RSEM's posterior mean estimates; 0
1249 otherwise
1250
1251 * "$external_data_set_basename": log10 of external data's signal at
1252 [TSS-500, TSS+500]. Signal is the number of reads aligned within
1253 that interval and normalized to RPKM by read depth and interval
1254 length. It will be set to -4 if no read aligned to that interval.
1255
1256 There are multiple columns like this one, where each represents an
1257 external data set.
1258
1259 * prd_expr_prob: predicted probability from logistic regression
1260 model on whether this isoform is expressed or not. A probability
1261 higher than 0.5 is considered as expressed
1262
1263 * partition: group index, to which this isoform is partitioned
1264
1265 * prior: prior parameter for this isoform
1266
1267 EXAMPLES
1268 Assume the path to the bowtie executables is in the user's PATH
1269 environment variable. Reference files are under '/ref' with name
1270 'mouse_125'.
1271
1272 1) '/data/mmliver.fq', single-end reads with quality scores. Quality
1273 scores are encoded as for 'GA pipeline version &gt;= 1.3'. We want to use 8
1274 threads and generate a genome BAM file. In addition, we want to append
1275 gene/transcript names to the result files:
1276
1277 rsem-calculate-expression --phred64-quals \
1278 -p 8 \
1279 --append-names \
1280 --output-genome-bam \
1281 /data/mmliver.fq \
1282 /ref/mouse_125 \
1283 mmliver_single_quals
1284
1285 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end
1286 reads with quality scores. Suppose the library is prepared using TruSeq
1287 Stranded Kit, which means the first mate should map to the reverse
1288 strand. Quality scores are in SANGER format. We want to use 8 threads
1289 and do not generate a genome BAM file:
1290
1291 rsem-calculate-expression -p 8 \
1292 --paired-end \
1293 --strandedness reverse \
1294 /data/mmliver_1.fq \
1295 /data/mmliver_2.fq \
1296 /ref/mouse_125 \
1297 mmliver_paired_end_quals
1298
1299 3) '/data/mmliver.fa', single-end reads without quality scores. We want
1300 to use 8 threads:
1301
1302 rsem-calculate-expression -p 8 \
1303 --no-qualities \
1304 /data/mmliver.fa \
1305 /ref/mouse_125 \
1306 mmliver_single_without_quals
1307
1308 4) Data are the same as 1). This time we assume the bowtie executables
1309 are under '/sw/bowtie'. We want to take a fragment length distribution
1310 into consideration. We set the fragment length mean to 150 and the
1311 standard deviation to 35. In addition to a BAM file, we also want to
1312 generate credibility intervals. We allow RSEM to use 1GB of memory for
1313 CI calculation:
1314
1315 rsem-calculate-expression --bowtie-path /sw/bowtie \
1316 --phred64-quals \
1317 --fragment-length-mean 150.0 \
1318 --fragment-length-sd 35.0 \
1319 -p 8 \
1320 --output-genome-bam \
1321 --calc-ci \
1322 --ci-memory 1024 \
1323 /data/mmliver.fq \
1324 /ref/mouse_125 \
1325 mmliver_single_quals
1326
1327 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for
1328 paired-end reads with quality scores. We want to use 8 threads:
1329
1330 rsem-calculate-expression --paired-end \
1331 --alignments \
1332 -p 8 \
1333 /data/mmliver_paired_end_quals.bam \
1334 /ref/mouse_125 \
1335 mmliver_paired_end_quals
1336
1337 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads
1338 with quality scores and read files are compressed by gzip. We want to
1339 use STAR to align reads and assume STAR executable is '/sw/STAR'.
1340 Suppose we want to use 8 threads and do not generate a genome BAM file:
1341
1342 rsem-calculate-expression --paired-end \
1343 --star \
1344 --star-path /sw/STAR \
1345 --gzipped-read-file \
1346 --paired-end \
1347 -p 8 \
1348 /data/mmliver_1.fq.gz \
1349 /data/mmliver_2.fq.gz \
1350 /ref/mouse_125 \
1351 mmliver_paired_end_quals
1352
1353 7) In the above example, suppose we want to run prior-enhanced RSEM
1354 instead. Assuming we want to learn priors from a ChIP-seq peak file
1355 '/data/mmliver.narrowPeak.gz':
1356
1357 rsem-calculate-expression --star \
1358 --star-path /sw/STAR \
1359 --gzipped-read-file \
1360 --paired-end \
1361 --calc-pme \
1362 --run-pRSEM \
1363 --chipseq-peak-file /data/mmliver.narrowPeak.gz \
1364 -p 8 \
1365 /data/mmliver_1.fq.gz \
1366 /data/mmliver_2.fq.gz \
1367 /ref/mouse_125 \
1368 mmliver_paired_end_quals
1369
1370 8) Similar to the example in 7), suppose we want to use the partition
1371 model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and
1372 then partitioning 'no TSS peak' isoforms into two bins by a linear
1373 regression model), and we want to partition isoforms by RNA Pol II's
1374 ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and
1375 '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files
1376 '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables
1377 are under '/sw/bowtie/':
1378
1379 rsem-calculate-expression --star \
1380 --star-path /sw/STAR \
1381 --gzipped-read-file \
1382 --paired-end \
1383 --calc-pme \
1384 --run-pRSEM \
1385 --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \
1386 --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \
1387 --partition-model pk_lm2nopk \
1388 --bowtie-path /sw/bowtie \
1389 -p 8 \
1390 /data/mmliver_1.fq.gz \
1391 /data/mmliver_2.fq.gz \
1392 /ref/mouse_125 \
1393 mmliver_paired_end_quals
1394
1395 9) Similar to the example in 8), suppose we want to derive prior from
1396 four histone modification ChIP-seq read data sets:
1397 '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz',
1398 '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming
1399 Bowtie's executables are under '/sw/bowtie/':
1400
1401 rsem-calculate-expression --star \
1402 --star-path /sw/STAR \
1403 --gzipped-read-file \
1404 --paired-end \
1405 --calc-pme \
1406 --run-pRSEM \
1407 --partition-model cmb_lgt \
1408 --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
1409 --bowtie-path /sw/bowtie \
1410 -p 8 \
1411 /data/mmliver_1.fq.gz \
1412 /data/mmliver_2.fq.gz \
1413 /ref/mouse_125 \
1414 mmliver_paired_end_quals
1415
1416 </help>
1417 <citations>
1418 <citation type="doi">10.1186/1471-2105-12-323</citation>
1419 </citations>
1420
1421 </tool>