Mercurial > repos > artbio > rsem
comparison rsem.xml @ 0:e5e836936d60 draft
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit d84a0359354698a4b29df12ab581c2618bffcf80
author | artbio |
---|---|
date | Sat, 31 Mar 2018 21:30:07 -0400 |
parents | |
children | 49795544dac7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e5e836936d60 |
---|---|
1 <tool id="rsembowtie" name="RSEM-Bowtie" version="0.4.0"> | |
2 <description></description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <requirements> | |
7 <requirement type="package" version="1.3.0">rsem</requirement> | |
8 <requirement type="package" version="1.2.2=py27pl5.22.0_0">bowtie</requirement> | |
9 </requirements> | |
10 <stdio> | |
11 <exit_code range="1:" level="fatal" description="Tool exception" /> | |
12 </stdio> | |
13 <command detect_errors="exit_code"><![CDATA[ | |
14 #if $job.select_job == "index": | |
15 echo ${job.reference_name} " " | tee $reference_file && | |
16 mkdir $reference_file.files_path && | |
17 rsem-prepare-reference | |
18 #if $job.polya.polya_use == 'add': | |
19 #if $job.polya.polya_length: | |
20 --polyA-length $job.polya.polya_length | |
21 #end if | |
22 #elif $job.polya.polya_use == 'subset': | |
23 --no-polyA-subset $job.polya.no_polya_subset | |
24 #if $job.polya.polya_length: | |
25 --polyA-length $job.polya.polya_length | |
26 #end if | |
27 #elif $job.polya.polya_use == 'none': | |
28 --no-polyA | |
29 #end if | |
30 $job.ntog | |
31 #if $job.transcript_to_gene_map: | |
32 --transcript-to-gene-map $job.transcript_to_gene_map | |
33 #end if | |
34 --bowtie | |
35 #if $job.self_reference.ref_type == 'transcripts': | |
36 $job.self_reference.reference_fasta_file | |
37 #else: | |
38 --gtf $job.self_reference.gtf | |
39 $job.self_reference.reference_fasta_file | |
40 #end if | |
41 ${reference_file.files_path}/${job.reference_name} | |
42 > ${reference_file.files_path}/${job.reference_name}.log | |
43 #end if | |
44 | |
45 #if $job.select_job == "index" and $run_rsem.select == "Yes": | |
46 && | |
47 #end if | |
48 | |
49 #if $run_rsem.select == "Yes": | |
50 ## Stage FASTQ reads under fixed names: gunzip compressed datasets, symlink plain ones; the paired branch keys off fastq1 only | |
51 #if $run_rsem.input.fastq.matepair=="single": | |
52 #if $run_rsem.input.fastq.singlefastq.is_of_type('fastq.gz') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger.gz'): | |
53 gunzip < '$run_rsem.input.fastq.singlefastq' > uncomp_single.fastq && | |
54 #elif $run_rsem.input.fastq.singlefastq.is_of_type('fastq') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger'): | |
55 ln -f -s '$run_rsem.input.fastq.singlefastq' 'uncomp_single.fastq' && | |
56 #end if | |
57 #elif $run_rsem.input.fastq.matepair=="paired": | |
58 #if $run_rsem.input.fastq.fastq1.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger.gz'): | |
59 gunzip < '$run_rsem.input.fastq.fastq1' > uncomp_pair1.fastq && | |
60 gunzip < '$run_rsem.input.fastq.fastq2' > uncomp_pair2.fastq && | |
61 #elif $run_rsem.input.fastq.fastq1.is_of_type('fastq') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger'): | |
62 ln -f -s '$run_rsem.input.fastq.fastq1' 'uncomp_pair1.fastq' && | |
63 ln -f -s '$run_rsem.input.fastq.fastq2' 'uncomp_pair2.fastq' && | |
64 #end if | |
65 #end if | |
66 rsem-calculate-expression | |
67 ## --tag string | |
68 #if $run_rsem.seedlength: | |
69 --seed-length $run_rsem.seedlength | |
70 #end if | |
71 --forward-prob $run_rsem.forward_prob | |
72 #if $run_rsem.rsem_options.fullparams == 'fullset': | |
73 ## Fragment info | |
74 #if $run_rsem.rsem_options.fragment_length_mean: | |
75 --fragment-length-mean $run_rsem.rsem_options.fragment_length_mean | |
76 #end if | |
77 #if $run_rsem.rsem_options.fragment_length_min: | |
78 --fragment-length-min $run_rsem.rsem_options.fragment_length_min | |
79 #end if | |
80 #if $run_rsem.rsem_options.fragment_length_sd: | |
81 --fragment-length-sd $run_rsem.rsem_options.fragment_length_sd | |
82 #end if | |
83 #if $run_rsem.rsem_options.fragment_length_max: | |
84 --fragment-length-max $run_rsem.rsem_options.fragment_length_max | |
85 #end if | |
86 ## RSPD | |
87 #if $run_rsem.rsem_options.rspd.estimate == 'yes': | |
88 --estimate-rspd | |
89 #if $run_rsem.rsem_options.rspd.num_rspd_bins: | |
90 --num-rspd-bins $run_rsem.rsem_options.rspd.num_rspd_bins | |
91 #end if | |
92 #end if | |
93 ## Calculate 95% credibility intervals and posterior mean estimates. | |
94 #if $run_rsem.rsem_options.useci.ci == 'yes': | |
95 --calc-ci | |
96 #if $run_rsem.rsem_options.useci.cimem: | |
97 --ci-memory $run_rsem.rsem_options.useci.cimem | |
98 #end if | |
99 #end if | |
100 #end if | |
101 --num-threads \${GALAXY_SLOTS:-4} | |
102 #if $run_rsem.input.format == 'fasta' and $run_rsem.input.bowtie_options.fullparams == 'fullset': | |
103 ## Bowtie params | |
104 #if $run_rsem.input.bowtie_options.bowtie_e: | |
105 --bowtie-e $run_rsem.input.bowtie_options.bowtie_e | |
106 #end if | |
107 #if $run_rsem.input.bowtie_options.bowtie_m: | |
108 --bowtie-m $run_rsem.input.bowtie_options.bowtie_m | |
109 #end if | |
110 --bowtie-n $run_rsem.input.bowtie_options.bowtie_n | |
111 #end if | |
112 #if $run_rsem.input.format == 'fastq' and $run_rsem.input.bowtie_options.fullparams == 'fullset': | |
113 ## Bowtie params | |
114 #if $run_rsem.input.bowtie_options.bowtie_e: | |
115 --bowtie-e $run_rsem.input.bowtie_options.bowtie_e | |
116 #end if | |
117 #if $run_rsem.input.bowtie_options.bowtie_m: | |
118 --bowtie-m $run_rsem.input.bowtie_options.bowtie_m | |
119 #end if | |
120 --bowtie-n $run_rsem.input.bowtie_options.bowtie_n | |
121 #end if | |
122 ## Outputs | |
123 #if $run_rsem.rsem_outputs.result_bams == 'none': | |
124 --no-bam-output | |
125 #elif $run_rsem.rsem_outputs.result_bams == 'default': | |
126 --sort-bam-by-coordinate | |
127 #else | |
128 --sort-bam-by-coordinate | |
129 --output-genome-bam | |
130 $run_rsem.rsem_outputs.sampling_for_bam | |
131 #end if | |
132 ## Input data | |
133 #if $run_rsem.input.format=="fastq" | |
134 $run_rsem.input.fastq_select | |
135 #if $run_rsem.input.fastq.matepair=="single": | |
136 uncomp_single.fastq | |
137 #elif $run_rsem.input.fastq.matepair=="paired": | |
138 --paired-end | |
139 uncomp_pair1.fastq | |
140 uncomp_pair2.fastq | |
141 #end if | |
142 #elif $run_rsem.input.format=="fasta" | |
143 --no-qualities | |
144 #if $run_rsem.input.fasta.matepair=="single": | |
145 $run_rsem.input.fasta.singlefasta | |
146 #elif $run_rsem.input.fasta.matepair=="paired": | |
147 --paired-end | |
148 $run_rsem.input.fasta.fasta1 | |
149 $run_rsem.input.fasta.fasta2 | |
150 #end if | |
151 #elif $run_rsem.input.format=="sam" | |
152 #if $run_rsem.input.matepair=="paired": | |
153 --paired-end | |
154 #end if | |
155 #if $run_rsem.input.rsem_sam._extension == 'sam': | |
156 --sam | |
157 #elif $run_rsem.input.rsem_sam._extension == 'bam': | |
158 --bam | |
159 #end if | |
160 $run_rsem.input.rsem_sam | |
161 #end if | |
162 ## RSEM reference | |
163 #if $run_rsem.reference.refSrc == 'history': | |
164 ${run_rsem.reference.rsem_ref.extra_files_path}/${run_rsem.reference.rsem_ref.metadata.reference_name} | |
165 #elif $run_rsem.reference.refSrc == 'self': | |
166 ${reference_file.files_path}/${job.reference_name} | |
167 #end if | |
168 ## sample_name: use a hard coded name so we can pull out galaxy outputs | |
169 rsem_output | |
170 ## direct output into logfile | |
171 > $log | |
172 #end if | |
173 ]]></command> | |
174 | |
175 <inputs> | |
176 <conditional name="job"> | |
177 <param name="select_job" type="select" label="rsem reference"> | |
178 <option value="index">Build rsem reference</option> | |
179 <option value="no-index" selected="true">rsem reference available from history</option> | |
180 </param> | |
181 <when value="index"> | |
182 <conditional name="self_reference"> | |
183 <param name="ref_type" type="select" label="Reference transcript source"> | |
184 <option value="transcripts">transcript fasta</option> | |
185 <option value="genomic">reference genome and gtf</option> | |
186 </param> | |
187 <when value="transcripts"> | |
188 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file" | |
189 help="The files should contain the sequences of transcripts."/> | |
190 </when> | |
191 <when value="genomic"> | |
192 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file" | |
193 help="The file should contain the sequence of an entire genome."/> | |
194 <param name="gtf" type="data" format="gtf" label="gtf" | |
195 help="extract transcript reference sequences using the gene annotations specified in this GTF" /> | |
196 </when> | |
197 </conditional> | |
198 <param name="transcript_to_gene_map" type="data" format="tabular" optional="true" label="Map of gene ids to transcript (isoform) ids" > | |
199 <help> | |
200 Each line should be of the form: gene_id transcript_id ( with the two fields separated by a tab character ) | |
201 The map can be obtained from the UCSC table browser | |
202 group: Genes and Gene Prediction Tracks | |
203 table: knownIsoforms | |
204 Without a map: | |
205 If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. | |
206 Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. | |
207 </help> | |
208 </param> | |
209 <param name="reference_name" type="text" value="rsem_ref_name" label="reference name"> | |
210 <help>A one word name for this RSEM reference containing only letters, digits, and underscore characters</help> | |
211 <validator type="regex" message="Use only letters, digits, and underscore characters">^\w+$</validator> | |
212 </param> | |
213 <conditional name="polya"> | |
214 <param name="polya_use" type="select" label="PolyA "> | |
215 <option value="add" selected="true">Add poly(A) tails to all transcripts</option> | |
216 <option value="subset">Exclude poly(A) tails from selected transcripts</option> | |
217 <option value="none">Do not add poly(A) tails to any transcripts</option> | |
218 </param> | |
219 <when value="add"> | |
220 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)"> | |
221 <validator type="in_range" message="must be positive " min="1"/> | |
222 </param> | |
223 </when> | |
224 <when value="subset"> | |
225 <param name="no_polya_subset" type="data" format="tabular" optional="true" label="List of transcript IDs (one per line) that should not have polyA tails added."/> | |
226 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)"> | |
227 <validator type="in_range" message="must be positive " min="1"/> | |
228 </param> | |
229 </when> | |
230 <when value="none"/> | |
231 </conditional> | |
232 <param name="ntog" type="boolean" truevalue="--no-ntog" falsevalue="" checked="false" label="Disable the conversion of 'N' characters to 'G' characters in the reference sequences" help="Bowtie uses the automatic N to G conversion to align against all positions in the reference."/> | |
233 </when> | |
234 <when value="no-index"> | |
235 </when> | |
236 </conditional> | |
237 | |
238 <conditional name="run_rsem"> | |
239 <param name="select" type="select" label="calculate expression with rsem"> | |
240 <option value="No">Just build rsem reference for later rsem profiling</option> | |
241 <option value="Yes" selected="true">profile expression with rsem</option> | |
242 </param> | |
243 <when value="Yes"> | |
244 <param name="sample" type="text" value="rsem_sample" label="Sample name" /> | |
245 <conditional name="reference"> | |
246 <param name="refSrc" type="select" label="RSEM Reference Source"> | |
247 <option value="history">From your history</option> | |
248 <option value="self">Prepare RSEM Reference with this tool</option> | |
249 </param> | |
250 <when value="history"> | |
251 <param name="rsem_ref" type="data" format="rsem_ref" label="RSEM reference" /> | |
252 </when> | |
253 <when value="self"> | |
254 </when> | |
255 </conditional> | |
256 <conditional name="input"> | |
257 <param name="format" type="select" label="RSEM Input file type"> | |
258 <option value="fastq">FASTQ</option> | |
259 <option value="fasta">FASTA</option> | |
260 <option value="sam">SAM/BAM</option> | |
261 </param> | |
262 <when value="fastq"> | |
263 <param name="fastq_select" size="15" type="select" label="FASTQ type" > | |
264 <option value="--phred33-quals" selected="true">phred33 qualities (default for sanger)</option> | |
265 <option value="--solexa-quals">solexa qualities</option> | |
266 <option value="--phred64-quals">phred64 qualities</option> | |
267 </param> | |
268 <conditional name="fastq"> | |
269 <param name="matepair" type="select" label="Library type"> | |
270 <option value="single">Single End Reads</option> | |
271 <option value="paired">Paired End Reads</option> | |
272 </param> | |
273 <when value="single"> | |
274 <param name="singlefastq" type="data" format="fastq,fastq.gz" label="FASTQ file" /> | |
275 </when> | |
276 <when value="paired"> | |
277 <param name="fastq1" type="data" format="fastq,fastq.gz" label="Read 1 fastq file" /> | |
278 <param name="fastq2" type="data" format="fastq,fastq.gz" label="Read 2 fastq file" /> | |
279 </when> | |
280 </conditional> | |
281 <expand macro="bowtie_options"/> | |
282 </when> | |
283 <when value="fasta"> | |
284 <conditional name="fasta"> | |
285 <param name="matepair" type="select" label="Library Type"> | |
286 <option value="single">Single End Reads</option> | |
287 <option value="paired">Paired End Reads</option> | |
288 </param> | |
289 <when value="single"> | |
290 <param name="singlefasta" type="data" format="fasta" label="fasta file" /> | |
291 </when> | |
292 <when value="paired"> | |
293 <param name="fasta1" type="data" format="fasta" label="Read 1 fasta file" /> | |
294 <param name="fasta2" type="data" format="fasta" label="Read 2 fasta file" /> | |
295 </when> | |
296 </conditional> | |
297 <expand macro="bowtie_options"/> | |
298 </when> | |
299 <when value="sam"> | |
300 <!-- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam --> | |
301 <param name="matepair" type="select" label="Library Type"> | |
302 <option value="single">Single End Reads</option> | |
303 <option value="paired">Paired End Reads</option> | |
304 </param> | |
305 <param name="rsem_sam" type="data" format="rsem_sam" label="RSEM formatted SAM file" /> | |
306 </when> | |
307 </conditional> | |
308 <expand macro="rsem_options"/> | |
309 <conditional name="rsem_outputs"> | |
310 <param name="result_bams" type="select" label="Create bam results files" | |
311 help="In addition to the transcript-coordinate-based BAM file output, also output a BAM file with the read alignments in genomic coordinates" > | |
312 <option value="none">No BAM results files</option> | |
313 <option value="default" selected="true">Transcript BAM results file</option> | |
314 <option value="both">Transcript and genome BAM results files</option> | |
315 </param> | |
316 <when value="none"/> | |
317 <when value="default"> | |
318 <expand macro="sampling_for_bam"/> | |
319 </when> | |
320 <when value="both"> | |
321 <expand macro="sampling_for_bam"/> | |
322 </when> | |
323 </conditional> | |
324 </when> | |
325 <when value="No"> | |
326 </when> | |
327 </conditional> | |
328 </inputs> | |
329 | |
330 <outputs> | |
331 <data format="rsem_ref" name="reference_file" label="RSEM ${job.reference_name} reference"> | |
332 <filter>job['select_job'] == "index"</filter> | |
333 </data> | |
334 <data format="tabular" name="gene_abundances" label="${run_rsem.sample}.gene_abundances" from_work_dir="rsem_output.genes.results"> | |
335 <filter>run_rsem['select'] == "Yes"</filter> | |
336 </data> | |
337 <data format="tabular" name="isoform_abundances" label="${run_rsem.sample}.isoform_abundances" from_work_dir="rsem_output.isoforms.results"> | |
338 <filter>run_rsem['select'] == "Yes"</filter> | |
339 </data> | |
340 <data format="bam" name="transcript_sorted_bam" label="${run_rsem.sample}.transcript.bam" from_work_dir="rsem_output.transcript.sorted.bam" > | |
341 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] != "none"</filter> | |
342 </data> | |
343 <data format="bam" name="genome_sorted_bam" label="${run_rsem.sample}.genome.bam" from_work_dir="rsem_output.genome.sorted.bam"> | |
344 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] == "both"</filter> | |
345 </data> | |
346 <data format="txt" name="log" label="${run_rsem.sample}.rsem_log"> | |
347 <filter>run_rsem['select'] == "Yes"</filter> | |
348 </data> | |
349 </outputs> | |
350 | |
351 <tests> | |
352 <test> | |
353 <param name="select_job" value="index"/> | |
354 <param name="ref_type" value="genomic"/> | |
355 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
356 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
357 <param name="reference_name" value="ref"/> | |
358 <param name="select" value="Yes"/> | |
359 <param name="sample" value="rsem_sample"/> | |
360 <param name="refSrc" value="self"/> | |
361 <param name="format" value="fastq"/> | |
362 <param name="matepair" value="single"/> | |
363 <param name="singlefastq" value="test.fq" ftype="fastqsanger"/> | |
364 <param name="result_bams" value="none"/> | |
365 <output name="reference_file"> | |
366 <assert_contents> | |
367 <has_text text="ref" /> | |
368 </assert_contents> | |
369 </output> | |
370 <output name="gene_abundances" value="gene_abundances.tab"/> | |
371 <output name="isoform_abundances" value="isoform_abundances.tab" /> | |
372 <output name="log"> | |
373 <assert_contents> | |
374 <has_text text="Expression Results are written" /> | |
375 </assert_contents> | |
376 </output> | |
377 </test> | |
378 <test> | |
379 <param name="select_job" value="index"/> | |
380 <param name="ref_type" value="genomic"/> | |
381 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
382 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
383 <param name="reference_name" value="ref"/> | |
384 <param name="select" value="Yes"/> | |
385 <param name="sample" value="rsem_sample"/> | |
386 <param name="refSrc" value="self"/> | |
387 <param name="format" value="fastq"/> | |
388 <param name="matepair" value="single"/> | |
389 <param name="singlefastq" value="test.fastq.gz" ftype="fastqsanger.gz"/> | |
390 <param name="result_bams" value="none"/> | |
391 <output name="reference_file"> | |
392 <assert_contents> | |
393 <has_text text="ref" /> | |
394 </assert_contents> | |
395 </output> | |
396 <output name="gene_abundances" value="gene_abundances.tab"/> | |
397 <output name="isoform_abundances" value="isoform_abundances.tab" /> | |
398 <output name="log"> | |
399 <assert_contents> | |
400 <has_text text="Expression Results are written" /> | |
401 </assert_contents> | |
402 </output> | |
403 </test> | |
404 <test> | |
405 <param name="select_job" value="index"/> | |
406 <param name="ref_type" value="genomic"/> | |
407 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
408 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
409 <param name="reference_name" value="ref"/> | |
410 <param name="select" value="No"/> | |
411 <output name="reference_file"> | |
412 <assert_contents> | |
413 <has_text text="ref" /> | |
414 </assert_contents> | |
415 </output> | |
416 </test> | |
417 </tests> | |
418 | |
419 <help> | |
420 .. class:: infomark | |
421 | |
422 RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/ | |
423 | |
424 NAME | |
425 rsem-prepare-reference | |
426 | |
427 SYNOPSIS | |
428 rsem-prepare-reference [options] reference_fasta_file(s) reference_name | |
429 | |
430 DESCRIPTION | |
431 The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie indices using default parameters. | |
432 This program is used in conjunction with the 'rsem-calculate-expression' program. | |
433 | |
434 INPUTS | |
435 A fasta file of transcripts | |
436 or | |
437 A genome sequence fasta file and a GTF gene annotation file. (When using UCSC data, include the related knownIsoforms.txt) | |
438 | |
439 --- | |
440 | |
441 NAME | |
442 rsem-calculate-expression - Estimate gene and isoform expression from | |
443 RNA-Seq data. | |
444 | |
445 SYNOPSIS | |
446 rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name | |
447 rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name | |
448 rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name | |
449 | |
450 ARGUMENTS | |
451 upstream_read_files(s) | |
452 Comma-separated list of files containing single-end reads or | |
453 upstream reads for paired-end data. By default, these files are | |
454 assumed to be in FASTQ format. If the --no-qualities option is | |
455 specified, then FASTA format is expected. | |
456 | |
457 downstream_read_file(s) | |
458 Comma-separated list of files containing downstream reads which are | |
459 paired with the upstream reads. By default, these files are assumed | |
460 to be in FASTQ format. If the --no-qualities option is specified, | |
461 then FASTA format is expected. | |
462 | |
463 input | |
464 SAM/BAM/CRAM formatted input file. If "-" is specified for the | |
465 filename, the input is instead assumed to come from standard input. | |
466 RSEM requires all alignments of the same read group together. For | |
467 paired-end reads, RSEM also requires the two mates of any alignment | |
468 be adjacent. In addition, RSEM does not allow the SEQ and QUAL | |
469 fields to be empty. See Description section for how to make input | |
470 file obey RSEM's requirements. | |
471 | |
472 reference_name | |
473 The name of the reference used. The user must have run | |
474 'rsem-prepare-reference' with this reference_name before running | |
475 this program. | |
476 | |
477 sample_name | |
478 The name of the sample analyzed. All output files are prefixed by | |
479 this name (e.g., sample_name.genes.results) | |
480 | |
481 BASIC OPTIONS | |
482 --paired-end | |
483 Input reads are paired-end reads. (Default: off) | |
484 | |
485 --no-qualities | |
486 Input reads do not contain quality scores. (Default: off) | |
487 | |
488 --strandedness <none|forward|reverse> | |
489 This option defines the strandedness of the RNA-Seq reads. It | |
490 recognizes three values: 'none', 'forward', and 'reverse'. 'none' | |
491 refers to non-strand-specific protocols. 'forward' means all | |
492 (upstream) reads are derived from the forward strand. 'reverse' | |
493 means all (upstream) reads are derived from the reverse strand. If | |
494 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2 | |
495 option will also be enabled to avoid aligning reads to the opposite | |
496 strand. For Illumina TruSeq Stranded protocols, please use | |
497 'reverse'. (Default: 'none') | |
498 | |
499 -p/--num-threads <int> | |
500 Number of threads to use. Both Bowtie/Bowtie2, expression estimation | |
501 and 'samtools sort' will use this many threads. (Default: 1) | |
502 | |
503 --alignments | |
504 Input file contains alignments in SAM/BAM/CRAM format. The exact | |
505 file format will be determined automatically. (Default: off) | |
506 | |
507 --fai <file> | |
508 If the header section of input alignment file does not contain | |
509 reference sequence information, this option should be turned on. | |
510 <file> is a FAI format file containing each reference sequence's | |
511 name and length. Please refer to the SAM official website for the | |
512 details of FAI format. (Default: off) | |
513 | |
514 --bowtie2 | |
515 Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM | |
516 does not handle indel, local and discordant alignments, the Bowtie2 | |
517 parameters are set in a way to avoid those alignments. In | |
518 particular, we use options '--sensitive --dpad 0 --gbar 99999999 | |
519 --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter | |
520 of '--score-min', '-0.1', is the negative of maximum mismatch rate. | |
521 This rate can be set by option '--bowtie2-mismatch-rate'. If reads | |
522 are paired-end, we additionally use options '--no-mixed' and | |
523 '--no-discordant'. (Default: off) | |
524 | |
525 --star | |
526 Use STAR to align reads. Alignment parameters are from ENCODE3's | |
527 STAR-RSEM pipeline. To save computational time and memory resources, | |
528 STAR's Output BAM file is unsorted. It is stored in RSEM's temporary | |
529 directory with name as 'sample_name.bam'. Each STAR job will have | |
530 its own private copy of the genome in memory. (Default: off) | |
531 | |
532 --append-names | |
533 If gene_name/transcript_name is available, append it to the end of | |
534 gene_id/transcript_id (separated by '_') in files | |
535 'sample_name.isoforms.results' and 'sample_name.genes.results'. | |
536 (Default: off) | |
537 | |
538 --seed <uint32> | |
539 Set the seed for the random number generators used in calculating | |
540 posterior mean estimates and credibility intervals. The seed must be | |
541 a non-negative 32 bit integer. (Default: off) | |
542 | |
543 --single-cell-prior | |
544 By default, RSEM uses Dirichlet(1) as the prior to calculate | |
545 posterior mean estimates and credibility intervals. However, much | |
546 less genes are expressed in single cell RNA-Seq data. Thus, if you | |
547 want to compute posterior mean estimates and/or credibility | |
548 intervals and you have single-cell RNA-Seq data, you are recommended | |
549 to turn on this option. Then RSEM will use Dirichlet(0.1) as the | |
550 prior which encourage the sparsity of the expression levels. | |
551 (Default: off) | |
552 | |
553 --calc-pme | |
554 Run RSEM's collapsed Gibbs sampler to calculate posterior mean | |
555 estimates. (Default: off) | |
556 | |
557 --calc-ci | |
558 Calculate 95% credibility intervals and posterior mean estimates. | |
559 The credibility level can be changed by setting | |
560 '--ci-credibility-level'. (Default: off) | |
561 | |
562 -q/--quiet | |
563 Suppress the output of logging information. (Default: off) | |
564 | |
565 -h/--help | |
566 Show help information. | |
567 | |
568 --version | |
569 Show version information. | |
570 | |
571 OUTPUT OPTIONS | |
572 --sort-bam-by-read-name | |
573 Sort BAM file aligned under transcript coordinate by read name. | |
574 Setting this option on will produce deterministic maximum likelihood | |
575 estimations from independent runs. Note that sorting will take long | |
576 time and lots of memory. (Default: off) | |
577 | |
578 --no-bam-output | |
579 Do not output any BAM file. (Default: off) | |
580 | |
581 --sampling-for-bam | |
582 When RSEM generates a BAM file, instead of outputting all alignments | |
583 a read has with their posterior probabilities, one alignment is | |
584 sampled according to the posterior probabilities. The sampling | |
585 procedure includes the alignment to the "noise" transcript, which | |
586 does not appear in the BAM file. Only the sampled alignment has a | |
587 weight of 1. All other alignments have weight 0. If the "noise" | |
588 transcript is sampled, all alignments appeared in the BAM file | |
589 should have weight 0. (Default: off) | |
590 | |
591 --output-genome-bam | |
592 Generate a BAM file, 'sample_name.genome.bam', with alignments | |
593 mapped to genomic coordinates and annotated with their posterior | |
594 probabilities. In addition, RSEM will call samtools (included in | |
595 RSEM package) to sort and index the bam file. | |
596 'sample_name.genome.sorted.bam' and | |
597 'sample_name.genome.sorted.bam.bai' will be generated. (Default: | |
598 off) | |
599 | |
600 --sort-bam-by-coordinate | |
601 Sort RSEM generated transcript and genome BAM files by coordinates | |
602 and build associated indices. (Default: off) | |
603 | |
604 --sort-bam-memory-per-thread <string> | |
605 Set the maximum memory per thread that can be used by 'samtools | |
606 sort'. <string> represents the memory and accepts suffixes 'K/M/G'. | |
607 RSEM will pass <string> to the '-m' option of 'samtools sort'. Note | |
608 that the default used here is different from the default used by | |
609 samtools. (Default: 1G) | |
610 | |
611 ALIGNER OPTIONS | |
612 --seed-length <int> | |
613 Seed length used by the read aligner. Providing the correct value is | |
614 important for RSEM. If RSEM runs Bowtie, it uses this value for | |
615 Bowtie's seed length parameter. Any read with its or at least one of | |
616 its mates' (for paired-end reads) length less than this value will | |
617 be ignored. If poly(A) tails are not added to the references, the | |
618 minimum allowed value is 5, otherwise, the minimum allowed value is | |
619 25. Note that this script will only check if the value >= 5 and give | |
620 a warning message if the value < 25 but >= 5. (Default: 25) | |
621 | |
622 --phred33-quals | |
623 Input quality scores are encoded as Phred+33. (Default: on) | |
624 | |
625 --phred64-quals | |
626 Input quality scores are encoded as Phred+64 (default for GA | |
627 Pipeline ver. >= 1.3). (Default: off) | |
628 | |
629 --solexa-quals | |
630 Input quality scores are solexa encoded (from GA Pipeline ver. < | |
631 1.3). (Default: off) | |
632 | |
633 --bowtie-path <path> | |
634 The path to the Bowtie executables. (Default: the path to the Bowtie | |
635 executables is assumed to be in the user's PATH environment | |
636 variable) | |
637 | |
638 --bowtie-n <int> | |
639 (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, | |
640 Default: 2) | |
641 | |
642 --bowtie-e <int> | |
643 (Bowtie parameter) max sum of mismatch quality scores across the | |
644 alignment. (Default: 99999999) | |
645 | |
646 --bowtie-m <int> | |
647 (Bowtie parameter) suppress all alignments for a read if > <int> | |
648 valid alignments exist. (Default: 200) | |
649 | |
650 --bowtie-chunkmbs <int> | |
651 (Bowtie parameter) memory allocated for best first alignment | |
652 calculation (Default: 0 - use Bowtie's default) | |
653 | |
654 --bowtie2-path <path> | |
655 (Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default: | |
656 the path to the Bowtie 2 executables is assumed to be in the user's | |
657 PATH environment variable) | |
658 | |
659 --bowtie2-mismatch-rate <double> | |
660 (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: | |
661 0.1) | |
662 | |
663 --bowtie2-k <int> | |
664 (Bowtie 2 parameter) Find up to <int> alignments per read. (Default: | |
665 200) | |
666 | |
667 --bowtie2-sensitivity-level <string> | |
668 (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end | |
669 mode. This option controls how hard Bowtie 2 tries to find | |
670 alignments. <string> must be one of "very_fast", "fast", "sensitive" | |
671 and "very_sensitive". The four candidates correspond to Bowtie 2's | |
672 "--very-fast", "--fast", "--sensitive" and "--very-sensitive" | |
673 options. (Default: "sensitive" - use Bowtie 2's default) | |
674 | |
675 --star-path <path> | |
676 The path to STAR's executable. (Default: the path to STAR executable | |
677 is assumed to be in user's PATH environment variable) | |
678 | |
679 --star-gzipped-read-file | |
680 (STAR parameter) Input read file(s) is compressed by gzip. (Default: | |
681 off) | |
682 | |
683 --star-bzipped-read-file | |
684 (STAR parameter) Input read file(s) is compressed by bzip2. | |
685 (Default: off) | |
686 | |
687 --star-output-genome-bam | |
688 (STAR parameter) Save the BAM file from STAR alignment under genomic | |
689 coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted | |
690 by genomic coordinate. In this file, according to STAR's manual, | |
691 'paired ends of an alignment are always adjacent, and multiple | |
692 alignments of a read are adjacent as well'. (Default: off) | |
693 | |
694 ADVANCED OPTIONS | |
695 --tag <string> | |
696 The name of the optional field used in the SAM input for identifying | |
697 a read with too many valid alignments. The field should have the | |
698 format <tagName>:i:<value>, where a <value> bigger than 0 indicates | |
699 a read with too many alignments. (Default: "") | |
700 | |
701 --fragment-length-min <int> | |
702 Minimum read/insert length allowed. This is also the value for the | |
703 Bowtie/Bowtie 2 -I option. (Default: 1) | |
704 | |
705 --fragment-length-max <int> | |
706 Maximum read/insert length allowed. This is also the value for the | |
707 Bowtie/Bowtie 2 -X option. (Default: 1000) | |
708 | |
709 --fragment-length-mean <double> | |
710 (single-end data only) The mean of the fragment length distribution, | |
711 which is assumed to be a Gaussian. (Default: -1, which disables use | |
712 of the fragment length distribution) | |
713 | |
714 --fragment-length-sd <double> | |
715 (single-end data only) The standard deviation of the fragment length | |
716 distribution, which is assumed to be a Gaussian. (Default: 0, which | |
717 assumes that all fragments are of the same length, given by the | |
718 rounded value of --fragment-length-mean) | |
719 | |
720 --estimate-rspd | |
721 Set this option if you want to estimate the read start position | |
722 distribution (RSPD) from data. Otherwise, RSEM will use a uniform | |
723 RSPD. (Default: off) | |
724 | |
725 --num-rspd-bins <int> | |
726 Number of bins in the RSPD. Only relevant when '--estimate-rspd' is | |
727 specified. Use of the default setting is recommended. (Default: 20) | |
728 | |
729 --gibbs-burnin <int> | |
730 The number of burn-in rounds for RSEM's Gibbs sampler. Each round | |
731 passes over the entire data set once. If RSEM can use multiple | |
732 threads, multiple Gibbs samplers will start at the same time and all | |
733 samplers share the same burn-in number. (Default: 200) | |
734 | |
735 --gibbs-number-of-samples <int> | |
736 The total number of count vectors RSEM will collect from its Gibbs | |
737 samplers. (Default: 1000) | |
738 | |
739 --gibbs-sampling-gap <int> | |
740 The number of rounds between two successive count vectors RSEM | |
741 collects. If the count vector after round N is collected, the count | |
742 vector after round N + <int> will also be collected. (Default: 1) | |
743 | |
744 --ci-credibility-level <double> | |
745 The credibility level for credibility intervals. (Default: 0.95) | |
746 | |
747 --ci-memory <int> | |
748 Maximum size (in memory, MB) of the auxiliary buffer used for | |
749 computing credibility intervals (CI). (Default: 1024) | |
750 | |
751 --ci-number-of-samples-per-count-vector <int> | |
752 The number of read generating probability vectors sampled per | |
753 sampled count vector. The credibility intervals are calculated by | |
754 first sampling P(C | D) and then sampling P(Theta | C) for each | |
755 sampled count vector. This option controls how many Theta vectors | |
756 are sampled per sampled count vector. (Default: 50) | |
757 | |
758 --keep-intermediate-files | |
759 Keep temporary files generated by RSEM. RSEM creates a temporary | |
760 directory, 'sample_name.temp', into which it puts all intermediate | |
761 output files. If this directory already exists, RSEM overwrites all | |
762 files generated by previous RSEM runs inside of it. By default, | |
763 after RSEM finishes, the temporary directory is deleted. Set this | |
764 option to prevent the deletion of this directory and the | |
765 intermediate files inside of it. (Default: off) | |
766 | |
767 --temporary-folder <string> | |
768 Set where to put the temporary files generated by RSEM. If the | |
769 folder specified does not exist, RSEM will try to create it. | |
770 (Default: sample_name.temp) | |
771 | |
772 --time | |
773 Output time consumed by each step of RSEM to 'sample_name.time'. | |
774 (Default: off) | |
775 | |
776 PRIOR-ENHANCED RSEM OPTIONS | |
777 --run-pRSEM | |
778 Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. | |
779 isoform's initial pseudo-count for RSEM's Gibbs sampling, will be | |
780 learned from input RNA-seq data and an external data set. When pRSEM | |
781 needs and only needs ChIP-seq peak information to partition isoforms | |
782 (e.g. in pRSEM's default partition model), either ChIP-seq peak file | |
783 (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for | |
784 target and input and the path for Bowtie executables are required | |
785 (with the '--chipseq-target-read-files <string>', | |
786 '--chipseq-control-read-files <string>', and '--bowtie-path <path>' | |
787 options), otherwise, ChIP-seq FASTQ files for target and control and | |
788 the path to Bowtie executables are required. (Default: off) | |
789 | |
790 --chipseq-peak-file <string> | |
791 Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. | |
792 BED6+4, format. This file is used when running prior-enhanced RSEM | |
793 in the default two-partition model. It partitions isoforms by | |
794 whether they have ChIP-seq overlapping with their transcription | |
795 start site region or not. Each partition will have its own prior | |
796 parameter learned from a training set. This file can be either | |
797 gzipped or ungzipped. (Default: "") | |
798 | |
799 --chipseq-target-read-files <string> | |
800 Comma-separated full path of FASTQ read file(s) for ChIP-seq target. | |
801 This option is used when running prior-enhanced RSEM. It provides | |
802 information to calculate ChIP-seq peaks and signals. The file(s) can | |
803 be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The | |
804 options '--bowtie-path <path>' and '--chipseq-control-read-files | |
805 <string>' must be defined when this option is specified. (Default: | |
806 "") | |
807 | |
808 --chipseq-control-read-files <string> | |
809 Comma-separated full path of FASTQ read file(s) for ChIP-seq control. | |
810 This option is used when running prior-enhanced RSEM. It provides | |
811 information to call ChIP-seq peaks. The file(s) can be either | |
812 ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options | |
813 '--bowtie-path <path>' and '--chipseq-target-read-files <string>' | |
814 must be defined when this option is specified. (Default: "") | |
815 | |
816 --chipseq-read-files-multi-targets <string> | |
817 Comma-separated full path of FASTQ read files for multiple ChIP-seq | |
818 targets. This option is used when running prior-enhanced RSEM, where | |
819 prior is learned from multiple complementary data sets. It provides | |
820 information to calculate ChIP-seq signals. All files can be either | |
821 ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this | |
822 option is specified, the option '--bowtie-path <path>' must be | |
823 defined and the option '--partition-model <string>' will be set to | |
824 'cmb_lgt' automatically. (Default: "") | |
825 | |
826 --chipseq-bed-files-multi-targets <string> | |
827 Comma-separated full path of BED files for multiple ChIP-seq | |
828 targets. This option is used when running prior-enhanced RSEM, where | |
829 prior is learned from multiple complementary data sets. It provides | |
830 information of ChIP-seq signals and must have at least the first six | |
831 BED columns. All files can be either ungzipped or gzipped with a | |
832 suffix '.gz' or '.gzip'. When this option is specified, the option | |
833 '--partition-model <string>' will be set to 'cmb_lgt' automatically. | |
834 (Default: "") | |
835 | |
836 --cap-stacked-chipseq-reads | |
837 Keep a maximum number of ChIP-seq reads that aligned to the same | |
838 genomic interval. This option is used when running prior-enhanced | |
839 RSEM, where prior is learned from multiple complementary data sets. | |
840 This option is only in use when either | |
841 '--chipseq-read-files-multi-targets <string>' or | |
842 '--chipseq-bed-files-multi-targets <string>' is specified. (Default: | |
843 off) | |
844 | |
845 --n-max-stacked-chipseq-reads <int> | |
846 The maximum number of stacked ChIP-seq reads to keep. This option is | |
847 used when running prior-enhanced RSEM, where prior is learned from | |
848 multiple complementary data sets. This option is only in use when | |
849 the option '--cap-stacked-chipseq-reads' is set. (Default: 5) | |
850 | |
851 --partition-model <string> | |
852 A keyword to specify the partition model used by prior-enhanced | |
853 RSEM. It must be one of the following keywords: | |
854 | |
855 - pk | |
856 Partitioned by whether an isoform has a ChIP-seq peak overlapping | |
857 with its transcription start site (TSS) region. The TSS region is | |
858 defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer to this | |
859 type of peak as 'TSS peak' when explaining other keywords. | |
860 | |
861 - pk_lgtnopk | |
862 First partitioned by TSS peak. Then, for isoforms in the 'no TSS | |
863 peak' set, a logistic model is employed to further classify them | |
864 into two partitions. | |
865 | |
866 - lm3, lm4, lm5, or lm6 | |
867 Based on their ChIP-seq signals, isoforms are classified into 3, | |
868 4, 5, or 6 partitions by a linear regression model. | |
869 | |
870 - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk | |
871 First partitioned by TSS peak. Then, for isoforms in the 'with TSS | |
872 peak' set, a linear regression model is employed to further | |
873 classify them into 2, 3, 4, or 5 partitions. | |
874 | |
875 - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk | |
876 First partitioned by TSS peak. Then, for isoforms in the 'no TSS | |
877 peak' set, a linear regression model is employed to further | |
878 classify them into 2, 3, 4, or 5 partitions. | |
879 | |
880 - cmb_lgt | |
881 Using a logistic regression to combine TSS signals from multiple | |
882 complementary data sets and partition training set isoform into | |
883 'expressed' and 'not expressed'. This partition model is only in | |
884 use when either '--chipseq-read-files-multi-targets <string>' or | |
885 '--chipseq-bed-files-multi-targets <string>' is specified. | |
886 | |
887 Parameters for all the above models are learned from a training set. | |
888 For detailed explanations, please see prior-enhanced RSEM's paper. | |
889 (Default: 'pk') | |
890 | |
891 DEPRECATED OPTIONS | |
892 The options in this section are deprecated. They are here only for | |
893 compatibility reasons and may be removed in future releases. | |
894 | |
895 --sam | |
896 Inputs are alignments in SAM format. (Default: off) | |
897 | |
898 --bam | |
899 Inputs are alignments in BAM format. (Default: off) | |
900 | |
901 --strand-specific | |
902 Equivalent to '--strandedness forward'. (Default: off) | |
903 | |
904 --forward-prob <double> | |
905 Probability of generating a read from the forward strand of a | |
906 transcript. Set to 1 for a strand-specific protocol where all | |
907 (upstream) reads are derived from the forward strand, 0 for a | |
908 strand-specific protocol where all (upstream) reads are derived from | |
909 the reverse strand, or 0.5 for a non-strand-specific protocol. | |
910 (Default: off) | |
911 | |
912 DESCRIPTION | |
913 In its default mode, this program aligns input reads against a reference | |
914 transcriptome with Bowtie and calculates expression values using the | |
915 alignments. RSEM assumes the data are single-end reads with quality | |
916 scores, unless the '--paired-end' or '--no-qualities' options are | |
917 specified. Alternatively, users can use STAR to align reads using the | |
918 '--star' option. RSEM has provided options in 'rsem-prepare-reference' | |
919 to prepare STAR's genome indices. Users may use an alternative aligner | |
920 by specifying '--alignments', and providing an alignment file in | |
921 SAM/BAM/CRAM format. However, users should make sure that they align | |
922 against the indices generated by 'rsem-prepare-reference' and the | |
923 alignment file satisfies the requirements mentioned in ARGUMENTS | |
924 section. | |
925 | |
926 One simple way to make the alignment file satisfy RSEM's requirements | |
927 is to use the 'convert-sam-for-rsem' script. This script accepts | |
928 SAM/BAM/CRAM files as input and outputs a BAM file. For example, type | |
929 the following command to convert a SAM file, 'input.sam', to a | |
930 ready-for-use BAM file, 'input_for_rsem.bam': | |
931 | |
932 convert-sam-for-rsem input.sam input_for_rsem | |
933 | |
934 For details, please refer to 'convert-sam-for-rsem's documentation page. | |
935 | |
936 NOTES | |
937 1. Users must run 'rsem-prepare-reference' with the appropriate | |
938 reference before using this program. | |
939 | |
940 2. For single-end data, it is strongly recommended that the user provide | |
941 the fragment length distribution parameters (--fragment-length-mean and | |
942 --fragment-length-sd). For paired-end data, RSEM will automatically | |
943 learn a fragment length distribution from the data. | |
944 | |
945 3. Some aligner parameters have default values different from their | |
946 original settings. | |
947 | |
948 4. With the '--calc-pme' option, posterior mean estimates will be | |
949 calculated in addition to maximum likelihood estimates. | |
950 | |
951 5. With the '--calc-ci' option, 95% credibility intervals and posterior | |
952 mean estimates will be calculated in addition to maximum likelihood | |
953 estimates. | |
954 | |
955 6. The temporary directory and all intermediate files will be removed | |
956 when RSEM finishes unless '--keep-intermediate-files' is specified. | |
957 | |
958 With the '--run-pRSEM' option and associated options (see section | |
959 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM | |
960 will be running. Prior parameters will be learned from supplied external | |
961 data set(s) and assigned as initial pseudo-counts for isoforms in the | |
962 corresponding partition for Gibbs sampling. | |
963 | |
964 OUTPUT | |
965 sample_name.isoforms.results | |
966 File containing isoform level expression estimates. The first line | |
967 contains column names separated by the tab character. The format of | |
968 each line in the rest of this file is: | |
969 | |
970 transcript_id gene_id length effective_length expected_count TPM | |
971 FPKM IsoPct [posterior_mean_count | |
972 posterior_standard_deviation_of_count pme_TPM pme_FPKM | |
973 IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound | |
974 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound | |
975 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] | |
976 | |
977 Fields are separated by the tab character. Fields within "[]" are | |
978 optional. They will not be presented if neither '--calc-pme' nor | |
979 '--calc-ci' is set. | |
980 | |
981 'transcript_id' is the transcript name of this transcript. 'gene_id' | |
982 is the gene name of the gene which this transcript belongs to | |
983 (denote this gene as its parent gene). If no gene information is | |
984 provided, 'gene_id' and 'transcript_id' are the same. | |
985 | |
986 'length' is this transcript's sequence length (poly(A) tail is not | |
987 counted). 'effective_length' counts only the positions that can | |
988 generate a valid fragment. If no poly(A) tail is added, | |
989 'effective_length' is equal to transcript length - mean fragment | |
990 length + 1. If one transcript's effective length is less than 1, | |
991 both this transcript's effective length and abundance estimates are | |
992 set to 0. | |
993 | |
994 'expected_count' is the sum of the posterior probability of each | |
995 read coming from this transcript over all reads. Because 1) each read | |
996 aligning to this transcript has a probability of being generated | |
997 from background noise; 2) RSEM may filter some alignable low quality | |
998 reads, the sum of expected counts for all transcripts is generally | |
999 less than the total number of reads aligned. | |
1000 | |
1001 'TPM' stands for Transcripts Per Million. It is a relative measure | |
1002 of transcript abundance. The sum of all transcripts' TPM is 1 | |
1003 million. 'FPKM' stands for Fragments Per Kilobase of transcript per | |
1004 Million mapped reads. It is another relative measure of transcript | |
1005 abundance. If we define l_bar to be the mean transcript length in a | |
1006 sample, which can be calculated as | |
1007 | |
1008 l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through | |
1009 every transcript), | |
1010 | |
1011 the following equation holds: | |
1012 | |
1013 FPKM_i = 10^3 / l_bar * TPM_i. | |
1014 | |
1015 We can see that the sum of FPKM is not a constant across samples. | |
1016 | |
1017 'IsoPct' stands for isoform percentage. It is the percentage of this | |
1018 transcript's abundance over its parent gene's abundance. If its | |
1019 parent gene has only one isoform or the gene information is not | |
1020 provided, this field will be set to 100. | |
1021 | |
1022 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean | |
1023 estimates calculated by RSEM's Gibbs sampler. | |
1024 'posterior_standard_deviation_of_count' is the posterior standard | |
1025 deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage | |
1026 calculated from 'pme_TPM' values. | |
1027 | |
1028 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' | |
1029 and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% | |
1030 credibility intervals for TPM and FPKM values. The bounds are | |
1031 inclusive (i.e. [l, u]). | |
1032 | |
1033 'TPM_coefficient_of_quartile_variation' and | |
1034 'FPKM_coefficient_of_quartile_variation' are coefficients of | |
1035 quartile variation (CQV) for TPM and FPKM values. CQV is a robust | |
1036 way of measuring the ratio between the standard deviation and the | |
1037 mean. It is defined as | |
1038 | |
1039 CQV := (Q3 - Q1) / (Q3 + Q1), | |
1040 | |
1041 where Q1 and Q3 are the first and third quartiles. | |
1042 | |
1043 sample_name.genes.results | |
1044 File containing gene level expression estimates. The first line | |
1045 contains column names separated by the tab character. The format of | |
1046 each line in the rest of this file is: | |
1047 | |
1048 gene_id transcript_id(s) length effective_length expected_count TPM | |
1049 FPKM [posterior_mean_count posterior_standard_deviation_of_count | |
1050 pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound | |
1051 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound | |
1052 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] | |
1053 | |
1054 Fields are separated by the tab character. Fields within "[]" are | |
1055 optional. They will not be presented if neither '--calc-pme' nor | |
1056 '--calc-ci' is set. | |
1057 | |
1058 'transcript_id(s)' is a comma-separated list of transcript_ids | |
1059 belonging to this gene. If no gene information is provided, | |
1060 'gene_id' and 'transcript_id(s)' are identical (the | |
1061 'transcript_id'). | |
1062 | |
1063 A gene's 'length' and 'effective_length' are defined as the weighted | |
1064 average of its transcripts' lengths and effective lengths (weighted | |
1065 by 'IsoPct'). A gene's abundance estimates are just the sum of its | |
1066 transcripts' abundance estimates. | |
1067 | |
1068 sample_name.alleles.results | |
1069 Only generated when the RSEM references are built with | |
1070 allele-specific transcripts. | |
1071 | |
1072 This file contains allele level expression estimates for | |
1073 allele-specific expression calculation. The first line contains | |
1074 column names separated by the tab character. The format of each line | |
1075 in the rest of this file is: | |
1076 | |
1077 allele_id transcript_id gene_id length effective_length | |
1078 expected_count TPM FPKM AlleleIsoPct AlleleGenePct | |
1079 [posterior_mean_count posterior_standard_deviation_of_count pme_TPM | |
1080 pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM | |
1081 TPM_ci_lower_bound TPM_ci_upper_bound | |
1082 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound | |
1083 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] | |
1084 | |
1085 Fields are separated by the tab character. Fields within "[]" are | |
1086 optional. They will not be presented if neither '--calc-pme' nor | |
1087 '--calc-ci' is set. | |
1088 | |
1089 'allele_id' is the allele-specific name of this allele-specific | |
1090 transcript. | |
1091 | |
1092 'AlleleIsoPct' stands for allele-specific percentage on isoform | |
1093 level. It is the percentage of this allele-specific transcript's | |
1094 abundance over its parent transcript's abundance. If its parent | |
1095 transcript has only one allele variant form, this field will be set | |
1096 to 100. | |
1097 | |
1098 'AlleleGenePct' stands for allele-specific percentage on gene level. | |
1099 It is the percentage of this allele-specific transcript's abundance | |
1100 over its parent gene's abundance. | |
1101 | |
1102 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have | |
1103 similar meanings. They are calculated based on posterior mean | |
1104 estimates. | |
1105 | |
1106 Please note that if this file is present, the fields 'length' and | |
1107 'effective_length' in 'sample_name.isoforms.results' should be | |
1108 interpreted similarly as the corresponding definitions in | |
1109 'sample_name.genes.results'. | |
1110 | |
1111 sample_name.transcript.bam | |
1112 Only generated when --no-bam-output is not specified. | |
1113 | |
1114 'sample_name.transcript.bam' is a BAM-formatted file of read | |
1115 alignments in transcript coordinates. The MAPQ field of each | |
1116 alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), | |
1117 where w is the posterior probability of that alignment being the | |
1118 true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, | |
1119 where value is a single precision floating number representing the | |
1120 posterior probability. Because this file contains all alignment | |
1121 lines produced by bowtie or user-specified aligners, it can also be | |
1122 used as a replacement of the aligner generated BAM/SAM file. | |
1123 | |
1124 sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai | |
1125 Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified. | |
1126 | |
1127 'sample_name.transcript.sorted.bam' and | |
1128 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and | |
1129 indices generated by samtools (included in RSEM package). | |
1130 | |
1131 sample_name.genome.bam | |
1132 Only generated when --no-bam-output is not specified and | |
1133 --output-genome-bam is specified. | |
1134 | |
1135 'sample_name.genome.bam' is a BAM-formatted file of read alignments | |
1136 in genomic coordinates. Alignments of reads that have identical | |
1137 genomic coordinates (i.e., alignments to different isoforms that | |
1138 share the same genomic region) are collapsed into one alignment. The | |
1139 MAPQ field of each alignment is set to min(100, floor(-10 * | |
1140 log10(1.0 - w) + 0.5)), where w is the posterior probability of that | |
1141 alignment being the true mapping of a read. In addition, RSEM pads a | |
1142 new tag ZW:f:value, where value is a single precision floating | |
1143 number representing the posterior probability. If an alignment is | |
1144 spliced, a XS:A:value tag is also added, where value is either '+' | |
1145 or '-' indicating the strand of the transcript it aligns to. | |
1146 | |
1147 sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai | |
1148 Only generated when --no-bam-output is not specified, and | |
1149 --sort-bam-by-coordinate and --output-genome-bam are specified. | |
1150 | |
1151 'sample_name.genome.sorted.bam' and | |
1152 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and | |
1153 indices generated by samtools (included in RSEM package). | |
1154 | |
1155 sample_name.time | |
1156 Only generated when --time is specified. | |
1157 | |
1158 It contains time (in seconds) consumed by aligning reads, estimating | |
1159 expression levels and calculating credibility intervals. | |
1160 | |
1161 sample_name.stat | |
1162 This is a folder instead of a file. All model related statistics are | |
1163 stored in this folder. 'rsem-plot-model' can generate plots | |
1164 using this folder. | |
1165 | |
1166 'sample_name.stat/sample_name.cnt' contains alignment statistics. | |
1167 The format and meanings of each field are described in | |
1168 'cnt_file_description.txt' under RSEM directory. | |
1169 | |
1170 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters | |
1171 learned from the data. The format and meanings of each field of this | |
1172 file are described in 'model_file_description.txt' under RSEM | |
1173 directory. | |
1174 | |
1175 The following four output files will be generated only by | |
1176 prior-enhanced RSEM | |
1177 | |
1178 - 'sample_name.stat/sample_name_prsem.all_tr_features' | |
1179 It stores isoform features for deriving and assigning pRSEM prior. | |
1180 The first line is a header and the rest is one isoform per line. | |
1181 The description for each column is: | |
1182 | |
1183 * trid: transcript ID from input annotation | |
1184 | |
1185 * geneid: gene ID from input annotation | |
1186 | |
1187 * chrom: isoform's chromosome name | |
1188 | |
1189 * strand: isoform's strand name | |
1190 | |
1191 * start: isoform's end with the lowest genomic loci | |
1192 | |
1193 * end: isoform's end with the highest genomic loci | |
1194 | |
1195 * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where | |
1196 TSS is isoform's transcription start site, i.e. 5'-end | |
1197 | |
1198 * body_mpp: average mappability of (TSS+500bp, TES-500bp), where | |
1199 TES is isoform's transcription end site, i.e. 3'-end | |
1200 | |
1201 * tes_mpp: average mappability of [TES-500bp, TES+500bp] | |
1202 | |
1203 * pme_count: isoform's fragment or read count from RSEM's | |
1204 posterior mean estimates | |
1205 | |
1206 * tss: isoform's TSS loci | |
1207 | |
1208 * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region | |
1209 overlaps with a RNA Pol II peak; 0 otherwise | |
1210 | |
1211 * is_training: equal to 1 if isoform is in the training set where | |
1212 Pol II prior is learned; 0 otherwise | |
1213 | |
1214 - 'sample_name.stat/sample_name_prsem.all_tr_prior' | |
1215 It stores prior parameters for every isoform. This file does not | |
1216 have a header. Each line contains a prior parameter and an | |
1217 isoform's transcript ID delimited by " # ". | |
1218 | |
1219 - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results' | |
1220 RSEM's posterior mean estimates on the isoform level with an | |
1221 initial pseudo-count of one for every isoform. It is in the same | |
1222 format as the 'sample_name.isoforms.results'. | |
1223 | |
1224 - 'sample_name.stat/sample_name_uniform_prior_1.genes.results' | |
1225 RSEM's posterior mean estimates on the gene level with an initial | |
1226 pseudo-count of one for every isoform. It is in the same format as | |
1227 the 'sample_name.genes.results'. | |
1228 | |
1229 When learning prior from multiple external data sets in | |
1230 prior-enhanced RSEM, two additional output files will be generated. | |
1231 | |
1232 - 'sample_name.stat/sample_name.pval_LL' | |
1233 It stores a p-value and a log-likelihood. The p-value indicates | |
1234 whether the combination of multiple complementary data sets is | |
1235 informative for RNA-seq quantification. The log-likelihood shows | |
1236 how well pRSEM's Dirichlet-multinomial model fits the read counts | |
1237 of partitioned training set isoforms. | |
1238 | |
1239 - 'sample_name.stat/sample_name.lgt_mdl.RData' | |
1240 It stores an R object named 'glmmdl', which is a logistic | |
1241 regression model on the training set isoforms and multiple | |
1242 external data sets. | |
1243 | |
1244 In addition, extra columns will be added to | |
1245 'sample_name.stat/all_tr_features' | |
1246 | |
1247 * is_expr: equal to 1 if isoform has an abundance >= 1 TPM and a | |
1248 non-zero read count from RSEM's posterior mean estimates; 0 | |
1249 otherwise | |
1250 | |
1251 * "$external_data_set_basename": log10 of external data's signal at | |
1252 [TSS-500, TSS+500]. Signal is the number of reads aligned within | |
1253 that interval and normalized to RPKM by read depth and interval | |
1254 length. It will be set to -4 if no read aligned to that interval. | |
1255 | |
1256 There are multiple columns like this one, where each represents an | |
1257 external data set. | |
1258 | |
1259 * prd_expr_prob: predicted probability from logistic regression | |
1260 model on whether this isoform is expressed or not. A probability | |
1261 higher than 0.5 is considered as expressed | |
1262 | |
1263 * partition: group index, to which this isoform is partitioned | |
1264 | |
1265 * prior: prior parameter for this isoform | |
1266 | |
1267 EXAMPLES | |
1268 Assume the path to the bowtie executables is in the user's PATH | |
1269 environment variable. Reference files are under '/ref' with name | |
1270 'mouse_125'. | |
1271 | |
1272 1) '/data/mmliver.fq', single-end reads with quality scores. Quality | |
1273 scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 | |
1274 threads and generate a genome BAM file. In addition, we want to append | |
1275 gene/transcript names to the result files: | |
1276 | |
1277 rsem-calculate-expression --phred64-quals \ | |
1278 -p 8 \ | |
1279 --append-names \ | |
1280 --output-genome-bam \ | |
1281 /data/mmliver.fq \ | |
1282 /ref/mouse_125 \ | |
1283 mmliver_single_quals | |
1284 | |
1285 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end | |
1286 reads with quality scores. Suppose the library is prepared using TruSeq | |
1287 Stranded Kit, which means the first mate should map to the reverse | |
1288 strand. Quality scores are in SANGER format. We want to use 8 threads | |
1289 and do not generate a genome BAM file: | |
1290 | |
1291 rsem-calculate-expression -p 8 \ | |
1292 --paired-end \ | |
1293 --strandedness reverse \ | |
1294 /data/mmliver_1.fq \ | |
1295 /data/mmliver_2.fq \ | |
1296 /ref/mouse_125 \ | |
1297 mmliver_paired_end_quals | |
1298 | |
1299 3) '/data/mmliver.fa', single-end reads without quality scores. We want | |
1300 to use 8 threads: | |
1301 | |
1302 rsem-calculate-expression -p 8 \ | |
1303 --no-qualities \ | |
1304 /data/mmliver.fa \ | |
1305 /ref/mouse_125 \ | |
1306 mmliver_single_without_quals | |
1307 | |
1308 4) Data are the same as 1). This time we assume the bowtie executables | |
1309 are under '/sw/bowtie'. We want to take a fragment length distribution | |
1310 into consideration. We set the fragment length mean to 150 and the | |
1311 standard deviation to 35. In addition to a BAM file, we also want to | |
1312 generate credibility intervals. We allow RSEM to use 1GB of memory for | |
1313 CI calculation: | |
1314 | |
1315 rsem-calculate-expression --bowtie-path /sw/bowtie \ | |
1316 --phred64-quals \ | |
1317 --fragment-length-mean 150.0 \ | |
1318 --fragment-length-sd 35.0 \ | |
1319 -p 8 \ | |
1320 --output-genome-bam \ | |
1321 --calc-ci \ | |
1322 --ci-memory 1024 \ | |
1323 /data/mmliver.fq \ | |
1324 /ref/mouse_125 \ | |
1325 mmliver_single_quals | |
1326 | |
1327 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for | |
1328 paired-end reads with quality scores. We want to use 8 threads: | |
1329 | |
1330 rsem-calculate-expression --paired-end \ | |
1331 --alignments \ | |
1332 -p 8 \ | |
1333 /data/mmliver_paired_end_quals.bam \ | |
1334 /ref/mouse_125 \ | |
1335 mmliver_paired_end_quals | |
1336 | |
1337 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads | |
1338 with quality scores and read files are compressed by gzip. We want to | |
1339 use STAR to align reads and assume the STAR executable is '/sw/STAR'. | |
1340 Suppose we want to use 8 threads and not generate a genome BAM file: | |
1341 | |
1342 rsem-calculate-expression --paired-end \ | |
1343 --star \ | |
1344 --star-path /sw/STAR \ | |
1345 --gzipped-read-file \ | |
1346 --paired-end \ | |
1347 -p 8 \ | |
1348 /data/mmliver_1.fq.gz \ | |
1349 /data/mmliver_2.fq.gz \ | |
1350 /ref/mouse_125 \ | |
1351 mmliver_paired_end_quals | |
1352 | |
1353 7) In the above example, suppose we want to run prior-enhanced RSEM | |
1354 instead. Assuming we want to learn priors from a ChIP-seq peak file | |
1355 '/data/mmliver.narrowPeak.gz': | |
1356 | |
1357 rsem-calculate-expression --star \ | |
1358 --star-path /sw/STAR \ | |
1359 --gzipped-read-file \ | |
1360 --paired-end \ | |
1361 --calc-pme \ | |
1362 --run-pRSEM \ | |
1363 --chipseq-peak-file /data/mmliver.narrowPeak.gz \ | |
1364 -p 8 \ | |
1365 /data/mmliver_1.fq.gz \ | |
1366 /data/mmliver_2.fq.gz \ | |
1367 /ref/mouse_125 \ | |
1368 mmliver_paired_end_quals | |
1369 | |
1370 8) Similar to the example in 7), suppose we want to use the partition | |
1371 model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and | |
1372 then partitioning 'no TSS peak' isoforms into two bins by a linear | |
1373 regression model), and we want to partition isoforms by RNA Pol II's | |
1374 ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and | |
1375 '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read file | |
1376 '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables | |
1377 are under '/sw/bowtie/': | |
1378 | |
1379 rsem-calculate-expression --star \ | |
1380 --star-path /sw/STAR \ | |
1381 --gzipped-read-file \ | |
1382 --paired-end \ | |
1383 --calc-pme \ | |
1384 --run-pRSEM \ | |
1385 --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \ | |
1386 --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \ | |
1387 --partition-model pk_lm2nopk \ | |
1388 --bowtie-path /sw/bowtie \ | |
1389 -p 8 \ | |
1390 /data/mmliver_1.fq.gz \ | |
1391 /data/mmliver_2.fq.gz \ | |
1392 /ref/mouse_125 \ | |
1393 mmliver_paired_end_quals | |
1394 | |
1395 9) Similar to the example in 8), suppose we want to derive priors from | |
1396 four histone modification ChIP-seq read data sets: | |
1397 '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', | |
1398 '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming | |
1399 Bowtie's executables are under '/sw/bowtie/': | |
1400 | |
1401 rsem-calculate-expression --star \ | |
1402 --star-path /sw/STAR \ | |
1403 --gzipped-read-file \ | |
1404 --paired-end \ | |
1405 --calc-pme \ | |
1406 --run-pRSEM \ | |
1407 --partition-model cmb_lgt \ | |
1408 --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \ | |
1409 --bowtie-path /sw/bowtie \ | |
1410 -p 8 \ | |
1411 /data/mmliver_1.fq.gz \ | |
1412 /data/mmliver_2.fq.gz \ | |
1413 /ref/mouse_125 \ | |
1414 mmliver_paired_end_quals | |
1415 | |
1416 </help> | |
1417 <citations> | |
1418 <citation type="doi">10.1186/1471-2105-12-323</citation> | |
1419 </citations> | |
1420 | |
1421 </tool> |