Mercurial > repos > artbio > rsem
comparison rsem-bwt2.xml @ 0:e5e836936d60 draft
planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit d84a0359354698a4b29df12ab581c2618bffcf80
author | artbio |
---|---|
date | Sat, 31 Mar 2018 21:30:07 -0400 |
parents | |
children | 49795544dac7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e5e836936d60 |
---|---|
1 <tool id="rsembowtie2" name="RSEM-Bowtie2" version="0.4.0"> | |
2 <description></description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <requirements> | |
7 <requirement type="package" version="1.3.0">rsem</requirement> | |
8 <requirement type="package" version="2.3.4">bowtie2</requirement> | |
9 </requirements> | |
10 <stdio> <!-- any non-zero exit status fails the job, consistent with detect_errors="exit_code" on the command element --> | |
11 <exit_code range="1:" level="fatal" description="Tool exception" /> | |
12 </stdio> | |
13 <command detect_errors="exit_code"><![CDATA[ | |
14 #if $job.select_job == "index": | |
15 echo ${job.reference_name} " " | tee $reference_file && | |
16 mkdir $reference_file.files_path && | |
17 rsem-prepare-reference | |
18 #if $job.polya.polya_use == 'add': | |
19 #if $job.polya.polya_length: | |
20 --polyA-length $job.polya.polya_length | |
21 #end if | |
22 #elif $job.polya.polya_use == 'subset': | |
23 --no-polyA-subset $job.polya.no_polya_subset | |
24 #if $job.polya.polya_length: | |
25 --polyA-length $job.polya.polya_length | |
26 #end if | |
27 #elif $job.polya.polya_use == 'none': | |
28 --no-polyA | |
29 #end if | |
30 $job.ntog | |
31 #if $job.transcript_to_gene_map: | |
32 --transcript-to-gene-map $job.transcript_to_gene_map | |
33 #end if | |
34 --bowtie2 | |
35 #if $job.self_reference.ref_type == 'transcripts': | |
36 $job.self_reference.reference_fasta_file | |
37 #else: | |
38 --gtf $job.self_reference.gtf | |
39 $job.self_reference.reference_fasta_file | |
40 #end if | |
41 ${reference_file.files_path}/${job.reference_name} | |
42 > ${reference_file.files_path}/${job.reference_name}.log | |
43 #end if | |
44 | |
45 #if $job.select_job == "index" and $run_rsem.select == "Yes": | |
46 && | |
47 #end if | |
48 | |
49 #if $run_rsem.select == "Yes": | |
50 ## uncompress fastq.gz or fastqsanger.gz if needed, otherwise symlink the plain fastq under a fixed name | |
51 #if $run_rsem.input.fastq.matepair=="single": | |
52 #if $run_rsem.input.fastq.singlefastq.is_of_type('fastq.gz') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger.gz'): | |
53 gunzip < '$run_rsem.input.fastq.singlefastq' > uncomp_single.fastq && | |
54 #elif $run_rsem.input.fastq.singlefastq.is_of_type('fastq') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger'): | |
55 ln -f -s '$run_rsem.input.fastq.singlefastq' 'uncomp_single.fastq' && | |
56 #end if | |
57 #elif $run_rsem.input.fastq.matepair=="paired": | |
58 #if $run_rsem.input.fastq.fastq1.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger.gz'): | |
59 gunzip < '$run_rsem.input.fastq.fastq1' > uncomp_pair1.fastq && | |
60 gunzip < '$run_rsem.input.fastq.fastq2' > uncomp_pair2.fastq && | |
61 #elif $run_rsem.input.fastq.fastq1.is_of_type('fastq') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger'): | |
62 ln -f -s '$run_rsem.input.fastq.fastq1' 'uncomp_pair1.fastq' && | |
63 ln -f -s '$run_rsem.input.fastq.fastq2' 'uncomp_pair2.fastq' && | |
64 #end if | |
65 #end if | |
66 rsem-calculate-expression | |
67 ## --tag string | |
68 #if $run_rsem.seedlength: | |
69 --seed-length $run_rsem.seedlength | |
70 #end if | |
71 --forward-prob $run_rsem.forward_prob | |
72 #if $run_rsem.rsem_options.fullparams == 'fullset': | |
73 ## Fragment info | |
74 #if $run_rsem.rsem_options.fragment_length_mean: | |
75 --fragment-length-mean $run_rsem.rsem_options.fragment_length_mean | |
76 #end if | |
77 #if $run_rsem.rsem_options.fragment_length_min: | |
78 --fragment-length-min $run_rsem.rsem_options.fragment_length_min | |
79 #end if | |
80 #if $run_rsem.rsem_options.fragment_length_sd: | |
81 --fragment-length-sd $run_rsem.rsem_options.fragment_length_sd | |
82 #end if | |
83 #if $run_rsem.rsem_options.fragment_length_max: | |
84 --fragment-length-max $run_rsem.rsem_options.fragment_length_max | |
85 #end if | |
86 ## RSPD | |
87 #if $run_rsem.rsem_options.rspd.estimate == 'yes': | |
88 --estimate-rspd | |
89 #if $run_rsem.rsem_options.rspd.num_rspd_bins: | |
90 --num-rspd-bins $run_rsem.rsem_options.rspd.num_rspd_bins | |
91 #end if | |
92 #end if | |
93 ## Calculate 95% credibility intervals and posterior mean estimates. | |
94 #if $run_rsem.rsem_options.useci.ci == 'yes': | |
95 --calc-ci | |
96 #if $run_rsem.rsem_options.useci.cimem: | |
97 --ci-memory $run_rsem.rsem_options.useci.cimem | |
98 #end if | |
99 #end if | |
100 #end if | |
101 --num-threads \${GALAXY_SLOTS:-4} | |
102 --bowtie2 | |
103 #if $run_rsem.input.format != 'sam' and $run_rsem.input.bowtie2_options.fullparams == 'fullset': | |
104 ## Bowtie2 params: the bowtie2_options macro is expanded for both fastq and fasta inputs, but not for sam | |
105 #if $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate: | |
106 --bowtie2-mismatch-rate $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate | |
107 #end if | |
108 #if $run_rsem.input.bowtie2_options.bowtie2_k: | |
109 --bowtie2-k $run_rsem.input.bowtie2_options.bowtie2_k | |
110 #end if | |
111 #if $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level: | |
112 --bowtie2-sensitivity-level $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level | |
113 #end if | |
114 #end if | |
115 ## Outputs | |
116 #if $run_rsem.rsem_outputs.result_bams == 'none': | |
117 --no-bam-output | |
118 #elif $run_rsem.rsem_outputs.result_bams == 'default': | |
119 --sort-bam-by-coordinate | |
120 #else | |
121 --sort-bam-by-coordinate | |
122 --output-genome-bam | |
123 $run_rsem.rsem_outputs.sampling_for_bam | |
124 #end if | |
125 ## Input data | |
126 #if $run_rsem.input.format=="fastq" | |
127 $run_rsem.input.fastq_select | |
128 #if $run_rsem.input.fastq.matepair=="single": | |
129 uncomp_single.fastq | |
130 #elif $run_rsem.input.fastq.matepair=="paired": | |
131 --paired-end | |
132 uncomp_pair1.fastq | |
133 uncomp_pair2.fastq | |
134 #end if | |
135 #elif $run_rsem.input.format=="fasta" | |
136 --no-qualities | |
137 #if $run_rsem.input.fasta.matepair=="single": | |
138 $run_rsem.input.fasta.singlefasta | |
139 #elif $run_rsem.input.fasta.matepair=="paired": | |
140 --paired-end | |
141 $run_rsem.input.fasta.fasta1 | |
142 $run_rsem.input.fasta.fasta2 | |
143 #end if | |
144 #elif $run_rsem.input.format=="sam" | |
145 #if $run_rsem.input.matepair=="paired": | |
146 --paired-end | |
147 #end if | |
148 #if $run_rsem.input.rsem_sam._extension == 'sam': | |
149 --sam | |
150 #elif $run_rsem.input.rsem_sam._extension == 'bam': | |
151 --bam | |
152 #end if | |
153 $run_rsem.input.rsem_sam | |
154 #end if | |
155 ## RSEM reference | |
156 #if $run_rsem.reference.refSrc == 'history': | |
157 ${run_rsem.reference.rsem_ref.extra_files_path}/${run_rsem.reference.rsem_ref.metadata.reference_name} | |
158 #elif $run_rsem.reference.refSrc == 'self': | |
159 ${reference_file.files_path}/${job.reference_name} | |
160 #end if | |
161 ## sample_name: use a hard coded name so we can pull out galaxy outputs | |
162 rsem_output | |
163 ## direct output into logfile | |
164 > $log | |
165 #end if | |
166 ]]></command> | |
167 | |
168 <inputs> | |
169 <conditional name="job"> | |
170 <param name="select_job" type="select" label="rsem reference"> | |
171 <option value="index">Build rsem reference</option> | |
172 <option value="no-index" selected="true">rsem reference available from history</option> | |
173 </param> | |
174 <when value="index"> | |
175 <conditional name="self_reference"> | |
176 <param name="ref_type" type="select" label="Reference transcript source"> | |
177 <option value="transcripts">transcript fasta</option> | |
178 <option value="genomic">reference genome and gtf</option> | |
179 </param> | |
180 <when value="transcripts"> | |
181 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file" | |
182 help="The files should contain the sequences of transcripts."/> | |
183 </when> | |
184 <when value="genomic"> | |
185 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file" | |
186 help="The file should contain the sequence of an entire genome."/> | |
187 <param name="gtf" type="data" format="gtf" label="gtf" | |
188 help="extract transcript reference sequences using the gene annotations specified in this GTF" /> | |
189 </when> | |
190 </conditional> | |
191 <param name="transcript_to_gene_map" type="data" format="tabular" optional="true" label="Map of gene ids to transcript (isoform) ids" > | |
192 <help> | |
193 Each line should be of the form: gene_id transcript_id ( with the two fields separated by a tab character ) | |
194 The map can be obtained from the UCSC table browser | |
195 group: Genes and Gene Prediction Tracks | |
196 table: knownIsoforms | |
197 Without a map: | |
198 If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. | |
199 Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. | |
200 </help> | |
201 </param> | |
202 <param name="reference_name" type="text" value="rsem_ref_name" label="reference name"> | |
203 <help>A one word name for this RSEM reference containing only letters, digits, and underscore characters</help> | |
204 <validator type="regex" message="Use only letters, digits, and underscore characters">^\w+$</validator> | |
205 </param> | |
206 <conditional name="polya"> | |
207 <param name="polya_use" type="select" label="PolyA "> | |
208 <option value="add" selected="true">Add poly(A) tails to all transcripts</option> | |
209 <option value="subset">Exclude poly(A) tails from selected transcripts</option> | |
210 <option value="none">Do not add poly(A) tails to any transcripts</option> | |
211 </param> | |
212 <when value="add"> | |
213 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)"> | |
214 <validator type="in_range" message="must be positive " min="1"/> | |
215 </param> | |
216 </when> | |
217 <when value="subset"> | |
218 <param name="no_polya_subset" type="data" format="tabular" optional="true" label="List of transcript IDs (one per line) that should not have polyA tails added."/> | |
219 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)"> | |
220 <validator type="in_range" message="must be positive " min="1"/> | |
221 </param> | |
222 </when> | |
223 <when value="none"/> | |
224 </conditional> | |
225 <param name="ntog" type="boolean" truevalue="--no-ntog" falsevalue="" checked="false" label="Disable the conversion of 'N' characters to 'G' characters in the reference sequences" help="Bowtie uses the automatic N to G conversion to align against all positions in the reference."/> | |
226 </when> | |
227 <when value="no-index"> | |
228 </when> | |
229 </conditional> | |
230 | |
231 <conditional name="run_rsem"> | |
232 <param name="select" type="select" label="calculate expression with rsem"> | |
233 <option value="No">Just build rsem reference for later rsem profiling</option> | |
234 <option value="Yes" selected="true">profile expression with rsem</option> | |
235 </param> | |
236 <when value="Yes"> | |
237 <param name="sample" type="text" value="rsem_sample" label="Sample name" /> | |
238 <conditional name="reference"> | |
239 <param name="refSrc" type="select" label="RSEM Reference Source"> | |
240 <option value="history">From your history</option> | |
241 <option value="self">Prepare RSEM Reference with this tool</option> | |
242 </param> | |
243 <when value="history"> | |
244 <param name="rsem_ref" type="data" format="rsem_ref" label="RSEM reference" /> | |
245 </when> | |
246 <when value="self"> | |
247 </when> | |
248 </conditional> | |
249 <conditional name="input"> | |
250 <param name="format" type="select" label="RSEM Input file type"> | |
251 <option value="fastq">FASTQ</option> | |
252 <option value="fasta">FASTA</option> | |
253 <option value="sam">SAM/BAM</option> | |
254 </param> | |
255 <when value="fastq"> | |
256 <param name="fastq_select" size="15" type="select" label="FASTQ type" > | |
257 <option value="--phred33-quals" selected="true">phred33 qualities (default for sanger)</option> | |
258 <option value="--solexa-quals">solexa qualities</option> | |
259 <option value="--phred64-quals">phred64 qualities</option> | |
260 </param> | |
261 <conditional name="fastq"> | |
262 <param name="matepair" type="select" label="Library type"> | |
263 <option value="single">Single End Reads</option> | |
264 <option value="paired">Paired End Reads</option> | |
265 </param> | |
266 <when value="single"> | |
267 <param name="singlefastq" type="data" format="fastq,fastq.gz" label="FASTQ file" /> | |
268 </when> | |
269 <when value="paired"> | |
270 <param name="fastq1" type="data" format="fastq,fastq.gz" label="Read 1 fastq file" /> | |
271 <param name="fastq2" type="data" format="fastq,fastq.gz" label="Read 2 fastq file" /> | |
272 </when> | |
273 </conditional> | |
274 <expand macro="bowtie2_options"/> | |
275 </when> | |
276 <when value="fasta"> | |
277 <conditional name="fasta"> | |
278 <param name="matepair" type="select" label="Library Type"> | |
279 <option value="single">Single End Reads</option> | |
280 <option value="paired">Paired End Reads</option> | |
281 </param> | |
282 <when value="single"> | |
283 <param name="singlefasta" type="data" format="fasta" label="fasta file" /> | |
284 </when> | |
285 <when value="paired"> | |
286 <param name="fasta1" type="data" format="fasta" label="Read 1 fasta file" /> | |
287 <param name="fasta2" type="data" format="fasta" label="Read 2 fasta file" /> | |
288 </when> | |
289 </conditional> | |
290 <expand macro="bowtie2_options"/> | |
291 </when> | |
292 <when value="sam"> | |
293 <!-- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam --> | |
294 <param name="matepair" type="select" label="Library Type"> | |
295 <option value="single">Single End Reads</option> | |
296 <option value="paired">Paired End Reads</option> | |
297 </param> | |
298 <param name="rsem_sam" type="data" format="rsem_sam" label="RSEM formatted SAM file" /> | |
299 </when> | |
300 </conditional> | |
301 <expand macro="rsem_options"/> | |
302 <conditional name="rsem_outputs"> | |
303 <param name="result_bams" type="select" label="Create bam results files" | |
304 help="In addition to the transcript-coordinate-based BAM file output, also output a BAM file with the read alignments in genomic coordinates" > | |
305 <option value="none">No BAM results files</option> | |
306 <option value="default" selected="true">Transcript BAM results file</option> | |
307 <option value="both">Transcript and genome BAM results files</option> | |
308 </param> | |
309 <when value="none"/> | |
310 <when value="default"> | |
311 <expand macro="sampling_for_bam"/> | |
312 </when> | |
313 <when value="both"> | |
314 <expand macro="sampling_for_bam"/> | |
315 </when> | |
316 </conditional> | |
317 </when> | |
318 <when value="No"> | |
319 </when> | |
320 </conditional> | |
321 </inputs> | |
322 | |
323 <outputs> | |
324 <data format="rsem_ref" name="reference_file" label="RSEM ${job.reference_name} reference"> | |
325 <filter>job['select_job'] == "index"</filter> | |
326 </data> | |
327 <data format="tabular" name="gene_abundances" label="${run_rsem.sample}.gene_abundances" from_work_dir="rsem_output.genes.results"> | |
328 <filter>run_rsem['select'] == "Yes"</filter> | |
329 </data> | |
330 <data format="tabular" name="isoform_abundances" label="${run_rsem.sample}.isoform_abundances" from_work_dir="rsem_output.isoforms.results"> | |
331 <filter>run_rsem['select'] == "Yes"</filter> | |
332 </data> | |
333 <data format="bam" name="transcript_sorted_bam" label="${run_rsem.sample}.transcript.bam" from_work_dir="rsem_output.transcript.sorted.bam" > | |
334 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] != "none"</filter> | |
335 </data> | |
336 <data format="bam" name="genome_sorted_bam" label="${run_rsem.sample}.genome.bam" from_work_dir="rsem_output.genome.sorted.bam"> | |
337 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] == "both"</filter> | |
338 </data> | |
339 <data format="txt" name="log" label="${run_rsem.sample}.rsem_log"> | |
340 <filter>run_rsem['select'] == "Yes"</filter> | |
341 </data> | |
342 </outputs> | |
343 | |
344 <tests> | |
345 <test> | |
346 <param name="select_job" value="index"/> | |
347 <param name="ref_type" value="genomic"/> | |
348 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
349 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
350 <param name="reference_name" value="ref"/> | |
351 <param name="select" value="Yes"/> | |
352 <param name="sample" value="rsem_sample"/> | |
353 <param name="refSrc" value="self"/> | |
354 <param name="format" value="fastq"/> | |
355 <param name="matepair" value="single"/> | |
356 <param name="singlefastq" value="test.fq" ftype="fastqsanger"/> | |
357 <param name="result_bams" value="none"/> | |
358 <output name="reference_file"> | |
359 <assert_contents> | |
360 <has_text text="ref" /> | |
361 </assert_contents> | |
362 </output> | |
363 <output name="gene_abundances" value="gene_abundances.tab2"/> | |
364 <output name="isoform_abundances" value="isoform_abundances.tab2" /> | |
365 <output name="log"> | |
366 <assert_contents> | |
367 <has_text text="Expression Results are written" /> | |
368 </assert_contents> | |
369 </output> | |
370 </test> | |
371 <test> | |
372 <param name="select_job" value="index"/> | |
373 <param name="ref_type" value="genomic"/> | |
374 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
375 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
376 <param name="reference_name" value="ref"/> | |
377 <param name="select" value="Yes"/> | |
378 <param name="sample" value="rsem_sample"/> | |
379 <param name="refSrc" value="self"/> | |
380 <param name="format" value="fastq"/> | |
381 <param name="matepair" value="single"/> | |
382 <param name="singlefastq" value="test.fastq.gz" ftype="fastqsanger.gz"/> | |
383 <param name="result_bams" value="none"/> | |
384 <output name="reference_file"> | |
385 <assert_contents> | |
386 <has_text text="ref" /> | |
387 </assert_contents> | |
388 </output> | |
389 <output name="gene_abundances" value="gene_abundances.tab2"/> | |
390 <output name="isoform_abundances" value="isoform_abundances.tab2" /> | |
391 <output name="log"> | |
392 <assert_contents> | |
393 <has_text text="Expression Results are written" /> | |
394 </assert_contents> | |
395 </output> | |
396 </test> | |
397 <test> | |
398 <param name="select_job" value="index"/> | |
399 <param name="ref_type" value="genomic"/> | |
400 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
401 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
402 <param name="reference_name" value="ref"/> | |
403 <param name="select" value="No"/> | |
404 <output name="reference_file"> | |
405 <assert_contents> | |
406 <has_text text="ref" /> | |
407 </assert_contents> | |
408 </output> | |
409 </test> | |
410 <test> | |
411 <param name="select_job" value="index"/> | |
412 <param name="ref_type" value="genomic"/> | |
413 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/> | |
414 <param name="gtf" value="ref.gtf" ftype="gtf"/> | |
415 <param name="reference_name" value="ref"/> | |
416 <param name="select" value="No"/> | |
417 <output name="reference_file"> | |
418 <assert_contents> | |
419 <has_text text="ref" /> | |
420 </assert_contents> | |
421 </output> | |
422 </test> | |
423 </tests> | |
424 | |
425 <help> | |
426 .. class:: infomark | |
427 | |
428 RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/ | |
429 | |
430 NAME | |
431 rsem-prepare-reference | |
432 | |
433 SYNOPSIS | |
434 rsem-prepare-reference [options] reference_fasta_file(s) reference_name | |
435 | |
436 DESCRIPTION | |
437 The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie indices using default parameters. | |
438 This program is used in conjunction with the 'rsem-calculate-expression' program. | |
439 | |
440 INPUTS | |
441 A fasta file of transcripts | |
442 or | |
443 A genome sequence fasta file and a GTF gene annotation file. (When using UCSC data, include the related knownIsoforms.txt) | |
444 | |
445 --- | |
446 | |
447 NAME | |
448 rsem-calculate-expression - Estimate gene and isoform expression from | |
449 RNA-Seq data. | |
450 | |
451 SYNOPSIS | |
452 rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name | |
453 rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name | |
454 rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name | |
455 | |
456 ARGUMENTS | |
457 upstream_read_files(s) | |
458 Comma-separated list of files containing single-end reads or | |
459 upstream reads for paired-end data. By default, these files are | |
460 assumed to be in FASTQ format. If the --no-qualities option is | |
461 specified, then FASTA format is expected. | |
462 | |
463 downstream_read_file(s) | |
464 Comma-separated list of files containing downstream reads which are | |
465 paired with the upstream reads. By default, these files are assumed | |
466 to be in FASTQ format. If the --no-qualities option is specified, | |
467 then FASTA format is expected. | |
468 | |
469 input | |
470 SAM/BAM/CRAM formatted input file. If "-" is specified for the | |
471 filename, the input is instead assumed to come from standard input. | |
472 RSEM requires all alignments of the same read group together. For | |
473 paired-end reads, RSEM also requires the two mates of any alignment | |
474 be adjacent. In addition, RSEM does not allow the SEQ and QUAL | |
475 fields to be empty. See Description section for how to make input | |
476 file obey RSEM's requirements. | |
477 | |
478 reference_name | |
479 The name of the reference used. The user must have run | |
480 'rsem-prepare-reference' with this reference_name before running | |
481 this program. | |
482 | |
483 sample_name | |
484 The name of the sample analyzed. All output files are prefixed by | |
485 this name (e.g., sample_name.genes.results) | |
486 | |
487 BASIC OPTIONS | |
488 --paired-end | |
489 Input reads are paired-end reads. (Default: off) | |
490 | |
491 --no-qualities | |
492 Input reads do not contain quality scores. (Default: off) | |
493 | |
494 --strandedness <none|forward|reverse> | |
495 This option defines the strandedness of the RNA-Seq reads. It | |
496 recognizes three values: 'none', 'forward', and 'reverse'. 'none' | |
497 refers to non-strand-specific protocols. 'forward' means all | |
498 (upstream) reads are derived from the forward strand. 'reverse' | |
499 means all (upstream) reads are derived from the reverse strand. If | |
500 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2 | |
501 option will also be enabled to avoid aligning reads to the opposite | |
502 strand. For Illumina TruSeq Stranded protocols, please use | |
503 'reverse'. (Default: 'none') | |
504 | |
505 -p/--num-threads <int> | |
506 Number of threads to use. Both Bowtie/Bowtie2, expression estimation | |
507 and 'samtools sort' will use this many threads. (Default: 1) | |
508 | |
509 --alignments | |
510 Input file contains alignments in SAM/BAM/CRAM format. The exact | |
511 file format will be determined automatically. (Default: off) | |
512 | |
513 --fai <file> | |
514 If the header section of input alignment file does not contain | |
515 reference sequence information, this option should be turned on. | |
516 <file> is a FAI format file containing each reference sequence's | |
517 name and length. Please refer to the SAM official website for the | |
518 details of FAI format. (Default: off) | |
519 | |
520 --bowtie2 | |
521 Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM | |
522 does not handle indel, local and discordant alignments, the Bowtie2 | |
523 parameters are set in a way to avoid those alignments. In | |
524 particular, we use options '--sensitive --dpad 0 --gbar 99999999 | |
525 --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter | |
526 of '--score-min', '-0.1', is the negative of maximum mismatch rate. | |
527 This rate can be set by option '--bowtie2-mismatch-rate'. If reads | |
528 are paired-end, we additionally use options '--no-mixed' and | |
529 '--no-discordant'. (Default: off) | |
530 | |
531 --star | |
532 Use STAR to align reads. Alignment parameters are from ENCODE3's | |
533 STAR-RSEM pipeline. To save computational time and memory resources, | |
534 STAR's Output BAM file is unsorted. It is stored in RSEM's temporary | |
535 directory with name as 'sample_name.bam'. Each STAR job will have | |
536 its own private copy of the genome in memory. (Default: off) | |
537 | |
538 --append-names | |
539 If gene_name/transcript_name is available, append it to the end of | |
540 gene_id/transcript_id (separated by '_') in files | |
541 'sample_name.isoforms.results' and 'sample_name.genes.results'. | |
542 (Default: off) | |
543 | |
544 --seed <uint32> | |
545 Set the seed for the random number generators used in calculating | |
546 posterior mean estimates and credibility intervals. The seed must be | |
547 a non-negative 32 bit integer. (Default: off) | |
548 | |
549 --single-cell-prior | |
550 By default, RSEM uses Dirichlet(1) as the prior to calculate | |
551 posterior mean estimates and credibility intervals. However, far | |
552 fewer genes are expressed in single cell RNA-Seq data. Thus, if you | |
553 want to compute posterior mean estimates and/or credibility | |
554 intervals and you have single-cell RNA-Seq data, you are recommended | |
555 to turn on this option. Then RSEM will use Dirichlet(0.1) as the | |
556 prior, which encourages the sparsity of the expression levels. | |
557 (Default: off) | |
558 | |
559 --calc-pme | |
560 Run RSEM's collapsed Gibbs sampler to calculate posterior mean | |
561 estimates. (Default: off) | |
562 | |
563 --calc-ci | |
564 Calculate 95% credibility intervals and posterior mean estimates. | |
565 The credibility level can be changed by setting | |
566 '--ci-credibility-level'. (Default: off) | |
567 | |
568 -q/--quiet | |
569 Suppress the output of logging information. (Default: off) | |
570 | |
571 -h/--help | |
572 Show help information. | |
573 | |
574 --version | |
575 Show version information. | |
576 | |
577 OUTPUT OPTIONS | |
578 --sort-bam-by-read-name | |
579 Sort BAM file aligned under transcript coordinate by read name. | |
580 Setting this option on will produce deterministic maximum likelihood | |
581 estimations from independent runs. Note that sorting will take long | |
582 time and lots of memory. (Default: off) | |
583 | |
584 --no-bam-output | |
585 Do not output any BAM file. (Default: off) | |
586 | |
587 --sampling-for-bam | |
588 When RSEM generates a BAM file, instead of outputting all alignments | |
589 a read has with their posterior probabilities, one alignment is | |
590 sampled according to the posterior probabilities. The sampling | |
591 procedure includes the alignment to the "noise" transcript, which | |
592 does not appear in the BAM file. Only the sampled alignment has a | |
593 weight of 1. All other alignments have weight 0. If the "noise" | |
594 transcript is sampled, all alignments appearing in the BAM file | |
595 should have weight 0. (Default: off) | |
596 | |
597 --output-genome-bam | |
598 Generate a BAM file, 'sample_name.genome.bam', with alignments | |
599 mapped to genomic coordinates and annotated with their posterior | |
600 probabilities. In addition, RSEM will call samtools (included in | |
601 RSEM package) to sort and index the bam file. | |
602 'sample_name.genome.sorted.bam' and | |
603 'sample_name.genome.sorted.bam.bai' will be generated. (Default: | |
604 off) | |
605 | |
606 --sort-bam-by-coordinate | |
607 Sort RSEM generated transcript and genome BAM files by coordinates | |
608 and build associated indices. (Default: off) | |
609 | |
610 --sort-bam-memory-per-thread <string> | |
611 Set the maximum memory per thread that can be used by 'samtools | |
612 sort'. <string> represents the memory and accepts suffixes 'K/M/G'. | |
613 RSEM will pass <string> to the '-m' option of 'samtools sort'. Note | |
614 that the default used here is different from the default used by | |
615 samtools. (Default: 1G) | |
616 | |
617 ALIGNER OPTIONS | |
618 --seed-length <int> | |
619 Seed length used by the read aligner. Providing the correct value is | |
620 important for RSEM. If RSEM runs Bowtie, it uses this value for | |
621 Bowtie's seed length parameter. Any read with its or at least one of | |
622 its mates' (for paired-end reads) length less than this value will | |
623 be ignored. If poly(A) tails are not added to the references, the | |
624 minimum allowed value is 5, otherwise, the minimum allowed value is | |
625 25. Note that this script will only check if the value >= 5 and give | |
626 a warning message if the value < 25 but >= 5. (Default: 25) | |
627 | |
628 --phred33-quals | |
629 Input quality scores are encoded as Phred+33. (Default: on) | |
630 | |
631 --phred64-quals | |
632 Input quality scores are encoded as Phred+64 (default for GA | |
633 Pipeline ver. >= 1.3). (Default: off) | |
634 | |
635 --solexa-quals | |
636 Input quality scores are solexa encoded (from GA Pipeline ver. < | |
637 1.3). (Default: off) | |
638 | |
639 --bowtie-path <path> | |
640 The path to the Bowtie executables. (Default: the path to the Bowtie | |
641 executables is assumed to be in the user's PATH environment | |
642 variable) | |
643 | |
644 --bowtie-n <int> | |
645 (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, | |
646 Default: 2) | |
647 | |
648 --bowtie-e <int> | |
649 (Bowtie parameter) max sum of mismatch quality scores across the | |
650 alignment. (Default: 99999999) | |
651 | |
652 --bowtie-m <int> | |
653 (Bowtie parameter) suppress all alignments for a read if > <int> | |
654 valid alignments exist. (Default: 200) | |
655 | |
656 --bowtie-chunkmbs <int> | |
657 (Bowtie parameter) memory allocated for best first alignment | |
658 calculation (Default: 0 - use Bowtie's default) | |
659 | |
660 --bowtie2-path <path> | |
661 (Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default: | |
662 the path to the Bowtie 2 executables is assumed to be in the user's | |
663 PATH environment variable) | |
664 | |
665 --bowtie2-mismatch-rate <double> | |
666 (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: | |
667 0.1) | |
668 | |
669 --bowtie2-k <int> | |
670 (Bowtie 2 parameter) Find up to <int> alignments per read. (Default: | |
671 200) | |
672 | |
673 --bowtie2-sensitivity-level <string> | |
674 (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end | |
675 mode. This option controls how hard Bowtie 2 tries to find | |
676 alignments. <string> must be one of "very_fast", "fast", "sensitive" | |
677 and "very_sensitive". The four candidates correspond to Bowtie 2's | |
678 "--very-fast", "--fast", "--sensitive" and "--very-sensitive" | |
679 options. (Default: "sensitive" - use Bowtie 2's default) | |
680 | |
681 --star-path <path> | |
682 The path to STAR's executable. (Default: the path to STAR executable | |
683 is assumed to be in user's PATH environment variable) | |
684 | |
685 --star-gzipped-read-file | |
686 (STAR parameter) Input read file(s) is compressed by gzip. (Default: | |
687 off) | |
688 | |
689 --star-bzipped-read-file | |
690 (STAR parameter) Input read file(s) is compressed by bzip2. | |
691 (Default: off) | |
692 | |
693 --star-output-genome-bam | |
694 (STAR parameter) Save the BAM file from STAR alignment under genomic | |
695 coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted | |
696 by genomic coordinate. In this file, according to STAR's manual, | |
697 'paired ends of an alignment are always adjacent, and multiple | |
698 alignments of a read are adjacent as well'. (Default: off) | |
699 | |
700 ADVANCED OPTIONS | |
701 --tag <string> | |
702 The name of the optional field used in the SAM input for identifying | |
703 a read with too many valid alignments. The field should have the | |
704 format <tagName>:i:<value>, where a <value> bigger than 0 indicates | |
705 a read with too many alignments. (Default: "") | |
706 | |
707 --fragment-length-min <int> | |
708 Minimum read/insert length allowed. This is also the value for the | |
709 Bowtie/Bowtie2 -I option. (Default: 1) | |
710 | |
711 --fragment-length-max <int> | |
712 Maximum read/insert length allowed. This is also the value for the | |
713 Bowtie/Bowtie 2 -X option. (Default: 1000) | |
714 | |
715 --fragment-length-mean <double> | |
716 (single-end data only) The mean of the fragment length distribution, | |
717 which is assumed to be a Gaussian. (Default: -1, which disables use | |
718 of the fragment length distribution) | |
719 | |
720 --fragment-length-sd <double> | |
721 (single-end data only) The standard deviation of the fragment length | |
722 distribution, which is assumed to be a Gaussian. (Default: 0, which | |
723 assumes that all fragments are of the same length, given by the | |
724 rounded value of --fragment-length-mean) | |
725 | |
726 --estimate-rspd | |
727 Set this option if you want to estimate the read start position | |
728 distribution (RSPD) from data. Otherwise, RSEM will use a uniform | |
729 RSPD. (Default: off) | |
730 | |
731 --num-rspd-bins <int> | |
732 Number of bins in the RSPD. Only relevant when '--estimate-rspd' is | |
733 specified. Use of the default setting is recommended. (Default: 20) | |
734 | |
735 --gibbs-burnin <int> | |
736 The number of burn-in rounds for RSEM's Gibbs sampler. Each round | |
737 passes over the entire data set once. If RSEM can use multiple | |
738 threads, multiple Gibbs samplers will start at the same time and all | |
739 samplers share the same burn-in number. (Default: 200) | |
740 | |
741 --gibbs-number-of-samples <int> | |
742 The total number of count vectors RSEM will collect from its Gibbs | |
743 samplers. (Default: 1000) | |
744 | |
745 --gibbs-sampling-gap <int> | |
746 The number of rounds between two successive count vectors RSEM | |
747 collects. If the count vector after round N is collected, the count | |
748 vector after round N + <int> will also be collected. (Default: 1) | |
749 | |
750 --ci-credibility-level <double> | |
751 The credibility level for credibility intervals. (Default: 0.95) | |
752 | |
753 --ci-memory <int> | |
754 Maximum size (in memory, MB) of the auxiliary buffer used for | |
755 computing credibility intervals (CI). (Default: 1024) | |
756 | |
757 --ci-number-of-samples-per-count-vector <int> | |
758 The number of read generating probability vectors sampled per | |
759 sampled count vector. The credibility intervals are calculated by | |
760 first sampling P(C | D) and then sampling P(Theta | C) for each | |
761 sampled count vector. This option controls how many Theta vectors | |
762 are sampled per sampled count vector. (Default: 50) | |
763 | |
764 --keep-intermediate-files | |
765 Keep temporary files generated by RSEM. RSEM creates a temporary | |
766 directory, 'sample_name.temp', into which it puts all intermediate | |
767 output files. If this directory already exists, RSEM overwrites all | |
768 files generated by previous RSEM runs inside of it. By default, | |
769 after RSEM finishes, the temporary directory is deleted. Set this | |
770 option to prevent the deletion of this directory and the | |
771 intermediate files inside of it. (Default: off) | |
772 | |
773 --temporary-folder <string> | |
774 Set where to put the temporary files generated by RSEM. If the | |
775 folder specified does not exist, RSEM will try to create it. | |
776 (Default: sample_name.temp) | |
777 | |
778 --time | |
779 Output time consumed by each step of RSEM to 'sample_name.time'. | |
780 (Default: off) | |
781 | |
782 PRIOR-ENHANCED RSEM OPTIONS | |
783 --run-pRSEM | |
784 Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. | |
785 isoform's initial pseudo-count for RSEM's Gibbs sampling, will be | |
786 learned from input RNA-seq data and an external data set. When pRSEM | |
787 needs and only needs ChIP-seq peak information to partition isoforms | |
788 (e.g. in pRSEM's default partition model), either ChIP-seq peak file | |
789 (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for | |
790 target and input and the path for Bowtie executables are required | |
791 (with the '--chipseq-target-read-files <string>', | |
792 '--chipseq-control-read-files <string>', and '--bowtie-path <path>' | |
793 options), otherwise, ChIP-seq FASTQ files for target and control and | |
794 the path to Bowtie executables are required. (Default: off) | |
795 | |
796 --chipseq-peak-file <string> | |
797 Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. | |
798 BED6+4, format. This file is used when running prior-enhanced RSEM | |
799 in the default two-partition model. It partitions isoforms by | |
800 whether they have ChIP-seq overlapping with their transcription | |
801 start site region or not. Each partition will have its own prior | |
802 parameter learned from a training set. This file can be either | |
803 gzipped or ungzipped. (Default: "") | |
804 | |
805 --chipseq-target-read-files <string> | |
806 Comma-separated full path of FASTQ read file(s) for ChIP-seq target. | |
807 This option is used when running prior-enhanced RSEM. It provides | |
808 information to calculate ChIP-seq peaks and signals. The file(s) can | |
809 be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The | |
810 options '--bowtie-path <path>' and '--chipseq-control-read-files | |
811 <string>' must be defined when this option is specified. (Default: | |
812 "") | |
813 | |
814 --chipseq-control-read-files <string> | |
815 Comma-separated full path of FASTQ read file(s) for ChIP-seq control. | |
816 This option is used when running prior-enhanced RSEM. It provides | |
817 information to call ChIP-seq peaks. The file(s) can be either | |
818 ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options | |
819 '--bowtie-path <path>' and '--chipseq-target-read-files <string>' | |
820 must be defined when this option is specified. (Default: "") | |
821 | |
822 --chipseq-read-files-multi-targets <string> | |
823 Comma-separated full path of FASTQ read files for multiple ChIP-seq | |
824 targets. This option is used when running prior-enhanced RSEM, where | |
825 prior is learned from multiple complementary data sets. It provides | |
826 information to calculate ChIP-seq signals. All files can be either | |
827 ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this | |
828 option is specified, the option '--bowtie-path <path>' must be | |
829 defined and the option '--partition-model <string>' will be set to | |
830 'cmb_lgt' automatically. (Default: "") | |
831 | |
832 --chipseq-bed-files-multi-targets <string> | |
833 Comma-separated full path of BED files for multiple ChIP-seq | |
834 targets. This option is used when running prior-enhanced RSEM, where | |
835 prior is learned from multiple complementary data sets. It provides | |
836 information of ChIP-seq signals and must have at least the first six | |
837 BED columns. All files can be either ungzipped or gzipped with a | |
838 suffix '.gz' or '.gzip'. When this option is specified, the option | |
839 '--partition-model <string>' will be set to 'cmb_lgt' automatically. | |
840 (Default: "") | |
841 | |
842 --cap-stacked-chipseq-reads | |
843 Keep a maximum number of ChIP-seq reads that aligned to the same | |
844 genomic interval. This option is used when running prior-enhanced | |
845 RSEM, where prior is learned from multiple complementary data sets. | |
846 This option is only in use when either | |
847 '--chipseq-read-files-multi-targets <string>' or | |
848 '--chipseq-bed-files-multi-targets <string>' is specified. (Default: | |
849 off) | |
850 | |
851 --n-max-stacked-chipseq-reads <int> | |
852 The maximum number of stacked ChIP-seq reads to keep. This option is | |
853 used when running prior-enhanced RSEM, where prior is learned from | |
854 multiple complementary data sets. This option is only in use when | |
855 the option '--cap-stacked-chipseq-reads' is set. (Default: 5) | |
856 | |
857 --partition-model <string> | |
858 A keyword to specify the partition model used by prior-enhanced | |
859 RSEM. It must be one of the following keywords: | |
860 | |
861 - pk | |
862 Partitioned by whether an isoform has a ChIP-seq peak overlapping | |
863 with its transcription start site (TSS) region. The TSS region is | |
864 defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer to this | |
865 type of peak as 'TSS peak' when explaining other keywords. | |
866 | |
867 - pk_lgtnopk | |
868 First partitioned by TSS peak. Then, for isoforms in the 'no TSS | |
869 peak' set, a logistic model is employed to further classify them | |
870 into two partitions. | |
871 | |
872 - lm3, lm4, lm5, or lm6 | |
873 Based on their ChIP-seq signals, isoforms are classified into 3, | |
874 4, 5, or 6 partitions by a linear regression model. | |
875 | |
876 - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk | |
877 First partitioned by TSS peak. Then, for isoforms in the 'with TSS | |
878 peak' set, a linear regression model is employed to further | |
879 classify them into 2, 3, 4, or 5 partitions. | |
880 | |
881 - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk | |
882 First partitioned by TSS peak. Then, for isoforms in the 'no TSS | |
883 peak' set, a linear regression model is employed to further | |
884 classify them into 2, 3, 4, or 5 partitions. | |
885 | |
886 - cmb_lgt | |
887 Using a logistic regression to combine TSS signals from multiple | |
888 complementary data sets and partition training set isoform into | |
889 'expressed' and 'not expressed'. This partition model is only in | |
890 use when either '--chipseq-read-files-multi-targets <string>' or | |
891 '--chipseq-bed-files-multi-targets <string>' is specified. | |
892 | |
893 Parameters for all the above models are learned from a training set. | |
894 For detailed explanations, please see prior-enhanced RSEM's paper. | |
895 (Default: 'pk') | |
896 | |
897 DEPRECATED OPTIONS | |
898 The options in this section are deprecated. They are here only for | |
899 compatibility reasons and may be removed in future releases. | |
900 | |
901 --sam | |
902 Inputs are alignments in SAM format. (Default: off) | |
903 | |
904 --bam | |
905 Inputs are alignments in BAM format. (Default: off) | |
906 | |
907 --strand-specific | |
908 Equivalent to '--strandedness forward'. (Default: off) | |
909 | |
910 --forward-prob <double> | |
911 Probability of generating a read from the forward strand of a | |
912 transcript. Set to 1 for a strand-specific protocol where all | |
913 (upstream) reads are derived from the forward strand, 0 for a | |
914 strand-specific protocol where all (upstream) reads are derived from | |
915 the reverse strand, or 0.5 for a non-strand-specific protocol. | |
916 (Default: off) | |
917 | |
918 DESCRIPTION | |
919 In its default mode, this program aligns input reads against a reference | |
920 transcriptome with Bowtie and calculates expression values using the | |
921 alignments. RSEM assumes the data are single-end reads with quality | |
922 scores, unless the '--paired-end' or '--no-qualities' options are | |
923 specified. Alternatively, users can use STAR to align reads using the | |
924 '--star' option. RSEM has provided options in 'rsem-prepare-reference' | |
925 to prepare STAR's genome indices. Users may use an alternative aligner | |
926 by specifying '--alignments', and providing an alignment file in | |
927 SAM/BAM/CRAM format. However, users should make sure that they align | |
928 against the indices generated by 'rsem-prepare-reference' and the | |
929 alignment file satisfies the requirements mentioned in ARGUMENTS | |
930 section. | |
931 | |
932 One simple way to make the alignment file satisfy RSEM's requirements | |
933 is to use the 'convert-sam-for-rsem' script. This script accepts | |
934 SAM/BAM/CRAM files as input and outputs a BAM file. For example, type | |
935 the following command to convert a SAM file, 'input.sam', to a | |
936 ready-for-use BAM file, 'input_for_rsem.bam': | |
937 | |
938 convert-sam-for-rsem input.sam input_for_rsem | |
939 | |
940 For details, please refer to 'convert-sam-for-rsem's documentation page. | |
941 | |
942 NOTES | |
943 1. Users must run 'rsem-prepare-reference' with the appropriate | |
944 reference before using this program. | |
945 | |
946 2. For single-end data, it is strongly recommended that the user provide | |
947 the fragment length distribution parameters (--fragment-length-mean and | |
948 --fragment-length-sd). For paired-end data, RSEM will automatically | |
949 learn a fragment length distribution from the data. | |
950 | |
951 3. Some aligner parameters have default values different from their | |
952 original settings. | |
953 | |
954 4. With the '--calc-pme' option, posterior mean estimates will be | |
955 calculated in addition to maximum likelihood estimates. | |
956 | |
957 5. With the '--calc-ci' option, 95% credibility intervals and posterior | |
958 mean estimates will be calculated in addition to maximum likelihood | |
959 estimates. | |
960 | |
961 6. The temporary directory and all intermediate files will be removed | |
962 when RSEM finishes unless '--keep-intermediate-files' is specified. | |
963 | |
964 With the '--run-pRSEM' option and associated options (see section | |
965 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM | |
966 will be running. Prior parameters will be learned from supplied external | |
967 data set(s) and assigned as initial pseudo-counts for isoforms in the | |
968 corresponding partition for Gibbs sampling. | |
969 | |
970 OUTPUT | |
971 sample_name.isoforms.results | |
972 File containing isoform level expression estimates. The first line | |
973 contains column names separated by the tab character. The format of | |
974 each line in the rest of this file is: | |
975 | |
976 transcript_id gene_id length effective_length expected_count TPM | |
977 FPKM IsoPct [posterior_mean_count | |
978 posterior_standard_deviation_of_count pme_TPM pme_FPKM | |
979 IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound | |
980 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound | |
981 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] | |
982 | |
983 Fields are separated by the tab character. Fields within "[]" are | |
984 optional. They will not be presented if neither '--calc-pme' nor | |
985 '--calc-ci' is set. | |
986 | |
987 'transcript_id' is the transcript name of this transcript. 'gene_id' | |
988 is the gene name of the gene which this transcript belongs to | |
989 (denote this gene as its parent gene). If no gene information is | |
990 provided, 'gene_id' and 'transcript_id' are the same. | |
991 | |
992 'length' is this transcript's sequence length (poly(A) tail is not | |
993 counted). 'effective_length' counts only the positions that can | |
994 generate a valid fragment. If no poly(A) tail is added, | |
995 'effective_length' is equal to transcript length - mean fragment | |
996 length + 1. If one transcript's effective length is less than 1, | |
997 both this transcript's effective length and abundance estimates are | |
998 set to 0. | |
999 | |
1000 'expected_count' is the sum of the posterior probabilities that each | |
1001 read comes from this transcript over all reads. Because 1) each read | |
1002 aligning to this transcript has a probability of being generated | |
1003 from background noise; 2) RSEM may filter some alignable low quality | |
1004 reads, the sum of expected counts for all transcripts are generally | |
1005 less than the total number of reads aligned. | |
1006 | |
1007 'TPM' stands for Transcripts Per Million. It is a relative measure | |
1008 of transcript abundance. The sum of all transcripts' TPM is 1 | |
1009 million. 'FPKM' stands for Fragments Per Kilobase of transcript per | |
1010 Million mapped reads. It is another relative measure of transcript | |
1011 abundance. If we define l_bar to be the mean transcript length in a | |
1012 sample, which can be calculated as | |
1013 | |
1014 l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through | |
1015 every transcript), | |
1016 | |
1017 the following equation holds: | |
1018 | |
1019 FPKM_i = 10^3 / l_bar * TPM_i. | |
1020 | |
1021 We can see that the sum of FPKM is not a constant across samples. | |
1022 | |
1023 'IsoPct' stands for isoform percentage. It is the percentage of this | |
1024 transcript's abundance over its parent gene's abundance. If its | |
1025 parent gene has only one isoform or the gene information is not | |
1026 provided, this field will be set to 100. | |
1027 | |
1028 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean | |
1029 estimates calculated by RSEM's Gibbs sampler. | |
1030 'posterior_standard_deviation_of_count' is the posterior standard | |
1031 deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage | |
1032 calculated from 'pme_TPM' values. | |
1033 | |
1034 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' | |
1035 and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% | |
1036 credibility intervals for TPM and FPKM values. The bounds are | |
1037 inclusive (i.e. [l, u]). | |
1038 | |
1039 'TPM_coefficient_of_quartile_variation' and | |
1040 'FPKM_coefficient_of_quartile_variation' are coefficients of | |
1041 quartile variation (CQV) for TPM and FPKM values. CQV is a robust | |
1042 way of measuring the ratio between the standard deviation and the | |
1043 mean. It is defined as | |
1044 | |
1045 CQV := (Q3 - Q1) / (Q3 + Q1), | |
1046 | |
1047 where Q1 and Q3 are the first and third quartiles. | |
1048 | |
1049 sample_name.genes.results | |
1050 File containing gene level expression estimates. The first line | |
1051 contains column names separated by the tab character. The format of | |
1052 each line in the rest of this file is: | |
1053 | |
1054 gene_id transcript_id(s) length effective_length expected_count TPM | |
1055 FPKM [posterior_mean_count posterior_standard_deviation_of_count | |
1056 pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound | |
1057 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound | |
1058 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] | |
1059 | |
1060 Fields are separated by the tab character. Fields within "[]" are | |
1061 optional. They will not be presented if neither '--calc-pme' nor | |
1062 '--calc-ci' is set. | |
1063 | |
1064 'transcript_id(s)' is a comma-separated list of transcript_ids | |
1065 belonging to this gene. If no gene information is provided, | |
1066 'gene_id' and 'transcript_id(s)' are identical (the | |
1067 'transcript_id'). | |
1068 | |
1069 A gene's 'length' and 'effective_length' are defined as the weighted | |
1070 average of its transcripts' lengths and effective lengths (weighted | |
1071 by 'IsoPct'). A gene's abundance estimates are just the sum of its | |
1072 transcripts' abundance estimates. | |
1073 | |
1074 sample_name.alleles.results | |
1075 Only generated when the RSEM references are built with | |
1076 allele-specific transcripts. | |
1077 | |
1078 This file contains allele level expression estimates for | |
1079 allele-specific expression calculation. The first line contains | |
1080 column names separated by the tab character. The format of each line | |
1081 in the rest of this file is: | |
1082 | |
1083 allele_id transcript_id gene_id length effective_length | |
1084 expected_count TPM FPKM AlleleIsoPct AlleleGenePct | |
1085 [posterior_mean_count posterior_standard_deviation_of_count pme_TPM | |
1086 pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM | |
1087 TPM_ci_lower_bound TPM_ci_upper_bound | |
1088 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound | |
1089 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] | |
1090 | |
1091 Fields are separated by the tab character. Fields within "[]" are | |
1092 optional. They will not be presented if neither '--calc-pme' nor | |
1093 '--calc-ci' is set. | |
1094 | |
1095 'allele_id' is the allele-specific name of this allele-specific | |
1096 transcript. | |
1097 | |
1098 'AlleleIsoPct' stands for allele-specific percentage on isoform | |
1099 level. It is the percentage of this allele-specific transcript's | |
1100 abundance over its parent transcript's abundance. If its parent | |
1101 transcript has only one allele variant form, this field will be set | |
1102 to 100. | |
1103 | |
1104 'AlleleGenePct' stands for allele-specific percentage on gene level. | |
1105 It is the percentage of this allele-specific transcript's abundance | |
1106 over its parent gene's abundance. | |
1107 | |
1108 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have | |
1109 similar meanings. They are calculated based on posterior mean | |
1110 estimates. | |
1111 | |
1112 Please note that if this file is present, the fields 'length' and | |
1113 'effective_length' in 'sample_name.isoforms.results' should be | |
1114 interpreted similarly as the corresponding definitions in | |
1115 'sample_name.genes.results'. | |
1116 | |
1117 sample_name.transcript.bam | |
1118 Only generated when --no-bam-output is not specified. | |
1119 | |
1120 'sample_name.transcript.bam' is a BAM-formatted file of read | |
1121 alignments in transcript coordinates. The MAPQ field of each | |
1122 alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), | |
1123 where w is the posterior probability of that alignment being the | |
1124 true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, | |
1125 where value is a single precision floating number representing the | |
1126 posterior probability. Because this file contains all alignment | |
1127 lines produced by bowtie or user-specified aligners, it can also be | |
1128 used as a replacement of the aligner generated BAM/SAM file. | |
1129 | |
1130 sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai | |
1131 Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified. | |
1132 | |
1133 'sample_name.transcript.sorted.bam' and | |
1134 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and | |
1135 indices generated by samtools (included in RSEM package). | |
1136 | |
1137 sample_name.genome.bam | |
1138 Only generated when --no-bam-output is not specified and | |
1139 --output-genome-bam is specified. | |
1140 | |
1141 'sample_name.genome.bam' is a BAM-formatted file of read alignments | |
1142 in genomic coordinates. Alignments of reads that have identical | |
1143 genomic coordinates (i.e., alignments to different isoforms that | |
1144 share the same genomic region) are collapsed into one alignment. The | |
1145 MAPQ field of each alignment is set to min(100, floor(-10 * | |
1146 log10(1.0 - w) + 0.5)), where w is the posterior probability of that | |
1147 alignment being the true mapping of a read. In addition, RSEM pads a | |
1148 new tag ZW:f:value, where value is a single precision floating | |
1149 number representing the posterior probability. If an alignment is | |
1150 spliced, a XS:A:value tag is also added, where value is either '+' | |
1151 or '-' indicating the strand of the transcript it aligns to. | |
1152 | |
1153 sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai | |
1154 Only generated when --no-bam-output is not specified, and | |
1155 --sort-bam-by-coordinate and --output-genome-bam are specified. | |
1156 | |
1157 'sample_name.genome.sorted.bam' and | |
1158 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and | |
1159 indices generated by samtools (included in RSEM package). | |
1160 | |
1161 sample_name.time | |
1162 Only generated when --time is specified. | |
1163 | |
1164 It contains time (in seconds) consumed by aligning reads, estimating | |
1165 expression levels and calculating credibility intervals. | |
1166 | |
1167 sample_name.stat | |
1168 This is a folder instead of a file. All model related statistics are | |
1169 stored in this folder. 'rsem-plot-model' can generate plots | |
1170 using this folder. | |
1171 | |
1172 'sample_name.stat/sample_name.cnt' contains alignment statistics. | |
1173 The format and meanings of each field are described in | |
1174 'cnt_file_description.txt' under RSEM directory. | |
1175 | |
1176 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters | |
1177 learned from the data. The format and meanings of each field of this | |
1178 file are described in 'model_file_description.txt' under RSEM | |
1179 directory. | |
1180 | |
1181 The following four output files will be generated only by | |
1182 prior-enhanced RSEM | |
1183 | |
1184 - 'sample_name.stat/sample_name_prsem.all_tr_features' | |
1185 It stores isoform features for deriving and assigning pRSEM prior. | |
1186 The first line is a header and the rest is one isoform per line. | |
1187 The description for each column is: | |
1188 | |
1189 * trid: transcript ID from input annotation | |
1190 | |
1191 * geneid: gene ID from input annotation | |
1192 | |
1193 * chrom: isoform's chromosome name | |
1194 | |
1195 * strand: isoform's strand name | |
1196 | |
1197 * start: isoform's end with the lowest genomic loci | |
1198 | |
1199 * end: isoform's end with the highest genomic loci | |
1200 | |
1201 * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where | |
1202 TSS is isoform's transcription start site, i.e. 5'-end | |
1203 | |
1204 * body_mpp: average mappability of (TSS+500bp, TES-500bp), where | |
1205 TES is isoform's transcription end site, i.e. 3'-end | |
1206 | |
1207 * tes_mpp: average mappability of [TES-500bp, TES+500bp] | |
1208 | |
1209 * pme_count: isoform's fragment or read count from RSEM's | |
1210 posterior mean estimates | |
1211 | |
1212 * tss: isoform's TSS loci | |
1213 | |
1214 * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region | |
1215 overlaps with a RNA Pol II peak; 0 otherwise | |
1216 | |
1217 * is_training: equal to 1 if isoform is in the training set where | |
1218 Pol II prior is learned; 0 otherwise | |
1219 | |
1220 - 'sample_name.stat/sample_name_prsem.all_tr_prior' | |
1221 It stores prior parameters for every isoform. This file does not | |
1222 have a header. Each line contains a prior parameter and an | |
1223 isoform's transcript ID delimited by " # ". | |
1224 | |
1225 - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results' | |
1226 RSEM's posterior mean estimates on the isoform level with an | |
1227 initial pseudo-count of one for every isoform. It is in the same | |
1228 format as the 'sample_name.isoforms.results'. | |
1229 | |
1230 - 'sample_name.stat/sample_name_uniform_prior_1.genes.results' | |
1231 RSEM's posterior mean estimates on the gene level with an initial | |
1232 pseudo-count of one for every isoform. It is in the same format as | |
1233 the 'sample_name.genes.results'. | |
1234 | |
1235 When learning prior from multiple external data sets in | |
1236 prior-enhanced RSEM, two additional output files will be generated. | |
1237 | |
1238 - 'sample_name.stat/sample_name.pval_LL' | |
1239 It stores a p-value and a log-likelihood. The p-value indicates | |
1240 whether the combination of multiple complementary data sets is | |
1241 informative for RNA-seq quantification. The log-likelihood shows | |
1242 how well pRSEM's Dirichlet-multinomial model fits the read counts | |
1243 of partitioned training set isoforms. | |
1244 | |
1245 - 'sample_name.stat/sample_name.lgt_mdl.RData' | |
1246 It stores an R object named 'glmmdl', which is a logistic | |
1247 regression model on the training set isoforms and multiple | |
1248 external data sets. | |
1249 | |
1250 In addition, extra columns will be added to | |
1251 'sample_name.stat/all_tr_features' | |
1252 | |
1253 * is_expr: equal to 1 if isoform has an abundance >= 1 TPM and a | |
1254 non-zero read count from RSEM's posterior mean estimates; 0 | |
1255 otherwise | |
1256 | |
1257 * "$external_data_set_basename": log10 of external data's signal at | |
1258 [TSS-500, TSS+500]. Signal is the number of reads aligned within | |
1259 that interval and normalized to RPKM by read depth and interval | |
1260 length. It will be set to -4 if no read aligned to that interval. | |
1261 | |
1262 There are multiple columns like this one, where each represents an | |
1263 external data set. | |
1264 | |
1265 * prd_expr_prob: predicted probability from logistic regression | |
1266 model on whether this isoform is expressed or not. A probability | |
1267 higher than 0.5 is considered as expressed | |
1268 | |
1269 * partition: group index, to which this isoform is partitioned | |
1270 | |
1271 * prior: prior parameter for this isoform | |
1272 | |
1273 EXAMPLES | |
1274 Assume the path to the bowtie executables is in the user's PATH | |
1275 environment variable. Reference files are under '/ref' with name | |
1276 'mouse_125'. | |
1277 | |
1278 1) '/data/mmliver.fq', single-end reads with quality scores. Quality | |
1279 scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 | |
1280 threads and generate a genome BAM file. In addition, we want to append | |
1281 gene/transcript names to the result files: | |
1282 | |
1283 rsem-calculate-expression --phred64-quals \ | |
1284 -p 8 \ | |
1285 --append-names \ | |
1286 --output-genome-bam \ | |
1287 /data/mmliver.fq \ | |
1288 /ref/mouse_125 \ | |
1289 mmliver_single_quals | |
1290 | |
1291 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end | |
1292 reads with quality scores. Suppose the library is prepared using TruSeq | |
1293 Stranded Kit, which means the first mate should map to the reverse | |
1294 strand. Quality scores are in SANGER format. We want to use 8 threads | |
1295 and do not generate a genome BAM file: | |
1296 | |
1297 rsem-calculate-expression -p 8 \ | |
1298 --paired-end \ | |
1299 --strandedness reverse \ | |
1300 /data/mmliver_1.fq \ | |
1301 /data/mmliver_2.fq \ | |
1302 /ref/mouse_125 \ | |
1303 mmliver_paired_end_quals | |
1304 | |
1305 3) '/data/mmliver.fa', single-end reads without quality scores. We want | |
1306 to use 8 threads: | |
1307 | |
1308 rsem-calculate-expression -p 8 \ | |
1309 --no-qualities \ | |
1310 /data/mmliver.fa \ | |
1311 /ref/mouse_125 \ | |
1312 mmliver_single_without_quals | |
1313 | |
1314 4) Data are the same as 1). This time we assume the bowtie executables | |
1315 are under '/sw/bowtie'. We want to take a fragment length distribution | |
1316 into consideration. We set the fragment length mean to 150 and the | |
1317 standard deviation to 35. In addition to a BAM file, we also want to | |
1318 generate credibility intervals. We allow RSEM to use 1GB of memory for | |
1319 CI calculation: | |
1320 | |
1321 rsem-calculate-expression --bowtie-path /sw/bowtie \ | |
1322 --phred64-quals \ | |
1323 --fragment-length-mean 150.0 \ | |
1324 --fragment-length-sd 35.0 \ | |
1325 -p 8 \ | |
1326 --output-genome-bam \ | |
1327 --calc-ci \ | |
1328 --ci-memory 1024 \ | |
1329 /data/mmliver.fq \ | |
1330 /ref/mouse_125 \ | |
1331 mmliver_single_quals | |
1332 | |
1333 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for | |
1334 paired-end reads with quality scores. We want to use 8 threads: | |
1335 | |
1336 rsem-calculate-expression --paired-end \ | |
1337 --alignments \ | |
1338 -p 8 \ | |
1339 /data/mmliver_paired_end_quals.bam \ | |
1340 /ref/mouse_125 \ | |
1341 mmliver_paired_end_quals | |
1342 | |
1343 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads | |
1344 with quality scores and read files are compressed by gzip. We want to | |
1345 use STAR to align reads and assume STAR executable is '/sw/STAR'. | |
1346 Suppose we want to use 8 threads and do not generate a genome BAM file: | |
1347 | |
1348 rsem-calculate-expression --paired-end \ | |
1349 --star \ | |
1350 --star-path /sw/STAR \ | |
1351 --gzipped-read-file \ | |
1352 --paired-end \ | |
1353 -p 8 \ | |
1354 /data/mmliver_1.fq.gz \ | |
1355 /data/mmliver_2.fq.gz \ | |
1356 /ref/mouse_125 \ | |
1357 mmliver_paired_end_quals | |
1358 | |
1359 7) In the above example, suppose we want to run prior-enhanced RSEM | |
1360 instead. Assuming we want to learn priors from a ChIP-seq peak file | |
1361 '/data/mmliver.narrowPeak.gz': | |
1362 | |
1363 rsem-calculate-expression --star \ | |
1364 --star-path /sw/STAR \ | |
1365 --gzipped-read-file \ | |
1366 --paired-end \ | |
1367 --calc-pme \ | |
1368 --run-pRSEM \ | |
1369 --chipseq-peak-file /data/mmliver.narrowPeak.gz \ | |
1370 -p 8 \ | |
1371 /data/mmliver_1.fq.gz \ | |
1372 /data/mmliver_2.fq.gz \ | |
1373 /ref/mouse_125 \ | |
1374 mmliver_paired_end_quals | |
1375 | |
1376 8) Similar to the example in 7), suppose we want to use the partition | |
1377 model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and | |
1378 then partitioning 'no TSS peak' isoforms into two bins by a linear | |
1379 regression model), and we want to partition isoforms by RNA Pol II's | |
1380 ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and | |
1381 '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read file | |
1382 '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables | |
1383 are under '/sw/bowtie/': | |
1384 | |
1385 rsem-calculate-expression --star \ | |
1386 --star-path /sw/STAR \ | |
1387 --gzipped-read-file \ | |
1388 --paired-end \ | |
1389 --calc-pme \ | |
1390 --run-pRSEM \ | |
1391 --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \ | |
1392 --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \ | |
1393 --partition-model pk_lm2nopk \ | |
1394 --bowtie-path /sw/bowtie \ | |
1395 -p 8 \ | |
1396 /data/mmliver_1.fq.gz \ | |
1397 /data/mmliver_2.fq.gz \ | |
1398 /ref/mouse_125 \ | |
1399 mmliver_paired_end_quals | |
1400 | |
1401 9) Similar to the example in 8), suppose we want to derive priors from | |
1402 four histone modification ChIP-seq read data sets: | |
1403 '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', | |
1404 '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming | |
1405 Bowtie's executables are under '/sw/bowtie/': | |
1406 | |
1407 rsem-calculate-expression --star \ | |
1408 --star-path /sw/STAR \ | |
1409 --gzipped-read-file \ | |
1410 --paired-end \ | |
1411 --calc-pme \ | |
1412 --run-pRSEM \ | |
1413 --partition-model cmb_lgt \ | |
1414 --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \ | |
1415 --bowtie-path /sw/bowtie \ | |
1416 -p 8 \ | |
1417 /data/mmliver_1.fq.gz \ | |
1418 /data/mmliver_2.fq.gz \ | |
1419 /ref/mouse_125 \ | |
1420 mmliver_paired_end_quals | |
1421 | |
1422 </help> | |
1423 <citations> | |
1424 <citation type="doi">10.1186/1471-2105-12-323</citation> | |
1425 </citations> | |
1426 | |
1427 </tool> |