comparison arriba.xml @ 11:8ed8af5836d1 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/arriba commit e0aa03add09ecc4ad5a5d41c439b8af9551fc53c"
author jjohnson
date Tue, 26 Apr 2022 20:21:29 +0000
parents c58d1774c762
children 73fd7703a743
comparison
equal deleted inserted replaced
10:c58d1774c762 11:8ed8af5836d1
1 <tool id="arriba" name="Arriba" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5"> 1 <tool id="arriba" name="Arriba" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
2 <description>detect gene fusions from STAR aligned RNA-Seq data</description> 2 <description>detect gene fusions from STAR aligned RNA-Seq data</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 <xml name="fusion_actions">
6 <actions>
7 <action name="comment_lines" type="metadata" default="1" />
8 <action name="column_names" type="metadata" default="gene1,gene2,strand1(gene/fusion),strand2(gene/fusion),breakpoint1,breakpoint2,site1,site2,type,split_reads1,split_reads2,discordant_mates,coverage1,coverage2,confidence,reading_frame,tags,retained_protein_domains,closest_genomic_breakpoint1,closest_genomic_breakpoint2,gene_id1,gene_id2,transcript_id1,transcript_id2,direction1,direction2,filters,fusion_transcript,peptide_sequence,read_identifiers" />
9 </actions>
10 </xml>
5 </macros> 11 </macros>
6 <expand macro="requirements" /> 12 <expand macro="requirements" />
7 <expand macro="version_command" /> 13 <expand macro="version_command" />
8 <command detect_errors="exit_code"><![CDATA[ 14 <command detect_errors="exit_code"><![CDATA[
9 @GENOME_SOURCE@ 15 @GENOME_SOURCE@
37 #if str($input_params.index.index_source) == "history" 43 #if str($input_params.index.index_source) == "history"
38 #set $star_index_dir = $input_params.index.star_index.extra_files_path 44 #set $star_index_dir = $input_params.index.star_index.extra_files_path
39 #else 45 #else
40 #set $star_index_dir = $input_params.index.arriba_ref.fields.star_index 46 #set $star_index_dir = $input_params.index.arriba_ref.fields.star_index
41 #end if 47 #end if
48 #if $blacklist
49 #if $blacklist.is_of_type('tabular.gz')
50 #set $blacklist_file = 'blacklist.tsv.gz'
51 ln -sf '$blacklist' $blacklist_file &&
52 #else
53 #set $blacklist_file = $blacklist
54 #end if
55 #end if
56 #if $known_fusions
57 #if $known_fusions.is_of_type('tabular.gz')
58 #set $known_fusions_file = 'known_fusions.tsv.gz'
59 ln -sf '$known_fusions' $known_fusions_file &&
60 #else
61 #set $known_fusions_file = $known_fusions
62 #end if
63 #end if
64 #if $tags
65 #if $tags.is_of_type('tabular.gz')
66 #set $tags_file = 'tags.tsv.gz'
67 ln -sf '$tags' $tags_file &&
68 #else
69 #set $tags_file = $tags
70 #end if
71 #end if
72
42 STAR 73 STAR
43 --runThreadN \${GALAXY_SLOTS:-1} 74 --runThreadN \${GALAXY_SLOTS:-1}
44 --genomeDir $star_index_dir 75 --genomeDir $star_index_dir
45 --genomeLoad NoSharedMemory 76 --genomeLoad NoSharedMemory
46 --readFilesIn $read1 $read2 77 --readFilesIn $read1 $read2
72 #end if 103 #end if
73 #end if 104 #end if
74 -a '$genome_assembly' 105 -a '$genome_assembly'
75 -g '$genome_annotation' 106 -g '$genome_annotation'
76 #if $blacklist 107 #if $blacklist
77 -b '$blacklist' 108 -b '$blacklist_file'
78 #else 109 #else
79 -f 'blacklist' 110 -f 'blacklist'
80 #end if 111 #end if
81 #if $protein_domains 112 #if $protein_domains
82 -p '$protein_domains' 113 -p '$protein_domains'
83 #end if 114 #end if
84 #if $known_fusions 115 #if $known_fusions
85 -k '$known_fusions' 116 -k '$known_fusions_file'
86 #end if 117 #end if
87 #if $tags 118 #if $tags
88 -t '$tags' 119 -t '$tags_file'
89 #end if 120 #end if
90 #if str($wgs.use_wgs) == "yes" 121 #if str($wgs.use_wgs) == "yes"
91 -d '$wgs.wgs' 122 -d '$wgs.wgs'
92 #if $wgs.max_genomic_breakpoint_distance 123 #if $wgs.max_genomic_breakpoint_distance
93 -D $wgs.max_genomic_breakpoint_distance 124 -D $wgs.max_genomic_breakpoint_distance
175 && samtools index Aligned.sortedByCoord.out.bam 206 && samtools index Aligned.sortedByCoord.out.bam
176 #elif str($visualization.do_viz) == "yes" 207 #elif str($visualization.do_viz) == "yes"
177 && samtools sort -@ \${GALAXY_SLOTS:-1} -m 4G -T tmp -O bam '$input_params.input' > Aligned.sortedByCoord.out.bam 208 && samtools sort -@ \${GALAXY_SLOTS:-1} -m 4G -T tmp -O bam '$input_params.input' > Aligned.sortedByCoord.out.bam
178 && samtools index Aligned.sortedByCoord.out.bam 209 && samtools index Aligned.sortedByCoord.out.bam
179 #end if 210 #end if
211 #if $output_fusions_vcf
212 && convert_fusions_to_vcf.sh '$genome_assembly' fusions.tsv fusions.vcf
213 #end if
214 #if $output_fusion_bams
215 && mkdir fusion_bams
216 && extract_fusion-supporting_alignments.sh fusions.tsv Aligned.sortedByCoord.out.bam 'fusion_bams/fusion'
217 #end if
180 #if str($visualization.do_viz) == "yes" 218 #if str($visualization.do_viz) == "yes"
181 #set $fusions = 'fusions.tsv' 219 #set $fusions = 'fusions.tsv'
182 && @DRAW_FUSIONS@ 220 && @DRAW_FUSIONS@
183 #end if 221 #end if
184 ]]></command> 222 ]]></command>
185 <inputs> 223 <inputs>
186 <conditional name="input_params"> 224 <conditional name="input_params">
187 <param name="input_source" type="select" label="Use output from earlier STAR run or let Arriba running STAR"> 225 <param name="input_source" type="select" label="Use output from earlier STAR run or let Arriba running STAR">
188 <option value="use_star">Use output from earlier STAR</option> 226 <option value="use_star">Use output from earlier STAR</option>
189 <option value="use_fastq">Let Arriba control running STAR</option> 227 <option value="use_fastq">Let Arriba control running STAR</option>
190 </param> 228 </param>
191 <when value="use_star"> 229 <when value="use_star">
192 <param name="input" argument="-x" type="data" format="sam,bam,cram" label="STAR Aligned.out.sam"/> 230 <param name="input" argument="-x" type="data" format="sam,bam,cram" label="STAR Aligned.out.sam">
231 <help><![CDATA[ recommended STAR options: --chimSegmentMin 10 --chimOutType WithinBAM ]]></help>
232 </param>
193 <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam"> 233 <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam">
194 <help><![CDATA[ only required, if STAR was run with the parameter '--chimOutType SeparateSAMold' ]]></help> 234 <help><![CDATA[ only required, if STAR was run with the parameter '--chimOutType SeparateSAMold' ]]></help>
195 </param> 235 </param>
196 </when> 236 </when>
197 <when value="use_fastq"> 237 <when value="use_fastq">
421 (denoted as '...'), fill the gaps using the assembly sequence wherever possible. 461 (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
422 </help> 462 </help>
423 </param> 463 </param>
424 </section> 464 </section>
425 <param name="output_fusions_discarded" argument="-O" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.discarded.tsv"/> 465 <param name="output_fusions_discarded" argument="-O" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.discarded.tsv"/>
466 <param name="output_fusions_vcf" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="Output fusions.vcf"/>
467 <param name="output_fusion_bams" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output fusion BAMs"/>
426 <conditional name="visualization"> 468 <conditional name="visualization">
427 <param name="do_viz" type="select" label="Generate visualization"> 469 <param name="do_viz" type="select" label="Generate visualization">
428 <option value="yes">Yes</option> 470 <option value="yes">Yes</option>
429 <option value="no">no</option> 471 <option value="no">no</option>
430 </param> 472 </param>
431 <when value="yes"> 473 <when value="yes">
432 <expand macro="visualization_options" /> 474 <expand macro="visualization_options" />
433 </when> 475 </when>
434 <when value="no"/> 476 <when value="no"/>
435 </conditional> 477 </conditional>
436
437 </inputs> 478 </inputs>
438 <outputs> 479 <outputs>
439 <data name="fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv"/> 480 <data name="fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv">
481 <expand macro="fusion_actions" />
482 </data>
483
440 <data name="discarded_fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.discarded.tsv" from_work_dir="fusions.discarded.tsv"> 484 <data name="discarded_fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.discarded.tsv" from_work_dir="fusions.discarded.tsv">
441 <filter> output_fusions_discarded == True</filter> 485 <filter> output_fusions_discarded == True</filter>
486 <expand macro="fusion_actions" />
442 </data> 487 </data>
488 <data name="fusions_vcf" format="vcf" label="${tool.name} on ${on_string}: fusions.vcf" from_work_dir="fusions.vcf">
489 <filter> output_fusions_vcf == True</filter>
490 </data>
491 <collection name="fusion_bams" type="list" label="${tool.name} on ${on_string}: Fusion Alignments">
492 <discover_datasets pattern="(?P&lt;name&gt;fusion_\d+\.bam)$" format="bam" directory="fusion_bams" visible="false"/>
493 <filter>output_fusion_bams == True</filter>
494 </collection>
443 <data name="aligned_bam" format="bam" label="${tool.name} on ${on_string}: Aligned.bam" from_work_dir="Aligned.sortedByCoord.out.bam"> 495 <data name="aligned_bam" format="bam" label="${tool.name} on ${on_string}: Aligned.bam" from_work_dir="Aligned.sortedByCoord.out.bam">
444 <filter>input_params['input_source'] == "use_fastq"</filter> 496 <filter>input_params['input_source'] == "use_fastq"</filter>
445 </data> 497 </data>
446 <data name="fusions_pdf" format="pdf" label="${tool.name} on ${on_string}: fusions.pdf" from_work_dir="fusions.pdf"> 498 <data name="fusions_pdf" format="pdf" label="${tool.name} on ${on_string}: fusions.pdf" from_work_dir="fusions.pdf">
447 <filter>visualization['do_viz'] == "yes"</filter> 499 <filter>visualization['do_viz'] == "yes"</filter>
469 <has_text_matching expression="BCR\tABL1"/> 521 <has_text_matching expression="BCR\tABL1"/>
470 </assert_contents> 522 </assert_contents>
471 </output> 523 </output>
472 </test> 524 </test>
473 <!-- Test 2 - From existing BAM with protein_domains and visualization --> 525 <!-- Test 2 - From existing BAM with protein_domains and visualization -->
474
475 <test> 526 <test>
476 <conditional name="input_params"> 527 <conditional name="input_params">
477 <param name="input_source" value="use_star"/> 528 <param name="input_source" value="use_star"/>
478 <param name="input" ftype="sam" value="Aligned.out.sam"/> 529 <param name="input" ftype="sam" value="Aligned.out.sam"/>
479 </conditional> 530 </conditional>
535 586
536 - Alignments 587 - Alignments
537 588
538 Arriba takes the main output file of STAR (Aligned.out.bam) as input (parameter -x). If STAR was run with the parameter --chimOutType WithinBAM, then this file contains all the information needed by Arriba to find fusions. When STAR was run with the parameter --chimOutType SeparateSAMold, the main output file lacks chimeric alignments. Instead, STAR writes them to a separate output file named Chimeric.out.sam. In this case, the file needs to be passed to Arriba via the parameter -c in addition to the main output file Aligned.out.bam. 589 Arriba takes the main output file of STAR (Aligned.out.bam) as input (parameter -x). If STAR was run with the parameter --chimOutType WithinBAM, then this file contains all the information needed by Arriba to find fusions. When STAR was run with the parameter --chimOutType SeparateSAMold, the main output file lacks chimeric alignments. Instead, STAR writes them to a separate output file named Chimeric.out.sam. In this case, the file needs to be passed to Arriba via the parameter -c in addition to the main output file Aligned.out.bam.
539 590
591 Recommended parameter value when creating the STAR index:
592
593 * --sjdbOverhang 250
594
595
596 Recommended STAR parameter values:
597
598 * --outSAMunmapped Within
599 * --outFilterMultimapNmax 50
600 * --peOverlapNbasesMin 10
601 * --alignSplicedMateMapLminOverLmate 0.5
602 * --alignSJstitchMismatchNmax 5 -1 5 5
603 * --chimSegmentMin 10
604 * --chimOutType WithinBAM HardClip
605 * --chimJunctionOverhangMin 10
606 * --chimScoreDropMax 30
607 * --chimScoreJunctionNonGTAG 0
608 * --chimScoreSeparation 1
609 * --chimSegmentReadGapMax 3
610 * --chimMultimapNmax 50
611
612
540 Arriba extracts three types of reads from the alignment file(s): 613 Arriba extracts three types of reads from the alignment file(s):
541 614
542 * Split-reads, i.e., reads composed of segments which map in a non-linear way. STAR stores such reads as supplementary alignments. 615 * Split-reads, i.e., reads composed of segments which map in a non-linear way. STAR stores such reads as supplementary alignments.
543 * Discordant mates, i.e., paired-end reads which originate from the same fragment but which align in a non-linear way. 616 * Discordant mates, i.e., paired-end reads which originate from the same fragment but which align in a non-linear way.
544 * Alignments which cross the boundaries of annotated genes, because these alignments might arise from focal deletions. In RNA-Seq data deletions of up to several hundred kb are hard to distinguish from splicing. They are represented identically as gapped alignments, because the sizes of many introns are in fact of this order of magnitude. STAR applies a rather arbitrary measure to decide whether a gapped alignment arises from splicing or from a genomic deletion: The parameter --alignIntronMax determines what gap size is still assumed to be a splicing event and introns are used to represent these gaps. Only gaps larger than this limit are classified as potential evidence for genomic deletions and are stored as chimeric alignments. Most STAR-based fusion detection tools only consider chimeric alignments as evidence for gene fusions and are blind to focal deletions, hence. As a workaround, these tools recommend reducing the value of the parameter --alignIntronMax. But this impairs the quality of alignment, because it reduces the scope that STAR searches to find a spliced alignment. To avoid compromising the quality of alignment for the sake of fusion detection, the only solution would be to run STAR twice - once with settings optimized for regular alignment and once for fusion detection. This would double the runtime. In contrast, Arriba does not require to reduce the maximum intron size. It employs a more sensible criterion to distinguish splicing from deletions: Arriba considers all those reads as potential evidence for deletions that span the boundary of annotated genes. 617 * Alignments which cross the boundaries of annotated genes, because these alignments might arise from focal deletions. In RNA-Seq data deletions of up to several hundred kb are hard to distinguish from splicing. They are represented identically as gapped alignments, because the sizes of many introns are in fact of this order of magnitude. 
STAR applies a rather arbitrary measure to decide whether a gapped alignment arises from splicing or from a genomic deletion: The parameter --alignIntronMax determines what gap size is still assumed to be a splicing event and introns are used to represent these gaps. Only gaps larger than this limit are classified as potential evidence for genomic deletions and are stored as chimeric alignments. Most STAR-based fusion detection tools only consider chimeric alignments as evidence for gene fusions and are hence blind to focal deletions. As a workaround, these tools recommend reducing the value of the parameter --alignIntronMax. But this impairs the quality of alignment, because it reduces the scope that STAR searches to find a spliced alignment. To avoid compromising the quality of alignment for the sake of fusion detection, the only solution would be to run STAR twice - once with settings optimized for regular alignment and once for fusion detection. This would double the runtime. In contrast, Arriba does not require reducing the maximum intron size. It employs a more sensible criterion to distinguish splicing from deletions: Arriba considers all reads that span the boundary of annotated genes as potential evidence for deletions.