comparison arriba.xml @ 9:8c4c97fd0555 draft

"planemo upload for repository https://github.com/jj-umn/tools-iuc/tree/arriba/tools/arriba commit bd2c6bea7cb7dc30ca57f9d69ad49460ddf7f14b"
author jjohnson
date Wed, 13 Oct 2021 18:45:16 +0000
parents 1a56888ddb7d
children c58d1774c762
comparison
equal deleted inserted replaced
8:1a56888ddb7d 9:8c4c97fd0555
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements" /> 6 <expand macro="requirements" />
7 <expand macro="version_command" /> 7 <expand macro="version_command" />
8 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
9 @GENOME_SOURCE@
9 #if str($input_params.input_source) == "use_fastq" 10 #if str($input_params.input_source) == "use_fastq"
10 #set $readFilesCommand = '' 11 #set $readFilesCommand = ''
11 #if $input_params.left_fq.is_of_type("fastq.gz"): 12 #set $read2 = ''
12 #set read1 = 'input_1.fastq.gz' 13 #if str($input_params.singlePaired.sPaired) == 'paired_collection':
13 #set $readFilesCommand = '--readFilesCommand zcat' 14 #if $input_params.singlePaired.input.forward.is_of_type('fastq.gz', 'fastqsanger.gz'):
14 #else: 15 #set $readFilesCommand = '--readFilesCommand zcat'
15 #set read1 = 'input_1.fastq' 16 #set read1 = 'input_1.fastq.gz'
16 #end if 17 #set read2 = 'input_2.fastq.gz'
17 ln -f -s '${input_params.left_fq}' ${read1} && 18 #else
18 #if $input_params.right_fq.is_of_type("fastq.gz"): 19 #set read1 = 'input_1.fastq'
19 #set read2 = 'input_2.fastq.gz' 20 #set read2 = 'input_2.fastq'
20 #else: 21 #end if
21 #set read2 = 'input_2.fastq' 22 ln -sf '${$input_params.singlePaired.input.forward}' ${read1} &&
22 #end if 23 ln -sf '${$input_params.singlePaired.input.reverse}' ${read2} &&
23 ln -f -s '${input_params.right_fq}' ${read2} && 24 #else
25 #if $input_params.singlePaired.input1.is_of_type('fastq.gz', 'fastqsanger.gz'):
26 #set $readFilesCommand = '--readFilesCommand zcat'
27 #set read1 = 'input_1.fastq.gz'
28 #else
29 #set read1 = 'input_1.fastq'
30 #end if
31 ln -sf '$input_params.singlePaired.input1' ${read1} &&
32 #if str($input_params.singlePaired.sPaired) == 'paired':
33 #set $read2 = $read1.replace('1','2')
34 ln -sf '$input_params.singlePaired.input2' ${read2} &&
35 #end if
36 #end if
24 #if str($input_params.index.index_source) == "history" 37 #if str($input_params.index.index_source) == "history"
25 #set $star_index_dir = $input_params.index.star_index.extra_files_path 38 #set $star_index_dir = $input_params.index.star_index.extra_files_path
39 #else
40 #set $star_index_dir = $input_params.index.arriba_ref.fields.star_index
26 #end if 41 #end if
27 STAR 42 STAR
28 --runThreadN \${GALAXY_SLOTS:-1} 43 --runThreadN \${GALAXY_SLOTS:-1}
29 --genomeDir $star_index_dir 44 --genomeDir $star_index_dir
30 --genomeLoad NoSharedMemory 45 --genomeLoad NoSharedMemory
55 #if $input_params.chimeric 70 #if $input_params.chimeric
56 -c '$input_params.chimeric' 71 -c '$input_params.chimeric'
57 #end if 72 #end if
58 #end if 73 #end if
59 -a '$genome_assembly' 74 -a '$genome_assembly'
60 -g '$annotation' 75 -g '$genome_annotation'
61 #if $blacklist 76 #if $blacklist
62 -b '$blacklist' 77 -b '$blacklist'
63 #else 78 #else
64 -f 'blacklist' 79 -f 'blacklist'
65 #end if 80 #end if
172 <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam"> 187 <param name="chimeric" argument="-c" type="data" format="sam,bam,cram" optional="true" label="STAR Chimeric.out.sam">
173 <help><![CDATA[ only required, if STAR was run with the parameter '--chimOutType SeparateSAMold' ]]></help> 188 <help><![CDATA[ only required, if STAR was run with the parameter '--chimOutType SeparateSAMold' ]]></help>
174 </param> 189 </param>
175 </when> 190 </when>
176 <when value="use_fastq"> 191 <when value="use_fastq">
177 <param name="left_fq" 192 <conditional name="singlePaired">
178 type="data" 193 <param name="sPaired" type="select" label="Single-end or paired-end reads">
179 format="fastqsanger,fastqsanger.gz" 194 <option value="single" selected="true">Single-end</option>
180 argument="--left_fq" 195 <option value="paired">Paired-end (as individual datasets)</option>
181 label="left.fq file"/> 196 <option value="paired_collection">Paired-end (as collection)</option>
182 <param name="right_fq" 197 </param>
183 type="data" 198 <when value="single">
184 format="fastqsanger,fastqsanger.gz" 199 <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data" label="RNA-Seq FASTQ/FASTA file"/>
185 argument="--right_fq" 200 </when>
186 label="right.fq file"/> 201 <when value="paired">
202 <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data" label="RNA-Seq FASTQ/FASTA file, forward reads"/>
203 <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data" label="RNA-Seq FASTQ/FASTA file, reverse reads"/>
204 </when>
205 <when value="paired_collection">
206 <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input" type="data_collection" collection_type="paired" label="RNA-Seq FASTQ/FASTA paired reads"/>
207 </when>
208 </conditional>
187 <conditional name="index"> 209 <conditional name="index">
188 <param name="index_source" type="select" label="Arriba STAR index source"> 210 <param name="index_source" type="select" label="Arriba STAR index source">
189 <option value="history">Arriba STAR index from your history</option> 211 <option value="history">Arriba STAR index from your history</option>
212 <option value="cached">Use a built-in Arriba STAR index</option>
190 </param> 213 </param>
191 <when value="history"> 214 <when value="history">
192 <param name="star_index" argument="--genomeDir" type="data" format="txt" label="Arriba STAR index" 215 <param name="star_index" argument="--genomeDir" type="data" format="txt" label="Arriba STAR index"
193 help="generated by: Arriba Reference"/> 216 help="generated by: Arriba Reference"/>
194 </when> 217 </when>
218 <when value="cached">
219 <param name="arriba_ref" type="select" label="Arriba STAR index">
220 <options from_data_table="arriba_indexes">
221 </options>
222 </param>
223 </when>
195 </conditional> 224 </conditional>
225
196 </when> 226 </when>
197 </conditional> 227 </conditional>
198 <param name="genome_assembly" argument="-a" type="data" format="fasta" label="genome assembly fasta"/> 228 <expand macro="genome_source" />
199 <param name="annotation" argument="-g" type="data" format="gtf" label="GTF file with gene annotation"/>
200 <param name="blacklist" argument="-b" type="data" format="tabular,tabular.gz" optional="true" label="File containing blacklisted ranges."/> 229 <param name="blacklist" argument="-b" type="data" format="tabular,tabular.gz" optional="true" label="File containing blacklisted ranges."/>
201 <param name="protein_domains" argument="-p" type="data" format="gff3" optional="true" label="File containing protein domains"/> 230 <param name="protein_domains" argument="-p" type="data" format="gff3" optional="true" label="File containing protein domains"/>
202 <param name="known_fusions" argument="-k" type="data" format="tabular,tabular.gz" optional="true" label="File containing known fusions"> 231 <param name="known_fusions" argument="-k" type="data" format="tabular,tabular.gz" optional="true" label="File containing known fusions">
203 <help><![CDATA[ file two TAB separated columns: five-prime region three-prime region ]]></help> 232 <help><![CDATA[ file two TAB separated columns: five-prime region three-prime region ]]></help>
204 </param> 233 </param>
393 422
394 </inputs> 423 </inputs>
395 <outputs> 424 <outputs>
396 <data name="fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv"/> 425 <data name="fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.tsv" from_work_dir="fusions.tsv"/>
397 <data name="discarded_fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.discarded.tsv" from_work_dir="fusions.discarded.tsv"> 426 <data name="discarded_fusions_tsv" format="tabular" label="${tool.name} on ${on_string}: fusions.discarded.tsv" from_work_dir="fusions.discarded.tsv">
398 <filter> output_fusions_discarded == "yes"</filter> 427 <filter> output_fusions_discarded == True</filter>
399 </data> 428 </data>
400 <data name="aligned_bam" format="bam" label="${tool.name} on ${on_string}: Aligned.bam" from_work_dir="Aligned.sortedByCoord.out.bam"> 429 <data name="aligned_bam" format="bam" label="${tool.name} on ${on_string}: Aligned.bam" from_work_dir="Aligned.sortedByCoord.out.bam">
401 <filter>input_params['input_source'] == "use_fastq"</filter> 430 <filter>input_params['input_source'] == "use_fastq"</filter>
402 </data> 431 </data>
403 <data name="fusions_pdf" format="pdf" label="${tool.name} on ${on_string}: fusions.pdf" from_work_dir="fusions.pdf"> 432 <data name="fusions_pdf" format="pdf" label="${tool.name} on ${on_string}: fusions.pdf" from_work_dir="fusions.pdf">
409 <test> 438 <test>
410 <conditional name="input_params"> 439 <conditional name="input_params">
411 <param name="input_source" value="use_star"/> 440 <param name="input_source" value="use_star"/>
412 <param name="input" ftype="sam" value="Aligned.out.sam"/> 441 <param name="input" ftype="sam" value="Aligned.out.sam"/>
413 </conditional> 442 </conditional>
414 <param name="genome_assembly" ftype="fasta" value="genome.fasta"/> 443 <conditional name="genome">
415 <param name="annotation" ftype="gtf" value="genome.gtf"/> 444 <param name="genome_source" value="history"/>
445 <param name="assembly" ftype="fasta" value="genome.fasta"/>
446 <param name="annotation" ftype="gtf" value="genome.gtf"/>
447 </conditional>
416 <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/> 448 <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/>
417 <conditional name="visualization"> 449 <conditional name="visualization">
418 <param name="do_viz" value="no"/> 450 <param name="do_viz" value="no"/>
419 <param name="cytobands" ftype="tabular" value="cytobands.tsv"/> 451 <param name="cytobands" ftype="tabular" value="cytobands.tsv"/>
420 </conditional> 452 </conditional>
423 <has_text_matching expression="BCR\tABL1"/> 455 <has_text_matching expression="BCR\tABL1"/>
424 </assert_contents> 456 </assert_contents>
425 </output> 457 </output>
426 </test> 458 </test>
427 <!-- Test 2 - From existing BAM with protein_domains and visualization --> 459 <!-- Test 2 - From existing BAM with protein_domains and visualization -->
460
428 <test> 461 <test>
429 <conditional name="input_params"> 462 <conditional name="input_params">
430 <param name="input_source" value="use_star"/> 463 <param name="input_source" value="use_star"/>
431 <param name="input" ftype="sam" value="Aligned.out.sam"/> 464 <param name="input" ftype="sam" value="Aligned.out.sam"/>
432 </conditional> 465 </conditional>
433 <param name="genome_assembly" ftype="fasta" value="genome.fasta"/> 466 <conditional name="genome">
434 <param name="annotation" ftype="gtf" value="genome.gtf"/> 467 <param name="genome_source" value="history"/>
468 <param name="assembly" ftype="fasta" value="genome.fasta"/>
469 <param name="annotation" ftype="gtf" value="genome.gtf"/>
470 </conditional>
435 <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/> 471 <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/>
436 <conditional name="visualization"> 472 <conditional name="visualization">
437 <param name="do_viz" value="yes"/> 473 <param name="do_viz" value="yes"/>
438 <param name="cytobands" ftype="tabular" value="cytobands.tsv"/> 474 <param name="cytobands" ftype="tabular" value="cytobands.tsv"/>
439 </conditional> 475 </conditional>
446 <assert_contents> 482 <assert_contents>
447 <has_size value= "64000" delta="5000" /> 483 <has_size value= "64000" delta="5000" />
448 </assert_contents> 484 </assert_contents>
449 </output> 485 </output>
450 </test> 486 </test>
487 <!-- Test 3 - From existing BAM using cached genome source -->
488 <test>
489 <conditional name="input_params">
490 <param name="input_source" value="use_star"/>
491 <param name="input" ftype="sam" value="Aligned.out.sam"/>
492 </conditional>
493 <conditional name="genome">
494 <param name="genome_source" value="cached"/>
495 <param name="arriba_ref" value="GRCh38+ENSEMBL93"/>
496 </conditional>
497 <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/>
498 <conditional name="visualization">
499 <param name="do_viz" value="no"/>
500 <param name="cytobands" ftype="tabular" value="cytobands.tsv"/>
501 </conditional>
502 <output name="fusions_tsv">
503 <assert_contents>
504 <has_text_matching expression="BCR\tABL1"/>
505 </assert_contents>
506 </output>
507 </test>
451 508
452 </tests> 509 </tests>
453 <help><![CDATA[ 510 <help><![CDATA[
454 **Arriba** 511 **Arriba**
455 512
599 Arriba checks if the orientation of the structural variant matches that of a fusion detected in the RNA-Seq data. If, for example, Arriba predicts the 5' end of a gene to be retained in a fusion, then a structural variant is expected to confirm this, or else the variant is not considered to be related. 656 Arriba checks if the orientation of the structural variant matches that of a fusion detected in the RNA-Seq data. If, for example, Arriba predicts the 5' end of a gene to be retained in a fusion, then a structural variant is expected to confirm this, or else the variant is not considered to be related.
600 657
601 NOTE: Arriba was designed for alignments from RNA-Seq data. It should not be run on WGS data directly. Many assumptions made by Arriba about the data (statistical models, blacklist, etc.) only apply to RNA-Seq data and are not valid for DNA-Seq data. For such data, a structural variant calling algorithm should be used and the results should be passed to Arriba. 658 NOTE: Arriba was designed for alignments from RNA-Seq data. It should not be run on WGS data directly. Many assumptions made by Arriba about the data (statistical models, blacklist, etc.) only apply to RNA-Seq data and are not valid for DNA-Seq data. For such data, a structural variant calling algorithm should be used and the results should be passed to Arriba.
602 659
603 660
604 **OUTPUTS**
605
606 See: https://arriba.readthedocs.io/en/latest/output-files/
607
608 - fusions.tsv
609
610 The file fusions.tsv (as specified by the parameter -o) contains fusions which pass all of Arriba's filters. It should be highly enriched for true predictions. The predictions are listed from highest to lowest confidence. The following paragraphs describe the columns in detail:
611
612 * gene1 and gene2 : gene1 contains the gene which makes up the 5' end of the transcript and gene2 the gene which makes up the 3' end. The order is predicted on the basis of the strands that the supporting reads map to, how the reads are oriented, and splice patterns. Both columns may contain the same gene, if the event is intragenic. If a breakpoint is in an intergenic region, Arriba lists the closest genes upstream and downstream from the breakpoint, separated by a comma. The numbers in parentheses after the closest genes state the distance to the genes. If no genes are annotated for a contig (e.g., for viral genomes), the column contains a dot (.).
613
614 * strand1(gene/fusion) and strand2(gene/fusion) : Each of these columns contains two values separated by a slash. The strand before the slash reflects the strand of the gene according to the gene annotation supplied to Arriba via the parameter -g. If the breakpoint is in an intergenic region, the value is a dot (.). The value after the slash reflects the strand that is transcribed. This does not necessarily match the strand of the gene, namely when the sense strand of a gene serves as the template for transcription. Occasionally, the strand that is transcribed cannot be predicted reliably. In this case, Arriba indicates the lack of information as a dot (.). Arriba uses splice-patterns of the alignments to assign a read to the appropriate originating gene. If a strand-specific library was used, Arriba also evaluates the strandedness in ambiguous situations, for example, when none of the supporting reads overlaps a splice-site.
615
616 * breakpoint1 and breakpoint2 : The columns contain the coordinates of the breakpoints in gene1 and gene2, respectively. If an event is not supported by any split reads but only by discordant mates, the coordinates given here are those of the discordant mates which are closest to the true but unknown breakpoint.
617
618 * site1 and site2 : These columns add information about the location of the breakpoints. Possible values are: 5' UTR, 3' UTR, UTR (overlapping with a 5' UTR as well as a 3' UTR), CDS (coding sequence), exon, intron, and intergenic. The keyword exon is used for non-coding genes or for ambiguous situations where the breakpoint overlaps with both a coding exon and a UTR. If the breakpoint coincides with an exon boundary, the additional keyword splice-site is appended.
619
620 * type : Based on the orientation of the supporting reads and the coordinates of breakpoints, the type of event can be inferred. Possible values are: translocation (between different chromosomes), duplication, inversion, and deletion. If genes are fused head-to-head or tail-to-tail, this is indicated as 5'-5' or 3'-3' respectively. Genes fused in such an orientation cannot yield a chimeric protein, since one of the genes is transcribed from the wrong strand. This type of event is equivalent to the truncation of the genes. The following types of events are flagged with an extra keyword, because they are frequent types of false positives and/or it is not clear if they are somatic or germline variants: Deletions with a size in the range of introns (<400kb) are flagged as read-through, because there is a high chance that the fusion arises from read-through transcription rather than an underlying genomic deletion. Intragenic duplications with both breakpoints at splice-sites are flagged as non-canonical-splicing, because the supporting reads might originate from circular RNAs, which are very abundant even in normal tissue, but manifest as duplications in RNA-Seq data. Internal tandem duplications are flagged as ITD. It is not always clear whether the ITDs observable in RNA-Seq data are somatic or germline variants, because ITDs are abundant in the germline and germline variants cannot be filtered effectively due to lack of a normal control.
621
622 * split_reads1 and split_reads2 : The number of supporting split fragments with an anchor in gene1 or gene2, respectively, is given in these columns. The gene to which the longer segment of the split read aligns is defined as the anchor.
623
624 * discordant_mates : This column contains the number of pairs (fragments) of discordant mates (a.k.a. spanning reads or bridge reads) supporting the fusion.
625
626 * coverage1 and coverage2 : These two columns show the coverage near breakpoint1 and breakpoint2, respectively. The coverage is calculated as the number of fragments near the breakpoint on the side of the breakpoint that is retained in the fusion transcript. Note that the coverage calculation counts all fragments (even duplicates), whereas the columns split_reads1, split_reads2, and discordant_mates only count non-discarded reads. Fragments discarded due to being duplicates or other types of artifacts can be found in the column filters.
627
628 * confidence : Each prediction is assigned one of the confidences low, medium, or high. Several characteristics are taken into account, including: the number of supporting reads, the balance of split reads and discordant mates, the distance between the breakpoints, the type of event, whether the breakpoints are intragenic or not, and whether there are other events which corroborate the prediction, e.g. multiple isoforms or balanced translocations. See section Interpretation of results for further advice on judging the credibility of predictions.
629
630 * reading_frame : This column states whether the gene at the 3' end of the fusion is fused in-frame or out-of-frame. The value stop-codon indicates that there is a stop codon prior to the fusion junction, such that the 3' end is not translated, even if the reading frame is preserved across the junction. The prediction of the reading frame builds on the prediction of the peptide sequence. A dot (.) indicates that the peptide sequence cannot be predicted, for example, because the transcript sequence could not be determined or because the breakpoint of the 5' gene does not overlap a coding region.
631
632 * tags : When a user-defined list of tags is provided via the parameter -t, this column is populated with the provided tag whenever a fusion matches the coordinates specified for the respective tag. When multiple tags match, they are separated by a comma.
633
634 * retained_protein_domains : If Arriba is provided with protein domain annotation using the parameter -p, then this column is populated with protein domains retained in the fusion. Multiple protein domains are separated by a comma. Redundant protein domains are only listed once. After every domain the fraction that is retained is stated as a percentage value in parentheses. The protein domains of the 5' and 3' genes are separated by a pipe symbol (|).
635
636 * closest_genomic_breakpoint1 and closest_genomic_breakpoint2 : When a matched whole-genome sequencing sample is available, one can feed structural variant calls obtained therefrom into Arriba (see parameter -d). Arriba then considers this information during fusion calling, which improves the overall accuracy. These two columns contain the coordinates of the genomic breakpoints which are closest to the transcriptomic breakpoints given in the columns breakpoint1 and breakpoint2. The values in parentheses are the distances between transcriptomic and genomic breakpoints.
637
638 * gene_id1 and gene_id2 : These two columns state the identifiers of the fused genes as given in the gene_id attribute in the GTF file.
639
640 * transcript_id1 and transcript_id2 : For both fused genes, Arriba determines the best matching isoform that is transcribed as part of the fusion. The isoform is selected by how well its annotated exons match the splice pattern of the supporting reads of a fusion.
641
642 * direction1 and direction2 : These columns indicate the orientation of the fusion. A value of downstream means that the partner is fused downstream of the breakpoint, i.e. at a coordinate higher than the breakpoint. A value of upstream means the partner is fused at a coordinate lower than the breakpoint. When the prediction of the strands or of the 5' gene fails, this information gives insight into which parts of the fused genes are retained in the fusion.
643
644 * filters : This column lists the filters which removed one or more of the supporting reads. The section Internal algorithm describes all filters in detail. The number of filtered reads is given in parentheses after the name of the filter. The total number of supporting reads can be obtained by summing up the reads given in the columns split_reads1, split_reads2, discordant_mates, and filters. If a filter discarded the event as a whole (all reads), the number of filtered reads is not stated.
645
646 * fusion_transcript : This column contains the fusion transcript sequence. The sequence is assembled from the supporting reads of the most highly expressed transcript. It represents the transcript isoform that is most likely expressed according to the splice patterns of the supporting reads. The column contains a dot (.), when the sequence could not be predicted. This is the case when the strands or the 5' end of the transcript could not be predicted reliably. The breakpoint is represented as a pipe symbol (|). When non-template bases are inserted between the fused genes, these bases are represented as lowercase letters between two pipes. Reference mismatches (SNPs or SNVs) are indicated as lowercase letters, insertions as bases between brackets ([ and ]), deleted bases as one or more dashes (-), introns as three underscores (___), and ambiguous positions, such as positions with diverse reference mismatches, are represented as ?. Missing information due to insufficient coverage is denoted as an ellipsis (...). If the switch -I is used, then an attempt is made to fill missing information with the assembly sequence. A sequence stretch that was taken from the assembly sequence rather than the supporting reads is wrapped in parentheses (( and )). In addition, when -I is used, the sequence is trimmed to the boundaries of the fused transcripts. The coordinate of the fusion breakpoint relative to the start of the transcript can thus easily be inferred by counting the bases from the beginning of the fusion transcript to the breakpoint character (|). In case the full sequence could be constructed from the combined information of supporting reads and assembly sequence, the start of the fusion transcript is marked by a caret sign (^) and the end by a dollar sign ($). If the full sequence could not be constructed, these signs are missing.
647
648 * peptide_sequence : This column contains the fusion peptide sequence. The sequence is translated from the fusion transcript given in the column fusion_transcript and determines the reading frame of the fused genes according to the transcript isoforms given in the columns transcript_id1 and transcript_id2. Translation starts at the start of the assembled fusion transcript or when the start codon is encountered in the 5' gene. Translation ends when either the end of the assembled fusion transcript is reached or when a stop codon is encountered. If the fusion transcript contains an ellipsis (...), the sequence beyond the ellipsis is trimmed before translation, because the reading frame cannot be determined reliably. The column contains a dot (.), when the transcript sequence could not be predicted or when the precise breakpoints are unknown due to lack of split reads or when the fusion transcript does not overlap any coding exons in the 5' gene or when no start codon could be found in the 5' gene or when there is a stop codon prior to the fusion junction (in which case the column reading_frame contains the value stop-codon). The breakpoint is represented as a pipe symbol (|). If a codon spans the breakpoint, the amino acid is placed on the side of the breakpoint where two of the three bases reside. Codons resulting from non-template bases are flanked by two pipes. Amino acids are written as lowercase characters in the following situations: non-silent SNVs/SNPs, insertions, frameshifts, codons spanning the breakpoint, non-coding regions (introns/intergenic regions/UTRs), and non-template bases. Codons which cannot be translated to amino acids, such as those having invalid characters, are represented as ?.
649
650 * read_identifiers : This column contains the names of the supporting reads separated by commas.
651
652 - fusions.discarded.tsv
653
654 The file fusions.discarded.tsv (as specified by the parameter -O) contains all events that Arriba classified as an artifact or that are also observed in healthy tissue. It has the same format as the file fusions.tsv.
655
656
657 **VISUALIZATION**
658
659 See: https://arriba.readthedocs.io/en/latest/visualization/
660
661 - fusions.pdf
662
663 A PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and, if the column fusion_transcript has a value, an excerpt of the sequence around the breakpoint.
664
665
666 **OPTIONS** 661 **OPTIONS**
667 662
668 - Arriba: https://arriba.readthedocs.io/en/latest/command-line-options/#arriba 663 - Arriba: https://arriba.readthedocs.io/en/latest/command-line-options/#arriba
669 - Visualization: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr 664 - Visualization: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr
670 - RNA STAR: https://arriba.readthedocs.io/en/latest/workflow/ 665 - RNA STAR: https://arriba.readthedocs.io/en/latest/workflow/
666
667
668 **OUTPUTS**
669
670 See: https://arriba.readthedocs.io/en/latest/output-files/
671
672 - fusions.tsv
673
674 The file fusions.tsv (as specified by the parameter -o) contains fusions which pass all of Arriba's filters. It should be highly enriched for true predictions. The predictions are listed from highest to lowest confidence. The following paragraphs describe the columns in detail:
675
676 * gene1 and gene2 : gene1 contains the gene which makes up the 5' end of the transcript and gene2 the gene which makes up the 3' end. The order is predicted on the basis of the strands that the supporting reads map to, how the reads are oriented, and splice patterns. Both columns may contain the same gene, if the event is intragenic. If a breakpoint is in an intergenic region, Arriba lists the closest genes upstream and downstream from the breakpoint, separated by a comma. The numbers in parentheses after the closest genes state the distance to the genes. If no genes are annotated for a contig (e.g., for viral genomes), the column contains a dot (.).
677
678 * strand1(gene/fusion) and strand2(gene/fusion) : Each of these columns contains two values separated by a slash. The strand before the slash reflects the strand of the gene according to the gene annotation supplied to Arriba via the parameter -g. If the breakpoint is in an intergenic region, the value is a dot (.). The value after the slash reflects the strand that is transcribed. This does not necessarily match the strand of the gene, namely when the sense strand of a gene serves as the template for transcription. Occasionally, the strand that is transcribed cannot be predicted reliably. In this case, Arriba indicates the lack of information as a dot (.). Arriba uses splice-patterns of the alignments to assign a read to the appropriate originating gene. If a strand-specific library was used, Arriba also evaluates the strandedness in ambiguous situations, for example, when none of the supporting reads overlaps a splice-site.
679
680 * breakpoint1 and breakpoint2 : The columns contain the coordinates of the breakpoints in gene1 and gene2, respectively. If an event is not supported by any split reads but only by discordant mates, the coordinates given here are those of the discordant mates which are closest to the true but unknown breakpoint.
681
682 * site1 and site2 : These columns add information about the location of the breakpoints. Possible values are: 5' UTR, 3' UTR, UTR (overlapping with a 5' UTR as well as a 3' UTR), CDS (coding sequence), exon, intron, and intergenic. The keyword exon is used for non-coding genes or for ambiguous situations where the breakpoint overlaps with both a coding exon and a UTR. If the breakpoint coincides with an exon boundary, the additional keyword splice-site is appended.
683
684 * type : Based on the orientation of the supporting reads and the coordinates of breakpoints, the type of event can be inferred. Possible values are: translocation (between different chromosomes), duplication, inversion, and deletion. If genes are fused head-to-head or tail-to-tail, this is indicated as 5'-5' or 3'-3' respectively. Genes fused in such an orientation cannot yield a chimeric protein, since one of the genes is transcribed from the wrong strand. This type of event is equivalent to the truncation of the genes. The following types of events are flagged with an extra keyword, because they are frequent types of false positives and/or it is not clear if they are somatic or germline variants: Deletions with a size in the range of introns (<400kb) are flagged as read-through, because there is a high chance that the fusion arises from read-through transcription rather than an underlying genomic deletion. Intragenic duplications with both breakpoints at splice-sites are flagged as non-canonical-splicing, because the supporting reads might originate from circular RNAs, which are very abundant even in normal tissue, but manifest as duplications in RNA-Seq data. Internal tandem duplications are flagged as ITD. It is not always clear whether the ITDs observable in RNA-Seq data are somatic or germline variants, because ITDs are abundant in the germline and germline variants cannot be filtered effectively due to lack of a normal control.
685
686 * split_reads1 and split_reads2 : The number of supporting split fragments with an anchor in gene1 or gene2, respectively, is given in these columns. The gene to which the longer segment of the split read aligns is defined as the anchor.
687
688 * discordant_mates : This column contains the number of pairs (fragments) of discordant mates (a.k.a. spanning reads or bridge reads) supporting the fusion.
689
690 * coverage1 and coverage2 : These two columns show the coverage near breakpoint1 and breakpoint2, respectively. The coverage is calculated as the number of fragments near the breakpoint on the side of the breakpoint that is retained in the fusion transcript. Note that the coverage calculation counts all fragments (even duplicates), whereas the columns split_reads1, split_reads2, and discordant_mates only count non-discarded reads. Fragments discarded due to being duplicates or other types of artifacts can be found in the column filters.
691
692 * confidence : Each prediction is assigned one of the confidence levels low, medium, or high. Several characteristics are taken into account, including: the number of supporting reads, the balance of split reads and discordant mates, the distance between the breakpoints, the type of event, whether the breakpoints are intragenic or not, and whether there are other events which corroborate the prediction, e.g. multiple isoforms or balanced translocations. See the section "Interpretation of results" of the Arriba documentation for further advice on judging the credibility of predictions.
693
694 * reading_frame : This column states whether the gene at the 3' end of the fusion is fused in-frame or out-of-frame. The value stop-codon indicates that there is a stop codon prior to the fusion junction, such that the 3' end is not translated, even if the reading frame is preserved across the junction. The prediction of the reading frame builds on the prediction of the peptide sequence. A dot (.) indicates that the peptide sequence cannot be predicted, for example, because the transcript sequence could not be determined or because the breakpoint of the 5' gene does not overlap a coding region.
695
696 * tags : When a user-defined list of tags is provided via the parameter -t, this column is populated with the provided tag whenever a fusion matches the coordinates specified for the respective tag. When multiple tags match, they are separated by a comma.
697
698 * retained_protein_domains : If Arriba is provided with protein domain annotation using the parameter -p, then this column is populated with protein domains retained in the fusion. Multiple protein domains are separated by a comma. Redundant protein domains are only listed once. After every domain the fraction that is retained is stated as a percentage value in parentheses. The protein domains of the 5' and 3' genes are separated by a pipe symbol (|).
699
700 * closest_genomic_breakpoint1 and closest_genomic_breakpoint2 : When a matched whole-genome sequencing sample is available, one can feed structural variant calls obtained therefrom into Arriba (see parameter -d). Arriba then considers this information during fusion calling, which improves the overall accuracy. These two columns contain the coordinates of the genomic breakpoints which are closest to the transcriptomic breakpoints given in the columns breakpoint1 and breakpoint2. The values in parentheses are the distances between transcriptomic and genomic breakpoints.
701
702 * gene_id1 and gene_id2 : These two columns state the identifiers of the fused genes as given in the gene_id attribute in the GTF file.
703
704 * transcript_id1 and transcript_id2 : For both fused genes, Arriba determines the best matching isoform that is transcribed as part of the fusion. The isoform is selected by how well its annotated exons match the splice pattern of the supporting reads of a fusion.
705
706 * direction1 and direction2 : These columns indicate the orientation of the fusion. A value of downstream means that the partner is fused downstream of the breakpoint, i.e. at a coordinate higher than the breakpoint. A value of upstream means the partner is fused at a coordinate lower than the breakpoint. When the prediction of the strands or of the 5' gene fails, this information gives insight into which parts of the fused genes are retained in the fusion.
707
708 * filters : This column lists the filters which removed one or more of the supporting reads. The section "Internal algorithm" of the Arriba documentation describes all filters in detail. The number of filtered reads is given in parentheses after the name of the filter. The total number of supporting reads can be obtained by summing up the reads given in the columns split_reads1, split_reads2, discordant_mates, and filters. If a filter discarded the event as a whole (all reads), the number of filtered reads is not stated.
709
710 * fusion_transcript : This column contains the fusion transcript sequence. The sequence is assembled from the supporting reads of the most highly expressed transcript. It represents the transcript isoform that is most likely expressed according to the splice patterns of the supporting reads. The column contains a dot (.), when the sequence could not be predicted. This is the case when the strands or the 5' end of the transcript could not be predicted reliably. The breakpoint is represented as a pipe symbol (|). When non-template bases are inserted between the fused genes, these bases are represented as lowercase letters between two pipes. Reference mismatches (SNPs or SNVs) are indicated as lowercase letters, insertions as bases between brackets ([ and ]), deleted bases as one or more dashes (-), introns as three underscores (___), and ambiguous positions, such as positions with diverse reference mismatches, are represented as ?. Missing information due to insufficient coverage is denoted as an ellipsis (...). If the switch -I is used, then an attempt is made to fill missing information with the assembly sequence. A sequence stretch that was taken from the assembly sequence rather than the supporting reads is wrapped in parentheses (( and )). In addition, when -I is used, the sequence is trimmed to the boundaries of the fused transcripts. The coordinate of the fusion breakpoint relative to the start of the transcript can thus easily be inferred by counting the bases from the beginning of the fusion transcript to the breakpoint character (|). In case the full sequence could be constructed from the combined information of supporting reads and assembly sequence, the start of the fusion transcript is marked by a caret sign (^) and the end by a dollar sign ($). If the full sequence could not be constructed, these signs are missing.
711
712 * peptide_sequence : This column contains the fusion peptide sequence. The sequence is translated from the fusion transcript given in the column fusion_transcript and determines the reading frame of the fused genes according to the transcript isoforms given in the columns transcript_id1 and transcript_id2. Translation starts at the start of the assembled fusion transcript or when the start codon is encountered in the 5' gene. Translation ends when either the end of the assembled fusion transcript is reached or when a stop codon is encountered. If the fusion transcript contains an ellipsis (...), the sequence beyond the ellipsis is trimmed before translation, because the reading frame cannot be determined reliably. The column contains a dot (.) in any of the following cases: the transcript sequence could not be predicted, the precise breakpoints are unknown due to lack of split reads, the fusion transcript does not overlap any coding exons in the 5' gene, no start codon could be found in the 5' gene, or there is a stop codon prior to the fusion junction (in which case the column reading_frame contains the value stop-codon). The breakpoint is represented as a pipe symbol (|). If a codon spans the breakpoint, the amino acid is placed on the side of the breakpoint where two of the three bases reside. Codons resulting from non-template bases are flanked by two pipes. Amino acids are written as lowercase characters in the following situations: non-silent SNVs/SNPs, insertions, frameshifts, codons spanning the breakpoint, non-coding regions (introns/intergenic regions/UTRs), and non-template bases. Codons which cannot be translated to amino acids, such as those having invalid characters, are represented as ?.
713
714 * read_identifiers : This column contains the names of the supporting reads separated by commas.
715
716 - fusions.discarded.tsv
717
718 The file fusions.discarded.tsv (as specified by the parameter -O) contains all events that Arriba classified as an artifact or that are also observed in healthy tissue. It has the same format as the file fusions.tsv.
719
720
721 **VISUALIZATION**
722
723 See: https://arriba.readthedocs.io/en/latest/visualization/
724
725 - fusions.pdf
726
727 A PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and, if the column fusion_transcript has a value, an excerpt of the sequence around the breakpoint.
728
729 .. image:: draw-fusions-example.png
730 :width: 800
731 :height: 467
671 732
672 733
673 .. _Arriba: https://arriba.readthedocs.io/en/latest/ 734 .. _Arriba: https://arriba.readthedocs.io/en/latest/
674 .. _INPUTS: https://arriba.readthedocs.io/en/latest/input-files/ 735 .. _INPUTS: https://arriba.readthedocs.io/en/latest/input-files/
675 .. _OUTPUTS: https://arriba.readthedocs.io/en/latest/output-files/ 736 .. _OUTPUTS: https://arriba.readthedocs.io/en/latest/output-files/