# HG changeset patch
# User jjohnson
# Date 1633916842 0
# Node ID 7253b367c082ba2bc5fab6b2ab95dae0c69562b6
# Parent 005b200c8841774966038cfd2bb1cf2e8c92bef0
"planemo upload for repository https://github.com/jj-umn/tools-iuc/tree/arriba/tools/arriba commit ea14642edb0816912a856281944eb5e8a37c11ea"
diff -r 005b200c8841 -r 7253b367c082 arriba.xml
--- a/arriba.xml Sun Oct 10 13:00:45 2021 +0000
+++ b/arriba.xml Mon Oct 11 01:47:22 2021 +0000
@@ -70,10 +70,83 @@
#if $tags
-t '$tags'
#end if
+ #if str($wgs.use_wgs) == "yes"
+ -d '$wgs.wgs'
+ #if $wgs.max_genomic_breakpoint_distance
+ -D $wgs.max_genomic_breakpoint_distance
+ #end if
+ #end if
-o fusions.tsv
#if $output_fusions_discarded
-O fusions.discarded.tsv
#end if
+## Arriba options: each flag is emitted only when its parameter is set; free-text list values are single-quoted so spaces and '*' wildcards reach arriba as one argument
+ #if $options.gtf_features
+ -G '$options.gtf_features'
+ #end if
+ #if $options.strandedness
+ -s $options.strandedness
+ #end if
+ #if $options.genome_contigs
+ -i '$options.genome_contigs'
+ #end if
+ #if $options.viral_contigs
+ -v '$options.viral_contigs'
+ #end if
+ #if $options.max_evalue
+ -E $options.max_evalue
+ #end if
+ #if $options.min_supporting_reads
+ -S $options.min_supporting_reads
+ #end if
+ #if $options.max_mismappers
+ -m $options.max_mismappers
+ #end if
+ #if $options.max_homolog_identity
+ -L $options.max_homolog_identity
+ #end if
+ #if $options.homopolymer_length
+ -H $options.homopolymer_length
+ #end if
+ #if $options.read_through_distance
+ -R $options.read_through_distance
+ #end if
+ #if $options.min_anchor_length
+ -A $options.min_anchor_length
+ #end if
+ #if $options.many_spliced_events
+ -M $options.many_spliced_events
+ #end if
+ #if $options.max_kmer_content
+ -K $options.max_kmer_content
+ #end if
+ #if $options.max_mismatch_pvalue
+ -V $options.max_mismatch_pvalue
+ #end if
+ #if $options.fragment_length
+ -F $options.fragment_length
+ #end if
+ #if $options.max_reads
+ -U $options.max_reads
+ #end if
+ #if $options.quantile
+ -Q $options.quantile
+ #end if
+ #if $options.exonic_fraction
+ -e $options.exonic_fraction
+ #end if
+ #if $options.top_n
+ -T $options.top_n
+ #end if
+ #if $options.covered_fraction
+ -C $options.covered_fraction
+ #end if
+ #if $options.max_itd_length
+ -l $options.max_itd_length
+ #end if
+ $options.duplicate_marking
+ $options.fill_discarded_columns
+ $options.fill_the_gaps
#if str($input_params.input_source) == "use_fastq"
&& samtools sort -@ \${GALAXY_SLOTS:-1} -m 4G -T tmp -O bam Aligned.out.bam > Aligned.sortedByCoord.out.bam
&& samtools index Aligned.sortedByCoord.out.bam
@@ -85,16 +158,55 @@
&& draw_fusions.R
--fusions=fusions.tsv
--alignments=Aligned.sortedByCoord.out.bam
+ --annotation='$gtf'
--output=fusions.pdf
- --annotation='$gtf'
#if $visualization.cytobands
--cytobands='$visualization.cytobands'
#end if
#if $protein_domains
--proteinDomains='$protein_domains'
#end if
+ ## Visualization options: each draw_fusions.R flag below is appended only when the matching visualization.options parameter tests true
+ #if $visualization.options.transcriptSelection
+ --transcriptSelection=$visualization.options.transcriptSelection
+ #end if
+ #if $visualization.options.minConfidenceForCircosPlot
+ --minConfidenceForCircosPlot=$visualization.options.minConfidenceForCircosPlot
+ #end if
+ #if $visualization.options.showIntergenicVicinity
+ --showIntergenicVicinity=$visualization.options.showIntergenicVicinity
+ #end if
+ #if $visualization.options.squishIntrons
+ --squishIntrons=$visualization.options.squishIntrons
+ #end if
+ #if $visualization.options.mergeDomainsOverlappingBy
+ --mergeDomainsOverlappingBy=$visualization.options.mergeDomainsOverlappingBy
+ #end if
+ #if $visualization.options.printExonLabels
+ --printExonLabels=$visualization.options.printExonLabels
+ #end if
+ #if $visualization.options.render3dEffect
+ --render3dEffect=$visualization.options.render3dEffect
+ #end if
+ #if $visualization.options.optimizeDomainColors
+ --optimizeDomainColors=$visualization.options.optimizeDomainColors
+ #end if
+ #if $visualization.options.color1
+ --color1=$visualization.options.color1
+ #end if
+ #if $visualization.options.color2
+ --color2=$visualization.options.color2
+ #end if
+ #if $visualization.options.pdfWidth
+ --pdfWidth=$visualization.options.pdfWidth
+ #end if
+ #if $visualization.options.pdfHeight
+ --pdfHeight=$visualization.options.pdfHeight
+ #end if
+ #if $visualization.options.fontSize
+ --fontSize=$visualization.options.fontSize
+ #end if
#end if
-
]]>
@@ -139,7 +251,182 @@
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Comma or SPACE separated list, default: gene_name=gene_name gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS
+ ^(gene_name|gene_id|transcript_id|feature_exon|feature_CDS)=[^ ,]+([ ,](gene_name|gene_id|transcript_id|feature_exon|feature_CDS)=[^ ,]+)*$
+
+
+ When unstranded data is processed, the strand can sometimes be inferred from splice-patterns. But in unclear situations, stranded data helps resolve ambiguities.
+
+
+
+
+
+
+ Comma-/space-separated list of interesting contigs.
+ Fusions between genes on other contigs are ignored. Contigs can be specified with or without the prefix "chr".
+ Asterisks (*) are treated as wild-cards.
+ Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_*
+
+
+
+ Comma-/space-separated list of viral contigs.
+ Asterisks (*) are treated as wild-cards.
+ Default: AC_* NC_*
+
+
+
+ Arriba estimates the number of fusions with a given number of supporting
+ reads which one would expect to see by random chance. If the expected number
+ of fusions (e-value) is higher than this threshold, the fusion is
+ discarded by the 'relative_support' filter. Note: Increasing this
+ threshold can dramatically increase the number of false positives and may
+ increase the runtime of resource-intensive steps. Fractional values are possible.
+ Default: 0.300000
+
+
+
+
+ Discard all fusions with fewer than this many supporting reads (split reads and discordant mates combined).
+ Default: 2
+
+
+
+ When more than this fraction of supporting reads turns out to be mismappers,
+ the 'mismappers' filter discards the fusion.
+ Default: 0.800000
+
+
+
+ Genes with more than the given fraction of sequence identity are
+ considered homologs and removed by the 'homologs' filter.
+ Default: 0.300000
+
+
+
+ The 'homopolymer' filter removes breakpoints adjacent to homopolymers of the given length or more.
+ Default: 6
+
+
+
+ The 'read_through' filter removes read-through fusions
+ where the breakpoints are less than the given distance away from each other.
+ Default: 10000
+
+
+
+ Alignment artifacts are often characterized by split reads coming
+ from only one gene and no discordant mates. Moreover, the split
+ reads only align to a short stretch in one of the genes. The
+ 'short_anchor' filter removes these fusions. This parameter sets
+ the threshold in bp for what the filter considers short.
+ Default: 23
+
+
+
+ The 'many_spliced' filter recovers fusions between genes that
+ have at least this many spliced breakpoints.
+ Default: 4
+
+
+
+ The 'low_entropy' filter removes reads with repetitive 3-mers. If
+ the 3-mers make up more than the given fraction of the sequence, then
+ the read is discarded.
+ Default: 0.600000
+
+
+
+
+ The 'mismatches' filter uses a binomial model to calculate a
+ p-value for observing a given number of mismatches in a read.
+ If the number of mismatches is too high, the read is discarded.
+ Default: 0.010000
+
+
+
+
+ When paired-end data is given, the fragment length is estimated
+ automatically and this parameter has no effect. But when single-end
+ data is given, the mean fragment length should be specified to
+ effectively filter fusions that arise from hairpin structures.
+ Default: 200
+
+
+
+ Subsample fusions with more than the given number of supporting reads. This
+ improves performance without compromising sensitivity, as long as the
+ threshold is high. Counting of supporting reads beyond the threshold is
+ inaccurate, obviously.
+ Default: 300
+
+
+
+ Highly expressed genes are prone to produce artifacts during library preparation.
+ Genes with an expression above the given quantile are eligible for filtering by the 'in_vitro' filter.
+ Default: 0.998000
+
+
+
+ The breakpoints of false-positive predictions of intragenic events
+ are often both in exons. True predictions are more likely to have at
+ least one breakpoint in an intron, because introns are larger.
+ If the fraction of exonic sequence between two breakpoints is smaller than
+ the given fraction, the 'intragenic_exonic' filter discards the event.
+ Default: 0.330000
+
+
+
+
+ Only report viral integration sites of the top N most highly expressed viral contigs.
+ Default: 5
+
+
+
+ Ignore virally associated events if the virus is not fully expressed,
+ i.e., less than the given fraction of the viral contig is transcribed.
+ Default: 0.150000
+
+
+
+ Note: Increasing this value beyond the default can impair performance and lead to many false positives.
+ Default: 100
+
+
+
+ Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a
+ preceding program using the BAM_FDUP flag. This makes sense when unique molecular
+ identifiers (UMI) are used.
+
+
+
+ To reduce the runtime and file size, by default, the columns 'fusion_transcript',
+ 'peptide_sequence', and 'read_identifiers' are left empty in the file containing
+ discarded fusion candidates (see parameter -O). When this flag is set, this extra
+ information is reported in the discarded fusions file.
+
+
+
+ If assembly of the fusion transcript sequence from the supporting reads is incomplete
+ (denoted as '...'), fill the gaps using the assembly sequence wherever possible.
+
+
+
+
@@ -147,9 +434,115 @@
+
+
+ By default the transcript isoform with the highest coverage is drawn.
+ Alternatively, the transcript isoform that is provided in the columns
+ transcript_id1 and transcript_id2 in the given fusions file can be drawn.
+ Selecting the isoform with the highest coverage usually produces nicer plots,
+ in the sense that the coverage track is smooth and shows a visible increase in coverage after the fusion breakpoint.
+ However, the isoform with the highest coverage may not be the one that is involved in the fusion.
+ Often, genomic rearrangements lead to non-canonical isoforms being transcribed.
+ For this reason, it can make sense to rely on the transcript selection provided by the columns transcript_id1/2,
+ which reflect the actual isoforms involved in a fusion.
+ As a third option, the transcripts that are annotated as canonical can be drawn.
+ Transcript isoforms tagged with appris_principal, appris_candidate, or CCDS are considered canonical.
+
+
+
+
+
+
+ The fusion of interest is drawn as a solid line in the circos plot.
+ To give an impression of the overall degree of rearrangement,
+ all other fusions are drawn as semi-transparent lines in the background.
+ This option determines which other fusions should be included in the circos plot.
+ Values specify the minimum confidence a fusion must have to be included.
+ It usually makes no sense to include low-confidence fusions in circos plots,
+ because they are abundant and unreliable, and would clutter up the circos plot.
+ Default: medium
+
+
+
+
+
+
+
+ This option only applies to intergenic breakpoints.
+ If it is set to a value greater than 0, then the script draws the genes
+ which are no more than the given distance away from an intergenic breakpoint.
+ Note that this option is incompatible with squishIntrons.
+ Default: 0
+
+
+
+ Exons usually make up only a small fraction of a gene.
+ They may be hard to see in the plot.
+ Since introns are in most situations of no interest in the context of gene fusions,
+ this switch can be used to shrink the size of introns to a fixed, negligible size.
+ It makes sense to disable this feature, if breakpoints in introns are of importance.
+ Default: TRUE
+
+
+
+
+
+
+ Occasionally, domains are annotated redundantly.
+ For example, tyrosine kinase domains are frequently annotated as
+ Protein tyrosine kinase and Protein kinase domain.
+ In order to simplify the visualization, such domains can be merged into one,
+ given that they overlap by the given fraction.
+ The description of the larger domain is used.
+ Default: 0.9
+
+
+
+ By default the number of an exon is printed inside each exon,
+ which is taken from the attribute exon_number of the GTF annotation.
+ When a gene has many exons, the boxes may be too narrow to contain the labels,
+ resulting in unreadable exon labels. In these situations,
+ it may be better to turn off exon labels.
+ Default: TRUE
+
+
+
+
+
+ Whether light and shadow should be rendered to give objects a 3D effect.
+ Default: TRUE
+
+
+
+
+
+ By default, the script colorizes domains according to the colors
+ specified in the file given in --annotation.
+ This way, coloring of domains is consistent across all proteins.
+ But since there are more distinct domains than colors,
+ this can lead to different domains having the same color.
+ If this option is set to TRUE, the colors are recomputed for each fusion separately.
+ This ensures that the colors have the maximum distance for each individual fusion,
+ but they are no longer consistent across different fusions.
+ Default: FALSE
+
+
+
+
+
+
+
+
+
+
+
+
@@ -203,15 +596,16 @@