changeset 0:3f13e9565679 draft

Uploaded
author dnbenso
date Mon, 24 Jan 2022 00:00:38 +0000
parents
children 03edd7b30f66
files .default-masurca-config.swp default-masurca-config masurca.xml test-data/illumina_reads_1.fastq test-data/illumina_reads_2.fastq test-data/nanopore_reads.fastq test-data/reference_genome.fasta
diffstat 7 files changed, 227 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file .default-masurca-config.swp has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/default-masurca-config	Mon Jan 24 00:00:38 2022 +0000
@@ -0,0 +1,63 @@
+DATA
+#Illumina paired end reads supplied as <two-character prefix> <fragment mean> <fragment stdev> <forward_reads> <reverse_reads>
+#if single-end, do not specify <reverse_reads>
+#MUST HAVE Illumina paired end reads to use MaSuRCA
+#PE= pe 500 50  /FULL_PATH/frag_1.fastq  /FULL_PATH/frag_2.fastq
+PE= pe MEAN STDDEV INPUTREAD1 INPUTREAD2
+#Illumina mate pair reads supplied as <two-character prefix> <fragment mean> <fragment stdev> <forward_reads> <reverse_reads>
+#JUMP= sh 3600 200  /FULL_PATH/short_1.fastq  /FULL_PATH/short_2.fastq
+#pacbio OR nanopore reads must be in a single fasta or fastq file with absolute path, can be gzipped
+#if you have both types of reads supply them both as NANOPORE type
+#PACBIO=/FULL_PATH/pacbio.fa
+#PACBIO=INPUTREADLONG
+#NANOPORE=/FULL_PATH/nanopore.fa
+#NANOPORE=INPUTREADLONG
+#Other reads (Sanger, 454, etc) one frg file, concatenate your frg files into one if you have many
+#OTHER=/FULL_PATH/file.frg
+#synteny-assisted assembly, concatenate all reference genomes into one reference.fa; works for Illumina-only data
+#REFERENCE=REF
+END
+
+PARAMETERS
+#PLEASE READ all comments to essential parameters below, and set the parameters according to your project
+#set this to 1 if your Illumina jumping library reads are shorter than 100bp
+EXTEND_JUMP_READS=0
+#this is the k-mer size for the de Bruijn graph; values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content
+GRAPH_KMER_SIZE = auto
+#set this to 1 for all Illumina-only assemblies
+#set this to 0 if you have more than 15x coverage by long reads (Pacbio or Nanopore) or any other long reads/mate pairs (Illumina MP, Sanger, 454, etc)
+USE_LINKING_MATES = 0
+#specifies whether to run the assembly on the grid
+USE_GRID=0
+#specifies grid engine to use SGE or SLURM
+GRID_ENGINE=SLURM
+#specifies queue (for SGE) or partition (for SLURM) to use when running on the grid MANDATORY
+GRID_QUEUE=defq
+#batch size in the amount of long read sequence for each batch on the grid
+GRID_BATCH_SIZE=500000000
+#use at most this much coverage by the longest Pacbio or Nanopore reads, discard the rest of the reads
+#can increase this to 30 or 35 if your reads are short (N50<7000bp)
+LHE_COVERAGE=25
+#set to 0 (default) to do two passes of mega-reads for slower, but higher quality assembly, otherwise set to 1
+MEGA_READS_ONE_PASS=0
+#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms 
+LIMIT_JUMP_COVERAGE = 300
+#these are the additional parameters to Celera Assembler.  do not worry about performance, number of processors or batch sizes -- these are computed automatically.
+#CABOG ASSEMBLY ONLY: set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms.
+CA_PARAMETERS =  cgwErrorRate=0.15
+#CABOG ASSEMBLY ONLY: whether to attempt to close gaps in scaffolds with Illumina  or long read data
+CLOSE_GAPS=1
+#number of cpus to use, set this to the number of CPUs/threads per node you will be using
+NUM_THREADS = GALAXY_SLOTS
+#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*20
+JF_SIZE = JELLYFISHSIZE
+#ILLUMINA ONLY. Set this to 1 to use SOAPdenovo contigging/scaffolding module.  
+#Assembly will be worse but will run faster. Useful for very large (>=8Gbp) genomes from Illumina-only data
+SOAP_ASSEMBLY=0
+#If you are doing Hybrid Illumina paired end + Nanopore/PacBio assembly ONLY (no Illumina mate pairs or OTHER frg files).  
+#Set this to 1 to use Flye assembler for final assembly of corrected mega-reads.  
+#A lot faster than CABOG, AND QUALITY IS THE SAME OR BETTER. 
+#Works well even when MEGA_READS_ONE_PASS is set to 1.  
+#DO NOT use if you have less than 15x coverage by long reads.
+FLYE_ASSEMBLY=0
+END 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/masurca.xml	Mon Jan 24 00:00:38 2022 +0000
@@ -0,0 +1,160 @@
+<tool id="masurca" name="MaSuRCA" version="@TOOL_VERSION@+galaxy0">
+    <description>The MaSuRCA (Maryland Super Read Cabog Assembler) genome assembly and analysis toolkit with config</description>
+    <macros>
+        <token name="@TOOL_VERSION@">4.0.6</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">masurca</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Work on a private copy of the bundled template. Every sed call below swaps
+        ## one placeholder (or one default) in config.txt for a user-supplied value.
+        cp '$__tool_directory__/default-masurca-config' config.txt &&
+        ## Long reads. When both technologies are selected they are concatenated and
+        ## passed as NANOPORE, as the template comments instruct.
+        ## NOTE(review): the link is named long.fastq.gz whatever the dataset format is
+        ## (fasta or fastq, gzipped or not); confirm MaSuRCA detects the format by content.
+        #if $nanopore_input.np_input == "Yes":
+            #if $pacbio_input.pb_input == "Yes":
+                cat '$nanopore_input.nano' '$pacbio_input.pacbio' > long.fastq.gz &&
+            #else:
+                ln -s '$nanopore_input.nano' long.fastq.gz &&
+            #end if
+            sed -i 's|#NANOPORE=INPUTREADLONG|NANOPORE=long.fastq.gz|' config.txt &&
+        #elif $pacbio_input.pb_input == "Yes":
+            ln -s '$pacbio_input.pacbio' long.fastq.gz &&
+            sed -i 's|#PACBIO=INPUTREADLONG|PACBIO=long.fastq.gz|' config.txt &&
+        #end if
+        ## Illumina reads fill the INPUTREAD1 / INPUTREAD2 placeholders of the PE line.
+        #if str( $illumina_input.input_type ) == "single"
+            ln -s '$illumina_input.fastq_input1' ill_1.fastq.gz &&
+            sed -i 's|INPUTREAD1|ill_1.fastq.gz|' config.txt &&
+            ## Single-end: the PE line must not name reverse reads, so drop the placeholder.
+            sed -i 's| INPUTREAD2||' config.txt &&
+        #elif str( $illumina_input.input_type ) == "paired"
+            ln -s '$illumina_input.fastq_input1' ill_1.fastq.gz &&
+            sed -i 's|INPUTREAD1|ill_1.fastq.gz|' config.txt &&
+            ln -s '$illumina_input.fastq_input2' ill_2.fastq.gz &&
+            sed -i 's|INPUTREAD2|ill_2.fastq.gz|' config.txt &&
+        #elif str( $illumina_input.input_type ) == "paired_collection"
+            ## A paired collection is one parameter; its mates are the .forward and
+            ## .reverse elements (this branch defines no fastq_input2).
+            ln -s '$illumina_input.fastq_input1.forward' ill_1.fastq.gz &&
+            sed -i 's|INPUTREAD1|ill_1.fastq.gz|' config.txt &&
+            ln -s '$illumina_input.fastq_input1.reverse' ill_2.fastq.gz &&
+            sed -i 's|INPUTREAD2|ill_2.fastq.gz|' config.txt &&
+        #end if
+        ## The reference parameter lives inside the reference_input conditional.
+        #if $reference_input.ref_input == "Yes":
+            sed -i 's|#REFERENCE=REF|REFERENCE=$reference_input.ref|' config.txt &&
+        #end if
+        ## Thread count comes from the job environment, falling back to 8.
+        sed -i 's|GALAXY_SLOTS|'\${GALAXY_SLOTS:-8}'|' config.txt &&
+        sed -i 's|MEAN|$mean|' config.txt &&
+        sed -i 's|STDDEV|$stddev|' config.txt &&
+        sed -i 's|JELLYFISHSIZE|$jfsize|' config.txt &&
+        sed -i 's|USE_LINKING_MATES = 0|USE_LINKING_MATES = $lnkmts|' config.txt &&
+        sed -i 's|FLYE_ASSEMBLY=0|FLYE_ASSEMBLY=$flye|' config.txt &&
+        ## masurca turns config.txt into assemble.sh, which performs the assembly.
+        masurca config.txt &&
+        bash assemble.sh
+    ]]></command>
+    <inputs>
+        <!-- Illumina reads are mandatory; long reads and a reference are optional.
+             Parameter names are referenced by the command template and must stay as they are. -->
+        <conditional name="illumina_input">
+            <param name="input_type" type="select" label="Illumina reads" help="Select between single-end, paired-end and paired collection">
+                <option value="single">Single</option>
+                <option value="paired">Paired</option>
+                <option value="paired_collection">Paired Collection</option>
+            </param>
+            <when value="single">
+                <param type="data" name="fastq_input1" format="fastqsanger,fastqsanger.gz"
+                    label="Select unpaired reads" help="Specify dataset with unpaired reads"/>
+            </when>
+            <when value="paired">
+                <param type="data" name="fastq_input1" format="fastqsanger,fastqsanger.gz"
+                    label="Select first set of reads" help="Specify dataset with forward reads"/>
+                <param type="data" name="fastq_input2" format="fastqsanger,fastqsanger.gz"
+                    label="Select second set of reads" help="Specify dataset with reverse reads"/>
+            </when>
+            <when value="paired_collection">
+                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz" type="data_collection" collection_type="paired" label="Select a paired collection" />
+            </when>
+        </conditional>
+        <param type="integer" name="mean" value="500" label="Mean size" help="Library insert average length" />
+        <param type="integer" name="stddev" value="50" label="Standard deviation"
+               help="Library insert standard deviation - if not known, set it to approximately 15% of the mean" />
+        <conditional name="nanopore_input">
+            <param name="np_input" type="select" label="Use Nanopore long reads" help="Optional Nanopore reads must be in a single fasta or fastq file">
+                <option value="No" selected="true">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No"/>
+            <when value="Yes">
+                <param type="data" name="nano" format="fastqsanger,fastqsanger.gz,fasta,fasta.gz" label="nanopore reads" />
+            </when>
+        </conditional>
+        <conditional name="pacbio_input">
+            <param name="pb_input" type="select" label="Use Pacbio long reads" help="Optional Pacbio reads must be in a single fasta or fastq file">
+                <option value="No" selected="true">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No"/>
+            <when value="Yes">
+                <param type="data" name="pacbio" format="fastqsanger,fastqsanger.gz,fasta,fasta.gz" label="pacbio reads" />
+            </when>
+        </conditional>
+        <conditional name="reference_input">
+            <param name="ref_input" type="select" label="Synteny-assisted assembly" help="Concatenate all reference genomes into one reference.fa; works for Illumina-only data">
+                <option value="No" selected="true">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No"/>
+            <when value="Yes">
+                <param type="data" name="ref" format="fasta,fasta.gz" label="Reference" />
+            </when>
+        </conditional>
+        <!-- The three help texts below follow the wording of the bundled config template. -->
+        <param type="integer" name="jfsize" value="20000000" label="Jellyfish hash size" help="A safe value is 20x the estimated genome size" />
+        <param type="boolean" name="flye" truevalue="1" falsevalue="0" label="Set this to use Flye assembler for final assembly of corrected mega-reads"
+            help="If you are doing Hybrid Illumina paired end + Nanopore/PacBio assembly ONLY (no Illumina mate pairs or OTHER frg files). DO NOT use if you have less than 15x coverage by long reads." />
+        <param type="boolean" name="lnkmts" truevalue="1" falsevalue="0" label="Include Linking Mates"
+            help="Most of the paired end reads end up in the same super read and thus are not passed to the assembler. Those that do not end up in the same super read are called &quot;linking mates&quot;. The best assembly results are achieved by enabling this option for Illumina-only assemblies. If you have more than 15x coverage by long reads (PacBio or Nanopore), leave it disabled." />
+    </inputs>
+    <outputs>
+        <!-- Result files collected from the job working directory. superReads is always
+             declared; the primary and alternative scaffold outputs are declared only when
+             the flye option is off, and flye_assembly only when it is on.
+             NOTE(review): three from_work_dir values contain a "*" wildcard; confirm that
+             Galaxy expands wildcards in from_work_dir. -->
+        <data name="superReads" format="fasta" from_work_dir="superReadSequences.named.fasta" label="${tool.name} on ${on_string}: named_superReads" />
+        <data name="scaffold_prm" format="fasta" from_work_dir="CA.mr.*/primary.genome.scf.fasta" label="${tool.name} on ${on_string}: primary_genome">
+            <filter>flye == False</filter>
+        </data>
+        <data name="scaffold_alt" format="fasta" from_work_dir="CA.mr.*/alternative.genome.scf.fasta" label="${tool.name} on ${on_string}: alternative_genome">
+            <filter>flye == False</filter>
+        </data>
+        <data name="flye_assembly" format="fasta" from_work_dir="flye.mr.*/assembly.fasta" label="${tool.name} on ${on_string}: flye_assembly">
+            <filter>flye == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Hybrid run: paired Illumina reads plus Nanopore reads and a reference, with the
+             flye option set. Only the superReads output is checked.
+             NOTE(review): the four test-data files added by this changeset each hold a wget
+             command line instead of sequence data; the real datasets have to be downloaded
+             in their place before this test can run. -->
+        <test>
+            <conditional name="illumina_input">
+                <param name="input_type" value="paired" />
+                <param name="fastq_input1" value="illumina_reads_1.fastq"/>
+                <param name="fastq_input2" value="illumina_reads_2.fastq"/>
+            </conditional>
+            <conditional name="nanopore_input">
+                <param name="np_input" value="Yes" />
+                <param name="nano" value="nanopore_reads.fastq" />
+            </conditional>
+            <conditional name="pacbio_input">
+                <param name="pb_input" value="No" />
+            </conditional>
+            <conditional name="reference_input">
+                <param name="ref_input" value="Yes" />
+                <param name="ref" value="reference_genome.fasta" />
+            </conditional>
+            <param name="mean" value="500" />
+            <param name="stddev" value="50" />
+            <param name="jfsize" value="80349460" />
+            <param name="flye" value="1" />
+            <param name="lnkmts" value="0" />
+            <output name="superReads" ftype="fasta">
+                <assert_contents>
+                   <has_line_matching expression="^GAAAGCCGTGGCTTGGAACGGTGCTGATTGATCCGGC.*"/> 
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+        This implementation of MaSuRCA uses a config file for more complicated
+        assemblies and to change default settings. Illumina reads (mandatory)
+        and long reads from PACBIO or Oxford Nanopore or both can be included.
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btt476</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/illumina_reads_1.fastq	Mon Jan 24 00:00:38 2022 +0000
@@ -0,0 +1,1 @@
+wget https://usegalaxy.org.au/datasets/5cec05905990af26/display?to_ext=fastqsanger -O illumina_reads_1.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/illumina_reads_2.fastq	Mon Jan 24 00:00:38 2022 +0000
@@ -0,0 +1,1 @@
+wget https://usegalaxy.org.au/datasets/abab1e6005ad1907/display?to_ext=fastqsanger -O illumina_reads_2.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nanopore_reads.fastq	Mon Jan 24 00:00:38 2022 +0000
@@ -0,0 +1,1 @@
+wget https://usegalaxy.org.au/datasets/0af5c63726eb74ba/display?to_ext=fastq -O nanopore_reads.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reference_genome.fasta	Mon Jan 24 00:00:38 2022 +0000
@@ -0,0 +1,1 @@
+wget https://usegalaxy.org.au/datasets/a902098a85e91ce7/display?to_ext=fasta -O reference_genome.fasta