changeset 0:e25357392813 draft

Uploaded
author ieguinoa
date Tue, 18 May 2021 16:30:52 +0000
parents
children f24eb2f2cb0c
files .shed.yml ena_consensus_submit.xml process_input.py
diffstat 3 files changed, 298 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Tue May 18 16:30:52 2021 +0000
@@ -0,0 +1,12 @@
+categories:
+    - Data Export
+description: |
+    Submits a genome assembly to the European Nucleotide Archive (ENA) using the Webin-CLI Submission tool.  
+long_description: |
+    The program submits genome assembly file(s) and respective metadata to the European Nucleotide Archive (ENA). 
+    The metadata can be filled in the UI of the tool or extracted from the submission receipt of the raw data submission tool.
+name: ena_webin_upload
+owner: ieguinoa
+remote_repository_url: https://github.com/ieguinoa/consensus_sequence_ena_galaxy
+homepage_url: https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html
+type: unrestricted
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ena_consensus_submit.xml	Tue May 18 16:30:52 2021 +0000
@@ -0,0 +1,159 @@
+<tool id="ena_consensus_submit" name="Submit consensus sequence to ENA" version="0.1.0" python_template_version="3.5">
+    <macros>
+        <token name="@VERSION@">3.7.0</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@VERSION@">ena-webin-cli</requirement>
+        <requirement type="package" version="1.76">biopython</requirement>
+        <requirement type="package" version="5.3">pyyaml</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+
+## Resolve the Webin credentials: the per-user preferences (rendered into the
+## credentials configfile) come first, the GALAXY_ENA_SECRETS file is the fallback.
+webin_id=`grep 'username' $credentials | cut -f2,2`;
+if [ "\$webin_id" = "" ]; then
+  ## No credentials in user defined preferences
+  ## Fallback to global defined credentials (if exist)
+  #import os
+  #if os.path.isfile(os.environ.get('GALAXY_ENA_SECRETS', '')):
+      credentials_path=\${GALAXY_ENA_SECRETS};
+      webin_id=`grep 'username' \$GALAXY_ENA_SECRETS | cut -d' ' -f2,2`;
+      password=`grep 'password' \$GALAXY_ENA_SECRETS | cut -d' ' -f2,2`;
+      if [ "\$webin_id" = "" ]; then
+          echo "No global credentials defined. Check your GALAXY_ENA_SECRETS file or set your credentials via: User -> Preferences -> Manage Information";
+          exit 1;
+      fi;
+  #else:
+      echo "No ENA credentials defined. Set your credentials via: User -> Preferences -> Manage Information";
+      exit 1;
+  #end if
+else
+  password=`grep 'password' $credentials | cut -f2,2`;
+fi;
+
+## if it is still running at this point then the webin_id and password are set
+
+## create a base manifest file with the parameters that are identical for all sequences that will be submitted (if submitting a multifasta)
+#set $manifest_base = 'manifest_base.tab'
+mkdir manifests;
+mkdir fasta;
+
+echo -e "ASSEMBLY_TYPE\t$assembly_type" >> $manifest_base;
+echo -e "COVERAGE\t$coverage" >> $manifest_base;
+echo -e "PROGRAM\t$assembly_program" >> $manifest_base;
+echo -e "MINGAPLENGTH\t$min_gap_length" >> $manifest_base;
+echo -e "MOLECULETYPE\t$molecule_type" >> $manifest_base;
+
+#if $metadata_file_or_form.metadata_format == "file":
+    ## process the input tables: writes one manifest and one fasta per sequence, plus submit_list.tab
+    python3 '$__tool_directory__/process_input.py' '$metadata_file_or_form.ena_receipt' '$genome_fasta' './manifests' './fasta' $manifest_base;
+    center_name=`grep 'center_name' '$metadata_file_or_form.ena_receipt' | cut -f2,2 | tr -d '\n'`;
+#else:
+    ## single manifest: start from the shared base and append the values entered in the form
+    #set $generated_manifest='./manifests/generated_manifest.txt'
+    #set $study_id = $metadata_file_or_form.study_accession
+    #set $sample_id = $metadata_file_or_form.sample_accession
+    cp $manifest_base $generated_manifest;
+    echo -e "STUDY\t$study_id" >> $generated_manifest;
+    echo -e "SAMPLE\t$sample_id" >> $generated_manifest;
+    center_name='$metadata_file_or_form.center_name';
+    ## same field name as the one written by process_input.py
+    echo -e "ASSEMBLYNAME\t$metadata_file_or_form.assembly_name" >> $generated_manifest;
+    echo -e "PLATFORM\t$metadata_file_or_form.sequencing_platform" >> $generated_manifest;
+    #if str($metadata_file_or_form.description).strip():
+        echo -e "DESCRIPTION\t$metadata_file_or_form.description" >> $generated_manifest;
+    #end if
+#end if
+
+
+#if $metadata_file_or_form.metadata_format == "file":
+    ## iterate over the list of manifest - fasta generated by the process_input
+    ## in case of errors, this list is empty
+    while read line; do
+        manifest=`echo \$line | cut -d' ' -f1,1`;
+        ena-webin-cli
+        -context genome
+        -userName "\$webin_id"
+        -password "\$password"
+        -centerName "\$center_name"
+        -manifest \$manifest
+        -inputDir "./fasta"
+        ## same server / mode switches as the form branch below
+        #if $submit_test == "true":
+            -test
+        #end if
+        #if $dry_run == "true":
+            -validate
+        #else:
+            -submit
+        #end if
+        ;
+    done < submit_list.tab
+
+#else:
+    gzip -c '$genome_fasta' > consensus.fasta.gz;
+    echo -e "FASTA\tconsensus.fasta.gz" >> $generated_manifest;
+    ena-webin-cli
+    -context genome
+    -userName "\$webin_id"
+    -password "\$password"
+    ## double quotes: the shell must expand the variable and keep a multi-word name as one argument
+    -centerName "\$center_name"
+    -manifest $generated_manifest
+    -inputDir "./"
+    #if $submit_test == "true":
+        -test
+    #end if
+    #if $dry_run == "true":
+        -validate
+    #else:
+        -submit
+    #end if
+#end if
+]]></command>
+    <configfiles>
+        <!-- Rendered from the user's "ena_webin_account" extra preferences; left empty when no
+             webin_id is set. The command block greps this file for 'username' and 'password'. -->
+        <configfile name="credentials"><![CDATA[
+#set $webin_id = $__user__.extra_preferences.get('ena_webin_account|webin_id', "").strip()
+#set $password = $__user__.extra_preferences.get('ena_webin_account|password', "").strip()
+#if $webin_id != "":
+    username\t"$webin_id"
+    password\t"$password"
+#end if
+        ]]></configfile>
+    </configfiles>
+<inputs>
+    <!-- Submission mode switches; both are off unless the user ticks them -->
+    <param name="submit_test" type="boolean" checked="false" label="Submit to test server" help="Suggested to test metadata format" />
+    <param name="dry_run" type="boolean" checked="false" label="Validate files and metadata but do not submit" help="Generate input files and run Webin-CLI with -validate option."/>
+    <param name="genome_fasta" type="data" label="Select the consensus sequence assembly file" format="fasta"/>
+    <!-- Values shared by every generated manifest (written to the base manifest by the command block) -->
+    <param name="assembly_type" type="select" label="Assembly type">
+        <option value="clone">Clone</option>
+        <option value="isolate">Isolate</option>
+    </param>
+    <param name="assembly_program" type="text" optional="False" label="Assembly program"/>
+    <param name="molecule_type" type="select" label="Molecule type">
+        <option value="genomic RNA" selected="True">genomic RNA</option>
+        <option value="viral cRNA">viral cRNA</option>
+        <option value="genomic DNA">genomic DNA</option>
+    </param>
+    <param name="min_gap_length" type="text" optional="True" label="Minimum gap length"/>
+    <param name="coverage" type="float" optional="False" value="10000" label="Coverage"/>
+    <!-- Study / sample metadata: parsed from a previous submission receipt or typed in by hand -->
+    <conditional name="metadata_file_or_form">
+        <param name="metadata_format" type="select" label="Select the method to load study and sample metadata">
+            <option value="file" selected="True">I used Galaxy ENA upload tool for raw data submission, parse my submission receipt</option>
+            <option value="form">Fill in required metadata for linking</option>
+        </param>
+        <when value="file">
+            <param type="data" format="txt" name="ena_receipt" label="Submission receipt obtained from ENA upload tool"/>
+        </when>
+        <when value="form">
+            <param name="assembly_name" type="text" optional="False" label="Assembly name"/>
+            <param name="study_accession" type="text" optional="False" label="Study accession or unique name (alias)"/>
+            <param name="sample_accession" type="text" optional="False" label="Sample accession or unique name (alias)"/>
+            <param name="sequencing_platform" type="text" optional="False" label="Sequencing platform"/>
+            <param name="description" type="text" optional="True" value="" label="Description" help="Free text description of the genome assembly (optional)"/>
+            <param name="center_name" type="text" optional="False" label="Center name"/>
+        </when>
+    </conditional>
+    </inputs>
+    <outputs>
+        <!-- Files in the manifests directory named <designation>.manifest.txt -->
+        <collection name="generated_manifests" type="list" label="Generated manifests">
+            <!--<discover_datasets pattern="__name_and_ext__" directory="manifests" />-->
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.manifest\.txt$" ext="txt" directory="manifests" />
+        </collection>
+        <!-- Files in the manifests directory named <designation>.manifest.txt.report -->
+        <collection name="manifests_reports" type="list" label="Manifests reports">
+            <!--[><discover_datasets pattern="__name_and_ext__" directory="manifests" /><]-->
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.manifest\.txt\.report$" ext="txt" directory="manifests" />
+        </collection>
+        <!-- Single report file picked up from the job working directory -->
+        <data name="webin_cli_report" label="ENA submission receipt" format="txt" from_work_dir="manifests/webin-cli.report"/>
+    </outputs>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/process_input.py	Tue May 18 16:30:52 2021 +0000
@@ -0,0 +1,127 @@
+import gzip
+import os
+import sys
+import shutil
+import yaml
+
+from Bio import SeqIO
+
+
+"""
+Takes as input:
+    1. A receipt obtained from the ENA submission tool.
+    A txt file that includes a YAML section with the submitted runs (ENA_run) and experiments (ENA_experiment).
+
+    2. A fasta file with fasta entries ids defined after the files used for the raw submission.
+
+    3. Path to write generated manifests
+    4. Path to write generated fasta files
+    5. manifest template path: the manifest with the global values set (e.g. COVERAGE, MINGAPLENGTH..)
+"""
+
+def get_section_string(f, start_line, end_line):
+    """Return the text found between two delimiter lines of an open file.
+
+    Lines are read from the current position of ``f``. Everything up to and
+    including the first line equal to ``start_line`` is discarded, then the
+    following lines are collected until a line equal to ``end_line`` is read.
+    Neither delimiter is part of the result. Delimiters are compared against
+    whole lines, trailing newline included.
+
+    Raises:
+        ValueError: if the end of the file is reached before ``start_line``
+            or ``end_line`` is found.
+    """
+    # readline() returns '' at EOF, so '' is the sentinel that ends each scan;
+    # using a delimiter as the sentinel would never stop on a file without it
+    for line in iter(f.readline, ''):
+        if line == start_line:
+            break
+    else:
+        raise ValueError("Start delimiter {!r} not found".format(start_line))
+    section_lines = []
+    for line in iter(f.readline, ''):
+        if line == end_line:
+            return ''.join(section_lines)
+        section_lines.append(line)
+    raise ValueError("End delimiter {!r} not found".format(end_line))
+
+
+def main():
+    """Create one Webin-CLI manifest and one gzipped FASTA file per input sequence.
+
+    Command-line arguments (``sys.argv[1:]``):
+        1. receipt of the raw data submission; its YAML section is enclosed
+           between two ``YAML -------------`` lines
+        2. FASTA file whose record ids equal the ``file_name`` of a submitted run
+        3. directory where the manifests are written
+        4. directory where the per-sequence FASTA files are written
+        5. manifest template holding the values shared by all sequences
+
+    A ``submit_list.tab`` file is also written to the current working directory,
+    with one ``manifest path<TAB>fasta path`` line per sequence.
+
+    Raises:
+        Exception: if a sequence id matches no run, or if the matching
+            experiment provides no sample or study alias.
+    """
+    receipt_path = sys.argv[1]
+    fasta_in_path = sys.argv[2]
+    out_manifest_base = sys.argv[3]
+    out_fasta_base = sys.argv[4]
+    manifest_template = sys.argv[5]
+    yaml_delimiter = 'YAML -------------\n'
+    # context managers make sure the input handles are closed again
+    with open(receipt_path) as input_file:
+        yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
+    # the template with the global vars is identical for every manifest: read it once
+    with open(manifest_template) as m_template:
+        manifest_header = m_template.read()
+    submission_tuples_list = []
+    with open(fasta_in_path) as fasta_in:
+        for record in SeqIO.parse(fasta_in, "fasta"):
+            seq_id = record.id
+            # find the exp_alias associated with the file
+            # TODO: only exact matches of the run file name are supported; paired files
+            # (entry_[1|2].fastq.gz or similar) will need a looser (substring) match
+            exp_alias = None
+            for run in yaml_only['ENA_run'].values():
+                if run['file_name'] == seq_id:
+                    exp_alias = run['experiment_alias']
+                    break
+            if not exp_alias:
+                raise Exception("No run files match for the sequence entry {}".format(seq_id))
+            # find the sample, study and platform for that experiment
+            sample_alias = None
+            study_alias = None
+            platform = None
+            for exp in yaml_only['ENA_experiment'].values():
+                if exp['alias'] == exp_alias:
+                    sample_alias = exp['sample_alias']
+                    study_alias = exp['study_alias']
+                    platform = exp['platform']
+                    break
+            if not sample_alias:
+                raise Exception("No sample associated with experiment {}".format(exp_alias))
+            if not study_alias:
+                raise Exception("No study associated with experiment {}".format(exp_alias))
+
+            # create a fasta file for each sequence, named after the seq id
+            fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
+            with open(fasta_path, "w") as output_handle:
+                SeqIO.write([record], output_handle, "fasta")
+            # gzip the file (required by ENA upload tool)
+            fasta_path_gz = fasta_path + '.gz'
+            with open(fasta_path, 'rb') as f_in:
+                with gzip.open(fasta_path_gz, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            # create the manifest: global values first, then the per-sequence fields
+            manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
+            with open(manifest_path, "w") as output_handle:
+                output_handle.write(manifest_header)
+                output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
+                output_handle.write("PLATFORM\t" + platform + "\n")
+                # TODO: study and sample are referenced by their alias; check whether the
+                # accessions from the receipt should be used instead
+                output_handle.write("STUDY\t" + study_alias + "\n")
+                output_handle.write("SAMPLE\t" + sample_alias + "\n")
+                # NOTE(review): this path includes out_fasta_base; confirm how it combines
+                # with the -inputDir passed to ena-webin-cli
+                output_handle.write("FASTA\t" + fasta_path_gz + "\n")
+
+            submission_tuples_list.append((manifest_path, fasta_path))
+
+    with open('submit_list.tab', "w") as output_handle:
+        for submit_tuple in submission_tuples_list:
+            output_handle.write('\t'.join(submit_tuple) + '\n')
+    ## DEBUG CASE
+    #study details
+    # start_study = 'Study accession details:\n'
+    # empty_end = '\n'
+    # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
+    # if len(study_data.split('\n')) > 2:
+        # # more than 1 study accession
+        # raise Exception("Multiple study accessions found")
+    # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
+    # start_sample = 'Sample accession details:\n'
+    # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
+    # if len(sample_data.split('\n')) > 2:
+        # # more than 1 study accession
+        # raise Exception("Multiple sample accessions found")
+    # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
+    # platform = 'Ion Torrent'
+    # out_manifest.write(f"PLATFORM\t{platform}\n")
+    # out_manifest.close()
+
+if __name__ == '__main__':
+    main()