Mercurial > repos > ieguinoa > ena_webin_cli
changeset 0:e25357392813 draft
Uploaded
author | ieguinoa |
---|---|
date | Tue, 18 May 2021 16:30:52 +0000 |
parents | |
children | f24eb2f2cb0c |
files | .shed.yml ena_consensus_submit.xml process_input.py |
diffstat | 3 files changed, 298 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# .shed.yml -- new file in this changeset (diff: /dev/null -> b/.shed.yml, @@ -0,0 +1,12 @@)
# Repository metadata for the tool defined in ena_consensus_submit.xml.
categories:
  - Data Export
# Short one-line summary of the repository.
description: |
  Submits a genome assembly to the European Nucleotide Archive (ENA) using the Webin-CLI Submission tool.
# Longer description; the "raw data submission tool" is the one whose receipt
# the tool's "file" metadata mode parses.
long_description: |
  The program submits genome assembly file(s) and respective metadata to the European Nucleotide Archive (ENA).
  The metadata can be filled in the UI of the tool or extracted from the submission receipt of the raw data submission tool.
# NOTE(review): the changeset page lists this repository as "ena_webin_cli",
# while the name below is "ena_webin_upload" -- confirm which one is intended.
name: ena_webin_upload
owner: ieguinoa
# Upstream development repository.
remote_repository_url: https://github.com/ieguinoa/consensus_sequence_ena_galaxy
# Documentation of the wrapped Webin-CLI program.
homepage_url: https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html
type: unrestricted
<!-- ena_consensus_submit.xml: new file in this changeset (diff: /dev/null to b/ena_consensus_submit.xml, 159 lines added) -->
<tool id="ena_consensus_submit" name="Submit consensus sequence to ENA" version="0.1.0" python_template_version="3.5">
    <macros>
        <!-- Version of the wrapped ena-webin-cli package. -->
        <token name="@VERSION@">3.7.0</token>
    </macros>
    <requirements>
        <requirement type="package" version="@VERSION@">ena-webin-cli</requirement>
        <requirement type="package" version="1.76">biopython</requirement>
        <requirement type="package" version="5.3">pyyaml</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[

## ---------------------------------------------------------------------------
## 1. Resolve the Webin credentials.
##    First try the per-user preferences (rendered into the credentials
##    configfile); if they are empty, fall back to the GALAXY_ENA_SECRETS file.
## ---------------------------------------------------------------------------
webin_id=`grep 'username' $credentials | cut -f2,2`;
if [ "\$webin_id" = "" ]; then
    ## No credentials in user defined preferences
    ## Fallback to global defined credentials (if exist)
    #import os
    #if os.path.isfile(os.environ.get('GALAXY_ENA_SECRETS', '')):
        credentials_path=\${GALAXY_ENA_SECRETS};
        webin_id=`grep 'username' \$GALAXY_ENA_SECRETS | cut -d' ' -f2,2`;
        password=`grep 'password' \$GALAXY_ENA_SECRETS | cut -d' ' -f2,2`;
        if [ "\$webin_id" = "" ]; then
            echo "No global credentials defined. Check your GALAXY_ENA_SECRETS file or set your credentials via: User -> Preferences -> Manage Information";
            exit 1;
        fi;
    #else:
        echo "No ENA credentials defined. Set your credentials via: User -> Preferences -> Manage Information";
        exit 1;
    #end if
else
    password=`grep 'password' $credentials | cut -f2,2`;
fi;

## if it is still running at this point then the webin_id and password are set

## ---------------------------------------------------------------------------
## 2. Create a manifest base file with the parameters that are identical for
##    all sequences that will be submitted (if submitting a multifasta).
## ---------------------------------------------------------------------------
#set $manifest_base = 'manifest_base.tab'
mkdir manifests;
mkdir fasta;

echo -e "ASSEMBLY_TYPE\t$assembly_type" >> $manifest_base;
echo -e "COVERAGE\t$coverage" >> $manifest_base;
echo -e "PROGRAM\t$assembly_program" >> $manifest_base;
## min_gap_length is optional: only write the field when a value was given
#if str($min_gap_length).strip() != "":
echo -e "MINGAPLENGTH\t$min_gap_length" >> $manifest_base;
#end if
echo -e "MOLECULETYPE\t$molecule_type" >> $manifest_base;

## ---------------------------------------------------------------------------
## 3. Build the per-submission manifest(s).
## ---------------------------------------------------------------------------
#if $metadata_file_or_form.metadata_format == "file":
    ## process the input tables: this writes one manifest and one gzipped fasta
    ## per sequence, plus submit_list.tab listing the generated manifests
    python3 '$__tool_directory__/process_input.py' '$metadata_file_or_form.ena_receipt' '$genome_fasta' './manifests' './fasta' $manifest_base;
    center_name=`grep 'center_name' '$metadata_file_or_form.ena_receipt' | cut -f2,2 | tr -d '\n'`;
#else:
    #set $generated_manifest = './manifests/generated_manifest.txt'
    #set $study_id = $metadata_file_or_form.study_accession
    #set $sample_id = $metadata_file_or_form.sample_accession
    ## start from the shared fields, then APPEND the form values
    cp $manifest_base $generated_manifest;
    echo -e "STUDY\t$study_id" >> $generated_manifest;
    echo -e "SAMPLE\t$sample_id" >> $generated_manifest;
    center_name='$metadata_file_or_form.center_name';
    echo -e "ASSEMBLYNAME\t$metadata_file_or_form.assembly_name" >> $generated_manifest;
    echo -e "PLATFORM\t$metadata_file_or_form.sequencing_platform" >> $generated_manifest;
#end if

## ---------------------------------------------------------------------------
## 4. Run Webin-CLI (validate only when dry_run is set, submit otherwise).
## ---------------------------------------------------------------------------
#if $metadata_file_or_form.metadata_format == "file":
    ## iterate over the list of manifest - fasta generated by the process_input
    ## in case of errors, this list is empty
    while read line; do
        manifest=`echo \$line | cut -d' ' -f1,1`;
        ena-webin-cli
            -context genome
            -userName "\$webin_id"
            -password "\$password"
            -centerName "\$center_name"
            -manifest \$manifest
            -inputDir "./fasta"
            #if $submit_test == "true":
                -test
            #end if
            #if $dry_run == "true":
                -validate
            #else:
                -submit
            #end if
        ;
    done < submit_list.tab

#else:
    gzip -c '$genome_fasta' > consensus.fasta.gz;
    echo -e "FASTA\tconsensus.fasta.gz" >> $generated_manifest;
    ena-webin-cli
        -context genome
        -userName "\$webin_id"
        -password "\$password"
        -centerName "\$center_name"
        -manifest $generated_manifest
        -inputDir "./"
        #if $submit_test == "true":
            -test
        #end if
        #if $dry_run == "true":
            -validate
        #else:
            -submit
        #end if
#end if
]]></command>
    <configfiles>
        <!-- Renders the per-user Webin credentials; left empty when the user has none configured.
             NOTE(review): the command section splits these lines on a tab character (cut with its
             default delimiter); confirm that the "\t" written below is rendered as a real tab and
             that the surrounding quotes are wanted in the extracted values. -->
        <configfile name="credentials"><![CDATA[
#set $webin_id = $__user__.extra_preferences.get('ena_webin_account|webin_id', "").strip()
#set $password = $__user__.extra_preferences.get('ena_webin_account|password', "").strip()
#if $webin_id != "":
    username\t"$webin_id"
    password\t"$password"
#end if
        ]]></configfile>
    </configfiles>
    <inputs>
        <param name="submit_test" type="boolean" checked="false" label="Submit to test server" help="Suggested to test metadata format" />
        <param name="dry_run" type="boolean" checked="false" label="Validate files and metadata but do not submit" help="Generate input files and run Webin-CLI with -validate option."/>
        <param name="genome_fasta" type="data" label="Select the consensus sequence assembly file" format="fasta"/>
        <param name="assembly_type" type="select" label="Assembly type">
            <option value="clone">Clone</option>
            <option value="isolate">Isolate</option>
        </param>
        <param name="assembly_program" type="text" optional="False" label="Assembly program"/>
        <param name="molecule_type" type="select" label="Molecule type">
            <option value="genomic RNA" selected="True">genomic RNA</option>
            <option value="viral cRNA">viral cRNA</option>
            <option value="genomic DNA">genomic DNA</option>
        </param>
        <param name="min_gap_length" type="text" optional="True" label="Minimum gap length"/>
        <param name="coverage" type="float" optional="False" value="10000" label="Coverage"/>
        <!-- Study/sample metadata comes either from a raw-data submission receipt or from the form. -->
        <conditional name="metadata_file_or_form">
            <param name="metadata_format" type="select" label="Select the method to load study and sample metadata">
                <option value="file" selected="True">I used Galaxy ENA upload tool for raw data submission, parse my submission receipt</option>
                <option value="form">Fill in required metadata for linking</option>
            </param>
            <when value="file">
                <param type="data" format="txt" name="ena_receipt" label="Submission receipt obtained from ENA upload tool"/>
            </when>
            <when value="form">
                <param name="assembly_name" type="text" optional="False" label="Assembly name"/>
                <param name="study_accession" type="text" optional="False" label="Study accession or unique name (alias)"/>
                <param name="sample_accession" type="text" optional="False" label="Sample accession or unique name (alias)"/>
                <param name="sequencing_platform" type="text" optional="False" label="Sequencing platform"/>
                <param name="description" type="text" optional="True" value="" label="Description" help="Free text description of the genome assembly (optional)"/>
                <param name="center_name" type="text" optional="False" label="Center name"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <collection name="generated_manifests" type="list" label="Generated manifests">
            <!--<discover_datasets pattern="__name_and_ext__" directory="manifests" />-->
            <discover_datasets pattern="(?P<designation>.+)\.manifest\.txt$" ext="txt" directory="manifests" />
        </collection>
        <collection name="manifests_reports" type="list" label="Manifests reports">
            <!--[><discover_datasets pattern="__name_and_ext__" directory="manifests" /><]-->
            <discover_datasets pattern="(?P<designation>.+)\.manifest\.txt\.report$" ext="txt" directory="manifests" />
        </collection>
        <data name="webin_cli_report" label="ENA submission receipt" format="txt" from_work_dir="manifests/webin-cli.report"/>
    </outputs>
    <help><![CDATA[
**What it does**

Submits a consensus sequence (genome assembly) and its metadata to the
European Nucleotide Archive (ENA) using Webin-CLI.

**Credentials**

The Webin account is taken from *User -> Preferences -> Manage Information*.
If no account is set there, the file referenced by the ``GALAXY_ENA_SECRETS``
environment variable is used instead.

**Study and sample metadata**

- *Parse my submission receipt*: the receipt produced by the Galaxy ENA upload
  tool is parsed and one manifest is generated for every sequence of the input
  fasta file. The identifier of each sequence must match the name of a
  submitted run file.
- *Fill in required metadata*: a single manifest is generated from the values
  entered in the form and the whole fasta file is submitted with it.

**Options**

- *Submit to test server*: runs Webin-CLI with ``-test``.
- *Validate files and metadata but do not submit*: runs Webin-CLI with
  ``-validate`` instead of ``-submit``.
    ]]></help>
</tool>
# process_input.py -- new file in this changeset (diff: /dev/null -> b/process_input.py)
"""Generate one Webin-CLI manifest and one gzipped fasta file per sequence.

Command line arguments:
    1. A receipt obtained from the ENA submission tool: a txt file that
       includes a YAML section, enclosed between two ``YAML -------------``
       lines, holding the ``ENA_run`` and ``ENA_experiment`` entries.
    2. A fasta file whose entry ids are named after the files used for the
       raw data submission.
    3. Path to write the generated manifests.
    4. Path to write the generated fasta files.
    5. Manifest template path: the manifest with the global values already
       set (e.g. COVERAGE, MINGAPLENGTH...).

Besides the manifests and fasta files, ``submit_list.tab`` is written to the
current directory with one ``<manifest path>\\t<fasta path>`` line per sequence.
"""
import gzip
import os
import shutil
import sys

import yaml
from Bio import SeqIO


def get_section_string(f, start_line, end_line):
    """Return the text enclosed between two delimiter lines of an open file.

    Lines are read from the current position of ``f`` with ``f.readline``.
    Everything up to and including the first line equal to ``start_line`` is
    discarded; the following lines are collected until a line equal to
    ``end_line`` is read (that line is consumed but not returned).

    Raises:
        ValueError: if the end of the file is reached before either delimiter
            is found.
    """
    # iter(f.readline, '') stops at EOF; comparing against the delimiter as the
    # sentinel instead would spin forever on a file that lacks the delimiter.
    for line in iter(f.readline, ''):
        if line == start_line:
            break
    else:
        raise ValueError(f"Start delimiter {start_line!r} not found")

    section_lines = []
    for line in iter(f.readline, ''):
        if line == end_line:
            return ''.join(section_lines)
        section_lines.append(line)
    raise ValueError(f"End delimiter {end_line!r} not found")


def _find_experiment_alias(runs, seq_id):
    """Return the experiment alias of the first run whose file name is seq_id, else None."""
    # TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz
    # (paired files) or fall back to some kind of substring match.
    for run in runs.values():
        if run['file_name'] == seq_id:
            return run['experiment_alias']
    return None


def _find_experiment(experiments, exp_alias):
    """Return the first experiment entry whose alias is exp_alias, else None."""
    for experiment in experiments.values():
        if experiment['alias'] == exp_alias:
            return experiment
    return None


def _write_fasta_files(record, out_fasta_base, seq_id):
    """Write the record as <seq_id>.fasta plus a gzipped copy; return both paths."""
    fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
    with open(fasta_path, "w") as output_handle:
        SeqIO.write([record], output_handle, "fasta")
    # gzip the file (required by ENA upload tool)
    fasta_path_gz = fasta_path + '.gz'
    with open(fasta_path, 'rb') as f_in, gzip.open(fasta_path_gz, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    return fasta_path, fasta_path_gz


def _write_manifest(manifest_path, template_text, seq_id, platform,
                    study_alias, sample_alias, fasta_name):
    """Write the manifest: the shared template fields followed by the per-sequence ones."""
    with open(manifest_path, "w") as output_handle:
        output_handle.write(template_text)
        output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
        output_handle.write("PLATFORM\t" + platform + "\n")
        output_handle.write("STUDY\t" + study_alias + "\n")
        output_handle.write("SAMPLE\t" + sample_alias + "\n")
        output_handle.write("FASTA\t" + fasta_name + "\n")


def main():
    """Read the receipt and the fasta file and write manifests, fasta files and submit_list.tab.

    Raises:
        ValueError: if a sequence id matches no submitted run file, or if the
            matching experiment has no sample or study alias.
    """
    if len(sys.argv) < 6:
        sys.exit("Usage: process_input.py <receipt> <fasta> <manifests_dir> "
                 "<fasta_dir> <manifest_template>")
    (receipt_path, fasta_in_path, out_manifest_base,
     out_fasta_base, manifest_template) = sys.argv[1:6]

    yaml_delimiter = 'YAML -------------\n'
    with open(receipt_path) as input_file:
        yaml_only = yaml.safe_load(get_section_string(
            input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))

    # The template holds the fields shared by every manifest: read it once.
    with open(manifest_template) as m_template:
        template_text = m_template.read()

    submission_tuples_list = []
    with open(fasta_in_path) as fasta_in:
        for record in SeqIO.parse(fasta_in, "fasta"):
            seq_id = record.id

            # find the exp_alias associated with the file
            exp_alias = _find_experiment_alias(yaml_only['ENA_run'], seq_id)
            if not exp_alias:
                raise ValueError(f"No run files match for the sequence entry {seq_id}")

            # find the sample, study and platform for that experiment
            experiment = _find_experiment(yaml_only['ENA_experiment'], exp_alias)
            sample_alias = experiment['sample_alias'] if experiment else None
            study_alias = experiment['study_alias'] if experiment else None
            if not sample_alias:
                raise ValueError(f"No sample associated with experiment {exp_alias}")
            if not study_alias:
                raise ValueError(f"No study associated with experiment {exp_alias}")
            platform = experiment['platform']

            # create a fasta file for each sequence, named with the seq id
            fasta_path, fasta_path_gz = _write_fasta_files(record, out_fasta_base, seq_id)

            # The manifest references the fasta by file name only: the tool
            # runs Webin-CLI with -inputDir set to the fasta directory, so a
            # path that already includes that directory would be resolved twice.
            # NOTE(review): the unique aliases of the study and sample are used
            # here; confirm whether the accessions should be used instead.
            manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
            _write_manifest(manifest_path, template_text, seq_id, platform,
                            study_alias, sample_alias,
                            os.path.basename(fasta_path_gz))

            submission_tuples_list.append((manifest_path, fasta_path))

    # List of generated files, consumed by the tool wrapper to drive Webin-CLI.
    with open('submit_list.tab', "w") as output_handle:
        for submit_tuple in submission_tuples_list:
            output_handle.write('\t'.join(submit_tuple) + '\n')


if __name__ == '__main__':
    main()