ena_webin_cli: process_input.py comparison

comparison process_input.py @ 3:7d751b5943b0 draft default tip

Uploaded

author	ieguinoa
date	Tue, 22 Feb 2022 11:03:34 +0000
parents	e25357392813
children

comparison

equal deleted inserted replaced

-:1ecd8ce07db4
+:7d751b5943b0
 import gzip
+import json
 import os
 import sys
 import shutil
 import yaml
-from Bio import SeqIO
+def get_section_string(f, start_line, end_line, return_string=False):
-"""
-Takes as input:
-1. A receipt obtained from ENA submission tool.
-A txt file that includes a YAML section with
-2. A fasta file with fasta entries ids defined after the files used for the raw submission.
-3. Path to write generated manifests
-4. Path to write generated fasta files
-5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
-"""
-def get_section_string(f, start_line, end_line):
 # consume starting lines
 start_string = iter(f.readline, start_line)
 start_string = ''.join(line for line in start_string)
 # read YAML lines
 yaml_string = iter(f.readline, end_line)
-return ''.join(x for x in yaml_string)
+if return_string:
+return ''.join(x for x in yaml_string)
+else:
+return [x for x in yaml_string]
+def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
+# fill experiment information (platform)  ****
+for index,exp in yaml_only_dict['ENA_experiment'].items():
+study_alias = exp['study_alias']
+sample_alias = exp['sample_alias']
+if study_alias in studies_samples_dict.keys():
+if sample_alias in studies_samples_dict[study_alias].keys():
+studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']})
+else:
+studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]}
+else:
+studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}}
+def load_receipt_data(input_file_path):
+# should do some health check of the input file?
+# load yaml section
+loaded_data = {}
+yaml_delimiter = 'YAML -------------\n'
+with open(input_file_path) as input_file:
+yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True))
+fill_from_yaml_data(yaml_only_section, loaded_data)
+# read study accessions
+study_delimiter = 'Study accession details:\n'
+end_line = '\n'
+with open(input_file_path) as input_file:
+studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line)
+# loaded_data['studies'] = {}
+for study_line in studies_accession_lines:
+if study_line != '\n':
+alias, accession, *_ = study_line.split('\t')
+try:
+loaded_data[alias]['accession'] = accession
+except KeyError:
+print(f"Experiment {exp} has unknown study or sample")
+# loaded_data['studies'][alias]['accession'] = accession
+samples_delimiter = 'Sample accession details:\n'
+with open(input_file_path) as input_file:
+samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line)
+## need to iterate over all studies, because here I don't know which study is the sample from.
+# loaded_data['samples'] = {}
+for sample_line in samples_accession_lines:
+if sample_line != '\n':
+alias, accession, *_ = sample_line.split('\t')
+for study in loaded_data.keys():
+if alias in loaded_data[study].keys():
+loaded_data[study][alias]['accession'] = accession
+break
+return loaded_data
+"""
+Takes as input:
+1. A receipt obtained from ENA submission tool:
+a txt file that contains sections describing submission details.
+2. A json file with the list of fasta that the user loaded
+3. Path to write generated manifests
+4. Manifest template path: the manifest with the global values set
+(e.g COVERAGE, MINGAPLENGHT..)
+"""
 def main():
-input_file = open(sys.argv[1])
+input_file_path = sys.argv[1]
-fasta_in = open(sys.argv[2])
+fasta_names_list_path = sys.argv[2]
 out_manifest_base = sys.argv[3]
-out_fasta_base = sys.argv[4]
+manifest_template = sys.argv[4]
-manifest_template = sys.argv[5]
+# load submitted data from receipt file
-yaml_delimiter = 'YAML -------------\n'
+data_dict = load_receipt_data(input_file_path)
-yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
+# iterate over the list of fasta files
-# print(yaml_only)
+with open(fasta_names_list_path, 'r') as fasta_files_json_file:
-submission_tuples_list = []
+fasta_files_list = json.load(fasta_files_json_file)
-# parse the sequence IDs
+with open('submit_list.tab', 'w') as written_manifests_out:
-for record in SeqIO.parse(fasta_in, "fasta"):
+for fasta_file in fasta_files_list:
-seq_id = record.id
+if fasta_file.endswith('.fasta.gz'):
-# need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name?
+sample_alias = fasta_file[:-9]
-#  ... but for paired it may not be the exact same
+else:
-# ...initially I should attempt to find a
+sample_alias = fasta_file[:-6]
-#  .. if this
+print(f'Processing {sample_alias}')
-# in any case, if I cant make the right match then I should do a kind of substring match
+found_metadata = False
+for study_alias in data_dict.keys():
+if sample_alias in data_dict[study_alias].keys():
+sample_accession = data_dict[study_alias][sample_alias]['accession']
+study_accession = data_dict[study_alias]['accession']
+### TODO get a string that concatenates plaform information from multiple exp
+platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform']
+manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
+with open(manifest_path, "w") as output_handle:
+# first dump the contents of manifest template
+# containing the global vars
+with open(manifest_template) as m_template:
+output_handle.write(m_template.read())
+output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
+output_handle.write("PLATFORM\t" + platform + "\n")
+output_handle.write("STUDY\t" + study_accession + "\n")
+output_handle.write("SAMPLE\t" + sample_accession + "\n")
+# files should be available in the corresponding dir and named:
+#  sample_alias.fasta.gz
+output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
+found_metadata = True
+written_manifests_out.write(manifest_path + '\n')
+break
+if not found_metadata:
+print(f'No metadata found for sample {sample_alias}')
-# find the exp_alias associated with the file
-exp_alias = None
-for index,run in yaml_only['ENA_run'].items():
-if run['file_name'] == seq_id:
-## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
-exp_alias = run['experiment_alias']
-break
-if not exp_alias:
-raise Exception("No run files match for the sequence entry {seq_id}")
-# find the sample and study for that experiment
-sample_alias = None
-study_alias = None
-for index,exp in yaml_only['ENA_experiment'].items():
-if exp['alias'] == exp_alias:
-sample_alias = exp['sample_alias']
-study_alias = exp['study_alias']
-platform = exp['platform']
-break
-if not sample_alias:
-raise Exception("No sample associated with experiment {exp_alias}")
-if not study_alias:
-raise Exception("No study associated with experiment {exp_alias}")
-# and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
-fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
-with open(fasta_path, "w") as output_handle:
-SeqIO.write([record], output_handle, "fasta")
-#gzip the file (required by ENA upload tool)
-fasta_path_gz = fasta_path + '.gz'
-with open(fasta_path, 'rb') as f_in:
-with gzip.open(fasta_path_gz, 'wb') as f_out:
-shutil.copyfileobj(f_in, f_out)
-# create the manifest
-# add to the manifest the:
-#
-manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
-with open(manifest_path, "w") as output_handle:
-# first dump the contents of manifest template
-# containing the global vars
-with open(manifest_template) as m_template:
-output_handle.write(m_template.read())
-output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
-output_handle.write("PLATFORM\t" + platform + "\n")
-output_handle.write("STUDY\t" + study_alias + "\n")
-output_handle.write("SAMPLE\t" + sample_alias + "\n")
-output_handle.write("FASTA\t" + fasta_path_gz + "\n")
-# ... and a dict  (or tuple list???) that contains for each study - sample  the name of the file that has the consensus sequence
-# ****  is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
-# in the latest case then I also need to parse the  Study accession details: and Sample accession details: entries
-# samples_dir[study][sample] = seq_id + '.fasta'
-submission_tuples_list.append((manifest_path, fasta_path))
-with open('submit_list.tab', "w") as output_handle:
-for submit_tuple in submission_tuples_list:
-output_handle.write('\t'.join(submit_tuple) + '\n')
-## DEBUG CASE
-#study details
-# start_study = 'Study accession details:\n'
-# empty_end = '\n'
-# study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
-# if len(study_data.split('\n')) > 2:
-# # more than 1 study accession
-# raise Exception("Multiple study accessions found")
-# out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
-# start_sample = 'Sample accession details:\n'
-# sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
-# if len(sample_data.split('\n')) > 2:
-# # more than 1 study accession
-# raise Exception("Multiple sample accessions found")
-# out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
-# platform = 'Ion Torrent'
-# out_manifest.write(f"PLATFORM\t{platform}\n")
-# out_manifest.close()
 if __name__ == '__main__':
 main()

Mercurial > repos > ieguinoa > ena_webin_cli

comparison process_input.py @ 3:7d751b5943b0 draft default tip