view process_input.py @ 3:7d751b5943b0 draft default tip

Uploaded
author ieguinoa
date Tue, 22 Feb 2022 11:03:34 +0000
parents e25357392813
children
line wrap: on
line source

import gzip
import json
import os
import sys
import shutil
import yaml

def get_section_string(f, start_line, end_line, return_string=False):
    # consume starting lines
    start_string = iter(f.readline, start_line)
    start_string = ''.join(line for line in start_string)
    # read YAML lines
    yaml_string = iter(f.readline, end_line)
    if return_string:
        return ''.join(x for x in yaml_string)
    else:
        return [x for x in yaml_string]
  
def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
    # fill experiment information (platform)  **** 
    for index,exp in yaml_only_dict['ENA_experiment'].items():
        study_alias = exp['study_alias']
        sample_alias = exp['sample_alias']
        if study_alias in studies_samples_dict.keys():
            if sample_alias in studies_samples_dict[study_alias].keys():
                studies_samples_dict[study_alias][sample_alias]['experiments'].append({'platform': exp['platform']})
            else:
                studies_samples_dict[study_alias][sample_alias] = {'experiments': [{'platform': exp['platform']}]}
        else:
            studies_samples_dict[study_alias] = {sample_alias: {'experiments':[{'platform': exp['platform']}]}}


def load_receipt_data(input_file_path):
    # should do some health check of the input file?
    # load yaml section
    loaded_data = {} 
    yaml_delimiter = 'YAML -------------\n'
    with open(input_file_path) as input_file:
        yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True))
    fill_from_yaml_data(yaml_only_section, loaded_data)
    # read study accessions
    study_delimiter = 'Study accession details:\n'
    end_line = '\n'
    with open(input_file_path) as input_file:
        studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line)
    # loaded_data['studies'] = {}
    for study_line in studies_accession_lines:
        if study_line != '\n':
            alias, accession, *_ = study_line.split('\t')
            try:
                loaded_data[alias]['accession'] = accession
            except KeyError:
                print(f"Experiment {exp} has unknown study or sample")
            # loaded_data['studies'][alias]['accession'] = accession
    samples_delimiter = 'Sample accession details:\n'
    with open(input_file_path) as input_file:
        samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line)
        ## need to iterate over all studies, because here I don't know which study is the sample from.
    # loaded_data['samples'] = {}
    for sample_line in samples_accession_lines:
        if sample_line != '\n':
            alias, accession, *_ = sample_line.split('\t')
            for study in loaded_data.keys():
                if alias in loaded_data[study].keys():
                    loaded_data[study][alias]['accession'] = accession
                    break
    return loaded_data


"""
Takes as input:
    1. A receipt obtained from ENA submission tool: 
        a txt file that contains sections describing submission details.
    2. A json file with the list of fasta that the user loaded
    3. Path to write generated manifests
    4. Manifest template path: the manifest with the global values set 
        (e.g COVERAGE, MINGAPLENGHT..)
"""



def main():
    input_file_path = sys.argv[1]
    fasta_names_list_path = sys.argv[2]
    out_manifest_base = sys.argv[3] 
    manifest_template = sys.argv[4]
    # load submitted data from receipt file
    data_dict = load_receipt_data(input_file_path)
    # iterate over the list of fasta files
    with open(fasta_names_list_path, 'r') as fasta_files_json_file:
        fasta_files_list = json.load(fasta_files_json_file)
    with open('submit_list.tab', 'w') as written_manifests_out:
        for fasta_file in fasta_files_list:
            if fasta_file.endswith('.fasta.gz'):
                sample_alias = fasta_file[:-9]
            else:
                sample_alias = fasta_file[:-6]
            print(f'Processing {sample_alias}')
            found_metadata = False
            for study_alias in data_dict.keys():
                if sample_alias in data_dict[study_alias].keys():
                    sample_accession = data_dict[study_alias][sample_alias]['accession']
                    study_accession = data_dict[study_alias]['accession']
                    ### TODO get a string that concatenates plaform information from multiple exp
                    platform = data_dict[study_alias][sample_alias]['experiments'][0]['platform']
                    manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
                    with open(manifest_path, "w") as output_handle:
                        # first dump the contents of manifest template
                        # containing the global vars
                        with open(manifest_template) as m_template:
                            output_handle.write(m_template.read())
                        output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
                        output_handle.write("PLATFORM\t" + platform + "\n")
                        output_handle.write("STUDY\t" + study_accession + "\n")
                        output_handle.write("SAMPLE\t" + sample_accession + "\n")
                        # files should be available in the corresponding dir and named:
                        #  sample_alias.fasta.gz 
                        output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
                    found_metadata = True
                    written_manifests_out.write(manifest_path + '\n')
                    break
            if not found_metadata:
                print(f'No metadata found for sample {sample_alias}')


if __name__ == '__main__':
    main()