view process_input.py @ 0:e25357392813 draft

Uploaded
author ieguinoa
date Tue, 18 May 2021 16:30:52 +0000
parents
children 7d751b5943b0
line wrap: on
line source

import gzip
import os
import sys
import shutil
import yaml

from Bio import SeqIO


"""
Takes as input:
    1. A receipt obtained from ENA submission tool. 
    A txt file that includes a YAML section with 

    2. A fasta file with fasta entries ids defined after the files used for the raw submission.

    3. Path to write generated manifests
    4. Path to write generated fasta files
    5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)
"""

def get_section_string(f, start_line, end_line):
    # consume starting lines
    start_string = iter(f.readline, start_line)
    start_string = ''.join(line for line in start_string)
    # read YAML lines
    yaml_string = iter(f.readline, end_line)
    return ''.join(x for x in yaml_string)


def main():
    input_file = open(sys.argv[1])
    fasta_in = open(sys.argv[2])
    out_manifest_base = sys.argv[3] 
    out_fasta_base = sys.argv[4] 
    manifest_template = sys.argv[5]
    yaml_delimiter = 'YAML -------------\n'
    yaml_only = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter))
    # print(yaml_only)
    submission_tuples_list = []
    # parse the sequence IDs
    for record in SeqIO.parse(fasta_in, "fasta"):
        seq_id = record.id
        # need to map the seq ID to 1 or more seq files submitted: for single files these is the exact file name? 
        #  ... but for paired it may not be the exact same
        # ...initially I should attempt to find a 
        #  .. if this 
        # in any case, if I cant make the right match then I should do a kind of substring match

        # find the exp_alias associated with the file
        exp_alias = None
        for index,run in yaml_only['ENA_run'].items():
            if run['file_name'] == seq_id:
                ## TODO: match also cases when the seq entry name is == entry_[1|2].fastq.gz or something
                exp_alias = run['experiment_alias']
                break
        if not exp_alias:
            raise Exception("No run files match for the sequence entry {seq_id}")
        # find the sample and study for that experiment
        sample_alias = None
        study_alias = None
        for index,exp in yaml_only['ENA_experiment'].items():
            if exp['alias'] == exp_alias:
                sample_alias = exp['sample_alias']
                study_alias = exp['study_alias']
                platform = exp['platform']
                break
        if not sample_alias:
            raise Exception("No sample associated with experiment {exp_alias}")
        if not study_alias:
            raise Exception("No study associated with experiment {exp_alias}")


        # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)
        fasta_path = os.path.join(out_fasta_base, seq_id + '.fasta')
        with open(fasta_path, "w") as output_handle:
            SeqIO.write([record], output_handle, "fasta")
        #gzip the file (required by ENA upload tool)
        fasta_path_gz = fasta_path + '.gz'
        with open(fasta_path, 'rb') as f_in:
            with gzip.open(fasta_path_gz, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        # create the manifest
        # add to the manifest the: 
        # 
        manifest_path = os.path.join(out_manifest_base, seq_id + '.manifest.txt')
        with open(manifest_path, "w") as output_handle:
            # first dump the contents of manifest template
            # containing the global vars
            with open(manifest_template) as m_template:
                output_handle.write(m_template.read())
            output_handle.write("ASSEMBLYNAME\tconsensus_" + seq_id + "\n")
            output_handle.write("PLATFORM\t" + platform + "\n")
            output_handle.write("STUDY\t" + study_alias + "\n")
            output_handle.write("SAMPLE\t" + sample_alias + "\n")
            output_handle.write("FASTA\t" + fasta_path_gz + "\n")

        # ... and a dict  (or tuple list???) that contains for each study - sample  the name of the file that has the consensus sequence
        # ****  is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??
        # in the latest case then I also need to parse the  Study accession details: and Sample accession details: entries
        # samples_dir[study][sample] = seq_id + '.fasta'
        submission_tuples_list.append((manifest_path, fasta_path))

    with open('submit_list.tab', "w") as output_handle:
        for submit_tuple in submission_tuples_list:
            output_handle.write('\t'.join(submit_tuple) + '\n')
    ## DEBUG CASE
    #study details
    # start_study = 'Study accession details:\n'
    # empty_end = '\n'
    # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)
    # if len(study_data.split('\n')) > 2:
        # # more than 1 study accession
        # raise Exception("Multiple study accessions found")
    # out_manifest.write(f'STUDY\t{study_data.split()[1]}\n')
    # start_sample = 'Sample accession details:\n'
    # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)
    # if len(sample_data.split('\n')) > 2:
        # # more than 1 study accession
        # raise Exception("Multiple sample accessions found")
    # out_manifest.write(f'SAMPLE\t{sample_data.split()[1]}\n')
    # platform = 'Ion Torrent'
    # out_manifest.write(f"PLATFORM\t{platform}\n")
    # out_manifest.close()

if __name__ == '__main__':
    main()