view mature_mir_gff_translation.py @ 15:ffcd42f85b61 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 5eb8570dce4e22fb2759cc16c8e1ce9d304508fe
author artbio
date Sat, 10 Feb 2024 17:15:04 +0000
parents b045c30fb768
children
line wrap: on
line source

import argparse
from datetime import datetime


def Parser():
    """Parse the command line and return the resulting namespace.

    Two string options are recognised:

    * ``--gff_path``: path to the miRBase GFF3 file to read.
    * ``--output``: path of the GFF3 file to write.

    Both are optional for argparse, so a missing option is returned as None.
    """
    cli = argparse.ArgumentParser()
    # (flag, help text) pairs; every option is a plain stored string.
    option_table = (
        ('--gff_path', "path to miRBase GFF3 file"),
        ('--output',
         "output GFF3 file with converted mature mir coordinates"),
    )
    for flag, description in option_table:
        cli.add_argument(flag, action="store", type=str, help=description)
    return cli.parse_args()


def _gff_attribute(attributes, key):
    """Return the value following ``key=`` in a GFF3 attributes column.

    The value stops at the next ';' (or at the end of the column).
    Raises IndexError when ``key=`` does not occur in ``attributes``.
    """
    return attributes.split(key + "=")[1].split(";")[0]


def _format_gff_row(fields, seqid, start, end, suffix=""):
    """Rebuild a 9-column GFF row with a new seqid, start and end.

    Columns 2, 3, 6, 7, 8 and 9 are copied from ``fields``; ``suffix`` is
    appended verbatim to the attributes column.
    """
    return "\t".join([
        seqid, fields[1], fields[2], start, end,
        fields[5], fields[6], fields[7], fields[8] + suffix,
    ])


def convert_and_print_gff(gff_input_file, output):
    """Rewrite a miRBase GFF3 file in pre-miRNA-relative coordinates.

    Each ``miRNA_primary_transcript`` feature is emitted with its own Name
    as seqid and coordinates ``1..length``.  Its child ``miRNA`` features
    (at most two, the one mentioning '5p' first) follow, with their Name as
    seqid, coordinates shifted so that the parent starts at 1, and a
    ``;Parent_mir_Name=<parent Name>`` attribute appended.  Features whose
    ID contains "_" are dropped.  Parents are written sorted by Name.

    The output starts with the comment lines of the input, followed by a
    "generated by" line carrying the current timestamp.

    Args:
        gff_input_file: path of the GFF3 file to read.
        output: path of the GFF3 file to write.

    Raises:
        IndexError: a feature line has fewer than 9 columns or lacks a
            required ``ID=``/``Name=``/``Derives_from=`` attribute.
        KeyError: a miRNA refers to a parent that has not been read yet.
        ValueError: a start/end column is not an integer.
    """
    header_lines = []
    premirs = {}
    # see https://github.com/ARTbio/tools-artbio/issues/246
    # currently fixed by perl pretreatment of the gff3 file
    with open(gff_input_file, "r") as gff_handle:
        for raw_line in gff_handle:
            # Strip only the line terminator: the last line of a file may
            # have none, and slicing off one character would corrupt it.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue  # blank lines carry no feature
            if line[0] == "#":
                header_lines.append(line + "\n")
                continue
            fields = line.split("\t")
            feature_id = _gff_attribute(fields[8], "ID")
            if fields[2] == "miRNA_primary_transcript":
                premirs[feature_id] = {
                    "premir_name": _gff_attribute(fields[8], "Name"),
                    "primary": fields,
                    "miRNAs": [],
                }
            elif fields[2] == "miRNA":
                if "_" in feature_id:
                    continue
                # "erives_from" matches both Derives_from and derives_from;
                # stop at ';' so trailing attributes are not part of the ID.
                parent_id = _gff_attribute(fields[8], "erives_from")
                premirs[parent_id]["miRNAs"].append(line)

    # Now reorganise features and recalculate coordinates of premirs and mirs
    gff_list = []
    for premir_id in sorted(premirs,
                            key=lambda x: premirs[x]["premir_name"]):
        # skip premirs (and therefore their mirs) with ID containing "_"
        if "_" in premir_id:
            continue
        premir = premirs[premir_id]
        primary_fields = premir["primary"]
        premir_name = premir["premir_name"]
        premir_start = int(primary_fields[3])
        premir_length = int(primary_fields[4]) - premir_start + 1
        gff_list.append(_format_gff_row(
            primary_fields, premir_name, "1", str(premir_length)))
        # ensure there are only 2 child miRNAs at best
        children = premir["miRNAs"][:2]
        # sort child miRNAs 5p first 3p second,
        # if there are two miR mature at least !
        if len(children) > 1 and children[0].find('5p') == -1:
            children = children[::-1]
        mir_suffix = ";Parent_mir_Name=%s" % premir_name
        for mir in children:
            mir_fields = mir.split("\t")
            gff_list.append(_format_gff_row(
                mir_fields,
                _gff_attribute(mir_fields[8], "Name"),
                str(int(mir_fields[3]) - premir_start + 1),
                str(int(mir_fields[4]) - premir_start + 1),
                mir_suffix))

    with open(output, "w") as out_handle:
        out_handle.write("".join(header_lines))
        # Single-line literal: a backslash continuation inside the string
        # would embed the source indentation in the written header.
        out_handle.write(
            '# generated by mature_mir_gff_translation.py %s\n#\n'
            % str(datetime.now()))
        out_handle.write('\n'.join(gff_list))
        out_handle.write('\n')


def main(gff_path, outfile):
    """Convert the GFF3 file at ``gff_path`` and write it to ``outfile``.

    Thin entry point that forwards both paths to convert_and_print_gff.
    """
    convert_and_print_gff(gff_input_file=gff_path, output=outfile)


# Script entry point: parse the command line, then run the conversion
# with the --gff_path and --output values.
if __name__ == "__main__":
    args = Parser()
    main(args.gff_path, args.output)