Mercurial > repos > artbio > mircounts
view mature_mir_gff_translation.py @ 15:ffcd42f85b61 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 5eb8570dce4e22fb2759cc16c8e1ce9d304508fe
author | artbio |
---|---|
date | Sat, 10 Feb 2024 17:15:04 +0000 |
parents | b045c30fb768 |
children |
line wrap: on
line source
"""Rewrite a miRBase GFF3 file in pre-miRNA-relative coordinates.

Each ``miRNA_primary_transcript`` feature is re-emitted with its own ``Name``
as sequence id and coordinates ``1..length``; its child ``miRNA`` features
(at most two, 5p first) follow, shifted by the same offset.
"""
import argparse
from datetime import datetime


def Parser():
    """Parse the command line.

    Returns:
        The argparse namespace, with attributes ``gff_path`` (input miRBase
        GFF3 file) and ``output`` (path of the GFF3 file to write).
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--gff_path', action="store", type=str,
        help="path to miRBase GFF3 file")
    the_parser.add_argument(
        '--output', action="store", type=str,
        help="output GFF3 file with converted mature mir coordinates")
    return the_parser.parse_args()


def _attribute_value(attributes, key):
    """Return the value following ``<key>=`` in a GFF3 attributes column.

    The value stops at the next ``;`` (or at the end of the column).

    Raises:
        IndexError: if ``<key>=`` does not occur in ``attributes``.
    """
    return attributes.split(key + "=")[1].split(";")[0]


def _read_gff_header(gff_input_file):
    """Return the input's ``#`` lines plus a generation-timestamp comment."""
    with open(gff_input_file, "r") as handle:
        header_lines = [line for line in handle if line.startswith('#')]
    # Single space before the timestamp: the previous literal carried a
    # backslash continuation inside the string, which leaked into the output.
    header_lines.append(
        '# generated by mature_mir_gff_translation.py %s\n#\n'
        % str(datetime.now()))
    return ''.join(header_lines)


def _parse_gff(gff_input_file):
    """Group the GFF3 records by primary transcript.

    Returns:
        A dict keyed by primary-transcript ID. Each value is a dict with
        ``premir_name`` (the primary's ``Name``), ``primary`` (its raw record
        line) and ``miRNAs`` (raw record lines of its children, file order).

    Raises:
        KeyError: if a miRNA refers to a parent ID that has not been read yet.
        IndexError: if a record has fewer than nine columns or lacks a
            required attribute.
    """
    # see https://github.com/ARTbio/tools-artbio/issues/246
    # currently fixed by perl pretreatment or the gff3 file
    gff_dict = {}
    with open(gff_input_file, "r") as handle:
        for line in handle:
            # rstrip instead of [:-1]: the last record may lack a newline.
            record = line.rstrip("\n")
            if not record.strip() or record.startswith("#"):
                continue
            gff_fields = record.split("\t")
            feature_id = _attribute_value(gff_fields[8], "ID")
            if gff_fields[2] == "miRNA_primary_transcript":
                gff_dict[feature_id] = {
                    "premir_name": _attribute_value(gff_fields[8], "Name"),
                    "primary": record,
                    "miRNAs": [],
                }
            elif gff_fields[2] == "miRNA":
                if "_" in feature_id:
                    continue
                # "erives_from" matches both Derives_from and derives_from.
                parent_id = _attribute_value(gff_fields[8], "erives_from")
                gff_dict[parent_id]["miRNAs"].append(record)
    return gff_dict


def _rebased_record(fields, origin, suffix=""):
    """Format one feature relative to ``origin``.

    The sequence id becomes the feature's ``Name``; start and end are shifted
    so that genomic position ``origin`` maps to 1. Strand is copied as is.
    ``suffix`` is appended to the attributes column.
    """
    return "\t".join([
        _attribute_value(fields[8], "Name"),
        fields[1],
        fields[2],
        str(int(fields[3]) - origin + 1),
        str(int(fields[4]) - origin + 1),
        fields[5],
        fields[6],
        fields[7],
        fields[8] + suffix,
    ])


def convert_and_print_gff(gff_input_file, output):
    """Convert ``gff_input_file`` and write the result to ``output``.

    Primary transcripts are emitted sorted by ``Name``. Features whose ID
    contains ``_`` are dropped. At most two child miRNAs are kept per
    primary; when there are two, they are swapped if the first record does
    not contain ``5p``. Each child gets a ``;Parent_mir_Name=<premir name>``
    attribute appended.

    Args:
        gff_input_file: path of the miRBase GFF3 file to read.
        output: path of the GFF3 file to write.
    """
    gff_dict = _parse_gff(gff_input_file)

    # Now reorganise features and recalculate coordinates of premirs and mirs
    gff_list = []
    for premir_id in sorted(
            gff_dict, key=lambda x: gff_dict[x]['premir_name']):
        # skip premir (and therefore their mir) with ID containing "_"
        if "_" in premir_id:
            continue
        entry = gff_dict[premir_id]
        primary_fields = entry["primary"].split('\t')
        origin = int(primary_fields[3])
        gff_list.append(_rebased_record(primary_fields, origin))
        # ensure there are only 2 child miRNAs at best
        mirs = entry["miRNAs"][:2]
        # sort child miRNAs 5p first 3p second,
        # if there are two miR mature at least !
        if len(mirs) > 1 and '5p' not in mirs[0]:
            mirs.reverse()
        mir_sfx = ";Parent_mir_Name=%s" % entry["premir_name"]
        for mir in mirs:
            gff_list.append(
                _rebased_record(mir.split('\t'), origin, mir_sfx))

    with open(output, "w") as out_handle:
        out_handle.write(_read_gff_header(gff_input_file))
        out_handle.write('\n'.join(gff_list))
        out_handle.write('\n')


def main(gff_path, outfile):
    """Run the conversion from ``gff_path`` to ``outfile``."""
    convert_and_print_gff(gff_path, outfile)


if __name__ == "__main__":
    args = Parser()
    main(args.gff_path, args.output)