comparison mature_mir_gff_translation.py @ 9:2a08a6eb471c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 6013aaf29ff7aa2d1aab434f2355da327c7ef102
author artbio
date Wed, 25 Apr 2018 12:48:27 -0400
parents 3f62272192f9
children de227b7307cf
comparison
equal deleted inserted replaced
8:3f62272192f9 9:2a08a6eb471c
11 help="output GFF3 file with converted mature mir coordinates") 11 help="output GFF3 file with converted mature mir coordinates")
12 args = the_parser.parse_args() 12 args = the_parser.parse_args()
13 return args 13 return args
14 14
15 15
16 def get_gff_header(gff_input_file): 16 def convert_and_print_gff(gff_input_file, output):
17 string_list = []
18 for line in open(gff_input_file, "r"):
19 if line[0] == '#':
20 string_list.append(line)
21 string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' %
22 str(datetime.now()))
23 return ''.join(string_list)
24 17
18 def get_gff_header(gff_input_file):
19 string_list = []
20 for line in open(gff_input_file, "r"):
21 if line[0] == '#':
22 string_list.append(line)
23 string_list.append('# generated by mature_mir_gff_translation.py \
24 %s\n#\n' % str(datetime.now()))
25 return ''.join(string_list)
25 26
26 def load_gff_in_dict(gff_input_file):
27 '''
28 Reads the gff3 file and return a dictionary of dictionaries
29 with keys equal to standard gff3 fields (9)
30 Note that the key of the primary dictionary is the ID
31 '''
32 gff_dict = {} 27 gff_dict = {}
28 # see https://github.com/ARTbio/tools-artbio/issues/246
29 # currently fixed by perl pretreatment or the gff3 file
33 for line in open(gff_input_file, "r"): 30 for line in open(gff_input_file, "r"):
34 if line[0] == "#": 31 if line[0] == "#":
35 continue 32 continue
36 gff_fields = line[:-1].split("\t") 33 gff_fields = line[:-1].split("\t")
37 ID = gff_fields[8].split("ID=")[1].split(";")[0] 34 ID = gff_fields[8].split("ID=")[1].split(";")[0]
38 gff_dict[ID] = {} 35 if gff_fields[2] == "miRNA_primary_transcript":
39 gff_dict[ID]["seqid"] = gff_fields[0] 36 gff_dict[ID] = {}
40 gff_dict[ID]["source"] = gff_fields[1] 37 gff_dict[ID]["premir_name"] = gff_fields[8].split(
41 gff_dict[ID]["type"] = gff_fields[2] 38 "Name=")[1].split(";")[0]
42 gff_dict[ID]["start"] = gff_fields[3] 39 gff_dict[ID]["primary"] = line[:-1]
43 gff_dict[ID]["end"] = gff_fields[4] 40 gff_dict[ID]["miRNAs"] = []
44 gff_dict[ID]["score"] = gff_fields[5] 41 elif gff_fields[2] == "miRNA":
45 gff_dict[ID]["strand"] = gff_fields[6] 42 parent_ID = gff_fields[8].split("erives_from=")[1]
46 gff_dict[ID]["phase"] = gff_fields[7] 43 gff_dict[parent_ID]["miRNAs"].append(line[:-1])
47 gff_dict[ID]["attributes"] = gff_fields[8] 44 # Now reorganise features and recalculate coordinates of premirs and mirs
48 if "erives_from" in gff_dict[ID]["attributes"]: 45 gff_list = []
49 parent_primary_transcript = gff_dict[ID]["attributes"].split( 46 for ID in sorted(gff_dict, key=lambda x: (gff_dict[x]['premir_name'])):
50 "erives_from=")[1] 47 # delete premir and their mir with ID containing "_"
51 parent_primary_transcript = gff_dict[parent_primary_transcript][ 48 if "_" in ID:
52 "attributes"].split("Name=")[1] 49 del gff_dict[ID]
53 gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % (
54 gff_dict[ID]["attributes"], parent_primary_transcript)
55 return gff_dict
56
57
58 def genome_to_mir_gff(gff_dict, output, header):
59 '''
60 Converts seqid field from chromosome to item Name
61 Then converts coordinates relative to "miRNA_primary_transcript"
62 Note that GFF files are 1-based coordinates
63 '''
64 for key in gff_dict:
65 name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
66 gff_dict[key]["seqid"] = name
67 if "erives_from=" in gff_dict[key]["attributes"]:
68 parent_ID = gff_dict[key]["attributes"].split(
69 "erives_from=")[1].split(";")[0]
70 gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int(
71 gff_dict[parent_ID]["start"])+1)
72 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
73 gff_dict[parent_ID]["start"])+1)
74 hairpins = {}
75 matures = {}
76 # treats miRNA_primary_transcript coordinates
77 # in a second loop to avoid errors in conversion
78 for key in gff_dict:
79 if gff_dict[key]["type"] == "miRNA_primary_transcript":
80 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
81 gff_dict[key]["start"]) + 1)
82 gff_dict[key]["start"] = '1'
83 # now, do a dict[ID]=Name but only for miRNA_primary_transcript
84 hairpins[key] = gff_dict[key]["attributes"].split(
85 "Name=")[1].split(
86 ";")[0]
87 else: 50 else:
88 matures[key] = gff_dict[key]["attributes"].split( 51 primary_fields = gff_dict[ID]["primary"].split('\t')
89 "Name=")[1].split( 52 seqid = primary_fields[8].split("Name=")[1].split(";")[0]
90 ";")[0] 53 source = primary_fields[1]
54 type = primary_fields[2]
55 start = primary_fields[3]
56 newstart = "1"
57 end = primary_fields[4]
58 newend = str(int(end)-int(start)+1)
59 score = primary_fields[5]
60 strand = primary_fields[6]
61 phase = primary_fields[7]
62 attributes = primary_fields[8]
63 gff_list.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (seqid,
64 source, type, newstart, newend, score, strand,
65 phase, attributes))
66 # ensure their is only 2 child miRNAs at best
67 if len(gff_dict[ID]["miRNAs"]) > 2:
68 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][:2]
69 # sort child miRNAs 5p first 3p second
70 if gff_dict[ID]["miRNAs"][0].find('5p') == -1:
71 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][::-1]
72 for mir in gff_dict[ID]["miRNAs"]:
73 mir_fields = mir.split('\t')
74 mir_seqid = mir_fields[8].split("Name=")[1].split(";")[0]
75 mir_source = mir_fields[1]
76 mir_type = mir_fields[2]
77 mir_start = mir_fields[3]
78 mir_end = mir_fields[4]
79 new_mir_start = str(int(mir_start)-int(start)+1)
80 new_mir_end = str(int(mir_end)-int(start)+1)
81 mir_score = mir_fields[5]
82 mir_strand = mir_fields[6]
83 mir_phase = mir_fields[7]
84 mir_attributes = mir_fields[8]
85 mir_sfx = ";Parent_mir_Name=%s" % gff_dict[ID]["premir_name"]
86 gff_list.append(
87 '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s' % (
88 mir_seqid, mir_source, mir_type,
89 new_mir_start, new_mir_end, mir_score,
90 mir_strand, mir_phase, mir_attributes,
91 mir_sfx))
91 with open(output, "w") as output: 92 with open(output, "w") as output:
92 output.write(header) 93 output.write('%s' % get_gff_header(gff_input_file))
93 for ID in sorted(hairpins, key=hairpins.get): 94 output.write('\n'.join(gff_list))
94 output.write("\t".join([gff_dict[ID]["seqid"], 95 output.write('\n')
95 gff_dict[ID]["source"], gff_dict[ID]["type"],
96 gff_dict[ID]["start"], gff_dict[ID]["end"],
97 gff_dict[ID]["score"], gff_dict[ID]["strand"],
98 gff_dict[ID]["phase"], gff_dict[ID]["attributes"]]))
99 output.write("\n")
100 for id in sorted(matures, key=matures.get, reverse=True):
101 if ID in gff_dict[id]["attributes"]:
102 output.write("\t".join([gff_dict[id]["seqid"],
103 gff_dict[id]["source"], gff_dict[id]["type"],
104 gff_dict[id]["start"], gff_dict[id]["end"],
105 gff_dict[id]["score"],
106 gff_dict[id]["strand"],
107 gff_dict[id]["phase"],
108 gff_dict[id]["attributes"]]))
109 output.write("\n")
110 96
111 97
112 def main(infile, outfile): 98 def main(infile, outfile):
113 gff_dict = load_gff_in_dict(infile) 99 convert_and_print_gff(infile, outfile)
114 genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile))
115 100
116 101
117 if __name__ == "__main__": 102 if __name__ == "__main__":
118 args = Parser() 103 args = Parser()
119 main(args.input, args.output) 104 main(args.input, args.output)