Mercurial > repos > artbio > mircounts
comparison mature_mir_gff_translation.py @ 9:2a08a6eb471c draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 6013aaf29ff7aa2d1aab434f2355da327c7ef102
author | artbio |
---|---|
date | Wed, 25 Apr 2018 12:48:27 -0400 |
parents | 3f62272192f9 |
children | de227b7307cf |
comparison
equal
deleted
inserted
replaced
8:3f62272192f9 | 9:2a08a6eb471c |
---|---|
11 help="output GFF3 file with converted mature mir coordinates") | 11 help="output GFF3 file with converted mature mir coordinates") |
12 args = the_parser.parse_args() | 12 args = the_parser.parse_args() |
13 return args | 13 return args |
14 | 14 |
15 | 15 |
16 def get_gff_header(gff_input_file): | 16 def convert_and_print_gff(gff_input_file, output): |
17 string_list = [] | |
18 for line in open(gff_input_file, "r"): | |
19 if line[0] == '#': | |
20 string_list.append(line) | |
21 string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' % | |
22 str(datetime.now())) | |
23 return ''.join(string_list) | |
24 | 17 |
18 def get_gff_header(gff_input_file): | |
19 string_list = [] | |
20 for line in open(gff_input_file, "r"): | |
21 if line[0] == '#': | |
22 string_list.append(line) | |
23 string_list.append('# generated by mature_mir_gff_translation.py \ | |
24 %s\n#\n' % str(datetime.now())) | |
25 return ''.join(string_list) | |
25 | 26 |
26 def load_gff_in_dict(gff_input_file): | |
27 ''' | |
28 Reads the gff3 file and return a dictionary of dictionaries | |
29 with keys equal to standard gff3 fields (9) | |
30 Note that the key of the primary dictionary is the ID | |
31 ''' | |
32 gff_dict = {} | 27 gff_dict = {} |
28 # see https://github.com/ARTbio/tools-artbio/issues/246 | |
29 # currently fixed by perl pretreatment or the gff3 file | |
33 for line in open(gff_input_file, "r"): | 30 for line in open(gff_input_file, "r"): |
34 if line[0] == "#": | 31 if line[0] == "#": |
35 continue | 32 continue |
36 gff_fields = line[:-1].split("\t") | 33 gff_fields = line[:-1].split("\t") |
37 ID = gff_fields[8].split("ID=")[1].split(";")[0] | 34 ID = gff_fields[8].split("ID=")[1].split(";")[0] |
38 gff_dict[ID] = {} | 35 if gff_fields[2] == "miRNA_primary_transcript": |
39 gff_dict[ID]["seqid"] = gff_fields[0] | 36 gff_dict[ID] = {} |
40 gff_dict[ID]["source"] = gff_fields[1] | 37 gff_dict[ID]["premir_name"] = gff_fields[8].split( |
41 gff_dict[ID]["type"] = gff_fields[2] | 38 "Name=")[1].split(";")[0] |
42 gff_dict[ID]["start"] = gff_fields[3] | 39 gff_dict[ID]["primary"] = line[:-1] |
43 gff_dict[ID]["end"] = gff_fields[4] | 40 gff_dict[ID]["miRNAs"] = [] |
44 gff_dict[ID]["score"] = gff_fields[5] | 41 elif gff_fields[2] == "miRNA": |
45 gff_dict[ID]["strand"] = gff_fields[6] | 42 parent_ID = gff_fields[8].split("erives_from=")[1] |
46 gff_dict[ID]["phase"] = gff_fields[7] | 43 gff_dict[parent_ID]["miRNAs"].append(line[:-1]) |
47 gff_dict[ID]["attributes"] = gff_fields[8] | 44 # Now reorganise features and recalculate coordinates of premirs and mirs |
48 if "erives_from" in gff_dict[ID]["attributes"]: | 45 gff_list = [] |
49 parent_primary_transcript = gff_dict[ID]["attributes"].split( | 46 for ID in sorted(gff_dict, key=lambda x: (gff_dict[x]['premir_name'])): |
50 "erives_from=")[1] | 47 # delete premir and their mir with ID containing "_" |
51 parent_primary_transcript = gff_dict[parent_primary_transcript][ | 48 if "_" in ID: |
52 "attributes"].split("Name=")[1] | 49 del gff_dict[ID] |
53 gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( | |
54 gff_dict[ID]["attributes"], parent_primary_transcript) | |
55 return gff_dict | |
56 | |
57 | |
58 def genome_to_mir_gff(gff_dict, output, header): | |
59 ''' | |
60 Converts seqid field from chromosome to item Name | |
61 Then converts coordinates relative to "miRNA_primary_transcript" | |
62 Note that GFF files are 1-based coordinates | |
63 ''' | |
64 for key in gff_dict: | |
65 name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] | |
66 gff_dict[key]["seqid"] = name | |
67 if "erives_from=" in gff_dict[key]["attributes"]: | |
68 parent_ID = gff_dict[key]["attributes"].split( | |
69 "erives_from=")[1].split(";")[0] | |
70 gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( | |
71 gff_dict[parent_ID]["start"])+1) | |
72 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( | |
73 gff_dict[parent_ID]["start"])+1) | |
74 hairpins = {} | |
75 matures = {} | |
76 # treats miRNA_primary_transcript coordinates | |
77 # in a second loop to avoid errors in conversion | |
78 for key in gff_dict: | |
79 if gff_dict[key]["type"] == "miRNA_primary_transcript": | |
80 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( | |
81 gff_dict[key]["start"]) + 1) | |
82 gff_dict[key]["start"] = '1' | |
83 # now, do a dict[ID]=Name but only for miRNA_primary_transcript | |
84 hairpins[key] = gff_dict[key]["attributes"].split( | |
85 "Name=")[1].split( | |
86 ";")[0] | |
87 else: | 50 else: |
88 matures[key] = gff_dict[key]["attributes"].split( | 51 primary_fields = gff_dict[ID]["primary"].split('\t') |
89 "Name=")[1].split( | 52 seqid = primary_fields[8].split("Name=")[1].split(";")[0] |
90 ";")[0] | 53 source = primary_fields[1] |
54 type = primary_fields[2] | |
55 start = primary_fields[3] | |
56 newstart = "1" | |
57 end = primary_fields[4] | |
58 newend = str(int(end)-int(start)+1) | |
59 score = primary_fields[5] | |
60 strand = primary_fields[6] | |
61 phase = primary_fields[7] | |
62 attributes = primary_fields[8] | |
63 gff_list.append('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (seqid, | |
64 source, type, newstart, newend, score, strand, | |
65 phase, attributes)) | |
66 # ensure their is only 2 child miRNAs at best | |
67 if len(gff_dict[ID]["miRNAs"]) > 2: | |
68 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][:2] | |
69 # sort child miRNAs 5p first 3p second | |
70 if gff_dict[ID]["miRNAs"][0].find('5p') == -1: | |
71 gff_dict[ID]["miRNAs"] = gff_dict[ID]["miRNAs"][::-1] | |
72 for mir in gff_dict[ID]["miRNAs"]: | |
73 mir_fields = mir.split('\t') | |
74 mir_seqid = mir_fields[8].split("Name=")[1].split(";")[0] | |
75 mir_source = mir_fields[1] | |
76 mir_type = mir_fields[2] | |
77 mir_start = mir_fields[3] | |
78 mir_end = mir_fields[4] | |
79 new_mir_start = str(int(mir_start)-int(start)+1) | |
80 new_mir_end = str(int(mir_end)-int(start)+1) | |
81 mir_score = mir_fields[5] | |
82 mir_strand = mir_fields[6] | |
83 mir_phase = mir_fields[7] | |
84 mir_attributes = mir_fields[8] | |
85 mir_sfx = ";Parent_mir_Name=%s" % gff_dict[ID]["premir_name"] | |
86 gff_list.append( | |
87 '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s' % ( | |
88 mir_seqid, mir_source, mir_type, | |
89 new_mir_start, new_mir_end, mir_score, | |
90 mir_strand, mir_phase, mir_attributes, | |
91 mir_sfx)) | |
91 with open(output, "w") as output: | 92 with open(output, "w") as output: |
92 output.write(header) | 93 output.write('%s' % get_gff_header(gff_input_file)) |
93 for ID in sorted(hairpins, key=hairpins.get): | 94 output.write('\n'.join(gff_list)) |
94 output.write("\t".join([gff_dict[ID]["seqid"], | 95 output.write('\n') |
95 gff_dict[ID]["source"], gff_dict[ID]["type"], | |
96 gff_dict[ID]["start"], gff_dict[ID]["end"], | |
97 gff_dict[ID]["score"], gff_dict[ID]["strand"], | |
98 gff_dict[ID]["phase"], gff_dict[ID]["attributes"]])) | |
99 output.write("\n") | |
100 for id in sorted(matures, key=matures.get, reverse=True): | |
101 if ID in gff_dict[id]["attributes"]: | |
102 output.write("\t".join([gff_dict[id]["seqid"], | |
103 gff_dict[id]["source"], gff_dict[id]["type"], | |
104 gff_dict[id]["start"], gff_dict[id]["end"], | |
105 gff_dict[id]["score"], | |
106 gff_dict[id]["strand"], | |
107 gff_dict[id]["phase"], | |
108 gff_dict[id]["attributes"]])) | |
109 output.write("\n") | |
110 | 96 |
111 | 97 |
112 def main(infile, outfile): | 98 def main(infile, outfile): |
113 gff_dict = load_gff_in_dict(infile) | 99 convert_and_print_gff(infile, outfile) |
114 genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile)) | |
115 | 100 |
116 | 101 |
117 if __name__ == "__main__": | 102 if __name__ == "__main__": |
118 args = Parser() | 103 args = Parser() |
119 main(args.input, args.output) | 104 main(args.input, args.output) |