Mercurial > repos > artbio > mircounts
comparison mature_mir_gff_translation.py @ 5:9ea96a02c416 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 04980585c257ab5f8eb5d10de007316c47c5d1ce
author | artbio |
---|---|
date | Tue, 05 Sep 2017 06:33:16 -0400 |
parents | 6b8adacd4750 |
children | 3f62272192f9 |
comparison
equal
deleted
inserted
replaced
4:da1aa7de2b19 | 5:9ea96a02c416 |
---|---|
1 #!/usr/bin/env python | 1 import argparse |
2 | 2 |
3 import argparse | 3 from datetime import datetime |
4 | 4 |
5 | 5 |
6 def Parser(): | 6 def Parser(): |
7 the_parser = argparse.ArgumentParser() | 7 the_parser = argparse.ArgumentParser() |
8 the_parser.add_argument( | 8 the_parser.add_argument( |
12 help="output GFF3 file with converted mature mir coordinates") | 12 help="output GFF3 file with converted mature mir coordinates") |
13 args = the_parser.parse_args() | 13 args = the_parser.parse_args() |
14 return args | 14 return args |
15 | 15 |
16 | 16 |
17 GFF3_header = '''##gff-version 3 | 17 def get_gff_header(gff_input_file): |
18 ##generated by mature_mir_gff_translation.py | 18 string_list = [] |
19 # | 19 for line in open(gff_input_file, "r"): |
20 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** | 20 if line[0] == '#': |
21 # microRNAs: miRBase current_version | 21 string_list.append(line) |
22 # genome-build-id: check http://mirbase.org/ | 22 string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' % |
23 # | 23 str(datetime.now())) |
24 # Hairpin precursor sequences have type "miRNA_primary_transcript". | 24 return ''.join(string_list) |
25 # Note, these sequences do not represent the full primary transcript, | |
26 # rather a predicted stem-loop portion that includes the precursor | |
27 # miRNA. Mature sequences have type "miRNA". | |
28 # | |
29 ''' | |
30 | 25 |
31 | 26 |
32 def load_gff_in_dict(gff_input_file): | 27 def load_gff_in_dict(gff_input_file): |
33 ''' | 28 ''' |
34 Reads the gff3 file and return a dictionary of dictionaries | 29 Reads the gff3 file and return a dictionary of dictionaries |
49 gff_dict[ID]["end"] = gff_fields[4] | 44 gff_dict[ID]["end"] = gff_fields[4] |
50 gff_dict[ID]["score"] = gff_fields[5] | 45 gff_dict[ID]["score"] = gff_fields[5] |
51 gff_dict[ID]["strand"] = gff_fields[6] | 46 gff_dict[ID]["strand"] = gff_fields[6] |
52 gff_dict[ID]["phase"] = gff_fields[7] | 47 gff_dict[ID]["phase"] = gff_fields[7] |
53 gff_dict[ID]["attributes"] = gff_fields[8] | 48 gff_dict[ID]["attributes"] = gff_fields[8] |
54 if "Derives_from" in gff_dict[ID]["attributes"]: | 49 if "erives_from" in gff_dict[ID]["attributes"]: |
55 parent_primary_transcript = gff_dict[ID]["attributes"].split( | 50 parent_primary_transcript = gff_dict[ID]["attributes"].split( |
56 "Derives_from=")[1] | 51 "erives_from=")[1] |
57 parent_primary_transcript = gff_dict[parent_primary_transcript][ | 52 parent_primary_transcript = gff_dict[parent_primary_transcript][ |
58 "attributes"].split("Name=")[1] | 53 "attributes"].split("Name=")[1] |
59 gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( | 54 gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( |
60 gff_dict[ID]["attributes"], parent_primary_transcript) | 55 gff_dict[ID]["attributes"], parent_primary_transcript) |
61 return gff_dict | 56 return gff_dict |
62 | 57 |
63 | 58 |
64 def genome_to_mir_gff(gff_dict, output): | 59 def genome_to_mir_gff(gff_dict, output, header): |
65 ''' | 60 ''' |
66 Converts seqid field from chromosome to item Name | 61 Converts seqid field from chromosome to item Name |
67 Then converts coordinates relative to "miRNA_primary_transcript" | 62 Then converts coordinates relative to "miRNA_primary_transcript" |
68 Note that GFF files are 1-based coordinates | 63 Note that GFF files are 1-based coordinates |
69 ''' | 64 ''' |
70 for key in gff_dict: | 65 for key in gff_dict: |
71 name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] | 66 name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] |
72 gff_dict[key]["seqid"] = name | 67 gff_dict[key]["seqid"] = name |
73 if "Derives_from=" in gff_dict[key]["attributes"]: | 68 if "erives_from=" in gff_dict[key]["attributes"]: |
74 parent_ID = gff_dict[key]["attributes"].split( | 69 parent_ID = gff_dict[key]["attributes"].split( |
75 "Derives_from=")[1].split(";")[0] | 70 "erives_from=")[1].split(";")[0] |
76 gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( | 71 gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( |
77 gff_dict[parent_ID]["start"])+1) | 72 gff_dict[parent_ID]["start"])+1) |
78 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( | 73 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( |
79 gff_dict[parent_ID]["start"])+1) | 74 gff_dict[parent_ID]["start"])+1) |
80 hairpins = {} | 75 hairpins = {} |
93 else: | 88 else: |
94 matures[key] = gff_dict[key]["attributes"].split( | 89 matures[key] = gff_dict[key]["attributes"].split( |
95 "Name=")[1].split( | 90 "Name=")[1].split( |
96 ";")[0] | 91 ";")[0] |
97 with open(output, "w") as output: | 92 with open(output, "w") as output: |
98 output.write(GFF3_header) | 93 output.write(header) |
99 for ID in sorted(hairpins, key=hairpins.get): | 94 for ID in sorted(hairpins, key=hairpins.get): |
100 output.write("\t".join([gff_dict[ID]["seqid"], | 95 output.write("\t".join([gff_dict[ID]["seqid"], |
101 gff_dict[ID]["source"], gff_dict[ID]["type"], | 96 gff_dict[ID]["source"], gff_dict[ID]["type"], |
102 gff_dict[ID]["start"], gff_dict[ID]["end"], | 97 gff_dict[ID]["start"], gff_dict[ID]["end"], |
103 gff_dict[ID]["score"], gff_dict[ID]["strand"], | 98 gff_dict[ID]["score"], gff_dict[ID]["strand"], |
115 output.write("\n") | 110 output.write("\n") |
116 | 111 |
117 | 112 |
118 def main(infile, outfile): | 113 def main(infile, outfile): |
119 gff_dict = load_gff_in_dict(infile) | 114 gff_dict = load_gff_in_dict(infile) |
120 genome_to_mir_gff(gff_dict, outfile) | 115 genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile)) |
121 | 116 |
122 | 117 |
123 if __name__ == "__main__": | 118 if __name__ == "__main__": |
124 args = Parser() | 119 args = Parser() |
125 main(args.input, args.output) | 120 main(args.input, args.output) |