comparison mature_mir_gff_translation.py @ 5:9ea96a02c416 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 04980585c257ab5f8eb5d10de007316c47c5d1ce
author artbio
date Tue, 05 Sep 2017 06:33:16 -0400
parents 6b8adacd4750
children 3f62272192f9
comparison
equal deleted inserted replaced
4:da1aa7de2b19 5:9ea96a02c416
1 #!/usr/bin/env python 1 import argparse
2 2
3 import argparse 3 from datetime import datetime
4 4
5 5
6 def Parser(): 6 def Parser():
7 the_parser = argparse.ArgumentParser() 7 the_parser = argparse.ArgumentParser()
8 the_parser.add_argument( 8 the_parser.add_argument(
12 help="output GFF3 file with converted mature mir coordinates") 12 help="output GFF3 file with converted mature mir coordinates")
13 args = the_parser.parse_args() 13 args = the_parser.parse_args()
14 return args 14 return args
15 15
16 16
17 GFF3_header = '''##gff-version 3 17 def get_gff_header(gff_input_file):
18 ##generated by mature_mir_gff_translation.py 18 string_list = []
19 # 19 for line in open(gff_input_file, "r"):
20 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** 20 if line[0] == '#':
21 # microRNAs: miRBase current_version 21 string_list.append(line)
22 # genome-build-id: check http://mirbase.org/ 22 string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' %
23 # 23 str(datetime.now()))
24 # Hairpin precursor sequences have type "miRNA_primary_transcript". 24 return ''.join(string_list)
25 # Note, these sequences do not represent the full primary transcript,
26 # rather a predicted stem-loop portion that includes the precursor
27 # miRNA. Mature sequences have type "miRNA".
28 #
29 '''
30 25
31 26
32 def load_gff_in_dict(gff_input_file): 27 def load_gff_in_dict(gff_input_file):
33 ''' 28 '''
34 Reads the gff3 file and return a dictionary of dictionaries 29 Reads the gff3 file and return a dictionary of dictionaries
49 gff_dict[ID]["end"] = gff_fields[4] 44 gff_dict[ID]["end"] = gff_fields[4]
50 gff_dict[ID]["score"] = gff_fields[5] 45 gff_dict[ID]["score"] = gff_fields[5]
51 gff_dict[ID]["strand"] = gff_fields[6] 46 gff_dict[ID]["strand"] = gff_fields[6]
52 gff_dict[ID]["phase"] = gff_fields[7] 47 gff_dict[ID]["phase"] = gff_fields[7]
53 gff_dict[ID]["attributes"] = gff_fields[8] 48 gff_dict[ID]["attributes"] = gff_fields[8]
54 if "Derives_from" in gff_dict[ID]["attributes"]: 49 if "erives_from" in gff_dict[ID]["attributes"]:
55 parent_primary_transcript = gff_dict[ID]["attributes"].split( 50 parent_primary_transcript = gff_dict[ID]["attributes"].split(
56 "Derives_from=")[1] 51 "erives_from=")[1]
57 parent_primary_transcript = gff_dict[parent_primary_transcript][ 52 parent_primary_transcript = gff_dict[parent_primary_transcript][
58 "attributes"].split("Name=")[1] 53 "attributes"].split("Name=")[1]
59 gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( 54 gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % (
60 gff_dict[ID]["attributes"], parent_primary_transcript) 55 gff_dict[ID]["attributes"], parent_primary_transcript)
61 return gff_dict 56 return gff_dict
62 57
63 58
64 def genome_to_mir_gff(gff_dict, output): 59 def genome_to_mir_gff(gff_dict, output, header):
65 ''' 60 '''
66 Converts seqid field from chromosome to item Name 61 Converts seqid field from chromosome to item Name
67 Then converts coordinates relative to "miRNA_primary_transcript" 62 Then converts coordinates relative to "miRNA_primary_transcript"
68 Note that GFF files are 1-based coordinates 63 Note that GFF files are 1-based coordinates
69 ''' 64 '''
70 for key in gff_dict: 65 for key in gff_dict:
71 name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] 66 name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
72 gff_dict[key]["seqid"] = name 67 gff_dict[key]["seqid"] = name
73 if "Derives_from=" in gff_dict[key]["attributes"]: 68 if "erives_from=" in gff_dict[key]["attributes"]:
74 parent_ID = gff_dict[key]["attributes"].split( 69 parent_ID = gff_dict[key]["attributes"].split(
75 "Derives_from=")[1].split(";")[0] 70 "erives_from=")[1].split(";")[0]
76 gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( 71 gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int(
77 gff_dict[parent_ID]["start"])+1) 72 gff_dict[parent_ID]["start"])+1)
78 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( 73 gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
79 gff_dict[parent_ID]["start"])+1) 74 gff_dict[parent_ID]["start"])+1)
80 hairpins = {} 75 hairpins = {}
93 else: 88 else:
94 matures[key] = gff_dict[key]["attributes"].split( 89 matures[key] = gff_dict[key]["attributes"].split(
95 "Name=")[1].split( 90 "Name=")[1].split(
96 ";")[0] 91 ";")[0]
97 with open(output, "w") as output: 92 with open(output, "w") as output:
98 output.write(GFF3_header) 93 output.write(header)
99 for ID in sorted(hairpins, key=hairpins.get): 94 for ID in sorted(hairpins, key=hairpins.get):
100 output.write("\t".join([gff_dict[ID]["seqid"], 95 output.write("\t".join([gff_dict[ID]["seqid"],
101 gff_dict[ID]["source"], gff_dict[ID]["type"], 96 gff_dict[ID]["source"], gff_dict[ID]["type"],
102 gff_dict[ID]["start"], gff_dict[ID]["end"], 97 gff_dict[ID]["start"], gff_dict[ID]["end"],
103 gff_dict[ID]["score"], gff_dict[ID]["strand"], 98 gff_dict[ID]["score"], gff_dict[ID]["strand"],
115 output.write("\n") 110 output.write("\n")
116 111
117 112
118 def main(infile, outfile): 113 def main(infile, outfile):
119 gff_dict = load_gff_in_dict(infile) 114 gff_dict = load_gff_in_dict(infile)
120 genome_to_mir_gff(gff_dict, outfile) 115 genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile))
121 116
122 117
123 if __name__ == "__main__": 118 if __name__ == "__main__":
124 args = Parser() 119 args = Parser()
125 main(args.input, args.output) 120 main(args.input, args.output)