Mercurial > repos > artbio > mircounts
comparison mature_mir_gff_translation.py @ 3:6b8adacd4750 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit fa65a844f9041a83767f5305ab360abfdf68f59f
author | artbio |
---|---|
date | Wed, 26 Jul 2017 19:15:08 -0400 |
parents | da29af78a960 |
children | 9ea96a02c416 |
comparison
equal
deleted
inserted
replaced
2:f59c643b00fc | 3:6b8adacd4750 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import sys | |
4 import argparse | 3 import argparse |
5 | 4 |
6 | 5 |
def Parser():
    '''
    Builds the command line interface of the script and returns the
    namespace produced by argparse, with the attributes "input" and
    "output" (both are None when the option is not supplied)
    '''
    cli = argparse.ArgumentParser()
    # (flag, help text) pairs; both options take a plain string value
    options = (
        ("--input", "input miRBase GFF3 file"),
        ("--output",
         "output GFF3 file with converted mature mir coordinates"),
    )
    for flag, description in options:
        cli.add_argument(flag, action="store", type=str, help=description)
    return cli.parse_args()
15 | 15 |
# Comment header written verbatim at the top of the output GFF3 file by
# genome_to_mir_gff(). It is a plain module-level string: the
# "current_version" and "check http://mirbase.org/" parts are literal
# placeholders, nothing in this block fills them in.
GFF3_header = '''##gff-version 3
##generated by mature_mir_gff_translation.py
#
# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
# microRNAs: miRBase current_version
# genome-build-id: check http://mirbase.org/
#
# Hairpin precursor sequences have type "miRNA_primary_transcript".
# Note, these sequences do not represent the full primary transcript,
# rather a predicted stem-loop portion that includes the precursor
# miRNA. Mature sequences have type "miRNA".
#
'''
30 | |
29 | 31 |
def load_gff_in_dict(gff_input_file):
    '''
    Reads the gff3 file and returns a dictionary of dictionaries
    with keys equal to standard gff3 fields (9)
    Note that the key of the primary dictionary is the ID

    Records carrying a "Derives_from=<parent ID>" attribute get
    ";Parent_mir_Name=<Name of the parent record>" appended to their
    "attributes" value. Parents are resolved once the whole file has been
    read, so a parent may be listed before or after its children.

    All field values are kept as strings, exactly as found in the file.

    Raises IndexError if a data line has fewer than 9 tab-separated
    columns or no "ID=" attribute, and KeyError if a "Derives_from"
    parent ID is absent from the file.
    '''
    field_names = ("seqid", "source", "type", "start", "end", "score",
                   "strand", "phase", "attributes")
    gff_dict = {}
    # the context manager closes the handle even if a line fails to parse
    with open(gff_input_file, "r") as gff_handle:
        for line in gff_handle:
            # strip only the line terminator: slicing off the last character
            # would truncate the final record of a file lacking a trailing
            # newline, and would leave a stray "\r" on CRLF files
            line = line.rstrip("\r\n")
            # skip blank lines as well as header / comment lines
            if not line.strip() or line[0] == "#":
                continue
            gff_fields = line.split("\t")
            ID = gff_fields[8].split("ID=")[1].split(";")[0]
            gff_dict[ID] = dict(zip(field_names, gff_fields))
    # second pass: annotate derived records with the Name of their parent
    for record in gff_dict.values():
        attributes = record["attributes"]
        if "Derives_from=" not in attributes:
            continue
        # cut each value at the next ";" so that attributes following
        # Derives_from= or Name= do not leak into the ID or the name
        parent_ID = attributes.split("Derives_from=")[1].split(";")[0]
        parent_name = gff_dict[parent_ID]["attributes"].split(
            "Name=")[1].split(";")[0]
        record["attributes"] = "%s;Parent_mir_Name=%s" % (
            attributes, parent_name)
    return gff_dict
57 | 62 |
58 | 63 |
def genome_to_mir_gff(gff_dict, output):
    '''
    Converts seqid field from chromosome to item Name
    Then converts coordinates relative to "miRNA_primary_transcript"
    Note that GFF files are 1-based coordinates

    gff_dict is the dictionary of dictionaries returned by
    load_gff_in_dict(); it is modified in place (seqid, start, end).
    output is the path of the GFF3 file to write.

    The file starts with GFF3_header. Hairpins (records of type
    "miRNA_primary_transcript") are written in ascending Name order, each
    one immediately followed by the records whose Derives_from attribute
    is exactly that hairpin ID, in descending Name order.

    Raises KeyError if a Derives_from parent ID is not in gff_dict.
    '''
    gff_columns = ("seqid", "source", "type", "start", "end", "score",
                   "strand", "phase", "attributes")

    def parent_of(record):
        # ID given by Derives_from=, or None when the attribute is absent
        attributes = record["attributes"]
        if "Derives_from=" not in attributes:
            return None
        return attributes.split("Derives_from=")[1].split(";")[0]

    # First pass: rename seqid and shift derived records. Hairpin starts
    # must still be genomic here, they are the reference of the shift.
    # NOTE(review): the shift is computed from the parent start whatever
    # the strand field says - confirm this is intended for "-" records.
    for record in gff_dict.values():
        record["seqid"] = record["attributes"].split(
            "Name=")[1].split(";")[0]
        parent_ID = parent_of(record)
        if parent_ID is not None:
            offset = int(gff_dict[parent_ID]["start"]) - 1
            record["start"] = str(int(record["start"]) - offset)
            record["end"] = str(int(record["end"]) - offset)
    # treats miRNA_primary_transcript coordinates
    # in a second loop to avoid errors in conversion
    hairpins = {}
    matures = {}
    for key, record in gff_dict.items():
        # seqid was set to the record Name in the first pass
        if record["type"] == "miRNA_primary_transcript":
            record["end"] = str(
                int(record["end"]) - int(record["start"]) + 1)
            record["start"] = '1'
            hairpins[key] = record["seqid"]
        else:
            matures[key] = record["seqid"]
    # Group derived records under their exact parent ID, once. A substring
    # test on the attributes would also attach e.g. Derives_from=MI0000001_2
    # to hairpin MI0000001; sorting once also avoids rescanning every
    # mature for every hairpin.
    children = {}
    for mature_ID in sorted(matures, key=matures.get, reverse=True):
        parent_ID = parent_of(gff_dict[mature_ID])
        if parent_ID is not None:
            children.setdefault(parent_ID, []).append(mature_ID)
    with open(output, "w") as gff_out:
        gff_out.write(GFF3_header)
        for hairpin_ID in sorted(hairpins, key=hairpins.get):
            for record_ID in [hairpin_ID] + children.get(hairpin_ID, []):
                record = gff_dict[record_ID]
                gff_out.write("\t".join(
                    [record[column] for column in gff_columns]))
                gff_out.write("\n")
97 | 116 |
98 | 117 |
99 def main(infile, outfile): | 118 def main(infile, outfile): |
100 gff_dict = load_gff_in_dict(infile) | 119 gff_dict = load_gff_in_dict(infile) |