#!/usr/bin/env python
"""
GFF feature mapping program, to find the relation between different features described in a given GFF file.

Usage:
python gff_fmap.py in.gff > out.txt

Courtesy: Brad Chapman
Few functions are inherited from the bcbio-GFFParser program.
"""
|
|
import collections
import re
import sys
import urllib
import urllib.parse

from helper import open_file
|
|
17
|
|
18 def _gff_line_map(line):
|
|
19 """Parses a line of GFF into a dictionary.
|
|
20 Given an input line from a GFF file, this:
|
|
21 - breaks it into component elements
|
|
22 - determines the type of attribute (flat, parent, child or annotation)
|
|
23 - generates a dictionary of GFF info
|
|
24 """
|
|
25 gff3_kw_pat = re.compile("\w+=")
|
|
26 def _split_keyvals(keyval_str):
|
|
27 """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
|
|
28 GFF3 has key value pairs like:
|
|
29 count=9;gene=amx-2;sequence=SAGE:aacggagccg
|
|
30 GFF2 and GTF have:
|
|
31 Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
|
|
32 name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
|
|
33 """
|
|
34 quals = collections.defaultdict(list)
|
|
35 if keyval_str is None:
|
|
36 return quals
|
|
37 # ensembl GTF has a stray semi-colon at the end
|
|
38 if keyval_str[-1] == ';':
|
|
39 keyval_str = keyval_str[:-1]
|
|
40 # GFF2/GTF has a semi-colon with at least one space after it.
|
|
41 # It can have spaces on both sides; wormbase does this.
|
|
42 # GFF3 works with no spaces.
|
|
43 # Split at the first one we can recognize as working
|
|
44 parts = keyval_str.split(" ; ")
|
|
45 if len(parts) == 1:
|
|
46 parts = keyval_str.split("; ")
|
|
47 if len(parts) == 1:
|
|
48 parts = keyval_str.split(";")
|
|
49 # check if we have GFF3 style key-vals (with =)
|
|
50 is_gff2 = True
|
|
51 if gff3_kw_pat.match(parts[0]):
|
|
52 is_gff2 = False
|
|
53 key_vals = [p.split('=') for p in parts]
|
|
54 # otherwise, we are separated by a space with a key as the first item
|
|
55 else:
|
|
56 pieces = []
|
|
57 for p in parts:
|
|
58 # fix misplaced semi-colons in keys in some GFF2 files
|
|
59 if p and p[0] == ';':
|
|
60 p = p[1:]
|
|
61 pieces.append(p.strip().split(" "))
|
|
62 key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
|
|
63 for key, val in key_vals:
|
|
64 # remove quotes in GFF2 files
|
|
65 if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
|
|
66 val = val[1:-1]
|
|
67 if val:
|
|
68 quals[key].extend(val.split(','))
|
|
69 # if we don't have a value, make this a key=True/False style
|
|
70 # attribute
|
|
71 else:
|
|
72 quals[key].append('true')
|
|
73 for key, vals in quals.items():
|
|
74 quals[key] = [urllib.unquote(v) for v in vals]
|
|
75 return quals, is_gff2
|
|
76
|
|
77 def _nest_gff2_features(gff_parts):
|
|
78 """Provide nesting of GFF2 transcript parts with transcript IDs.
|
|
79
|
|
80 exons and coding sequences are mapped to a parent with a transcript_id
|
|
81 in GFF2. This is implemented differently at different genome centers
|
|
82 and this function attempts to resolve that and map things to the GFF3
|
|
83 way of doing them.
|
|
84 """
|
|
85 # map protein or transcript ids to a parent
|
|
86 for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
|
|
87 try:
|
|
88 gff_parts["quals"]["Parent"] = \
|
|
89 gff_parts["quals"][transcript_id]
|
|
90 break
|
|
91 except KeyError:
|
|
92 pass
|
|
93 # case for WormBase GFF -- everything labelled as Transcript or CDS
|
|
94 for flat_name in ["Transcript", "CDS"]:
|
|
95 if gff_parts["quals"].has_key(flat_name):
|
|
96 # parent types
|
|
97 if gff_parts["type"] in [flat_name]:
|
|
98 if not gff_parts["id"]:
|
|
99 gff_parts["id"] = gff_parts["quals"][flat_name][0]
|
|
100 gff_parts["quals"]["ID"] = [gff_parts["id"]]
|
|
101 # children types
|
|
102 elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
|
|
103 "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
|
|
104 "start_codon"]:
|
|
105 gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
|
|
106 break
|
|
107
|
|
108 return gff_parts
|
|
109
|
|
110 line = line.strip()
|
|
111 if line == '':return [('directive', line)] # sometimes the blank lines will be there
|
|
112 if line[0] == '>':return [('directive', '')] # sometimes it will be a FATSA header
|
|
113 if line[0] == "#":
|
|
114 return [('directive', line[2:])]
|
|
115 elif line:
|
|
116 parts = line.split('\t')
|
|
117 if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF files with FASTA sequence together
|
|
118 assert len(parts) == 9, line
|
|
119 gff_parts = [(None if p == '.' else p) for p in parts]
|
|
120 gff_info = dict()
|
|
121
|
|
122 # collect all of the base qualifiers for this item
|
|
123 quals, is_gff2 = _split_keyvals(gff_parts[8])
|
|
124
|
|
125 gff_info["is_gff2"] = is_gff2
|
|
126
|
|
127 if gff_parts[1]:quals["source"].append(gff_parts[1])
|
|
128 gff_info['quals'] = dict(quals)
|
|
129
|
|
130 # if we are describing a location, then we are a feature
|
|
131 if gff_parts[3] and gff_parts[4]:
|
|
132 gff_info['type'] = gff_parts[2]
|
|
133 gff_info['id'] = quals.get('ID', [''])[0]
|
|
134
|
|
135 if is_gff2:gff_info = _nest_gff2_features(gff_info)
|
|
136 # features that have parents need to link so we can pick up
|
|
137 # the relationship
|
|
138 if gff_info['quals'].has_key('Parent'):
|
|
139 final_key = 'child'
|
|
140 elif gff_info['id']:
|
|
141 final_key = 'parent'
|
|
142 # Handle flat features
|
|
143 else:
|
|
144 final_key = 'feature'
|
|
145 # otherwise, associate these annotations with the full record
|
|
146 else:
|
|
147 final_key = 'annotation'
|
|
148 return [(final_key, gff_info)]
|
|
149
|
|
def parent_child_id_map(gff_handle):
    """
    Provide a mapping of parent to child relationships in the file.

    Gives a dictionary of parent child relationships:

    keys -- tuple of (source, type) for each parent
    values -- tuple of (source, type) as children of that parent

    gff_handle -- open, iterable handle to a GFF file; it is closed here
    once all lines have been consumed.  The relationship table is printed
    to stdout.
    """
    # collect all of the parent and child types mapped to IDs
    parent_sts = dict()
    child_sts = collections.defaultdict(list)
    for line in gff_handle:
        line_type, line_info = _gff_line_map(line)[0]
        # a 'child' with its own id can itself be a parent of deeper features
    	# (kept as in the original logic)
        if line_type == 'parent' or (line_type == 'child' and line_info['id']):
            parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
        if line_type == 'child':
            for parent_id in line_info['quals']['Parent']:
                child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
    gff_handle.close()
    # generate a dictionary of the unique final type relationships
    pc_map = collections.defaultdict(list)
    for parent_id, parent_type in parent_sts.items():
        for child_type in child_sts[parent_id]:
            pc_map[parent_type].append(child_type)
    pc_final_map = dict()
    for ptype, ctypes in pc_map.items():
        # de-duplicate and order the child types deterministically
        pc_final_map[ptype] = sorted(set(ctypes))
    # some cases the GFF file represents a single feature type
    if not pc_final_map:
        for fid, stypes in parent_sts.items():
            pc_final_map[stypes] = dict()
    # generate a report on feature id mapping in the file
    print('+---------------------+---------------------------------+')
    print('| Parent feature type | Associated child feature type(s)|')
    print('+---------------------+---------------------------------+')
    for key, value in pc_final_map.items():
        print(key[0], key[1])
        for child_to in value:
            print('\t\t\t|-', child_to[0], child_to[1])
    print('+---------------------+---------------------------------+')
|
|
192
|
|
193
|
|
if __name__ == '__main__':

    try:
        gff_file = sys.argv[1]
    except IndexError:
        # no GFF file argument supplied; show the usage string and bail
        print(__doc__)
        sys.exit(-1)

    gff_handle = open_file(gff_file)
    parent_child_id_map(gff_handle)
|