comparison fml_gff_groomer/scripts/gff_id_mapper.py @ 0:79726c328621 default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author vipints
date Tue, 07 Jun 2011 17:29:24 -0400
parents
children
1 #!/usr/bin/env python
2 #
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 3 of the License, or
6 # (at your option) any later version.
7 #
8 # Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
9 # Copyright (C) 2010 Max Planck Society
10 #
11
12 # Description: Provides feature to sub-feature identifier mapping for a given GFF3 file.
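# Example invocation (illustrative file name; it matches the usage message printed
# in the __main__ block below):
#   python gff_id_mapper.py annotation.gff3
# The script prints the parent/child feature hierarchy found in the file.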
13
14 import re, sys
15 import collections
16 import urllib
17 import time
18
19 def _gff_line_map(line):
20 """Parses a line of GFF into a dictionary.
21 Given an input line from a GFF file, this:
22 - breaks it into component elements
23 - determines the type of attribute (flat, parent, child or annotation)
24 - generates a dictionary of GFF info
25 """
26 gff3_kw_pat = re.compile(r"\w+=")
27 def _split_keyvals(keyval_str):
28 """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
29
30 GFF3 has key value pairs like:
31 count=9;gene=amx-2;sequence=SAGE:aacggagccg
32 GFF2 and GTF have:
33 Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
34 name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
35 """
36 quals = collections.defaultdict(list)
37 if keyval_str is None:
38 return quals
39 # ensembl GTF has a stray semi-colon at the end
40 if keyval_str[-1] == ';':
41 keyval_str = keyval_str[:-1]
42 # GFF2/GTF has a semi-colon with at least one space after it.
43 # It can have spaces on both sides; wormbase does this.
44 # GFF3 works with no spaces.
45 # Split at the first one we can recognize as working
46 parts = keyval_str.split(" ; ")
47 if len(parts) == 1:
48 parts = keyval_str.split("; ")
49 if len(parts) == 1:
50 parts = keyval_str.split(";")
51 # check if we have GFF3 style key-vals (with =)
52 is_gff2 = True
53 if gff3_kw_pat.match(parts[0]):
54 is_gff2 = False
55 key_vals = [p.split('=') for p in parts]
56 # otherwise, we are separated by a space with a key as the first item
57 else:
58 pieces = []
59 for p in parts:
60 # fix misplaced semi-colons in keys in some GFF2 files
61 if p and p[0] == ';':
62 p = p[1:]
63 pieces.append(p.strip().split(" "))
64 key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
65 for key, val in key_vals:
66 # remove quotes in GFF2 files
67 if (len(val) > 0 and val[0] == '"' and val[-1] == '"'):
68 val = val[1:-1]
69 if val:
70 quals[key].extend(val.split(','))
71 # if we don't have a value, make this a key=True/False style
72 # attribute
73 else:
74 quals[key].append('true')
75 for key, vals in quals.items():
76 quals[key] = [urllib.unquote(v) for v in vals]
77 return quals, is_gff2
78
79 def _nest_gff2_features(gff_parts):
80 """Provide nesting of GFF2 transcript parts with transcript IDs.
81
82 Exons and coding sequences are mapped to a parent with a transcript_id
83 in GFF2. This is implemented differently at different genome centers
84 and this function attempts to resolve that and map things to the GFF3
85 way of doing them.
86 """
87 # map protein or transcript ids to a parent
88 for transcript_id in ["transcript_id", "transcriptId", "proteinId"]:
89 try:
90 gff_parts["quals"]["Parent"] = \
91 gff_parts["quals"][transcript_id]
92 break
93 except KeyError:
94 pass
95 # case for WormBase GFF -- everything labelled as Transcript or CDS
96 for flat_name in ["Transcript", "CDS"]:
97 if gff_parts["quals"].has_key(flat_name):
98 # parent types
99 if gff_parts["type"] in [flat_name]:
100 if not gff_parts["id"]:
101 gff_parts["id"] = gff_parts["quals"][flat_name][0]
102 gff_parts["quals"]["ID"] = [gff_parts["id"]]
103 # children types
104 elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
105 "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
106 "start_codon"]:
107 gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
108 break
109
110 return gff_parts
111
112 line = line.strip()
113 if line == '':return [('directive', line)] # blank lines may be present
114 if line[0] == '>':return [('directive', '')] # it may be a FASTA header
115 if line[0] == "#":
116 return [('directive', line[2:])]
117 elif line:
118 parts = line.split('\t')
119 if len(parts) == 1 and re.search(r'\w+', parts[0]):return [('directive', '')] ## GFF file with FASTA sequence appended at the end
120 assert len(parts) == 9, line
121 gff_parts = [(None if p == '.' else p) for p in parts]
122 gff_info = dict()
123
124 # collect all of the base qualifiers for this item
125 quals, is_gff2 = _split_keyvals(gff_parts[8])
126
127 gff_info["is_gff2"] = is_gff2
128
129 if gff_parts[1]:quals["source"].append(gff_parts[1])
130 gff_info['quals'] = dict(quals)
131
132 # if we are describing a location, then we are a feature
133 if gff_parts[3] and gff_parts[4]:
134 gff_info['type'] = gff_parts[2]
135 gff_info['id'] = quals.get('ID', [''])[0]
136
137 if is_gff2:gff_info = _nest_gff2_features(gff_info)
138 # features that have parents need to link so we can pick up
139 # the relationship
140 if gff_info['quals'].has_key('Parent'):
141 final_key = 'child'
142 elif gff_info['id']:
143 final_key = 'parent'
144 # Handle flat features
145 else:
146 final_key = 'feature'
147 # otherwise, associate these annotations with the full record
148 else:
149 final_key = 'annotation'
150 return [(final_key, gff_info)]
151
152 def parent_child_id_map(gff_handle):
153 """Provide a mapping of parent to child relationships in the file.
154
155 Gives a dictionary of parent-child relationships:
156 
157 keys -- tuple of (source, type) for each parent
158 values -- list of (source, type) tuples for the children of that parent"""
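# Illustrative sketch (hypothetical identifiers and types): for a simple
# gene -> mRNA -> exon/CDS file the intermediate maps would look like
#   parent_sts = {'gene1': ('src', 'gene'), 'mRNA1': ('src', 'mRNA')}
#   child_sts  = {'gene1': [('src', 'mRNA')],
#                 'mRNA1': [('src', 'exon'), ('src', 'CDS')]}
# giving a final mapping of ('src', 'gene') -> [('src', 'mRNA')] and
# ('src', 'mRNA') -> [('src', 'CDS'), ('src', 'exon')].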
159
160 # collect all of the parent and child types mapped to IDs
161 parent_sts = dict()
162 child_sts = collections.defaultdict(list)
163
164 for line in gff_handle:
165 line_type, line_info = _gff_line_map(line)[0]
166 if (line_type == 'parent' or (line_type == 'child' and line_info['id'])):
167 parent_sts[line_info['id']] = (line_info['quals']['source'][0], line_info['type'])
168 if line_type == 'child':
169 for parent_id in line_info['quals']['Parent']:
170 child_sts[parent_id].append((line_info['quals']['source'][0], line_info['type']))
171 gff_handle.close()
172
173 # generate a dictionary of the unique final type relationships
174 pc_map = collections.defaultdict(list)
175 for parent_id, parent_type in parent_sts.items():
176 for child_type in child_sts[parent_id]:
177 pc_map[parent_type].append(child_type)
178 pc_final_map = dict()
179 for ptype, ctypes in pc_map.items():
180 unique_ctypes = list(set(ctypes))
181 unique_ctypes.sort()
182 pc_final_map[ptype] = unique_ctypes
183
184 # Check for Parent Child relations
185 level1, level2, level3, sec_level_mis = {}, {}, {}, {}
186 for etype, fchild in pc_final_map.items():
187 level2_flag = 0
188 for kp, vp in pc_final_map.items():
189 if etype in vp:level2_flag = 1; level2[etype] = 1 # check for second level features
190 if level2_flag == 0: # first level features
191 level1[etype] =1
192 for eachfch in fchild: # check that every child of a level1 feature is also defined as a level2 key
193 if eachfch not in pc_final_map: # figure out the missing level2 objects
194 if etype in sec_level_mis:
195 sec_level_mis[etype].append(eachfch)
196 else:
197 sec_level_mis[etype]=[eachfch]
198 if level2_flag == 1:level3[str(fchild)] =1 # collect third level features
199 # display the result
200 if level1==level2==level3=={} and sec_level_mis == {}:
201 print
202 print 'ONLY FIRST level feature(s):'
203 source_type = dict()
204 gff_handle = open(gff_handle.name, 'rU')
205 for line in gff_handle:
206 line = line.strip('\n\r')
207 if not line or line[0] == '#': continue # skip blank and comment lines
208 parts = line.split('\t')
209 if parts[-1] == '':parts.pop()
210 assert len(parts) == 9, line
211 source_type[(parts[1], parts[2])] = 1
212 gff_handle.close()
213 for ele in source_type:print '\t' + str(ele)
214 print
215 else:
216 print
217 print '===Report on different level features from the GFF file==='
218 print
219 print 'FIRST level feature(s):'
220 for ele in level1: print '\t' + str(ele)
221 print
222 print 'SECOND level feature(s):'
223 for ele in level2: print '\t' + str(ele)
224 print
225 print 'THIRD level feature(s):'
226 for ele in level3:print '\t' + str(ele[1:-1])
227 print
228 # describe features whose identifiers are mapped the wrong way
229 for wf, wfv in sec_level_mis.items():
230 if wf[1]=='gene':
231 print 'GFF parsing modules in publicly available packages such as Biopython and BioPerl depend heavily on feature identifier mapping.'
232 print 'Here a few features appear to be wrongly mapped to their children, which in turn causes problems when extracting annotation based on feature identifiers.\n'
233 for ehv in wfv:
234 if ehv[1] in ('exon', 'intron', 'CDS', 'three_prime_UTR', 'five_prime_UTR'):print 'Error in ID mapping: Level1 feature ' + str(wf) + ' maps to Level3 feature ' + str(ehv)
235
236 if __name__=='__main__':
237
238 stime = time.asctime( time.localtime(time.time()) )
239 print '-------------------------------------------------------'
240 print 'GFFExamine started on ' + stime
241 print '-------------------------------------------------------'
242
243 try:
244 gff_handle = open(sys.argv[1], 'rU')
245 except (IndexError, IOError):
246 sys.stderr.write("Cannot open the GFF3 file, cannot continue...\n")
247 sys.stderr.write("USAGE: gff_id_mapper.py <gff3 file>\n")
248 sys.exit(-1)
249 parent_child_id_map(gff_handle)
250
251 stime = time.asctime( time.localtime(time.time()) )
252 print '-------------------------------------------------------'
253 print 'GFFExamine finished at ' + stime
254 print '-------------------------------------------------------'