annotate glimmerHMM/BCBio/GFF/GFFOutput.py @ 0:0a15677c6668 default tip

Uploaded
author bjoern-gruening
date Wed, 11 Jan 2012 09:58:35 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
1 """Output Biopython SeqRecords and SeqFeatures to GFF3 format.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
2
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
3 The target format is GFF3, the current GFF standard:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
4 http://www.sequenceontology.org/gff3.shtml
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
5 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
6 import urllib
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
7
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
8 class _IdHandler:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
9 """Generate IDs for GFF3 Parent/Child relationships where they don't exist.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
10 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
11 def __init__(self):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
12 self._prefix = "biopygen"
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
13 self._counter = 1
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
14 self._seen_ids = []
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
15
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
16 def _generate_id(self, quals):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
17 """Generate a unique ID not present in our existing IDs.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
18 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
19 gen_id = self._get_standard_id(quals)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
20 if gen_id is None:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
21 while 1:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
22 gen_id = "%s%s" % (self._prefix, self._counter)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
23 if gen_id not in self._seen_ids:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
24 break
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
25 self._counter += 1
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
26 return gen_id
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
27
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
28 def _get_standard_id(self, quals):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
29 """Retrieve standardized IDs from other sources like NCBI GenBank.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
30
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
31 This tries to find IDs from known key/values when stored differently
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
32 than GFF3 specifications.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
33 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
34 possible_keys = ["transcript_id", "protein_id"]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
35 for test_key in possible_keys:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
36 if quals.has_key(test_key):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
37 cur_id = quals[test_key]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
38 if isinstance(cur_id, tuple) or isinstance(cur_id, list):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
39 return cur_id[0]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
40 else:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
41 return cur_id
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
42 return None
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
43
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
44 def update_quals(self, quals, has_children):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
45 """Update a set of qualifiers, adding an ID if necessary.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
46 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
47 cur_id = quals.get("ID", None)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
48 # if we have an ID, record it
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
49 if cur_id:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
50 if not isinstance(cur_id, list) and not isinstance(cur_id, tuple):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
51 cur_id = [cur_id]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
52 for add_id in cur_id:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
53 self._seen_ids.append(add_id)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
54 # if we need one and don't have it, create a new one
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
55 elif has_children:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
56 new_id = self._generate_id(quals)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
57 self._seen_ids.append(new_id)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
58 quals["ID"] = [new_id]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
59 return quals
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
60
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
61 class GFF3Writer:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
62 """Write GFF3 files starting with standard Biopython objects.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
63 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
64 def __init__(self):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
65 pass
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
66
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
67 def write(self, recs, out_handle):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
68 """Write the provided records to the given handle in GFF3 format.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
69 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
70 id_handler = _IdHandler()
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
71 self._write_header(out_handle)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
72 for rec in recs:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
73 self._write_rec(rec, out_handle)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
74 self._write_annotations(rec.annotations, rec.id, out_handle)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
75 for sf in rec.features:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
76 sf = self._clean_feature(sf)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
77 id_handler = self._write_feature(sf, rec.id, out_handle,
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
78 id_handler)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
79
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
80 def _clean_feature(self, feature):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
81 quals = {}
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
82 for key, val in feature.qualifiers.items():
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
83 if not isinstance(val, (list, tuple)):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
84 val = [val]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
85 val = [str(x) for x in val]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
86 quals[key] = val
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
87 feature.qualifiers = quals
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
88 clean_sub = [self._clean_feature(f) for f in feature.sub_features]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
89 feature.sub_features = clean_sub
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
90 return feature
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
91
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
92 def _write_rec(self, rec, out_handle):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
93 # if we have a SeqRecord, write out optional directive
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
94 if len(rec.seq) > 0:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
95 out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
96
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
97 def _write_feature(self, feature, rec_id, out_handle, id_handler,
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
98 parent_id=None):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
99 """Write a feature with location information.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
100 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
101 if feature.strand == 1:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
102 strand = '+'
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
103 elif feature.strand == -1:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
104 strand = '-'
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
105 else:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
106 strand = '.'
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
107 # remove any standard features from the qualifiers
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
108 quals = feature.qualifiers.copy()
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
109 for std_qual in ["source", "score", "phase"]:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
110 if quals.has_key(std_qual) and len(quals[std_qual]) == 1:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
111 del quals[std_qual]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
112 # add a link to a parent identifier if it exists
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
113 if parent_id:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
114 if not quals.has_key("Parent"):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
115 quals["Parent"] = []
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
116 quals["Parent"].append(parent_id)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
117 quals = id_handler.update_quals(quals, len(feature.sub_features) > 0)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
118 if feature.type:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
119 ftype = feature.type
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
120 else:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
121 ftype = "sequence_feature"
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
122 parts = [str(rec_id),
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
123 feature.qualifiers.get("source", ["feature"])[0],
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
124 ftype,
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
125 str(feature.location.nofuzzy_start + 1), # 1-based indexing
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
126 str(feature.location.nofuzzy_end),
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
127 feature.qualifiers.get("score", ["."])[0],
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
128 strand,
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
129 str(feature.qualifiers.get("phase", ["."])[0]),
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
130 self._format_keyvals(quals)]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
131 out_handle.write("\t".join(parts) + "\n")
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
132 for sub_feature in feature.sub_features:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
133 id_handler = self._write_feature(sub_feature, rec_id, out_handle,
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
134 id_handler, quals["ID"][0])
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
135 return id_handler
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
136
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
137 def _format_keyvals(self, keyvals):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
138 format_kvs = []
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
139 for key, values in keyvals.items():
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
140 key = key.strip()
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
141 format_vals = []
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
142 if not isinstance(values, list) or isinstance(values, tuple):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
143 values = [values]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
144 for val in values:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
145 val = urllib.quote(str(val).strip())
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
146 if ((key and val) and val not in format_vals):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
147 format_vals.append(val)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
148 format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
149 return ";".join(format_kvs)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
150
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
151 def _write_annotations(self, anns, rec_id, out_handle):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
152 """Add annotations which refer to an entire sequence.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
153 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
154 format_anns = self._format_keyvals(anns)
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
155 if format_anns:
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
156 parts = [rec_id, "annotation", "remark", ".", ".", ".", ".", ".",
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
157 format_anns]
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
158 out_handle.write("\t".join(parts) + "\n")
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
159
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
160 def _write_header(self, out_handle):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
161 """Write out standard header directives.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
162 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
163 out_handle.write("##gff-version 3\n")
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
164
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
165 def write(recs, out_handle):
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
166 """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
167 """
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
168 writer = GFF3Writer()
0a15677c6668 Uploaded
bjoern-gruening
parents:
diff changeset
169 return writer.write(recs, out_handle)