annotate cpt_intron_detect/intron_detection.py @ 0:1a19092729be draft

Uploaded
author cpt
date Fri, 13 May 2022 05:08:54 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1a19092729be Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
1a19092729be Uploaded
cpt
parents:
diff changeset
2 import sys
1a19092729be Uploaded
cpt
parents:
diff changeset
3 import re
1a19092729be Uploaded
cpt
parents:
diff changeset
4 import itertools
1a19092729be Uploaded
cpt
parents:
diff changeset
5 import argparse
1a19092729be Uploaded
cpt
parents:
diff changeset
6 import hashlib
1a19092729be Uploaded
cpt
parents:
diff changeset
7 import copy
1a19092729be Uploaded
cpt
parents:
diff changeset
8 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
1a19092729be Uploaded
cpt
parents:
diff changeset
9 from Bio.Blast import NCBIXML
1a19092729be Uploaded
cpt
parents:
diff changeset
10 from Bio.SeqFeature import SeqFeature, FeatureLocation
1a19092729be Uploaded
cpt
parents:
diff changeset
11 from gff3 import feature_lambda
1a19092729be Uploaded
cpt
parents:
diff changeset
12 from collections import OrderedDict
1a19092729be Uploaded
cpt
parents:
diff changeset
13 import logging
1a19092729be Uploaded
cpt
parents:
diff changeset
14
1a19092729be Uploaded
cpt
parents:
diff changeset
15 logging.basicConfig(level=logging.DEBUG)
1a19092729be Uploaded
cpt
parents:
diff changeset
16 log = logging.getLogger()
1a19092729be Uploaded
cpt
parents:
diff changeset
17
1a19092729be Uploaded
cpt
parents:
diff changeset
18
1a19092729be Uploaded
cpt
parents:
diff changeset
19 def parse_xml(blastxml, thresh):
1a19092729be Uploaded
cpt
parents:
diff changeset
20 """ Parses xml file to get desired info (genes, hits, etc) """
1a19092729be Uploaded
cpt
parents:
diff changeset
21 blast = []
1a19092729be Uploaded
cpt
parents:
diff changeset
22 discarded_records = 0
1a19092729be Uploaded
cpt
parents:
diff changeset
23 totLen = 0
1a19092729be Uploaded
cpt
parents:
diff changeset
24 for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1):
1a19092729be Uploaded
cpt
parents:
diff changeset
25 blast_gene = []
1a19092729be Uploaded
cpt
parents:
diff changeset
26 align_num = 0
1a19092729be Uploaded
cpt
parents:
diff changeset
27 for alignment in blast_record.alignments:
1a19092729be Uploaded
cpt
parents:
diff changeset
28 align_num += 1
1a19092729be Uploaded
cpt
parents:
diff changeset
29 # hit_gis = alignment.hit_id + alignment.hit_def
1a19092729be Uploaded
cpt
parents:
diff changeset
30 # gi_nos = [str(gi) for gi in re.findall('(?<=gi\|)\d{9,10}', hit_gis)]
1a19092729be Uploaded
cpt
parents:
diff changeset
31 gi_nos = str(alignment.accession)
1a19092729be Uploaded
cpt
parents:
diff changeset
32
1a19092729be Uploaded
cpt
parents:
diff changeset
33 for hsp in alignment.hsps:
1a19092729be Uploaded
cpt
parents:
diff changeset
34 x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start)
1a19092729be Uploaded
cpt
parents:
diff changeset
35 if x < thresh:
1a19092729be Uploaded
cpt
parents:
diff changeset
36 discarded_records += 1
1a19092729be Uploaded
cpt
parents:
diff changeset
37 continue
1a19092729be Uploaded
cpt
parents:
diff changeset
38 nice_name = blast_record.query
1a19092729be Uploaded
cpt
parents:
diff changeset
39
1a19092729be Uploaded
cpt
parents:
diff changeset
40 if " " in nice_name:
1a19092729be Uploaded
cpt
parents:
diff changeset
41 nice_name = nice_name[0 : nice_name.index(" ")]
1a19092729be Uploaded
cpt
parents:
diff changeset
42
1a19092729be Uploaded
cpt
parents:
diff changeset
43 blast_gene.append(
1a19092729be Uploaded
cpt
parents:
diff changeset
44 {
1a19092729be Uploaded
cpt
parents:
diff changeset
45 "gi_nos": gi_nos,
1a19092729be Uploaded
cpt
parents:
diff changeset
46 "sbjct_length": alignment.length,
1a19092729be Uploaded
cpt
parents:
diff changeset
47 "query_length": blast_record.query_length,
1a19092729be Uploaded
cpt
parents:
diff changeset
48 "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end),
1a19092729be Uploaded
cpt
parents:
diff changeset
49 "query_range": (hsp.query_start, hsp.query_end),
1a19092729be Uploaded
cpt
parents:
diff changeset
50 "name": nice_name,
1a19092729be Uploaded
cpt
parents:
diff changeset
51 "evalue": hsp.expect,
1a19092729be Uploaded
cpt
parents:
diff changeset
52 "identity": hsp.identities,
1a19092729be Uploaded
cpt
parents:
diff changeset
53 "identity_percent": x,
1a19092729be Uploaded
cpt
parents:
diff changeset
54 "hit_num": align_num,
1a19092729be Uploaded
cpt
parents:
diff changeset
55 "iter_num": iter_num,
1a19092729be Uploaded
cpt
parents:
diff changeset
56 "match_id": alignment.title.partition(">")[0],
1a19092729be Uploaded
cpt
parents:
diff changeset
57 }
1a19092729be Uploaded
cpt
parents:
diff changeset
58 )
1a19092729be Uploaded
cpt
parents:
diff changeset
59
1a19092729be Uploaded
cpt
parents:
diff changeset
60 blast.append(blast_gene)
1a19092729be Uploaded
cpt
parents:
diff changeset
61 totLen += len(blast_gene)
1a19092729be Uploaded
cpt
parents:
diff changeset
62 log.debug("parse_blastxml %s -> %s", totLen + discarded_records, totLen)
1a19092729be Uploaded
cpt
parents:
diff changeset
63 return blast
1a19092729be Uploaded
cpt
parents:
diff changeset
64
1a19092729be Uploaded
cpt
parents:
diff changeset
65
1a19092729be Uploaded
cpt
parents:
diff changeset
66 def filter_lone_clusters(clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
67 """ Removes all clusters with only one member and those with no hits """
1a19092729be Uploaded
cpt
parents:
diff changeset
68 filtered_clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
69 for key in clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
70 if len(clusters[key]) > 1 and len(key) > 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
71 filtered_clusters[key] = clusters[key]
1a19092729be Uploaded
cpt
parents:
diff changeset
72 log.debug("filter_lone_clusters %s -> %s", len(clusters), len(filtered_clusters))
1a19092729be Uploaded
cpt
parents:
diff changeset
73 return filtered_clusters
1a19092729be Uploaded
cpt
parents:
diff changeset
74
1a19092729be Uploaded
cpt
parents:
diff changeset
75
1a19092729be Uploaded
cpt
parents:
diff changeset
76 def test_true(feature, **kwargs):
1a19092729be Uploaded
cpt
parents:
diff changeset
77 return True
1a19092729be Uploaded
cpt
parents:
diff changeset
78
1a19092729be Uploaded
cpt
parents:
diff changeset
79
1a19092729be Uploaded
cpt
parents:
diff changeset
80 def parse_gff(gff3):
1a19092729be Uploaded
cpt
parents:
diff changeset
81 """ Extracts strand and start location to be used in cluster filtering """
1a19092729be Uploaded
cpt
parents:
diff changeset
82 log.debug("parse_gff3")
1a19092729be Uploaded
cpt
parents:
diff changeset
83 gff_info = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
84 _rec = None
1a19092729be Uploaded
cpt
parents:
diff changeset
85 for rec in gffParse(gff3):
1a19092729be Uploaded
cpt
parents:
diff changeset
86 endBase = len(rec.seq)
1a19092729be Uploaded
cpt
parents:
diff changeset
87
1a19092729be Uploaded
cpt
parents:
diff changeset
88 _rec = rec
1a19092729be Uploaded
cpt
parents:
diff changeset
89 _rec.annotations = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
90 for feat in feature_lambda(rec.features, test_true, {}, subfeatures=False):
1a19092729be Uploaded
cpt
parents:
diff changeset
91 if feat.type == "CDS":
1a19092729be Uploaded
cpt
parents:
diff changeset
92 if "Name" in feat.qualifiers.keys():
1a19092729be Uploaded
cpt
parents:
diff changeset
93 CDSname = feat.qualifiers["Name"]
1a19092729be Uploaded
cpt
parents:
diff changeset
94 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
95 CDSname = feat.qualifiers["ID"]
1a19092729be Uploaded
cpt
parents:
diff changeset
96 gff_info[feat.id] = {
1a19092729be Uploaded
cpt
parents:
diff changeset
97 "strand": feat.strand,
1a19092729be Uploaded
cpt
parents:
diff changeset
98 "start": feat.location.start,
1a19092729be Uploaded
cpt
parents:
diff changeset
99 "end": feat.location.end,
1a19092729be Uploaded
cpt
parents:
diff changeset
100 "loc": feat.location,
1a19092729be Uploaded
cpt
parents:
diff changeset
101 "feat": feat,
1a19092729be Uploaded
cpt
parents:
diff changeset
102 "name": CDSname,
1a19092729be Uploaded
cpt
parents:
diff changeset
103 }
1a19092729be Uploaded
cpt
parents:
diff changeset
104
1a19092729be Uploaded
cpt
parents:
diff changeset
105 gff_info = OrderedDict(sorted(gff_info.items(), key=lambda k: k[1]["start"]))
1a19092729be Uploaded
cpt
parents:
diff changeset
106 # endBase = 0
1a19092729be Uploaded
cpt
parents:
diff changeset
107 for i, feat_id in enumerate(gff_info):
1a19092729be Uploaded
cpt
parents:
diff changeset
108 gff_info[feat_id].update({"index": i})
1a19092729be Uploaded
cpt
parents:
diff changeset
109 if gff_info[feat_id]["loc"].end > endBase:
1a19092729be Uploaded
cpt
parents:
diff changeset
110 endBase = gff_info[feat_id]["loc"].end
1a19092729be Uploaded
cpt
parents:
diff changeset
111
1a19092729be Uploaded
cpt
parents:
diff changeset
112 return dict(gff_info), _rec, endBase
1a19092729be Uploaded
cpt
parents:
diff changeset
113
1a19092729be Uploaded
cpt
parents:
diff changeset
114
1a19092729be Uploaded
cpt
parents:
diff changeset
115 def all_same(genes_list):
1a19092729be Uploaded
cpt
parents:
diff changeset
116 """ Returns True if all gene names in cluster are identical """
1a19092729be Uploaded
cpt
parents:
diff changeset
117 return all(gene["name"] == genes_list[0]["name"] for gene in genes_list[1:])
1a19092729be Uploaded
cpt
parents:
diff changeset
118
1a19092729be Uploaded
cpt
parents:
diff changeset
119
1a19092729be Uploaded
cpt
parents:
diff changeset
120 def remove_duplicates(clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
121 """ Removes clusters with multiple members but only one gene name """
1a19092729be Uploaded
cpt
parents:
diff changeset
122 filtered_clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
123 for key in clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
124 if all_same(clusters[key]):
1a19092729be Uploaded
cpt
parents:
diff changeset
125 continue
1a19092729be Uploaded
cpt
parents:
diff changeset
126 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
127 filtered_clusters[key] = clusters[key]
1a19092729be Uploaded
cpt
parents:
diff changeset
128 log.debug("remove_duplicates %s -> %s", len(clusters), len(filtered_clusters))
1a19092729be Uploaded
cpt
parents:
diff changeset
129 return filtered_clusters
1a19092729be Uploaded
cpt
parents:
diff changeset
130
1a19092729be Uploaded
cpt
parents:
diff changeset
131
1a19092729be Uploaded
cpt
parents:
diff changeset
132 class IntronFinder(object):
1a19092729be Uploaded
cpt
parents:
diff changeset
133 """ IntronFinder objects are lists that contain a list of hits for every gene """
1a19092729be Uploaded
cpt
parents:
diff changeset
134
1a19092729be Uploaded
cpt
parents:
diff changeset
135 def __init__(self, gff3, blastp, thresh):
1a19092729be Uploaded
cpt
parents:
diff changeset
136 self.blast = []
1a19092729be Uploaded
cpt
parents:
diff changeset
137 self.clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
138 self.gff_info = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
139 self.length = 0
1a19092729be Uploaded
cpt
parents:
diff changeset
140
1a19092729be Uploaded
cpt
parents:
diff changeset
141 (self.gff_info, self.rec, self.length) = parse_gff(gff3)
1a19092729be Uploaded
cpt
parents:
diff changeset
142 self.blast = parse_xml(blastp, thresh)
1a19092729be Uploaded
cpt
parents:
diff changeset
143
1a19092729be Uploaded
cpt
parents:
diff changeset
144 def create_clusters(self):
1a19092729be Uploaded
cpt
parents:
diff changeset
145 """ Finds 2 or more genes with matching hits """
1a19092729be Uploaded
cpt
parents:
diff changeset
146 clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
147 for gene in self.blast:
1a19092729be Uploaded
cpt
parents:
diff changeset
148 for hit in gene:
1a19092729be Uploaded
cpt
parents:
diff changeset
149 if " " in hit["gi_nos"]:
1a19092729be Uploaded
cpt
parents:
diff changeset
150 hit["gi_nos"] = hit["gi_nos"][0 : hit["gi_nos"].index(" ")]
1a19092729be Uploaded
cpt
parents:
diff changeset
151
1a19092729be Uploaded
cpt
parents:
diff changeset
152 nameCheck = hit["gi_nos"]
1a19092729be Uploaded
cpt
parents:
diff changeset
153 if nameCheck == "":
1a19092729be Uploaded
cpt
parents:
diff changeset
154 continue
1a19092729be Uploaded
cpt
parents:
diff changeset
155 name = hashlib.md5((nameCheck).encode()).hexdigest()
1a19092729be Uploaded
cpt
parents:
diff changeset
156
1a19092729be Uploaded
cpt
parents:
diff changeset
157 if name in clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
158 if hit not in clusters[name]:
1a19092729be Uploaded
cpt
parents:
diff changeset
159 clusters[name].append(hit)
1a19092729be Uploaded
cpt
parents:
diff changeset
160 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
161 clusters[name] = [hit]
1a19092729be Uploaded
cpt
parents:
diff changeset
162 log.debug("create_clusters %s -> %s", len(self.blast), len(clusters))
1a19092729be Uploaded
cpt
parents:
diff changeset
163 self.clusters = filter_lone_clusters(clusters)
1a19092729be Uploaded
cpt
parents:
diff changeset
164
1a19092729be Uploaded
cpt
parents:
diff changeset
165 def check_strand(self):
1a19092729be Uploaded
cpt
parents:
diff changeset
166 """ filters clusters for genes on the same strand """
1a19092729be Uploaded
cpt
parents:
diff changeset
167 filtered_clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
168 for key in self.clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
169 pos_strand = []
1a19092729be Uploaded
cpt
parents:
diff changeset
170 neg_strand = []
1a19092729be Uploaded
cpt
parents:
diff changeset
171 for gene in self.clusters[key]:
1a19092729be Uploaded
cpt
parents:
diff changeset
172 if self.gff_info[gene["name"]]["strand"] == 1:
1a19092729be Uploaded
cpt
parents:
diff changeset
173 pos_strand.append(gene)
1a19092729be Uploaded
cpt
parents:
diff changeset
174 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
175 neg_strand.append(gene)
1a19092729be Uploaded
cpt
parents:
diff changeset
176 if len(pos_strand) == 0 or len(neg_strand) == 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
177 filtered_clusters[key] = self.clusters[key]
1a19092729be Uploaded
cpt
parents:
diff changeset
178 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
179 if len(pos_strand) > 1:
1a19092729be Uploaded
cpt
parents:
diff changeset
180 filtered_clusters[key + "_+1"] = pos_strand
1a19092729be Uploaded
cpt
parents:
diff changeset
181 if len(neg_strand) > 1:
1a19092729be Uploaded
cpt
parents:
diff changeset
182 filtered_clusters[key + "_-1"] = neg_strand
1a19092729be Uploaded
cpt
parents:
diff changeset
183
1a19092729be Uploaded
cpt
parents:
diff changeset
184 return filtered_clusters
1a19092729be Uploaded
cpt
parents:
diff changeset
185
1a19092729be Uploaded
cpt
parents:
diff changeset
186 def check_gene_gap(self, maximum=10000):
1a19092729be Uploaded
cpt
parents:
diff changeset
187 filtered_clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
188 for key in self.clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
189 hits_lists = []
1a19092729be Uploaded
cpt
parents:
diff changeset
190 gene_added = False
1a19092729be Uploaded
cpt
parents:
diff changeset
191 for gene in self.clusters[key]:
1a19092729be Uploaded
cpt
parents:
diff changeset
192 for hits in hits_lists:
1a19092729be Uploaded
cpt
parents:
diff changeset
193 for hit in hits:
1a19092729be Uploaded
cpt
parents:
diff changeset
194 lastStart = max(
1a19092729be Uploaded
cpt
parents:
diff changeset
195 self.gff_info[gene["name"]]["start"],
1a19092729be Uploaded
cpt
parents:
diff changeset
196 self.gff_info[hit["name"]]["start"],
1a19092729be Uploaded
cpt
parents:
diff changeset
197 )
1a19092729be Uploaded
cpt
parents:
diff changeset
198 lastEnd = max(
1a19092729be Uploaded
cpt
parents:
diff changeset
199 self.gff_info[gene["name"]]["end"],
1a19092729be Uploaded
cpt
parents:
diff changeset
200 self.gff_info[hit["name"]]["end"],
1a19092729be Uploaded
cpt
parents:
diff changeset
201 )
1a19092729be Uploaded
cpt
parents:
diff changeset
202 firstEnd = min(
1a19092729be Uploaded
cpt
parents:
diff changeset
203 self.gff_info[gene["name"]]["end"],
1a19092729be Uploaded
cpt
parents:
diff changeset
204 self.gff_info[hit["name"]]["end"],
1a19092729be Uploaded
cpt
parents:
diff changeset
205 )
1a19092729be Uploaded
cpt
parents:
diff changeset
206 firstStart = min(
1a19092729be Uploaded
cpt
parents:
diff changeset
207 self.gff_info[gene["name"]]["start"],
1a19092729be Uploaded
cpt
parents:
diff changeset
208 self.gff_info[hit["name"]]["start"],
1a19092729be Uploaded
cpt
parents:
diff changeset
209 )
1a19092729be Uploaded
cpt
parents:
diff changeset
210 if (
1a19092729be Uploaded
cpt
parents:
diff changeset
211 lastStart - firstEnd <= maximum
1a19092729be Uploaded
cpt
parents:
diff changeset
212 or self.length - lastEnd + firstStart <= maximum
1a19092729be Uploaded
cpt
parents:
diff changeset
213 ):
1a19092729be Uploaded
cpt
parents:
diff changeset
214 hits.append(gene)
1a19092729be Uploaded
cpt
parents:
diff changeset
215 gene_added = True
1a19092729be Uploaded
cpt
parents:
diff changeset
216 break
1a19092729be Uploaded
cpt
parents:
diff changeset
217 if not gene_added:
1a19092729be Uploaded
cpt
parents:
diff changeset
218 hits_lists.append([gene])
1a19092729be Uploaded
cpt
parents:
diff changeset
219
1a19092729be Uploaded
cpt
parents:
diff changeset
220 for i, hits in enumerate(hits_lists):
1a19092729be Uploaded
cpt
parents:
diff changeset
221 if len(hits) >= 2:
1a19092729be Uploaded
cpt
parents:
diff changeset
222 filtered_clusters[key + "_" + str(i)] = hits
1a19092729be Uploaded
cpt
parents:
diff changeset
223 # for i in filtered_clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
224 # print(i)
1a19092729be Uploaded
cpt
parents:
diff changeset
225 # print(filtered_clusters[i])
1a19092729be Uploaded
cpt
parents:
diff changeset
226 log.debug("check_gene_gap %s -> %s", len(self.clusters), len(filtered_clusters))
1a19092729be Uploaded
cpt
parents:
diff changeset
227
1a19092729be Uploaded
cpt
parents:
diff changeset
228 return remove_duplicates(
1a19092729be Uploaded
cpt
parents:
diff changeset
229 filtered_clusters
1a19092729be Uploaded
cpt
parents:
diff changeset
230 ) # call remove_duplicates somewhere else?
1a19092729be Uploaded
cpt
parents:
diff changeset
231
1a19092729be Uploaded
cpt
parents:
diff changeset
232 # maybe figure out how to merge with check_gene_gap?
1a19092729be Uploaded
cpt
parents:
diff changeset
233 # def check_seq_gap():
1a19092729be Uploaded
cpt
parents:
diff changeset
234
1a19092729be Uploaded
cpt
parents:
diff changeset
235 # also need a check for gap in sequence coverage?
1a19092729be Uploaded
cpt
parents:
diff changeset
236 def check_seq_overlap(self, minimum=-1):
1a19092729be Uploaded
cpt
parents:
diff changeset
237 filtered_clusters = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
238 for key in self.clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
239 add_cluster = True
1a19092729be Uploaded
cpt
parents:
diff changeset
240 sbjct_ranges = []
1a19092729be Uploaded
cpt
parents:
diff changeset
241 query_ranges = []
1a19092729be Uploaded
cpt
parents:
diff changeset
242 for gene in self.clusters[key]:
1a19092729be Uploaded
cpt
parents:
diff changeset
243 sbjct_ranges.append(gene["sbjct_range"])
1a19092729be Uploaded
cpt
parents:
diff changeset
244 query_ranges.append(gene["query_range"])
1a19092729be Uploaded
cpt
parents:
diff changeset
245
1a19092729be Uploaded
cpt
parents:
diff changeset
246 combinations = list(itertools.combinations(sbjct_ranges, 2))
1a19092729be Uploaded
cpt
parents:
diff changeset
247
1a19092729be Uploaded
cpt
parents:
diff changeset
248 for pair in combinations:
1a19092729be Uploaded
cpt
parents:
diff changeset
249 overlap = len(
1a19092729be Uploaded
cpt
parents:
diff changeset
250 set(range(pair[0][0], pair[0][1]))
1a19092729be Uploaded
cpt
parents:
diff changeset
251 & set(range(pair[1][0], pair[1][1]))
1a19092729be Uploaded
cpt
parents:
diff changeset
252 )
1a19092729be Uploaded
cpt
parents:
diff changeset
253 minPair = pair[0]
1a19092729be Uploaded
cpt
parents:
diff changeset
254 maxPair = pair[1]
1a19092729be Uploaded
cpt
parents:
diff changeset
255
1a19092729be Uploaded
cpt
parents:
diff changeset
256 if minPair[0] > maxPair[0]:
1a19092729be Uploaded
cpt
parents:
diff changeset
257 minPair = pair[1]
1a19092729be Uploaded
cpt
parents:
diff changeset
258 maxPair = pair[0]
1a19092729be Uploaded
cpt
parents:
diff changeset
259 elif minPair[0] == maxPair[0] and minPair[1] > maxPair[1]:
1a19092729be Uploaded
cpt
parents:
diff changeset
260 minPair = pair[1]
1a19092729be Uploaded
cpt
parents:
diff changeset
261 maxPair = pair[0]
1a19092729be Uploaded
cpt
parents:
diff changeset
262 if overlap > 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
263 dist1 = maxPair[0] - minPair[0]
1a19092729be Uploaded
cpt
parents:
diff changeset
264 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
265 dist1 = abs(maxPair[0] - minPair[1])
1a19092729be Uploaded
cpt
parents:
diff changeset
266
1a19092729be Uploaded
cpt
parents:
diff changeset
267 if minimum < 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
268 if overlap > (minimum * -1):
1a19092729be Uploaded
cpt
parents:
diff changeset
269 # print("Rejcting: Neg min but too much overlap: " + str(pair))
1a19092729be Uploaded
cpt
parents:
diff changeset
270 add_cluster = False
1a19092729be Uploaded
cpt
parents:
diff changeset
271 elif minimum == 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
272 if overlap > 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
273 # print("Rejcting: 0 min and overlap: " + str(pair))
1a19092729be Uploaded
cpt
parents:
diff changeset
274 add_cluster = False
1a19092729be Uploaded
cpt
parents:
diff changeset
275 elif overlap > 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
276 # print("Rejcting: Pos min and overlap: " + str(pair))
1a19092729be Uploaded
cpt
parents:
diff changeset
277 add_cluster = False
1a19092729be Uploaded
cpt
parents:
diff changeset
278
1a19092729be Uploaded
cpt
parents:
diff changeset
279 if (dist1 < minimum) and (minimum >= 0):
1a19092729be Uploaded
cpt
parents:
diff changeset
280 # print("Rejcting: Dist failure: " + str(pair) + " D1: " + dist1)
1a19092729be Uploaded
cpt
parents:
diff changeset
281 add_cluster = False
1a19092729be Uploaded
cpt
parents:
diff changeset
282 # if add_cluster:
1a19092729be Uploaded
cpt
parents:
diff changeset
283 # print("Accepted: " + str(pair) + " D1: " + str(dist1) + " Ov: " + str(overlap))
1a19092729be Uploaded
cpt
parents:
diff changeset
284 if add_cluster:
1a19092729be Uploaded
cpt
parents:
diff changeset
285
1a19092729be Uploaded
cpt
parents:
diff changeset
286 filtered_clusters[key] = self.clusters[key]
1a19092729be Uploaded
cpt
parents:
diff changeset
287
1a19092729be Uploaded
cpt
parents:
diff changeset
288 log.debug(
1a19092729be Uploaded
cpt
parents:
diff changeset
289 "check_seq_overlap %s -> %s", len(self.clusters), len(filtered_clusters)
1a19092729be Uploaded
cpt
parents:
diff changeset
290 )
1a19092729be Uploaded
cpt
parents:
diff changeset
291 # print(self.clusters)
1a19092729be Uploaded
cpt
parents:
diff changeset
292 return filtered_clusters
1a19092729be Uploaded
cpt
parents:
diff changeset
293
1a19092729be Uploaded
cpt
parents:
diff changeset
294 def cluster_report(self):
1a19092729be Uploaded
cpt
parents:
diff changeset
295 condensed_report = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
296 for key in self.clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
297 for gene in self.clusters[key]:
1a19092729be Uploaded
cpt
parents:
diff changeset
298 if gene["name"] in condensed_report:
1a19092729be Uploaded
cpt
parents:
diff changeset
299 condensed_report[gene["name"]].append(gene["sbjct_range"])
1a19092729be Uploaded
cpt
parents:
diff changeset
300 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
301 condensed_report[gene["name"]] = [gene["sbjct_range"]]
1a19092729be Uploaded
cpt
parents:
diff changeset
302 return condensed_report
1a19092729be Uploaded
cpt
parents:
diff changeset
303
1a19092729be Uploaded
cpt
parents:
diff changeset
304 def cluster_report_2(self):
1a19092729be Uploaded
cpt
parents:
diff changeset
305 condensed_report = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
306 for key in self.clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
307 gene_names = []
1a19092729be Uploaded
cpt
parents:
diff changeset
308 for gene in self.clusters[key]:
1a19092729be Uploaded
cpt
parents:
diff changeset
309 gene_names.append((gene["name"]).strip("CPT_phageK_"))
1a19092729be Uploaded
cpt
parents:
diff changeset
310 if ", ".join(gene_names) in condensed_report:
1a19092729be Uploaded
cpt
parents:
diff changeset
311 condensed_report[", ".join(gene_names)] += 1
1a19092729be Uploaded
cpt
parents:
diff changeset
312 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
313 condensed_report[", ".join(gene_names)] = 1
1a19092729be Uploaded
cpt
parents:
diff changeset
314 return condensed_report
1a19092729be Uploaded
cpt
parents:
diff changeset
315
1a19092729be Uploaded
cpt
parents:
diff changeset
316 def cluster_report_3(self):
1a19092729be Uploaded
cpt
parents:
diff changeset
317 condensed_report = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
318 for key in self.clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
319 gene_names = []
1a19092729be Uploaded
cpt
parents:
diff changeset
320 gi_nos = []
1a19092729be Uploaded
cpt
parents:
diff changeset
321 for i, gene in enumerate(self.clusters[key]):
1a19092729be Uploaded
cpt
parents:
diff changeset
322 if i == 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
323 gi_nos = gene["gi_nos"]
1a19092729be Uploaded
cpt
parents:
diff changeset
324 gene_names.append((gene["name"]).strip(".p01").strip("CPT_phageK_gp"))
1a19092729be Uploaded
cpt
parents:
diff changeset
325 if ", ".join(gene_names) in condensed_report:
1a19092729be Uploaded
cpt
parents:
diff changeset
326 condensed_report[", ".join(gene_names)].append(gi_nos)
1a19092729be Uploaded
cpt
parents:
diff changeset
327 else:
1a19092729be Uploaded
cpt
parents:
diff changeset
328 condensed_report[", ".join(gene_names)] = [gi_nos]
1a19092729be Uploaded
cpt
parents:
diff changeset
329 return condensed_report
1a19092729be Uploaded
cpt
parents:
diff changeset
330
1a19092729be Uploaded
cpt
parents:
diff changeset
331 def output_gff3(self, clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
332 rec = copy.deepcopy(self.rec)
1a19092729be Uploaded
cpt
parents:
diff changeset
333 rec.features = []
1a19092729be Uploaded
cpt
parents:
diff changeset
334 for cluster_idx, cluster_id in enumerate(clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
335 # Get the list of genes in this cluster
1a19092729be Uploaded
cpt
parents:
diff changeset
336 associated_genes = set([x["name"] for x in clusters[cluster_id]])
1a19092729be Uploaded
cpt
parents:
diff changeset
337 # print(associated_genes)
1a19092729be Uploaded
cpt
parents:
diff changeset
338 # Get the gene locations
1a19092729be Uploaded
cpt
parents:
diff changeset
339 assoc_gene_info = {x: self.gff_info[x]["loc"] for x in associated_genes}
1a19092729be Uploaded
cpt
parents:
diff changeset
340 # Now we construct a gene from the children as a "standard gene model" gene.
1a19092729be Uploaded
cpt
parents:
diff changeset
341 # Get the minimum and maximum locations covered by all of the children genes
1a19092729be Uploaded
cpt
parents:
diff changeset
342 gene_min = min([min(x[1].start, x[1].end) for x in assoc_gene_info.items()])
1a19092729be Uploaded
cpt
parents:
diff changeset
343 gene_max = max([max(x[1].start, x[1].end) for x in assoc_gene_info.items()])
1a19092729be Uploaded
cpt
parents:
diff changeset
344
1a19092729be Uploaded
cpt
parents:
diff changeset
345 evidence_notes = []
1a19092729be Uploaded
cpt
parents:
diff changeset
346 for cluster_elem in clusters[cluster_id]:
1a19092729be Uploaded
cpt
parents:
diff changeset
347 note = "{name} had {ident}% identity to NCBI Protein ID {pretty_gi}".format(
1a19092729be Uploaded
cpt
parents:
diff changeset
348 pretty_gi=(cluster_elem["gi_nos"]),
1a19092729be Uploaded
cpt
parents:
diff changeset
349 ident=int(
1a19092729be Uploaded
cpt
parents:
diff changeset
350 100
1a19092729be Uploaded
cpt
parents:
diff changeset
351 * float(cluster_elem["identity"] - 1.00)
1a19092729be Uploaded
cpt
parents:
diff changeset
352 / abs(
1a19092729be Uploaded
cpt
parents:
diff changeset
353 cluster_elem["query_range"][1]
1a19092729be Uploaded
cpt
parents:
diff changeset
354 - cluster_elem["query_range"][0]
1a19092729be Uploaded
cpt
parents:
diff changeset
355 )
1a19092729be Uploaded
cpt
parents:
diff changeset
356 ),
1a19092729be Uploaded
cpt
parents:
diff changeset
357 **cluster_elem
1a19092729be Uploaded
cpt
parents:
diff changeset
358 )
1a19092729be Uploaded
cpt
parents:
diff changeset
359 evidence_notes.append(note)
1a19092729be Uploaded
cpt
parents:
diff changeset
360 if gene_max - gene_min > 0.8 * float(self.length):
1a19092729be Uploaded
cpt
parents:
diff changeset
361 evidence_notes.append(
1a19092729be Uploaded
cpt
parents:
diff changeset
362 "Intron is over 80% of the total length of the genome, possible wraparound scenario"
1a19092729be Uploaded
cpt
parents:
diff changeset
363 )
1a19092729be Uploaded
cpt
parents:
diff changeset
364 # With that we can create the top level gene
1a19092729be Uploaded
cpt
parents:
diff changeset
365 gene = gffSeqFeature(
1a19092729be Uploaded
cpt
parents:
diff changeset
366 location=FeatureLocation(gene_min, gene_max),
1a19092729be Uploaded
cpt
parents:
diff changeset
367 type="gene",
1a19092729be Uploaded
cpt
parents:
diff changeset
368 id=cluster_id,
1a19092729be Uploaded
cpt
parents:
diff changeset
369 qualifiers={
1a19092729be Uploaded
cpt
parents:
diff changeset
370 "ID": ["gp_%s" % cluster_idx],
1a19092729be Uploaded
cpt
parents:
diff changeset
371 "Percent_Identities": evidence_notes,
1a19092729be Uploaded
cpt
parents:
diff changeset
372 "Note": clusters[cluster_id][0]["match_id"],
1a19092729be Uploaded
cpt
parents:
diff changeset
373 },
1a19092729be Uploaded
cpt
parents:
diff changeset
374 )
1a19092729be Uploaded
cpt
parents:
diff changeset
375
1a19092729be Uploaded
cpt
parents:
diff changeset
376 # Below that we have an mRNA
1a19092729be Uploaded
cpt
parents:
diff changeset
377 mRNA = gffSeqFeature(
1a19092729be Uploaded
cpt
parents:
diff changeset
378 location=FeatureLocation(gene_min, gene_max),
1a19092729be Uploaded
cpt
parents:
diff changeset
379 type="mRNA",
1a19092729be Uploaded
cpt
parents:
diff changeset
380 id=cluster_id + ".mRNA",
1a19092729be Uploaded
cpt
parents:
diff changeset
381 qualifiers={"ID": ["gp_%s.mRNA" % cluster_idx], "note": evidence_notes},
1a19092729be Uploaded
cpt
parents:
diff changeset
382 )
1a19092729be Uploaded
cpt
parents:
diff changeset
383
1a19092729be Uploaded
cpt
parents:
diff changeset
384 # Now come the CDSs.
1a19092729be Uploaded
cpt
parents:
diff changeset
385 cdss = []
1a19092729be Uploaded
cpt
parents:
diff changeset
386 # We sort them just for kicks
1a19092729be Uploaded
cpt
parents:
diff changeset
387 for idx, gene_name in enumerate(
1a19092729be Uploaded
cpt
parents:
diff changeset
388 sorted(associated_genes, key=lambda x: int(self.gff_info[x]["start"]))
1a19092729be Uploaded
cpt
parents:
diff changeset
389 ):
1a19092729be Uploaded
cpt
parents:
diff changeset
390 # Copy the CDS so we don't muck up a good one
1a19092729be Uploaded
cpt
parents:
diff changeset
391 cds = copy.copy(self.gff_info[gene_name]["feat"])
1a19092729be Uploaded
cpt
parents:
diff changeset
392 # Get the associated cluster element (used in the Notes above)
1a19092729be Uploaded
cpt
parents:
diff changeset
393 cluster_elem = [
1a19092729be Uploaded
cpt
parents:
diff changeset
394 x for x in clusters[cluster_id] if x["name"] == gene_name
1a19092729be Uploaded
cpt
parents:
diff changeset
395 ][0]
1a19092729be Uploaded
cpt
parents:
diff changeset
396
1a19092729be Uploaded
cpt
parents:
diff changeset
397 # Calculate %identity which we'll use to score
1a19092729be Uploaded
cpt
parents:
diff changeset
398 score = int(
1a19092729be Uploaded
cpt
parents:
diff changeset
399 1000
1a19092729be Uploaded
cpt
parents:
diff changeset
400 * float(cluster_elem["identity"])
1a19092729be Uploaded
cpt
parents:
diff changeset
401 / abs(
1a19092729be Uploaded
cpt
parents:
diff changeset
402 cluster_elem["query_range"][1] - cluster_elem["query_range"][0]
1a19092729be Uploaded
cpt
parents:
diff changeset
403 )
1a19092729be Uploaded
cpt
parents:
diff changeset
404 )
1a19092729be Uploaded
cpt
parents:
diff changeset
405
1a19092729be Uploaded
cpt
parents:
diff changeset
406 tempLoc = FeatureLocation(
1a19092729be Uploaded
cpt
parents:
diff changeset
407 cds.location.start + (3 * (cluster_elem["query_range"][0] - 1)),
1a19092729be Uploaded
cpt
parents:
diff changeset
408 cds.location.start + (3 * (cluster_elem["query_range"][1])),
1a19092729be Uploaded
cpt
parents:
diff changeset
409 cds.location.strand,
1a19092729be Uploaded
cpt
parents:
diff changeset
410 )
1a19092729be Uploaded
cpt
parents:
diff changeset
411 cds.location = tempLoc
1a19092729be Uploaded
cpt
parents:
diff changeset
412 # Set the qualifiers appropriately
1a19092729be Uploaded
cpt
parents:
diff changeset
413 cds.qualifiers = {
1a19092729be Uploaded
cpt
parents:
diff changeset
414 "ID": ["gp_%s.CDS.%s" % (cluster_idx, idx)],
1a19092729be Uploaded
cpt
parents:
diff changeset
415 "score": score,
1a19092729be Uploaded
cpt
parents:
diff changeset
416 "Name": self.gff_info[gene_name]["name"],
1a19092729be Uploaded
cpt
parents:
diff changeset
417 "evalue": cluster_elem["evalue"],
1a19092729be Uploaded
cpt
parents:
diff changeset
418 "Identity": cluster_elem["identity_percent"] * 100,
1a19092729be Uploaded
cpt
parents:
diff changeset
419 #'|'.join(cluster_elem['gi_nos']) + "| title goes here."
1a19092729be Uploaded
cpt
parents:
diff changeset
420 }
1a19092729be Uploaded
cpt
parents:
diff changeset
421 # cds.location.start = cds.location.start +
1a19092729be Uploaded
cpt
parents:
diff changeset
422 cdss.append(cds)
1a19092729be Uploaded
cpt
parents:
diff changeset
423
1a19092729be Uploaded
cpt
parents:
diff changeset
424 # And we attach the things properly.
1a19092729be Uploaded
cpt
parents:
diff changeset
425 mRNA.sub_features = cdss
1a19092729be Uploaded
cpt
parents:
diff changeset
426 mRNA.location = FeatureLocation(mRNA.location.start, mRNA.location.end, cds.location.strand)
1a19092729be Uploaded
cpt
parents:
diff changeset
427 gene.sub_features = [mRNA]
1a19092729be Uploaded
cpt
parents:
diff changeset
428 gene.location = FeatureLocation(gene.location.start, gene.location.end, cds.location.strand)
1a19092729be Uploaded
cpt
parents:
diff changeset
429
1a19092729be Uploaded
cpt
parents:
diff changeset
430 # And append to our record
1a19092729be Uploaded
cpt
parents:
diff changeset
431 rec.features.append(gene)
1a19092729be Uploaded
cpt
parents:
diff changeset
432 return rec
1a19092729be Uploaded
cpt
parents:
diff changeset
433
1a19092729be Uploaded
cpt
parents:
diff changeset
434 def output_xml(self, clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
435 threeLevel = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
436 # print((clusters.viewkeys()))
1a19092729be Uploaded
cpt
parents:
diff changeset
437 # print(type(enumerate(clusters)))
1a19092729be Uploaded
cpt
parents:
diff changeset
438 # print(type(clusters))
1a19092729be Uploaded
cpt
parents:
diff changeset
439 for cluster_idx, cluster_id in enumerate(clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
440 # print(type(cluster_id))
1a19092729be Uploaded
cpt
parents:
diff changeset
441 # print(type(cluster_idx))
1a19092729be Uploaded
cpt
parents:
diff changeset
442 # print(type(clusters[cluster_id][0]['hit_num']))
1a19092729be Uploaded
cpt
parents:
diff changeset
443 if not (clusters[cluster_id][0]["iter_num"] in threeLevel.keys):
1a19092729be Uploaded
cpt
parents:
diff changeset
444 threeLevel[clusters[cluster_id][0]["iter_num"]] = {}
1a19092729be Uploaded
cpt
parents:
diff changeset
445 # for cluster_idx, cluster_id in enumerate(clusters):
1a19092729be Uploaded
cpt
parents:
diff changeset
446 # print(type(clusters[cluster_id]))
1a19092729be Uploaded
cpt
parents:
diff changeset
447 # b = {clusters[cluster_id][i]: clusters[cluster_id][i+1] for i in range(0, len(clusters[cluster_id]), 2)}
1a19092729be Uploaded
cpt
parents:
diff changeset
448 # print(type(b))#['name']))
1a19092729be Uploaded
cpt
parents:
diff changeset
449 # for hspList in clusters:
1a19092729be Uploaded
cpt
parents:
diff changeset
450 # for x, idx in (enumerate(clusters)):#for hsp in hspList:
1a19092729be Uploaded
cpt
parents:
diff changeset
451 # print("In X")
1a19092729be Uploaded
cpt
parents:
diff changeset
452
1a19092729be Uploaded
cpt
parents:
diff changeset
453
1a19092729be Uploaded
cpt
parents:
diff changeset
454 if __name__ == "__main__":
1a19092729be Uploaded
cpt
parents:
diff changeset
455 parser = argparse.ArgumentParser(description="Intron detection")
1a19092729be Uploaded
cpt
parents:
diff changeset
456 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 gene calls")
1a19092729be Uploaded
cpt
parents:
diff changeset
457 parser.add_argument(
1a19092729be Uploaded
cpt
parents:
diff changeset
458 "blastp", type=argparse.FileType("r"), help="blast XML protein results"
1a19092729be Uploaded
cpt
parents:
diff changeset
459 )
1a19092729be Uploaded
cpt
parents:
diff changeset
460 parser.add_argument(
1a19092729be Uploaded
cpt
parents:
diff changeset
461 "--minimum",
1a19092729be Uploaded
cpt
parents:
diff changeset
462 help="Gap minimum (Default -1, set to a negative number to allow overlap)",
1a19092729be Uploaded
cpt
parents:
diff changeset
463 default=-1,
1a19092729be Uploaded
cpt
parents:
diff changeset
464 type=int,
1a19092729be Uploaded
cpt
parents:
diff changeset
465 )
1a19092729be Uploaded
cpt
parents:
diff changeset
466 parser.add_argument(
1a19092729be Uploaded
cpt
parents:
diff changeset
467 "--maximum",
1a19092729be Uploaded
cpt
parents:
diff changeset
468 help="Gap maximum in genome (Default 10000)",
1a19092729be Uploaded
cpt
parents:
diff changeset
469 default=10000,
1a19092729be Uploaded
cpt
parents:
diff changeset
470 type=int,
1a19092729be Uploaded
cpt
parents:
diff changeset
471 )
1a19092729be Uploaded
cpt
parents:
diff changeset
472 parser.add_argument(
1a19092729be Uploaded
cpt
parents:
diff changeset
473 "--idThresh", help="ID Percent Threshold", default=0.4, type=float
1a19092729be Uploaded
cpt
parents:
diff changeset
474 )
1a19092729be Uploaded
cpt
parents:
diff changeset
475
1a19092729be Uploaded
cpt
parents:
diff changeset
476 args = parser.parse_args()
1a19092729be Uploaded
cpt
parents:
diff changeset
477
1a19092729be Uploaded
cpt
parents:
diff changeset
478 threshCap = args.idThresh
1a19092729be Uploaded
cpt
parents:
diff changeset
479 if threshCap > 1.00:
1a19092729be Uploaded
cpt
parents:
diff changeset
480 threshCap = 1.00
1a19092729be Uploaded
cpt
parents:
diff changeset
481 if threshCap < 0:
1a19092729be Uploaded
cpt
parents:
diff changeset
482 threshCap = 0
1a19092729be Uploaded
cpt
parents:
diff changeset
483
1a19092729be Uploaded
cpt
parents:
diff changeset
484 # create new IntronFinder object based on user input
1a19092729be Uploaded
cpt
parents:
diff changeset
485 ifinder = IntronFinder(args.gff3, args.blastp, threshCap)
1a19092729be Uploaded
cpt
parents:
diff changeset
486 ifinder.create_clusters()
1a19092729be Uploaded
cpt
parents:
diff changeset
487 ifinder.clusters = ifinder.check_strand()
1a19092729be Uploaded
cpt
parents:
diff changeset
488 ifinder.clusters = ifinder.check_gene_gap(maximum=args.maximum)
1a19092729be Uploaded
cpt
parents:
diff changeset
489 ifinder.clusters = ifinder.check_seq_overlap(minimum=args.minimum)
1a19092729be Uploaded
cpt
parents:
diff changeset
490 # ifinder.output_xml(ifinder.clusters)
1a19092729be Uploaded
cpt
parents:
diff changeset
491 # for x, idx in (enumerate(ifinder.clusters)):
1a19092729be Uploaded
cpt
parents:
diff changeset
492 # print(ifinder.blast)
1a19092729be Uploaded
cpt
parents:
diff changeset
493
1a19092729be Uploaded
cpt
parents:
diff changeset
494 condensed_report = ifinder.cluster_report()
1a19092729be Uploaded
cpt
parents:
diff changeset
495 rec = ifinder.output_gff3(ifinder.clusters)
1a19092729be Uploaded
cpt
parents:
diff changeset
496 gffWrite([rec], sys.stdout)
1a19092729be Uploaded
cpt
parents:
diff changeset
497
1a19092729be Uploaded
cpt
parents:
diff changeset
498 # import pprint; pprint.pprint(ifinder.clusters)