0
|
1 #!/usr/bin/env python
|
|
2 import sys
|
|
3 import re
|
|
4 import itertools
|
|
5 import argparse
|
|
6 import hashlib
|
|
7 import copy
|
|
8 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
|
|
9 from Bio.Blast import NCBIXML
|
|
10 from Bio.SeqFeature import SeqFeature, FeatureLocation
|
|
11 from gff3 import feature_lambda
|
|
12 from collections import OrderedDict
|
|
13 import logging
|
|
14
|
|
15 logging.basicConfig(level=logging.DEBUG)
|
|
16 log = logging.getLogger()
|
|
17
|
|
18
|
|
19 def parse_xml(blastxml, thresh):
|
|
20 """ Parses xml file to get desired info (genes, hits, etc) """
|
|
21 blast = []
|
|
22 discarded_records = 0
|
|
23 totLen = 0
|
|
24 for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1):
|
|
25 blast_gene = []
|
|
26 align_num = 0
|
|
27 for alignment in blast_record.alignments:
|
|
28 align_num += 1
|
|
29 # hit_gis = alignment.hit_id + alignment.hit_def
|
|
30 # gi_nos = [str(gi) for gi in re.findall('(?<=gi\|)\d{9,10}', hit_gis)]
|
|
31 gi_nos = str(alignment.accession)
|
|
32
|
|
33 for hsp in alignment.hsps:
|
|
34 x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start)
|
|
35 if x < thresh:
|
|
36 discarded_records += 1
|
|
37 continue
|
|
38 nice_name = blast_record.query
|
|
39
|
|
40 if " " in nice_name:
|
|
41 nice_name = nice_name[0 : nice_name.index(" ")]
|
|
42
|
|
43 blast_gene.append(
|
|
44 {
|
|
45 "gi_nos": gi_nos,
|
|
46 "sbjct_length": alignment.length,
|
|
47 "query_length": blast_record.query_length,
|
|
48 "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end),
|
|
49 "query_range": (hsp.query_start, hsp.query_end),
|
|
50 "name": nice_name,
|
|
51 "evalue": hsp.expect,
|
|
52 "identity": hsp.identities,
|
|
53 "identity_percent": x,
|
|
54 "hit_num": align_num,
|
|
55 "iter_num": iter_num,
|
|
56 "match_id": alignment.title.partition(">")[0],
|
|
57 }
|
|
58 )
|
|
59
|
|
60 blast.append(blast_gene)
|
|
61 totLen += len(blast_gene)
|
|
62 log.debug("parse_blastxml %s -> %s", totLen + discarded_records, totLen)
|
|
63 return blast
|
|
64
|
|
65
|
|
66 def filter_lone_clusters(clusters):
|
|
67 """ Removes all clusters with only one member and those with no hits """
|
|
68 filtered_clusters = {}
|
|
69 for key in clusters:
|
|
70 if len(clusters[key]) > 1 and len(key) > 0:
|
|
71 filtered_clusters[key] = clusters[key]
|
|
72 log.debug("filter_lone_clusters %s -> %s", len(clusters), len(filtered_clusters))
|
|
73 return filtered_clusters
|
|
74
|
|
75
|
|
76 def test_true(feature, **kwargs):
|
|
77 return True
|
|
78
|
|
79
|
|
80 def parse_gff(gff3):
|
|
81 """ Extracts strand and start location to be used in cluster filtering """
|
|
82 log.debug("parse_gff3")
|
|
83 gff_info = {}
|
|
84 _rec = None
|
|
85 for rec in gffParse(gff3):
|
|
86 endBase = len(rec.seq)
|
|
87
|
|
88 _rec = rec
|
|
89 _rec.annotations = {}
|
|
90 for feat in feature_lambda(rec.features, test_true, {}, subfeatures=False):
|
|
91 if feat.type == "CDS":
|
|
92 if "Name" in feat.qualifiers.keys():
|
|
93 CDSname = feat.qualifiers["Name"]
|
|
94 else:
|
|
95 CDSname = feat.qualifiers["ID"]
|
|
96 gff_info[feat.id] = {
|
|
97 "strand": feat.strand,
|
|
98 "start": feat.location.start,
|
|
99 "end": feat.location.end,
|
|
100 "loc": feat.location,
|
|
101 "feat": feat,
|
|
102 "name": CDSname,
|
|
103 }
|
|
104
|
|
105 gff_info = OrderedDict(sorted(gff_info.items(), key=lambda k: k[1]["start"]))
|
|
106 # endBase = 0
|
|
107 for i, feat_id in enumerate(gff_info):
|
|
108 gff_info[feat_id].update({"index": i})
|
|
109 if gff_info[feat_id]["loc"].end > endBase:
|
|
110 endBase = gff_info[feat_id]["loc"].end
|
|
111
|
|
112 return dict(gff_info), _rec, endBase
|
|
113
|
|
114
|
|
115 def all_same(genes_list):
|
|
116 """ Returns True if all gene names in cluster are identical """
|
|
117 return all(gene["name"] == genes_list[0]["name"] for gene in genes_list[1:])
|
|
118
|
|
119
|
|
120 def remove_duplicates(clusters):
|
|
121 """ Removes clusters with multiple members but only one gene name """
|
|
122 filtered_clusters = {}
|
|
123 for key in clusters:
|
|
124 if all_same(clusters[key]):
|
|
125 continue
|
|
126 else:
|
|
127 filtered_clusters[key] = clusters[key]
|
|
128 log.debug("remove_duplicates %s -> %s", len(clusters), len(filtered_clusters))
|
|
129 return filtered_clusters
|
|
130
|
|
131
|
|
132 class IntronFinder(object):
|
|
133 """ IntronFinder objects are lists that contain a list of hits for every gene """
|
|
134
|
|
135 def __init__(self, gff3, blastp, thresh):
|
|
136 self.blast = []
|
|
137 self.clusters = {}
|
|
138 self.gff_info = {}
|
|
139 self.length = 0
|
|
140
|
|
141 (self.gff_info, self.rec, self.length) = parse_gff(gff3)
|
|
142 self.blast = parse_xml(blastp, thresh)
|
|
143
|
|
144 def create_clusters(self):
|
|
145 """ Finds 2 or more genes with matching hits """
|
|
146 clusters = {}
|
|
147 for gene in self.blast:
|
|
148 for hit in gene:
|
|
149 if " " in hit["gi_nos"]:
|
|
150 hit["gi_nos"] = hit["gi_nos"][0 : hit["gi_nos"].index(" ")]
|
|
151
|
|
152 nameCheck = hit["gi_nos"]
|
|
153 if nameCheck == "":
|
|
154 continue
|
|
155 name = hashlib.md5((nameCheck).encode()).hexdigest()
|
|
156
|
|
157 if name in clusters:
|
|
158 if hit not in clusters[name]:
|
|
159 clusters[name].append(hit)
|
|
160 else:
|
|
161 clusters[name] = [hit]
|
|
162 log.debug("create_clusters %s -> %s", len(self.blast), len(clusters))
|
|
163 self.clusters = filter_lone_clusters(clusters)
|
|
164
|
|
165 def check_strand(self):
|
|
166 """ filters clusters for genes on the same strand """
|
|
167 filtered_clusters = {}
|
|
168 for key in self.clusters:
|
|
169 pos_strand = []
|
|
170 neg_strand = []
|
|
171 for gene in self.clusters[key]:
|
|
172 if self.gff_info[gene["name"]]["strand"] == 1:
|
|
173 pos_strand.append(gene)
|
|
174 else:
|
|
175 neg_strand.append(gene)
|
|
176 if len(pos_strand) == 0 or len(neg_strand) == 0:
|
|
177 filtered_clusters[key] = self.clusters[key]
|
|
178 else:
|
|
179 if len(pos_strand) > 1:
|
|
180 filtered_clusters[key + "_+1"] = pos_strand
|
|
181 if len(neg_strand) > 1:
|
|
182 filtered_clusters[key + "_-1"] = neg_strand
|
|
183
|
|
184 return filtered_clusters
|
|
185
|
|
186 def check_gene_gap(self, maximum=10000):
|
|
187 filtered_clusters = {}
|
|
188 for key in self.clusters:
|
|
189 hits_lists = []
|
|
190 gene_added = False
|
|
191 for gene in self.clusters[key]:
|
|
192 for hits in hits_lists:
|
|
193 for hit in hits:
|
|
194 lastStart = max(
|
|
195 self.gff_info[gene["name"]]["start"],
|
|
196 self.gff_info[hit["name"]]["start"],
|
|
197 )
|
|
198 lastEnd = max(
|
|
199 self.gff_info[gene["name"]]["end"],
|
|
200 self.gff_info[hit["name"]]["end"],
|
|
201 )
|
|
202 firstEnd = min(
|
|
203 self.gff_info[gene["name"]]["end"],
|
|
204 self.gff_info[hit["name"]]["end"],
|
|
205 )
|
|
206 firstStart = min(
|
|
207 self.gff_info[gene["name"]]["start"],
|
|
208 self.gff_info[hit["name"]]["start"],
|
|
209 )
|
|
210 if (
|
|
211 lastStart - firstEnd <= maximum
|
|
212 or self.length - lastEnd + firstStart <= maximum
|
|
213 ):
|
|
214 hits.append(gene)
|
|
215 gene_added = True
|
|
216 break
|
|
217 if not gene_added:
|
|
218 hits_lists.append([gene])
|
|
219
|
|
220 for i, hits in enumerate(hits_lists):
|
|
221 if len(hits) >= 2:
|
|
222 filtered_clusters[key + "_" + str(i)] = hits
|
|
223 # for i in filtered_clusters:
|
|
224 # print(i)
|
|
225 # print(filtered_clusters[i])
|
|
226 log.debug("check_gene_gap %s -> %s", len(self.clusters), len(filtered_clusters))
|
|
227
|
|
228 return remove_duplicates(
|
|
229 filtered_clusters
|
|
230 ) # call remove_duplicates somewhere else?
|
|
231
|
|
232 # maybe figure out how to merge with check_gene_gap?
|
|
233 # def check_seq_gap():
|
|
234
|
|
235 # also need a check for gap in sequence coverage?
|
|
236 def check_seq_overlap(self, minimum=-1):
|
|
237 filtered_clusters = {}
|
|
238 for key in self.clusters:
|
|
239 add_cluster = True
|
|
240 sbjct_ranges = []
|
|
241 query_ranges = []
|
|
242 for gene in self.clusters[key]:
|
|
243 sbjct_ranges.append(gene["sbjct_range"])
|
|
244 query_ranges.append(gene["query_range"])
|
|
245
|
|
246 combinations = list(itertools.combinations(sbjct_ranges, 2))
|
|
247
|
|
248 for pair in combinations:
|
|
249 overlap = len(
|
|
250 set(range(pair[0][0], pair[0][1]))
|
|
251 & set(range(pair[1][0], pair[1][1]))
|
|
252 )
|
|
253 minPair = pair[0]
|
|
254 maxPair = pair[1]
|
|
255
|
|
256 if minPair[0] > maxPair[0]:
|
|
257 minPair = pair[1]
|
|
258 maxPair = pair[0]
|
|
259 elif minPair[0] == maxPair[0] and minPair[1] > maxPair[1]:
|
|
260 minPair = pair[1]
|
|
261 maxPair = pair[0]
|
|
262 if overlap > 0:
|
|
263 dist1 = maxPair[0] - minPair[0]
|
|
264 else:
|
|
265 dist1 = abs(maxPair[0] - minPair[1])
|
|
266
|
|
267 if minimum < 0:
|
|
268 if overlap > (minimum * -1):
|
|
269 # print("Rejcting: Neg min but too much overlap: " + str(pair))
|
|
270 add_cluster = False
|
|
271 elif minimum == 0:
|
|
272 if overlap > 0:
|
|
273 # print("Rejcting: 0 min and overlap: " + str(pair))
|
|
274 add_cluster = False
|
|
275 elif overlap > 0:
|
|
276 # print("Rejcting: Pos min and overlap: " + str(pair))
|
|
277 add_cluster = False
|
|
278
|
|
279 if (dist1 < minimum) and (minimum >= 0):
|
|
280 # print("Rejcting: Dist failure: " + str(pair) + " D1: " + dist1)
|
|
281 add_cluster = False
|
|
282 # if add_cluster:
|
|
283 # print("Accepted: " + str(pair) + " D1: " + str(dist1) + " Ov: " + str(overlap))
|
|
284 if add_cluster:
|
|
285
|
|
286 filtered_clusters[key] = self.clusters[key]
|
|
287
|
|
288 log.debug(
|
|
289 "check_seq_overlap %s -> %s", len(self.clusters), len(filtered_clusters)
|
|
290 )
|
|
291 # print(self.clusters)
|
|
292 return filtered_clusters
|
|
293
|
|
294 def cluster_report(self):
|
|
295 condensed_report = {}
|
|
296 for key in self.clusters:
|
|
297 for gene in self.clusters[key]:
|
|
298 if gene["name"] in condensed_report:
|
|
299 condensed_report[gene["name"]].append(gene["sbjct_range"])
|
|
300 else:
|
|
301 condensed_report[gene["name"]] = [gene["sbjct_range"]]
|
|
302 return condensed_report
|
|
303
|
|
304 def cluster_report_2(self):
|
|
305 condensed_report = {}
|
|
306 for key in self.clusters:
|
|
307 gene_names = []
|
|
308 for gene in self.clusters[key]:
|
|
309 gene_names.append((gene["name"]).strip("CPT_phageK_"))
|
|
310 if ", ".join(gene_names) in condensed_report:
|
|
311 condensed_report[", ".join(gene_names)] += 1
|
|
312 else:
|
|
313 condensed_report[", ".join(gene_names)] = 1
|
|
314 return condensed_report
|
|
315
|
|
316 def cluster_report_3(self):
|
|
317 condensed_report = {}
|
|
318 for key in self.clusters:
|
|
319 gene_names = []
|
|
320 gi_nos = []
|
|
321 for i, gene in enumerate(self.clusters[key]):
|
|
322 if i == 0:
|
|
323 gi_nos = gene["gi_nos"]
|
|
324 gene_names.append((gene["name"]).strip(".p01").strip("CPT_phageK_gp"))
|
|
325 if ", ".join(gene_names) in condensed_report:
|
|
326 condensed_report[", ".join(gene_names)].append(gi_nos)
|
|
327 else:
|
|
328 condensed_report[", ".join(gene_names)] = [gi_nos]
|
|
329 return condensed_report
|
|
330
|
|
331 def output_gff3(self, clusters):
|
|
332 rec = copy.deepcopy(self.rec)
|
|
333 rec.features = []
|
|
334 for cluster_idx, cluster_id in enumerate(clusters):
|
|
335 # Get the list of genes in this cluster
|
|
336 associated_genes = set([x["name"] for x in clusters[cluster_id]])
|
|
337 # print(associated_genes)
|
|
338 # Get the gene locations
|
|
339 assoc_gene_info = {x: self.gff_info[x]["loc"] for x in associated_genes}
|
|
340 # Now we construct a gene from the children as a "standard gene model" gene.
|
|
341 # Get the minimum and maximum locations covered by all of the children genes
|
|
342 gene_min = min([min(x[1].start, x[1].end) for x in assoc_gene_info.items()])
|
|
343 gene_max = max([max(x[1].start, x[1].end) for x in assoc_gene_info.items()])
|
|
344
|
|
345 evidence_notes = []
|
|
346 for cluster_elem in clusters[cluster_id]:
|
|
347 note = "{name} had {ident}% identity to NCBI Protein ID {pretty_gi}".format(
|
|
348 pretty_gi=(cluster_elem["gi_nos"]),
|
|
349 ident=int(
|
|
350 100
|
|
351 * float(cluster_elem["identity"] - 1.00)
|
|
352 / abs(
|
|
353 cluster_elem["query_range"][1]
|
|
354 - cluster_elem["query_range"][0]
|
|
355 )
|
|
356 ),
|
|
357 **cluster_elem
|
|
358 )
|
|
359 evidence_notes.append(note)
|
|
360 if gene_max - gene_min > 0.8 * float(self.length):
|
|
361 evidence_notes.append(
|
|
362 "Intron is over 80% of the total length of the genome, possible wraparound scenario"
|
|
363 )
|
|
364 # With that we can create the top level gene
|
|
365 gene = gffSeqFeature(
|
|
366 location=FeatureLocation(gene_min, gene_max),
|
|
367 type="gene",
|
|
368 id=cluster_id,
|
|
369 qualifiers={
|
|
370 "ID": ["gp_%s" % cluster_idx],
|
|
371 "Percent_Identities": evidence_notes,
|
|
372 "Note": clusters[cluster_id][0]["match_id"],
|
|
373 },
|
|
374 )
|
|
375
|
|
376 # Below that we have an mRNA
|
|
377 mRNA = gffSeqFeature(
|
|
378 location=FeatureLocation(gene_min, gene_max),
|
|
379 type="mRNA",
|
|
380 id=cluster_id + ".mRNA",
|
|
381 qualifiers={"ID": ["gp_%s.mRNA" % cluster_idx], "note": evidence_notes},
|
|
382 )
|
|
383
|
|
384 # Now come the CDSs.
|
|
385 cdss = []
|
|
386 # We sort them just for kicks
|
|
387 for idx, gene_name in enumerate(
|
|
388 sorted(associated_genes, key=lambda x: int(self.gff_info[x]["start"]))
|
|
389 ):
|
|
390 # Copy the CDS so we don't muck up a good one
|
|
391 cds = copy.copy(self.gff_info[gene_name]["feat"])
|
|
392 # Get the associated cluster element (used in the Notes above)
|
|
393 cluster_elem = [
|
|
394 x for x in clusters[cluster_id] if x["name"] == gene_name
|
|
395 ][0]
|
|
396
|
|
397 # Calculate %identity which we'll use to score
|
|
398 score = int(
|
|
399 1000
|
|
400 * float(cluster_elem["identity"])
|
|
401 / abs(
|
|
402 cluster_elem["query_range"][1] - cluster_elem["query_range"][0]
|
|
403 )
|
|
404 )
|
|
405
|
|
406 tempLoc = FeatureLocation(
|
|
407 cds.location.start + (3 * (cluster_elem["query_range"][0] - 1)),
|
|
408 cds.location.start + (3 * (cluster_elem["query_range"][1])),
|
|
409 cds.location.strand,
|
|
410 )
|
|
411 cds.location = tempLoc
|
|
412 # Set the qualifiers appropriately
|
|
413 cds.qualifiers = {
|
|
414 "ID": ["gp_%s.CDS.%s" % (cluster_idx, idx)],
|
|
415 "score": score,
|
|
416 "Name": self.gff_info[gene_name]["name"],
|
|
417 "evalue": cluster_elem["evalue"],
|
|
418 "Identity": cluster_elem["identity_percent"] * 100,
|
|
419 #'|'.join(cluster_elem['gi_nos']) + "| title goes here."
|
|
420 }
|
|
421 # cds.location.start = cds.location.start +
|
|
422 cdss.append(cds)
|
|
423
|
|
424 # And we attach the things properly.
|
|
425 mRNA.sub_features = cdss
|
|
426 mRNA.location = FeatureLocation(mRNA.location.start, mRNA.location.end, cds.location.strand)
|
|
427 gene.sub_features = [mRNA]
|
|
428 gene.location = FeatureLocation(gene.location.start, gene.location.end, cds.location.strand)
|
|
429
|
|
430 # And append to our record
|
|
431 rec.features.append(gene)
|
|
432 return rec
|
|
433
|
|
434 def output_xml(self, clusters):
|
|
435 threeLevel = {}
|
|
436 # print((clusters.viewkeys()))
|
|
437 # print(type(enumerate(clusters)))
|
|
438 # print(type(clusters))
|
|
439 for cluster_idx, cluster_id in enumerate(clusters):
|
|
440 # print(type(cluster_id))
|
|
441 # print(type(cluster_idx))
|
|
442 # print(type(clusters[cluster_id][0]['hit_num']))
|
|
443 if not (clusters[cluster_id][0]["iter_num"] in threeLevel.keys):
|
|
444 threeLevel[clusters[cluster_id][0]["iter_num"]] = {}
|
|
445 # for cluster_idx, cluster_id in enumerate(clusters):
|
|
446 # print(type(clusters[cluster_id]))
|
|
447 # b = {clusters[cluster_id][i]: clusters[cluster_id][i+1] for i in range(0, len(clusters[cluster_id]), 2)}
|
|
448 # print(type(b))#['name']))
|
|
449 # for hspList in clusters:
|
|
450 # for x, idx in (enumerate(clusters)):#for hsp in hspList:
|
|
451 # print("In X")
|
|
452
|
|
453
|
|
454 if __name__ == "__main__":
|
|
455 parser = argparse.ArgumentParser(description="Intron detection")
|
|
456 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 gene calls")
|
|
457 parser.add_argument(
|
|
458 "blastp", type=argparse.FileType("r"), help="blast XML protein results"
|
|
459 )
|
|
460 parser.add_argument(
|
|
461 "--minimum",
|
|
462 help="Gap minimum (Default -1, set to a negative number to allow overlap)",
|
|
463 default=-1,
|
|
464 type=int,
|
|
465 )
|
|
466 parser.add_argument(
|
|
467 "--maximum",
|
|
468 help="Gap maximum in genome (Default 10000)",
|
|
469 default=10000,
|
|
470 type=int,
|
|
471 )
|
|
472 parser.add_argument(
|
|
473 "--idThresh", help="ID Percent Threshold", default=0.4, type=float
|
|
474 )
|
|
475
|
|
476 args = parser.parse_args()
|
|
477
|
|
478 threshCap = args.idThresh
|
|
479 if threshCap > 1.00:
|
|
480 threshCap = 1.00
|
|
481 if threshCap < 0:
|
|
482 threshCap = 0
|
|
483
|
|
484 # create new IntronFinder object based on user input
|
|
485 ifinder = IntronFinder(args.gff3, args.blastp, threshCap)
|
|
486 ifinder.create_clusters()
|
|
487 ifinder.clusters = ifinder.check_strand()
|
|
488 ifinder.clusters = ifinder.check_gene_gap(maximum=args.maximum)
|
|
489 ifinder.clusters = ifinder.check_seq_overlap(minimum=args.minimum)
|
|
490 # ifinder.output_xml(ifinder.clusters)
|
|
491 # for x, idx in (enumerate(ifinder.clusters)):
|
|
492 # print(ifinder.blast)
|
|
493
|
|
494 condensed_report = ifinder.cluster_report()
|
|
495 rec = ifinder.output_gff3(ifinder.clusters)
|
|
496 gffWrite([rec], sys.stdout)
|
|
497
|
|
498 # import pprint; pprint.pprint(ifinder.clusters)
|