comparison cpt_phageqc_annotation/phage_annotation_validator.py @ 0:c3140b08d703 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:00:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c3140b08d703
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # vim: set fileencoding=utf-8
4 import os
5 import sys
6 import json
7 import math
8 import numpy
9 import argparse
10 import itertools
11 import logging
12 from gff3 import (
13 feature_lambda,
14 coding_genes,
15 genes,
16 get_gff3_id,
17 feature_test_location,
18 get_rbs_from,
19 nice_name,
20 )
21 from shinefind import NaiveSDCaller
22 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
23 from Bio import SeqIO
24 from Bio.SeqRecord import SeqRecord
25 from Bio.SeqFeature import SeqFeature, FeatureLocation
26 from jinja2 import Environment, FileSystemLoader
27 from cpt import MGAFinder
28
29 logging.basicConfig(level=logging.DEBUG)
30 log = logging.getLogger(name="pav")
31
32 # Path to script, required because of Galaxy.
33 SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
34 # Path to the HTML template for the report
35
36 ENCOURAGEMENT = (
37 (100, "Perfection itself!"),
38 (90, "Amazing!"),
39 (80, "Not too bad, a few minor things to fix..."),
40 (70, "Some issues to address"),
41 (
42 50,
43 """Issues detected! </p><p class="text-muted">Have you heard of the
44 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation
45 Pipeline?""",
46 ),
47 (
48 0,
49 """<b>MAJOR</b> issues detected! Please consider using the
50 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation Pipeline""",
51 ),
52 )
53
54
55 def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"):
56 kwargs = {"qualifiers": {"note": [message]}}
57 kwargs["type"] = type_src
58 kwargs["strand"] = strand
59 kwargs["phase"]=0
60 kwargs["score"]=0.0
61 kwargs["source"]="feature"
62 if id_src is not None:
63 kwargs["id"] = id_src.id
64 kwargs["qualifiers"]["ID"] = [id_src.id]
65 kwargs["qualifiers"]["Name"] = id_src.qualifiers.get("Name", [])
66
67
68 if end >= start:
69 return gffSeqFeature(FeatureLocation(start, end, strand=strand), **kwargs)
70 else:
71 return gffSeqFeature(FeatureLocation(end, start, strand=strand), **kwargs)
72
73
74 def __ensure_location_in_bounds(start=0, end=0, parent_length=0):
75 # This prevents frameshift errors
76 while start < 0:
77 start += 3
78 while end < 0:
79 end += 3
80 while start > parent_length:
81 start -= 3
82 while end > parent_length:
83 end -= 3
84 return (start, end)
85
86
87 def missing_rbs(record, lookahead_min=5, lookahead_max=15):
88 """
89 Identify gene features with missing RBSs
90
91 This "looks ahead" 5-15 bases ahead of each gene feature, and checks if
92 there's an RBS feature in those bounds.
93
94 The returned data is a set of genes with the RBS sequence in the __upstream
95 attribute, and a message in the __message attribute.
96 """
97 results = []
98 good = 0
99 bad = 0
100 qc_features = []
101 sd_finder = NaiveSDCaller()
102
103 any_rbss = False
104
105 for gene in coding_genes(record.features):
106 # Check if there are RBSs, TODO: make this recursive. Each feature in
107 # gene.sub_features can also have sub_features.
108 rbss = get_rbs_from(gene)
109 # No RBS found
110 if len(rbss) == 0:
111 # Get the sequence lookahead_min to lookahead_max upstream
112 if gene.strand > 0:
113 start = gene.location.start - lookahead_max
114 end = gene.location.start - lookahead_min
115 else:
116 start = gene.location.end + lookahead_min
117 end = gene.location.end + lookahead_max
118 # We have to ensure the feature is ON the genome, otherwise we may
119 # be trying to access a location outside of the length of the
120 # genome, which would be bad.
121 (start, end) = __ensure_location_in_bounds(
122 start=start, end=end, parent_length=len(record)
123 )
124 # Temporary feature to extract sequence
125 tmp = gffSeqFeature(
126 FeatureLocation(start, end, strand=gene.strand), type="domain"
127 )
128 # Get the sequence
129 seq = str(tmp.extract(record.seq))
130 # Set the default properties
131 gene.__upstream = seq.lower()
132 gene.__message = "No RBS annotated, None found"
133
134 # Try and do an automated shinefind call
135 sds = sd_finder.list_sds(seq)
136 if len(sds) > 0:
137 sd = sds[0]
138 gene.__upstream = sd_finder.highlight_sd(
139 seq.lower(), sd["start"], sd["end"]
140 )
141 gene.__message = "Unannotated but valid RBS"
142
143 qc_features.append(
144 gen_qc_feature(
145 start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene"
146 )
147 )
148
149 bad += 1
150 results.append(gene)
151 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
152 else:
153 if len(rbss) > 1:
154 log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene))
155 any_rbss = True
156 # get first RBS/CDS
157 cds = list(genes(gene.sub_features, feature_type="CDS"))[0]
158 rbs = rbss[0]
159
160 # Get the distance between the two
161 if gene.strand > 0:
162 distance = cds.location.start - rbs.location.end
163 else:
164 distance = rbs.location.start - cds.location.end
165
166 # If the RBS is too far away, annotate that
167 if distance > lookahead_max:
168 gene.__message = "RBS too far away (%s nt)" % distance
169
170 qc_features.append(
171 gen_qc_feature(
172 rbs.location.start,
173 rbs.location.end,
174 gene.__message,
175 strand=gene.strand,
176 id_src=gene,
177 type_src="gene"
178 )
179 )
180
181 bad += 1
182 results.append(gene)
183 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
184 else:
185 good += 1
186
187 return good, bad, results, qc_features, any_rbss
188
189
190 # modified from get_orfs_or_cdss.py
191 # -----------------------------------------------------------
192
193
194 def require_sd(data, record, chrom_start, sd_min, sd_max):
195 sd_finder = NaiveSDCaller()
196 for putative_gene in data:
197 if putative_gene[2] > 0: # strand
198 start = chrom_start + putative_gene[0] - sd_max
199 end = chrom_start + putative_gene[0] - sd_min
200 else:
201 start = chrom_start + putative_gene[1] + sd_min
202 end = chrom_start + putative_gene[1] + sd_max
203
204 (start, end) = __ensure_location_in_bounds(
205 start=start, end=end, parent_length=len(record)
206 )
207 tmp = gffSeqFeature(
208 FeatureLocation(start, end, strand=putative_gene[2]), type="domain"
209 )
210 # Get the sequence
211 seq = str(tmp.extract(record.seq))
212 sds = sd_finder.list_sds(seq)
213 if len(sds) > 0:
214 yield putative_gene + (start, end)
215
216
217 def excessive_gap(
218 record,
219 excess=50,
220 excess_divergent=200,
221 min_gene=30,
222 slop=30,
223 lookahead_min=5,
224 lookahead_max=15,
225 ):
226 """
227 Identify excessive gaps between gene features.
228
229 Default "excessive" gap size is 10, but that should likely be larger.
230 """
231 results = []
232 good = 0
233 bad = 0
234
235 contiguous_regions = []
236
237 sorted_genes = sorted(
238 genes(record.features), key=lambda feature: feature.location.start
239 )
240 if len(sorted_genes) == 0:
241 log.warn("NO GENES FOUND")
242 return good, bad, results, []
243
244 current_gene = None
245 for gene in sorted_genes:
246 # If the gene's start is contiguous to the "current_gene", then we
247 # extend current_gene
248 for cds in genes(gene.sub_features, feature_type="CDS"):
249 if current_gene is None:
250 current_gene = [int(cds.location.start), int(cds.location.end)]
251
252 if cds.location.start <= current_gene[1] + excess:
253 # Don't want to decrease size
254 if int(cds.location.end) >= current_gene[1]:
255 current_gene[1] = int(cds.location.end)
256 else:
257 # If it's discontiguous, we append the region and clear.
258 contiguous_regions.append(current_gene)
259 current_gene = [int(cds.location.start), int(cds.location.end)]
260
261 # This generally expected that annotations would NOT continue unto the end
262 # of the genome, however that's a bug, and we can make it here with an
263 # empty contiguous_regions list
264 contiguous_regions.append(current_gene)
265
266 for i in range(len(contiguous_regions) + 1):
267 if i == 0:
268 a = (1, 1)
269 b = contiguous_regions[i]
270 elif i >= len(contiguous_regions):
271 a = contiguous_regions[i - 1]
272 b = (len(record.seq), None)
273 else:
274 a = contiguous_regions[i - 1]
275 b = contiguous_regions[i]
276
277 gap_size = abs(b[0] - a[1])
278
279 if gap_size > min(excess, excess_divergent):
280 a_feat_l = itertools.islice(
281 feature_lambda(
282 sorted_genes,
283 feature_test_location,
284 {"loc": a[1]},
285 subfeatures=False,
286 ),
287 1,
288 )
289 b_feat_l = itertools.islice(
290 feature_lambda(
291 sorted_genes,
292 feature_test_location,
293 {"loc": b[0]},
294 subfeatures=False,
295 ),
296 1,
297 )
298
299 try:
300 a_feat = next(a_feat_l)
301 except StopIteration:
302 # Triggers on end of genome
303 a_feat = None
304 try:
305 b_feat = next(b_feat_l)
306 except StopIteration:
307 # Triggers on end of genome
308 b_feat = None
309
310 result_obj = [
311 a[1],
312 b[0],
313 None if not a_feat else a_feat.location.strand,
314 None if not b_feat else b_feat.location.strand,
315 ]
316
317 if a_feat is None or b_feat is None:
318 if gap_size > excess_divergent:
319 results.append(result_obj)
320 else:
321 if (
322 a_feat.location.strand == b_feat.location.strand
323 and gap_size > excess
324 ):
325 results.append(result_obj)
326 elif (
327 a_feat.location.strand != b_feat.location.strand
328 and gap_size > excess_divergent
329 ):
330 results.append(result_obj)
331
332 better_results = []
333 qc_features = []
334 of = MGAFinder(11, "CDS", "closed", min_gene)
335 # of = OrfFinder(11, 'CDS', 'closed', min_gene)
336
337 for result_obj in results:
338 start = result_obj[0]
339 end = result_obj[1]
340 f = gen_qc_feature(start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene")
341 qc_features.append(f)
342 putative_genes = of.putative_genes_in_sequence(
343 str(record[start - slop : end + slop].seq)
344 )
345 putative_genes = list(
346 require_sd(putative_genes, record, start, lookahead_min, lookahead_max)
347 )
348 for putative_gene in putative_genes:
349 # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
350 possible_gene_start = start + putative_gene[0]
351 possible_gene_end = start + putative_gene[1]
352
353 if possible_gene_start <= possible_gene_end:
354 possible_cds = gffSeqFeature(
355 FeatureLocation(
356 possible_gene_start, possible_gene_end, strand=putative_gene[2]
357 ),
358 type="CDS",
359 )
360 else:
361 possible_cds = gffSeqFeature(
362 FeatureLocation(
363 possible_gene_end, possible_gene_start, strand=putative_gene[2],
364 ),
365 type="CDS",
366 )
367
368 # Now we adjust our boundaries for the RBS that's required
369 # There are only two cases, the rbs is upstream of it, or downstream
370 if putative_gene[5] < possible_gene_start:
371 possible_gene_start = putative_gene[5]
372 else:
373 possible_gene_end = putative_gene[6]
374
375 if putative_gene[5] <= putative_gene[6]:
376 possible_rbs = gffSeqFeature(
377 FeatureLocation(
378 putative_gene[5], putative_gene[6], strand=putative_gene[2]
379 ),
380 type="Shine_Dalgarno_sequence",
381 )
382 else:
383 possible_rbs = gffSeqFeature(
384 FeatureLocation(
385 putative_gene[6], putative_gene[5], strand=putative_gene[2],
386 ),
387 type="Shine_Dalgarno_sequence",
388 )
389
390 if possible_gene_start <= possible_gene_end:
391 possible_gene = gffSeqFeature(
392 FeatureLocation(
393 possible_gene_start, possible_gene_end, strand=putative_gene[2]
394 ),
395 type="gene",
396 qualifiers={"note": ["Possible gene"]},
397 )
398 else:
399 possible_gene = gffSeqFeature(
400 FeatureLocation(
401 possible_gene_end, possible_gene_start, strand=putative_gene[2],
402 ),
403 type="gene",
404 qualifiers={"note": ["Possible gene"]},
405 )
406 possible_gene.sub_features = [possible_rbs, possible_cds]
407 qc_features.append(possible_gene)
408
409 better_results.append(result_obj + [len(putative_genes)])
410
411 # Bad gaps are those with more than zero possible genes found
412 bad = len([x for x in better_results if x[2] > 0])
413 # Generally taking "good" here as every possible gap in the genome
414 # Thus, good is TOTAL - gaps
415 good = len(sorted_genes) + 1 - bad
416 # and bad is just gaps
417 return good, bad, better_results, qc_features
418
419
420 def phi(x):
421 """Standard phi function used in calculation of normal distribution"""
422 return math.exp(-1 * math.pi * x * x)
423
424
425 def norm(x, mean=0, sd=1):
426 """
427 Normal distribution. Given an x position, a mean, and a standard
428 deviation, calculate the "y" value. Useful for score scaling
429
430 Modified to multiply by SD. This means even at sd=5, norm(x, mean) where x = mean => 1, rather than 1/5.
431 """
432 return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd
433
434
435 def coding_density(record, mean=92.5, sd=20):
436 """
437 Find coding density in the genome
438 """
439 feature_lengths = 0
440
441 for gene_a in coding_genes(record.features):
442 feature_lengths += sum(
443 [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")]
444 )
445
446 avgFeatLen = float(feature_lengths) / float(len(record.seq))
447 return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen)
448
449
450 def exact_coding_density(record, mean=92.5, sd=20):
451 """
452 Find exact coding density in the genome
453 """
454 data = numpy.zeros(len(record.seq))
455
456 for gene_a in coding_genes(record.features):
457 for cds in genes(gene_a.sub_features, feature_type="CDS"):
458 for i in range(cds.location.start, cds.location.end + 1):
459 data[i - 1] = 1
460
461 return float(sum(data)) / len(data)
462
463
464 def excessive_overlap(record, excess=15, excess_divergent=30):
465 """
466 Find excessive overlaps in the genome, where excessive is defined as 15
467 bases for same strand, and 30 for divergent translation.
468
469 Does a product of all the top-level features in the genome, and calculates
470 gaps.
471 """
472 results = []
473 bad = 0
474 qc_features = []
475
476 for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2):
477 # Get the CDS from the subfeature list.
478 # TODO: not recursive.
479 cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")]
480 cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")]
481
482 if len(cds_a) == 0:
483 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a))
484 continue
485
486 if len(cds_b) == 0:
487 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b))
488 continue
489
490 cds_a = cds_a[0]
491 cds_b = cds_b[0]
492
493 # Set of locations that are included in the CDS of A and the
494 # CDS of B
495 cas = set(range(cds_a.location.start, cds_a.location.end))
496 cbs = set(range(cds_b.location.start, cds_b.location.end))
497
498 # Here we calculate the intersection between the two sets, and
499 # if it's larger than our excessive size, we know that they're
500 # overlapped
501 ix = cas.intersection(cbs)
502
503 if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or (
504 cds_a.location.strand != cds_b.location.strand
505 and len(ix) >= excess_divergent
506 ):
507 bad += float(len(ix)) / float(min(excess, excess_divergent))
508 qc_features.append(
509 gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene")
510 )
511 results.append((gene_a, gene_b, min(ix), max(ix)))
512
513 # Good isn't accurate here. It's a triangle number and just ugly, but we
514 # don't care enough to fix it.
515 good = len(list(coding_genes(record.features)))
516 good = int(good - bad)
517 if good < 0:
518 good = 0
519 return good, int(bad), results, qc_features
520
521
522 def get_encouragement(score):
523 """Some text telling the user how they did
524 """
525 for encouragement in ENCOURAGEMENT:
526 if score > encouragement[0]:
527 return encouragement[1]
528 return ENCOURAGEMENT[-1][1]
529
530
531 def genome_overview(record):
532 """Genome overview
533 """
534 data = {
535 "genes": {
536 "count": 0,
537 "bases": len(record.seq),
538 "density": 0, # genes / kb
539 "avg_len": [],
540 "comp": {"A": 0, "C": 0, "G": 0, "T": 0},
541 },
542 "overall": {
543 "comp": {
544 "A": record.seq.count("A") + record.seq.count("a"),
545 "C": record.seq.count("C") + record.seq.count("c"),
546 "G": record.seq.count("G") + record.seq.count("g"),
547 "T": record.seq.count("T") + record.seq.count("t"),
548 },
549 "gc": 0,
550 },
551 }
552 gene_features = list(coding_genes(record.features))
553 data["genes"]["count"] = len(gene_features)
554
555 for feat in gene_features:
556 data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a")
557 data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c")
558 data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t")
559 data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g")
560 #data["genes"]["bases"] += len(feat)
561 data["genes"]["avg_len"].append(len(feat))
562
563 data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features)
564 data["overall"]["gc"] = float(
565 data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"]
566 ) / len(record.seq)
567 return data
568
569
570 def find_morons(record):
571 """Locate morons in the genome
572
573 Don't even know why...
574
575 TODO: remove? Idk.
576 """
577 results = []
578 good = 0
579 bad = 0
580
581 gene_features = list(coding_genes(record.features))
582 for i, gene in enumerate(gene_features):
583 two_left = gene_features[i - 2 : i]
584 two_right = gene_features[i + 1 : i + 1 + 2]
585 strands = [x.strand for x in two_left] + [x.strand for x in two_right]
586 anticon = [x for x in strands if x != gene.strand]
587
588 if len(anticon) == 4:
589 has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features]
590 if any(has_rbs):
591 rbs = [
592 x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence"
593 ][0]
594 rbs_msg = str(rbs.extract(record.seq))
595 else:
596 rbs_msg = "No RBS Available"
597 results.append((gene, two_left, two_right, rbs_msg))
598 bad += 1
599 else:
600 good += 1
601 return good, bad, results, []
602
603
604 def bad_gene_model(record):
605 """Find features without product
606 """
607 results = []
608 good = 0
609 bad = 0
610 qc_features = []
611
612 for gene in coding_genes(record.features):
613 exons = [
614 x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10
615 ]
616 CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")]
617 if len(exons) >= 1 and len(CDSs) >= 1:
618 if len(exons) != len(CDSs):
619 results.append(
620 (
621 get_gff3_id(gene),
622 None,
623 None,
624 "Mismatched number of exons and CDSs in gff3 representation",
625 )
626 )
627 qc_features.append(
628 gen_qc_feature(
629 gene.location.start,
630 gene.location.end,
631 "Mismatched number of exons and CDSs in gff3 representation",
632 strand=gene.strand,
633 id_src=gene,
634 type_src="gene"
635 )
636 )
637 bad += 1
638 else:
639 for (exon, cds) in zip(
640 sorted(exons, key=lambda x: x.location.start),
641 sorted(CDSs, key=lambda x: x.location.start),
642 ):
643 if len(exon) != len(cds):
644 results.append(
645 (
646 get_gff3_id(gene),
647 exon,
648 cds,
649 "CDS does not extend to full length of gene",
650 )
651 )
652 qc_features.append(
653 gen_qc_feature(
654 exon.location.start,
655 exon.location.end,
656 "CDS does not extend to full length of gene",
657 strand=exon.strand,
658 id_src=gene,
659 type_src="CDS"
660 )
661 )
662 bad += 1
663 else:
664 good += 1
665 else:
666 log.warn("Could not handle %s, %s", exons, CDSs)
667 results.append(
668 (
669 get_gff3_id(gene),
670 None,
671 None,
672 "{0} exons, {1} CDSs".format(len(exons), len(CDSs)),
673 )
674 )
675
676 return good, len(results) + bad, results, qc_features
677
678
679 def weird_starts(record):
680 """Find features without product
681 """
682 good = 0
683 bad = 0
684 qc_features = []
685 results = []
686
687 overall = {}
688 for gene in coding_genes(record.features):
689 seq = [x for x in genes(gene.sub_features, feature_type="CDS")]
690 if len(seq) == 0:
691 log.warn("No CDS for gene %s", get_gff3_id(gene))
692 continue
693 else:
694 seq = seq[0]
695
696 seq_str = str(seq.extract(record.seq))
697 start_codon = seq_str[0:3]
698 if len(seq_str) < 3:
699 sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n')
700 exit(2)
701 # if len(seq_str) % 3 != 0:
702 # if len(seq_str) < 3:
703 # stop_codon = seq_str[-(len(seq_str))]
704 # else:
705 # stop_codon = seq_str[-3]
706 #
707 # log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str))
708 # seq.__error = "Bad CDS Length"
709 # results.append(seq)
710 # qc_features.append(
711 # gen_qc_feature(
712 # s, e, "Bad Length", strand=seq.strand, id_src=gene
713 # )
714 # )
715 # bad += 1
716 # seq.__start = start_codon
717 # seq.__stop = stop_codon
718 # continue
719
720 stop_codon = seq_str[-3]
721 seq.__start = start_codon
722 seq.__stop = stop_codon
723 if start_codon not in overall:
724 overall[start_codon] = 1
725 else:
726 overall[start_codon] += 1
727
728 if start_codon not in ("ATG", "TTG", "GTG"):
729 log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
730 seq.__error = "Unusual start codon %s" % start_codon
731
732 s = 0
733 e = 0
734 if seq.strand > 0:
735 s = seq.location.start
736 e = seq.location.start + 3
737 else:
738 s = seq.location.end
739 e = seq.location.end - 3
740
741 results.append(seq)
742 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
743 qc_features.append(
744 gen_qc_feature(
745 s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene"
746 )
747 )
748 bad += 1
749 else:
750 good += 1
751
752 return good, bad, results, qc_features, overall
753
754
755 def missing_genes(record):
756 """Find features without product
757 """
758 results = []
759 good = 0
760 bad = 0
761 qc_features = []
762
763 for gene in coding_genes(record.features):
764 if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION":
765 results.append(gene)
766 bad += 1
767 else:
768 good += 1
769
770 return good, bad, results, qc_features
771
772
773 def gene_model_correction_issues(record):
774 """Find features that have issues from the gene model correction step.
775 These have qualifiers beginning with CPT_GMS
776 """
777 results = []
778 good = 0
779 bad = 0
780 qc_features = []
781
782 # For each gene
783 for gene in coding_genes(record.features):
784 # Get the list of child CDSs
785 cdss = [x for x in genes(gene.sub_features, feature_type="CDS")]
786 # And our matching qualifiers
787 gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"]
788 # If there are problems with ONLY the parent, let's complain
789 local_results = []
790 local_qc_features = []
791 for x in gene_data:
792 if "Missing Locus Tag" in x[1]:
793 # Missing locus tag is an either or thing, if it hits here
794 # there shouldn't be anything else wrong with it.
795
796 # Obviously missing so we remove it
797 gene.qualifiers["locus_tag"] = [""]
798 # Translation from bp_genbank2gff3.py
799 cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"]
800 # Append our results
801 local_results.append((gene, cdss[0], "Gene is missing a locus_tag"))
802 local_qc_features.append(
803 gen_qc_feature(
804 gene.location.start,
805 gene.location.end,
806 "Gene is missing a locus_tag",
807 strand=gene.strand,
808 type_src="gene"
809 )
810 )
811
812 # We need to alert on any child issues as well.
813 for cds in cdss:
814 cds_data = [
815 (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc"
816 ]
817 if len(gene_data) == 0 and len(cds_data) == 0:
818 # Alles gut
819 pass
820 else:
821 for _, problem in cds_data:
822 if problem == "BOTH Missing Locus Tag":
823 gene.qualifiers["locus_tag"] = [""]
824 cds.qualifiers["locus_tag"] = [""]
825 local_results.append(
826 (gene, cds, "Both gene and CDS are missing locus tags")
827 )
828 local_qc_features.append(
829 gen_qc_feature(
830 cds.location.start,
831 cds.location.end,
832 "CDS is missing a locus_tag",
833 strand=cds.strand,
834 type_src="CDS"
835 )
836 )
837 local_qc_features.append(
838 gen_qc_feature(
839 gene.location.start,
840 gene.location.end,
841 "Gene is missing a locus_tag",
842 strand=gene.strand,
843 type_src="gene"
844 )
845 )
846 elif problem == "Different locus tag from associated gene.":
847 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
848 cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"]
849 local_results.append(
850 (gene, cds, "Gene and CDS have differing locus tags")
851 )
852 local_qc_features.append(
853 gen_qc_feature(
854 gene.location.start,
855 gene.location.end,
856 "Gene and CDS have differing locus tags",
857 strand=gene.strand,
858 type_src="gene"
859 )
860 )
861 elif problem == "Missing Locus Tag":
862 # Copy this over
863 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
864 # This one is missing
865 cds.qualifiers["locus_tag"] = [""]
866 local_results.append((gene, cds, "CDS is missing a locus_tag"))
867 local_qc_features.append(
868 gen_qc_feature(
869 cds.location.start,
870 cds.location.end,
871 "CDS is missing a locus_tag",
872 strand=cds.strand,
873 type_src="CDS"
874 )
875 )
876 else:
877 log.warn("Cannot handle %s", problem)
878
879 if len(local_results) > 0:
880 bad += 1
881 else:
882 good += 1
883
884 qc_features.extend(local_qc_features)
885 results.extend(local_results)
886 return good, bad, results, qc_features
887
888
889 def missing_tags(record):
890 """Find features without product
891 """
892 results = []
893 good = 0
894 bad = 0
895 qc_features = []
896
897 for gene in coding_genes(record.features):
898 cds = [x for x in genes(gene.sub_features, feature_type="CDS")]
899 if len(cds) == 0:
900 log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
901 continue
902
903 cds = cds[0]
904
905 if "product" not in cds.qualifiers:
906 log.info("Missing product tag on %s", get_gff3_id(gene))
907 qc_features.append(
908 gen_qc_feature(
909 cds.location.start,
910 cds.location.end,
911 "Missing product tag",
912 strand=cds.strand,
913 type_src="CDS"
914 )
915 )
916 results.append(cds)
917 bad += 1
918 else:
919 good += 1
920
921 return good, bad, results, qc_features
922
923
924 def evaluate_and_report(
925 annotations,
926 genome,
927 gff3=None,
928 tbl=None,
929 sd_min=5,
930 sd_max=15,
931 min_gene_length=30,
932 excessive_gap_dist=50,
933 excessive_gap_divergent_dist=200,
934 excessive_overlap_dist=25,
935 excessive_overlap_divergent_dist=50,
936 reportTemplateName="phage_annotation_validator.html",
937 ):
938 """
939 Generate our HTML evaluation of the genome
940 """
941 # Get features from GFF file
942 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
943 # Get the first GFF3 record
944 # TODO: support multiple GFF3 files.
945 mostFeat = 0
946 for rec in list(gffParse(annotations, base_dict=seq_dict)):
947 if len(rec.features) > mostFeat:
948 mostFeat = len(rec.features)
949 record = rec
950
951 gff3_qc_record = SeqRecord(record.id, id=record.id)
952 gff3_qc_record.features = []
953 gff3_qc_features = []
954
955 log.info("Locating missing RBSs")
956 # mb_any = "did they annotate ANY rbss? if so, take off from score."
957 mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs(
958 record, lookahead_min=sd_min, lookahead_max=sd_max
959 )
960 gff3_qc_features += mb_annotations
961
962 log.info("Locating excessive gaps")
963 eg_good, eg_bad, eg_results, eg_annotations = excessive_gap(
964 record,
965 excess=excessive_gap_dist,
966 excess_divergent=excessive_gap_divergent_dist,
967 min_gene=min_gene_length,
968 slop=excessive_overlap_dist,
969 lookahead_min=sd_min,
970 lookahead_max=sd_max,
971 )
972 gff3_qc_features += eg_annotations
973
974 log.info("Locating excessive overlaps")
975 eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap(
976 record,
977 excess=excessive_overlap_dist,
978 excess_divergent=excessive_overlap_divergent_dist,
979 )
980 gff3_qc_features += eo_annotations
981
982 log.info("Locating morons")
983 mo_good, mo_bad, mo_results, mo_annotations = find_morons(record)
984 gff3_qc_features += mo_annotations
985
986 log.info("Locating missing tags")
987 mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record)
988 gff3_qc_features += mt_annotations
989
990 log.info("Locating missing gene features")
991 mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record)
992 gff3_qc_features += mg_annotations
993
994 log.info("Determining coding density")
995 cd, cd_real = coding_density(record)
996
997 log.info("Locating weird starts")
998 ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record)
999 gff3_qc_features += ws_annotations
1000
1001 log.info("Locating bad gene models")
1002 gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record)
1003 if gm_good + gm_bad == 0:
1004 gm_bad = 1
1005
1006 log.info("Locating more bad gene models")
1007 gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues(
1008 record
1009 )
1010 if gmc_good + gmc_bad == 0:
1011 gmc_bad = 1
1012
1013 good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good]
1014 bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad]
1015
1016 # Only if they tried to annotate RBSs do we consider them.
1017 if mb_any:
1018 good_scores.append(mb_good)
1019 bad_scores.append(mb_bad)
1020 subscores = []
1021
1022 for (g, b) in zip(good_scores, bad_scores):
1023 if g + b == 0:
1024 s = 0
1025 else:
1026 s = int(100 * float(g) / (float(b) + float(g)))
1027 subscores.append(s)
1028 subscores.append(cd)
1029
1030 score = int(float(sum(subscores)) / float(len(subscores)))
1031
1032 # This is data that will go into our HTML template
1033 kwargs = {
1034 "upstream_min": sd_min,
1035 "upstream_max": sd_max,
1036 "record_name": record.id,
1037 "record_nice_name": nice_name(record),
1038 "params": {
1039 "sd_min": sd_min,
1040 "sd_max": sd_max,
1041 "min_gene_length": min_gene_length,
1042 "excessive_gap_dist": excessive_gap_dist,
1043 "excessive_gap_divergent_dist": excessive_gap_divergent_dist,
1044 "excessive_overlap_dist": excessive_overlap_dist,
1045 "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist,
1046 },
1047 "score": score,
1048 "encouragement": get_encouragement(score),
1049 "genome_overview": genome_overview(record),
1050 "rbss_annotated": mb_any,
1051 "missing_rbs": mb_results,
1052 "missing_rbs_good": mb_good,
1053 "missing_rbs_bad": mb_bad,
1054 "missing_rbs_score": 0
1055 if mb_good + mb_bad == 0
1056 else (100 * mb_good / (mb_good + mb_bad)),
1057 "excessive_gap": eg_results,
1058 "excessive_gap_good": eg_good,
1059 "excessive_gap_bad": eg_bad,
1060 "excessive_gap_score": 0
1061 if eo_good + eo_bad == 0
1062 else (100 * eo_good / (eo_good + eo_bad)),
1063 "excessive_overlap": eo_results,
1064 "excessive_overlap_good": eo_good,
1065 "excessive_overlap_bad": eo_bad,
1066 "excessive_overlap_score": 0
1067 if eo_good + eo_bad == 0
1068 else (100 * eo_good / (eo_good + eo_bad)),
1069 "morons": mo_results,
1070 "morons_good": mo_good,
1071 "morons_bad": mo_bad,
1072 "morons_score": 0
1073 if mo_good + mo_bad == 0
1074 else (100 * mo_good / (mo_good + mo_bad)),
1075 "missing_tags": mt_results,
1076 "missing_tags_good": mt_good,
1077 "missing_tags_bad": mt_bad,
1078 "missing_tags_score": 0
1079 if mt_good + mt_bad == 0
1080 else (100 * mt_good / (mt_good + mt_bad)),
1081 "missing_genes": mg_results,
1082 "missing_genes_good": mg_good,
1083 "missing_genes_bad": mg_bad,
1084 "missing_genes_score": 0
1085 if mg_good + mg_bad == 0
1086 else (100 * mg_good / (mg_good + mg_bad)),
1087 "weird_starts": ws_results,
1088 "weird_starts_good": ws_good,
1089 "weird_starts_bad": ws_bad,
1090 "weird_starts_overall": ws_overall,
1091 "weird_starts_overall_sorted_keys": sorted(
1092 ws_overall, reverse=True, key=lambda x: ws_overall[x]
1093 ),
1094 "weird_starts_score": 0
1095 if ws_good + ws_bad == 0
1096 else (100 * ws_good / (ws_good + ws_bad)),
1097 "gene_model": gm_results,
1098 "gene_model_good": gm_good,
1099 "gene_model_bad": gm_bad,
1100 "gene_model_score": 0
1101 if gm_good + gm_bad == 0
1102 else (100 * gm_good / (gm_good + gm_bad)),
1103 "gene_model_correction": gmc_results,
1104 "gene_model_correction_good": gmc_good,
1105 "gene_model_correction_bad": gmc_bad,
1106 "gene_model_correction_score": 0
1107 if gmc_good + gmc_bad == 0
1108 else (100 * gmc_good / (gmc_good + gmc_bad)),
1109 "coding_density": cd,
1110 "coding_density_exact": exact_coding_density(record),
1111 "coding_density_real": cd_real,
1112 "coding_density_score": cd,
1113 }
1114
1115 with open(tbl, "w") as handle:
1116 kw_subset = {}
1117 for key in kwargs:
1118 if (
1119 key in ("score", "record_name")
1120 or "_good" in key
1121 or "_bad" in key
1122 or "_overall" in key
1123 ):
1124 kw_subset[key] = kwargs[key]
1125 json.dump(kw_subset, handle)
1126
1127 with open(gff3, "w") as handle:
1128 gff3_qc_record.features = gff3_qc_features
1129 gff3_qc_record.annotations = {}
1130 gffWrite([gff3_qc_record], handle)
1131
1132 def nice_strand(direction):
1133 # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level
1134 if "SeqFeature" in str(type(direction)):
1135 direction = direction.location.strand
1136 if direction > 0:
1137 return "→"#.decode("utf-8")
1138 else:
1139 return "←"#.decode("utf-8")
1140
1141 def nice_strand_tex(direction):
1142 if "SeqFeature" in str(type(direction)):
1143 direction = direction.location.strand
1144 if direction > 0:
1145 return "$\\rightarrow$"
1146 else:
1147 return "$\\leftarrow$"
1148
1149 def texify(data):
1150 return data.replace("_", "\\_").replace("$", "\\$")
1151
1152 def length(data):
1153 return len(data)
1154
1155 def my_encode(data):
1156 return str(data)#.encode("utf-8")
1157
1158 def my_decode(data):
1159 # For production
1160 return str(data)#.decode("utf-8")
1161 # For local testing. No, I do not understand.
1162 return str(data)#.encode("utf-8")).decode("utf-8")
1163
1164 env = Environment(
1165 loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True
1166 )
1167 env.filters.update(
1168 {
1169 "nice_id": get_gff3_id,
1170 "nice_strand": nice_strand,
1171 "nice_strand_tex": nice_strand_tex,
1172 "texify": texify,
1173 "length": length,
1174 "encode": my_encode,
1175 "decode": my_decode,
1176 }
1177 )
1178 tpl = env.get_template(reportTemplateName)
1179 return tpl.render(**kwargs)#.encode("utf-8")
1180
1181
1182 if __name__ == "__main__":
1183 parser = argparse.ArgumentParser(
1184 description="rebase gff3 features against parent locations", epilog=""
1185 )
1186 parser.add_argument(
1187 "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations"
1188 )
1189 parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence")
1190 parser.add_argument(
1191 "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3"
1192 )
1193 parser.add_argument(
1194 "--tbl",
1195 type=str,
1196 help="Table for noninteractive parsing",
1197 default="qc_results.json",
1198 )
1199
1200 parser.add_argument(
1201 "--sd_min",
1202 type=int,
1203 help="Minimum distance from gene start for an SD to be",
1204 default=5,
1205 )
1206 parser.add_argument(
1207 "--sd_max",
1208 type=int,
1209 help="Maximum distance from gene start for an SD to be",
1210 default=15,
1211 )
1212
1213 parser.add_argument(
1214 "--min_gene_length",
1215 type=int,
1216 help="Minimum length for a putative gene call (AAs)",
1217 default=30,
1218 )
1219
1220 parser.add_argument(
1221 "--excessive_overlap_dist",
1222 type=int,
1223 help="Excessive overlap for genes in same direction",
1224 default=25,
1225 )
1226 parser.add_argument(
1227 "--excessive_overlap_divergent_dist",
1228 type=int,
1229 help="Excessive overlap for genes in diff directions",
1230 default=50,
1231 )
1232
1233 parser.add_argument(
1234 "--excessive_gap_dist",
1235 type=int,
1236 help="Maximum distance between two genes",
1237 default=40,
1238 )
1239 parser.add_argument(
1240 "--excessive_gap_divergent_dist",
1241 type=int,
1242 help="Maximum distance between two divergent genes",
1243 default=200,
1244 )
1245
1246 parser.add_argument(
1247 "--reportTemplateName",
1248 help="Report template file name",
1249 default="phageqc_report_full.html",
1250 )
1251
1252 args = parser.parse_args()
1253
1254 sys.stdout.write(evaluate_and_report(**vars(args)))