view cpt_phageqc_annotation/phage_annotation_validator.py @ 0:c3140b08d703 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:00:50 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8
import os
import sys
import json
import math
import numpy
import argparse
import itertools
import logging
from gff3 import (
    feature_lambda,
    coding_genes,
    genes,
    get_gff3_id,
    feature_test_location,
    get_rbs_from,
    nice_name,
)
from shinefind import NaiveSDCaller
from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from jinja2 import Environment, FileSystemLoader
from cpt import MGAFinder

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(name="pav")

# Path to script, required because of Galaxy.
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
# Path to the HTML template for the report

ENCOURAGEMENT = (
    (100, "Perfection itself!"),
    (90, "Amazing!"),
    (80, "Not too bad, a few minor things to fix..."),
    (70, "Some issues to address"),
    (
        50,
        """Issues detected! </p><p class="text-muted">Have you heard of the
     <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation
     Pipeline?""",
    ),
    (
        0,
        """<b>MAJOR</b> issues detected! Please consider using the
     <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation Pipeline""",
    ),
)


def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"):
    kwargs = {"qualifiers": {"note": [message]}}
    kwargs["type"] = type_src
    kwargs["strand"] = strand
    kwargs["phase"]=0
    kwargs["score"]=0.0
    kwargs["source"]="feature"
    if id_src is not None:
        kwargs["id"] = id_src.id
        kwargs["qualifiers"]["ID"] = [id_src.id]
        kwargs["qualifiers"]["Name"] = id_src.qualifiers.get("Name", [])
	

    if end >= start:
        return gffSeqFeature(FeatureLocation(start, end, strand=strand), **kwargs)
    else:
        return gffSeqFeature(FeatureLocation(end, start, strand=strand), **kwargs)


def __ensure_location_in_bounds(start=0, end=0, parent_length=0):
    # This prevents frameshift errors
    while start < 0:
        start += 3
    while end < 0:
        end += 3
    while start > parent_length:
        start -= 3
    while end > parent_length:
        end -= 3
    return (start, end)


def missing_rbs(record, lookahead_min=5, lookahead_max=15):
    """
    Identify gene features with missing RBSs

    This "looks ahead" 5-15 bases ahead of each gene feature, and checks if
    there's an RBS feature in those bounds.

    The returned data is a set of genes with the RBS sequence in the __upstream
    attribute, and a message in the __message attribute.
    """
    results = []
    good = 0
    bad = 0
    qc_features = []
    sd_finder = NaiveSDCaller()

    any_rbss = False

    for gene in coding_genes(record.features):
        # Check if there are RBSs, TODO: make this recursive. Each feature in
        # gene.sub_features can also have sub_features.
        rbss = get_rbs_from(gene)
        # No RBS found
        if len(rbss) == 0:
            # Get the sequence lookahead_min to lookahead_max upstream
            if gene.strand > 0:
                start = gene.location.start - lookahead_max
                end = gene.location.start - lookahead_min
            else:
                start = gene.location.end + lookahead_min
                end = gene.location.end + lookahead_max
            # We have to ensure the feature is ON the genome, otherwise we may
            # be trying to access a location outside of the length of the
            # genome, which would be bad.
            (start, end) = __ensure_location_in_bounds(
                start=start, end=end, parent_length=len(record)
            )
            # Temporary feature to extract sequence
            tmp = gffSeqFeature(
                FeatureLocation(start, end, strand=gene.strand), type="domain"
            )
            # Get the sequence
            seq = str(tmp.extract(record.seq))
            # Set the default properties
            gene.__upstream = seq.lower()
            gene.__message = "No RBS annotated, None found"

            # Try and do an automated shinefind call
            sds = sd_finder.list_sds(seq)
            if len(sds) > 0:
                sd = sds[0]
                gene.__upstream = sd_finder.highlight_sd(
                    seq.lower(), sd["start"], sd["end"]
                )
                gene.__message = "Unannotated but valid RBS"

            qc_features.append(
                gen_qc_feature(
                    start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene"
                )
            )

            bad += 1
            results.append(gene)
            results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
        else:
            if len(rbss) > 1:
                log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene))
            any_rbss = True
            # get first RBS/CDS
            cds = list(genes(gene.sub_features, feature_type="CDS"))[0]
            rbs = rbss[0]

            # Get the distance between the two
            if gene.strand > 0:
                distance = cds.location.start - rbs.location.end
            else:
                distance = rbs.location.start - cds.location.end

            # If the RBS is too far away, annotate that
            if distance > lookahead_max:
                gene.__message = "RBS too far away (%s nt)" % distance

                qc_features.append(
                    gen_qc_feature(
                        rbs.location.start,
                        rbs.location.end,
                        gene.__message,
                        strand=gene.strand,
                        id_src=gene,
                        type_src="gene"
                    )
                )

                bad += 1
                results.append(gene)
                results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
            else:
                good += 1

    return good, bad, results, qc_features, any_rbss


# modified from get_orfs_or_cdss.py
# -----------------------------------------------------------


def require_sd(data, record, chrom_start, sd_min, sd_max):
    sd_finder = NaiveSDCaller()
    for putative_gene in data:
        if putative_gene[2] > 0:  # strand
            start = chrom_start + putative_gene[0] - sd_max
            end = chrom_start + putative_gene[0] - sd_min
        else:
            start = chrom_start + putative_gene[1] + sd_min
            end = chrom_start + putative_gene[1] + sd_max

        (start, end) = __ensure_location_in_bounds(
            start=start, end=end, parent_length=len(record)
        )
        tmp = gffSeqFeature(
            FeatureLocation(start, end, strand=putative_gene[2]), type="domain"
        )
        # Get the sequence
        seq = str(tmp.extract(record.seq))
        sds = sd_finder.list_sds(seq)
        if len(sds) > 0:
            yield putative_gene + (start, end)


def excessive_gap(
    record,
    excess=50,
    excess_divergent=200,
    min_gene=30,
    slop=30,
    lookahead_min=5,
    lookahead_max=15,
):
    """
    Identify excessive gaps between gene features.

    Default "excessive" gap size is 10, but that should likely be larger.
    """
    results = []
    good = 0
    bad = 0

    contiguous_regions = []

    sorted_genes = sorted(
        genes(record.features), key=lambda feature: feature.location.start
    )
    if len(sorted_genes) == 0:
        log.warn("NO GENES FOUND")
        return good, bad, results, []

    current_gene = None
    for gene in sorted_genes:
        # If the gene's start is contiguous to the "current_gene", then we
        # extend current_gene
        for cds in genes(gene.sub_features, feature_type="CDS"):
            if current_gene is None:
                current_gene = [int(cds.location.start), int(cds.location.end)]

            if cds.location.start <= current_gene[1] + excess:
                # Don't want to decrease size
                if int(cds.location.end) >= current_gene[1]:
                    current_gene[1] = int(cds.location.end)
            else:
                # If it's discontiguous, we append the region and clear.
                contiguous_regions.append(current_gene)
                current_gene = [int(cds.location.start), int(cds.location.end)]

    # This generally expected that annotations would NOT continue unto the end
    # of the genome, however that's a bug, and we can make it here with an
    # empty contiguous_regions list
    contiguous_regions.append(current_gene)

    for i in range(len(contiguous_regions) + 1):
        if i == 0:
            a = (1, 1)
            b = contiguous_regions[i]
        elif i >= len(contiguous_regions):
            a = contiguous_regions[i - 1]
            b = (len(record.seq), None)
        else:
            a = contiguous_regions[i - 1]
            b = contiguous_regions[i]

        gap_size = abs(b[0] - a[1])
        
        if gap_size > min(excess, excess_divergent):
            a_feat_l = itertools.islice(
                feature_lambda(
                    sorted_genes,
                    feature_test_location,
                    {"loc": a[1]},
                    subfeatures=False,
                ),
                1,
            )
            b_feat_l = itertools.islice(
                feature_lambda(
                    sorted_genes,
                    feature_test_location,
                    {"loc": b[0]},
                    subfeatures=False,
                ),
                1,
            )

            try:
                a_feat = next(a_feat_l)
            except StopIteration:
                # Triggers on end of genome
                a_feat = None
            try:
                b_feat = next(b_feat_l)
            except StopIteration:
                # Triggers on end of genome
                b_feat = None

            result_obj = [
                a[1],
                b[0],
                None if not a_feat else a_feat.location.strand,
                None if not b_feat else b_feat.location.strand,
            ]

            if a_feat is None or b_feat is None:
                if gap_size > excess_divergent:
                    results.append(result_obj)
            else:
                if (
                    a_feat.location.strand == b_feat.location.strand
                    and gap_size > excess
                ):
                    results.append(result_obj)
                elif (
                    a_feat.location.strand != b_feat.location.strand
                    and gap_size > excess_divergent
                ):
                    results.append(result_obj)

    better_results = []
    qc_features = []
    of = MGAFinder(11, "CDS", "closed", min_gene)
    # of = OrfFinder(11, 'CDS', 'closed', min_gene)

    for result_obj in results:
        start = result_obj[0]
        end = result_obj[1]
        f = gen_qc_feature(start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene")
        qc_features.append(f)
        putative_genes = of.putative_genes_in_sequence(
            str(record[start - slop : end + slop].seq)
        )
        putative_genes = list(
            require_sd(putative_genes, record, start, lookahead_min, lookahead_max)
        )
        for putative_gene in putative_genes:
            # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
            possible_gene_start = start + putative_gene[0]
            possible_gene_end = start + putative_gene[1]

            if possible_gene_start <= possible_gene_end:
                possible_cds = gffSeqFeature(
                    FeatureLocation(
                        possible_gene_start, possible_gene_end, strand=putative_gene[2]
                    ),
                    type="CDS",
                )
            else:
                possible_cds = gffSeqFeature(
                    FeatureLocation(
                        possible_gene_end, possible_gene_start, strand=putative_gene[2],
                    ),
                    type="CDS",
                )

            # Now we adjust our boundaries for the RBS that's required
            # There are only two cases, the rbs is upstream of it, or downstream
            if putative_gene[5] < possible_gene_start:
                possible_gene_start = putative_gene[5]
            else:
                possible_gene_end = putative_gene[6]

            if putative_gene[5] <= putative_gene[6]:
                possible_rbs = gffSeqFeature(
                    FeatureLocation(
                        putative_gene[5], putative_gene[6], strand=putative_gene[2]
                    ),
                    type="Shine_Dalgarno_sequence",
                )
            else:
                possible_rbs = gffSeqFeature(
                    FeatureLocation(
                        putative_gene[6], putative_gene[5], strand=putative_gene[2],
                    ),
                    type="Shine_Dalgarno_sequence",
                )

            if possible_gene_start <= possible_gene_end:
                possible_gene = gffSeqFeature(
                    FeatureLocation(
                        possible_gene_start, possible_gene_end, strand=putative_gene[2]
                    ),
                    type="gene",
                    qualifiers={"note": ["Possible gene"]},
                )
            else:
                possible_gene = gffSeqFeature(
                    FeatureLocation(
                        possible_gene_end, possible_gene_start, strand=putative_gene[2],
                    ),
                    type="gene",
                    qualifiers={"note": ["Possible gene"]},
                )
            possible_gene.sub_features = [possible_rbs, possible_cds]
            qc_features.append(possible_gene)

        better_results.append(result_obj + [len(putative_genes)])

    # Bad gaps are those with more than zero possible genes found
    bad = len([x for x in better_results if x[2] > 0])
    # Generally taking "good" here as every possible gap in the genome
    # Thus, good is TOTAL - gaps
    good = len(sorted_genes) + 1 - bad
    # and bad is just gaps
    return good, bad, better_results, qc_features


def phi(x):
    """Standard phi function used in calculation of normal distribution"""
    return math.exp(-1 * math.pi * x * x)


def norm(x, mean=0, sd=1):
    """
    Normal distribution. Given an x position, a mean, and a standard
    deviation, calculate the "y" value. Useful for score scaling

    Modified to multiply by SD. This means even at sd=5, norm(x, mean) where x = mean => 1, rather than 1/5.
    """
    return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd


def coding_density(record, mean=92.5, sd=20):
    """
    Find coding density in the genome
    """
    feature_lengths = 0

    for gene_a in coding_genes(record.features):
        feature_lengths += sum(
            [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")]
        )

    avgFeatLen = float(feature_lengths) / float(len(record.seq))
    return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen)


def exact_coding_density(record, mean=92.5, sd=20):
    """
    Find exact coding density in the genome
    """
    data = numpy.zeros(len(record.seq))

    for gene_a in coding_genes(record.features):
        for cds in genes(gene_a.sub_features, feature_type="CDS"):
            for i in range(cds.location.start, cds.location.end + 1):
                data[i - 1] = 1

    return float(sum(data)) / len(data)


def excessive_overlap(record, excess=15, excess_divergent=30):
    """
    Find excessive overlaps in the genome, where excessive is defined as 15
    bases for same strand, and 30 for divergent translation.

    Does a product of all the top-level features in the genome, and calculates
    gaps.
    """
    results = []
    bad = 0
    qc_features = []

    for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2):
        # Get the CDS from the subfeature list.
        # TODO: not recursive.
        cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")]
        cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")]

        if len(cds_a) == 0:
            log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a))
            continue

        if len(cds_b) == 0:
            log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b))
            continue

        cds_a = cds_a[0]
        cds_b = cds_b[0]

        # Set of locations that are included in the CDS of A and the
        # CDS of B
        cas = set(range(cds_a.location.start, cds_a.location.end))
        cbs = set(range(cds_b.location.start, cds_b.location.end))

        # Here we calculate the intersection between the two sets, and
        # if it's larger than our excessive size, we know that they're
        # overlapped
        ix = cas.intersection(cbs)

        if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or (
            cds_a.location.strand != cds_b.location.strand
            and len(ix) >= excess_divergent
        ):
            bad += float(len(ix)) / float(min(excess, excess_divergent))
            qc_features.append(
                gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene")
            )
            results.append((gene_a, gene_b, min(ix), max(ix)))

    # Good isn't accurate here. It's a triangle number and just ugly, but we
    # don't care enough to fix it.
    good = len(list(coding_genes(record.features)))
    good = int(good - bad)
    if good < 0:
        good = 0
    return good, int(bad), results, qc_features


def get_encouragement(score):
    """Some text telling the user how they did
    """
    for encouragement in ENCOURAGEMENT:
        if score > encouragement[0]:
            return encouragement[1]
    return ENCOURAGEMENT[-1][1]


def genome_overview(record):
    """Genome overview
    """
    data = {
        "genes": {
            "count": 0,
            "bases": len(record.seq),
            "density": 0,  # genes / kb
            "avg_len": [],
            "comp": {"A": 0, "C": 0, "G": 0, "T": 0},
        },
        "overall": {
            "comp": {
                "A": record.seq.count("A") + record.seq.count("a"),
                "C": record.seq.count("C") + record.seq.count("c"),
                "G": record.seq.count("G") + record.seq.count("g"),
                "T": record.seq.count("T") + record.seq.count("t"),
            },
            "gc": 0,
        },
    }
    gene_features = list(coding_genes(record.features))
    data["genes"]["count"] = len(gene_features)

    for feat in gene_features:
        data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a")
        data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c")
        data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t")
        data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g")
        #data["genes"]["bases"] += len(feat)
        data["genes"]["avg_len"].append(len(feat))

    data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features)
    data["overall"]["gc"] = float(
        data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"]
    ) / len(record.seq)
    return data


def find_morons(record):
    """Locate morons in the genome

    Don't even know why...

    TODO: remove? Idk.
    """
    results = []
    good = 0
    bad = 0

    gene_features = list(coding_genes(record.features))
    for i, gene in enumerate(gene_features):
        two_left = gene_features[i - 2 : i]
        two_right = gene_features[i + 1 : i + 1 + 2]
        strands = [x.strand for x in two_left] + [x.strand for x in two_right]
        anticon = [x for x in strands if x != gene.strand]

        if len(anticon) == 4:
            has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features]
            if any(has_rbs):
                rbs = [
                    x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence"
                ][0]
                rbs_msg = str(rbs.extract(record.seq))
            else:
                rbs_msg = "No RBS Available"
            results.append((gene, two_left, two_right, rbs_msg))
            bad += 1
        else:
            good += 1
    return good, bad, results, []


def bad_gene_model(record):
    """Find features without product
    """
    results = []
    good = 0
    bad = 0
    qc_features = []

    for gene in coding_genes(record.features):
        exons = [
            x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10
        ]
        CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")]
        if len(exons) >= 1 and len(CDSs) >= 1:
            if len(exons) != len(CDSs):
                results.append(
                    (
                        get_gff3_id(gene),
                        None,
                        None,
                        "Mismatched number of exons and CDSs in gff3 representation",
                    )
                )
                qc_features.append(
                    gen_qc_feature(
                        gene.location.start,
                        gene.location.end,
                        "Mismatched number of exons and CDSs in gff3 representation",
                        strand=gene.strand,
                        id_src=gene, 
                        type_src="gene"
                    )
                )
                bad += 1
            else:
                for (exon, cds) in zip(
                    sorted(exons, key=lambda x: x.location.start),
                    sorted(CDSs, key=lambda x: x.location.start),
                ):
                    if len(exon) != len(cds):
                        results.append(
                            (
                                get_gff3_id(gene),
                                exon,
                                cds,
                                "CDS does not extend to full length of gene",
                            )
                        )
                        qc_features.append(
                            gen_qc_feature(
                                exon.location.start,
                                exon.location.end,
                                "CDS does not extend to full length of gene",
                                strand=exon.strand,
                                id_src=gene, 
                                type_src="CDS"
                            )
                        )
                        bad += 1
                    else:
                        good += 1
        else:
            log.warn("Could not handle %s, %s", exons, CDSs)
            results.append(
                (
                    get_gff3_id(gene),
                    None,
                    None,
                    "{0} exons, {1} CDSs".format(len(exons), len(CDSs)),
                )
            )

    return good, len(results) + bad, results, qc_features


def weird_starts(record):
    """Find features without product
    """
    good = 0
    bad = 0
    qc_features = []
    results = []

    overall = {}
    for gene in coding_genes(record.features):
        seq = [x for x in genes(gene.sub_features, feature_type="CDS")]
        if len(seq) == 0:
            log.warn("No CDS for gene %s", get_gff3_id(gene))
            continue
        else:
            seq = seq[0]

        seq_str = str(seq.extract(record.seq))
        start_codon = seq_str[0:3]
        if len(seq_str) < 3:
            sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n')
            exit(2)
#        if len(seq_str) % 3 != 0:
#            if len(seq_str) < 3:
#                stop_codon = seq_str[-(len(seq_str))]
#            else:
#                stop_codon = seq_str[-3]
#            
#            log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str))
#            seq.__error = "Bad CDS Length"
#            results.append(seq)
#            qc_features.append(
#                gen_qc_feature(
#                    s, e, "Bad Length", strand=seq.strand, id_src=gene
#                )
#            )
#            bad += 1
#            seq.__start = start_codon
#            seq.__stop = stop_codon
#            continue 

        stop_codon = seq_str[-3]
        seq.__start = start_codon
        seq.__stop = stop_codon
        if start_codon not in overall:
            overall[start_codon] = 1
        else:
            overall[start_codon] += 1

        if start_codon not in ("ATG", "TTG", "GTG"):
            log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
            seq.__error = "Unusual start codon %s" % start_codon

            s = 0
            e = 0
            if seq.strand > 0:
                s = seq.location.start
                e = seq.location.start + 3
            else:
                s = seq.location.end
                e = seq.location.end - 3

            results.append(seq)
            results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) 
            qc_features.append(
                gen_qc_feature(
                    s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene"
                )
            )
            bad += 1
        else:
            good += 1

    return good, bad, results, qc_features, overall


def missing_genes(record):
    """Find features without product
    """
    results = []
    good = 0
    bad = 0
    qc_features = []

    for gene in coding_genes(record.features):
        if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION":
            results.append(gene)
            bad += 1
        else:
            good += 1

    return good, bad, results, qc_features


def gene_model_correction_issues(record):
    """Find features that have issues from the gene model correction step.
    These have qualifiers beginning with CPT_GMS
    """
    results = []
    good = 0
    bad = 0
    qc_features = []

    # For each gene
    for gene in coding_genes(record.features):
        # Get the list of child CDSs
        cdss = [x for x in genes(gene.sub_features, feature_type="CDS")]
        # And our matching qualifiers
        gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"]
        # If there are problems with ONLY the parent, let's complain
        local_results = []
        local_qc_features = []
        for x in gene_data:
            if "Missing Locus Tag" in x[1]:
                # Missing locus tag is an either or thing, if it hits here
                # there shouldn't be anything else wrong with it.

                # Obviously missing so we remove it
                gene.qualifiers["locus_tag"] = [""]
                # Translation from bp_genbank2gff3.py
                cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"]
                # Append our results
                local_results.append((gene, cdss[0], "Gene is missing a locus_tag"))
                local_qc_features.append(
                    gen_qc_feature(
                        gene.location.start,
                        gene.location.end,
                        "Gene is missing a locus_tag",
                        strand=gene.strand, 
                        type_src="gene"
                    )
                )

        # We need to alert on any child issues as well.
        for cds in cdss:
            cds_data = [
                (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc"
            ]
            if len(gene_data) == 0 and len(cds_data) == 0:
                # Alles gut
                pass
            else:
                for _, problem in cds_data:
                    if problem == "BOTH Missing Locus Tag":
                        gene.qualifiers["locus_tag"] = [""]
                        cds.qualifiers["locus_tag"] = [""]
                        local_results.append(
                            (gene, cds, "Both gene and CDS are missing locus tags")
                        )
                        local_qc_features.append(
                            gen_qc_feature(
                                cds.location.start,
                                cds.location.end,
                                "CDS is missing a locus_tag",
                                strand=cds.strand, 
                                type_src="CDS"
                            )
                        )
                        local_qc_features.append(
                            gen_qc_feature(
                                gene.location.start,
                                gene.location.end,
                                "Gene is missing a locus_tag",
                                strand=gene.strand, 
                                type_src="gene"
                            )
                        )
                    elif problem == "Different locus tag from associated gene.":
                        gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
                        cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"]
                        local_results.append(
                            (gene, cds, "Gene and CDS have differing locus tags")
                        )
                        local_qc_features.append(
                            gen_qc_feature(
                                gene.location.start,
                                gene.location.end,
                                "Gene and CDS have differing locus tags",
                                strand=gene.strand, 
                                type_src="gene"
                            )
                        )
                    elif problem == "Missing Locus Tag":
                        # Copy this over
                        gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
                        # This one is missing
                        cds.qualifiers["locus_tag"] = [""]
                        local_results.append((gene, cds, "CDS is missing a locus_tag"))
                        local_qc_features.append(
                            gen_qc_feature(
                                cds.location.start,
                                cds.location.end,
                                "CDS is missing a locus_tag",
                                strand=cds.strand, 
                                type_src="CDS"
                            )
                        )
                    else:
                        log.warn("Cannot handle %s", problem)

        if len(local_results) > 0:
            bad += 1
        else:
            good += 1

        qc_features.extend(local_qc_features)
        results.extend(local_results)
    return good, bad, results, qc_features


def missing_tags(record):
    """Find features without product
    """
    results = []
    good = 0
    bad = 0
    qc_features = []

    for gene in coding_genes(record.features):
        cds = [x for x in genes(gene.sub_features, feature_type="CDS")]
        if len(cds) == 0:
            log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue

        cds = cds[0]

        if "product" not in cds.qualifiers:
            log.info("Missing product tag on %s", get_gff3_id(gene))
            qc_features.append(
                gen_qc_feature(
                    cds.location.start,
                    cds.location.end,
                    "Missing product tag",
                    strand=cds.strand,
                    type_src="CDS"
                )
            )
            results.append(cds)
            bad += 1
        else:
            good += 1

    return good, bad, results, qc_features


def evaluate_and_report(
    annotations,
    genome,
    gff3=None,
    tbl=None,
    sd_min=5,
    sd_max=15,
    min_gene_length=30,
    excessive_gap_dist=50,
    excessive_gap_divergent_dist=200,
    excessive_overlap_dist=25,
    excessive_overlap_divergent_dist=50,
    reportTemplateName="phage_annotation_validator.html",
):
    """
    Generate our HTML evaluation of the genome
    """
    # Get features from GFF file
    seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
    # Get the first GFF3 record
    # TODO: support multiple GFF3 files.
    mostFeat = 0
    for rec in list(gffParse(annotations, base_dict=seq_dict)):
      if len(rec.features) > mostFeat:
        mostFeat = len(rec.features)
        record = rec

    gff3_qc_record = SeqRecord(record.id, id=record.id)
    gff3_qc_record.features = []
    gff3_qc_features = []

    log.info("Locating missing RBSs")
    # mb_any = "did they annotate ANY rbss? if so, take off from score."
    mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs(
        record, lookahead_min=sd_min, lookahead_max=sd_max
    )
    gff3_qc_features += mb_annotations

    log.info("Locating excessive gaps")
    eg_good, eg_bad, eg_results, eg_annotations = excessive_gap(
        record,
        excess=excessive_gap_dist,
        excess_divergent=excessive_gap_divergent_dist,
        min_gene=min_gene_length,
        slop=excessive_overlap_dist,
        lookahead_min=sd_min,
        lookahead_max=sd_max,
    )
    gff3_qc_features += eg_annotations

    log.info("Locating excessive overlaps")
    eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap(
        record,
        excess=excessive_overlap_dist,
        excess_divergent=excessive_overlap_divergent_dist,
    )
    gff3_qc_features += eo_annotations

    log.info("Locating morons")
    mo_good, mo_bad, mo_results, mo_annotations = find_morons(record)
    gff3_qc_features += mo_annotations

    log.info("Locating missing tags")
    mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record)
    gff3_qc_features += mt_annotations

    log.info("Locating missing gene features")
    mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record)
    gff3_qc_features += mg_annotations

    log.info("Determining coding density")
    cd, cd_real = coding_density(record)

    log.info("Locating weird starts")
    ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record)
    gff3_qc_features += ws_annotations

    log.info("Locating bad gene models")
    gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record)
    if gm_good + gm_bad == 0:
        gm_bad = 1

    log.info("Locating more bad gene models")
    gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues(
        record
    )
    if gmc_good + gmc_bad == 0:
        gmc_bad = 1

    good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good]
    bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad]

    # Only if they tried to annotate RBSs do we consider them.
    if mb_any:
        good_scores.append(mb_good)
        bad_scores.append(mb_bad)
    subscores = []

    for (g, b) in zip(good_scores, bad_scores):
        if g + b == 0:
            s = 0
        else:
            s = int(100 * float(g) / (float(b) + float(g)))
        subscores.append(s)
    subscores.append(cd)

    score = int(float(sum(subscores)) / float(len(subscores)))

    # This is data that will go into our HTML template
    kwargs = {
        "upstream_min": sd_min,
        "upstream_max": sd_max,
        "record_name": record.id,
        "record_nice_name": nice_name(record),
        "params": {
            "sd_min": sd_min,
            "sd_max": sd_max,
            "min_gene_length": min_gene_length,
            "excessive_gap_dist": excessive_gap_dist,
            "excessive_gap_divergent_dist": excessive_gap_divergent_dist,
            "excessive_overlap_dist": excessive_overlap_dist,
            "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist,
        },
        "score": score,
        "encouragement": get_encouragement(score),
        "genome_overview": genome_overview(record),
        "rbss_annotated": mb_any,
        "missing_rbs": mb_results,
        "missing_rbs_good": mb_good,
        "missing_rbs_bad": mb_bad,
        "missing_rbs_score": 0
        if mb_good + mb_bad == 0
        else (100 * mb_good / (mb_good + mb_bad)),
        "excessive_gap": eg_results,
        "excessive_gap_good": eg_good,
        "excessive_gap_bad": eg_bad,
        "excessive_gap_score": 0
        if eo_good + eo_bad == 0
        else (100 * eo_good / (eo_good + eo_bad)),
        "excessive_overlap": eo_results,
        "excessive_overlap_good": eo_good,
        "excessive_overlap_bad": eo_bad,
        "excessive_overlap_score": 0
        if eo_good + eo_bad == 0
        else (100 * eo_good / (eo_good + eo_bad)),
        "morons": mo_results,
        "morons_good": mo_good,
        "morons_bad": mo_bad,
        "morons_score": 0
        if mo_good + mo_bad == 0
        else (100 * mo_good / (mo_good + mo_bad)),
        "missing_tags": mt_results,
        "missing_tags_good": mt_good,
        "missing_tags_bad": mt_bad,
        "missing_tags_score": 0
        if mt_good + mt_bad == 0
        else (100 * mt_good / (mt_good + mt_bad)),
        "missing_genes": mg_results,
        "missing_genes_good": mg_good,
        "missing_genes_bad": mg_bad,
        "missing_genes_score": 0
        if mg_good + mg_bad == 0
        else (100 * mg_good / (mg_good + mg_bad)),
        "weird_starts": ws_results,
        "weird_starts_good": ws_good,
        "weird_starts_bad": ws_bad,
        "weird_starts_overall": ws_overall,
        "weird_starts_overall_sorted_keys": sorted(
            ws_overall, reverse=True, key=lambda x: ws_overall[x]
        ),
        "weird_starts_score": 0
        if ws_good + ws_bad == 0
        else (100 * ws_good / (ws_good + ws_bad)),
        "gene_model": gm_results,
        "gene_model_good": gm_good,
        "gene_model_bad": gm_bad,
        "gene_model_score": 0
        if gm_good + gm_bad == 0
        else (100 * gm_good / (gm_good + gm_bad)),
        "gene_model_correction": gmc_results,
        "gene_model_correction_good": gmc_good,
        "gene_model_correction_bad": gmc_bad,
        "gene_model_correction_score": 0
        if gmc_good + gmc_bad == 0
        else (100 * gmc_good / (gmc_good + gmc_bad)),
        "coding_density": cd,
        "coding_density_exact": exact_coding_density(record),
        "coding_density_real": cd_real,
        "coding_density_score": cd,
    }

    with open(tbl, "w") as handle:
        kw_subset = {}
        for key in kwargs:
            if (
                key in ("score", "record_name")
                or "_good" in key
                or "_bad" in key
                or "_overall" in key
            ):
                kw_subset[key] = kwargs[key]
        json.dump(kw_subset, handle)

    with open(gff3, "w") as handle:
        gff3_qc_record.features = gff3_qc_features
        gff3_qc_record.annotations = {}
        gffWrite([gff3_qc_record], handle)

    def nice_strand(direction):
        # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level
        if "SeqFeature" in str(type(direction)):
          direction = direction.location.strand
        if direction > 0:
            return "→"#.decode("utf-8")
        else:
            return "←"#.decode("utf-8")

    def nice_strand_tex(direction):
        if "SeqFeature" in str(type(direction)):
          direction = direction.location.strand
        if direction > 0:
            return "$\\rightarrow$"
        else:
            return "$\\leftarrow$"

    def texify(data):
        return data.replace("_", "\\_").replace("$", "\\$")

    def length(data):
        return len(data)

    def my_encode(data):
        return str(data)#.encode("utf-8")

    def my_decode(data):
        # For production
        return str(data)#.decode("utf-8")
        # For local testing. No, I do not understand.
        return str(data)#.encode("utf-8")).decode("utf-8")

    env = Environment(
        loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True
    )
    env.filters.update(
        {
            "nice_id": get_gff3_id,
            "nice_strand": nice_strand,
            "nice_strand_tex": nice_strand_tex,
            "texify": texify,
            "length": length,
            "encode": my_encode,
            "decode": my_decode,
        }
    )
    tpl = env.get_template(reportTemplateName)
    return tpl.render(**kwargs)#.encode("utf-8")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="rebase gff3 features against parent locations", epilog=""
    )
    parser.add_argument(
        "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations"
    )
    parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence")
    parser.add_argument(
        "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3"
    )
    parser.add_argument(
        "--tbl",
        type=str,
        help="Table for noninteractive parsing",
        default="qc_results.json",
    )

    parser.add_argument(
        "--sd_min",
        type=int,
        help="Minimum distance from gene start for an SD to be",
        default=5,
    )
    parser.add_argument(
        "--sd_max",
        type=int,
        help="Maximum distance from gene start for an SD to be",
        default=15,
    )

    parser.add_argument(
        "--min_gene_length",
        type=int,
        help="Minimum length for a putative gene call (AAs)",
        default=30,
    )

    parser.add_argument(
        "--excessive_overlap_dist",
        type=int,
        help="Excessive overlap for genes in same direction",
        default=25,
    )
    parser.add_argument(
        "--excessive_overlap_divergent_dist",
        type=int,
        help="Excessive overlap for genes in diff directions",
        default=50,
    )

    parser.add_argument(
        "--excessive_gap_dist",
        type=int,
        help="Maximum distance between two genes",
        default=40,
    )
    parser.add_argument(
        "--excessive_gap_divergent_dist",
        type=int,
        help="Maximum distance between two divergent genes",
        default=200,
    )

    parser.add_argument(
        "--reportTemplateName",
        help="Report template file name",
        default="phageqc_report_full.html",
    )

    args = parser.parse_args()

    sys.stdout.write(evaluate_and_report(**vars(args)))